[RFC] drm/i915: Emit to ringbuffer directly
From: Tvrtko Ursulin @ 2016-09-08 15:12 UTC
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

This removes the usage of intel_ring_emit in favour of
directly writing to the ring buffer.

intel_ring_emit was preventing the compiler from optimising
the fetch and increment of the current ring buffer pointer,
and was therefore generating very verbose code for every
write.

It had no useful purpose since all ringbuffer operations
are started and ended with intel_ring_begin and
intel_ring_advance respectively, with no possibility of
bailing out in the middle, so it is fine to increment the
tail in intel_ring_begin and let the calling code manage
the pointer itself.
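
Under the new scheme a typical emitter looks like the below
(a minimal sketch mirroring the hunks in this patch;
example_flush is a made-up name):

static int example_flush(struct drm_i915_gem_request *req)
{
	u32 *rbuf;
	int ret;

	/*
	 * Reserve two dwords, advance ring->tail over them and get
	 * back a pointer to the reserved space in rbuf.
	 */
	ret = intel_ring_begin(req, 2, &rbuf);
	if (ret)
		return ret;

	/* Plain stores through a local pointer kept in a register. */
	*rbuf++ = MI_FLUSH;
	*rbuf++ = MI_NOOP;

	intel_ring_advance(req->ring);

	return 0;
}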

Removing these useless instructions saves approximately
2384 bytes of text in my build.

I am not sure whether this has any measurable performance
implications, but executing a ton of useless instructions
on fast paths cannot be good.

The patch is not fully polished, but it compiles and runs
on Gen9 at least.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c    |  62 ++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  27 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  57 ++--
 drivers/gpu/drm/i915/intel_display.c       | 113 ++++---
 drivers/gpu/drm/i915/intel_lrc.c           | 223 +++++++-------
 drivers/gpu/drm/i915/intel_mocs.c          |  43 +--
 drivers/gpu/drm/i915/intel_overlay.c       |  69 ++---
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 480 +++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  19 +-
 9 files changed, 555 insertions(+), 538 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 35950ee46a1d..c9b61953f23b 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -577,7 +577,6 @@ static inline int
 mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 {
 	struct drm_i915_private *dev_priv = req->i915;
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
 	u32 flags = hw_flags | MI_MM_SPACE_GTT;
 	const int num_rings =
@@ -585,6 +584,7 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 		i915.semaphores ?
 		INTEL_INFO(dev_priv)->num_rings - 1 :
 		0;
+	u32 *rbuf;
 	int len, ret;
 
 	/* w/a: If Flush TLB Invalidation Mode is enabled, driver must do a TLB
@@ -609,70 +609,61 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 	if (INTEL_GEN(dev_priv) >= 7)
 		len += 2 + (num_rings ? 4*num_rings + 6 : 0);
 
-	ret = intel_ring_begin(req, len);
+	ret = intel_ring_begin(req, len, &rbuf);
 	if (ret)
 		return ret;
 
 	/* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
 	if (INTEL_GEN(dev_priv) >= 7) {
-		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+		*rbuf++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 		if (num_rings) {
 			struct intel_engine_cs *signaller;
 
-			intel_ring_emit(ring,
-					MI_LOAD_REGISTER_IMM(num_rings));
+			*rbuf++ = MI_LOAD_REGISTER_IMM(num_rings);
 			for_each_engine(signaller, dev_priv) {
 				if (signaller == engine)
 					continue;
 
-				intel_ring_emit_reg(ring,
-						    RING_PSMI_CTL(signaller->mmio_base));
-				intel_ring_emit(ring,
-						_MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+				*rbuf++ = RING_PSMI_CTL(signaller->mmio_base).reg;
+				*rbuf++ = _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE);
 			}
 		}
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_SET_CONTEXT);
-	intel_ring_emit(ring,
-			i915_ggtt_offset(req->ctx->engine[RCS].state) | flags);
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_SET_CONTEXT;
+	*rbuf++ = i915_ggtt_offset(req->ctx->engine[RCS].state) | flags;
 	/*
 	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
 	 * WaMiSetContext_Hang:snb,ivb,vlv
 	 */
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
 	if (INTEL_GEN(dev_priv) >= 7) {
 		if (num_rings) {
 			struct intel_engine_cs *signaller;
 			i915_reg_t last_reg = {}; /* keep gcc quiet */
 
-			intel_ring_emit(ring,
-					MI_LOAD_REGISTER_IMM(num_rings));
+			*rbuf++ = MI_LOAD_REGISTER_IMM(num_rings);
 			for_each_engine(signaller, dev_priv) {
 				if (signaller == engine)
 					continue;
 
 				last_reg = RING_PSMI_CTL(signaller->mmio_base);
-				intel_ring_emit_reg(ring, last_reg);
-				intel_ring_emit(ring,
-						_MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+				*rbuf++ = last_reg.reg;
+				*rbuf++ = _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE);
 			}
 
 			/* Insert a delay before the next switch! */
-			intel_ring_emit(ring,
-					MI_STORE_REGISTER_MEM |
-					MI_SRM_LRM_GLOBAL_GTT);
-			intel_ring_emit_reg(ring, last_reg);
-			intel_ring_emit(ring,
-					i915_ggtt_offset(engine->scratch));
-			intel_ring_emit(ring, MI_NOOP);
+			*rbuf++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
+			*rbuf++ = last_reg.reg;
+			*rbuf++ = i915_ggtt_offset(engine->scratch);
+			*rbuf++ = MI_NOOP;
 		}
-		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+		*rbuf++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	return ret;
 }
@@ -680,13 +671,13 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 static int remap_l3(struct drm_i915_gem_request *req, int slice)
 {
 	u32 *remap_info = req->i915->l3_parity.remap_info[slice];
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int i, ret;
 
 	if (!remap_info)
 		return 0;
 
-	ret = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2);
+	ret = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2, &rbuf);
 	if (ret)
 		return ret;
 
@@ -695,13 +686,14 @@ static int remap_l3(struct drm_i915_gem_request *req, int slice)
 	 * here because no other code should access these registers other than
 	 * at initialization time.
 	 */
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
 	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
-		intel_ring_emit_reg(ring, GEN7_L3LOG(slice, i));
-		intel_ring_emit(ring, remap_info[i]);
+		*rbuf++ = GEN7_L3LOG(slice, i).reg;
+		*rbuf++ = remap_info[i];
 	}
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9432d4ce9ffb..d9a226ea768c 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1358,7 +1358,7 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret, i;
 
 	if (!IS_GEN7(req->i915) || req->engine->id != RCS) {
@@ -1366,17 +1366,17 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 		return -EINVAL;
 	}
 
-	ret = intel_ring_begin(req, 4 * 3);
+	ret = intel_ring_begin(req, 4 * 3, &rbuf);
 	if (ret)
 		return ret;
 
 	for (i = 0; i < 4; i++) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, GEN7_SO_WRITE_OFFSET(i));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = GEN7_SO_WRITE_OFFSET(i).reg;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1483,17 +1483,18 @@ execbuf_submit(struct i915_execbuffer_params *params,
 
 	if (params->engine->id == RCS &&
 	    instp_mode != dev_priv->relative_constants_mode) {
-		struct intel_ring *ring = params->request->ring;
+		u32 *rbuf;
 
-		ret = intel_ring_begin(params->request, 4);
+		ret = intel_ring_begin(params->request, 4, &rbuf);
 		if (ret)
 			return ret;
 
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, INSTPM);
-		intel_ring_emit(ring, instp_mask << 16 | instp_mode);
-		intel_ring_advance(ring);
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = INSTPM.reg;
+		*rbuf++ = instp_mask << 16 | instp_mode;
+
+		intel_ring_advance(params->request->ring);
 
 		dev_priv->relative_constants_mode = instp_mode;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index e16c38086abe..d5b5cea08263 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -663,23 +663,24 @@ static int gen8_write_pdp(struct drm_i915_gem_request *req,
 			  unsigned entry,
 			  dma_addr_t addr)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	BUG_ON(entry >= 4);
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, entry));
-	intel_ring_emit(ring, upper_32_bits(addr));
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, entry));
-	intel_ring_emit(ring, lower_32_bits(addr));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+	*rbuf++ = GEN8_RING_PDP_UDW(engine, entry).reg;
+	*rbuf++ = upper_32_bits(addr);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+	*rbuf++ = GEN8_RING_PDP_LDW(engine, entry).reg;
+	*rbuf++ = lower_32_bits(addr);
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1655,8 +1656,8 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 			 struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1664,17 +1665,18 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
-	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
-	intel_ring_emit(ring, PP_DIR_DCLV_2G);
-	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
-	intel_ring_emit(ring, get_pd_offset(ppgtt));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(2);
+	*rbuf++ = RING_PP_DIR_DCLV(engine).reg;
+	*rbuf++ = PP_DIR_DCLV_2G;
+	*rbuf++ = RING_PP_DIR_BASE(engine).reg;
+	*rbuf++ = get_pd_offset(ppgtt);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1682,8 +1684,8 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 			  struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1691,17 +1693,18 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
-	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
-	intel_ring_emit(ring, PP_DIR_DCLV_2G);
-	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
-	intel_ring_emit(ring, get_pd_offset(ppgtt));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(2);
+	*rbuf++ = RING_PP_DIR_DCLV(engine).reg;
+	*rbuf++ = PP_DIR_DCLV_2G;
+	*rbuf++ = RING_PP_DIR_BASE(engine).reg;
+	*rbuf++ = get_pd_offset(ppgtt);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	/* XXX: RCS is the only one to auto invalidate the TLBs? */
 	if (engine->id != RCS) {
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 0c65212781e4..110252644287 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11679,12 +11679,11 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
+	u32 flip_mask, *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
@@ -11695,13 +11694,14 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, 0); /* aux display base address, unused */
+
+	*rbuf++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = 0; /* aux display base address, unused */
 
 	return 0;
 }
@@ -11713,12 +11713,11 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
+	u32 flip_mask, *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
@@ -11726,13 +11725,14 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, MI_NOOP);
+
+	*rbuf++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_DISPLAY_FLIP_I915 |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = MI_NOOP;
 
 	return 0;
 }
@@ -11744,13 +11744,13 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t pf, pipesrc;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
@@ -11758,11 +11758,11 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 	 * Display Registers (which do not change across a page-flip)
 	 * so we need only reprogram the base address.
 	 */
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
 
 	/* XXX Enabling the panel-fitter across page-flip is so far
 	 * untested on non-native modes, so ignore it for now.
@@ -11770,7 +11770,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*rbuf++ = pf | pipesrc;
 
 	return 0;
 }
@@ -11782,21 +11782,21 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t pf, pipesrc;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0] |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
 
 	/* Contrary to the suggestions in the documentation,
 	 * "Enable Panel Fitter" does not seem to be required when page
@@ -11806,7 +11806,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*rbuf++ = pf | pipesrc;
 
 	return 0;
 }
@@ -11818,8 +11818,8 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t plane_bit = 0;
 	int len, ret;
 
@@ -11864,7 +11864,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, len);
+	ret = intel_ring_begin(req, len, &rbuf);
 	if (ret)
 		return ret;
 
@@ -11878,31 +11878,30 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	 * to zero does lead to lockups within MI_DISPLAY_FLIP.
 	 */
 	if (req->engine->id == RCS) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring, ~(DERRMR_PIPEA_PRI_FLIP_DONE |
-					  DERRMR_PIPEB_PRI_FLIP_DONE |
-					  DERRMR_PIPEC_PRI_FLIP_DONE));
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = DERRMR.reg;
+		*rbuf++ = ~(DERRMR_PIPEA_PRI_FLIP_DONE |
+			   DERRMR_PIPEB_PRI_FLIP_DONE |
+			   DERRMR_PIPEC_PRI_FLIP_DONE);
 		if (IS_GEN8(dev))
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8 |
-					      MI_SRM_LRM_GLOBAL_GTT);
+			*rbuf++ = MI_STORE_REGISTER_MEM_GEN8 |
+				  MI_SRM_LRM_GLOBAL_GTT;
 		else
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM |
-					      MI_SRM_LRM_GLOBAL_GTT);
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring,
-				i915_ggtt_offset(req->engine->scratch) + 256);
+			*rbuf++ = MI_STORE_REGISTER_MEM |
+				  MI_SRM_LRM_GLOBAL_GTT;
+		*rbuf++ = DERRMR.reg;
+		*rbuf++ = i915_ggtt_offset(req->engine->scratch) + 256;
 		if (IS_GEN8(dev)) {
-			intel_ring_emit(ring, 0);
-			intel_ring_emit(ring, MI_NOOP);
+			*rbuf++ = 0;
+			*rbuf++ = MI_NOOP;
 		}
 	}
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, (MI_NOOP));
+	*rbuf++ = MI_DISPLAY_FLIP_I915 | plane_bit;
+	*rbuf++ = fb->pitches[0] |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = MI_NOOP;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 92bfe47ad33c..e8846523be4b 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -677,7 +677,7 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(request, 0);
+	ret = __intel_ring_begin(request, 0);
 	if (ret)
 		goto err_unpin;
 
@@ -705,6 +705,13 @@ err_unpin:
 }
 
 /*
+ * Reserve space for 2 NOOPs at the end of each request to be
+ * used as a workaround for not being allowed to do lite
+ * restore with HEAD==TAIL (WaIdleLiteRestore).
+ */
+#define WA_TAIL_DWORDS 2
+
+/*
  * intel_logical_ring_advance() - advance the tail and prepare for submission
  * @request: Request to advance the logical ringbuffer of.
  *
@@ -714,13 +721,14 @@ err_unpin:
  * point, the tail *inside* the context is updated and the ELSP written to.
  */
 static int
-intel_logical_ring_advance(struct drm_i915_gem_request *request)
+intel_logical_ring_advance(struct drm_i915_gem_request *request, u32 *rbuf)
 {
 	struct intel_ring *ring = request->ring;
 	struct intel_engine_cs *engine = request->engine;
 
 	intel_ring_advance(ring);
-	request->tail = ring->tail;
+
+	request->tail = ring->tail - WA_TAIL_DWORDS * sizeof(u32);
 
 	/*
 	 * Here we add two extra NOOPs as padding to avoid
@@ -728,8 +736,9 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	 *
 	 * Caller must reserve WA_TAIL_DWORDS for us!
 	 */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_NOOP;
+
 	intel_ring_advance(ring);
 
 	/* We keep the previous context alive until we retire the following
@@ -837,7 +846,7 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
 	int ret, i;
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	struct i915_workarounds *w = &req->i915->workarounds;
 
 	if (w->count == 0)
@@ -847,18 +856,18 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, w->count * 2 + 2);
-	if (ret)
+	ret = intel_ring_begin(req, w->count * 2 + 2, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(w->count);
 	for (i = 0; i < w->count; i++) {
-		intel_ring_emit_reg(ring, w->reg[i].addr);
-		intel_ring_emit(ring, w->reg[i].value);
+		*rbuf++ = w->reg[i].addr.reg;
+		*rbuf++ = w->reg[i].value;
 	}
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
@@ -1360,27 +1369,27 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
 {
 	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
 	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
-	int i, ret;
+	u32 *rbuf;
+	int ret, i;
 
-	ret = intel_ring_begin(req, num_lri_cmds * 2 + 2);
-	if (ret)
+	ret = intel_ring_begin(req, num_lri_cmds * 2 + 2, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(num_lri_cmds));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
 	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
 
-		intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, i));
-		intel_ring_emit(ring, upper_32_bits(pd_daddr));
-		intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, i));
-		intel_ring_emit(ring, lower_32_bits(pd_daddr));
+		*rbuf++ = GEN8_RING_PDP_UDW(engine, i).reg;
+		*rbuf++ = upper_32_bits(pd_daddr);
+		*rbuf++ = GEN8_RING_PDP_LDW(engine, i).reg;
+		*rbuf++ = lower_32_bits(pd_daddr);
 	}
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1389,8 +1398,8 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 			      u64 offset, u32 len,
 			      unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
 	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
+	u32 *rbuf;
 	int ret;
 
 	/* Don't rely in hw updating PDPs, specially in lite-restore.
@@ -1411,19 +1420,20 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
+	ret = intel_ring_begin(req, 4, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
 	/* FIXME(BDW): Address space and security selectors. */
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 |
-			(ppgtt<<8) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START_GEN8 |
+		  (ppgtt << 8) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1444,12 +1454,11 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
 
 static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 {
-	struct intel_ring *ring = request->ring;
-	u32 cmd;
 	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(request, 4);
-	if (ret)
+	ret = intel_ring_begin(request, 4, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
 	cmd = MI_FLUSH_DW + 1;
@@ -1467,13 +1476,12 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 			cmd |= MI_INVALIDATE_BSD;
 	}
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring,
-			I915_GEM_HWS_SCRATCH_ADDR |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0); /* upper addr */
-	intel_ring_emit(ring, 0); /* value */
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
+	*rbuf++ = 0; /* upper addr */
+	*rbuf++ = 0; /* value */
+
+	intel_ring_advance(request->ring);
 
 	return 0;
 }
@@ -1481,14 +1489,11 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 				  u32 mode)
 {
-	struct intel_ring *ring = request->ring;
-	struct intel_engine_cs *engine = request->engine;
 	u32 scratch_addr =
-		i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
+	       i915_ggtt_offset(request->engine->scratch) + 2 * CACHELINE_BYTES;
 	bool vf_flush_wa = false, dc_flush_wa = false;
-	u32 flags = 0;
-	int ret;
-	int len;
+	u32 *rbuf, flags = 0;
+	int ret, len;
 
 	flags |= PIPE_CONTROL_CS_STALL;
 
@@ -1529,45 +1534,45 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 	if (dc_flush_wa)
 		len += 12;
 
-	ret = intel_ring_begin(request, len);
-	if (ret)
+	ret = intel_ring_begin(request, len, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
 	if (vf_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
 	if (dc_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, PIPE_CONTROL_DC_FLUSH_ENABLE);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_DC_FLUSH_ENABLE;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
 
 	if (dc_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_CS_STALL;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(request->ring);
 
 	return 0;
 }
@@ -1587,43 +1592,36 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
 	intel_flush_status_page(engine, I915_GEM_HWS_INDEX);
 }
 
-/*
- * Reserve space for 2 NOOPs at the end of each request to be
- * used as a workaround for not being allowed to do lite
- * restore with HEAD==TAIL (WaIdleLiteRestore).
- */
-#define WA_TAIL_DWORDS 2
-
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
 	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
-	if (ret)
+	ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-	intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-	intel_ring_emit(ring,
-			intel_hws_seqno_address(request->engine) |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, request->fence.seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*rbuf++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+	*rbuf++ = intel_hws_seqno_address(request->engine) |
+		  MI_FLUSH_DW_USE_GTT;
+	*rbuf++ = 0;
+	*rbuf++ = request->fence.seqno;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	return intel_logical_ring_advance(request, rbuf);
 }
 
 static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
 	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
-	if (ret)
+	ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS, &rbuf);
+	if (unlikely(ret))
 		return ret;
 
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
@@ -1633,19 +1631,18 @@ static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 	 * need a prior CS_STALL, which is emitted by the flush
 	 * following the batch.
 	 */
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring,
-			(PIPE_CONTROL_GLOBAL_GTT_IVB |
-			 PIPE_CONTROL_CS_STALL |
-			 PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, i915_gem_request_get_seqno(request));
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE;
+	*rbuf++ = intel_hws_seqno_address(request->engine);
+	*rbuf++ = 0;
+	*rbuf++ = i915_gem_request_get_seqno(request);
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*rbuf++ = 0;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	return intel_logical_ring_advance(request, rbuf);
 }
 
 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
index 80bb9247ce66..7080d3225f6b 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -276,23 +276,23 @@ int intel_mocs_init_engine(struct intel_engine_cs *engine)
 static int emit_mocs_control_table(struct drm_i915_gem_request *req,
 				   const struct drm_i915_mocs_table *table)
 {
-	struct intel_ring *ring = req->ring;
 	enum intel_engine_id engine = req->engine->id;
 	unsigned int index;
+	u32 *rbuf;
 	int ret;
 
 	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
 		return -ENODEV;
 
-	ret = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
+	ret = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES);
 
 	for (index = 0; index < table->size; index++) {
-		intel_ring_emit_reg(ring, mocs_register(engine, index));
-		intel_ring_emit(ring, table->table[index].control_value);
+		*rbuf++ = mocs_register(engine, index).reg;
+		*rbuf++ = table->table[index].control_value;
 	}
 
 	/*
@@ -304,12 +304,13 @@ static int emit_mocs_control_table(struct drm_i915_gem_request *req,
 	 * that value to all the used entries.
 	 */
 	for (; index < GEN9_NUM_MOCS_ENTRIES; index++) {
-		intel_ring_emit_reg(ring, mocs_register(engine, index));
-		intel_ring_emit(ring, table->table[0].control_value);
+		*rbuf++ = mocs_register(engine, index).reg;
+		*rbuf++ = table->table[0].control_value;
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -336,29 +337,28 @@ static inline u32 l3cc_combine(const struct drm_i915_mocs_table *table,
 static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
 				const struct drm_i915_mocs_table *table)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	unsigned int i;
 	int ret;
 
 	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
 		return -ENODEV;
 
-	ret = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
+	ret = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring,
-			MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2);
 
 	for (i = 0; i < table->size/2; i++) {
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 2*i, 2*i+1));
+		*rbuf++ = GEN9_LNCFCMOCS(i).reg;
+		*rbuf++ = l3cc_combine(table, 2*i, 2*i+1);
 	}
 
 	if (table->size & 0x01) {
 		/* Odd table size - 1 left over */
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 2*i, 0));
+		*rbuf++ = GEN9_LNCFCMOCS(i).reg;
+		*rbuf++ = l3cc_combine(table, 2*i, 0);
 		i++;
 	}
 
@@ -368,12 +368,13 @@ static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
 	 * they are reserved by the hardware.
 	 */
 	for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) {
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 0, 0));
+		*rbuf++ = GEN9_LNCFCMOCS(i).reg;
+		*rbuf++ = l3cc_combine(table, 0, 0);
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a24bc8c7889f..cf49e5855df7 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -243,7 +243,7 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
+	u32 *rbuf;
 	int ret;
 
 	WARN_ON(overlay->active);
@@ -253,7 +253,7 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret) {
 		i915_add_request_no_flush(req);
 		return ret;
@@ -261,12 +261,12 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 
 	overlay->active = true;
 
-	ring = req->ring;
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_ON);
-	intel_ring_emit(ring, overlay->flip_addr | OFC_UPDATE);
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_ON;
+	*rbuf++ = overlay->flip_addr | OFC_UPDATE;
+	*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return intel_overlay_do_wait_request(overlay, req, NULL);
 }
@@ -277,9 +277,8 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
 	u32 flip_addr = overlay->flip_addr;
-	u32 tmp;
+	u32 *rbuf, tmp;
 	int ret;
 
 	WARN_ON(!overlay->active);
@@ -296,16 +295,16 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret) {
 		i915_add_request_no_flush(req);
 		return ret;
 	}
 
-	ring = req->ring;
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE);
-	intel_ring_emit(ring, flip_addr);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
+	*rbuf++ = flip_addr;
+
+	intel_ring_advance(req->ring);
 
 	intel_overlay_submit_request(overlay, req, NULL);
 
@@ -355,8 +354,8 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
 	u32 flip_addr = overlay->flip_addr;
+	u32 *rbuf;
 	int ret;
 
 	WARN_ON(!overlay->active);
@@ -371,31 +370,30 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret) {
 		i915_add_request_no_flush(req);
 		return ret;
 	}
 
-	ring = req->ring;
 	/* wait for overlay to go idle */
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE);
-	intel_ring_emit(ring, flip_addr);
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
+	*rbuf++ = flip_addr;
+	*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
 	/* turn overlay off */
 	if (IS_I830(dev_priv)) {
 		/* Workaround: Don't disable the overlay fully, since otherwise
 		 * it dies on the next OVERLAY_ON cmd. */
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_NOOP;
 	} else {
-		intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_OFF);
-		intel_ring_emit(ring, flip_addr);
-		intel_ring_emit(ring,
-				MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
+		*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_OFF;
+		*rbuf++ = flip_addr;
+		*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring);
 
 	return intel_overlay_do_wait_request(overlay, req,
 					     intel_overlay_off_tail);
@@ -429,23 +427,22 @@ static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
 	if (I915_READ(ISR) & I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT) {
 		/* synchronous slowpath */
 		struct drm_i915_gem_request *req;
-		struct intel_ring *ring;
+		u32 *rbuf;
 
 		req = alloc_request(overlay);
 		if (IS_ERR(req))
 			return PTR_ERR(req);
 
-		ret = intel_ring_begin(req, 2);
+		ret = intel_ring_begin(req, 2, &rbuf);
 		if (ret) {
 			i915_add_request_no_flush(req);
 			return ret;
 		}
 
-		ring = req->ring;
-		intel_ring_emit(ring,
-				MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_advance(ring);
+		*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
+		*rbuf++ = MI_NOOP;
+
+		intel_ring_advance(req->ring);
 
 		ret = intel_overlay_do_wait_request(overlay, req,
 						    intel_overlay_release_old_vid_tail);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index fd8fcc6ec970..d7892538d130 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -61,8 +61,7 @@ void intel_ring_update_space(struct intel_ring *ring)
 static int
 gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	u32 cmd;
+	u32 cmd, *rbuf;
 	int ret;
 
 	cmd = MI_FLUSH;
@@ -70,13 +69,14 @@ gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_READ_FLUSH;
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -84,8 +84,7 @@ gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	u32 cmd;
+	u32 cmd, *rbuf;
 	int ret;
 
 	/*
@@ -123,13 +122,14 @@ gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 			cmd |= MI_INVALIDATE_ISP;
 	}
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -174,35 +174,37 @@ gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
-	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
-			PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0); /* low dword */
-	intel_ring_emit(ring, 0); /* high dword */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(5);
+	*rbuf++ = PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0; /* low dword */
+	*rbuf++ = 0; /* high dword */
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
-	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(5);
+	*rbuf++ = PIPE_CONTROL_QW_WRITE;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -210,10 +212,9 @@ intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 static int
 gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	u32 flags = 0;
+	u32 *rbuf, flags = 0;
 	int ret;
 
 	/* Force SNB workarounds for PIPE_CONTROL flushes */
@@ -247,15 +248,16 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	}
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -263,20 +265,20 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring,
-			PIPE_CONTROL_CS_STALL |
-			PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -284,10 +286,9 @@ gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 static int
 gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	u32 flags = 0;
+	u32 *rbuf, flags = 0;
 	int ret;
 
 	/*
@@ -332,15 +333,16 @@ gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 		gen7_render_ring_cs_stall_wa(req);
 	}
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -349,20 +351,21 @@ static int
 gen8_emit_pipe_control(struct drm_i915_gem_request *req,
 		       u32 flags, u32 scratch_addr)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -615,8 +618,8 @@ out:
 
 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct i915_workarounds *w = &req->i915->workarounds;
+	u32 *rbuf;
 	int ret, i;
 
 	if (w->count == 0)
@@ -626,18 +629,18 @@ static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, (w->count * 2 + 2));
+	ret = intel_ring_begin(req, w->count * 2 + 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(w->count);
 	for (i = 0; i < w->count; i++) {
-		intel_ring_emit_reg(ring, w->reg[i].addr);
-		intel_ring_emit(ring, w->reg[i].value);
+		*rbuf++ = w->reg[i].addr.reg;
+		*rbuf++ = w->reg[i].value;
 	}
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
@@ -1263,14 +1266,14 @@ static void render_ring_cleanup(struct intel_engine_cs *engine)
 
 static int gen8_rcs_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
+	u32 *rbuf;
 	int ret, num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 8);
+	ret = intel_ring_begin(req, (num_rings-1) * 8, &rbuf);
 	if (ret)
 		return ret;
 
@@ -1279,35 +1282,33 @@ static int gen8_rcs_signal(struct drm_i915_gem_request *req)
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;
 
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring,
-				PIPE_CONTROL_GLOBAL_GTT_IVB |
-				PIPE_CONTROL_QW_WRITE |
-				PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, lower_32_bits(gtt_offset));
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->fence.seqno);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_GLOBAL_GTT_IVB |
+			  PIPE_CONTROL_QW_WRITE |
+			  PIPE_CONTROL_CS_STALL;
+		*rbuf++ = lower_32_bits(gtt_offset);
+		*rbuf++ = upper_32_bits(gtt_offset);
+		*rbuf++ = req->fence.seqno;
+		*rbuf++ = 0;
+		*rbuf++ = MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id);
+		*rbuf++ = 0;
 	}
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
 
 static int gen8_xcs_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
+	u32 *rbuf;
 	int ret, num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 6);
+	ret = intel_ring_begin(req, (num_rings-1) * 6, &rbuf);
 	if (ret)
 		return ret;
 
@@ -1316,32 +1317,28 @@ static int gen8_xcs_signal(struct drm_i915_gem_request *req)
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;
 
-		intel_ring_emit(ring,
-				(MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-		intel_ring_emit(ring,
-				lower_32_bits(gtt_offset) |
-				MI_FLUSH_DW_USE_GTT);
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->fence.seqno);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+		*rbuf++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
+		*rbuf++ = upper_32_bits(gtt_offset);
+		*rbuf++ = req->fence.seqno;
+		*rbuf++ = MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id);
+		*rbuf++ = 0;
 	}
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
 
 static int gen6_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *engine;
+	u32 *rbuf;
 	int ret, num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
+	ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2), &rbuf);
 	if (ret)
 		return ret;
 
@@ -1353,16 +1350,17 @@ static int gen6_signal(struct drm_i915_gem_request *req)
 
 		mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
 		if (i915_mmio_reg_valid(mbox_reg)) {
-			intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-			intel_ring_emit_reg(ring, mbox_reg);
-			intel_ring_emit(ring, req->fence.seqno);
+			*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+			*rbuf++ = mbox_reg.reg;
+			*rbuf++ = req->fence.seqno;
 		}
 	}
 
 	/* If num_dwords was rounded, make sure the tail pointer is correct */
 	if (num_rings % 2 == 0)
-		intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+		*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1378,16 +1376,18 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 static int i9xx_emit_request(struct drm_i915_gem_request *req)
 {
 	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
-	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-	intel_ring_emit(ring, req->fence.seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
+	*rbuf++ = MI_STORE_DWORD_INDEX;
+	*rbuf++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*rbuf++ = req->fence.seqno;
+	*rbuf++ = MI_USER_INTERRUPT;
+
 	intel_ring_advance(ring);
 
 	req->tail = ring->tail;
@@ -1418,6 +1418,7 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req)
 {
 	struct intel_engine_cs *engine = req->engine;
 	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
 	if (engine->semaphore.signal) {
@@ -1426,21 +1427,22 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req)
 			return ret;
 	}
 
-	ret = intel_ring_begin(req, 8);
+	ret = intel_ring_begin(req, 8, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB |
-			       PIPE_CONTROL_CS_STALL |
-			       PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, i915_gem_request_get_seqno(req));
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+		  PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE);
+	*rbuf++ = intel_hws_seqno_address(engine);
+	*rbuf++ = 0;
+	*rbuf++ = i915_gem_request_get_seqno(req);
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = 0;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
 	intel_ring_advance(ring);
 
 	req->tail = ring->tail;
@@ -1460,24 +1462,24 @@ static int
 gen8_ring_sync_to(struct drm_i915_gem_request *req,
 		  struct drm_i915_gem_request *signal)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	u64 offset = GEN8_WAIT_OFFSET(req->engine, signal->engine->id);
 	struct i915_hw_ppgtt *ppgtt;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring,
-			MI_SEMAPHORE_WAIT |
-			MI_SEMAPHORE_GLOBAL_GTT |
-			MI_SEMAPHORE_SAD_GTE_SDD);
-	intel_ring_emit(ring, signal->fence.seqno);
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_SEMAPHORE_WAIT |
+		  MI_SEMAPHORE_GLOBAL_GTT |
+		  MI_SEMAPHORE_SAD_GTE_SDD;
+	*rbuf++ = signal->fence.seqno;
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+
+	intel_ring_advance(req->ring);
 
 	/* When the !RCS engines idle waiting upon a semaphore, they lose their
 	 * pagetables and we must reload them before executing the batch.
@@ -1494,28 +1496,29 @@ static int
 gen6_ring_sync_to(struct drm_i915_gem_request *req,
 		  struct drm_i915_gem_request *signal)
 {
-	struct intel_ring *ring = req->ring;
 	u32 dw1 = MI_SEMAPHORE_MBOX |
 		  MI_SEMAPHORE_COMPARE |
 		  MI_SEMAPHORE_REGISTER;
 	u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->hw_id];
+	u32 *rbuf;
 	int ret;
 
 	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, dw1 | wait_mbox);
+	*rbuf++ = dw1 | wait_mbox;
 	/* Throughout all of the GEM code, seqno passed implies our current
 	 * seqno is >= the last seqno executed. However for hardware the
 	 * comparison is strictly greater than.
 	 */
-	intel_ring_emit(ring, signal->fence.seqno - 1);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = signal->fence.seqno - 1;
+	*rbuf++ = 0;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1616,16 +1619,17 @@ i8xx_irq_disable(struct intel_engine_cs *engine)
 static int
 bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_FLUSH);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_FLUSH;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 	return 0;
 }
 
@@ -1691,20 +1695,20 @@ i965_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 length,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			MI_BATCH_GTT |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_NON_SECURE_I965));
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  MI_BATCH_GTT |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE_I965);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -1720,26 +1724,28 @@ i830_emit_bb_start(struct drm_i915_gem_request *req,
 {
 	struct intel_ring *ring = req->ring;
 	u32 cs_offset = i915_ggtt_offset(req->engine->scratch);
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 6);
+	ret = intel_ring_begin(req, 6, &rbuf);
 	if (ret)
 		return ret;
 
 	/* Evict the invalid PTE TLBs */
-	intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA);
-	intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
-	intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */
-	intel_ring_emit(ring, cs_offset);
-	intel_ring_emit(ring, 0xdeadbeef);
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
+	*rbuf++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
+	*rbuf++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
+	*rbuf++ = cs_offset;
+	*rbuf++ = 0xdeadbeef;
+	*rbuf++ = MI_NOOP;
+
 	intel_ring_advance(ring);
 
 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
 		if (len > I830_BATCH_LIMIT)
 			return -ENOSPC;
 
-		ret = intel_ring_begin(req, 6 + 2);
+		ret = intel_ring_begin(req, 6 + 2, &rbuf);
 		if (ret)
 			return ret;
 
@@ -1747,29 +1753,30 @@ i830_emit_bb_start(struct drm_i915_gem_request *req,
 		 * stable batch scratch bo area (so that the CS never
 		 * stumbles over its tlb invalidation bug) ...
 		 */
-		intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
-		intel_ring_emit(ring,
-				BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
-		intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 4096);
-		intel_ring_emit(ring, cs_offset);
-		intel_ring_emit(ring, 4096);
-		intel_ring_emit(ring, offset);
-
-		intel_ring_emit(ring, MI_FLUSH);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
+		*rbuf++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
+		*rbuf++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
+		*rbuf++ = cs_offset;
+		*rbuf++ = 4096;
+		*rbuf++ = offset;
+
+		*rbuf++ = MI_FLUSH;
+		*rbuf++ = MI_NOOP;
+
 		intel_ring_advance(ring);
 
 		/* ... and execute it. */
 		offset = cs_offset;
 	}
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
-	intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
-					0 : MI_BATCH_NON_SECURE));
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*rbuf++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE);
+
 	intel_ring_advance(ring);
 
 	return 0;
@@ -1780,17 +1787,18 @@ i915_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
-	intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
-					0 : MI_BATCH_NON_SECURE));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*rbuf++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE);
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -2181,7 +2189,7 @@ int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 
 	request->ring = request->engine->buffer;
 
-	ret = intel_ring_begin(request, 0);
+	ret = __intel_ring_begin(request, 0);
 	if (ret)
 		return ret;
 
@@ -2237,7 +2245,7 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
 	return 0;
 }
 
-int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
+int __intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
 	struct intel_ring *ring = req->ring;
 	int remain_actual = ring->size - ring->tail;
@@ -2295,18 +2303,19 @@ int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
 	struct intel_ring *ring = req->ring;
 	int num_dwords =
 		(ring->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
+	u32 *rbuf;
 	int ret;
 
 	if (num_dwords == 0)
 		return 0;
 
 	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
-	ret = intel_ring_begin(req, num_dwords);
+	ret = intel_ring_begin(req, num_dwords, &rbuf);
 	if (ret)
 		return ret;
 
 	while (num_dwords--)
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = MI_NOOP;
 
 	intel_ring_advance(ring);
 
@@ -2352,11 +2361,10 @@ static void gen6_bsd_submit_request(struct drm_i915_gem_request *request)
 
 static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	uint32_t cmd;
+	u32 cmd, *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
@@ -2380,16 +2388,18 @@ static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	if (INTEL_GEN(req->i915) >= 8) {
-		intel_ring_emit(ring, 0); /* upper addr */
-		intel_ring_emit(ring, 0); /* value */
+		*rbuf++ = 0; /* upper addr */
+		*rbuf++ = 0; /* value */
 	} else  {
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = 0;
+		*rbuf++ = MI_NOOP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring);
+
 	return 0;
 }
 
@@ -2398,23 +2408,24 @@ gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
 	bool ppgtt = USES_PPGTT(req->i915) &&
 			!(dispatch_flags & I915_DISPATCH_SECURE);
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
 	/* FIXME(BDW): Address space and security selectors. */
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -2424,22 +2435,22 @@ hsw_emit_bb_start(struct drm_i915_gem_request *req,
 		  u64 offset, u32 len,
 		  unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
 	/* bit0-7 is the length on GEN6+ */
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -2449,20 +2460,20 @@ gen6_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 2);
+	ret = intel_ring_begin(req, 2, &rbuf);
 	if (ret)
 		return ret;
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_NON_SECURE_I965));
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE_I965);
 	/* bit0-7 is the length on GEN6+ */
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
@@ -2471,11 +2482,10 @@ gen6_emit_bb_start(struct drm_i915_gem_request *req,
 
 static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	uint32_t cmd;
+	u32 cmd, *rbuf;
 	int ret;
 
-	ret = intel_ring_begin(req, 4);
+	ret = intel_ring_begin(req, 4, &rbuf);
 	if (ret)
 		return ret;
 
@@ -2498,17 +2508,17 @@ static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	 */
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_INVALIDATE_TLB;
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring,
-			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	if (INTEL_GEN(req->i915) >= 8) {
-		intel_ring_emit(ring, 0); /* upper addr */
-		intel_ring_emit(ring, 0); /* value */
+		*rbuf++ = 0; /* upper addr */
+		*rbuf++ = 0; /* value */
 	} else  {
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = 0;
+		*rbuf++ = MI_NOOP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 84aea549de5d..1182116f20a0 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -443,7 +443,24 @@ void intel_engine_cleanup(struct intel_engine_cs *engine);
 
 int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request);
 
-int __must_check intel_ring_begin(struct drm_i915_gem_request *req, int n);
+int __must_check __intel_ring_begin(struct drm_i915_gem_request *req, int n);
+
+static inline int __must_check
+intel_ring_begin(struct drm_i915_gem_request *req, int n, u32 **rbuf)
+{
+	struct intel_ring *ring = req->ring;
+	int ret;
+
+	ret = __intel_ring_begin(req, n);
+	if (unlikely(ret))
+		return ret;
+
+	*rbuf = (u32 *)(ring->vaddr + ring->tail);
+	ring->tail += n * sizeof(u32);
+
+	return 0;
+}
+
 int __must_check intel_ring_cacheline_align(struct drm_i915_gem_request *req);
 
 static inline void intel_ring_emit(struct intel_ring *ring, u32 data)
-- 
1.9.1

* ✗ Fi.CI.BAT: failure for drm/i915: Emit to ringbuffer directly
  2016-09-08 15:12 [RFC] drm/i915: Emit to ringbuffer directly Tvrtko Ursulin
@ 2016-09-08 15:54 ` Patchwork
  2016-09-08 16:40 ` [RFC] " Chris Wilson
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 15+ messages in thread
From: Patchwork @ 2016-09-08 15:54 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Emit to ringbuffer directly
URL   : https://patchwork.freedesktop.org/series/12186/
State : failure

== Summary ==

Series 12186v1 drm/i915: Emit to ringbuffer directly
http://patchwork.freedesktop.org/api/1.0/series/12186/revisions/1/mbox/

Test drv_module_reload_basic:
                skip       -> PASS       (fi-skl-6260u)
Test kms_cursor_legacy:
        Subgroup basic-cursor-vs-flip-legacy:
                pass       -> FAIL       (fi-bsw-n3050)

fi-bdw-5557u     total:252  pass:236  dwarn:0   dfail:0   fail:1   skip:15 
fi-bsw-n3050     total:252  pass:204  dwarn:0   dfail:0   fail:2   skip:46 
fi-hsw-4770k     total:252  pass:229  dwarn:0   dfail:0   fail:1   skip:22 
fi-hsw-4770r     total:252  pass:225  dwarn:0   dfail:0   fail:1   skip:26 
fi-ilk-650       total:252  pass:182  dwarn:0   dfail:0   fail:3   skip:67 
fi-ivb-3520m     total:252  pass:220  dwarn:0   dfail:0   fail:1   skip:31 
fi-ivb-3770      total:252  pass:220  dwarn:0   dfail:0   fail:1   skip:31 
fi-skl-6260u     total:252  pass:237  dwarn:0   dfail:0   fail:1   skip:14 
fi-skl-6700k     total:252  pass:222  dwarn:1   dfail:0   fail:1   skip:28 
fi-snb-2520m     total:252  pass:206  dwarn:0   dfail:0   fail:1   skip:45 
fi-snb-2600      total:252  pass:206  dwarn:0   dfail:0   fail:1   skip:45 
fi-byt-n2820 failed to collect. IGT log at Patchwork_2494/fi-byt-n2820/igt.log

Results at /archive/results/CI_IGT_test/Patchwork_2494/

5986f290e25f42d3d5df390411cc43683deb1301 drm-intel-nightly: 2016y-09m-08d-09h-11m-50s UTC integration manifest
bbd9747 drm/i915: Emit to ringbuffer directly

* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-08 15:12 [RFC] drm/i915: Emit to ringbuffer directly Tvrtko Ursulin
  2016-09-08 15:54 ` ✗ Fi.CI.BAT: failure for " Patchwork
@ 2016-09-08 16:40 ` Chris Wilson
  2016-09-09  8:32   ` Tvrtko Ursulin
  2016-09-09 16:26 ` ✗ Fi.CI.BAT: failure for drm/i915: Emit to ringbuffer directly (rev2) Patchwork
  2016-09-12 10:19 ` ✓ Fi.CI.BAT: success for drm/i915: Emit to ringbuffer directly (rev3) Patchwork
  3 siblings, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2016-09-08 16:40 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> This removes the usage of intel_ring_emit in favour of
> directly writing to the ring buffer.

I have the same patch! But I called it 'out', for historical reasons.

Oh, except mine uses out[0]...out[N] because gcc prefers that over
*out++ = ...
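
For illustration, a standalone sketch of the two styles (not driver
code; the command dwords are stand-ins): with a fixed base the compiler
can emit plain movs to constant offsets, whereas the post-increment
form has historically tempted gcc into tracking a moving pointer.

	#include <stdint.h>

	typedef uint32_t u32;

	/* Indexed form: four stores at constant offsets from one base. */
	static void emit_indexed(u32 *out)
	{
		out[0] = 0x10400002;	/* stand-in command dword */
		out[1] = 0x104;
		out[2] = 0;
		out[3] = 0;
	}

	/* Post-increment form: the same stores through a moving pointer. */
	static void emit_postinc(u32 *out)
	{
		*out++ = 0x10400002;
		*out++ = 0x104;
		*out++ = 0;
		*out++ = 0;
	}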

> intel_ring_emit was preventing the compiler for optimising
> fetch and increment of the current ring buffer pointer and
> therefore generating very verbose code for every write.
> 
> It had no useful purpose since all ringbuffer operations
> are started and ended with intel_ring_begin and
> intel_ring_advance respectively, with no bail out in the
> middle possible, so it is fine to increment the tail in
> intel_ring_begin and let the code manage the pointer
> itself.
> 
> Useless instruction removal amounts to approximately
> 2384 bytes of saved text on my build.
> 
> Not sure if this has any measurable performance
> implications but executing a ton of useless instructions
> on fast paths cannot be good.

It does show up in perf.
 
> Patch is not fully polished, but it compiles and runs
> on Gen9 at least.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c    |  62 ++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  27 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c        |  57 ++--
>  drivers/gpu/drm/i915/intel_display.c       | 113 ++++---
>  drivers/gpu/drm/i915/intel_lrc.c           | 223 +++++++-------
>  drivers/gpu/drm/i915/intel_mocs.c          |  43 +--
>  drivers/gpu/drm/i915/intel_overlay.c       |  69 ++---
>  drivers/gpu/drm/i915/intel_ringbuffer.c    | 480 +++++++++++++++--------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |  19 +-
>  9 files changed, 555 insertions(+), 538 deletions(-)

Hmm, mine is bigger.

 drivers/gpu/drm/i915/i915_gem_context.c    |  85 ++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  62 +--
 drivers/gpu/drm/i915/i915_gem_request.c    | 135 ++++-
 drivers/gpu/drm/i915/i915_gem_request.h    |   2 +
 drivers/gpu/drm/i915/intel_display.c       | 133 +++--
 drivers/gpu/drm/i915/intel_lrc.c           | 188 ++++---
 drivers/gpu/drm/i915/intel_lrc.h           |   2 -
 drivers/gpu/drm/i915/intel_mocs.c          |  50 +-
 drivers/gpu/drm/i915/intel_overlay.c       |  77 ++-
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 762 ++++++++++++-----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  36 +-
 12 files changed, 721 insertions(+), 848 deletions(-)

(this includes moving the intel_ring_begin to i915_gem_request)

plus an earlier

 drivers/gpu/drm/i915/i915_gem_request.c |  26 ++---
 drivers/gpu/drm/i915/intel_lrc.c        | 121 ++++++++---------------
 drivers/gpu/drm/i915/intel_ringbuffer.c | 168 +++++++++++---------------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  10 +-
 4 files changed, 112 insertions(+), 213 deletions(-)

since I wanted parts of it for emitting timelines.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-08 16:40 ` [RFC] " Chris Wilson
@ 2016-09-09  8:32   ` Tvrtko Ursulin
  2016-09-09 13:20     ` Dave Gordon
                       ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Tvrtko Ursulin @ 2016-09-09  8:32 UTC (permalink / raw)
  To: Chris Wilson, Intel-gfx


On 08/09/16 17:40, Chris Wilson wrote:
> On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> This removes the usage of intel_ring_emit in favour of
>> directly writing to the ring buffer.
>
> I have the same patch! But I called it out, for historical reasons.

Yes, I know we talked about it in the past, but I did not think you would
find time to actually write it amongst all the other things.

> Oh, except mine uses out[0]...out[N] because gcc prefers that over
> *out++ = ...

It copes just fine with the latter here, for example:

	*rbuf++ = cmd;
	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*rbuf++ = 0; /* upper addr */
	*rbuf++ = 0; /* value */

Is:

      3e9:       89 10                   mov    %edx,(%rax)
      3eb:       c7 40 04 04 01 00 00    movl   $0x104,0x4(%rax)
      3f2:       c7 40 08 00 00 00 00    movl   $0x0,0x8(%rax)
      3f9:       c7 40 0c 00 00 00 00    movl   $0x0,0xc(%rax)

And for the record, before this patch, with intel_ring_emit:

      53a:       8b 53 3c                mov    0x3c(%rbx),%edx
      53d:       48 8b 4b 08             mov    0x8(%rbx),%rcx
      541:       89 04 11                mov    %eax,(%rcx,%rdx,1)
      544:       8b 43 3c                mov    0x3c(%rbx),%eax
      547:       48 8b 53 08             mov    0x8(%rbx),%rdx
      54b:       83 c0 04                add    $0x4,%eax
      54e:       89 43 3c                mov    %eax,0x3c(%rbx)
      551:       c7 04 02 04 01 00 00    movl   $0x104,(%rdx,%rax,1)
      558:       8b 43 3c                mov    0x3c(%rbx),%eax
      55b:       48 8b 53 08             mov    0x8(%rbx),%rdx
      55f:       83 c0 04                add    $0x4,%eax
      562:       89 43 3c                mov    %eax,0x3c(%rbx)
      565:       c7 04 02 00 00 00 00    movl   $0x0,(%rdx,%rax,1)
      56c:       8b 43 3c                mov    0x3c(%rbx),%eax
      56f:       48 8b 53 08             mov    0x8(%rbx),%rdx
      573:       83 c0 04                add    $0x4,%eax
      576:       89 43 3c                mov    %eax,0x3c(%rbx)
      579:       c7 04 02 00 00 00 00    movl   $0x0,(%rdx,%rax,1)

Yuck :) At least they are not function calls to iowrite any more. :)

>> intel_ring_emit was preventing the compiler for optimising
>> fetch and increment of the current ring buffer pointer and
>> therefore generating very verbose code for every write.
>>
>> It had no useful purpose since all ringbuffer operations
>> are started and ended with intel_ring_begin and
>> intel_ring_advance respectively, with no bail out in the
>> middle possible, so it is fine to increment the tail in
>> intel_ring_begin and let the code manage the pointer
>> itself.
>>
>> Useless instruction removal amounts to approximately
>> 2384 bytes of saved text on my build.
>>
>> Not sure if this has any measurable performance
>> implications but executing a ton of useless instructions
>> on fast paths cannot be good.
>
> It does show up in perf.

Cool.

>> Patch is not fully polished, but it compiles and runs
>> on Gen9 at least.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem_context.c    |  62 ++--
>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  27 +-
>>   drivers/gpu/drm/i915/i915_gem_gtt.c        |  57 ++--
>>   drivers/gpu/drm/i915/intel_display.c       | 113 ++++---
>>   drivers/gpu/drm/i915/intel_lrc.c           | 223 +++++++-------
>>   drivers/gpu/drm/i915/intel_mocs.c          |  43 +--
>>   drivers/gpu/drm/i915/intel_overlay.c       |  69 ++---
>>   drivers/gpu/drm/i915/intel_ringbuffer.c    | 480 +++++++++++++++--------------
>>   drivers/gpu/drm/i915/intel_ringbuffer.h    |  19 +-
>>   9 files changed, 555 insertions(+), 538 deletions(-)
>
> Hmm, mine is bigger.
>
>   drivers/gpu/drm/i915/i915_gem_context.c    |  85 ++--
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
>   drivers/gpu/drm/i915/i915_gem_gtt.c        |  62 +--
>   drivers/gpu/drm/i915/i915_gem_request.c    | 135 ++++-
>   drivers/gpu/drm/i915/i915_gem_request.h    |   2 +
>   drivers/gpu/drm/i915/intel_display.c       | 133 +++--
>   drivers/gpu/drm/i915/intel_lrc.c           | 188 ++++---
>   drivers/gpu/drm/i915/intel_lrc.h           |   2 -
>   drivers/gpu/drm/i915/intel_mocs.c          |  50 +-
>   drivers/gpu/drm/i915/intel_overlay.c       |  77 ++-
>   drivers/gpu/drm/i915/intel_ringbuffer.c    | 762 ++++++++++++-----------------
>   drivers/gpu/drm/i915/intel_ringbuffer.h    |  36 +-
>   12 files changed, 721 insertions(+), 848 deletions(-)
>
> (this includes moving the intel_ring_begin to i915_gem_request)
>
> plus an ealier
>
>   drivers/gpu/drm/i915/i915_gem_request.c |  26 ++---
>   drivers/gpu/drm/i915/intel_lrc.c        | 121 ++++++++---------------
>   drivers/gpu/drm/i915/intel_ringbuffer.c | 168 +++++++++++---------------------
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  10 +-
>   4 files changed, 112 insertions(+), 213 deletions(-)
>
> since I wanted parts of it for emitting timelines.

Ok what do you want to do?

Regards,

Tvrtko


* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-09  8:32   ` Tvrtko Ursulin
@ 2016-09-09 13:20     ` Dave Gordon
  2016-09-09 13:58       ` Tvrtko Ursulin
  2016-09-09 13:40     ` [RFC] " Chris Wilson
  2016-09-09 13:45     ` Chris Wilson
  2 siblings, 1 reply; 15+ messages in thread
From: Dave Gordon @ 2016-09-09 13:20 UTC (permalink / raw)
  To: Tvrtko Ursulin, Chris Wilson, Intel-gfx

On 09/09/16 09:32, Tvrtko Ursulin wrote:
>
> On 08/09/16 17:40, Chris Wilson wrote:
>> On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>> This removes the usage of intel_ring_emit in favour of
>>> directly writing to the ring buffer.
>>
>> I have the same patch! But I called it out, for historical reasons.
>
> Yes I know we talked about it in the past but I did not think you will
> find time to actually write it amongst all the other things.
>
>> Oh, except mine uses out[0]...out[N] because gcc prefers that over
>> *out++ = ...
>
> It copes just fine with the latter here, for example:
>
>     *rbuf++ = cmd;
>     *rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
>     *rbuf++ = 0; /* upper addr */
>     *rbuf++ = 0; /* value */
>
> Is:
>
>      3e9:       89 10                   mov    %edx,(%rax)
>      3eb:       c7 40 04 04 01 00 00    movl   $0x104,0x4(%rax)
>      3f2:       c7 40 08 00 00 00 00    movl   $0x0,0x8(%rax)
>      3f9:       c7 40 0c 00 00 00 00    movl   $0x0,0xc(%rax)
>
> And for the record, before this patch, with intel_ring_emit:
>
>      53a:       8b 53 3c                mov    0x3c(%rbx),%edx
>      53d:       48 8b 4b 08             mov    0x8(%rbx),%rcx
>      541:       89 04 11                mov    %eax,(%rcx,%rdx,1)

>      544:       8b 43 3c                mov    0x3c(%rbx),%eax
>      547:       48 8b 53 08             mov    0x8(%rbx),%rdx
>      54b:       83 c0 04                add    $0x4,%eax
>      54e:       89 43 3c                mov    %eax,0x3c(%rbx)
>      551:       c7 04 02 04 01 00 00    movl   $0x104,(%rdx,%rax,1)

>      558:       8b 43 3c                mov    0x3c(%rbx),%eax
>      55b:       48 8b 53 08             mov    0x8(%rbx),%rdx
>      55f:       83 c0 04                add    $0x4,%eax
>      562:       89 43 3c                mov    %eax,0x3c(%rbx)
>      565:       c7 04 02 00 00 00 00    movl   $0x0,(%rdx,%rax,1)

>      56c:       8b 43 3c                mov    0x3c(%rbx),%eax
>      56f:       48 8b 53 08             mov    0x8(%rbx),%rdx
>      573:       83 c0 04                add    $0x4,%eax
>      576:       89 43 3c                mov    %eax,0x3c(%rbx)
>      579:       c7 04 02 00 00 00 00    movl   $0x0,(%rdx,%rax,1)
>
> Yuck :) At least they are not function calls to iowrite any more. :)

Curious that the inlining wasn't doing a better job, though. For 
example, it's not preserving %eax as a local cache of 0x3c(%rbx).

>>> intel_ring_emit was preventing the compiler for optimising
>>> fetch and increment of the current ring buffer pointer and
>>> therefore generating very verbose code for every write.
>>>
>>> It had no useful purpose since all ringbuffer operations
>>> are started and ended with intel_ring_begin and
>>> intel_ring_advance respectively, with no bail out in the
>>> middle possible, so it is fine to increment the tail in
>>> intel_ring_begin and let the code manage the pointer
>>> itself.

Or you could have intel_ring_advance() take the updated local and use 
that to update the ring->tail. It could then check that you hadn't 
exceeded your allocation, OR that you had used exactly as much as you'd 
allocated. I'm sure I had a version that did that, long ago.
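
A minimal sketch of that checking variant (ring->reserved is an assumed
bookkeeping field marking the end of the space handed out by
intel_ring_begin(); this is not the driver's actual API):

	static inline void intel_ring_advance(struct intel_ring *ring, u32 *out)
	{
		/* The emitter must hand back a pointer exactly at the end
		 * of its allocation, neither short of it nor past it.
		 */
		WARN_ON((void *)out != ring->vaddr + ring->reserved);
		ring->tail = ring->reserved;
	}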

>>> Useless instruction removal amounts to approximately
>>> 2384 bytes of saved text on my build.
>>>
>>> Not sure if this has any measurable performance
>>> implications but executing a ton of useless instructions
>>> on fast paths cannot be good.
>>
>> It does show up in perf.
>
> Cool.
>
>>> Patch is not fully polished, but it compiles and runs
>>> on Gen9 at least.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gem_context.c    |  62 ++--
>>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  27 +-
>>>   drivers/gpu/drm/i915/i915_gem_gtt.c        |  57 ++--
>>>   drivers/gpu/drm/i915/intel_display.c       | 113 ++++---
>>>   drivers/gpu/drm/i915/intel_lrc.c           | 223 +++++++-------
>>>   drivers/gpu/drm/i915/intel_mocs.c          |  43 +--
>>>   drivers/gpu/drm/i915/intel_overlay.c       |  69 ++---
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c    | 480
>>> +++++++++++++++--------------
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h    |  19 +-
>>>   9 files changed, 555 insertions(+), 538 deletions(-)
>>
>> Hmm, mine is bigger.
>>
>>   drivers/gpu/drm/i915/i915_gem_context.c    |  85 ++--
>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
>>   drivers/gpu/drm/i915/i915_gem_gtt.c        |  62 +--
>>   drivers/gpu/drm/i915/i915_gem_request.c    | 135 ++++-
>>   drivers/gpu/drm/i915/i915_gem_request.h    |   2 +
>>   drivers/gpu/drm/i915/intel_display.c       | 133 +++--
>>   drivers/gpu/drm/i915/intel_lrc.c           | 188 ++++---
>>   drivers/gpu/drm/i915/intel_lrc.h           |   2 -
>>   drivers/gpu/drm/i915/intel_mocs.c          |  50 +-
>>   drivers/gpu/drm/i915/intel_overlay.c       |  77 ++-
>>   drivers/gpu/drm/i915/intel_ringbuffer.c    | 762
>> ++++++++++++-----------------
>>   drivers/gpu/drm/i915/intel_ringbuffer.h    |  36 +-
>>   12 files changed, 721 insertions(+), 848 deletions(-)
>>
>> (this includes moving the intel_ring_begin to i915_gem_request)
>>
>> plus an ealier
>>
>>   drivers/gpu/drm/i915/i915_gem_request.c |  26 ++---
>>   drivers/gpu/drm/i915/intel_lrc.c        | 121 ++++++++---------------
>>   drivers/gpu/drm/i915/intel_ringbuffer.c | 168
>> +++++++++++---------------------
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  10 +-
>>   4 files changed, 112 insertions(+), 213 deletions(-)
>>
>> since I wanted parts of it for emitting timelines.
>
> Ok what do you want to do?
>
> Regards,
> Tvrtko

You are giving up the possibility of wrapping over the end of the 
buffer, but we avoid doing that at present and it doesn't work on some 
hardware, so no great loss.

What about

	rptr = ring_begin(req, len);
	if (unlikely(IS_ERR(rptr)))
		return PTR_ERR(rptr);

rather than

	ret = intel_ring_begin(req, len, &rbuf);
  	if (ret)
  		return ret;

(because returning values through ref parameters is ugly).

Do you have/do you want a Coccinelle script to automate this?

.Dave.

* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-09  8:32   ` Tvrtko Ursulin
  2016-09-09 13:20     ` Dave Gordon
@ 2016-09-09 13:40     ` Chris Wilson
  2016-09-09 13:45     ` Chris Wilson
  2 siblings, 0 replies; 15+ messages in thread
From: Chris Wilson @ 2016-09-09 13:40 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Fri, Sep 09, 2016 at 09:32:50AM +0100, Tvrtko Ursulin wrote:
> 
> On 08/09/16 17:40, Chris Wilson wrote:
> >On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>This removes the usage of intel_ring_emit in favour of
> >>directly writing to the ring buffer.
> >
> >I have the same patch! But I called it out, for historical reasons.
> 
> Yes I know we talked about it in the past but I did not think you
> will find time to actually write it amongst all the other things.
> 
> >Oh, except mine uses out[0]...out[N] because gcc prefers that over
> >*out++ = ...
> 
> It copes just fine with the latter here, for example:
> 
> 	*rbuf++ = cmd;
> 	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
> 	*rbuf++ = 0; /* upper addr */
> 	*rbuf++ = 0; /* value */
> 
> Is:
> 
>      3e9:       89 10                   mov    %edx,(%rax)
>      3eb:       c7 40 04 04 01 00 00    movl   $0x104,0x4(%rax)
>      3f2:       c7 40 08 00 00 00 00    movl   $0x0,0x8(%rax)
>      3f9:       c7 40 0c 00 00 00 00    movl   $0x0,0xc(%rax)

Great. Last time we had a conversation about this, and when we looked at
constructing batchbuffers in userspace, gcc was still generating two
instructions (*ptr followed by ptr++) rather than emitting the mov to a
fixed offset for that sequence.

> >plus an ealier
> >
> >  drivers/gpu/drm/i915/i915_gem_request.c |  26 ++---
> >  drivers/gpu/drm/i915/intel_lrc.c        | 121 ++++++++---------------
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 168 +++++++++++---------------------
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  10 +-
> >  4 files changed, 112 insertions(+), 213 deletions(-)
> >
> >since I wanted parts of it for emitting timelines.
> 
> Ok what do you want to do?

I have plans to use that particular patch soon, but updating
intel_ring_begin() itself is a long way down my list. Given that you have
a patch ready, let's keep going. I'm just curious as to what I did
differently to trim off the extra lines (probably intel_ring_advance()).
The other thing I did was to relax the restriction to only emit
qword-aligned packets (by fixing up the tail for qword alignment on
sealing the request). Also, I would rather the function be expressed as
operating on the request; i915_gem_request_emit() was my choice.
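
As a rough sketch of the qword fixup mentioned above (assuming a
byte-based tail and MI_NOOP being a zero dword; not the actual i915
code):

	static void seal_request_tail(struct intel_ring *ring)
	{
		if (ring->tail & 4) {	/* odd number of dwords emitted */
			*(u32 *)(ring->vaddr + ring->tail) = 0; /* MI_NOOP */
			ring->tail += 4;
		}
	}
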
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-09  8:32   ` Tvrtko Ursulin
  2016-09-09 13:20     ` Dave Gordon
  2016-09-09 13:40     ` [RFC] " Chris Wilson
@ 2016-09-09 13:45     ` Chris Wilson
  2016-09-09 14:14       ` Tvrtko Ursulin
  2 siblings, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2016-09-09 13:45 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Fri, Sep 09, 2016 at 09:32:50AM +0100, Tvrtko Ursulin wrote:
> 
> On 08/09/16 17:40, Chris Wilson wrote:
> >On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>This removes the usage of intel_ring_emit in favour of
> >>directly writing to the ring buffer.
> >
> >I have the same patch! But I called it out, for historical reasons.
> 
> Yes I know we talked about it in the past but I did not think you
> will find time to actually write it amongst all the other things.
> 
> >Oh, except mine uses out[0]...out[N] because gcc prefers that over
> >*out++ = ...
> 
> It copes just fine with the latter here, for example:
> 
> 	*rbuf++ = cmd;
> 	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
> 	*rbuf++ = 0; /* upper addr */
> 	*rbuf++ = 0; /* value */
> 
> Is:
> 
>      3e9:       89 10                   mov    %edx,(%rax)
>      3eb:       c7 40 04 04 01 00 00    movl   $0x104,0x4(%rax)
>      3f2:       c7 40 08 00 00 00 00    movl   $0x0,0x8(%rax)
>      3f9:       c7 40 0c 00 00 00 00    movl   $0x0,0xc(%rax)

Last time Dave suggested using something like

i915_gem_request_emit(req, (struct cmd_packet){ dw0, dw1, dw2 });

I tried mocking something up, but just found gcc was constructing the
struct on the stack and then copying across, and generating far more
code than the sequence above. Worth seeing if that is better (or if my
mockup was just bad).
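
Roughly what such a mock-up might look like (cmd_packet and the helper
are illustrative, not existing API, and it is written against the
error-pointer interface discussed in this thread):

	struct cmd_packet {
		unsigned int len;
		u32 dw[6];
	};

	static int i915_gem_request_emit(struct drm_i915_gem_request *req,
					 struct cmd_packet pkt)
	{
		u32 *out = intel_ring_begin(req, pkt.len);

		if (IS_ERR(out))
			return PTR_ERR(out);

		/* gcc tends to build the compound literal on the caller's
		 * stack and copy it here, which is the extra code observed.
		 */
		memcpy(out, pkt.dw, pkt.len * sizeof(u32));
		intel_ring_advance(req->ring, out + pkt.len);
		return 0;
	}

A call site would then look like the compound-literal line quoted above,
e.g. i915_gem_request_emit(req, (struct cmd_packet){ .len = 3, .dw = {
dw0, dw1, dw2 } });
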
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-09 13:20     ` Dave Gordon
@ 2016-09-09 13:58       ` Tvrtko Ursulin
  2016-09-09 15:52         ` [RFC v2] " Tvrtko Ursulin
  0 siblings, 1 reply; 15+ messages in thread
From: Tvrtko Ursulin @ 2016-09-09 13:58 UTC (permalink / raw)
  To: Dave Gordon, Chris Wilson, Intel-gfx


On 09/09/16 14:20, Dave Gordon wrote:
> On 09/09/16 09:32, Tvrtko Ursulin wrote:
>>
>> On 08/09/16 17:40, Chris Wilson wrote:
>>> On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> This removes the usage of intel_ring_emit in favour of
>>>> directly writing to the ring buffer.
>>>
>>> I have the same patch! But I called it out, for historical reasons.
>>
>> Yes I know we talked about it in the past but I did not think you will
>> find time to actually write it amongst all the other things.
>>
>>> Oh, except mine uses out[0]...out[N] because gcc prefers that over
>>> *out++ = ...
>>
>> It copes just fine with the latter here, for example:
>>
>>     *rbuf++ = cmd;
>>     *rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
>>     *rbuf++ = 0; /* upper addr */
>>     *rbuf++ = 0; /* value */
>>
>> Is:
>>
>>      3e9:       89 10                   mov    %edx,(%rax)
>>      3eb:       c7 40 04 04 01 00 00    movl   $0x104,0x4(%rax)
>>      3f2:       c7 40 08 00 00 00 00    movl   $0x0,0x8(%rax)
>>      3f9:       c7 40 0c 00 00 00 00    movl   $0x0,0xc(%rax)
>>
>> And for the record, before this patch, with intel_ring_emit:
>>
>>      53a:       8b 53 3c                mov    0x3c(%rbx),%edx
>>      53d:       48 8b 4b 08             mov    0x8(%rbx),%rcx
>>      541:       89 04 11                mov    %eax,(%rcx,%rdx,1)
>
>>      544:       8b 43 3c                mov    0x3c(%rbx),%eax
>>      547:       48 8b 53 08             mov    0x8(%rbx),%rdx
>>      54b:       83 c0 04                add    $0x4,%eax
>>      54e:       89 43 3c                mov    %eax,0x3c(%rbx)
>>      551:       c7 04 02 04 01 00 00    movl   $0x104,(%rdx,%rax,1)
>
>>      558:       8b 43 3c                mov    0x3c(%rbx),%eax
>>      55b:       48 8b 53 08             mov    0x8(%rbx),%rdx
>>      55f:       83 c0 04                add    $0x4,%eax
>>      562:       89 43 3c                mov    %eax,0x3c(%rbx)
>>      565:       c7 04 02 00 00 00 00    movl   $0x0,(%rdx,%rax,1)
>
>>      56c:       8b 43 3c                mov    0x3c(%rbx),%eax
>>      56f:       48 8b 53 08             mov    0x8(%rbx),%rdx
>>      573:       83 c0 04                add    $0x4,%eax
>>      576:       89 43 3c                mov    %eax,0x3c(%rbx)
>>      579:       c7 04 02 00 00 00 00    movl   $0x0,(%rdx,%rax,1)
>>
>> Yuck :) At least they are not function calls to iowrite any more. :)
>
> Curious that the inlining wasn't doing a better job, though. For
> example, it's not preserving %eax as a local cache of 0x3c(%rbx).

Yeah I don't know. Even by employing the restrict keyword in various 
ways I couldn't make it do a better job.
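
One plausible explanation, illustrated standalone (not driver code): the
ring is written through a plain byte-based pointer, so the compiler must
assume each store may alias the ring structure itself and reload
vaddr/tail afterwards.

	#include <stdint.h>

	typedef uint32_t u32;

	struct ring {
		char *vaddr;
		u32 tail;
	};

	static void emit(struct ring *ring, u32 data)
	{
		/* For all the compiler knows, this store may overwrite
		 * ring->tail or ring->vaddr, so both get reloaded on the
		 * next emit.
		 */
		*(u32 *)(ring->vaddr + ring->tail) = data;
		ring->tail += sizeof(u32);
	}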

>>>> intel_ring_emit was preventing the compiler for optimising
>>>> fetch and increment of the current ring buffer pointer and
>>>> therefore generating very verbose code for every write.
>>>>
>>>> It had no useful purpose since all ringbuffer operations
>>>> are started and ended with intel_ring_begin and
>>>> intel_ring_advance respectively, with no bail out in the
>>>> middle possible, so it is fine to increment the tail in
>>>> intel_ring_begin and let the code manage the pointer
>>>> itself.
>
> Or you could have intel_ring_advance() take the updated local and use
> that to update the ring->tail. It could then check that you hadn't
> exceeded your allocation, OR that you had used exactly as much as you'd
> allocated. I'm sure I had a version that did that, long ago.

Sounds good to me.

>>>> Useless instruction removal amounts to approximately
>>>> 2384 bytes of saved text on my build.
>>>>
>>>> Not sure if this has any measurable performance
>>>> implications but executing a ton of useless instructions
>>>> on fast paths cannot be good.
>>>
>>> It does show up in perf.
>>
>> Cool.
>>
>>>> Patch is not fully polished, but it compiles and runs
>>>> on Gen9 at least.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> ---
>>>>   drivers/gpu/drm/i915/i915_gem_context.c    |  62 ++--
>>>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  27 +-
>>>>   drivers/gpu/drm/i915/i915_gem_gtt.c        |  57 ++--
>>>>   drivers/gpu/drm/i915/intel_display.c       | 113 ++++---
>>>>   drivers/gpu/drm/i915/intel_lrc.c           | 223 +++++++-------
>>>>   drivers/gpu/drm/i915/intel_mocs.c          |  43 +--
>>>>   drivers/gpu/drm/i915/intel_overlay.c       |  69 ++---
>>>>   drivers/gpu/drm/i915/intel_ringbuffer.c    | 480
>>>> +++++++++++++++--------------
>>>>   drivers/gpu/drm/i915/intel_ringbuffer.h    |  19 +-
>>>>   9 files changed, 555 insertions(+), 538 deletions(-)
>>>
>>> Hmm, mine is bigger.
>>>
>>>   drivers/gpu/drm/i915/i915_gem_context.c    |  85 ++--
>>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
>>>   drivers/gpu/drm/i915/i915_gem_gtt.c        |  62 +--
>>>   drivers/gpu/drm/i915/i915_gem_request.c    | 135 ++++-
>>>   drivers/gpu/drm/i915/i915_gem_request.h    |   2 +
>>>   drivers/gpu/drm/i915/intel_display.c       | 133 +++--
>>>   drivers/gpu/drm/i915/intel_lrc.c           | 188 ++++---
>>>   drivers/gpu/drm/i915/intel_lrc.h           |   2 -
>>>   drivers/gpu/drm/i915/intel_mocs.c          |  50 +-
>>>   drivers/gpu/drm/i915/intel_overlay.c       |  77 ++-
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c    | 762
>>> ++++++++++++-----------------
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h    |  36 +-
>>>   12 files changed, 721 insertions(+), 848 deletions(-)
>>>
>>> (this includes moving the intel_ring_begin to i915_gem_request)
>>>
>>> plus an ealier
>>>
>>>   drivers/gpu/drm/i915/i915_gem_request.c |  26 ++---
>>>   drivers/gpu/drm/i915/intel_lrc.c        | 121 ++++++++---------------
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c | 168
>>> +++++++++++---------------------
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  10 +-
>>>   4 files changed, 112 insertions(+), 213 deletions(-)
>>>
>>> since I wanted parts of it for emitting timelines.
>>
>> Ok what do you want to do?
>>
>> Regards,
>> Tvrtko
>
> You are giving up the possibility of wrapping over the end of the
> buffer, but we avoid doing that at present and it doesn't work on some
> hardware, so no great loss.
>
> What about
>
>      rptr = ring_begin(req, len);
>      if (unlikely(IS_ERR(rptr))
>          return PTR_ERR(rptr);
>
> rather than
>
>      ret = intel_ring_begin(req, len, &rbuf);
>       if (ret)
>           return ret;
>
> (because returning values through ref parameters is ugly).

I tried that first but then just got reminded how error pointers make
gcc generate more verbose code, especially in this case where it would
do ERR_PTR followed by PTR_ERR for nothing. So I thought, since I am
shrinking it anyway, let's go the whole hog.
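
The round trip in question, as it looks with the error-pointer
interface:

	rbuf = intel_ring_begin(req, 4);
	if (IS_ERR(rbuf))		/* callee encoded the errno with ERR_PTR()... */
		return PTR_ERR(rbuf);	/* ...caller immediately decodes it back */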

> Do you have/do you want a Coccinelle script to automate this?

I don't, and why not? The more Coccinelle examples in the public domain,
the better. :)

Regards,

Tvrtko

* Re: [RFC] drm/i915: Emit to ringbuffer directly
  2016-09-09 13:45     ` Chris Wilson
@ 2016-09-09 14:14       ` Tvrtko Ursulin
  0 siblings, 0 replies; 15+ messages in thread
From: Tvrtko Ursulin @ 2016-09-09 14:14 UTC (permalink / raw)
  To: Chris Wilson, Intel-gfx



On 09/09/16 14:45, Chris Wilson wrote:
> On Fri, Sep 09, 2016 at 09:32:50AM +0100, Tvrtko Ursulin wrote:
>>
>> On 08/09/16 17:40, Chris Wilson wrote:
>>> On Thu, Sep 08, 2016 at 04:12:55PM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> This removes the usage of intel_ring_emit in favour of
>>>> directly writing to the ring buffer.
>>>
>>> I have the same patch! But I called it out, for historical reasons.
>>
>> Yes I know we talked about it in the past but I did not think you
>> will find time to actually write it amongst all the other things.
>>
>>> Oh, except mine uses out[0]...out[N] because gcc prefers that over
>>> *out++ = ...
>>
>> It copes just fine with the latter here, for example:
>>
>> 	*rbuf++ = cmd;
>> 	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
>> 	*rbuf++ = 0; /* upper addr */
>> 	*rbuf++ = 0; /* value */
>>
>> Is:
>>
>>       3e9:       89 10                   mov    %edx,(%rax)
>>       3eb:       c7 40 04 04 01 00 00    movl   $0x104,0x4(%rax)
>>       3f2:       c7 40 08 00 00 00 00    movl   $0x0,0x8(%rax)
>>       3f9:       c7 40 0c 00 00 00 00    movl   $0x0,0xc(%rax)
>
> Last time Dave suggested using something like
>
> i915_gem_request_emit(req, (struct cmd_packet){ dw0, dw1, dw2 });
>
> I tried mocking something up, but just found gcc was constructing the
> struct on the stack and then copying across, and generating far more
> code than the sequence above. Worth seeing if that is better (or if my
> mockup was just bad).

Not sure that I like that. It would be a bit ugly in cases where batches 
are built dynamically, no? Perhaps I am misunderstanding the idea?

Regards,

Tvrtko

* [RFC v2] drm/i915: Emit to ringbuffer directly
  2016-09-09 13:58       ` Tvrtko Ursulin
@ 2016-09-09 15:52         ` Tvrtko Ursulin
  2016-09-09 16:04           ` Chris Wilson
  0 siblings, 1 reply; 15+ messages in thread
From: Tvrtko Ursulin @ 2016-09-09 15:52 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

This removes the usage of intel_ring_emit in favour of
directly writing to the ring buffer.

intel_ring_emit was preventing the compiler from optimising
fetch and increment of the current ring buffer pointer and
therefore generating very verbose code for every write.

It had no useful purpose since all ringbuffer operations
are started and ended with intel_ring_begin and
intel_ring_advance respectively, with no bail out in the
middle possible, so it is fine to increment the tail in
intel_ring_begin and let the code manage the pointer
itself.

Useless instruction removal amounts to approximately
two and a half kilobytes of saved text on my build.

Not sure if this has any measurable performance
implications but executing a ton of useless instructions
on fast paths cannot be good.

Patch is not fully polished, but it compiles and runs
on Gen9 at least.

v2:
 * Change return from intel_ring_begin to error pointer by
   popular demand.
 * Move tail increment to intel_ring_advance to enable some
   error checking.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c    |  72 ++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  70 ++--
 drivers/gpu/drm/i915/intel_display.c       | 137 +++---
 drivers/gpu/drm/i915/intel_lrc.c           | 231 +++++------
 drivers/gpu/drm/i915/intel_mocs.c          |  53 ++-
 drivers/gpu/drm/i915/intel_overlay.c       |  88 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 642 ++++++++++++++---------------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  17 +-
 9 files changed, 658 insertions(+), 689 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 35950ee46a1d..a6193cda743f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -577,7 +577,6 @@ static inline int
 mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 {
 	struct drm_i915_private *dev_priv = req->i915;
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
 	u32 flags = hw_flags | MI_MM_SPACE_GTT;
 	const int num_rings =
@@ -585,6 +584,7 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 		i915.semaphores ?
 		INTEL_INFO(dev_priv)->num_rings - 1 :
 		0;
+	u32 *rbuf;
 	int len, ret;
 
 	/* w/a: If Flush TLB Invalidation Mode is enabled, driver must do a TLB
@@ -609,70 +609,61 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 	if (INTEL_GEN(dev_priv) >= 7)
 		len += 2 + (num_rings ? 4*num_rings + 6 : 0);
 
-	ret = intel_ring_begin(req, len);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, len);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
 	if (INTEL_GEN(dev_priv) >= 7) {
-		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+		*rbuf++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 		if (num_rings) {
 			struct intel_engine_cs *signaller;
 
-			intel_ring_emit(ring,
-					MI_LOAD_REGISTER_IMM(num_rings));
+			*rbuf++ = MI_LOAD_REGISTER_IMM(num_rings);
 			for_each_engine(signaller, dev_priv) {
 				if (signaller == engine)
 					continue;
 
-				intel_ring_emit_reg(ring,
-						    RING_PSMI_CTL(signaller->mmio_base));
-				intel_ring_emit(ring,
-						_MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+				*rbuf++ = RING_PSMI_CTL(signaller->mmio_base).reg;
+				*rbuf++ = _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE);
 			}
 		}
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_SET_CONTEXT);
-	intel_ring_emit(ring,
-			i915_ggtt_offset(req->ctx->engine[RCS].state) | flags);
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_SET_CONTEXT;
+	*rbuf++ = i915_ggtt_offset(req->ctx->engine[RCS].state) | flags;
 	/*
 	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
 	 * WaMiSetContext_Hang:snb,ivb,vlv
 	 */
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
 	if (INTEL_GEN(dev_priv) >= 7) {
 		if (num_rings) {
 			struct intel_engine_cs *signaller;
 			i915_reg_t last_reg = {}; /* keep gcc quiet */
 
-			intel_ring_emit(ring,
-					MI_LOAD_REGISTER_IMM(num_rings));
+			*rbuf++ = MI_LOAD_REGISTER_IMM(num_rings);
 			for_each_engine(signaller, dev_priv) {
 				if (signaller == engine)
 					continue;
 
 				last_reg = RING_PSMI_CTL(signaller->mmio_base);
-				intel_ring_emit_reg(ring, last_reg);
-				intel_ring_emit(ring,
-						_MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+				*rbuf++ = last_reg.reg;
+				*rbuf++ = _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE);
 			}
 
 			/* Insert a delay before the next switch! */
-			intel_ring_emit(ring,
-					MI_STORE_REGISTER_MEM |
-					MI_SRM_LRM_GLOBAL_GTT);
-			intel_ring_emit_reg(ring, last_reg);
-			intel_ring_emit(ring,
-					i915_ggtt_offset(engine->scratch));
-			intel_ring_emit(ring, MI_NOOP);
+			*rbuf++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
+			*rbuf++ = last_reg.reg;
+			*rbuf++ = i915_ggtt_offset(engine->scratch);
+			*rbuf++ = MI_NOOP;
 		}
-		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+		*rbuf++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
-	return ret;
+	return 0;
 }
@@ -680,28 +671,29 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 static int remap_l3(struct drm_i915_gem_request *req, int slice)
 {
 	u32 *remap_info = req->i915->l3_parity.remap_info[slice];
-	struct intel_ring *ring = req->ring;
-	int i, ret;
+	u32 *rbuf;
+	int i;
 
 	if (!remap_info)
 		return 0;
 
-	ret = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/*
 	 * Note: We do not worry about the concurrent register cacheline hang
 	 * here because no other code should access these registers other than
 	 * at initialization time.
 	 */
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
 	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
-		intel_ring_emit_reg(ring, GEN7_L3LOG(slice, i));
-		intel_ring_emit(ring, remap_info[i]);
+		*rbuf++ = GEN7_L3LOG(slice, i).reg;
+		*rbuf++ = remap_info[i];
 	}
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9432d4ce9ffb..50954c0bc725 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1358,25 +1358,25 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
-	int ret, i;
+	u32 *rbuf;
+	int i;
 
 	if (!IS_GEN7(req->i915) || req->engine->id != RCS) {
 		DRM_DEBUG("sol reset is gen7/rcs only\n");
 		return -EINVAL;
 	}
 
-	ret = intel_ring_begin(req, 4 * 3);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4 * 3);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for (i = 0; i < 4; i++) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, GEN7_SO_WRITE_OFFSET(i));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = GEN7_SO_WRITE_OFFSET(i).reg;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1483,17 +1483,18 @@ execbuf_submit(struct i915_execbuffer_params *params,
 
 	if (params->engine->id == RCS &&
 	    instp_mode != dev_priv->relative_constants_mode) {
-		struct intel_ring *ring = params->request->ring;
+		u32 *rbuf;
 
-		ret = intel_ring_begin(params->request, 4);
-		if (ret)
-			return ret;
+		rbuf = intel_ring_begin(params->request, 4);
+		if (IS_ERR(rbuf))
+			return PTR_ERR(rbuf);
+
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = INSTPM.reg;
+		*rbuf++ = instp_mask << 16 | instp_mode;
 
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, INSTPM);
-		intel_ring_emit(ring, instp_mask << 16 | instp_mode);
-		intel_ring_advance(ring);
+		intel_ring_advance(params->request->ring, rbuf);
 
 		dev_priv->relative_constants_mode = instp_mode;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index e16c38086abe..80186b28bed6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -663,23 +663,23 @@ static int gen8_write_pdp(struct drm_i915_gem_request *req,
 			  unsigned entry,
 			  dma_addr_t addr)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
-	int ret;
+	u32 *rbuf;
 
 	BUG_ON(entry >= 4);
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, entry));
-	intel_ring_emit(ring, upper_32_bits(addr));
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, entry));
-	intel_ring_emit(ring, lower_32_bits(addr));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+	*rbuf++ = GEN8_RING_PDP_UDW(engine, entry).reg;
+	*rbuf++ = upper_32_bits(addr);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+	*rbuf++ = GEN8_RING_PDP_LDW(engine, entry).reg;
+	*rbuf++ = lower_32_bits(addr);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1655,8 +1655,8 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 			 struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1664,17 +1664,18 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_LOAD_REGISTER_IMM(2);
+	*rbuf++ = RING_PP_DIR_DCLV(engine).reg;
+	*rbuf++ = PP_DIR_DCLV_2G;
+	*rbuf++ = RING_PP_DIR_BASE(engine).reg;
+	*rbuf++ = get_pd_offset(ppgtt);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
-	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
-	intel_ring_emit(ring, PP_DIR_DCLV_2G);
-	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
-	intel_ring_emit(ring, get_pd_offset(ppgtt));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1682,8 +1683,8 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 			  struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1691,17 +1692,18 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_LOAD_REGISTER_IMM(2);
+	*rbuf++ = RING_PP_DIR_DCLV(engine).reg;
+	*rbuf++ = PP_DIR_DCLV_2G;
+	*rbuf++ = RING_PP_DIR_BASE(engine).reg;
+	*rbuf++ = get_pd_offset(ppgtt);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
-	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
-	intel_ring_emit(ring, PP_DIR_DCLV_2G);
-	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
-	intel_ring_emit(ring, get_pd_offset(ppgtt));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	/* XXX: RCS is the only one to auto invalidate the TLBs? */
 	if (engine->id != RCS) {
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 0c65212781e4..3d083be10a47 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11679,14 +11679,12 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
-	int ret;
+	u32 flip_mask, *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* Can't queue multiple flips, so wait for the previous
 	 * one to finish before executing the next.
@@ -11695,13 +11693,14 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, 0); /* aux display base address, unused */
+
+	*rbuf++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = 0; /* aux display base address, unused */
 
 	return 0;
 }
@@ -11713,26 +11712,25 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
-	int ret;
+	u32 flip_mask, *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	if (intel_crtc->plane)
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, MI_NOOP);
+
+	*rbuf++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_DISPLAY_FLIP_I915 |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = MI_NOOP;
 
 	return 0;
 }
@@ -11744,25 +11742,24 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t pf, pipesrc;
-	int ret;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* i965+ uses the linear or tiled offsets from the
 	 * Display Registers (which do not change across a page-flip)
 	 * so we need only reprogram the base address.
 	 */
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
 
 	/* XXX Enabling the panel-fitter across page-flip is so far
 	 * untested on non-native modes, so ignore it for now.
@@ -11770,7 +11767,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*rbuf++ = pf | pipesrc;
 
 	return 0;
 }
@@ -11782,21 +11779,20 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t pf, pipesrc;
-	int ret;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0] |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
 
 	/* Contrary to the suggestions in the documentation,
 	 * "Enable Panel Fitter" does not seem to be required when page
@@ -11806,7 +11802,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*rbuf++ = pf | pipesrc;
 
 	return 0;
 }
@@ -11818,8 +11814,8 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t plane_bit = 0;
 	int len, ret;
 
@@ -11864,9 +11860,9 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, len);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, len);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* Unmask the flip-done completion message. Note that the bspec says that
 	 * we should do this for both the BCS and RCS, and that we must not unmask
@@ -11878,31 +11874,30 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	 * to zero does lead to lockups within MI_DISPLAY_FLIP.
 	 */
 	if (req->engine->id == RCS) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring, ~(DERRMR_PIPEA_PRI_FLIP_DONE |
-					  DERRMR_PIPEB_PRI_FLIP_DONE |
-					  DERRMR_PIPEC_PRI_FLIP_DONE));
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = DERRMR.reg;
+		*rbuf++ = ~(DERRMR_PIPEA_PRI_FLIP_DONE |
+			   DERRMR_PIPEB_PRI_FLIP_DONE |
+			   DERRMR_PIPEC_PRI_FLIP_DONE);
 		if (IS_GEN8(dev))
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8 |
-					      MI_SRM_LRM_GLOBAL_GTT);
+			*rbuf++ = MI_STORE_REGISTER_MEM_GEN8 |
+				  MI_SRM_LRM_GLOBAL_GTT;
 		else
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM |
-					      MI_SRM_LRM_GLOBAL_GTT);
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring,
-				i915_ggtt_offset(req->engine->scratch) + 256);
+			*rbuf++ = MI_STORE_REGISTER_MEM |
+				  MI_SRM_LRM_GLOBAL_GTT;
+		*rbuf++ = DERRMR.reg;
+		*rbuf++ = i915_ggtt_offset(req->engine->scratch) + 256;
 		if (IS_GEN8(dev)) {
-			intel_ring_emit(ring, 0);
-			intel_ring_emit(ring, MI_NOOP);
+			*rbuf++ = 0;
+			*rbuf++ = MI_NOOP;
 		}
 	}
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, (MI_NOOP));
+	*rbuf++ = MI_DISPLAY_FLIP_I915 | plane_bit;
+	*rbuf++ = fb->pitches[0] |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = MI_NOOP;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 92bfe47ad33c..b21ebf4e08f2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -646,6 +646,7 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 {
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_context *ce = &request->ctx->engine[engine->id];
+	u32 *rbuf;
 	int ret;
 
 	/* Flush enough space to reduce the likelihood of waiting after
@@ -677,9 +678,11 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(request, 0);
-	if (ret)
+	rbuf = intel_ring_begin(request, 0);
+	if (IS_ERR(rbuf)) {
+		ret = PTR_ERR(rbuf);
 		goto err_unpin;
+	}
 
 	if (!ce->initialised) {
 		ret = engine->init_context(request);
@@ -714,12 +717,13 @@ err_unpin:
  * point, the tail *inside* the context is updated and the ELSP written to.
  */
 static int
-intel_logical_ring_advance(struct drm_i915_gem_request *request)
+intel_logical_ring_advance(struct drm_i915_gem_request *request, u32 *rbuf)
 {
 	struct intel_ring *ring = request->ring;
 	struct intel_engine_cs *engine = request->engine;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(ring, rbuf);
+
 	request->tail = ring->tail;
 
 	/*
@@ -728,9 +732,10 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	 *
 	 * Caller must reserve WA_TAIL_DWORDS for us!
 	 */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(ring, rbuf);
 
 	/* We keep the previous context alive until we retire the following
 	 * request. This ensures that the context object is still pinned
@@ -837,7 +842,7 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
 	int ret, i;
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	struct i915_workarounds *w = &req->i915->workarounds;
 
 	if (w->count == 0)
@@ -847,18 +852,18 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, w->count * 2 + 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, w->count * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(w->count);
 	for (i = 0; i < w->count; i++) {
-		intel_ring_emit_reg(ring, w->reg[i].addr);
-		intel_ring_emit(ring, w->reg[i].value);
+		*rbuf++ = w->reg[i].addr.reg;
+		*rbuf++ = w->reg[i].value;
 	}
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
@@ -1360,27 +1365,27 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
 {
 	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
 	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
-	int i, ret;
+	u32 *rbuf;
+	int i;
 
-	ret = intel_ring_begin(req, num_lri_cmds * 2 + 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, num_lri_cmds * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(num_lri_cmds));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
 	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
 
-		intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, i));
-		intel_ring_emit(ring, upper_32_bits(pd_daddr));
-		intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, i));
-		intel_ring_emit(ring, lower_32_bits(pd_daddr));
+		*rbuf++ = GEN8_RING_PDP_UDW(engine, i).reg;
+		*rbuf++ = upper_32_bits(pd_daddr);
+		*rbuf++ = GEN8_RING_PDP_LDW(engine, i).reg;
+		*rbuf++ = lower_32_bits(pd_daddr);
 	}
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1389,8 +1394,8 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 			      u64 offset, u32 len,
 			      unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
 	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
+	u32 *rbuf;
 	int ret;
 
 	/* Don't rely in hw updating PDPs, specially in lite-restore.
@@ -1411,19 +1416,20 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* FIXME(BDW): Address space and security selectors. */
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 |
-			(ppgtt<<8) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START_GEN8 |
+		  (ppgtt << 8) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1444,13 +1450,11 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
 
 static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 {
-	struct intel_ring *ring = request->ring;
-	u32 cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(request, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	cmd = MI_FLUSH_DW + 1;
 
@@ -1467,13 +1471,12 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 			cmd |= MI_INVALIDATE_BSD;
 	}
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring,
-			I915_GEM_HWS_SCRATCH_ADDR |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0); /* upper addr */
-	intel_ring_emit(ring, 0); /* value */
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
+	*rbuf++ = 0; /* upper addr */
+	*rbuf++ = 0; /* value */
+
+	intel_ring_advance(request->ring, rbuf);
 
 	return 0;
 }
@@ -1481,13 +1484,10 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 				  u32 mode)
 {
-	struct intel_ring *ring = request->ring;
-	struct intel_engine_cs *engine = request->engine;
 	u32 scratch_addr =
-		i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
+	       i915_ggtt_offset(request->engine->scratch) + 2 * CACHELINE_BYTES;
 	bool vf_flush_wa = false, dc_flush_wa = false;
-	u32 flags = 0;
-	int ret;
+	u32 *rbuf, flags = 0;
 	int len;
 
 	flags |= PIPE_CONTROL_CS_STALL;
@@ -1529,45 +1529,45 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 	if (dc_flush_wa)
 		len += 12;
 
-	ret = intel_ring_begin(request, len);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, len);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	if (vf_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
 	if (dc_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, PIPE_CONTROL_DC_FLUSH_ENABLE);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_DC_FLUSH_ENABLE;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
 
 	if (dc_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_CS_STALL;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(request->ring, rbuf);
 
 	return 0;
 }
@@ -1596,35 +1596,33 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
 
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-	intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-	intel_ring_emit(ring,
-			intel_hws_seqno_address(request->engine) |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, request->fence.seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*rbuf++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+	*rbuf++ = intel_hws_seqno_address(request->engine) |
+		   MI_FLUSH_DW_USE_GTT;
+	*rbuf++ = 0;
+	*rbuf++ = request->fence.seqno;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	return intel_logical_ring_advance(request, rbuf);
 }
 
 static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
@@ -1633,19 +1631,18 @@ static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 	 * need a prior CS_STALL, which is emitted by the flush
 	 * following the batch.
 	 */
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring,
-			(PIPE_CONTROL_GLOBAL_GTT_IVB |
-			 PIPE_CONTROL_CS_STALL |
-			 PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, i915_gem_request_get_seqno(request));
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE;
+	*rbuf++ = intel_hws_seqno_address(request->engine);
+	*rbuf++ = 0;
+	*rbuf++ = i915_gem_request_get_seqno(request);
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*rbuf++ = 0;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	return intel_logical_ring_advance(request, rbuf);
 }
 
 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
index 80bb9247ce66..2bc8ca260451 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -276,23 +276,22 @@ int intel_mocs_init_engine(struct intel_engine_cs *engine)
 static int emit_mocs_control_table(struct drm_i915_gem_request *req,
 				   const struct drm_i915_mocs_table *table)
 {
-	struct intel_ring *ring = req->ring;
 	enum intel_engine_id engine = req->engine->id;
 	unsigned int index;
-	int ret;
+	u32 *rbuf;
 
 	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
 		return -ENODEV;
 
-	ret = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES);
 
 	for (index = 0; index < table->size; index++) {
-		intel_ring_emit_reg(ring, mocs_register(engine, index));
-		intel_ring_emit(ring, table->table[index].control_value);
+		*rbuf++ = mocs_register(engine, index).reg;
+		*rbuf++ = table->table[index].control_value;
 	}
 
 	/*
@@ -304,12 +303,13 @@ static int emit_mocs_control_table(struct drm_i915_gem_request *req,
 	 * that value to all the used entries.
 	 */
 	for (; index < GEN9_NUM_MOCS_ENTRIES; index++) {
-		intel_ring_emit_reg(ring, mocs_register(engine, index));
-		intel_ring_emit(ring, table->table[0].control_value);
+		*rbuf++ = mocs_register(engine, index).reg;
+		*rbuf++ = table->table[0].control_value;
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -336,29 +336,27 @@ static inline u32 l3cc_combine(const struct drm_i915_mocs_table *table,
 static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
 				const struct drm_i915_mocs_table *table)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	unsigned int i;
-	int ret;
 
 	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
 		return -ENODEV;
 
-	ret = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2);
 
 	for (i = 0; i < table->size/2; i++) {
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 2*i, 2*i+1));
+		*rbuf++ = GEN9_LNCFCMOCS(i).reg;
+		*rbuf++ = l3cc_combine(table, 2*i, 2*i+1);
 	}
 
 	if (table->size & 0x01) {
 		/* Odd table size - 1 left over */
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 2*i, 0));
+		*rbuf++ = GEN9_LNCFCMOCS(i).reg;
+		*rbuf++ = l3cc_combine(table, 2*i, 0);
 		i++;
 	}
 
@@ -368,12 +366,13 @@ static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
 	 * they are reserved by the hardware.
 	 */
 	for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) {
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 0, 0));
+		*rbuf++ = GEN9_LNCFCMOCS(i).reg;
+		*rbuf++ = l3cc_combine(table, 0, 0);
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a24bc8c7889f..1de86c316058 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -243,8 +243,7 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
-	int ret;
+	u32 *rbuf;
 
 	WARN_ON(overlay->active);
 	WARN_ON(IS_I830(dev_priv) && !(dev_priv->quirks & QUIRK_PIPEA_FORCE));
@@ -253,20 +252,20 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 4);
-	if (ret) {
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf)) {
 		i915_add_request_no_flush(req);
-		return ret;
+		return PTR_ERR(rbuf);
 	}
 
 	overlay->active = true;
 
-	ring = req->ring;
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_ON);
-	intel_ring_emit(ring, overlay->flip_addr | OFC_UPDATE);
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_ON;
+	*rbuf++ = overlay->flip_addr | OFC_UPDATE;
+	*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return intel_overlay_do_wait_request(overlay, req, NULL);
 }
@@ -277,10 +276,8 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
 	u32 flip_addr = overlay->flip_addr;
-	u32 tmp;
-	int ret;
+	u32 *rbuf, tmp;
 
 	WARN_ON(!overlay->active);
 
@@ -296,16 +293,16 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 2);
-	if (ret) {
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf)) {
 		i915_add_request_no_flush(req);
-		return ret;
+		return PTR_ERR(rbuf);
 	}
 
-	ring = req->ring;
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE);
-	intel_ring_emit(ring, flip_addr);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
+	*rbuf++ = flip_addr;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	intel_overlay_submit_request(overlay, req, NULL);
 
@@ -355,9 +352,8 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
 	u32 flip_addr = overlay->flip_addr;
-	int ret;
+	u32 *rbuf;
 
 	WARN_ON(!overlay->active);
 
@@ -371,31 +367,30 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 6);
-	if (ret) {
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf)) {
 		i915_add_request_no_flush(req);
-		return ret;
+		return PTR_ERR(rbuf);
 	}
 
-	ring = req->ring;
 	/* wait for overlay to go idle */
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE);
-	intel_ring_emit(ring, flip_addr);
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
+	*rbuf++ = flip_addr;
+	*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
 	/* turn overlay off */
 	if (IS_I830(dev_priv)) {
 		/* Workaround: Don't disable the overlay fully, since otherwise
 		 * it dies on the next OVERLAY_ON cmd. */
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_NOOP;
 	} else {
-		intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_OFF);
-		intel_ring_emit(ring, flip_addr);
-		intel_ring_emit(ring,
-				MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
+		*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_OFF;
+		*rbuf++ = flip_addr;
+		*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return intel_overlay_do_wait_request(overlay, req,
 					     intel_overlay_off_tail);
@@ -429,23 +424,22 @@ static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
 	if (I915_READ(ISR) & I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT) {
 		/* synchronous slowpath */
 		struct drm_i915_gem_request *req;
-		struct intel_ring *ring;
+		u32 *rbuf;
 
 		req = alloc_request(overlay);
 		if (IS_ERR(req))
 			return PTR_ERR(req);
 
-		ret = intel_ring_begin(req, 2);
-		if (ret) {
+		rbuf = intel_ring_begin(req, 2);
+		if (IS_ERR(rbuf)) {
 			i915_add_request_no_flush(req);
-			return ret;
+			return PTR_ERR(rbuf);
 		}
 
-		ring = req->ring;
-		intel_ring_emit(ring,
-				MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_advance(ring);
+		*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
+		*rbuf++ = MI_NOOP;
+
+		intel_ring_advance(req->ring, rbuf);
 
 		ret = intel_overlay_do_wait_request(overlay, req,
 						    intel_overlay_release_old_vid_tail);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index fd8fcc6ec970..ac7837daa3bf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -61,22 +61,21 @@ void intel_ring_update_space(struct intel_ring *ring)
 static int
 gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	u32 cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
 	cmd = MI_FLUSH;
 
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_READ_FLUSH;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = cmd;
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -84,9 +83,7 @@ gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	u32 cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
 	/*
 	 * read/write caches:
@@ -123,13 +120,14 @@ gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 			cmd |= MI_INVALIDATE_ISP;
 	}
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -174,35 +172,36 @@ gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
-	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
-			PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0); /* low dword */
-	intel_ring_emit(ring, 0); /* high dword */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(5);
+	*rbuf++ = PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0; /* low dword */
+	*rbuf++ = 0; /* high dword */
+	*rbuf++ = MI_NOOP;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	intel_ring_advance(req->ring, rbuf);
+
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
-	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(5);
+	*rbuf++ = PIPE_CONTROL_QW_WRITE;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -210,10 +209,9 @@ intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 static int
 gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	u32 flags = 0;
+	u32 *rbuf, flags = 0;
 	int ret;
 
 	/* Force SNB workarounds for PIPE_CONTROL flushes */
@@ -247,15 +245,16 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -263,20 +262,19 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring,
-			PIPE_CONTROL_CS_STALL |
-			PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -284,11 +282,9 @@ gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 static int
 gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	u32 flags = 0;
-	int ret;
+	u32 *rbuf, flags = 0;
 
 	/*
 	 * Ensure that any following seqno writes only happen when the render
@@ -332,15 +328,16 @@ gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 		gen7_render_ring_cs_stall_wa(req);
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -349,20 +346,20 @@ static int
 gen8_emit_pipe_control(struct drm_i915_gem_request *req,
 		       u32 flags, u32 scratch_addr)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -615,8 +612,8 @@ out:
 
 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct i915_workarounds *w = &req->i915->workarounds;
+	u32 *rbuf;
 	int ret, i;
 
 	if (w->count == 0)
@@ -626,18 +623,18 @@ static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, (w->count * 2 + 2));
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, w->count * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(w->count);
 	for (i = 0; i < w->count; i++) {
-		intel_ring_emit_reg(ring, w->reg[i].addr);
-		intel_ring_emit(ring, w->reg[i].value);
+		*rbuf++ = w->reg[i].addr.reg;
+		*rbuf++ = w->reg[i].value;
 	}
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
@@ -1263,87 +1260,81 @@ static void render_ring_cleanup(struct intel_engine_cs *engine)
 
 static int gen8_rcs_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
-	int ret, num_rings;
+	u32 *rbuf;
+	int num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 8);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, (num_rings-1) * 8);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for_each_engine_id(waiter, dev_priv, id) {
 		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;
 
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring,
-				PIPE_CONTROL_GLOBAL_GTT_IVB |
-				PIPE_CONTROL_QW_WRITE |
-				PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, lower_32_bits(gtt_offset));
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->fence.seqno);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_GLOBAL_GTT_IVB |
+			  PIPE_CONTROL_QW_WRITE |
+			  PIPE_CONTROL_CS_STALL;
+		*rbuf++ = lower_32_bits(gtt_offset);
+		*rbuf++ = upper_32_bits(gtt_offset);
+		*rbuf++ = req->fence.seqno;
+		*rbuf++ = 0;
+		*rbuf++ = MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id);
+		*rbuf++ = 0;
 	}
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
 
 static int gen8_xcs_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
-	int ret, num_rings;
+	u32 *rbuf;
+	int num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, (num_rings-1) * 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for_each_engine_id(waiter, dev_priv, id) {
 		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;
 
-		intel_ring_emit(ring,
-				(MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-		intel_ring_emit(ring,
-				lower_32_bits(gtt_offset) |
-				MI_FLUSH_DW_USE_GTT);
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->fence.seqno);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+		*rbuf++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
+		*rbuf++ = upper_32_bits(gtt_offset);
+		*rbuf++ = req->fence.seqno;
+		*rbuf++ = MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id);
+		*rbuf++ = 0;
 	}
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
 
 static int gen6_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *engine;
-	int ret, num_rings;
+	u32 *rbuf;
+	int num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for_each_engine(engine, dev_priv) {
 		i915_reg_t mbox_reg;
@@ -1353,16 +1344,17 @@ static int gen6_signal(struct drm_i915_gem_request *req)
 
 		mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
 		if (i915_mmio_reg_valid(mbox_reg)) {
-			intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-			intel_ring_emit_reg(ring, mbox_reg);
-			intel_ring_emit(ring, req->fence.seqno);
+			*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+			*rbuf++ = mbox_reg.reg;
+			*rbuf++ = req->fence.seqno;
 		}
 	}
 
 	/* If num_dwords was rounded, make sure the tail pointer is correct */
 	if (num_rings % 2 == 0)
-		intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+		*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1378,17 +1370,18 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 static int i9xx_emit_request(struct drm_i915_gem_request *req)
 {
 	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_STORE_DWORD_INDEX;
+	*rbuf++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*rbuf++ = req->fence.seqno;
+	*rbuf++ = MI_USER_INTERRUPT;
 
-	intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
-	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-	intel_ring_emit(ring, req->fence.seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_advance(ring);
+	intel_ring_advance(ring, rbuf);
 
 	req->tail = ring->tail;
 
@@ -1418,6 +1411,7 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req)
 {
 	struct intel_engine_cs *engine = req->engine;
 	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
 	if (engine->semaphore.signal) {
@@ -1426,22 +1420,23 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req)
 			return ret;
 	}
 
-	ret = intel_ring_begin(req, 8);
-	if (ret)
-		return ret;
-
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB |
-			       PIPE_CONTROL_CS_STALL |
-			       PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, i915_gem_request_get_seqno(req));
+	rbuf = intel_ring_begin(req, 8);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+		  PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE);
+	*rbuf++ = intel_hws_seqno_address(engine);
+	*rbuf++ = 0;
+	*rbuf++ = i915_gem_request_get_seqno(req);
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = 0;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(ring, rbuf);
 
 	req->tail = ring->tail;
 
@@ -1460,24 +1455,23 @@ static int
 gen8_ring_sync_to(struct drm_i915_gem_request *req,
 		  struct drm_i915_gem_request *signal)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	u64 offset = GEN8_WAIT_OFFSET(req->engine, signal->engine->id);
 	struct i915_hw_ppgtt *ppgtt;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_SEMAPHORE_WAIT |
-			MI_SEMAPHORE_GLOBAL_GTT |
-			MI_SEMAPHORE_SAD_GTE_SDD);
-	intel_ring_emit(ring, signal->fence.seqno);
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_SEMAPHORE_WAIT |
+		  MI_SEMAPHORE_GLOBAL_GTT |
+		  MI_SEMAPHORE_SAD_GTE_SDD;
+	*rbuf++ = signal->fence.seqno;
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	/* When the !RCS engines idle waiting upon a semaphore, they lose their
 	 * pagetables and we must reload them before executing the batch.
@@ -1494,28 +1488,28 @@ static int
 gen6_ring_sync_to(struct drm_i915_gem_request *req,
 		  struct drm_i915_gem_request *signal)
 {
-	struct intel_ring *ring = req->ring;
 	u32 dw1 = MI_SEMAPHORE_MBOX |
 		  MI_SEMAPHORE_COMPARE |
 		  MI_SEMAPHORE_REGISTER;
 	u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->hw_id];
-	int ret;
+	u32 *rbuf;
 
 	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, dw1 | wait_mbox);
+	*rbuf++ = dw1 | wait_mbox;
 	/* Throughout all of the GEM code, seqno passed implies our current
 	 * seqno is >= the last seqno executed. However for hardware the
 	 * comparison is strictly greater than.
 	 */
-	intel_ring_emit(ring, signal->fence.seqno - 1);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = signal->fence.seqno - 1;
+	*rbuf++ = 0;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1616,16 +1610,16 @@ i8xx_irq_disable(struct intel_engine_cs *engine)
 static int
 bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_FLUSH;
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_FLUSH);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 	return 0;
 }
 
@@ -1691,20 +1685,19 @@ i965_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 length,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			MI_BATCH_GTT |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_NON_SECURE_I965));
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  MI_BATCH_GTT |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE_I965);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1720,57 +1713,59 @@ i830_emit_bb_start(struct drm_i915_gem_request *req,
 {
 	struct intel_ring *ring = req->ring;
 	u32 cs_offset = i915_ggtt_offset(req->engine->scratch);
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* Evict the invalid PTE TLBs */
-	intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA);
-	intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
-	intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */
-	intel_ring_emit(ring, cs_offset);
-	intel_ring_emit(ring, 0xdeadbeef);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
+	*rbuf++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
+	*rbuf++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
+	*rbuf++ = cs_offset;
+	*rbuf++ = 0xdeadbeef;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(ring, rbuf);
 
 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
 		if (len > I830_BATCH_LIMIT)
 			return -ENOSPC;
 
-		ret = intel_ring_begin(req, 6 + 2);
-		if (ret)
-			return ret;
+		rbuf = intel_ring_begin(req, 6 + 2);
+		if (IS_ERR(rbuf))
+			return PTR_ERR(rbuf);
 
 		/* Blit the batch (which has now all relocs applied) to the
 		 * stable batch scratch bo area (so that the CS never
 		 * stumbles over its tlb invalidation bug) ...
 		 */
-		intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
-		intel_ring_emit(ring,
-				BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
-		intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 4096);
-		intel_ring_emit(ring, cs_offset);
-		intel_ring_emit(ring, 4096);
-		intel_ring_emit(ring, offset);
-
-		intel_ring_emit(ring, MI_FLUSH);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_advance(ring);
+		*rbuf++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
+		*rbuf++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
+		*rbuf++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
+		*rbuf++ = cs_offset;
+		*rbuf++ = 4096;
+		*rbuf++ = offset;
+
+		*rbuf++ = MI_FLUSH;
+		*rbuf++ = MI_NOOP;
+
+		intel_ring_advance(ring, rbuf);
 
 		/* ... and execute it. */
 		offset = cs_offset;
 	}
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*rbuf++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE);
 
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
-	intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
-					0 : MI_BATCH_NON_SECURE));
-	intel_ring_advance(ring);
+	intel_ring_advance(ring, rbuf);
 
 	return 0;
 }
@@ -1780,17 +1775,17 @@ i915_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
-	intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
-					0 : MI_BATCH_NON_SECURE));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*rbuf++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2171,7 +2166,7 @@ void intel_engine_cleanup(struct intel_engine_cs *engine)
 
 int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
-	int ret;
+	u32 *rbuf;
 
 	/* Flush enough space to reduce the likelihood of waiting after
 	 * we start building the request - in which case we will just
@@ -2181,9 +2176,9 @@ int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 
 	request->ring = request->engine->buffer;
 
-	ret = intel_ring_begin(request, 0);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 0);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	request->reserved_space -= LEGACY_REQUEST_SIZE;
 	return 0;
@@ -2237,7 +2232,7 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
 	return 0;
 }
 
-int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
+u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
 	struct intel_ring *ring = req->ring;
 	int remain_actual = ring->size - ring->tail;
@@ -2271,7 +2266,7 @@ int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 	if (wait_bytes > ring->space) {
 		int ret = wait_for_space(req, wait_bytes);
 		if (unlikely(ret))
-			return ret;
+			return ERR_PTR(ret);
 	}
 
 	if (unlikely(need_wrap)) {
@@ -2286,7 +2281,8 @@ int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 
 	ring->space -= bytes;
 	GEM_BUG_ON(ring->space < 0);
-	return 0;
+
+	return (u32 *)(ring->vaddr + ring->tail);
 }
 
 /* Align the ring tail to a cacheline boundary */
@@ -2295,20 +2291,20 @@ int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
 	struct intel_ring *ring = req->ring;
 	int num_dwords =
 		(ring->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
-	int ret;
+	u32 *rbuf;
 
 	if (num_dwords == 0)
 		return 0;
 
 	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
-	ret = intel_ring_begin(req, num_dwords);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, num_dwords);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	while (num_dwords--)
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(ring, rbuf);
 
 	return 0;
 }
@@ -2352,13 +2348,11 @@ static void gen6_bsd_submit_request(struct drm_i915_gem_request *request)
 
 static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	uint32_t cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	cmd = MI_FLUSH_DW;
 	if (INTEL_GEN(req->i915) >= 8)
@@ -2380,16 +2374,18 @@ static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	if (INTEL_GEN(req->i915) >= 8) {
-		intel_ring_emit(ring, 0); /* upper addr */
-		intel_ring_emit(ring, 0); /* value */
+		*rbuf++ = 0; /* upper addr */
+		*rbuf++ = 0; /* value */
 	} else  {
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = 0;
+		*rbuf++ = MI_NOOP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring, rbuf);
+
 	return 0;
 }
 
@@ -2398,23 +2394,23 @@ gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
 	bool ppgtt = USES_PPGTT(req->i915) &&
 			!(dispatch_flags & I915_DISPATCH_SECURE);
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* FIXME(BDW): Address space and security selectors. */
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2424,22 +2420,21 @@ hsw_emit_bb_start(struct drm_i915_gem_request *req,
 		  u64 offset, u32 len,
 		  unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
 	/* bit0-7 is the length on GEN6+ */
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2449,20 +2444,19 @@ gen6_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_NON_SECURE_I965));
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE_I965);
 	/* bit0-7 is the length on GEN6+ */
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2471,13 +2465,11 @@ gen6_emit_bb_start(struct drm_i915_gem_request *req,
 
 static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	uint32_t cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	cmd = MI_FLUSH_DW;
 	if (INTEL_GEN(req->i915) >= 8)
@@ -2498,17 +2490,17 @@ static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	 */
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_INVALIDATE_TLB;
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring,
-			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	if (INTEL_GEN(req->i915) >= 8) {
-		intel_ring_emit(ring, 0); /* upper addr */
-		intel_ring_emit(ring, 0); /* value */
+		*rbuf++ = 0; /* upper addr */
+		*rbuf++ = 0; /* value */
 	} else  {
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = 0;
+		*rbuf++ = MI_NOOP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 84aea549de5d..07bcc650c7bc 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -443,7 +443,7 @@ void intel_engine_cleanup(struct intel_engine_cs *engine);
 
 int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request);
 
-int __must_check intel_ring_begin(struct drm_i915_gem_request *req, int n);
+u32 __must_check *intel_ring_begin(struct drm_i915_gem_request *req, int n);
 int __must_check intel_ring_cacheline_align(struct drm_i915_gem_request *req);
 
 static inline void intel_ring_emit(struct intel_ring *ring, u32 data)
@@ -457,16 +457,13 @@ static inline void intel_ring_emit_reg(struct intel_ring *ring, i915_reg_t reg)
 	intel_ring_emit(ring, i915_mmio_reg_offset(reg));
 }
 
-static inline void intel_ring_advance(struct intel_ring *ring)
+static inline void intel_ring_advance(struct intel_ring *ring, u32 *rbuf)
 {
-	/* Dummy function.
-	 *
-	 * This serves as a placeholder in the code so that the reader
-	 * can compare against the preceding intel_ring_begin() and
-	 * check that the number of dwords emitted matches the space
-	 * reserved for the command packet (i.e. the value passed to
-	 * intel_ring_begin()).
-	 */
+	unsigned int written = (void *)rbuf - (ring->vaddr + ring->tail);
+
+	GEM_BUG_ON(ring->tail + written > ring->size);
+
+	ring->tail += written;
 }
 
 static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value)
-- 
1.9.1
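
For readers skimming the diff, here is a condensed sketch of the v2
calling convention. It is illustrative only: emit_two_noops() is a
made-up caller, and the space reservation and wrap handling inside
intel_ring_begin() are elided.

static int emit_two_noops(struct drm_i915_gem_request *req)
{
	u32 *rbuf;

	/* Reserve two dwords; rbuf points at ring->vaddr + ring->tail
	 * on success, or holds an ERR_PTR on failure. */
	rbuf = intel_ring_begin(req, 2);
	if (IS_ERR(rbuf))
		return PTR_ERR(rbuf);

	*rbuf++ = MI_NOOP;
	*rbuf++ = MI_NOOP;

	/* In v2 the advance derives the byte count from the final
	 * pointer, checks it stays within ring->size, and only then
	 * bumps ring->tail. */
	intel_ring_advance(req->ring, rbuf);

	return 0;
}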

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RFC v2] drm/i915: Emit to ringbuffer directly
  2016-09-09 15:52         ` [RFC v2] " Tvrtko Ursulin
@ 2016-09-09 16:04           ` Chris Wilson
  2016-09-12  9:44             ` [PATCH v3] " Tvrtko Ursulin
  0 siblings, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2016-09-09 16:04 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Fri, Sep 09, 2016 at 04:52:28PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> This removes the usage of intel_ring_emit in favour of
> directly writing to the ring buffer.
> 
> intel_ring_emit was preventing the compiler from optimising
> the fetch and increment of the current ring buffer pointer and
> therefore generating very verbose code for every write.
> 
> It had no useful purpose since all ringbuffer operations
> are started and ended with intel_ring_begin and
> intel_ring_advance respectively, with no bail out in the
> middle possible, so it is fine to increment the tail in
> intel_ring_begin and let the code manage the pointer
> itself.
> 
> Useless instruction removal amounts to approximately
> two and a half kilobytes of saved text on my build.
> 
> Not sure if this has any measurable performance
> implications, but executing a ton of useless instructions
> on fast paths cannot be good.
> 
> Patch is not fully polished, but it compiles and runs
> on Gen9 at least.
> 
> v2:
>  * Change return from intel_ring_begin to error pointer by
>    popular demand.
>  * Move tail increment to intel_ring_advance to enable some
>    error checking.

The increment can stay in begin (it's not intel_ring_begin() anymore
since it operates on the request!) as that will be smaller at no
usability cost.

Just check that rbuf == ring->vaddr + ring->tail at end.
-Chris
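
A minimal sketch of the suggested arrangement, assuming the v2
signatures are kept; the wait-for-space and wrap handling that the real
intel_ring_begin() has to perform are omitted.

u32 *intel_ring_begin(struct drm_i915_gem_request *req, int n)
{
	struct intel_ring *ring = req->ring;
	u32 *rbuf;

	/* ... wait and/or wrap until n dwords are available ... */

	rbuf = ring->vaddr + ring->tail;
	ring->tail += n * sizeof(u32); /* advance up front */

	return rbuf;
}

static inline void intel_ring_advance(struct intel_ring *ring, u32 *rbuf)
{
	/* All reserved dwords must have been written by now. */
	GEM_BUG_ON(rbuf != ring->vaddr + ring->tail);
}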

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* ✗ Fi.CI.BAT: failure for drm/i915: Emit to ringbuffer directly (rev2)
  2016-09-08 15:12 [RFC] drm/i915: Emit to ringbuffer directly Tvrtko Ursulin
  2016-09-08 15:54 ` ✗ Fi.CI.BAT: failure for " Patchwork
  2016-09-08 16:40 ` [RFC] " Chris Wilson
@ 2016-09-09 16:26 ` Patchwork
  2016-09-12 10:19 ` ✓ Fi.CI.BAT: success for drm/i915: Emit to ringbuffer directly (rev3) Patchwork
  3 siblings, 0 replies; 15+ messages in thread
From: Patchwork @ 2016-09-09 16:26 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Emit to ringbuffer directly (rev2)
URL   : https://patchwork.freedesktop.org/series/12186/
State : failure

== Summary ==

Series 12186v2 drm/i915: Emit to ringbuffer directly
http://patchwork.freedesktop.org/api/1.0/series/12186/revisions/2/mbox/

Test drv_module_reload_basic:
                skip       -> PASS       (fi-skl-6260u)
Test kms_busy:
        Subgroup basic-flip-default-a:
                pass       -> TIMEOUT    (fi-ilk-650)
                pass       -> TIMEOUT    (fi-snb-2600)
                pass       -> INCOMPLETE (fi-snb-2520m)
        Subgroup basic-flip-default-b:
                pass       -> INCOMPLETE (fi-ilk-650)
                pass       -> INCOMPLETE (fi-snb-2600)
Test kms_frontbuffer_tracking:
        Subgroup basic:
                pass       -> DMESG-FAIL (fi-ivb-3770)
                pass       -> DMESG-FAIL (fi-hsw-4770k)
                pass       -> DMESG-FAIL (fi-hsw-4770r)
                pass       -> DMESG-FAIL (fi-ivb-3520m)
                pass       -> DMESG-FAIL (fi-byt-n2820)

fi-bdw-5557u     total:254  pass:238  dwarn:0   dfail:0   fail:1   skip:15 
fi-bsw-n3050     total:254  pass:207  dwarn:0   dfail:0   fail:1   skip:46 
fi-byt-n2820     total:254  pass:210  dwarn:0   dfail:1   fail:2   skip:41 
fi-hsw-4770k     total:254  pass:230  dwarn:0   dfail:1   fail:1   skip:22 
fi-hsw-4770r     total:254  pass:226  dwarn:0   dfail:1   fail:1   skip:26 
fi-ilk-650       total:185  pass:129  dwarn:0   dfail:0   fail:1   skip:53 
fi-ivb-3520m     total:254  pass:221  dwarn:0   dfail:1   fail:1   skip:31 
fi-ivb-3770      total:254  pass:221  dwarn:0   dfail:1   fail:1   skip:31 
fi-skl-6260u     total:254  pass:239  dwarn:0   dfail:0   fail:1   skip:14 
fi-skl-6700hq    total:254  pass:226  dwarn:0   dfail:0   fail:2   skip:26 
fi-skl-6700k     total:254  pass:224  dwarn:1   dfail:0   fail:1   skip:28 
fi-snb-2520m     total:184  pass:151  dwarn:0   dfail:0   fail:0   skip:32 
fi-snb-2600      total:185  pass:151  dwarn:0   dfail:0   fail:0   skip:32 

Results at /archive/results/CI_IGT_test/Patchwork_2503/

5986f290e25f42d3d5df390411cc43683deb1301 drm-intel-nightly: 2016y-09m-08d-09h-11m-50s UTC integration manifest
0909701 drm/i915: Emit to ringbuffer directly

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v3] drm/i915: Emit to ringbuffer directly
  2016-09-09 16:04           ` Chris Wilson
@ 2016-09-12  9:44             ` Tvrtko Ursulin
  2016-09-12 15:04               ` Dave Gordon
  0 siblings, 1 reply; 15+ messages in thread
From: Tvrtko Ursulin @ 2016-09-12  9:44 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

This removes the usage of intel_ring_emit in favour of
directly writing to the ring buffer.

intel_ring_emit was preventing the compiler from optimising
the fetch and increment of the current ring buffer pointer and
therefore generating very verbose code for every write.

It had no useful purpose since all ringbuffer operations
are started and ended with intel_ring_begin and
intel_ring_advance respectively, with no bail out in the
middle possible, so it is fine to increment the tail in
intel_ring_begin and let the code manage the pointer
itself.

Useless instruction removal amounts to approximately
two and a half kilobytes of saved text on my build.

Not sure if this has any measurable performance
implications, but executing a ton of useless instructions
on fast paths cannot be good.

Patch is not fully polished, but it compiles and runs
on Gen9 at least.

v2:
 * Change return from intel_ring_begin to error pointer by
   popular demand.
 * Move tail increment to intel_ring_advance to enable some
   error checking.

v3:
 * Move tail advance back into intel_ring_begin.
 * Rebase and tidy.
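
To illustrate the codegen claim above, a rough contrast between the two
styles; intel_ring_emit() is shown approximately as currently defined,
and emit_new() is a made-up example, not code from this patch.

/* Old helper, approximately: ring->tail lives in memory, so the
 * compiler reloads, adds and stores it for every single dword. */
static inline void intel_ring_emit(struct intel_ring *ring, u32 data)
{
	*(u32 *)(ring->vaddr + ring->tail) = data;
	ring->tail += 4;
}

/* New style: the cursor is a plain local pointer that can stay in a
 * register, so each dword compiles down to a single store. */
static u32 *emit_new(u32 *rbuf)
{
	*rbuf++ = MI_NOOP;
	*rbuf++ = MI_NOOP;
	return rbuf;
}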

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c    |  75 ++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  70 ++--
 drivers/gpu/drm/i915/intel_display.c       | 132 +++---
 drivers/gpu/drm/i915/intel_lrc.c           | 245 ++++++-----
 drivers/gpu/drm/i915/intel_mocs.c          |  53 ++-
 drivers/gpu/drm/i915/intel_overlay.c       |  88 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 638 ++++++++++++++---------------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  24 +-
 9 files changed, 655 insertions(+), 707 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index df10f4e95736..1394408203c3 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -561,7 +561,6 @@ static inline int
 mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 {
 	struct drm_i915_private *dev_priv = req->i915;
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
 	u32 flags = hw_flags | MI_MM_SPACE_GTT;
 	const int num_rings =
@@ -569,6 +568,7 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 		i915.semaphores ?
 		INTEL_INFO(dev_priv)->num_rings - 1 :
 		0;
+	u32 *rbuf;
 	int len, ret;
 
 	/* w/a: If Flush TLB Invalidation Mode is enabled, driver must do a TLB
@@ -593,70 +593,64 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 	if (INTEL_GEN(dev_priv) >= 7)
 		len += 2 + (num_rings ? 4*num_rings + 6 : 0);
 
-	ret = intel_ring_begin(req, len);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, len);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
 	if (INTEL_GEN(dev_priv) >= 7) {
-		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+		*rbuf++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 		if (num_rings) {
 			struct intel_engine_cs *signaller;
 
-			intel_ring_emit(ring,
-					MI_LOAD_REGISTER_IMM(num_rings));
+			*rbuf++ = MI_LOAD_REGISTER_IMM(num_rings);
 			for_each_engine(signaller, dev_priv) {
 				if (signaller == engine)
 					continue;
 
-				intel_ring_emit_reg(ring,
-						    RING_PSMI_CTL(signaller->mmio_base));
-				intel_ring_emit(ring,
-						_MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+				*rbuf++ = i915_mmio_reg_offset(
+					  RING_PSMI_CTL(signaller->mmio_base));
+				*rbuf++ = _MASKED_BIT_ENABLE(
+					   GEN6_PSMI_SLEEP_MSG_DISABLE);
 			}
 		}
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_SET_CONTEXT);
-	intel_ring_emit(ring,
-			i915_ggtt_offset(req->ctx->engine[RCS].state) | flags);
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_SET_CONTEXT;
+	*rbuf++ = i915_ggtt_offset(req->ctx->engine[RCS].state) | flags;
 	/*
 	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
 	 * WaMiSetContext_Hang:snb,ivb,vlv
 	 */
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
 	if (INTEL_GEN(dev_priv) >= 7) {
 		if (num_rings) {
 			struct intel_engine_cs *signaller;
 			i915_reg_t last_reg = {}; /* keep gcc quiet */
 
-			intel_ring_emit(ring,
-					MI_LOAD_REGISTER_IMM(num_rings));
+			*rbuf++ = MI_LOAD_REGISTER_IMM(num_rings);
 			for_each_engine(signaller, dev_priv) {
 				if (signaller == engine)
 					continue;
 
 				last_reg = RING_PSMI_CTL(signaller->mmio_base);
-				intel_ring_emit_reg(ring, last_reg);
-				intel_ring_emit(ring,
-						_MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+				*rbuf++ = i915_mmio_reg_offset(last_reg);
+				*rbuf++ = _MASKED_BIT_DISABLE(
+					  GEN6_PSMI_SLEEP_MSG_DISABLE);
 			}
 
 			/* Insert a delay before the next switch! */
-			intel_ring_emit(ring,
-					MI_STORE_REGISTER_MEM |
-					MI_SRM_LRM_GLOBAL_GTT);
-			intel_ring_emit_reg(ring, last_reg);
-			intel_ring_emit(ring,
-					i915_ggtt_offset(engine->scratch));
-			intel_ring_emit(ring, MI_NOOP);
+			*rbuf++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
+			*rbuf++ = i915_mmio_reg_offset(last_reg);
+			*rbuf++ = i915_ggtt_offset(engine->scratch);
+			*rbuf++ = MI_NOOP;
 		}
-		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+		*rbuf++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
-	return ret;
+	return 0;
 }
@@ -664,28 +658,29 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
 static int remap_l3(struct drm_i915_gem_request *req, int slice)
 {
 	u32 *remap_info = req->i915->l3_parity.remap_info[slice];
-	struct intel_ring *ring = req->ring;
-	int i, ret;
+	u32 *rbuf;
+	int i;
 
 	if (!remap_info)
 		return 0;
 
-	ret = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, GEN7_L3LOG_SIZE / 4 * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/*
 	 * Note: We do not worry about the concurrent register cacheline hang
 	 * here because no other code should access these registers other than
 	 * at initialization time.
 	 */
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE / 4);
 	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
-		intel_ring_emit_reg(ring, GEN7_L3LOG(slice, i));
-		intel_ring_emit(ring, remap_info[i]);
+		*rbuf++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
+		*rbuf++ = remap_info[i];
 	}
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 33c85227643d..06f3aad37bc6 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1370,25 +1370,25 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
-	int ret, i;
+	u32 *rbuf;
+	int i;
 
 	if (!IS_GEN7(req->i915) || req->engine->id != RCS) {
 		DRM_DEBUG("sol reset is gen7/rcs only\n");
 		return -EINVAL;
 	}
 
-	ret = intel_ring_begin(req, 4 * 3);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4 * 3);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for (i = 0; i < 4; i++) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, GEN7_SO_WRITE_OFFSET(i));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
+		*rbuf++ = 0;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1495,17 +1495,18 @@ execbuf_submit(struct i915_execbuffer_params *params,
 
 	if (params->engine->id == RCS &&
 	    instp_mode != dev_priv->relative_constants_mode) {
-		struct intel_ring *ring = params->request->ring;
+		u32 *rbuf;
 
-		ret = intel_ring_begin(params->request, 4);
-		if (ret)
-			return ret;
+		rbuf = intel_ring_begin(params->request, 4);
+		if (IS_ERR(rbuf))
+			return PTR_ERR(rbuf);
+
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = i915_mmio_reg_offset(INSTPM);
+		*rbuf++ = instp_mask << 16 | instp_mode;
 
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, INSTPM);
-		intel_ring_emit(ring, instp_mask << 16 | instp_mode);
-		intel_ring_advance(ring);
+		intel_ring_advance(params->request->ring, rbuf);
 
 		dev_priv->relative_constants_mode = instp_mode;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 0bb4232f66bc..9ea82dcec028 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -663,23 +663,23 @@ static int gen8_write_pdp(struct drm_i915_gem_request *req,
 			  unsigned entry,
 			  dma_addr_t addr)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
-	int ret;
+	u32 *rbuf;
 
 	BUG_ON(entry >= 4);
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, entry));
-	intel_ring_emit(ring, upper_32_bits(addr));
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, entry));
-	intel_ring_emit(ring, lower_32_bits(addr));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+	*rbuf++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, entry));
+	*rbuf++ = upper_32_bits(addr);
+	*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+	*rbuf++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, entry));
+	*rbuf++ = lower_32_bits(addr);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1655,8 +1655,8 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
 static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 			 struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1664,17 +1664,18 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_LOAD_REGISTER_IMM(2);
+	*rbuf++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine));
+	*rbuf++ = PP_DIR_DCLV_2G;
+	*rbuf++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine));
+	*rbuf++ = get_pd_offset(ppgtt);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
-	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
-	intel_ring_emit(ring, PP_DIR_DCLV_2G);
-	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
-	intel_ring_emit(ring, get_pd_offset(ppgtt));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1682,8 +1683,8 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
 static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 			  struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
+	u32 *rbuf;
 	int ret;
 
 	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1691,17 +1692,18 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_LOAD_REGISTER_IMM(2);
+	*rbuf++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine));
+	*rbuf++ = PP_DIR_DCLV_2G;
+	*rbuf++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine));
+	*rbuf++ = get_pd_offset(ppgtt);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
-	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
-	intel_ring_emit(ring, PP_DIR_DCLV_2G);
-	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
-	intel_ring_emit(ring, get_pd_offset(ppgtt));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	/* XXX: RCS is the only one to auto invalidate the TLBs? */
 	if (engine->id != RCS) {
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 497d99b88468..eee778e2dda6 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11686,14 +11686,12 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
-	int ret;
+	u32 flip_mask, *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* Can't queue multiple flips, so wait for the previous
 	 * one to finish before executing the next.
@@ -11702,13 +11700,14 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, 0); /* aux display base address, unused */
+
+	*rbuf++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_DISPLAY_FLIP |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = 0; /* aux display base address, unused */
 
 	return 0;
 }
@@ -11720,26 +11719,25 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-	u32 flip_mask;
-	int ret;
+	u32 flip_mask, *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	if (intel_crtc->plane)
 		flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
 	else
 		flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, MI_NOOP);
+
+	*rbuf++ = MI_WAIT_FOR_EVENT | flip_mask;
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_DISPLAY_FLIP_I915 |
+		  MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = MI_NOOP;
 
 	return 0;
 }
@@ -11751,25 +11749,23 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t pf, pipesrc;
-	int ret;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* i965+ uses the linear or tiled offsets from the
 	 * Display Registers (which do not change across a page-flip)
 	 * so we need only reprogram the base address.
 	 */
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0]);
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
+	*rbuf++ = MI_DISPLAY_FLIP | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0];
+	*rbuf++ = intel_crtc->flip_work->gtt_offset |
+		  intel_fb_modifier_to_tiling(fb->modifier[0]);
 
 	/* XXX Enabling the panel-fitter across page-flip is so far
 	 * untested on non-native modes, so ignore it for now.
@@ -11777,7 +11773,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*rbuf++ = pf | pipesrc;
 
 	return 0;
 }
@@ -11789,21 +11785,18 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t pf, pipesrc;
-	int ret;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP |
-			MI_DISPLAY_FLIP_PLANE(intel_crtc->plane));
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
+	*rbuf++ = MI_DISPLAY_FLIP | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane);
+	*rbuf++ = fb->pitches[0] | intel_fb_modifier_to_tiling(fb->modifier[0]);
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
 
 	/* Contrary to the suggestions in the documentation,
 	 * "Enable Panel Fitter" does not seem to be required when page
@@ -11813,7 +11806,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 	 */
 	pf = 0;
 	pipesrc = I915_READ(PIPESRC(intel_crtc->pipe)) & 0x0fff0fff;
-	intel_ring_emit(ring, pf | pipesrc);
+	*rbuf++ = pf | pipesrc;
 
 	return 0;
 }
@@ -11825,8 +11818,8 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 				 struct drm_i915_gem_request *req,
 				 uint32_t flags)
 {
-	struct intel_ring *ring = req->ring;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	u32 *rbuf;
 	uint32_t plane_bit = 0;
 	int len, ret;
 
@@ -11871,9 +11864,9 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, len);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, len);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* Unmask the flip-done completion message. Note that the bspec says that
 	 * we should do this for both the BCS and RCS, and that we must not unmask
@@ -11885,31 +11878,28 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	 * to zero does lead to lockups within MI_DISPLAY_FLIP.
 	 */
 	if (req->engine->id == RCS) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring, ~(DERRMR_PIPEA_PRI_FLIP_DONE |
-					  DERRMR_PIPEB_PRI_FLIP_DONE |
-					  DERRMR_PIPEC_PRI_FLIP_DONE));
+		*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+		*rbuf++ = i915_mmio_reg_offset(DERRMR);
+		*rbuf++ = ~(DERRMR_PIPEA_PRI_FLIP_DONE |
+			   DERRMR_PIPEB_PRI_FLIP_DONE |
+			   DERRMR_PIPEC_PRI_FLIP_DONE);
 		if (IS_GEN8(dev))
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8 |
-					      MI_SRM_LRM_GLOBAL_GTT);
+			*rbuf++ = MI_STORE_REGISTER_MEM_GEN8 |
+				  MI_SRM_LRM_GLOBAL_GTT;
 		else
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM |
-					      MI_SRM_LRM_GLOBAL_GTT);
-		intel_ring_emit_reg(ring, DERRMR);
-		intel_ring_emit(ring,
-				i915_ggtt_offset(req->engine->scratch) + 256);
+			*rbuf++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
+		*rbuf++ = i915_mmio_reg_offset(DERRMR);
+		*rbuf++ = i915_ggtt_offset(req->engine->scratch) + 256;
 		if (IS_GEN8(dev)) {
-			intel_ring_emit(ring, 0);
-			intel_ring_emit(ring, MI_NOOP);
+			*rbuf++ = 0;
+			*rbuf++ = MI_NOOP;
 		}
 	}
 
-	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
-	intel_ring_emit(ring, fb->pitches[0] |
-			intel_fb_modifier_to_tiling(fb->modifier[0]));
-	intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset);
-	intel_ring_emit(ring, (MI_NOOP));
+	*rbuf++ = MI_DISPLAY_FLIP_I915 | plane_bit;
+	*rbuf++ = fb->pitches[0] | intel_fb_modifier_to_tiling(fb->modifier[0]);
+	*rbuf++ = intel_crtc->flip_work->gtt_offset;
+	*rbuf++ = MI_NOOP;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 16d7cdd11082..82ae39651911 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -605,6 +605,7 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 {
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_context *ce = &request->ctx->engine[engine->id];
+	u32 *rbuf;
 	int ret;
 
 	/* Flush enough space to reduce the likelihood of waiting after
@@ -636,9 +637,11 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(request, 0);
-	if (ret)
+	rbuf = intel_ring_begin(request, 0);
+	if (IS_ERR(rbuf)) {
+		ret = PTR_ERR(rbuf);
 		goto err_unpin;
+	}
 
 	if (!ce->initialised) {
 		ret = engine->init_context(request);
@@ -664,6 +667,13 @@ err_unpin:
 }
 
 /*
+ * Reserve space for 2 NOOPs at the end of each request to be
+ * used as a workaround for not being allowed to do lite
+ * restore with HEAD==TAIL (WaIdleLiteRestore).
+ */
+#define WA_TAIL_DWORDS 2
+
+/*
  * intel_logical_ring_advance() - advance the tail and prepare for submission
  * @request: Request to advance the logical ringbuffer of.
  *
@@ -673,13 +683,12 @@ err_unpin:
  * point, the tail *inside* the context is updated and the ELSP written to.
  */
 static int
-intel_logical_ring_advance(struct drm_i915_gem_request *request)
+intel_logical_ring_advance(struct drm_i915_gem_request *request, u32 *rbuf)
 {
 	struct intel_ring *ring = request->ring;
 	struct intel_engine_cs *engine = request->engine;
 
-	intel_ring_advance(ring);
-	request->tail = ring->tail;
+	request->tail = ring->tail - WA_TAIL_DWORDS * sizeof(u32);
 
 	/*
 	 * Here we add two extra NOOPs as padding to avoid
@@ -687,9 +696,11 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	 *
 	 * Caller must reserve WA_TAIL_DWORDS for us!
 	 */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(ring, rbuf);
+
 	request->wa_tail = ring->tail;
 
 	/* We keep the previous context alive until we retire the following
@@ -780,7 +791,7 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
 	int ret, i;
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	struct i915_workarounds *w = &req->i915->workarounds;
 
 	if (w->count == 0)
@@ -790,18 +801,18 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, w->count * 2 + 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, w->count * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(w->count);
 	for (i = 0; i < w->count; i++) {
-		intel_ring_emit_reg(ring, w->reg[i].addr);
-		intel_ring_emit(ring, w->reg[i].value);
+		*rbuf++ = i915_mmio_reg_offset(w->reg[i].addr);
+		*rbuf++ = w->reg[i].value;
 	}
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
@@ -1315,27 +1326,27 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
 {
 	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
-	struct intel_ring *ring = req->ring;
 	struct intel_engine_cs *engine = req->engine;
 	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
-	int i, ret;
+	u32 *rbuf;
+	int i;
 
-	ret = intel_ring_begin(req, num_lri_cmds * 2 + 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, num_lri_cmds * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(num_lri_cmds));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
 	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
 
-		intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, i));
-		intel_ring_emit(ring, upper_32_bits(pd_daddr));
-		intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, i));
-		intel_ring_emit(ring, lower_32_bits(pd_daddr));
+		*rbuf++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
+		*rbuf++ = upper_32_bits(pd_daddr);
+		*rbuf++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
+		*rbuf++ = lower_32_bits(pd_daddr);
 	}
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1344,8 +1355,8 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 			      u64 offset, u32 len,
 			      unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
 	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
+	u32 *rbuf;
 	int ret;
 
 	/* Don't rely in hw updating PDPs, specially in lite-restore.
@@ -1366,19 +1377,19 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* FIXME(BDW): Address space and security selectors. */
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 |
-			(ppgtt<<8) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START_GEN8 | (ppgtt << 8) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1399,13 +1410,11 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
 
 static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 {
-	struct intel_ring *ring = request->ring;
-	u32 cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(request, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	cmd = MI_FLUSH_DW + 1;
 
@@ -1422,13 +1431,12 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 			cmd |= MI_INVALIDATE_BSD;
 	}
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring,
-			I915_GEM_HWS_SCRATCH_ADDR |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0); /* upper addr */
-	intel_ring_emit(ring, 0); /* value */
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
+	*rbuf++ = 0; /* upper addr */
+	*rbuf++ = 0; /* value */
+
+	intel_ring_advance(request->ring, rbuf);
 
 	return 0;
 }
@@ -1436,13 +1444,10 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 				  u32 mode)
 {
-	struct intel_ring *ring = request->ring;
-	struct intel_engine_cs *engine = request->engine;
 	u32 scratch_addr =
-		i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
+	       i915_ggtt_offset(request->engine->scratch) + 2 * CACHELINE_BYTES;
 	bool vf_flush_wa = false, dc_flush_wa = false;
-	u32 flags = 0;
-	int ret;
+	u32 *rbuf, flags = 0;
 	int len;
 
 	flags |= PIPE_CONTROL_CS_STALL;
@@ -1484,45 +1489,45 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 	if (dc_flush_wa)
 		len += 12;
 
-	ret = intel_ring_begin(request, len);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, len);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	if (vf_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
 	if (dc_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, PIPE_CONTROL_DC_FLUSH_ENABLE);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_DC_FLUSH_ENABLE;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
 
 	if (dc_flush_wa) {
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring, PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_CS_STALL;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
+		*rbuf++ = 0;
 	}
 
-	intel_ring_advance(ring);
+	intel_ring_advance(request->ring, rbuf);
 
 	return 0;
 }
@@ -1542,44 +1547,35 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
 	intel_flush_status_page(engine, I915_GEM_HWS_INDEX);
 }
 
-/*
- * Reserve space for 2 NOOPs at the end of each request to be
- * used as a workaround for not being allowed to do lite
- * restore with HEAD==TAIL (WaIdleLiteRestore).
- */
-#define WA_TAIL_DWORDS 2
-
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-	intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-	intel_ring_emit(ring,
-			intel_hws_seqno_address(request->engine) |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, request->fence.seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*rbuf++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+	*rbuf++ = intel_hws_seqno_address(request->engine) |
+		  MI_FLUSH_DW_USE_GTT;
+	*rbuf++ = 0;
+	*rbuf++ = request->fence.seqno;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	return intel_logical_ring_advance(request, rbuf);
 }
 
 static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
@@ -1588,19 +1584,18 @@ static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 	 * need a prior CS_STALL, which is emitted by the flush
 	 * following the batch.
 	 */
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring,
-			(PIPE_CONTROL_GLOBAL_GTT_IVB |
-			 PIPE_CONTROL_CS_STALL |
-			 PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, i915_gem_request_get_seqno(request));
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE;
+	*rbuf++ = intel_hws_seqno_address(request->engine);
+	*rbuf++ = 0;
+	*rbuf++ = i915_gem_request_get_seqno(request);
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*rbuf++ = 0;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	return intel_logical_ring_advance(request, rbuf);
 }
 
 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
index 80bb9247ce66..8060658d953f 100644
--- a/drivers/gpu/drm/i915/intel_mocs.c
+++ b/drivers/gpu/drm/i915/intel_mocs.c
@@ -276,23 +276,22 @@ int intel_mocs_init_engine(struct intel_engine_cs *engine)
 static int emit_mocs_control_table(struct drm_i915_gem_request *req,
 				   const struct drm_i915_mocs_table *table)
 {
-	struct intel_ring *ring = req->ring;
 	enum intel_engine_id engine = req->engine->id;
 	unsigned int index;
-	int ret;
+	u32 *rbuf;
 
 	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
 		return -ENODEV;
 
-	ret = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES);
 
 	for (index = 0; index < table->size; index++) {
-		intel_ring_emit_reg(ring, mocs_register(engine, index));
-		intel_ring_emit(ring, table->table[index].control_value);
+		*rbuf++ = i915_mmio_reg_offset(mocs_register(engine, index));
+		*rbuf++ = table->table[index].control_value;
 	}
 
 	/*
@@ -304,12 +303,13 @@ static int emit_mocs_control_table(struct drm_i915_gem_request *req,
 	 * that value to all the used entries.
 	 */
 	for (; index < GEN9_NUM_MOCS_ENTRIES; index++) {
-		intel_ring_emit_reg(ring, mocs_register(engine, index));
-		intel_ring_emit(ring, table->table[0].control_value);
+		*rbuf++ = i915_mmio_reg_offset(mocs_register(engine, index));
+		*rbuf++ = table->table[0].control_value;
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -336,29 +336,27 @@ static inline u32 l3cc_combine(const struct drm_i915_mocs_table *table,
 static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
 				const struct drm_i915_mocs_table *table)
 {
-	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	unsigned int i;
-	int ret;
 
 	if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
 		return -ENODEV;
 
-	ret = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2);
 
 	for (i = 0; i < table->size/2; i++) {
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 2*i, 2*i+1));
+		*rbuf++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
+		*rbuf++ = l3cc_combine(table, 2 * i, 2 * i + 1);
 	}
 
 	if (table->size & 0x01) {
 		/* Odd table size - 1 left over */
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 2*i, 0));
+		*rbuf++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
+		*rbuf++ = l3cc_combine(table, 2 * i, 0);
 		i++;
 	}
 
@@ -368,12 +366,13 @@ static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
 	 * they are reserved by the hardware.
 	 */
 	for (; i < GEN9_NUM_MOCS_ENTRIES / 2; i++) {
-		intel_ring_emit_reg(ring, GEN9_LNCFCMOCS(i));
-		intel_ring_emit(ring, l3cc_combine(table, 0, 0));
+		*rbuf++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i));
+		*rbuf++ = l3cc_combine(table, 0, 0);
 	}
 
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a24bc8c7889f..1de86c316058 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -243,8 +243,7 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
-	int ret;
+	u32 *rbuf;
 
 	WARN_ON(overlay->active);
 	WARN_ON(IS_I830(dev_priv) && !(dev_priv->quirks & QUIRK_PIPEA_FORCE));
@@ -253,20 +252,20 @@ static int intel_overlay_on(struct intel_overlay *overlay)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 4);
-	if (ret) {
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf)) {
 		i915_add_request_no_flush(req);
-		return ret;
+		return PTR_ERR(rbuf);
 	}
 
 	overlay->active = true;
 
-	ring = req->ring;
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_ON);
-	intel_ring_emit(ring, overlay->flip_addr | OFC_UPDATE);
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_ON;
+	*rbuf++ = overlay->flip_addr | OFC_UPDATE;
+	*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return intel_overlay_do_wait_request(overlay, req, NULL);
 }
@@ -277,10 +276,8 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
 	u32 flip_addr = overlay->flip_addr;
-	u32 tmp;
-	int ret;
+	u32 *rbuf, tmp;
 
 	WARN_ON(!overlay->active);
 
@@ -296,16 +293,16 @@ static int intel_overlay_continue(struct intel_overlay *overlay,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 2);
-	if (ret) {
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf)) {
 		i915_add_request_no_flush(req);
-		return ret;
+		return PTR_ERR(rbuf);
 	}
 
-	ring = req->ring;
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE);
-	intel_ring_emit(ring, flip_addr);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
+	*rbuf++ = flip_addr;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	intel_overlay_submit_request(overlay, req, NULL);
 
@@ -355,9 +352,8 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 {
 	struct drm_i915_private *dev_priv = overlay->i915;
 	struct drm_i915_gem_request *req;
-	struct intel_ring *ring;
 	u32 flip_addr = overlay->flip_addr;
-	int ret;
+	u32 *rbuf;
 
 	WARN_ON(!overlay->active);
 
@@ -371,31 +367,30 @@ static int intel_overlay_off(struct intel_overlay *overlay)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	ret = intel_ring_begin(req, 6);
-	if (ret) {
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf)) {
 		i915_add_request_no_flush(req);
-		return ret;
+		return PTR_ERR(rbuf);
 	}
 
-	ring = req->ring;
 	/* wait for overlay to go idle */
-	intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE);
-	intel_ring_emit(ring, flip_addr);
-	intel_ring_emit(ring, MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
+	*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
+	*rbuf++ = flip_addr;
+	*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
 	/* turn overlay off */
 	if (IS_I830(dev_priv)) {
 		/* Workaround: Don't disable the overlay fully, since otherwise
 		 * it dies on the next OVERLAY_ON cmd. */
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_NOOP;
+		*rbuf++ = MI_NOOP;
 	} else {
-		intel_ring_emit(ring, MI_OVERLAY_FLIP | MI_OVERLAY_OFF);
-		intel_ring_emit(ring, flip_addr);
-		intel_ring_emit(ring,
-				MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
+		*rbuf++ = MI_OVERLAY_FLIP | MI_OVERLAY_OFF;
+		*rbuf++ = flip_addr;
+		*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return intel_overlay_do_wait_request(overlay, req,
 					     intel_overlay_off_tail);
@@ -429,23 +424,22 @@ static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
 	if (I915_READ(ISR) & I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT) {
 		/* synchronous slowpath */
 		struct drm_i915_gem_request *req;
-		struct intel_ring *ring;
+		u32 *rbuf;
 
 		req = alloc_request(overlay);
 		if (IS_ERR(req))
 			return PTR_ERR(req);
 
-		ret = intel_ring_begin(req, 2);
-		if (ret) {
+		rbuf = intel_ring_begin(req, 2);
+		if (IS_ERR(rbuf)) {
 			i915_add_request_no_flush(req);
-			return ret;
+			return PTR_ERR(rbuf);
 		}
 
-		ring = req->ring;
-		intel_ring_emit(ring,
-				MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_advance(ring);
+		*rbuf++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
+		*rbuf++ = MI_NOOP;
+
+		intel_ring_advance(req->ring, rbuf);
 
 		ret = intel_overlay_do_wait_request(overlay, req,
 						    intel_overlay_release_old_vid_tail);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 7a74750076c5..06955691f52d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -61,22 +61,21 @@ void intel_ring_update_space(struct intel_ring *ring)
 static int
 gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	u32 cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
 	cmd = MI_FLUSH;
 
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_READ_FLUSH;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = cmd;
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -84,9 +83,7 @@ gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	u32 cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
 	/*
 	 * read/write caches:
@@ -123,13 +120,14 @@ gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 			cmd |= MI_INVALIDATE_ISP;
 	}
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = cmd;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -174,35 +172,35 @@ gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
-	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
-			PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0); /* low dword */
-	intel_ring_emit(ring, 0); /* high dword */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(5);
+	*rbuf++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0; /* low dword */
+	*rbuf++ = 0; /* high dword */
+	*rbuf++ = MI_NOOP;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	intel_ring_advance(req->ring, rbuf);
+
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
-	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(5);
+	*rbuf++ = PIPE_CONTROL_QW_WRITE;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -210,10 +208,9 @@ intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 static int
 gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	u32 flags = 0;
+	u32 *rbuf, flags = 0;
 	int ret;
 
 	/* Force SNB workarounds for PIPE_CONTROL flushes */
@@ -247,15 +244,16 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
+	*rbuf++ = 0;
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -263,20 +261,18 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 static int
 gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring,
-			PIPE_CONTROL_CS_STALL |
-			PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -284,11 +280,9 @@ gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 static int
 gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
 	u32 scratch_addr =
 		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
-	u32 flags = 0;
-	int ret;
+	u32 *rbuf, flags = 0;
 
 	/*
 	 * Ensure that any following seqno writes only happen when the render
@@ -332,15 +326,16 @@ gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 		gen7_render_ring_cs_stall_wa(req);
 	}
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(4);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -349,20 +344,20 @@ static int
 gen8_emit_pipe_control(struct drm_i915_gem_request *req,
 		       u32 flags, u32 scratch_addr)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, flags);
-	intel_ring_emit(ring, scratch_addr);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = flags;
+	*rbuf++ = scratch_addr;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+	*rbuf++ = 0;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -625,8 +620,8 @@ static void reset_ring_common(struct intel_engine_cs *engine,
 
 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct i915_workarounds *w = &req->i915->workarounds;
+	u32 *rbuf;
 	int ret, i;
 
 	if (w->count == 0)
@@ -636,18 +631,18 @@ static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	ret = intel_ring_begin(req, (w->count * 2 + 2));
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, w->count * 2 + 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
+	*rbuf++ = MI_LOAD_REGISTER_IMM(w->count);
 	for (i = 0; i < w->count; i++) {
-		intel_ring_emit_reg(ring, w->reg[i].addr);
-		intel_ring_emit(ring, w->reg[i].value);
+		*rbuf++ = i915_mmio_reg_offset(w->reg[i].addr);
+		*rbuf++ = w->reg[i].value;
 	}
-	intel_ring_emit(ring, MI_NOOP);
+	*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
@@ -1273,87 +1268,80 @@ static void render_ring_cleanup(struct intel_engine_cs *engine)
 
 static int gen8_rcs_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
-	int ret, num_rings;
+	u32 *rbuf;
+	int num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 8);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, (num_rings - 1) * 8);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for_each_engine_id(waiter, dev_priv, id) {
 		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;
 
-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring,
-				PIPE_CONTROL_GLOBAL_GTT_IVB |
-				PIPE_CONTROL_QW_WRITE |
-				PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, lower_32_bits(gtt_offset));
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->fence.seqno);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+		*rbuf++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_QW_WRITE |
+			  PIPE_CONTROL_CS_STALL;
+		*rbuf++ = lower_32_bits(gtt_offset);
+		*rbuf++ = upper_32_bits(gtt_offset);
+		*rbuf++ = req->fence.seqno;
+		*rbuf++ = 0;
+		*rbuf++ = MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id);
+		*rbuf++ = 0;
 	}
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
 
 static int gen8_xcs_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
-	int ret, num_rings;
+	u32 *rbuf;
+	int num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, (num_rings - 1) * 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for_each_engine_id(waiter, dev_priv, id) {
 		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;
 
-		intel_ring_emit(ring,
-				(MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-		intel_ring_emit(ring,
-				lower_32_bits(gtt_offset) |
-				MI_FLUSH_DW_USE_GTT);
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->fence.seqno);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*rbuf++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+		*rbuf++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
+		*rbuf++ = upper_32_bits(gtt_offset);
+		*rbuf++ = req->fence.seqno;
+		*rbuf++ = MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id);
+		*rbuf++ = 0;
 	}
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
 
 static int gen6_signal(struct drm_i915_gem_request *req)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *engine;
-	int ret, num_rings;
+	u32 *rbuf;
+	int num_rings;
 
 	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, round_up((num_rings - 1) * 3, 2));
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	for_each_engine(engine, dev_priv) {
 		i915_reg_t mbox_reg;
@@ -1363,16 +1351,17 @@ static int gen6_signal(struct drm_i915_gem_request *req)
 
 		mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
 		if (i915_mmio_reg_valid(mbox_reg)) {
-			intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-			intel_ring_emit_reg(ring, mbox_reg);
-			intel_ring_emit(ring, req->fence.seqno);
+			*rbuf++ = MI_LOAD_REGISTER_IMM(1);
+			*rbuf++ = i915_mmio_reg_offset(mbox_reg);
+			*rbuf++ = req->fence.seqno;
 		}
 	}
 
 	/* If num_dwords was rounded, make sure the tail pointer is correct */
 	if (num_rings % 2 == 0)
-		intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+		*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1388,17 +1377,18 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 static int i9xx_emit_request(struct drm_i915_gem_request *req)
 {
 	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_STORE_DWORD_INDEX;
+	*rbuf++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*rbuf++ = req->fence.seqno;
+	*rbuf++ = MI_USER_INTERRUPT;
 
-	intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
-	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-	intel_ring_emit(ring, req->fence.seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_advance(ring);
+	intel_ring_advance(ring, rbuf);
 
 	req->tail = ring->tail;
 
@@ -1428,6 +1418,7 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req)
 {
 	struct intel_engine_cs *engine = req->engine;
 	struct intel_ring *ring = req->ring;
+	u32 *rbuf;
 	int ret;
 
 	if (engine->semaphore.signal) {
@@ -1436,22 +1427,22 @@ static int gen8_render_emit_request(struct drm_i915_gem_request *req)
 			return ret;
 	}
 
-	ret = intel_ring_begin(req, 8);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 8);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB |
-			       PIPE_CONTROL_CS_STALL |
-			       PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, i915_gem_request_get_seqno(req));
+	*rbuf++ = GFX_OP_PIPE_CONTROL(6);
+	*rbuf++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE);
+	*rbuf++ = intel_hws_seqno_address(engine);
+	*rbuf++ = 0;
+	*rbuf++ = i915_gem_request_get_seqno(req);
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = 0;
+	*rbuf++ = MI_USER_INTERRUPT;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(ring, rbuf);
 
 	req->tail = ring->tail;
 
@@ -1470,24 +1461,22 @@ static int
 gen8_ring_sync_to(struct drm_i915_gem_request *req,
 		  struct drm_i915_gem_request *signal)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	u64 offset = GEN8_WAIT_OFFSET(req->engine, signal->engine->id);
 	struct i915_hw_ppgtt *ppgtt;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_SEMAPHORE_WAIT | MI_SEMAPHORE_GLOBAL_GTT |
+		  MI_SEMAPHORE_SAD_GTE_SDD;
+	*rbuf++ = signal->fence.seqno;
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
 
-	intel_ring_emit(ring,
-			MI_SEMAPHORE_WAIT |
-			MI_SEMAPHORE_GLOBAL_GTT |
-			MI_SEMAPHORE_SAD_GTE_SDD);
-	intel_ring_emit(ring, signal->fence.seqno);
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	/* When the !RCS engines idle waiting upon a semaphore, they lose their
 	 * pagetables and we must reload them before executing the batch.
@@ -1504,28 +1493,28 @@ static int
 gen6_ring_sync_to(struct drm_i915_gem_request *req,
 		  struct drm_i915_gem_request *signal)
 {
-	struct intel_ring *ring = req->ring;
 	u32 dw1 = MI_SEMAPHORE_MBOX |
 		  MI_SEMAPHORE_COMPARE |
 		  MI_SEMAPHORE_REGISTER;
 	u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->hw_id];
-	int ret;
+	u32 *rbuf;
 
 	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, dw1 | wait_mbox);
+	*rbuf++ = dw1 | wait_mbox;
 	/* Throughout all of the GEM code, seqno passed implies our current
 	 * seqno is >= the last seqno executed. However for hardware the
 	 * comparison is strictly greater than.
 	 */
-	intel_ring_emit(ring, signal->fence.seqno - 1);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = signal->fence.seqno - 1;
+	*rbuf++ = 0;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1626,16 +1615,16 @@ i8xx_irq_disable(struct intel_engine_cs *engine)
 static int
 bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_FLUSH);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_FLUSH;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 	return 0;
 }
 
@@ -1701,20 +1690,18 @@ i965_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 length,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE_I965);
+	*rbuf++ = offset;
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			MI_BATCH_GTT |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_NON_SECURE_I965));
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -1730,57 +1717,59 @@ i830_emit_bb_start(struct drm_i915_gem_request *req,
 {
 	struct intel_ring *ring = req->ring;
 	u32 cs_offset = i915_ggtt_offset(req->engine->scratch);
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 6);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 6);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* Evict the invalid PTE TLBs */
-	intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA);
-	intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096);
-	intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */
-	intel_ring_emit(ring, cs_offset);
-	intel_ring_emit(ring, 0xdeadbeef);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
+	*rbuf++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
+	*rbuf++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
+	*rbuf++ = cs_offset;
+	*rbuf++ = 0xdeadbeef;
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(ring, rbuf);
 
 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
 		if (len > I830_BATCH_LIMIT)
 			return -ENOSPC;
 
-		ret = intel_ring_begin(req, 6 + 2);
-		if (ret)
-			return ret;
+		rbuf = intel_ring_begin(req, 6 + 2);
+		if (IS_ERR(rbuf))
+			return PTR_ERR(rbuf);
 
 		/* Blit the batch (which has now all relocs applied) to the
 		 * stable batch scratch bo area (so that the CS never
 		 * stumbles over its tlb invalidation bug) ...
 		 */
-		intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA);
-		intel_ring_emit(ring,
-				BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096);
-		intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 4096);
-		intel_ring_emit(ring, cs_offset);
-		intel_ring_emit(ring, 4096);
-		intel_ring_emit(ring, offset);
-
-		intel_ring_emit(ring, MI_FLUSH);
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_advance(ring);
+		*rbuf++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
+		*rbuf++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
+		*rbuf++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
+		*rbuf++ = cs_offset;
+		*rbuf++ = 4096;
+		*rbuf++ = offset;
+
+		*rbuf++ = MI_FLUSH;
+		*rbuf++ = MI_NOOP;
+
+		intel_ring_advance(ring, rbuf);
 
 		/* ... and execute it. */
 		offset = cs_offset;
 	}
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
-	intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
-					0 : MI_BATCH_NON_SECURE));
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*rbuf++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE);
+
+	intel_ring_advance(ring, rbuf);
 
 	return 0;
 }
@@ -1790,17 +1779,17 @@ i915_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
+
+	*rbuf++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*rbuf++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE);
 
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT);
-	intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ?
-					0 : MI_BATCH_NON_SECURE));
-	intel_ring_advance(ring);
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2189,7 +2178,7 @@ void intel_legacy_submission_resume(struct drm_i915_private *dev_priv)
 
 int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
-	int ret;
+	u32 *rbuf;
 
 	/* Flush enough space to reduce the likelihood of waiting after
 	 * we start building the request - in which case we will just
@@ -2199,9 +2188,9 @@ int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 
 	request->ring = request->engine->buffer;
 
-	ret = intel_ring_begin(request, 0);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(request, 0);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	request->reserved_space -= LEGACY_REQUEST_SIZE;
 	return 0;
@@ -2254,13 +2243,14 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
 	return 0;
 }
 
-int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
+u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
 	struct intel_ring *ring = req->ring;
 	int remain_actual = ring->size - ring->tail;
 	int remain_usable = ring->effective_size - ring->tail;
 	int bytes = num_dwords * sizeof(u32);
 	int total_bytes, wait_bytes;
+	u32 *rbuf;
 	bool need_wrap = false;
 
 	total_bytes = bytes + req->reserved_space;
@@ -2288,7 +2278,7 @@ int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 	if (wait_bytes > ring->space) {
 		int ret = wait_for_space(req, wait_bytes);
 		if (unlikely(ret))
-			return ret;
+			return ERR_PTR(ret);
 	}
 
 	if (unlikely(need_wrap)) {
@@ -2303,7 +2293,11 @@ int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 
 	ring->space -= bytes;
 	GEM_BUG_ON(ring->space < 0);
-	return 0;
+
+	rbuf = (u32 *)(ring->vaddr + ring->tail);
+	ring->tail += bytes;
+
+	return rbuf;
 }
 
 /* Align the ring tail to a cacheline boundary */
@@ -2312,20 +2306,20 @@ int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
 	struct intel_ring *ring = req->ring;
 	int num_dwords =
 		(ring->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
-	int ret;
+	u32 *rbuf;
 
 	if (num_dwords == 0)
 		return 0;
 
 	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
-	ret = intel_ring_begin(req, num_dwords);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, num_dwords);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	while (num_dwords--)
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = MI_NOOP;
 
-	intel_ring_advance(ring);
+	intel_ring_advance(ring, rbuf);
 
 	return 0;
 }
@@ -2369,13 +2363,11 @@ static void gen6_bsd_submit_request(struct drm_i915_gem_request *request)
 
 static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	uint32_t cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	cmd = MI_FLUSH_DW;
 	if (INTEL_GEN(req->i915) >= 8)
@@ -2397,16 +2389,18 @@ static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
 
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	if (INTEL_GEN(req->i915) >= 8) {
-		intel_ring_emit(ring, 0); /* upper addr */
-		intel_ring_emit(ring, 0); /* value */
+		*rbuf++ = 0; /* upper addr */
+		*rbuf++ = 0; /* value */
 	} else  {
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = 0;
+		*rbuf++ = MI_NOOP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring, rbuf);
+
 	return 0;
 }
 
@@ -2415,23 +2409,23 @@ gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
 	bool ppgtt = USES_PPGTT(req->i915) &&
 			!(dispatch_flags & I915_DISPATCH_SECURE);
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	/* FIXME(BDW): Address space and security selectors. */
-	intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
-	intel_ring_emit(ring, lower_32_bits(offset));
-	intel_ring_emit(ring, upper_32_bits(offset));
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
+	*rbuf++ = MI_BATCH_BUFFER_START_GEN8 | (ppgtt << 8) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
+	*rbuf++ = lower_32_bits(offset);
+	*rbuf++ = upper_32_bits(offset);
+	*rbuf++ = MI_NOOP;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2441,22 +2435,21 @@ hsw_emit_bb_start(struct drm_i915_gem_request *req,
 		  u64 offset, u32 len,
 		  unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
-			(dispatch_flags & I915_DISPATCH_RS ?
-			 MI_BATCH_RESOURCE_STREAMER : 0));
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
+		  (dispatch_flags & I915_DISPATCH_RS ?
+		  MI_BATCH_RESOURCE_STREAMER : 0);
 	/* bit0-7 is the length on GEN6+ */
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2466,20 +2459,19 @@ gen6_emit_bb_start(struct drm_i915_gem_request *req,
 		   u64 offset, u32 len,
 		   unsigned int dispatch_flags)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
+	u32 *rbuf;
 
-	ret = intel_ring_begin(req, 2);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 2);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
-	intel_ring_emit(ring,
-			MI_BATCH_BUFFER_START |
-			(dispatch_flags & I915_DISPATCH_SECURE ?
-			 0 : MI_BATCH_NON_SECURE_I965));
+	*rbuf++ = MI_BATCH_BUFFER_START |
+		  (dispatch_flags & I915_DISPATCH_SECURE ?
+		  0 : MI_BATCH_NON_SECURE_I965);
 	/* bit0-7 is the length on GEN6+ */
-	intel_ring_emit(ring, offset);
-	intel_ring_advance(ring);
+	*rbuf++ = offset;
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
@@ -2488,13 +2480,11 @@ gen6_emit_bb_start(struct drm_i915_gem_request *req,
 
 static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 {
-	struct intel_ring *ring = req->ring;
-	uint32_t cmd;
-	int ret;
+	u32 cmd, *rbuf;
 
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
+	rbuf = intel_ring_begin(req, 4);
+	if (IS_ERR(rbuf))
+		return PTR_ERR(rbuf);
 
 	cmd = MI_FLUSH_DW;
 	if (INTEL_GEN(req->i915) >= 8)
@@ -2515,17 +2505,17 @@ static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 	 */
 	if (mode & EMIT_INVALIDATE)
 		cmd |= MI_INVALIDATE_TLB;
-	intel_ring_emit(ring, cmd);
-	intel_ring_emit(ring,
-			I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
+	*rbuf++ = cmd;
+	*rbuf++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 	if (INTEL_GEN(req->i915) >= 8) {
-		intel_ring_emit(ring, 0); /* upper addr */
-		intel_ring_emit(ring, 0); /* value */
+		*rbuf++ = 0; /* upper addr */
+		*rbuf++ = 0; /* value */
 	} else  {
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring, MI_NOOP);
+		*rbuf++ = 0;
+		*rbuf++ = MI_NOOP;
 	}
-	intel_ring_advance(ring);
+
+	intel_ring_advance(req->ring, rbuf);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 7f64d611159b..53dcd7b9a72d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -456,30 +456,12 @@ void intel_legacy_submission_resume(struct drm_i915_private *dev_priv);
 
 int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request);
 
-int __must_check intel_ring_begin(struct drm_i915_gem_request *req, int n);
+u32 __must_check *intel_ring_begin(struct drm_i915_gem_request *req, int n);
 int __must_check intel_ring_cacheline_align(struct drm_i915_gem_request *req);
 
-static inline void intel_ring_emit(struct intel_ring *ring, u32 data)
+static inline void intel_ring_advance(struct intel_ring *ring, u32 *rbuf)
 {
-	*(uint32_t *)(ring->vaddr + ring->tail) = data;
-	ring->tail += 4;
-}
-
-static inline void intel_ring_emit_reg(struct intel_ring *ring, i915_reg_t reg)
-{
-	intel_ring_emit(ring, i915_mmio_reg_offset(reg));
-}
-
-static inline void intel_ring_advance(struct intel_ring *ring)
-{
-	/* Dummy function.
-	 *
-	 * This serves as a placeholder in the code so that the reader
-	 * can compare against the preceding intel_ring_begin() and
-	 * check that the number of dwords emitted matches the space
-	 * reserved for the command packet (i.e. the value passed to
-	 * intel_ring_begin()).
-	 */
+	GEM_BUG_ON((ring->vaddr + ring->tail) != rbuf);
 }
 
 static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value)
-- 
1.9.1
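
Condensed, every emitter in the diff above follows the same
transformation; a minimal sketch (simplified, not verbatim from the
patch):

	/* Before: each dword went through a helper that reloaded and
	 * stored ring->tail, so the write pointer could not be kept
	 * in a register across writes.
	 */
	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;
	intel_ring_emit(ring, MI_FLUSH);
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	/* After: intel_ring_begin() reserves the space, advances the
	 * tail and hands back a u32 pointer (or an ERR_PTR on
	 * failure), so the emits compile down to sequential stores.
	 */
	rbuf = intel_ring_begin(req, 2);
	if (IS_ERR(rbuf))
		return PTR_ERR(rbuf);
	*rbuf++ = MI_FLUSH;
	*rbuf++ = MI_NOOP;
	intel_ring_advance(req->ring, rbuf);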

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


* ✓ Fi.CI.BAT: success for drm/i915: Emit to ringbuffer directly (rev3)
  2016-09-08 15:12 [RFC] drm/i915: Emit to ringbuffer directly Tvrtko Ursulin
                   ` (2 preceding siblings ...)
  2016-09-09 16:26 ` ✗ Fi.CI.BAT: failure for drm/i915: Emit to ringbuffer directly (rev2) Patchwork
@ 2016-09-12 10:19 ` Patchwork
  3 siblings, 0 replies; 15+ messages in thread
From: Patchwork @ 2016-09-12 10:19 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Emit to ringbuffer directly (rev3)
URL   : https://patchwork.freedesktop.org/series/12186/
State : success

== Summary ==

Series 12186v3 drm/i915: Emit to ringbuffer directly
https://patchwork.freedesktop.org/api/1.0/series/12186/revisions/3/mbox/

Test kms_cursor_legacy:
        Subgroup basic-cursor-vs-flip-varying-size:
                fail       -> PASS       (fi-bsw-n3050)

fi-bdw-5557u     total:254  pass:239  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:254  pass:208  dwarn:0   dfail:0   fail:0   skip:46 
fi-byt-n2820     total:254  pass:212  dwarn:0   dfail:0   fail:1   skip:41 
fi-hsw-4770k     total:254  pass:232  dwarn:0   dfail:0   fail:0   skip:22 
fi-hsw-4770r     total:254  pass:228  dwarn:0   dfail:0   fail:0   skip:26 
fi-ilk-650       total:254  pass:184  dwarn:0   dfail:0   fail:2   skip:68 
fi-ivb-3520m     total:254  pass:223  dwarn:0   dfail:0   fail:0   skip:31 
fi-ivb-3770      total:254  pass:223  dwarn:0   dfail:0   fail:0   skip:31 
fi-skl-6260u     total:254  pass:240  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:254  pass:227  dwarn:0   dfail:0   fail:1   skip:26 
fi-skl-6700k     total:254  pass:225  dwarn:1   dfail:0   fail:0   skip:28 
fi-snb-2520m     total:254  pass:209  dwarn:0   dfail:0   fail:0   skip:45 
fi-snb-2600      total:254  pass:209  dwarn:0   dfail:0   fail:0   skip:45 

Results at /archive/results/CI_IGT_test/Patchwork_2511/

bef9c1f4afe24cfff578d386bde349add65673eb drm-intel-nightly: 2016y-09m-12d-08h-35m-02s UTC integration manifest
2f42e1d drm/i915: Emit to ringbuffer directly


* Re: [PATCH v3] drm/i915: Emit to ringbuffer directly
  2016-09-12  9:44             ` [PATCH v3] " Tvrtko Ursulin
@ 2016-09-12 15:04               ` Dave Gordon
  0 siblings, 0 replies; 15+ messages in thread
From: Dave Gordon @ 2016-09-12 15:04 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx

On 12/09/16 10:44, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> This removes the usage of intel_ring_emit in favour of
> directly writing to the ring buffer.
>
> intel_ring_emit was preventing the compiler from optimising
> fetch and increment of the current ring buffer pointer and
> therefore generating very verbose code for every write.
>
> It had no useful purpose since all ringbuffer operations
> are started and ended with intel_ring_begin and
> intel_ring_advance respectively, with no bail out in the
> middle possible, so it is fine to increment the tail in
> intel_ring_begin and let the code manage the pointer
> itself.
>
> Useless instruction removal amounts to approximately
> two and a half kilobytes of saved text on my build.
>
> Not sure if this has any measurable performance
> implications but executing a ton of useless instructions
> on fast paths cannot be good.
>
> Patch is not fully polished, but it compiles and runs
> on Gen9 at least.
>
> v2:
>  * Change return from intel_ring_begin to error pointer by
>    popular demand.
>  * Move tail increment to intel_ring_advance to enable some
>    error checking.
>
> v3:
>  * Move tail advance back into intel_ring_begin.

The downside of this is that there is now no check that _advance() is
ever called, let alone called exactly once per begin. If we want to
strictly enforce the begin-advance pairing, _advance() has to do
something: either something that is necessary for the next _begin(),
or at least reset a state flag that tracks whether we're between a
begin and an advance, or between an advance and a begin (when writing
to the ring is not allowed!).
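
One cheap way to keep that check would be a debug-only flag, e.g.
something like this (untested sketch; "emitting" would be a new field
in struct intel_ring):

	/* in intel_ring_begin(), once the space has been reserved: */
	GEM_BUG_ON(ring->emitting);		/* begin after begin */
	ring->emitting = true;

	static inline void
	intel_ring_advance(struct intel_ring *ring, u32 *rbuf)
	{
		GEM_BUG_ON(!ring->emitting);	/* advance without begin */
		GEM_BUG_ON((ring->vaddr + ring->tail) != rbuf);
		ring->emitting = false;
	}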

.Dave.

>  * Rebase and tidy.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c    |  75 ++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  37 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c        |  70 ++--
>  drivers/gpu/drm/i915/intel_display.c       | 132 +++---
>  drivers/gpu/drm/i915/intel_lrc.c           | 245 ++++++-----
>  drivers/gpu/drm/i915/intel_mocs.c          |  53 ++-
>  drivers/gpu/drm/i915/intel_overlay.c       |  88 ++--
>  drivers/gpu/drm/i915/intel_ringbuffer.c    | 638 ++++++++++++++---------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |  24 +-
>  9 files changed, 655 insertions(+), 707 deletions(-)
>
[snip]

> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 7f64d611159b..53dcd7b9a72d 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -456,30 +456,12 @@ void intel_legacy_submission_resume(struct drm_i915_private *dev_priv);
>
>  int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request);
>
> -int __must_check intel_ring_begin(struct drm_i915_gem_request *req, int n);
> +u32 __must_check *intel_ring_begin(struct drm_i915_gem_request *req, int n);
>  int __must_check intel_ring_cacheline_align(struct drm_i915_gem_request *req);
>
> -static inline void intel_ring_emit(struct intel_ring *ring, u32 data)
> +static inline void intel_ring_advance(struct intel_ring *ring, u32 *rbuf)
>  {
> -	*(uint32_t *)(ring->vaddr + ring->tail) = data;
> -	ring->tail += 4;
> -}
> -
> -static inline void intel_ring_emit_reg(struct intel_ring *ring, i915_reg_t reg)
> -{
> -	intel_ring_emit(ring, i915_mmio_reg_offset(reg));
> -}
> -
> -static inline void intel_ring_advance(struct intel_ring *ring)
> -{
> -	/* Dummy function.
> -	 *
> -	 * This serves as a placeholder in the code so that the reader
> -	 * can compare against the preceding intel_ring_begin() and
> -	 * check that the number of dwords emitted matches the space
> -	 * reserved for the command packet (i.e. the value passed to
> -	 * intel_ring_begin()).
> -	 */
> +	GEM_BUG_ON((ring->vaddr + ring->tail) != rbuf);
>  }
>
>  static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value)
>


end of thread, other threads:[~2016-09-12 15:04 UTC | newest]

Thread overview: 15+ messages
2016-09-08 15:12 [RFC] drm/i915: Emit to ringbuffer directly Tvrtko Ursulin
2016-09-08 15:54 ` ✗ Fi.CI.BAT: failure for " Patchwork
2016-09-08 16:40 ` [RFC] " Chris Wilson
2016-09-09  8:32   ` Tvrtko Ursulin
2016-09-09 13:20     ` Dave Gordon
2016-09-09 13:58       ` Tvrtko Ursulin
2016-09-09 15:52         ` [RFC v2] " Tvrtko Ursulin
2016-09-09 16:04           ` Chris Wilson
2016-09-12  9:44             ` [PATCH v3] " Tvrtko Ursulin
2016-09-12 15:04               ` Dave Gordon
2016-09-09 13:40     ` [RFC] " Chris Wilson
2016-09-09 13:45     ` Chris Wilson
2016-09-09 14:14       ` Tvrtko Ursulin
2016-09-09 16:26 ` ✗ Fi.CI.BAT: failure for drm/i915: Emit to ringbuffer directly (rev2) Patchwork
2016-09-12 10:19 ` ✓ Fi.CI.BAT: success for drm/i915: Emit to ringbuffer directly (rev3) Patchwork
