All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
@ 2014-04-09  1:59 Zhao Yakui
  2014-04-09  1:59 ` [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for " Zhao Yakui
                   ` (5 more replies)
  0 siblings, 6 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-09  1:59 UTC (permalink / raw)
  To: intel-gfx


This is the patch set that tries to add the support of dual BSD rings on BDW
GT3. Based on hardware spec, the BDW GT3 has two independent BSD rings, which
can be used to process the video commands. To be simpler, it is transparent 
to user-space driver/middleware. In such case the kernel driver will decide
which ring is to dispatch the BSD video command.

As every BSD ring is powerful, it is enough to dispatch the BSD video command
based on the drm fd. In such case the different BSD ring is used for video playing
back and encoding. At the same time the coarse dispatch mechanism can help to avoid
the object synchronization between the BSD rings.


Zhao Yakui (5):
  drm/i915: Split the BDW device definition to prepare for dual BSD
    rings on BDW GT3
  drm/i915: Initialize the second BSD ring on BDW GT3 machine
  drm/i915: Handle the irq interrupt for the second BSD ring
  drm/i915: Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to
    remove the switch check warning
  drm/i915: Use the coarse mechanism based on drm fd to dispatch the BSD command
    on BDW GT3

 drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
 drivers/gpu/drm/i915/i915_drv.c            |   24 ++++++++-
 drivers/gpu/drm/i915/i915_drv.h            |    5 ++
 drivers/gpu/drm/i915/i915_gem.c            |    9 +++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gpu_error.c      |    1 +
 drivers/gpu/drm/i915/i915_irq.c            |    5 +-
 drivers/gpu/drm/i915/i915_reg.h            |    1 +
 drivers/gpu/drm/i915/intel_ringbuffer.c    |   57 ++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h    |    6 ++-
 include/drm/i915_pciids.h                  |   10 ++--
 11 files changed, 197 insertions(+), 8 deletions(-)

-- 
1.7.10.1

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for dual BSD rings on BDW GT3
  2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
@ 2014-04-09  1:59 ` Zhao Yakui
  2014-04-09 14:27   ` Daniel Vetter
  2014-04-09  1:59 ` [PATCH 2/5] drm/i915:Initialize the second BSD ring on BDW GT3 machine Zhao Yakui
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-09  1:59 UTC (permalink / raw)
  To: intel-gfx

Based on the hardware spec, the BDW GT3 has the different configuration
with the BDW GT1/GT2. So split the BDW device info definition.
This is to do the preparation for adding the Dual BSD rings on BDW GT3 machine.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c |   24 +++++++++++++++++++++++-
 include/drm/i915_pciids.h       |   10 +++++++---
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index a01faea..609f837 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -279,6 +279,26 @@ static const struct intel_device_info intel_broadwell_m_info = {
 	GEN_DEFAULT_PIPEOFFSETS,
 };
 
+static const struct intel_device_info intel_broadwell_gt3d_info = {
+	.gen = 8, .num_pipes = 3,
+	.need_gfx_hws = 1, .has_hotplug = 1,
+	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
+	.has_llc = 1,
+	.has_ddi = 1,
+	.has_fbc = 1,
+	GEN_DEFAULT_PIPEOFFSETS,
+};
+
+static const struct intel_device_info intel_broadwell_gt3m_info = {
+	.gen = 8, .is_mobile = 1, .num_pipes = 3,
+	.need_gfx_hws = 1, .has_hotplug = 1,
+	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
+	.has_llc = 1,
+	.has_ddi = 1,
+	.has_fbc = 1,
+	GEN_DEFAULT_PIPEOFFSETS,
+};
+
 /*
  * Make sure any device matches here are from most specific to most
  * general.  For example, since the Quanta match is based on the subsystem
@@ -312,7 +332,9 @@ static const struct intel_device_info intel_broadwell_m_info = {
 	INTEL_VLV_M_IDS(&intel_valleyview_m_info),	\
 	INTEL_VLV_D_IDS(&intel_valleyview_d_info),	\
 	INTEL_BDW_M_IDS(&intel_broadwell_m_info),	\
-	INTEL_BDW_D_IDS(&intel_broadwell_d_info)
+	INTEL_BDW_D_IDS(&intel_broadwell_d_info),	\
+	INTEL_BDW_GT3M_IDS(&intel_broadwell_gt3m_info),	\
+	INTEL_BDW_GT3D_IDS(&intel_broadwell_gt3d_info)
 
 static const struct pci_device_id pciidlist[] = {		/* aka */
 	INTEL_PCI_IDS,
diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index 940ece4..32d75f8 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -225,12 +225,16 @@
 
 #define INTEL_BDW_M_IDS(info) \
 	_INTEL_BDW_M_IDS(1, info), \
-	_INTEL_BDW_M_IDS(2, info), \
-	_INTEL_BDW_M_IDS(3, info)
+	_INTEL_BDW_M_IDS(2, info)
 
 #define INTEL_BDW_D_IDS(info) \
 	_INTEL_BDW_D_IDS(1, info), \
-	_INTEL_BDW_D_IDS(2, info), \
+	_INTEL_BDW_D_IDS(2, info)
+
+#define INTEL_BDW_GT3M_IDS(info) \
+	_INTEL_BDW_M_IDS(3, info)
+
+#define INTEL_BDW_GT3D_IDS(info) \
 	_INTEL_BDW_D_IDS(3, info)
 
 #endif /* _I915_PCIIDS_H */
-- 
1.7.10.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/5] drm/i915:Initialize the second BSD ring on BDW GT3 machine
  2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
  2014-04-09  1:59 ` [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for " Zhao Yakui
@ 2014-04-09  1:59 ` Zhao Yakui
  2014-04-09  1:59 ` [PATCH 3/5] drm/i915:Handle the irq interrupt for the second BSD ring Zhao Yakui
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-09  1:59 UTC (permalink / raw)
  To: intel-gfx

Based on the hardware spec, the BDW GT3 machine has two independent
BSD ring that can be used to dispatch the video commands.
So just initialize it.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |    4 +--
 drivers/gpu/drm/i915/i915_drv.h         |    2 ++
 drivers/gpu/drm/i915/i915_gem.c         |    9 +++++-
 drivers/gpu/drm/i915/i915_gpu_error.c   |    1 +
 drivers/gpu/drm/i915/i915_reg.h         |    1 +
 drivers/gpu/drm/i915/intel_ringbuffer.c |   54 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |    4 ++-
 7 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 609f837..10941c5 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -282,7 +282,7 @@ static const struct intel_device_info intel_broadwell_m_info = {
 static const struct intel_device_info intel_broadwell_gt3d_info = {
 	.gen = 8, .num_pipes = 3,
 	.need_gfx_hws = 1, .has_hotplug = 1,
-	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
+	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
 	.has_llc = 1,
 	.has_ddi = 1,
 	.has_fbc = 1,
@@ -292,7 +292,7 @@ static const struct intel_device_info intel_broadwell_gt3d_info = {
 static const struct intel_device_info intel_broadwell_gt3m_info = {
 	.gen = 8, .is_mobile = 1, .num_pipes = 3,
 	.need_gfx_hws = 1, .has_hotplug = 1,
-	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
+	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
 	.has_llc = 1,
 	.has_ddi = 1,
 	.has_fbc = 1,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 55addaa..d77f4e0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1817,7 +1817,9 @@ struct drm_i915_cmd_table {
 #define BSD_RING		(1<<VCS)
 #define BLT_RING		(1<<BCS)
 #define VEBOX_RING		(1<<VECS)
+#define BSD2_RING		(1<<VCS2)
 #define HAS_BSD(dev)            (INTEL_INFO(dev)->ring_mask & BSD_RING)
+#define HAS_BSD2(dev)		(INTEL_INFO(dev)->ring_mask & BSD2_RING)
 #define HAS_BLT(dev)            (INTEL_INFO(dev)->ring_mask & BLT_RING)
 #define HAS_VEBOX(dev)            (INTEL_INFO(dev)->ring_mask & VEBOX_RING)
 #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c70121d..1756276 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4372,13 +4372,20 @@ static int i915_gem_init_rings(struct drm_device *dev)
 			goto cleanup_blt_ring;
 	}
 
+	if (HAS_BSD2(dev)) {
+		ret = intel_init_bsd2_ring_buffer(dev);
+		if (ret)
+			goto cleanup_vebox_ring;
+	}
 
 	ret = i915_gem_set_seqno(dev, ((u32)~0 - 0x1000));
 	if (ret)
-		goto cleanup_vebox_ring;
+		goto cleanup_ring;
 
 	return 0;
 
+cleanup_ring:
+	intel_cleanup_ring_buffer(&dev_priv->ring[VCS2]);
 cleanup_vebox_ring:
 	intel_cleanup_ring_buffer(&dev_priv->ring[VECS]);
 cleanup_blt_ring:
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1005af0..f6d21b3 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -42,6 +42,7 @@ static const char *ring_str(int ring)
 	case VCS: return "bsd";
 	case BCS: return "blt";
 	case VECS: return "vebox";
+	case VCS2: return "second bsd";
 	default: return "";
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 8e60737..8f5c103 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -750,6 +750,7 @@ enum punit_power_well {
 #define RENDER_RING_BASE	0x02000
 #define BSD_RING_BASE		0x04000
 #define GEN6_BSD_RING_BASE	0x12000
+#define GEN8_BSD2_RING_BASE	0x1c000
 #define VEBOX_RING_BASE		0x1a000
 #define BLT_RING_BASE		0x22000
 #define RING_TAIL(base)		((base)+0x30)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 3d76ce1..11d0687 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1920,10 +1920,12 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 		ring->semaphore_register[VCS] = MI_SEMAPHORE_SYNC_RV;
 		ring->semaphore_register[BCS] = MI_SEMAPHORE_SYNC_RB;
 		ring->semaphore_register[VECS] = MI_SEMAPHORE_SYNC_RVE;
+		ring->semaphore_register[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
 		ring->signal_mbox[RCS] = GEN6_NOSYNC;
 		ring->signal_mbox[VCS] = GEN6_VRSYNC;
 		ring->signal_mbox[BCS] = GEN6_BRSYNC;
 		ring->signal_mbox[VECS] = GEN6_VERSYNC;
+		ring->signal_mbox[VCS2] = GEN6_NOSYNC;
 	} else if (IS_GEN5(dev)) {
 		ring->add_request = pc_render_add_request;
 		ring->flush = gen4_render_ring_flush;
@@ -2096,10 +2098,12 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
 		ring->semaphore_register[VCS] = MI_SEMAPHORE_SYNC_INVALID;
 		ring->semaphore_register[BCS] = MI_SEMAPHORE_SYNC_VB;
 		ring->semaphore_register[VECS] = MI_SEMAPHORE_SYNC_VVE;
+		ring->semaphore_register[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
 		ring->signal_mbox[RCS] = GEN6_RVSYNC;
 		ring->signal_mbox[VCS] = GEN6_NOSYNC;
 		ring->signal_mbox[BCS] = GEN6_BVSYNC;
 		ring->signal_mbox[VECS] = GEN6_VEVSYNC;
+		ring->signal_mbox[VCS2] = GEN6_NOSYNC;
 	} else {
 		ring->mmio_base = BSD_RING_BASE;
 		ring->flush = bsd_ring_flush;
@@ -2122,6 +2126,52 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
 	return intel_init_ring_buffer(dev, ring);
 }
 
+/**
+ * Initialize the second BSD ring for Broadwell GT3.
+ * It is noted that this only exists on Broadwell GT3.
+ */
+int intel_init_bsd2_ring_buffer(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *ring = &dev_priv->ring[VCS2];
+
+	if ((INTEL_INFO(dev)->gen != 8) ) {
+		DRM_ERROR("No dual-BSD ring on non-BDW GT3 machine\n");
+		return -EINVAL;
+	}
+
+	ring->name = "second bsd ring";
+	ring->id = VCS2;
+
+	ring->write_tail = ring_write_tail;
+	ring->mmio_base = GEN8_BSD2_RING_BASE;
+	ring->flush = gen6_bsd_ring_flush;
+	ring->add_request = gen6_add_request;
+	ring->get_seqno = gen6_ring_get_seqno;
+	ring->set_seqno = ring_set_seqno;
+	ring->irq_enable_mask =
+			GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
+	ring->irq_get = gen8_ring_get_irq;
+	ring->irq_put = gen8_ring_put_irq;
+	ring->dispatch_execbuffer =
+			gen8_ring_dispatch_execbuffer;
+	ring->sync_to = gen6_ring_sync;
+	ring->semaphore_register[RCS] = MI_SEMAPHORE_SYNC_VR;
+	ring->semaphore_register[VCS] = MI_SEMAPHORE_SYNC_INVALID;
+	ring->semaphore_register[BCS] = MI_SEMAPHORE_SYNC_VB;
+	ring->semaphore_register[VECS] = MI_SEMAPHORE_SYNC_VVE;
+	ring->semaphore_register[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
+	ring->signal_mbox[RCS] = GEN6_RVSYNC;
+	ring->signal_mbox[VCS] = GEN6_NOSYNC;
+	ring->signal_mbox[BCS] = GEN6_BVSYNC;
+	ring->signal_mbox[VECS] = GEN6_VEVSYNC;
+	ring->signal_mbox[VCS2] = GEN6_NOSYNC;
+
+	ring->init = init_ring_common;
+
+	return intel_init_ring_buffer(dev, ring);
+}
+
 int intel_init_blt_ring_buffer(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -2153,10 +2203,12 @@ int intel_init_blt_ring_buffer(struct drm_device *dev)
 	ring->semaphore_register[VCS] = MI_SEMAPHORE_SYNC_BV;
 	ring->semaphore_register[BCS] = MI_SEMAPHORE_SYNC_INVALID;
 	ring->semaphore_register[VECS] = MI_SEMAPHORE_SYNC_BVE;
+	ring->semaphore_register[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
 	ring->signal_mbox[RCS] = GEN6_RBSYNC;
 	ring->signal_mbox[VCS] = GEN6_VBSYNC;
 	ring->signal_mbox[BCS] = GEN6_NOSYNC;
 	ring->signal_mbox[VECS] = GEN6_VEBSYNC;
+	ring->signal_mbox[VCS2] = GEN6_NOSYNC;
 	ring->init = init_ring_common;
 
 	return intel_init_ring_buffer(dev, ring);
@@ -2194,10 +2246,12 @@ int intel_init_vebox_ring_buffer(struct drm_device *dev)
 	ring->semaphore_register[VCS] = MI_SEMAPHORE_SYNC_VEV;
 	ring->semaphore_register[BCS] = MI_SEMAPHORE_SYNC_VEB;
 	ring->semaphore_register[VECS] = MI_SEMAPHORE_SYNC_INVALID;
+	ring->semaphore_register[VCS2] = MI_SEMAPHORE_SYNC_INVALID;
 	ring->signal_mbox[RCS] = GEN6_RVESYNC;
 	ring->signal_mbox[VCS] = GEN6_VVESYNC;
 	ring->signal_mbox[BCS] = GEN6_BVESYNC;
 	ring->signal_mbox[VECS] = GEN6_NOSYNC;
+	ring->signal_mbox[VCS2] = GEN6_NOSYNC;
 	ring->init = init_ring_common;
 
 	return intel_init_ring_buffer(dev, ring);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 413cdc7..8ca4285 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -61,8 +61,9 @@ struct  intel_ring_buffer {
 		VCS,
 		BCS,
 		VECS,
+		VCS2,
 	} id;
-#define I915_NUM_RINGS 4
+#define I915_NUM_RINGS 5
 	u32		mmio_base;
 	void		__iomem *virtual_start;
 	struct		drm_device *dev;
@@ -286,6 +287,7 @@ int intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring);
 
 int intel_init_render_ring_buffer(struct drm_device *dev);
 int intel_init_bsd_ring_buffer(struct drm_device *dev);
+int intel_init_bsd2_ring_buffer(struct drm_device *dev);
 int intel_init_blt_ring_buffer(struct drm_device *dev);
 int intel_init_vebox_ring_buffer(struct drm_device *dev);
 
-- 
1.7.10.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/5] drm/i915:Handle the irq interrupt for the second BSD ring
  2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
  2014-04-09  1:59 ` [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for " Zhao Yakui
  2014-04-09  1:59 ` [PATCH 2/5] drm/i915:Initialize the second BSD ring on BDW GT3 machine Zhao Yakui
@ 2014-04-09  1:59 ` Zhao Yakui
  2014-04-09  1:59 ` [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning Zhao Yakui
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-09  1:59 UTC (permalink / raw)
  To: intel-gfx

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index bdda3b5..d5b1dd3 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1347,13 +1347,16 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
 			DRM_ERROR("The master control interrupt lied (GT0)!\n");
 	}
 
-	if (master_ctl & GEN8_GT_VCS1_IRQ) {
+	if (master_ctl & (GEN8_GT_VCS1_IRQ | GEN8_GT_VCS2_IRQ)) {
 		tmp = I915_READ(GEN8_GT_IIR(1));
 		if (tmp) {
 			ret = IRQ_HANDLED;
 			vcs = tmp >> GEN8_VCS1_IRQ_SHIFT;
 			if (vcs & GT_RENDER_USER_INTERRUPT)
 				notify_ring(dev, &dev_priv->ring[VCS]);
+			vcs = tmp >> GEN8_VCS2_IRQ_SHIFT;
+			if (vcs & GT_RENDER_USER_INTERRUPT)
+				notify_ring(dev, &dev_priv->ring[VCS2]);
 			I915_WRITE(GEN8_GT_IIR(1), tmp);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT1)!\n");
-- 
1.7.10.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning
  2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
                   ` (2 preceding siblings ...)
  2014-04-09  1:59 ` [PATCH 3/5] drm/i915:Handle the irq interrupt for the second BSD ring Zhao Yakui
@ 2014-04-09  1:59 ` Zhao Yakui
  2014-04-09 14:29   ` Daniel Vetter
  2014-04-09  1:59 ` [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3 Zhao Yakui
  2014-04-09 14:45 ` [PATCH 0/5] drm/i915: Add the support of dual BSD rings " Daniel Vetter
  5 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-09  1:59 UTC (permalink / raw)
  To: intel-gfx

The Gen7 doesn't have the second BSD ring. But it will complain the switch check
warning message during compilation. So just add it to remove the
switch check warning.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 11d0687..43e0227 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -984,6 +984,7 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
 		case BCS:
 			mmio = BLT_HWS_PGA_GEN7;
 			break;
+		case VCS2:
 		case VCS:
 			mmio = BSD_HWS_PGA_GEN7;
 			break;
-- 
1.7.10.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
                   ` (3 preceding siblings ...)
  2014-04-09  1:59 ` [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning Zhao Yakui
@ 2014-04-09  1:59 ` Zhao Yakui
  2014-04-09 14:34   ` Daniel Vetter
  2014-04-09 14:45 ` [PATCH 0/5] drm/i915: Add the support of dual BSD rings " Daniel Vetter
  5 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-09  1:59 UTC (permalink / raw)
  To: intel-gfx

The BDW GT3 has two independent BSD rings, which can be used to process the
video commands. To be simpler, it is transparent to user-space driver/middleware.
Instead the kernel driver will decide which ring is to dispatch the BSD video
command.

As every BSD ring is powerful, it is enough to dispatch the BSD video command
based on the drm fd. In such case the different BSD ring is used for video playing
back and encoding. At the same time the coarse dispatch mechanism can help to avoid
the object synchronization between the BSD rings.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
 drivers/gpu/drm/i915/i915_drv.h            |    3 ++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h    |    2 +
 5 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 0b38f88..8260463 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	spin_lock_init(&dev_priv->backlight_lock);
 	spin_lock_init(&dev_priv->uncore.lock);
 	spin_lock_init(&dev_priv->mm.object_stat_lock);
+	spin_lock_init(&dev_priv->bsd_lock);
 	mutex_init(&dev_priv->dpio_lock);
 	mutex_init(&dev_priv->modeset_restore_lock);
 
@@ -1928,7 +1929,20 @@ void i915_driver_preclose(struct drm_device * dev, struct drm_file *file_priv)
 void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct intel_ring_buffer *bsd_ring;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	if (file_priv && file_priv->bsd_ring) {
+		int cmd_counter;
+		bsd_ring = file_priv->bsd_ring;
+		file_priv->bsd_ring = NULL;
+		spin_lock(&dev_priv->bsd_lock);
+		cmd_counter = atomic_sub_return(1, &bsd_ring->bsd_cmd_counter);
+		if (cmd_counter < 0) {
+			atomic_set(&bsd_ring->bsd_cmd_counter, 0);
+		}
+		spin_unlock(&dev_priv->bsd_lock);
+	}
 	kfree(file_priv);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d77f4e0..128639c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1457,6 +1457,8 @@ struct drm_i915_private {
 	struct i915_dri1_state dri1;
 	/* Old ums support infrastructure, same warning applies. */
 	struct i915_ums_state ums;
+	/* the lock for dispatch video commands on two BSD rings */
+	spinlock_t bsd_lock;
 };
 
 static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
@@ -1664,6 +1666,7 @@ struct drm_i915_file_private {
 
 	struct i915_hw_context *private_default_ctx;
 	atomic_t rps_wait_boost;
+	struct  intel_ring_buffer *bsd_ring;
 };
 
 /*
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3491402..75d8cc0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -999,6 +999,70 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
 	return 0;
 }
 
+/**
+ * Find one BSD ring to dispatch the corresponding BSD command.
+ * The Ring ID is returned.
+ */
+static int gen8_dispatch_bsd_ring(struct drm_device *dev,
+				  struct drm_file *file)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct intel_ring_buffer *temp_ring, *bsd_ring;
+	int bsd_counter, temp_counter;
+
+	if (file_priv->bsd_ring) {
+		/* Check whether the load balance is required.*/
+		spin_lock(&dev_priv->bsd_lock);
+		bsd_counter = atomic_read(&(file_priv->bsd_ring->bsd_cmd_counter));
+		temp_ring = &dev_priv->ring[VCS];
+		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
+		bsd_ring = &dev_priv->ring[VCS];
+
+		temp_ring = &dev_priv->ring[VCS2];
+		if (atomic_read(&temp_ring->bsd_cmd_counter) < temp_counter) {
+			temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
+			bsd_ring = temp_ring;
+		}
+		/*
+		 * If it is already the ring with the minimum load, it is
+		 * unnecessary to switch it.
+		 */
+		if (bsd_ring == file_priv->bsd_ring) {
+			spin_unlock(&dev_priv->bsd_lock);
+			return bsd_ring->id;
+		}
+		/*
+		 * If the load delta between current ring and target ring is
+		 * small, it is unnecessary to switch it.
+		 */
+		if ((bsd_counter - temp_counter) <= 1) {
+			spin_unlock(&dev_priv->bsd_lock);
+			return file_priv->bsd_ring->id;
+		}
+		/* balance the load between current ring and target ring */
+		atomic_dec(&file_priv->bsd_ring->bsd_cmd_counter);
+		atomic_inc(&bsd_ring->bsd_cmd_counter);
+		spin_unlock(&dev_priv->bsd_lock);
+		file_priv->bsd_ring = bsd_ring;
+		return bsd_ring->id;
+	} else {
+		spin_lock(&dev_priv->bsd_lock);
+		bsd_ring = &dev_priv->ring[VCS];
+		bsd_counter = atomic_read(&bsd_ring->bsd_cmd_counter);
+		temp_ring = &dev_priv->ring[VCS2];
+		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
+		if (temp_counter < bsd_counter) {
+			bsd_ring = temp_ring;
+			bsd_counter = temp_counter;
+		}
+		atomic_inc(&bsd_ring->bsd_cmd_counter);
+		file_priv->bsd_ring = bsd_ring;
+		spin_unlock(&dev_priv->bsd_lock);
+		return bsd_ring->id;
+	}
+}
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -1043,7 +1107,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
 		ring = &dev_priv->ring[RCS];
-	else
+	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
+		if (HAS_BSD2(dev)) {
+			int ring_id;
+			ring_id = gen8_dispatch_bsd_ring(dev, file);
+			ring = &dev_priv->ring[ring_id];
+		} else
+			ring = &dev_priv->ring[VCS];
+	} else
 		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
 
 	if (!intel_ring_initialized(ring)) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 43e0227..852fc29 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2123,6 +2123,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
 		ring->dispatch_execbuffer = i965_dispatch_execbuffer;
 	}
 	ring->init = init_ring_common;
+	atomic_set(&ring->bsd_cmd_counter, 0);
 
 	return intel_init_ring_buffer(dev, ring);
 }
@@ -2170,6 +2171,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
 
 	ring->init = init_ring_common;
 
+	atomic_set(&ring->bsd_cmd_counter, 0);
 	return intel_init_ring_buffer(dev, ring);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 8ca4285..4f87b08 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -196,6 +196,8 @@ struct  intel_ring_buffer {
 	 * to encode the command length in the header).
 	 */
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
+
+	atomic_t bsd_cmd_counter;
 };
 
 static inline bool
-- 
1.7.10.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for dual BSD rings on BDW GT3
  2014-04-09  1:59 ` [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for " Zhao Yakui
@ 2014-04-09 14:27   ` Daniel Vetter
  2014-04-10  0:44     ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-09 14:27 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Wed, Apr 09, 2014 at 09:59:52AM +0800, Zhao Yakui wrote:
> Based on the hardware spec, the BDW GT3 has the different configuration
> with the BDW GT1/GT2. So split the BDW device info definition.
> This is to do the preparation for adding the Dual BSD rings on BDW GT3 machine.
> 
> Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c |   24 +++++++++++++++++++++++-
>  include/drm/i915_pciids.h       |   10 +++++++---
>  2 files changed, 30 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index a01faea..609f837 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -279,6 +279,26 @@ static const struct intel_device_info intel_broadwell_m_info = {
>  	GEN_DEFAULT_PIPEOFFSETS,
>  };
>  
> +static const struct intel_device_info intel_broadwell_gt3d_info = {
> +	.gen = 8, .num_pipes = 3,
> +	.need_gfx_hws = 1, .has_hotplug = 1,
> +	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
> +	.has_llc = 1,
> +	.has_ddi = 1,
> +	.has_fbc = 1,
> +	GEN_DEFAULT_PIPEOFFSETS,
> +};
> +
> +static const struct intel_device_info intel_broadwell_gt3m_info = {
> +	.gen = 8, .is_mobile = 1, .num_pipes = 3,
> +	.need_gfx_hws = 1, .has_hotplug = 1,
> +	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
> +	.has_llc = 1,
> +	.has_ddi = 1,
> +	.has_fbc = 1,
> +	GEN_DEFAULT_PIPEOFFSETS,
> +};
> +
>  /*
>   * Make sure any device matches here are from most specific to most
>   * general.  For example, since the Quanta match is based on the subsystem
> @@ -312,7 +332,9 @@ static const struct intel_device_info intel_broadwell_m_info = {
>  	INTEL_VLV_M_IDS(&intel_valleyview_m_info),	\
>  	INTEL_VLV_D_IDS(&intel_valleyview_d_info),	\
>  	INTEL_BDW_M_IDS(&intel_broadwell_m_info),	\
> -	INTEL_BDW_D_IDS(&intel_broadwell_d_info)
> +	INTEL_BDW_D_IDS(&intel_broadwell_d_info),	\
> +	INTEL_BDW_GT3M_IDS(&intel_broadwell_gt3m_info),	\
> +	INTEL_BDW_GT3D_IDS(&intel_broadwell_gt3d_info)

You've forgotten to update the stolen memory quirk table in the x86 code.
Just grep for INTEL_BDW_M_IDS to see all users of these macros.
-Daniel

>  
>  static const struct pci_device_id pciidlist[] = {		/* aka */
>  	INTEL_PCI_IDS,
> diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
> index 940ece4..32d75f8 100644
> --- a/include/drm/i915_pciids.h
> +++ b/include/drm/i915_pciids.h
> @@ -225,12 +225,16 @@
>  
>  #define INTEL_BDW_M_IDS(info) \
>  	_INTEL_BDW_M_IDS(1, info), \
> -	_INTEL_BDW_M_IDS(2, info), \
> -	_INTEL_BDW_M_IDS(3, info)
> +	_INTEL_BDW_M_IDS(2, info)
>  
>  #define INTEL_BDW_D_IDS(info) \
>  	_INTEL_BDW_D_IDS(1, info), \
> -	_INTEL_BDW_D_IDS(2, info), \
> +	_INTEL_BDW_D_IDS(2, info)
> +
> +#define INTEL_BDW_GT3M_IDS(info) \
> +	_INTEL_BDW_M_IDS(3, info)
> +
> +#define INTEL_BDW_GT3D_IDS(info) \
>  	_INTEL_BDW_D_IDS(3, info)
>  
>  #endif /* _I915_PCIIDS_H */
> -- 
> 1.7.10.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning
  2014-04-09  1:59 ` [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning Zhao Yakui
@ 2014-04-09 14:29   ` Daniel Vetter
  2014-04-10  0:45     ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-09 14:29 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Wed, Apr 09, 2014 at 09:59:55AM +0800, Zhao Yakui wrote:
> The Gen7 doesn't have the second BSD ring. But it will complain the switch check
> warning message during compilation. So just add it to remove the
> switch check warning.
> 
> Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c |    1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 11d0687..43e0227 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -984,6 +984,7 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
>  		case BCS:
>  			mmio = BLT_HWS_PGA_GEN7;
>  			break;
> +		case VCS2:

Maybe add a /* doesn't actually exist but shuts up gcc */ comment?
-Daniel

>  		case VCS:
>  			mmio = BSD_HWS_PGA_GEN7;
>  			break;
> -- 
> 1.7.10.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-09  1:59 ` [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3 Zhao Yakui
@ 2014-04-09 14:34   ` Daniel Vetter
  2014-04-10  2:24     ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-09 14:34 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Wed, Apr 09, 2014 at 09:59:56AM +0800, Zhao Yakui wrote:
> The BDW GT3 has two independent BSD rings, which can be used to process the
> video commands. To be simpler, it is transparent to user-space driver/middleware.
> Instead the kernel driver will decide which ring is to dispatch the BSD video
> command.
> 
> As every BSD ring is powerful, it is enough to dispatch the BSD video command
> based on the drm fd. In such case the different BSD ring is used for video playing
> back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> the object synchronization between the BSD rings.
> 
> Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>

This looks way too complicated. First things first please get rid of the
atomic_t usage. If you don't have _massive_ comments explaining the memory
barriers you're most likely using linux kernel atomic_t wrong. They are
fully unordered.

With that out of the way this still looks a bit complicated really. Can't
we just use a very simple static rule in gen8_dispatch_bsd_ring which
hashed the pointer address of the file_priv? Just to get things going,
once we have a clear need we can try to make things more intelligent. But
in case of doubt I really prefer if we start with the dumbest possible
approach first and add complexity instead of starting with something
really complex and simplifying it.
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
>  drivers/gpu/drm/i915/i915_drv.h            |    3 ++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |    2 +
>  5 files changed, 93 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 0b38f88..8260463 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  	spin_lock_init(&dev_priv->backlight_lock);
>  	spin_lock_init(&dev_priv->uncore.lock);
>  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> +	spin_lock_init(&dev_priv->bsd_lock);
>  	mutex_init(&dev_priv->dpio_lock);
>  	mutex_init(&dev_priv->modeset_restore_lock);
>  
> @@ -1928,7 +1929,20 @@ void i915_driver_preclose(struct drm_device * dev, struct drm_file *file_priv)
>  void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
>  {
>  	struct drm_i915_file_private *file_priv = file->driver_priv;
> +	struct intel_ring_buffer *bsd_ring;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> +	if (file_priv && file_priv->bsd_ring) {
> +		int cmd_counter;
> +		bsd_ring = file_priv->bsd_ring;
> +		file_priv->bsd_ring = NULL;
> +		spin_lock(&dev_priv->bsd_lock);
> +		cmd_counter = atomic_sub_return(1, &bsd_ring->bsd_cmd_counter);
> +		if (cmd_counter < 0) {
> +			atomic_set(&bsd_ring->bsd_cmd_counter, 0);
> +		}
> +		spin_unlock(&dev_priv->bsd_lock);
> +	}
>  	kfree(file_priv);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d77f4e0..128639c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1457,6 +1457,8 @@ struct drm_i915_private {
>  	struct i915_dri1_state dri1;
>  	/* Old ums support infrastructure, same warning applies. */
>  	struct i915_ums_state ums;
> +	/* the lock for dispatch video commands on two BSD rings */
> +	spinlock_t bsd_lock;
>  };
>  
>  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> @@ -1664,6 +1666,7 @@ struct drm_i915_file_private {
>  
>  	struct i915_hw_context *private_default_ctx;
>  	atomic_t rps_wait_boost;
> +	struct  intel_ring_buffer *bsd_ring;
>  };
>  
>  /*
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 3491402..75d8cc0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -999,6 +999,70 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
>  	return 0;
>  }
>  
> +/**
> + * Find one BSD ring to dispatch the corresponding BSD command.
> + * The Ring ID is returned.
> + */
> +static int gen8_dispatch_bsd_ring(struct drm_device *dev,
> +				  struct drm_file *file)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_i915_file_private *file_priv = file->driver_priv;
> +	struct intel_ring_buffer *temp_ring, *bsd_ring;
> +	int bsd_counter, temp_counter;
> +
> +	if (file_priv->bsd_ring) {
> +		/* Check whether the load balance is required.*/
> +		spin_lock(&dev_priv->bsd_lock);
> +		bsd_counter = atomic_read(&(file_priv->bsd_ring->bsd_cmd_counter));
> +		temp_ring = &dev_priv->ring[VCS];
> +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> +		bsd_ring = &dev_priv->ring[VCS];
> +
> +		temp_ring = &dev_priv->ring[VCS2];
> +		if (atomic_read(&temp_ring->bsd_cmd_counter) < temp_counter) {
> +			temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> +			bsd_ring = temp_ring;
> +		}
> +		/*
> +		 * If it is already the ring with the minimum load, it is
> +		 * unnecessary to switch it.
> +		 */
> +		if (bsd_ring == file_priv->bsd_ring) {
> +			spin_unlock(&dev_priv->bsd_lock);
> +			return bsd_ring->id;
> +		}
> +		/*
> +		 * If the load delta between current ring and target ring is
> +		 * small, it is unnecessary to switch it.
> +		 */
> +		if ((bsd_counter - temp_counter) <= 1) {
> +			spin_unlock(&dev_priv->bsd_lock);
> +			return file_priv->bsd_ring->id;
> +		}
> +		/* balance the load between current ring and target ring */
> +		atomic_dec(&file_priv->bsd_ring->bsd_cmd_counter);
> +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> +		spin_unlock(&dev_priv->bsd_lock);
> +		file_priv->bsd_ring = bsd_ring;
> +		return bsd_ring->id;
> +	} else {
> +		spin_lock(&dev_priv->bsd_lock);
> +		bsd_ring = &dev_priv->ring[VCS];
> +		bsd_counter = atomic_read(&bsd_ring->bsd_cmd_counter);
> +		temp_ring = &dev_priv->ring[VCS2];
> +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> +		if (temp_counter < bsd_counter) {
> +			bsd_ring = temp_ring;
> +			bsd_counter = temp_counter;
> +		}
> +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> +		file_priv->bsd_ring = bsd_ring;
> +		spin_unlock(&dev_priv->bsd_lock);
> +		return bsd_ring->id;
> +	}
> +}
> +
>  static int
>  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		       struct drm_file *file,
> @@ -1043,7 +1107,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  
>  	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
>  		ring = &dev_priv->ring[RCS];
> -	else
> +	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
> +		if (HAS_BSD2(dev)) {
> +			int ring_id;
> +			ring_id = gen8_dispatch_bsd_ring(dev, file);
> +			ring = &dev_priv->ring[ring_id];
> +		} else
> +			ring = &dev_priv->ring[VCS];
> +	} else
>  		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
>  
>  	if (!intel_ring_initialized(ring)) {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 43e0227..852fc29 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2123,6 +2123,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
>  		ring->dispatch_execbuffer = i965_dispatch_execbuffer;
>  	}
>  	ring->init = init_ring_common;
> +	atomic_set(&ring->bsd_cmd_counter, 0);
>  
>  	return intel_init_ring_buffer(dev, ring);
>  }
> @@ -2170,6 +2171,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
>  
>  	ring->init = init_ring_common;
>  
> +	atomic_set(&ring->bsd_cmd_counter, 0);
>  	return intel_init_ring_buffer(dev, ring);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 8ca4285..4f87b08 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -196,6 +196,8 @@ struct  intel_ring_buffer {
>  	 * to encode the command length in the header).
>  	 */
>  	u32 (*get_cmd_length_mask)(u32 cmd_header);
> +
> +	atomic_t bsd_cmd_counter;
>  };
>  
>  static inline bool
> -- 
> 1.7.10.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
                   ` (4 preceding siblings ...)
  2014-04-09  1:59 ` [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3 Zhao Yakui
@ 2014-04-09 14:45 ` Daniel Vetter
  2014-04-10  3:28   ` Zhao Yakui
  5 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-09 14:45 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Wed, Apr 09, 2014 at 09:59:51AM +0800, Zhao Yakui wrote:
> 
> This is the patch set that tries to add the support of dual BSD rings on BDW
> GT3. Based on hardware spec, the BDW GT3 has two independent BSD rings, which
> can be used to process the video commands. To be simpler, it is transparent 
> to user-space driver/middleware. In such case the kernel driver will decide
> which ring is to dispatch the BSD video command.
> 
> As every BSD ring is powerful, it is enough to dispatch the BSD video command
> based on the drm fd. In such case the different BSD ring is used for video playing
> back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> the object synchronization between the BSD rings.

Ok, I've quickly read through it all and commented on a few things. Imo
the last patch should be massively simplified, at least for the first
round. Other things look small.

What's still missing are testcases, and I have two things in mind here:
- Exercise the 2nd ring dispatch and sync a bit. Since the 2nd bsd ring is
  hidden within the kernel I think the right approach would be to open a
  few drm fds (10 or so) and then randomly use them with a dummy reloc. We
  have two testcases which can be used as blueprints that need
  adjustement:

  - gem_ring_sync_loop: Probably easiest to copy it to a new file as
    gem_multi_bsd_sync_loop. This test exercises semaphores.
  - gem_dummy_reloc_loop, subtest mixed: Almost the same as the above, but
    the sync is done _inside_ the loop and hence this exercises gpu/cpu
    sync. We need both tests adjusted, for for this we need a new
    multi-bsd test.

- New testcase to fully test main execbuffer flags. This is simply
  something that's we don't yet have. The next guy to touch execbuf code
  needs to add it, and it looks like that's you ;-) I've done a JIRA task
  for the resource streamer work, but I think the resource streamer wont
  be merged anytime soon. So I'll reassign to you. Jira task is VIZ-3129.

Thanks, Daniel

> 
> 
> Zhao Yakui (5):
>   drm/i915: Split the BDW device definition to prepare for dual BSD
>     rings on BDW GT3
>   drm/i915: Initialize the second BSD ring on BDW GT3 machine
>   drm/i915: Handle the irq interrupt for the second BSD ring
>   drm/i915: Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to
>     remove the switch check warning
>   drm/i915: Use the coarse mechanism based on drm fd to dispatch the BSD command
>     on BDW GT3
> 
>  drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
>  drivers/gpu/drm/i915/i915_drv.c            |   24 ++++++++-
>  drivers/gpu/drm/i915/i915_drv.h            |    5 ++
>  drivers/gpu/drm/i915/i915_gem.c            |    9 +++-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gpu_error.c      |    1 +
>  drivers/gpu/drm/i915/i915_irq.c            |    5 +-
>  drivers/gpu/drm/i915/i915_reg.h            |    1 +
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |   57 ++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |    6 ++-
>  include/drm/i915_pciids.h                  |   10 ++--
>  11 files changed, 197 insertions(+), 8 deletions(-)
> 
> -- 
> 1.7.10.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for dual BSD rings on BDW GT3
  2014-04-09 14:27   ` Daniel Vetter
@ 2014-04-10  0:44     ` Zhao Yakui
  0 siblings, 0 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-10  0:44 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, 2014-04-09 at 08:27 -0600, Daniel Vetter wrote:
> On Wed, Apr 09, 2014 at 09:59:52AM +0800, Zhao Yakui wrote:
> > Based on the hardware spec, the BDW GT3 has the different configuration
> > with the BDW GT1/GT2. So split the BDW device info definition.
> > This is to do the preparation for adding the Dual BSD rings on BDW GT3 machine.
> > 
> > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c |   24 +++++++++++++++++++++++-
> >  include/drm/i915_pciids.h       |   10 +++++++---
> >  2 files changed, 30 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index a01faea..609f837 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -279,6 +279,26 @@ static const struct intel_device_info intel_broadwell_m_info = {
> >  	GEN_DEFAULT_PIPEOFFSETS,
> >  };
> >  
> > +static const struct intel_device_info intel_broadwell_gt3d_info = {
> > +	.gen = 8, .num_pipes = 3,
> > +	.need_gfx_hws = 1, .has_hotplug = 1,
> > +	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
> > +	.has_llc = 1,
> > +	.has_ddi = 1,
> > +	.has_fbc = 1,
> > +	GEN_DEFAULT_PIPEOFFSETS,
> > +};
> > +
> > +static const struct intel_device_info intel_broadwell_gt3m_info = {
> > +	.gen = 8, .is_mobile = 1, .num_pipes = 3,
> > +	.need_gfx_hws = 1, .has_hotplug = 1,
> > +	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
> > +	.has_llc = 1,
> > +	.has_ddi = 1,
> > +	.has_fbc = 1,
> > +	GEN_DEFAULT_PIPEOFFSETS,
> > +};
> > +
> >  /*
> >   * Make sure any device matches here are from most specific to most
> >   * general.  For example, since the Quanta match is based on the subsystem
> > @@ -312,7 +332,9 @@ static const struct intel_device_info intel_broadwell_m_info = {
> >  	INTEL_VLV_M_IDS(&intel_valleyview_m_info),	\
> >  	INTEL_VLV_D_IDS(&intel_valleyview_d_info),	\
> >  	INTEL_BDW_M_IDS(&intel_broadwell_m_info),	\
> > -	INTEL_BDW_D_IDS(&intel_broadwell_d_info)
> > +	INTEL_BDW_D_IDS(&intel_broadwell_d_info),	\
> > +	INTEL_BDW_GT3M_IDS(&intel_broadwell_gt3m_info),	\
> > +	INTEL_BDW_GT3D_IDS(&intel_broadwell_gt3d_info)
> 
> You've forgotten to update the stolen memory quirk table in the x86 code.
> Just grep for INTEL_BDW_M_IDS to see all users of these macros.

Thanks for your info. 
I will update it in next version.

Thanks.
    Yakui

> -Daniel
> 
> >  
> >  static const struct pci_device_id pciidlist[] = {		/* aka */
> >  	INTEL_PCI_IDS,
> > diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
> > index 940ece4..32d75f8 100644
> > --- a/include/drm/i915_pciids.h
> > +++ b/include/drm/i915_pciids.h
> > @@ -225,12 +225,16 @@
> >  
> >  #define INTEL_BDW_M_IDS(info) \
> >  	_INTEL_BDW_M_IDS(1, info), \
> > -	_INTEL_BDW_M_IDS(2, info), \
> > -	_INTEL_BDW_M_IDS(3, info)
> > +	_INTEL_BDW_M_IDS(2, info)
> >  
> >  #define INTEL_BDW_D_IDS(info) \
> >  	_INTEL_BDW_D_IDS(1, info), \
> > -	_INTEL_BDW_D_IDS(2, info), \
> > +	_INTEL_BDW_D_IDS(2, info)
> > +
> > +#define INTEL_BDW_GT3M_IDS(info) \
> > +	_INTEL_BDW_M_IDS(3, info)
> > +
> > +#define INTEL_BDW_GT3D_IDS(info) \
> >  	_INTEL_BDW_D_IDS(3, info)
> >  
> >  #endif /* _I915_PCIIDS_H */
> > -- 
> > 1.7.10.1
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning
  2014-04-09 14:29   ` Daniel Vetter
@ 2014-04-10  0:45     ` Zhao Yakui
  0 siblings, 0 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-10  0:45 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, 2014-04-09 at 08:29 -0600, Daniel Vetter wrote:
> On Wed, Apr 09, 2014 at 09:59:55AM +0800, Zhao Yakui wrote:
> > The Gen7 doesn't have the second BSD ring. But it will complain the switch check
> > warning message during compilation. So just add it to remove the
> > switch check warning.
> > 
> > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > ---
> >  drivers/gpu/drm/i915/intel_ringbuffer.c |    1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 11d0687..43e0227 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -984,6 +984,7 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
> >  		case BCS:
> >  			mmio = BLT_HWS_PGA_GEN7;
> >  			break;
> > +		case VCS2:
> 
> Maybe add a /* doesn't actually exist but shuts up gcc */ comment?

Make sense.

I will update it.

Thanks.
    Yakui
> -Daniel
> 
> >  		case VCS:
> >  			mmio = BSD_HWS_PGA_GEN7;
> >  			break;
> > -- 
> > 1.7.10.1
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-09 14:34   ` Daniel Vetter
@ 2014-04-10  2:24     ` Zhao Yakui
  2014-04-10  6:48       ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-10  2:24 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, 2014-04-09 at 08:34 -0600, Daniel Vetter wrote:
> On Wed, Apr 09, 2014 at 09:59:56AM +0800, Zhao Yakui wrote:
> > The BDW GT3 has two independent BSD rings, which can be used to process the
> > video commands. To be simpler, it is transparent to user-space driver/middleware.
> > Instead the kernel driver will decide which ring is to dispatch the BSD video
> > command.
> > 
> > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > based on the drm fd. In such case the different BSD ring is used for video playing
> > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > the object synchronization between the BSD rings.
> > 
> > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> 
> This looks way too complicated. First things first please get rid of the
> atomic_t usage. If you don't have _massive_ comments explaining the memory
> barriers you're most likely using linux kernel atomic_t wrong. They are
> fully unordered.

Thanks for the review.

For the atomic_t usage:  I will remove it in next version as the counter
is already protected by the lock.  

> 
> With that out of the way this still looks a bit complicated really. Can't
> we just use a very simple static rule in gen8_dispatch_bsd_ring which
> hashed the pointer address of the file_priv? Just to get things going,
> once we have a clear need we can try to make things more intelligent. But
> in case of doubt I really prefer if we start with the dumbest possible
> approach first and add complexity instead of starting with something
> really complex and simplifying it.

Do you mean that file_priv is hashed and then is mapped to BSD 0 or 1
ring?  
The GT3 machine has two independent BSD rings. It will be better that
the kernel driver can balance the video workload between the two rings. 
When using the hashed file_priv to select BSD ring, the video balance
depends on the design of hash design. Under some scenarios, it will be
possible that one ring is very busy while another ring is very idle. And
then performance of video playing back/encoding will be affected.
At the same time the hash mechanism is only used to select the
corresponding BSD ring when one drm_fd is opened. And it doesn't
consider the video workload balance after finishing some workloads.

The following is the basic idea in my patch.(A counter variable is added
for ring. The bigger the counter, the higher the workload).
   a. When one new fd needs to dispatch the BSD video command, it will
select the ring with the lowest workload(lowest counter). And then
counter in this ring will be added.
   b. when the drm fd is closed(the workload is finished), the counter
of the ring used by file_priv will be decreased. 
   c. When the drm fd already selects one BSD ring in previously
submitted command, it will check whether it is using the ring with the
lowest workload(lowest counter). If not, it can be switched. The purpose
is to assure that the workload is still balanced between the two BSD
rings. For example: User wants to play back four video clips. BSD 0 ring
is selected to play back the two long clips. BSD 1 ring is selected to
play back the two short clips. After it finishes the playing back of two
short clips, the BSD 1 ring can be switched to play back the long clip.
Still balance.

What do you think?

> -Daniel
> 
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
> >  drivers/gpu/drm/i915/i915_drv.h            |    3 ++
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.h    |    2 +
> >  5 files changed, 93 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 0b38f88..8260463 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
> >  	spin_lock_init(&dev_priv->backlight_lock);
> >  	spin_lock_init(&dev_priv->uncore.lock);
> >  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> > +	spin_lock_init(&dev_priv->bsd_lock);
> >  	mutex_init(&dev_priv->dpio_lock);
> >  	mutex_init(&dev_priv->modeset_restore_lock);
> >  
> > @@ -1928,7 +1929,20 @@ void i915_driver_preclose(struct drm_device * dev, struct drm_file *file_priv)
> >  void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
> >  {
> >  	struct drm_i915_file_private *file_priv = file->driver_priv;
> > +	struct intel_ring_buffer *bsd_ring;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> >  
> > +	if (file_priv && file_priv->bsd_ring) {
> > +		int cmd_counter;
> > +		bsd_ring = file_priv->bsd_ring;
> > +		file_priv->bsd_ring = NULL;
> > +		spin_lock(&dev_priv->bsd_lock);
> > +		cmd_counter = atomic_sub_return(1, &bsd_ring->bsd_cmd_counter);
> > +		if (cmd_counter < 0) {
> > +			atomic_set(&bsd_ring->bsd_cmd_counter, 0);
> > +		}
> > +		spin_unlock(&dev_priv->bsd_lock);
> > +	}
> >  	kfree(file_priv);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index d77f4e0..128639c 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1457,6 +1457,8 @@ struct drm_i915_private {
> >  	struct i915_dri1_state dri1;
> >  	/* Old ums support infrastructure, same warning applies. */
> >  	struct i915_ums_state ums;
> > +	/* the lock for dispatch video commands on two BSD rings */
> > +	spinlock_t bsd_lock;
> >  };
> >  
> >  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> > @@ -1664,6 +1666,7 @@ struct drm_i915_file_private {
> >  
> >  	struct i915_hw_context *private_default_ctx;
> >  	atomic_t rps_wait_boost;
> > +	struct  intel_ring_buffer *bsd_ring;
> >  };
> >  
> >  /*
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 3491402..75d8cc0 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -999,6 +999,70 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
> >  	return 0;
> >  }
> >  
> > +/**
> > + * Find one BSD ring to dispatch the corresponding BSD command.
> > + * The Ring ID is returned.
> > + */
> > +static int gen8_dispatch_bsd_ring(struct drm_device *dev,
> > +				  struct drm_file *file)
> > +{
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	struct drm_i915_file_private *file_priv = file->driver_priv;
> > +	struct intel_ring_buffer *temp_ring, *bsd_ring;
> > +	int bsd_counter, temp_counter;
> > +
> > +	if (file_priv->bsd_ring) {
> > +		/* Check whether the load balance is required.*/
> > +		spin_lock(&dev_priv->bsd_lock);
> > +		bsd_counter = atomic_read(&(file_priv->bsd_ring->bsd_cmd_counter));
> > +		temp_ring = &dev_priv->ring[VCS];
> > +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > +		bsd_ring = &dev_priv->ring[VCS];
> > +
> > +		temp_ring = &dev_priv->ring[VCS2];
> > +		if (atomic_read(&temp_ring->bsd_cmd_counter) < temp_counter) {
> > +			temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > +			bsd_ring = temp_ring;
> > +		}
> > +		/*
> > +		 * If it is already the ring with the minimum load, it is
> > +		 * unnecessary to switch it.
> > +		 */
> > +		if (bsd_ring == file_priv->bsd_ring) {
> > +			spin_unlock(&dev_priv->bsd_lock);
> > +			return bsd_ring->id;
> > +		}
> > +		/*
> > +		 * If the load delta between current ring and target ring is
> > +		 * small, it is unnecessary to switch it.
> > +		 */
> > +		if ((bsd_counter - temp_counter) <= 1) {
> > +			spin_unlock(&dev_priv->bsd_lock);
> > +			return file_priv->bsd_ring->id;
> > +		}
> > +		/* balance the load between current ring and target ring */
> > +		atomic_dec(&file_priv->bsd_ring->bsd_cmd_counter);
> > +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> > +		spin_unlock(&dev_priv->bsd_lock);
> > +		file_priv->bsd_ring = bsd_ring;
> > +		return bsd_ring->id;
> > +	} else {
> > +		spin_lock(&dev_priv->bsd_lock);
> > +		bsd_ring = &dev_priv->ring[VCS];
> > +		bsd_counter = atomic_read(&bsd_ring->bsd_cmd_counter);
> > +		temp_ring = &dev_priv->ring[VCS2];
> > +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > +		if (temp_counter < bsd_counter) {
> > +			bsd_ring = temp_ring;
> > +			bsd_counter = temp_counter;
> > +		}
> > +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> > +		file_priv->bsd_ring = bsd_ring;
> > +		spin_unlock(&dev_priv->bsd_lock);
> > +		return bsd_ring->id;
> > +	}
> > +}
> > +
> >  static int
> >  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >  		       struct drm_file *file,
> > @@ -1043,7 +1107,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >  
> >  	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
> >  		ring = &dev_priv->ring[RCS];
> > -	else
> > +	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
> > +		if (HAS_BSD2(dev)) {
> > +			int ring_id;
> > +			ring_id = gen8_dispatch_bsd_ring(dev, file);
> > +			ring = &dev_priv->ring[ring_id];
> > +		} else
> > +			ring = &dev_priv->ring[VCS];
> > +	} else
> >  		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
> >  
> >  	if (!intel_ring_initialized(ring)) {
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 43e0227..852fc29 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -2123,6 +2123,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
> >  		ring->dispatch_execbuffer = i965_dispatch_execbuffer;
> >  	}
> >  	ring->init = init_ring_common;
> > +	atomic_set(&ring->bsd_cmd_counter, 0);
> >  
> >  	return intel_init_ring_buffer(dev, ring);
> >  }
> > @@ -2170,6 +2171,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
> >  
> >  	ring->init = init_ring_common;
> >  
> > +	atomic_set(&ring->bsd_cmd_counter, 0);
> >  	return intel_init_ring_buffer(dev, ring);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index 8ca4285..4f87b08 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -196,6 +196,8 @@ struct  intel_ring_buffer {
> >  	 * to encode the command length in the header).
> >  	 */
> >  	u32 (*get_cmd_length_mask)(u32 cmd_header);
> > +
> > +	atomic_t bsd_cmd_counter;
> >  };
> >  
> >  static inline bool
> > -- 
> > 1.7.10.1
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-09 14:45 ` [PATCH 0/5] drm/i915: Add the support of dual BSD rings " Daniel Vetter
@ 2014-04-10  3:28   ` Zhao Yakui
  2014-04-10  6:58     ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-10  3:28 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, 2014-04-09 at 08:45 -0600, Daniel Vetter wrote:
> On Wed, Apr 09, 2014 at 09:59:51AM +0800, Zhao Yakui wrote:
> > 
> > This is the patch set that tries to add the support of dual BSD rings on BDW
> > GT3. Based on hardware spec, the BDW GT3 has two independent BSD rings, which
> > can be used to process the video commands. To be simpler, it is transparent 
> > to user-space driver/middleware. In such case the kernel driver will decide
> > which ring is to dispatch the BSD video command.
> > 
> > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > based on the drm fd. In such case the different BSD ring is used for video playing
> > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > the object synchronization between the BSD rings.
> 
> Ok, I've quickly read through it all and commented on a few things. Imo
> the last patch should be massively simplified, at least for the first
> round. Other things look small.
> 
Hi, Daniel

Thanks for your review.

> What's still missing are testcases, and I have two things in mind here:
> - Exercise the 2nd ring dispatch and sync a bit. Since the 2nd bsd ring is
>   hidden within the kernel I think the right approach would be to open a
>   few drm fds (10 or so) and then randomly use them with a dummy reloc. We
>   have two testcases which can be used as blueprints that need
>   adjustement:
> 
>   - gem_ring_sync_loop: Probably easiest to copy it to a new file as
>     gem_multi_bsd_sync_loop. This test exercises semaphores.
>   - gem_dummy_reloc_loop, subtest mixed: Almost the same as the above, but
>     the sync is done _inside_ the loop and hence this exercises gpu/cpu
>     sync. We need both tests adjusted, for for this we need a new
>     multi-bsd test.

Agree with your concerns. I will try to add the
gem_multi_bsd_sync_loop/dummy_reloc_loop test case so that it can test
the sync with multi-BSD.

BTW: How about if I directly add multiple fds in gem_ring_sync_loop test
case and then test the sync among the different rings?  In such case the
user-application doesn't need to know the existence of multi-BSD rings.

> 
> - New testcase to fully test main execbuffer flags. This is simply
>   something that's we don't yet have. The next guy to touch execbuf code
>   needs to add it, and it looks like that's you ;-) I've done a JIRA task
>   for the resource streamer work, but I think the resource streamer wont
>   be merged anytime soon. So I'll reassign to you. Jira task is VIZ-3129.

For the new testcase of execbuffer flag:  Do you have any idea about
which kind of exec flag needs to be checked? Do you have any idea about
the expected failure/successful behavour for the flags?
   For example: I915_EXEC_PINNED : If one object is not pinned and
submitted, what behavour is expected? Fail or wrong?

Thanks.
    Yakui
> 
> Thanks, Daniel
> 
> > 
> > 
> > Zhao Yakui (5):
> >   drm/i915: Split the BDW device definition to prepare for dual BSD
> >     rings on BDW GT3
> >   drm/i915: Initialize the second BSD ring on BDW GT3 machine
> >   drm/i915: Handle the irq interrupt for the second BSD ring
> >   drm/i915: Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to
> >     remove the switch check warning
> >   drm/i915: Use the coarse mechanism based on drm fd to dispatch the BSD command
> >     on BDW GT3
> > 
> >  drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
> >  drivers/gpu/drm/i915/i915_drv.c            |   24 ++++++++-
> >  drivers/gpu/drm/i915/i915_drv.h            |    5 ++
> >  drivers/gpu/drm/i915/i915_gem.c            |    9 +++-
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
> >  drivers/gpu/drm/i915/i915_gpu_error.c      |    1 +
> >  drivers/gpu/drm/i915/i915_irq.c            |    5 +-
> >  drivers/gpu/drm/i915/i915_reg.h            |    1 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.c    |   57 ++++++++++++++++++++++
> >  drivers/gpu/drm/i915/intel_ringbuffer.h    |    6 ++-
> >  include/drm/i915_pciids.h                  |   10 ++--
> >  11 files changed, 197 insertions(+), 8 deletions(-)
> > 
> > -- 
> > 1.7.10.1
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-10  2:24     ` Zhao Yakui
@ 2014-04-10  6:48       ` Daniel Vetter
  2014-04-10  8:04         ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-10  6:48 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Thu, Apr 10, 2014 at 10:24:53AM +0800, Zhao Yakui wrote:
> On Wed, 2014-04-09 at 08:34 -0600, Daniel Vetter wrote:
> > On Wed, Apr 09, 2014 at 09:59:56AM +0800, Zhao Yakui wrote:
> > > The BDW GT3 has two independent BSD rings, which can be used to process the
> > > video commands. To be simpler, it is transparent to user-space driver/middleware.
> > > Instead the kernel driver will decide which ring is to dispatch the BSD video
> > > command.
> > > 
> > > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > > based on the drm fd. In such case the different BSD ring is used for video playing
> > > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > > the object synchronization between the BSD rings.
> > > 
> > > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > 
> > This looks way too complicated. First things first please get rid of the
> > atomic_t usage. If you don't have _massive_ comments explaining the memory
> > barriers you're most likely using linux kernel atomic_t wrong. They are
> > fully unordered.
> 
> Thanks for the review.
> 
> For the atomic_t usage:  I will remove it in next version as the counter
> is already protected by the lock.  
> 
> > 
> > With that out of the way this still looks a bit complicated really. Can't
> > we just use a very simple static rule in gen8_dispatch_bsd_ring which
> > hashed the pointer address of the file_priv? Just to get things going,
> > once we have a clear need we can try to make things more intelligent. But
> > in case of doubt I really prefer if we start with the dumbest possible
> > approach first and add complexity instead of starting with something
> > really complex and simplifying it.
> 
> Do you mean that file_priv is hashed and then is mapped to BSD 0 or 1
> ring?  

Yeah, that's the idea. Get in the basic support first, make it fancy like
you describe below second. This has a few upsides:
- We can concentrate on validating basic support in the first round
  instead of potentially fighting a bug in the load balancer.
- Discussions and performance testing for the load balancer won't hold up
  the entire feature.
- Like I've said this might not be required. Before we add more complexity
  than just hashing the file_priv I want to see some benchmarks of
  expected workloads that show that the load balancing is indeed a good
  idea - for the case of a transcode server I guess we should have
  sufficient in-flight operations that it won't really matter. Or at least
  I hope so.

So maybe split this patch up into the first step with the basic file_priv
hashing mapping and the 2nd patch to add the improved algo?

Cheers, Daniel

> The GT3 machine has two independent BSD rings. It will be better that
> the kernel driver can balance the video workload between the two rings. 
> When using the hashed file_priv to select BSD ring, the video balance
> depends on the design of hash design. Under some scenarios, it will be
> possible that one ring is very busy while another ring is very idle. And
> then performance of video playing back/encoding will be affected.
> At the same time the hash mechanism is only used to select the
> corresponding BSD ring when one drm_fd is opened. And it doesn't
> consider the video workload balance after finishing some workloads.
> 
> The following is the basic idea in my patch.(A counter variable is added
> for ring. The bigger the counter, the higher the workload).
>    a. When one new fd needs to dispatch the BSD video command, it will
> select the ring with the lowest workload(lowest counter). And then
> counter in this ring will be added.
>    b. when the drm fd is closed(the workload is finished), the counter
> of the ring used by file_priv will be decreased. 
>    c. When the drm fd already selects one BSD ring in previously
> submitted command, it will check whether it is using the ring with the
> lowest workload(lowest counter). If not, it can be switched. The purpose
> is to assure that the workload is still balanced between the two BSD
> rings. For example: User wants to play back four video clips. BSD 0 ring
> is selected to play back the two long clips. BSD 1 ring is selected to
> play back the two short clips. After it finishes the playing back of two
> short clips, the BSD 1 ring can be switched to play back the long clip.
> Still balance.
> 
> What do you think?
> 
> > -Daniel
> > 
> > > ---
> > >  drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
> > >  drivers/gpu/drm/i915/i915_drv.h            |    3 ++
> > >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
> > >  drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +
> > >  drivers/gpu/drm/i915/intel_ringbuffer.h    |    2 +
> > >  5 files changed, 93 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > > index 0b38f88..8260463 100644
> > > --- a/drivers/gpu/drm/i915/i915_dma.c
> > > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > > @@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
> > >  	spin_lock_init(&dev_priv->backlight_lock);
> > >  	spin_lock_init(&dev_priv->uncore.lock);
> > >  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> > > +	spin_lock_init(&dev_priv->bsd_lock);
> > >  	mutex_init(&dev_priv->dpio_lock);
> > >  	mutex_init(&dev_priv->modeset_restore_lock);
> > >  
> > > @@ -1928,7 +1929,20 @@ void i915_driver_preclose(struct drm_device * dev, struct drm_file *file_priv)
> > >  void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
> > >  {
> > >  	struct drm_i915_file_private *file_priv = file->driver_priv;
> > > +	struct intel_ring_buffer *bsd_ring;
> > > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > >  
> > > +	if (file_priv && file_priv->bsd_ring) {
> > > +		int cmd_counter;
> > > +		bsd_ring = file_priv->bsd_ring;
> > > +		file_priv->bsd_ring = NULL;
> > > +		spin_lock(&dev_priv->bsd_lock);
> > > +		cmd_counter = atomic_sub_return(1, &bsd_ring->bsd_cmd_counter);
> > > +		if (cmd_counter < 0) {
> > > +			atomic_set(&bsd_ring->bsd_cmd_counter, 0);
> > > +		}
> > > +		spin_unlock(&dev_priv->bsd_lock);
> > > +	}
> > >  	kfree(file_priv);
> > >  }
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index d77f4e0..128639c 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -1457,6 +1457,8 @@ struct drm_i915_private {
> > >  	struct i915_dri1_state dri1;
> > >  	/* Old ums support infrastructure, same warning applies. */
> > >  	struct i915_ums_state ums;
> > > +	/* the lock for dispatch video commands on two BSD rings */
> > > +	spinlock_t bsd_lock;
> > >  };
> > >  
> > >  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> > > @@ -1664,6 +1666,7 @@ struct drm_i915_file_private {
> > >  
> > >  	struct i915_hw_context *private_default_ctx;
> > >  	atomic_t rps_wait_boost;
> > > +	struct  intel_ring_buffer *bsd_ring;
> > >  };
> > >  
> > >  /*
> > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > index 3491402..75d8cc0 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > @@ -999,6 +999,70 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
> > >  	return 0;
> > >  }
> > >  
> > > +/**
> > > + * Find one BSD ring to dispatch the corresponding BSD command.
> > > + * The Ring ID is returned.
> > > + */
> > > +static int gen8_dispatch_bsd_ring(struct drm_device *dev,
> > > +				  struct drm_file *file)
> > > +{
> > > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > > +	struct drm_i915_file_private *file_priv = file->driver_priv;
> > > +	struct intel_ring_buffer *temp_ring, *bsd_ring;
> > > +	int bsd_counter, temp_counter;
> > > +
> > > +	if (file_priv->bsd_ring) {
> > > +		/* Check whether the load balance is required.*/
> > > +		spin_lock(&dev_priv->bsd_lock);
> > > +		bsd_counter = atomic_read(&(file_priv->bsd_ring->bsd_cmd_counter));
> > > +		temp_ring = &dev_priv->ring[VCS];
> > > +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > > +		bsd_ring = &dev_priv->ring[VCS];
> > > +
> > > +		temp_ring = &dev_priv->ring[VCS2];
> > > +		if (atomic_read(&temp_ring->bsd_cmd_counter) < temp_counter) {
> > > +			temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > > +			bsd_ring = temp_ring;
> > > +		}
> > > +		/*
> > > +		 * If it is already the ring with the minimum load, it is
> > > +		 * unnecessary to switch it.
> > > +		 */
> > > +		if (bsd_ring == file_priv->bsd_ring) {
> > > +			spin_unlock(&dev_priv->bsd_lock);
> > > +			return bsd_ring->id;
> > > +		}
> > > +		/*
> > > +		 * If the load delta between current ring and target ring is
> > > +		 * small, it is unnecessary to switch it.
> > > +		 */
> > > +		if ((bsd_counter - temp_counter) <= 1) {
> > > +			spin_unlock(&dev_priv->bsd_lock);
> > > +			return file_priv->bsd_ring->id;
> > > +		}
> > > +		/* balance the load between current ring and target ring */
> > > +		atomic_dec(&file_priv->bsd_ring->bsd_cmd_counter);
> > > +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> > > +		spin_unlock(&dev_priv->bsd_lock);
> > > +		file_priv->bsd_ring = bsd_ring;
> > > +		return bsd_ring->id;
> > > +	} else {
> > > +		spin_lock(&dev_priv->bsd_lock);
> > > +		bsd_ring = &dev_priv->ring[VCS];
> > > +		bsd_counter = atomic_read(&bsd_ring->bsd_cmd_counter);
> > > +		temp_ring = &dev_priv->ring[VCS2];
> > > +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > > +		if (temp_counter < bsd_counter) {
> > > +			bsd_ring = temp_ring;
> > > +			bsd_counter = temp_counter;
> > > +		}
> > > +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> > > +		file_priv->bsd_ring = bsd_ring;
> > > +		spin_unlock(&dev_priv->bsd_lock);
> > > +		return bsd_ring->id;
> > > +	}
> > > +}
> > > +
> > >  static int
> > >  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> > >  		       struct drm_file *file,
> > > @@ -1043,7 +1107,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> > >  
> > >  	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
> > >  		ring = &dev_priv->ring[RCS];
> > > -	else
> > > +	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
> > > +		if (HAS_BSD2(dev)) {
> > > +			int ring_id;
> > > +			ring_id = gen8_dispatch_bsd_ring(dev, file);
> > > +			ring = &dev_priv->ring[ring_id];
> > > +		} else
> > > +			ring = &dev_priv->ring[VCS];
> > > +	} else
> > >  		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
> > >  
> > >  	if (!intel_ring_initialized(ring)) {
> > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > index 43e0227..852fc29 100644
> > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > @@ -2123,6 +2123,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
> > >  		ring->dispatch_execbuffer = i965_dispatch_execbuffer;
> > >  	}
> > >  	ring->init = init_ring_common;
> > > +	atomic_set(&ring->bsd_cmd_counter, 0);
> > >  
> > >  	return intel_init_ring_buffer(dev, ring);
> > >  }
> > > @@ -2170,6 +2171,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
> > >  
> > >  	ring->init = init_ring_common;
> > >  
> > > +	atomic_set(&ring->bsd_cmd_counter, 0);
> > >  	return intel_init_ring_buffer(dev, ring);
> > >  }
> > >  
> > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > index 8ca4285..4f87b08 100644
> > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > @@ -196,6 +196,8 @@ struct  intel_ring_buffer {
> > >  	 * to encode the command length in the header).
> > >  	 */
> > >  	u32 (*get_cmd_length_mask)(u32 cmd_header);
> > > +
> > > +	atomic_t bsd_cmd_counter;
> > >  };
> > >  
> > >  static inline bool
> > > -- 
> > > 1.7.10.1
> > > 
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > 
> 
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-10  3:28   ` Zhao Yakui
@ 2014-04-10  6:58     ` Daniel Vetter
  2014-04-10  8:28       ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-10  6:58 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Thu, Apr 10, 2014 at 11:28:46AM +0800, Zhao Yakui wrote:
> On Wed, 2014-04-09 at 08:45 -0600, Daniel Vetter wrote:
> > On Wed, Apr 09, 2014 at 09:59:51AM +0800, Zhao Yakui wrote:
> > > 
> > > This is the patch set that tries to add the support of dual BSD rings on BDW
> > > GT3. Based on hardware spec, the BDW GT3 has two independent BSD rings, which
> > > can be used to process the video commands. To be simpler, it is transparent 
> > > to user-space driver/middleware. In such case the kernel driver will decide
> > > which ring is to dispatch the BSD video command.
> > > 
> > > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > > based on the drm fd. In such case the different BSD ring is used for video playing
> > > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > > the object synchronization between the BSD rings.
> > 
> > Ok, I've quickly read through it all and commented on a few things. Imo
> > the last patch should be massively simplified, at least for the first
> > round. Other things look small.
> > 
> Hi, Daniel
> 
> Thanks for your review.
> 
> > What's still missing are testcases, and I have two things in mind here:
> > - Exercise the 2nd ring dispatch and sync a bit. Since the 2nd bsd ring is
> >   hidden within the kernel I think the right approach would be to open a
> >   few drm fds (10 or so) and then randomly use them with a dummy reloc. We
> >   have two testcases which can be used as blueprints that need
> >   adjustement:
> > 
> >   - gem_ring_sync_loop: Probably easiest to copy it to a new file as
> >     gem_multi_bsd_sync_loop. This test exercises semaphores.
> >   - gem_dummy_reloc_loop, subtest mixed: Almost the same as the above, but
> >     the sync is done _inside_ the loop and hence this exercises gpu/cpu
> >     sync. We need both tests adjusted, for for this we need a new
> >     multi-bsd test.
> 
> Agree with your concerns. I will try to add the
> gem_multi_bsd_sync_loop/dummy_reloc_loop test case so that it can test
> the sync with multi-BSD.
> 
> BTW: How about if I directly add multiple fds in gem_ring_sync_loop test
> case and then test the sync among the different rings?  In such case the
> user-application doesn't need to know the existence of multi-BSD rings.

We don't need it for the other rings, so I think it's better to leave the
existing tests as-is to avoid introducing bugs. Testing testcase is always
fairly hard, since you have to break your kernel to make sure the test
still catches bugs ;-)

Also for testing VCS1 and VCS2 we need to have multiple fd using the
_same_ logical ring exposed to userspace, so the test logic will look a
bit different anyway.

> > - New testcase to fully test main execbuffer flags. This is simply
> >   something that's we don't yet have. The next guy to touch execbuf code
> >   needs to add it, and it looks like that's you ;-) I've done a JIRA task
> >   for the resource streamer work, but I think the resource streamer wont
> >   be merged anytime soon. So I'll reassign to you. Jira task is VIZ-3129.
> 
> For the new testcase of execbuffer flag:  Do you have any idea about
> which kind of exec flag needs to be checked? Do you have any idea about
> the expected failure/successful behavour for the flags?
>    For example: I915_EXEC_PINNED : If one object is not pinned and
> submitted, what behavour is expected? Fail or wrong?

I've clarified the JIRA, the test is just for the flags/values in the main
execbuf structure. And the idea is to do the basic api sanity checking as
outlined in my blog post

http://blog.ffwll.ch/2013/11/botching-up-ioctls.html

i.e. go through all fields in struct drm_i915_gem_execbuffer2 and write a
test which checks that the kernel correctly rejects invalid input data. So
e.g. for pointer you can supply NULL or a pointer to invalid memory,
buffer count is already checked with the overflow tests, but also invalid
flags and also making sure that if reserved fields aren't 0 the kernel
rejects the batch.

Of course to be able to check this you first need to construct a valid
no-op batch (e.g. copy from gem_exec_nop.c) and submit it (to make sure no
one breaks the test later on). Then each subtest only changes the relevant
field to make sure the kernel really did check the field (and not just
returned -EINVAL due to something else).

Some execbuf fields are special and e.g. contexts are not valid when
there's no hw context support.

If you want to look for examples check out the basic api tests for
recently added ioctls like in gem_reset_stats.c. Execbuffer ioctl is
simply a bit more complex.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-10  6:48       ` Daniel Vetter
@ 2014-04-10  8:04         ` Zhao Yakui
  2014-04-10  9:03           ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-10  8:04 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, 2014-04-10 at 00:48 -0600, Daniel Vetter wrote:
> On Thu, Apr 10, 2014 at 10:24:53AM +0800, Zhao Yakui wrote:
> > On Wed, 2014-04-09 at 08:34 -0600, Daniel Vetter wrote:
> > > On Wed, Apr 09, 2014 at 09:59:56AM +0800, Zhao Yakui wrote:
> > > > The BDW GT3 has two independent BSD rings, which can be used to process the
> > > > video commands. To be simpler, it is transparent to user-space driver/middleware.
> > > > Instead the kernel driver will decide which ring is to dispatch the BSD video
> > > > command.
> > > > 
> > > > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > > > based on the drm fd. In such case the different BSD ring is used for video playing
> > > > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > > > the object synchronization between the BSD rings.
> > > > 
> > > > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > > 
> > > This looks way too complicated. First things first please get rid of the
> > > atomic_t usage. If you don't have _massive_ comments explaining the memory
> > > barriers you're most likely using linux kernel atomic_t wrong. They are
> > > fully unordered.
> > 
> > Thanks for the review.
> > 
> > For the atomic_t usage:  I will remove it in next version as the counter
> > is already protected by the lock.  
> > 
> > > 
> > > With that out of the way this still looks a bit complicated really. Can't
> > > we just use a very simple static rule in gen8_dispatch_bsd_ring which
> > > hashed the pointer address of the file_priv? Just to get things going,
> > > once we have a clear need we can try to make things more intelligent. But
> > > in case of doubt I really prefer if we start with the dumbest possible
> > > approach first and add complexity instead of starting with something
> > > really complex and simplifying it.
> > 
> > Do you mean that file_priv is hashed and then is mapped to BSD 0 or 1
> > ring?  
> 
> Yeah, that's the idea. Get in the basic support first, make it fancy like
> you describe below second. This has a few upsides:
> - We can concentrate on validating basic support in the first round
>   instead of potentially fighting a bug in the load balancer.
> - Discussions and performance testing for the load balancer won't hold up
>   the entire feature.
> - Like I've said this might not be required. Before we add more complexity
>   than just hashing the file_priv I want to see some benchmarks of
>   expected workloads that show that the load balancing is indeed a good
>   idea - for the case of a transcode server I guess we should have
>   sufficient in-flight operations that it won't really matter. Or at least
>   I hope so.
> 

OK. Understand your concerns. I can split it two steps. One is to add
the basic support. The second step is for the optimization.

But I don't think that the hash of file_priv is a good idea. As it only
has two rings, it is possible that the hash value is always mapped to
BSD ring 0.  In such case when multiples video clips are played back,
the performance can't meet with the requirement.(For example: User can
play back 4 1080p video clips concurrently when only one BSD ring is
used. On the BDW GT3, they hope to play back 8 1080p video clips
concurrently. The poor hash design will cause that all the workload are
mapped to one BSD ring and then it can't meet with the requirement).

How about using the ping-pong mechanism for the file_priv? For one new
fd, it will use BSD ring 0 and then next file_priv will use BSD ring 1.
Then BSD ring 0....BSD ring 1. 

Does this make sense to you?


> So maybe split this patch up into the first step with the basic file_priv
> hashing mapping and the 2nd patch to add the improved algo?
> 
> Cheers, Daniel
> 
> > The GT3 machine has two independent BSD rings. It will be better that
> > the kernel driver can balance the video workload between the two rings. 
> > When using the hashed file_priv to select BSD ring, the video balance
> > depends on the design of hash design. Under some scenarios, it will be
> > possible that one ring is very busy while another ring is very idle. And
> > then performance of video playing back/encoding will be affected.
> > At the same time the hash mechanism is only used to select the
> > corresponding BSD ring when one drm_fd is opened. And it doesn't
> > consider the video workload balance after finishing some workloads.
> > 
> > The following is the basic idea in my patch.(A counter variable is added
> > for ring. The bigger the counter, the higher the workload).
> >    a. When one new fd needs to dispatch the BSD video command, it will
> > select the ring with the lowest workload(lowest counter). And then
> > counter in this ring will be added.
> >    b. when the drm fd is closed(the workload is finished), the counter
> > of the ring used by file_priv will be decreased. 
> >    c. When the drm fd already selects one BSD ring in previously
> > submitted command, it will check whether it is using the ring with the
> > lowest workload(lowest counter). If not, it can be switched. The purpose
> > is to assure that the workload is still balanced between the two BSD
> > rings. For example: User wants to play back four video clips. BSD 0 ring
> > is selected to play back the two long clips. BSD 1 ring is selected to
> > play back the two short clips. After it finishes the playing back of two
> > short clips, the BSD 1 ring can be switched to play back the long clip.
> > Still balance.
> > 
> > What do you think?
> > 
> > > -Daniel
> > > 
> > > > ---
> > > >  drivers/gpu/drm/i915/i915_dma.c            |   14 ++++++
> > > >  drivers/gpu/drm/i915/i915_drv.h            |    3 ++
> > > >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   73 +++++++++++++++++++++++++++-
> > > >  drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +
> > > >  drivers/gpu/drm/i915/intel_ringbuffer.h    |    2 +
> > > >  5 files changed, 93 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > > > index 0b38f88..8260463 100644
> > > > --- a/drivers/gpu/drm/i915/i915_dma.c
> > > > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > > > @@ -1572,6 +1572,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
> > > >  	spin_lock_init(&dev_priv->backlight_lock);
> > > >  	spin_lock_init(&dev_priv->uncore.lock);
> > > >  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> > > > +	spin_lock_init(&dev_priv->bsd_lock);
> > > >  	mutex_init(&dev_priv->dpio_lock);
> > > >  	mutex_init(&dev_priv->modeset_restore_lock);
> > > >  
> > > > @@ -1928,7 +1929,20 @@ void i915_driver_preclose(struct drm_device * dev, struct drm_file *file_priv)
> > > >  void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
> > > >  {
> > > >  	struct drm_i915_file_private *file_priv = file->driver_priv;
> > > > +	struct intel_ring_buffer *bsd_ring;
> > > > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > > >  
> > > > +	if (file_priv && file_priv->bsd_ring) {
> > > > +		int cmd_counter;
> > > > +		bsd_ring = file_priv->bsd_ring;
> > > > +		file_priv->bsd_ring = NULL;
> > > > +		spin_lock(&dev_priv->bsd_lock);
> > > > +		cmd_counter = atomic_sub_return(1, &bsd_ring->bsd_cmd_counter);
> > > > +		if (cmd_counter < 0) {
> > > > +			atomic_set(&bsd_ring->bsd_cmd_counter, 0);
> > > > +		}
> > > > +		spin_unlock(&dev_priv->bsd_lock);
> > > > +	}
> > > >  	kfree(file_priv);
> > > >  }
> > > >  
> > > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > > index d77f4e0..128639c 100644
> > > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > > @@ -1457,6 +1457,8 @@ struct drm_i915_private {
> > > >  	struct i915_dri1_state dri1;
> > > >  	/* Old ums support infrastructure, same warning applies. */
> > > >  	struct i915_ums_state ums;
> > > > +	/* the lock for dispatch video commands on two BSD rings */
> > > > +	spinlock_t bsd_lock;
> > > >  };
> > > >  
> > > >  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> > > > @@ -1664,6 +1666,7 @@ struct drm_i915_file_private {
> > > >  
> > > >  	struct i915_hw_context *private_default_ctx;
> > > >  	atomic_t rps_wait_boost;
> > > > +	struct  intel_ring_buffer *bsd_ring;
> > > >  };
> > > >  
> > > >  /*
> > > > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > index 3491402..75d8cc0 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > > > @@ -999,6 +999,70 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +/**
> > > > + * Find one BSD ring to dispatch the corresponding BSD command.
> > > > + * The Ring ID is returned.
> > > > + */
> > > > +static int gen8_dispatch_bsd_ring(struct drm_device *dev,
> > > > +				  struct drm_file *file)
> > > > +{
> > > > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > > > +	struct drm_i915_file_private *file_priv = file->driver_priv;
> > > > +	struct intel_ring_buffer *temp_ring, *bsd_ring;
> > > > +	int bsd_counter, temp_counter;
> > > > +
> > > > +	if (file_priv->bsd_ring) {
> > > > +		/* Check whether the load balance is required.*/
> > > > +		spin_lock(&dev_priv->bsd_lock);
> > > > +		bsd_counter = atomic_read(&(file_priv->bsd_ring->bsd_cmd_counter));
> > > > +		temp_ring = &dev_priv->ring[VCS];
> > > > +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > > > +		bsd_ring = &dev_priv->ring[VCS];
> > > > +
> > > > +		temp_ring = &dev_priv->ring[VCS2];
> > > > +		if (atomic_read(&temp_ring->bsd_cmd_counter) < temp_counter) {
> > > > +			temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > > > +			bsd_ring = temp_ring;
> > > > +		}
> > > > +		/*
> > > > +		 * If it is already the ring with the minimum load, it is
> > > > +		 * unnecessary to switch it.
> > > > +		 */
> > > > +		if (bsd_ring == file_priv->bsd_ring) {
> > > > +			spin_unlock(&dev_priv->bsd_lock);
> > > > +			return bsd_ring->id;
> > > > +		}
> > > > +		/*
> > > > +		 * If the load delta between current ring and target ring is
> > > > +		 * small, it is unnecessary to switch it.
> > > > +		 */
> > > > +		if ((bsd_counter - temp_counter) <= 1) {
> > > > +			spin_unlock(&dev_priv->bsd_lock);
> > > > +			return file_priv->bsd_ring->id;
> > > > +		}
> > > > +		/* balance the load between current ring and target ring */
> > > > +		atomic_dec(&file_priv->bsd_ring->bsd_cmd_counter);
> > > > +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> > > > +		spin_unlock(&dev_priv->bsd_lock);
> > > > +		file_priv->bsd_ring = bsd_ring;
> > > > +		return bsd_ring->id;
> > > > +	} else {
> > > > +		spin_lock(&dev_priv->bsd_lock);
> > > > +		bsd_ring = &dev_priv->ring[VCS];
> > > > +		bsd_counter = atomic_read(&bsd_ring->bsd_cmd_counter);
> > > > +		temp_ring = &dev_priv->ring[VCS2];
> > > > +		temp_counter = atomic_read(&temp_ring->bsd_cmd_counter);
> > > > +		if (temp_counter < bsd_counter) {
> > > > +			bsd_ring = temp_ring;
> > > > +			bsd_counter = temp_counter;
> > > > +		}
> > > > +		atomic_inc(&bsd_ring->bsd_cmd_counter);
> > > > +		file_priv->bsd_ring = bsd_ring;
> > > > +		spin_unlock(&dev_priv->bsd_lock);
> > > > +		return bsd_ring->id;
> > > > +	}
> > > > +}
> > > > +
> > > >  static int
> > > >  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> > > >  		       struct drm_file *file,
> > > > @@ -1043,7 +1107,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> > > >  
> > > >  	if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_DEFAULT)
> > > >  		ring = &dev_priv->ring[RCS];
> > > > -	else
> > > > +	else if ((args->flags & I915_EXEC_RING_MASK) == I915_EXEC_BSD) {
> > > > +		if (HAS_BSD2(dev)) {
> > > > +			int ring_id;
> > > > +			ring_id = gen8_dispatch_bsd_ring(dev, file);
> > > > +			ring = &dev_priv->ring[ring_id];
> > > > +		} else
> > > > +			ring = &dev_priv->ring[VCS];
> > > > +	} else
> > > >  		ring = &dev_priv->ring[(args->flags & I915_EXEC_RING_MASK) - 1];
> > > >  
> > > >  	if (!intel_ring_initialized(ring)) {
> > > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > index 43e0227..852fc29 100644
> > > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > > > @@ -2123,6 +2123,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
> > > >  		ring->dispatch_execbuffer = i965_dispatch_execbuffer;
> > > >  	}
> > > >  	ring->init = init_ring_common;
> > > > +	atomic_set(&ring->bsd_cmd_counter, 0);
> > > >  
> > > >  	return intel_init_ring_buffer(dev, ring);
> > > >  }
> > > > @@ -2170,6 +2171,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
> > > >  
> > > >  	ring->init = init_ring_common;
> > > >  
> > > > +	atomic_set(&ring->bsd_cmd_counter, 0);
> > > >  	return intel_init_ring_buffer(dev, ring);
> > > >  }
> > > >  
> > > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > > index 8ca4285..4f87b08 100644
> > > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > > > @@ -196,6 +196,8 @@ struct  intel_ring_buffer {
> > > >  	 * to encode the command length in the header).
> > > >  	 */
> > > >  	u32 (*get_cmd_length_mask)(u32 cmd_header);
> > > > +
> > > > +	atomic_t bsd_cmd_counter;
> > > >  };
> > > >  
> > > >  static inline bool
> > > > -- 
> > > > 1.7.10.1
> > > > 
> > > > _______________________________________________
> > > > Intel-gfx mailing list
> > > > Intel-gfx@lists.freedesktop.org
> > > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > > 
> > 
> > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-10  6:58     ` Daniel Vetter
@ 2014-04-10  8:28       ` Zhao Yakui
  2014-04-10  9:04         ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-10  8:28 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, 2014-04-10 at 00:58 -0600, Daniel Vetter wrote:
> On Thu, Apr 10, 2014 at 11:28:46AM +0800, Zhao Yakui wrote:
> > On Wed, 2014-04-09 at 08:45 -0600, Daniel Vetter wrote:
> > > On Wed, Apr 09, 2014 at 09:59:51AM +0800, Zhao Yakui wrote:
> > > > 
> > > > This is the patch set that tries to add the support of dual BSD rings on BDW
> > > > GT3. Based on hardware spec, the BDW GT3 has two independent BSD rings, which
> > > > can be used to process the video commands. To be simpler, it is transparent 
> > > > to user-space driver/middleware. In such case the kernel driver will decide
> > > > which ring is to dispatch the BSD video command.
> > > > 
> > > > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > > > based on the drm fd. In such case the different BSD ring is used for video playing
> > > > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > > > the object synchronization between the BSD rings.
> > > 
> > > Ok, I've quickly read through it all and commented on a few things. Imo
> > > the last patch should be massively simplified, at least for the first
> > > round. Other things look small.
> > > 
> > Hi, Daniel
> > 
> > Thanks for your review.
> > 
> > > What's still missing are testcases, and I have two things in mind here:
> > > - Exercise the 2nd ring dispatch and sync a bit. Since the 2nd bsd ring is
> > >   hidden within the kernel I think the right approach would be to open a
> > >   few drm fds (10 or so) and then randomly use them with a dummy reloc. We
> > >   have two testcases which can be used as blueprints that need
> > >   adjustement:
> > > 
> > >   - gem_ring_sync_loop: Probably easiest to copy it to a new file as
> > >     gem_multi_bsd_sync_loop. This test exercises semaphores.
> > >   - gem_dummy_reloc_loop, subtest mixed: Almost the same as the above, but
> > >     the sync is done _inside_ the loop and hence this exercises gpu/cpu
> > >     sync. We need both tests adjusted, for for this we need a new
> > >     multi-bsd test.
> > 
> > Agree with your concerns. I will try to add the
> > gem_multi_bsd_sync_loop/dummy_reloc_loop test case so that it can test
> > the sync with multi-BSD.
> > 
> > BTW: How about if I directly add multiple fds in gem_ring_sync_loop test
> > case and then test the sync among the different rings?  In such case the
> > user-application doesn't need to know the existence of multi-BSD rings.
> 
> We don't need it for the other rings, so I think it's better to leave the
> existing tests as-is to avoid introducing bugs. Testing testcase is always
> fairly hard, since you have to break your kernel to make sure the test
> still catches bugs ;-)
> 
> Also for testing VCS1 and VCS2 we need to have multiple fd using the
> _same_ logical ring exposed to userspace, so the test logic will look a
> bit different anyway.

OK. I will add the separated two test cases for it.

> 
> > > - New testcase to fully test main execbuffer flags. This is simply
> > >   something that's we don't yet have. The next guy to touch execbuf code
> > >   needs to add it, and it looks like that's you ;-) I've done a JIRA task
> > >   for the resource streamer work, but I think the resource streamer wont
> > >   be merged anytime soon. So I'll reassign to you. Jira task is VIZ-3129.
> > 
> > For the new testcase of execbuffer flag:  Do you have any idea about
> > which kind of exec flag needs to be checked? Do you have any idea about
> > the expected failure/successful behavour for the flags?
> >    For example: I915_EXEC_PINNED : If one object is not pinned and
> > submitted, what behavour is expected? Fail or wrong?
> 
> I've clarified the JIRA, the test is just for the flags/values in the main
> execbuf structure. And the idea is to do the basic api sanity checking as
> outlined in my blog post
> 
> http://blog.ffwll.ch/2013/11/botching-up-ioctls.html
> 
> i.e. go through all fields in struct drm_i915_gem_execbuffer2 and write a
> test which checks that the kernel correctly rejects invalid input data. So
> e.g. for pointer you can supply NULL or a pointer to invalid memory,
> buffer count is already checked with the overflow tests, but also invalid
> flags and also making sure that if reserved fields aren't 0 the kernel
> rejects the batch.
> 
> Of course to be able to check this you first need to construct a valid
> no-op batch (e.g. copy from gem_exec_nop.c) and submit it (to make sure no
> one breaks the test later on). Then each subtest only changes the relevant
> field to make sure the kernel really did check the field (and not just
> returned -EINVAL due to something else).
> 
> Some execbuf fields are special and e.g. contexts are not valid when
> there's no hw context support.
> 
> If you want to look for examples check out the basic api tests for
> recently added ioctls like in gem_reset_stats.c. Execbuffer ioctl is
> simply a bit more complex.

OK. I will take a look at your blog and understand what you mentioned.

BTW: Does it need to check all the flags defined in i915_drm.h or the
exported flag returned by i915_get_parameter?

Thanks.
    Yakui

> 
> Cheers, Daniel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-10  8:04         ` Zhao Yakui
@ 2014-04-10  9:03           ` Daniel Vetter
  2014-04-11  0:53             ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-10  9:03 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Thu, Apr 10, 2014 at 04:04:22PM +0800, Zhao Yakui wrote:
> On Thu, 2014-04-10 at 00:48 -0600, Daniel Vetter wrote:
> > On Thu, Apr 10, 2014 at 10:24:53AM +0800, Zhao Yakui wrote:
> > > On Wed, 2014-04-09 at 08:34 -0600, Daniel Vetter wrote:
> > > > On Wed, Apr 09, 2014 at 09:59:56AM +0800, Zhao Yakui wrote:
> > > > > The BDW GT3 has two independent BSD rings, which can be used to process the
> > > > > video commands. To be simpler, it is transparent to user-space driver/middleware.
> > > > > Instead the kernel driver will decide which ring is to dispatch the BSD video
> > > > > command.
> > > > > 
> > > > > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > > > > based on the drm fd. In such case the different BSD ring is used for video playing
> > > > > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > > > > the object synchronization between the BSD rings.
> > > > > 
> > > > > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > > > 
> > > > This looks way too complicated. First things first please get rid of the
> > > > atomic_t usage. If you don't have _massive_ comments explaining the memory
> > > > barriers you're most likely using linux kernel atomic_t wrong. They are
> > > > fully unordered.
> > > 
> > > Thanks for the review.
> > > 
> > > For the atomic_t usage:  I will remove it in next version as the counter
> > > is already protected by the lock.  
> > > 
> > > > 
> > > > With that out of the way this still looks a bit complicated really. Can't
> > > > we just use a very simple static rule in gen8_dispatch_bsd_ring which
> > > > hashed the pointer address of the file_priv? Just to get things going,
> > > > once we have a clear need we can try to make things more intelligent. But
> > > > in case of doubt I really prefer if we start with the dumbest possible
> > > > approach first and add complexity instead of starting with something
> > > > really complex and simplifying it.
> > > 
> > > Do you mean that file_priv is hashed and then is mapped to BSD 0 or 1
> > > ring?  
> > 
> > Yeah, that's the idea. Get in the basic support first, make it fancy like
> > you describe below second. This has a few upsides:
> > - We can concentrate on validating basic support in the first round
> >   instead of potentially fighting a bug in the load balancer.
> > - Discussions and performance testing for the load balancer won't hold up
> >   the entire feature.
> > - Like I've said this might not be required. Before we add more complexity
> >   than just hashing the file_priv I want to see some benchmarks of
> >   expected workloads that show that the load balancing is indeed a good
> >   idea - for the case of a transcode server I guess we should have
> >   sufficient in-flight operations that it won't really matter. Or at least
> >   I hope so.
> > 
> 
> OK. Understand your concerns. I can split it two steps. One is to add
> the basic support. The second step is for the optimization.
> 
> But I don't think that the hash of file_priv is a good idea. As it only
> has two rings, it is possible that the hash value is always mapped to
> BSD ring 0.  In such case when multiples video clips are played back,
> the performance can't meet with the requirement.(For example: User can
> play back 4 1080p video clips concurrently when only one BSD ring is
> used. On the BDW GT3, they hope to play back 8 1080p video clips
> concurrently. The poor hash design will cause that all the workload are
> mapped to one BSD ring and then it can't meet with the requirement).
> 
> How about using the ping-pong mechanism for the file_priv? For one new
> fd, it will use BSD ring 0 and then next file_priv will use BSD ring 1.
> Then BSD ring 0....BSD ring 1. 
> 
> Does this make sense to you?

Well the point of the hash is that it's dumb and simple, but maybe too
dumb. If we wend up with 3 streams on one vcs and 1 on the other, then we
have a good reason to merge the 2nd patch ;-)

Really, the point of the first patch is just so that we have /something/
which uses both rings with a reasonable chance, so that we can get testing
and validation off the ground. E.g. in the test I'd use 10 or so drm fds to
make sure that at least one of them uses the other ring, in case the hash
function isn't great.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-10  8:28       ` Zhao Yakui
@ 2014-04-10  9:04         ` Daniel Vetter
  2014-04-11  0:56           ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-10  9:04 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Thu, Apr 10, 2014 at 04:28:34PM +0800, Zhao Yakui wrote:
> BTW: Does it need to check all the flags defined in i915_drm.h or the
> exported flag returned by i915_get_parameter?

I don't have i915_get_parameter anywhere in my sources, so no idea what
you mean ...
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3
  2014-04-10  9:03           ` Daniel Vetter
@ 2014-04-11  0:53             ` Zhao Yakui
  0 siblings, 0 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-11  0:53 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, 2014-04-10 at 03:03 -0600, Daniel Vetter wrote:
> On Thu, Apr 10, 2014 at 04:04:22PM +0800, Zhao Yakui wrote:
> > On Thu, 2014-04-10 at 00:48 -0600, Daniel Vetter wrote:
> > > On Thu, Apr 10, 2014 at 10:24:53AM +0800, Zhao Yakui wrote:
> > > > On Wed, 2014-04-09 at 08:34 -0600, Daniel Vetter wrote:
> > > > > On Wed, Apr 09, 2014 at 09:59:56AM +0800, Zhao Yakui wrote:
> > > > > > The BDW GT3 has two independent BSD rings, which can be used to process the
> > > > > > video commands. To be simpler, it is transparent to user-space driver/middleware.
> > > > > > Instead the kernel driver will decide which ring is to dispatch the BSD video
> > > > > > command.
> > > > > > 
> > > > > > As every BSD ring is powerful, it is enough to dispatch the BSD video command
> > > > > > based on the drm fd. In such case the different BSD ring is used for video playing
> > > > > > back and encoding. At the same time the coarse dispatch mechanism can help to avoid
> > > > > > the object synchronization between the BSD rings.
> > > > > > 
> > > > > > Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
> > > > > 
> > > > > This looks way too complicated. First things first please get rid of the
> > > > > atomic_t usage. If you don't have _massive_ comments explaining the memory
> > > > > barriers you're most likely using linux kernel atomic_t wrong. They are
> > > > > fully unordered.
> > > > 
> > > > Thanks for the review.
> > > > 
> > > > For the atomic_t usage:  I will remove it in next version as the counter
> > > > is already protected by the lock.  
> > > > 
> > > > > 
> > > > > With that out of the way this still looks a bit complicated really. Can't
> > > > > we just use a very simple static rule in gen8_dispatch_bsd_ring which
> > > > > hashed the pointer address of the file_priv? Just to get things going,
> > > > > once we have a clear need we can try to make things more intelligent. But
> > > > > in case of doubt I really prefer if we start with the dumbest possible
> > > > > approach first and add complexity instead of starting with something
> > > > > really complex and simplifying it.
> > > > 
> > > > Do you mean that file_priv is hashed and then is mapped to BSD 0 or 1
> > > > ring?  
> > > 
> > > Yeah, that's the idea. Get in the basic support first, make it fancy like
> > > you describe below second. This has a few upsides:
> > > - We can concentrate on validating basic support in the first round
> > >   instead of potentially fighting a bug in the load balancer.
> > > - Discussions and performance testing for the load balancer won't hold up
> > >   the entire feature.
> > > - Like I've said this might not be required. Before we add more complexity
> > >   than just hashing the file_priv I want to see some benchmarks of
> > >   expected workloads that show that the load balancing is indeed a good
> > >   idea - for the case of a transcode server I guess we should have
> > >   sufficient in-flight operations that it won't really matter. Or at least
> > >   I hope so.
> > > 
> > 
> > OK. Understand your concerns. I can split it two steps. One is to add
> > the basic support. The second step is for the optimization.
> > 
> > But I don't think that the hash of file_priv is a good idea. As it only
> > has two rings, it is possible that the hash value is always mapped to
> > BSD ring 0.  In such case when multiples video clips are played back,
> > the performance can't meet with the requirement.(For example: User can
> > play back 4 1080p video clips concurrently when only one BSD ring is
> > used. On the BDW GT3, they hope to play back 8 1080p video clips
> > concurrently. The poor hash design will cause that all the workload are
> > mapped to one BSD ring and then it can't meet with the requirement).
> > 
> > How about using the ping-pong mechanism for the file_priv? For one new
> > fd, it will use BSD ring 0 and then next file_priv will use BSD ring 1.
> > Then BSD ring 0....BSD ring 1. 
> > 
> > Does this make sense to you?
> 
> Well the point of the hash is that it's dumb and simple, but maybe too
> dumb. If we wend up with 3 streams on one vcs and 1 on the other, then we
> have a good reason to merge the 2nd patch ;-)
> 

Hi, Daniel

    Thanks for your comments. Now we get get the same point about the
support of dual BSD rings on BDW GT3 machine.  So this will be divided
into two steps. The first step is to use the simple ping-pong mechanism
to add the basic support. And the second step is for the
optimization(balance video workloads among the two rings).
    From my point the ping-pong mechanism is simpler and easier to
implement. Of course this can also be regarded as the specific hash.

> Really, the point of the first patch is just so that we have /something/
> which uses both rings with a reasonable chance, so that we can get testing
> and validation off the ground. E.g. in the test I'd use 10 or so drm fds to
> make sure that at least one of them uses the other ring, in case the hash
> function isn't great.

   Understand. We need such test case to verify it. This is already in
my plan.

Thanks.
    Yakui





> -Daniel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-10  9:04         ` Daniel Vetter
@ 2014-04-11  0:56           ` Zhao Yakui
  2014-04-11  8:57             ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Zhao Yakui @ 2014-04-11  0:56 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, 2014-04-10 at 03:04 -0600, Daniel Vetter wrote:
> On Thu, Apr 10, 2014 at 04:28:34PM +0800, Zhao Yakui wrote:
> > BTW: Does it need to check all the flags defined in i915_drm.h or the
> > exported flag returned by i915_get_parameter?
> 
> I don't have i915_get_parameter anywhere in my sources, so no idea what
> you mean ...

Sorry that the function should be i915_getparam. It is called by the
I915_GETPARAM ioctl to query the flag supported by the driver.

Thanks.
    Yakui

> -Daniel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-11  0:56           ` Zhao Yakui
@ 2014-04-11  8:57             ` Daniel Vetter
  2014-04-14  1:05               ` Zhao Yakui
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2014-04-11  8:57 UTC (permalink / raw)
  To: Zhao Yakui; +Cc: intel-gfx

On Fri, Apr 11, 2014 at 08:56:28AM +0800, Zhao Yakui wrote:
> On Thu, 2014-04-10 at 03:04 -0600, Daniel Vetter wrote:
> > On Thu, Apr 10, 2014 at 04:28:34PM +0800, Zhao Yakui wrote:
> > > BTW: Does it need to check all the flags defined in i915_drm.h or the
> > > exported flag returned by i915_get_parameter?
> > 
> > I don't have i915_get_parameter anywhere in my sources, so no idea what
> > you mean ...
> 
> Sorry that the function should be i915_getparam. It is called by the
> I915_GETPARAM ioctl to query the flag supported by the driver.

Ah, now I understand. The idea is to test all fields of the structure
exhaustively (so also rsvd to make sure it's 0). Well except for the
buffer count field since we have tests for that already.

For the reasons see my two blog posts on the topic:

http://blog.ffwll.ch/2013/11/testing-requirements-for-drmi915.html

http://blog.ffwll.ch/2013/11/botching-up-ioctls.html

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3
  2014-04-11  8:57             ` Daniel Vetter
@ 2014-04-14  1:05               ` Zhao Yakui
  0 siblings, 0 replies; 24+ messages in thread
From: Zhao Yakui @ 2014-04-14  1:05 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Fri, 2014-04-11 at 02:57 -0600, Daniel Vetter wrote:
> On Fri, Apr 11, 2014 at 08:56:28AM +0800, Zhao Yakui wrote:
> > On Thu, 2014-04-10 at 03:04 -0600, Daniel Vetter wrote:
> > > On Thu, Apr 10, 2014 at 04:28:34PM +0800, Zhao Yakui wrote:
> > > > BTW: Does it need to check all the flags defined in i915_drm.h or the
> > > > exported flag returned by i915_get_parameter?
> > > 
> > > I don't have i915_get_parameter anywhere in my sources, so no idea what
> > > you mean ...
> > 
> > Sorry that the function should be i915_getparam. It is called by the
> > I915_GETPARAM ioctl to query the flag supported by the driver.
> 
> Ah, now I understand. The idea is to test all fields of the structure
> exhaustively (so also rsvd to make sure it's 0). Well except for the
> buffer count field since we have tests for that already.
> 
> For the reasons see my two blog posts on the topic:
> 
> http://blog.ffwll.ch/2013/11/testing-requirements-for-drmi915.html
> 
> http://blog.ffwll.ch/2013/11/botching-up-ioctls.html

OK. It seems that the case needs to check more fields than the exported
flag.
I will take a look at your blog and understand how to write the test
case.

Thanks.
    Yakui

> 
> Cheers, Daniel

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2014-04-14  1:07 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-09  1:59 [PATCH 0/5] drm/i915: Add the support of dual BSD rings on BDW GT3 Zhao Yakui
2014-04-09  1:59 ` [PATCH 1/5] drm/i915: Split the BDW device definition to prepare for " Zhao Yakui
2014-04-09 14:27   ` Daniel Vetter
2014-04-10  0:44     ` Zhao Yakui
2014-04-09  1:59 ` [PATCH 2/5] drm/i915:Initialize the second BSD ring on BDW GT3 machine Zhao Yakui
2014-04-09  1:59 ` [PATCH 3/5] drm/i915:Handle the irq interrupt for the second BSD ring Zhao Yakui
2014-04-09  1:59 ` [PATCH 4/5] drm/i915:Add the VCS2 switch in Intel_ring_setup_status_page for Gen7 to remove the switch check warning Zhao Yakui
2014-04-09 14:29   ` Daniel Vetter
2014-04-10  0:45     ` Zhao Yakui
2014-04-09  1:59 ` [PATCH 5/5] drm/i915:Use the coarse mechanism based on drm fd to dispatch the BSD command on BDW GT3 Zhao Yakui
2014-04-09 14:34   ` Daniel Vetter
2014-04-10  2:24     ` Zhao Yakui
2014-04-10  6:48       ` Daniel Vetter
2014-04-10  8:04         ` Zhao Yakui
2014-04-10  9:03           ` Daniel Vetter
2014-04-11  0:53             ` Zhao Yakui
2014-04-09 14:45 ` [PATCH 0/5] drm/i915: Add the support of dual BSD rings " Daniel Vetter
2014-04-10  3:28   ` Zhao Yakui
2014-04-10  6:58     ` Daniel Vetter
2014-04-10  8:28       ` Zhao Yakui
2014-04-10  9:04         ` Daniel Vetter
2014-04-11  0:56           ` Zhao Yakui
2014-04-11  8:57             ` Daniel Vetter
2014-04-14  1:05               ` Zhao Yakui

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.