linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep
@ 2021-11-09 18:11 Rob Clark
  2021-11-09 18:11 ` [PATCH 1/5] drm/msm: Remove unnecessary struct_mutex Rob Clark
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Rob Clark @ 2021-11-09 18:11 UTC (permalink / raw)
  To: dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Akhil P Oommen,
	Rob Clark, AngeloGioacchino Del Regno, Bjorn Andersson,
	Christian König, Dan Carpenter, Deepak R Varma,
	Dmitry Baryshkov, Douglas Anderson, Eric Anholt, Iskren Chernev,
	Jonathan Marek, Konrad Dybcio, open list, Marijn Suijten,
	Sai Prakash Ranjan, Sharat Masetty, Yangtao Li

From: Rob Clark <robdclark@chromium.org>

This started out as conversion to using drm/sched to handle job timeout,
recovery, and retire (and delete a bunch of code), but the latter part
is on hold until drm/sched is fixed to properly handle job retire/
cleanup before deciding which job triggered the fault/timeout[1].  But
the rest is worthwhile cleanup, and the last patch is needed for an igt
test that I'm working on to exercise timeout/fault recovery[2].

[1] https://lore.kernel.org/all/1630457207-13107-2-git-send-email-Monk.Liu@amd.com/
[2] https://patchwork.freedesktop.org/series/96722/

Rob Clark (5):
  drm/msm: Remove unnecessary struct_mutex
  drm/msm: Drop priv->lastctx
  drm/msm: Remove struct_mutex usage
  drm/msm: Handle fence rollover
  drm/msm: Add debugfs to disable hw err handling

 drivers/gpu/drm/msm/adreno/a2xx_gpu.c      |  3 +-
 drivers/gpu/drm/msm/adreno/a3xx_gpu.c      |  3 +-
 drivers/gpu/drm/msm/adreno/a4xx_gpu.c      |  3 +-
 drivers/gpu/drm/msm/adreno/a5xx_debugfs.c  |  4 +-
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c      | 14 +++---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c      | 13 +++---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h      | 10 -----
 drivers/gpu/drm/msm/adreno/adreno_device.c |  4 +-
 drivers/gpu/drm/msm/msm_debugfs.c          | 52 +++++++++-------------
 drivers/gpu/drm/msm/msm_drv.c              |  6 ---
 drivers/gpu/drm/msm/msm_drv.h              | 11 ++++-
 drivers/gpu/drm/msm/msm_fbdev.c            | 13 ++----
 drivers/gpu/drm/msm/msm_fence.h            | 12 +++++
 drivers/gpu/drm/msm/msm_gpu.c              | 22 ++++-----
 drivers/gpu/drm/msm/msm_gpu.h              | 33 +++++++++++---
 drivers/gpu/drm/msm/msm_perf.c             |  9 ++--
 drivers/gpu/drm/msm/msm_rd.c               | 16 ++++---
 drivers/gpu/drm/msm/msm_ringbuffer.c       |  4 +-
 18 files changed, 125 insertions(+), 107 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/5] drm/msm: Remove unnecessary struct_mutex
  2021-11-09 18:11 [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep Rob Clark
@ 2021-11-09 18:11 ` Rob Clark
  2021-11-09 18:11 ` [PATCH 2/5] drm/msm: Drop priv->lastctx Rob Clark
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Rob Clark @ 2021-11-09 18:11 UTC (permalink / raw)
  To: dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Akhil P Oommen,
	Rob Clark, Rob Clark, Sean Paul, David Airlie, Daniel Vetter,
	open list

From: Rob Clark <robdclark@chromium.org>

The struct_mutex locking is a remnant from the days before per-obj locks,
and no longer needed.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/msm_debugfs.c | 37 ++++++++++---------------------
 drivers/gpu/drm/msm/msm_fbdev.c   | 13 ++++-------
 2 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index dee13fedee3b..698e86f5b960 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -134,8 +134,10 @@ DEFINE_SIMPLE_ATTRIBUTE(shrink_fops,
 			"0x%08llx\n");
 
 
-static int msm_gem_show(struct drm_device *dev, struct seq_file *m)
+static int msm_gem_show(struct seq_file *m, void *arg)
 {
+	struct drm_info_node *node = (struct drm_info_node *) m->private;
+	struct drm_device *dev = node->minor->dev;
 	struct msm_drm_private *priv = dev->dev_private;
 	int ret;
 
@@ -150,8 +152,10 @@ static int msm_gem_show(struct drm_device *dev, struct seq_file *m)
 	return 0;
 }
 
-static int msm_mm_show(struct drm_device *dev, struct seq_file *m)
+static int msm_mm_show(struct seq_file *m, void *arg)
 {
+	struct drm_info_node *node = (struct drm_info_node *) m->private;
+	struct drm_device *dev = node->minor->dev;
 	struct drm_printer p = drm_seq_file_printer(m);
 
 	drm_mm_print(&dev->vma_offset_manager->vm_addr_space_mm, &p);
@@ -159,8 +163,10 @@ static int msm_mm_show(struct drm_device *dev, struct seq_file *m)
 	return 0;
 }
 
-static int msm_fb_show(struct drm_device *dev, struct seq_file *m)
+static int msm_fb_show(struct seq_file *m, void *arg)
 {
+	struct drm_info_node *node = (struct drm_info_node *) m->private;
+	struct drm_device *dev = node->minor->dev;
 	struct msm_drm_private *priv = dev->dev_private;
 	struct drm_framebuffer *fb, *fbdev_fb = NULL;
 
@@ -183,29 +189,10 @@ static int msm_fb_show(struct drm_device *dev, struct seq_file *m)
 	return 0;
 }
 
-static int show_locked(struct seq_file *m, void *arg)
-{
-	struct drm_info_node *node = (struct drm_info_node *) m->private;
-	struct drm_device *dev = node->minor->dev;
-	int (*show)(struct drm_device *dev, struct seq_file *m) =
-			node->info_ent->data;
-	int ret;
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	ret = show(dev, m);
-
-	mutex_unlock(&dev->struct_mutex);
-
-	return ret;
-}
-
 static struct drm_info_list msm_debugfs_list[] = {
-		{"gem", show_locked, 0, msm_gem_show},
-		{ "mm", show_locked, 0, msm_mm_show },
-		{ "fb", show_locked, 0, msm_fb_show },
+		{"gem", msm_gem_show},
+		{ "mm", msm_mm_show },
+		{ "fb", msm_fb_show },
 };
 
 static int late_init_minor(struct drm_minor *minor)
diff --git a/drivers/gpu/drm/msm/msm_fbdev.c b/drivers/gpu/drm/msm/msm_fbdev.c
index 0daaeb54ff6f..4c39ef9dd75d 100644
--- a/drivers/gpu/drm/msm/msm_fbdev.c
+++ b/drivers/gpu/drm/msm/msm_fbdev.c
@@ -81,8 +81,6 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
 
 	bo = msm_framebuffer_bo(fb, 0);
 
-	mutex_lock(&dev->struct_mutex);
-
 	/*
 	 * NOTE: if we can be guaranteed to be able to map buffer
 	 * in panic (ie. lock-safe, etc) we could avoid pinning the
@@ -91,14 +89,14 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
 	ret = msm_gem_get_and_pin_iova(bo, priv->kms->aspace, &paddr);
 	if (ret) {
 		DRM_DEV_ERROR(dev->dev, "failed to get buffer obj iova: %d\n", ret);
-		goto fail_unlock;
+		goto fail;
 	}
 
 	fbi = drm_fb_helper_alloc_fbi(helper);
 	if (IS_ERR(fbi)) {
 		DRM_DEV_ERROR(dev->dev, "failed to allocate fb info\n");
 		ret = PTR_ERR(fbi);
-		goto fail_unlock;
+		goto fail;
 	}
 
 	DBG("fbi=%p, dev=%p", fbi, dev);
@@ -115,7 +113,7 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
 	fbi->screen_base = msm_gem_get_vaddr(bo);
 	if (IS_ERR(fbi->screen_base)) {
 		ret = PTR_ERR(fbi->screen_base);
-		goto fail_unlock;
+		goto fail;
 	}
 	fbi->screen_size = bo->size;
 	fbi->fix.smem_start = paddr;
@@ -124,12 +122,9 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
 	DBG("par=%p, %dx%d", fbi->par, fbi->var.xres, fbi->var.yres);
 	DBG("allocated %dx%d fb", fbdev->fb->width, fbdev->fb->height);
 
-	mutex_unlock(&dev->struct_mutex);
-
 	return 0;
 
-fail_unlock:
-	mutex_unlock(&dev->struct_mutex);
+fail:
 	drm_framebuffer_remove(fb);
 	return ret;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/5] drm/msm: Drop priv->lastctx
  2021-11-09 18:11 [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep Rob Clark
  2021-11-09 18:11 ` [PATCH 1/5] drm/msm: Remove unnecessary struct_mutex Rob Clark
@ 2021-11-09 18:11 ` Rob Clark
  2021-11-11 16:44   ` Akhil P Oommen
  2021-11-09 18:11 ` [PATCH 3/5] drm/msm: Remove struct_mutex usage Rob Clark
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 9+ messages in thread
From: Rob Clark @ 2021-11-09 18:11 UTC (permalink / raw)
  To: dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Akhil P Oommen,
	Rob Clark, Rob Clark, Sean Paul, David Airlie, Daniel Vetter,
	Iskren Chernev, Dan Carpenter, AngeloGioacchino Del Regno,
	Konrad Dybcio, Dmitry Baryshkov, Marijn Suijten, Jonathan Marek,
	Sai Prakash Ranjan, Sharat Masetty, Douglas Anderson, Yangtao Li,
	open list

From: Rob Clark <robdclark@chromium.org>

cur_ctx_seqno already does the same thing, but handles the edge cases
where a refcnt'd context can live after lastclose.  So let's not have
two ways to do the same thing.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/adreno/a2xx_gpu.c |  3 +--
 drivers/gpu/drm/msm/adreno/a3xx_gpu.c |  3 +--
 drivers/gpu/drm/msm/adreno/a4xx_gpu.c |  3 +--
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c |  8 +++-----
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c |  9 +++------
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 10 ----------
 drivers/gpu/drm/msm/msm_drv.c         |  6 ------
 drivers/gpu/drm/msm/msm_drv.h         |  2 +-
 drivers/gpu/drm/msm/msm_gpu.c         |  2 +-
 drivers/gpu/drm/msm/msm_gpu.h         | 11 +++++++++++
 10 files changed, 22 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c
index bdc989183c64..22e8295a5e2b 100644
--- a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c
@@ -12,7 +12,6 @@ static bool a2xx_idle(struct msm_gpu *gpu);
 
 static void a2xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
-	struct msm_drm_private *priv = gpu->dev->dev_private;
 	struct msm_ringbuffer *ring = submit->ring;
 	unsigned int i;
 
@@ -23,7 +22,7 @@ static void a2xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 			break;
 		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
 			/* ignore if there has not been a ctx switch: */
-			if (priv->lastctx == submit->queue->ctx)
+			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
 				break;
 			fallthrough;
 		case MSM_SUBMIT_CMD_BUF:
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 8fb847c174ff..2e481e2692ba 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -30,7 +30,6 @@ static bool a3xx_idle(struct msm_gpu *gpu);
 
 static void a3xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
-	struct msm_drm_private *priv = gpu->dev->dev_private;
 	struct msm_ringbuffer *ring = submit->ring;
 	unsigned int i;
 
@@ -41,7 +40,7 @@ static void a3xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 			break;
 		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
 			/* ignore if there has not been a ctx switch: */
-			if (priv->lastctx == submit->queue->ctx)
+			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
 				break;
 			fallthrough;
 		case MSM_SUBMIT_CMD_BUF:
diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
index a96ee79cc5e0..c5524d6e8705 100644
--- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
@@ -24,7 +24,6 @@ static bool a4xx_idle(struct msm_gpu *gpu);
 
 static void a4xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
-	struct msm_drm_private *priv = gpu->dev->dev_private;
 	struct msm_ringbuffer *ring = submit->ring;
 	unsigned int i;
 
@@ -35,7 +34,7 @@ static void a4xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 			break;
 		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
 			/* ignore if there has not been a ctx switch: */
-			if (priv->lastctx == submit->queue->ctx)
+			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
 				break;
 			fallthrough;
 		case MSM_SUBMIT_CMD_BUF:
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 5e2750eb3810..6163990a4d09 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -65,7 +65,6 @@ void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
 
 static void a5xx_submit_in_rb(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
-	struct msm_drm_private *priv = gpu->dev->dev_private;
 	struct msm_ringbuffer *ring = submit->ring;
 	struct msm_gem_object *obj;
 	uint32_t *ptr, dwords;
@@ -76,7 +75,7 @@ static void a5xx_submit_in_rb(struct msm_gpu *gpu, struct msm_gem_submit *submit
 		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
 			break;
 		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
-			if (priv->lastctx == submit->queue->ctx)
+			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
 				break;
 			fallthrough;
 		case MSM_SUBMIT_CMD_BUF:
@@ -126,12 +125,11 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
-	struct msm_drm_private *priv = gpu->dev->dev_private;
 	struct msm_ringbuffer *ring = submit->ring;
 	unsigned int i, ibs = 0;
 
 	if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) {
-		priv->lastctx = NULL;
+		gpu->cur_ctx_seqno = 0;
 		a5xx_submit_in_rb(gpu, submit);
 		return;
 	}
@@ -166,7 +164,7 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
 			break;
 		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
-			if (priv->lastctx == submit->queue->ctx)
+			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
 				break;
 			fallthrough;
 		case MSM_SUBMIT_CMD_BUF:
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 33da25b81615..3d2da81cb2c9 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -106,7 +106,7 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
 	u32 asid;
 	u64 memptr = rbmemptr(ring, ttbr0);
 
-	if (ctx->seqno == a6xx_gpu->cur_ctx_seqno)
+	if (ctx->seqno == a6xx_gpu->base.base.cur_ctx_seqno)
 		return;
 
 	if (msm_iommu_pagetable_params(ctx->aspace->mmu, &ttbr, &asid))
@@ -138,14 +138,11 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
 
 	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
 	OUT_RING(ring, 0x31);
-
-	a6xx_gpu->cur_ctx_seqno = ctx->seqno;
 }
 
 static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
 	unsigned int index = submit->seqno % MSM_GPU_SUBMIT_STATS_COUNT;
-	struct msm_drm_private *priv = gpu->dev->dev_private;
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 	struct msm_ringbuffer *ring = submit->ring;
@@ -177,7 +174,7 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
 			break;
 		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
-			if (priv->lastctx == submit->queue->ctx)
+			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
 				break;
 			fallthrough;
 		case MSM_SUBMIT_CMD_BUF:
@@ -1081,7 +1078,7 @@ static int hw_init(struct msm_gpu *gpu)
 	/* Always come up on rb 0 */
 	a6xx_gpu->cur_ring = gpu->rb[0];
 
-	a6xx_gpu->cur_ctx_seqno = 0;
+	gpu->cur_ctx_seqno = 0;
 
 	/* Enable the SQE_to start the CP engine */
 	gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 8e5527c881b1..86e0a7c3fe6d 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -20,16 +20,6 @@ struct a6xx_gpu {
 
 	struct msm_ringbuffer *cur_ring;
 
-	/**
-	 * cur_ctx_seqno:
-	 *
-	 * The ctx->seqno value of the context with current pgtables
-	 * installed.  Tracked by seqno rather than pointer value to
-	 * avoid dangling pointers, and cases where a ctx can be freed
-	 * and a new one created with the same address.
-	 */
-	int cur_ctx_seqno;
-
 	struct a6xx_gmu gmu;
 
 	struct drm_gem_object *shadow_bo;
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 7936e8d498dd..73e827641024 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -752,14 +752,8 @@ static void context_close(struct msm_file_private *ctx)
 
 static void msm_postclose(struct drm_device *dev, struct drm_file *file)
 {
-	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_file_private *ctx = file->driver_priv;
 
-	mutex_lock(&dev->struct_mutex);
-	if (ctx == priv->lastctx)
-		priv->lastctx = NULL;
-	mutex_unlock(&dev->struct_mutex);
-
 	context_close(ctx);
 }
 
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 69952b239384..2943c21d9aac 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -164,7 +164,7 @@ struct msm_drm_private {
 
 	/* when we have more than one 'msm_gpu' these need to be an array: */
 	struct msm_gpu *gpu;
-	struct msm_file_private *lastctx;
+
 	/* gpu is only set on open(), but we need this info earlier */
 	bool is_a2xx;
 	bool has_cached_coherent;
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 2c46cd968ac4..3dfc58e6498f 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -763,7 +763,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 	mutex_unlock(&gpu->active_lock);
 
 	gpu->funcs->submit(gpu, submit);
-	priv->lastctx = submit->queue->ctx;
+	gpu->cur_ctx_seqno = submit->queue->ctx->seqno;
 
 	hangcheck_timer_reset(gpu);
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 59870095ea41..623ee416c568 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -144,6 +144,17 @@ struct msm_gpu {
 	struct msm_ringbuffer *rb[MSM_GPU_MAX_RINGS];
 	int nr_rings;
 
+	/**
+	 * cur_ctx_seqno:
+	 *
+	 * The ctx->seqno value of the last context to submit rendering,
+	 * and the one with current pgtables installed (for generations
+	 * that support per-context pgtables).  Tracked by seqno rather
+	 * than pointer value to avoid dangling pointers, and cases where
+	 * a ctx can be freed and a new one created with the same address.
+	 */
+	int cur_ctx_seqno;
+
 	/*
 	 * List of GEM active objects on this gpu.  Protected by
 	 * msm_drm_private::mm_lock
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/5] drm/msm: Remove struct_mutex usage
  2021-11-09 18:11 [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep Rob Clark
  2021-11-09 18:11 ` [PATCH 1/5] drm/msm: Remove unnecessary struct_mutex Rob Clark
  2021-11-09 18:11 ` [PATCH 2/5] drm/msm: Drop priv->lastctx Rob Clark
@ 2021-11-09 18:11 ` Rob Clark
  2021-11-09 18:11 ` [PATCH 4/5] drm/msm: Handle fence rollover Rob Clark
  2021-11-09 18:11 ` [PATCH 5/5] drm/msm: Add debugfs to disable hw err handling Rob Clark
  4 siblings, 0 replies; 9+ messages in thread
From: Rob Clark @ 2021-11-09 18:11 UTC (permalink / raw)
  To: dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Akhil P Oommen,
	Rob Clark, Rob Clark, Sean Paul, David Airlie, Daniel Vetter,
	Dmitry Baryshkov, Deepak R Varma, Christian König,
	Jonathan Marek, AngeloGioacchino Del Regno, Iskren Chernev,
	Bjorn Andersson, open list

From: Rob Clark <robdclark@chromium.org>

The remaining struct_mutex usage is just to serialize various gpu
related things (submit/retire/recover/fault/etc), so replace
struct_mutex with gpu->lock.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/adreno/a5xx_debugfs.c  |  4 ++--
 drivers/gpu/drm/msm/adreno/adreno_device.c |  4 ++--
 drivers/gpu/drm/msm/msm_debugfs.c          | 12 ++++++------
 drivers/gpu/drm/msm/msm_gpu.c              | 14 +++++++-------
 drivers/gpu/drm/msm/msm_gpu.h              | 20 +++++++++++++++-----
 drivers/gpu/drm/msm/msm_perf.c             |  9 ++++++---
 drivers/gpu/drm/msm/msm_rd.c               | 16 +++++++++-------
 drivers/gpu/drm/msm/msm_ringbuffer.c       |  4 ++--
 8 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c
index dd593ec2bc56..6bd397a85834 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c
@@ -107,7 +107,7 @@ reset_set(void *data, u64 val)
 	 * try to reset an active GPU.
 	 */
 
-	mutex_lock(&dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 
 	release_firmware(adreno_gpu->fw[ADRENO_FW_PM4]);
 	adreno_gpu->fw[ADRENO_FW_PM4] = NULL;
@@ -133,7 +133,7 @@ reset_set(void *data, u64 val)
 	gpu->funcs->recover(gpu);
 
 	pm_runtime_put_sync(&gpu->pdev->dev);
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
index 2a6ce76656aa..9e01ccc800a6 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_device.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
@@ -408,9 +408,9 @@ struct msm_gpu *adreno_load_gpu(struct drm_device *dev)
 		return NULL;
 	}
 
-	mutex_lock(&dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 	ret = msm_gpu_hw_init(gpu);
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 	pm_runtime_put_autosuspend(&pdev->dev);
 	if (ret) {
 		DRM_DEV_ERROR(dev->dev, "gpu hw init failed: %d\n", ret);
diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index 698e86f5b960..6a99e8b5d25d 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -29,14 +29,14 @@ static int msm_gpu_show(struct seq_file *m, void *arg)
 	struct msm_gpu *gpu = priv->gpu;
 	int ret;
 
-	ret = mutex_lock_interruptible(&show_priv->dev->struct_mutex);
+	ret = mutex_lock_interruptible(&gpu->lock);
 	if (ret)
 		return ret;
 
 	drm_printf(&p, "%s Status:\n", gpu->name);
 	gpu->funcs->show(gpu, show_priv->state, &p);
 
-	mutex_unlock(&show_priv->dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	return 0;
 }
@@ -48,9 +48,9 @@ static int msm_gpu_release(struct inode *inode, struct file *file)
 	struct msm_drm_private *priv = show_priv->dev->dev_private;
 	struct msm_gpu *gpu = priv->gpu;
 
-	mutex_lock(&show_priv->dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 	gpu->funcs->gpu_state_put(show_priv->state);
-	mutex_unlock(&show_priv->dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	kfree(show_priv);
 
@@ -72,7 +72,7 @@ static int msm_gpu_open(struct inode *inode, struct file *file)
 	if (!show_priv)
 		return -ENOMEM;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	ret = mutex_lock_interruptible(&gpu->lock);
 	if (ret)
 		goto free_priv;
 
@@ -81,7 +81,7 @@ static int msm_gpu_open(struct inode *inode, struct file *file)
 	show_priv->state = gpu->funcs->gpu_state_get(gpu);
 	pm_runtime_put_sync(&gpu->pdev->dev);
 
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	if (IS_ERR(show_priv->state)) {
 		ret = PTR_ERR(show_priv->state);
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 3dfc58e6498f..13de1241d595 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -150,7 +150,7 @@ int msm_gpu_hw_init(struct msm_gpu *gpu)
 {
 	int ret;
 
-	WARN_ON(!mutex_is_locked(&gpu->dev->struct_mutex));
+	WARN_ON(!mutex_is_locked(&gpu->lock));
 
 	if (!gpu->needs_hw_init)
 		return 0;
@@ -361,7 +361,7 @@ static void recover_worker(struct kthread_work *work)
 	char *comm = NULL, *cmd = NULL;
 	int i;
 
-	mutex_lock(&dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 
 	DRM_DEV_ERROR(dev->dev, "%s: hangcheck recover!\n", gpu->name);
 
@@ -442,7 +442,7 @@ static void recover_worker(struct kthread_work *work)
 		}
 	}
 
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	msm_gpu_retire(gpu);
 }
@@ -450,12 +450,11 @@ static void recover_worker(struct kthread_work *work)
 static void fault_worker(struct kthread_work *work)
 {
 	struct msm_gpu *gpu = container_of(work, struct msm_gpu, fault_work);
-	struct drm_device *dev = gpu->dev;
 	struct msm_gem_submit *submit;
 	struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
 	char *comm = NULL, *cmd = NULL;
 
-	mutex_lock(&dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 
 	submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1);
 	if (submit && submit->fault_dumped)
@@ -490,7 +489,7 @@ static void fault_worker(struct kthread_work *work)
 	memset(&gpu->fault_info, 0, sizeof(gpu->fault_info));
 	gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu);
 
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 }
 
 static void hangcheck_timer_reset(struct msm_gpu *gpu)
@@ -733,7 +732,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 	struct msm_ringbuffer *ring = submit->ring;
 	unsigned long flags;
 
-	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+	WARN_ON(!mutex_is_locked(&gpu->lock));
 
 	pm_runtime_get_sync(&gpu->pdev->dev);
 
@@ -848,6 +847,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 
 	INIT_LIST_HEAD(&gpu->active_list);
 	mutex_init(&gpu->active_lock);
+	mutex_init(&gpu->lock);
 	kthread_init_work(&gpu->retire_work, retire_worker);
 	kthread_init_work(&gpu->recover_work, recover_worker);
 	kthread_init_work(&gpu->fault_work, fault_worker);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 623ee416c568..0dcc31c27ac3 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -161,13 +161,23 @@ struct msm_gpu {
 	 */
 	struct list_head active_list;
 
+	/**
+	 * lock:
+	 *
+	 * General lock for serializing all the gpu things.
+	 *
+	 * TODO move to per-ring locking where feasible (ie. submit/retire
+	 * path, etc)
+	 */
+	struct mutex lock;
+
 	/**
 	 * active_submits:
 	 *
 	 * The number of submitted but not yet retired submits, used to
 	 * determine transitions between active and idle.
 	 *
-	 * Protected by lock
+	 * Protected by active_lock
 	 */
 	int active_submits;
 
@@ -541,28 +551,28 @@ static inline struct msm_gpu_state *msm_gpu_crashstate_get(struct msm_gpu *gpu)
 {
 	struct msm_gpu_state *state = NULL;
 
-	mutex_lock(&gpu->dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 
 	if (gpu->crashstate) {
 		kref_get(&gpu->crashstate->ref);
 		state = gpu->crashstate;
 	}
 
-	mutex_unlock(&gpu->dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	return state;
 }
 
 static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu)
 {
-	mutex_lock(&gpu->dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 
 	if (gpu->crashstate) {
 		if (gpu->funcs->gpu_state_put(gpu->crashstate))
 			gpu->crashstate = NULL;
 	}
 
-	mutex_unlock(&gpu->dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 }
 
 /*
diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
index 3a27153eef08..3d3da79fec2a 100644
--- a/drivers/gpu/drm/msm/msm_perf.c
+++ b/drivers/gpu/drm/msm/msm_perf.c
@@ -155,9 +155,12 @@ static int perf_open(struct inode *inode, struct file *file)
 	struct msm_gpu *gpu = priv->gpu;
 	int ret = 0;
 
-	mutex_lock(&dev->struct_mutex);
+	if (!gpu)
+		return -ENODEV;
 
-	if (perf->open || !gpu) {
+	mutex_lock(&gpu->lock);
+
+	if (perf->open) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -171,7 +174,7 @@ static int perf_open(struct inode *inode, struct file *file)
 	perf->next_jiffies = jiffies + SAMPLE_TIME;
 
 out:
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c
index b55398a34fa4..81432ec07012 100644
--- a/drivers/gpu/drm/msm/msm_rd.c
+++ b/drivers/gpu/drm/msm/msm_rd.c
@@ -86,7 +86,7 @@ struct msm_rd_state {
 	struct msm_gem_submit *submit;
 
 	/* fifo access is synchronized on the producer side by
-	 * struct_mutex held by submit code (otherwise we could
+	 * gpu->lock held by submit code (otherwise we could
 	 * end up w/ cmds logged in different order than they
 	 * were executed).  And read_lock synchronizes the reads
 	 */
@@ -181,9 +181,12 @@ static int rd_open(struct inode *inode, struct file *file)
 	uint32_t gpu_id;
 	int ret = 0;
 
-	mutex_lock(&dev->struct_mutex);
+	if (!gpu)
+		return -ENODEV;
 
-	if (rd->open || !gpu) {
+	mutex_lock(&gpu->lock);
+
+	if (rd->open) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -200,7 +203,7 @@ static int rd_open(struct inode *inode, struct file *file)
 	rd_write_section(rd, RD_GPU_ID, &gpu_id, sizeof(gpu_id));
 
 out:
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 	return ret;
 }
 
@@ -340,11 +343,10 @@ static void snapshot_buf(struct msm_rd_state *rd,
 	msm_gem_unlock(&obj->base);
 }
 
-/* called under struct_mutex */
+/* called under gpu->lock */
 void msm_rd_dump_submit(struct msm_rd_state *rd, struct msm_gem_submit *submit,
 		const char *fmt, ...)
 {
-	struct drm_device *dev = submit->dev;
 	struct task_struct *task;
 	char msg[256];
 	int i, n;
@@ -355,7 +357,7 @@ void msm_rd_dump_submit(struct msm_rd_state *rd, struct msm_gem_submit *submit,
 	/* writing into fifo is serialized by caller, and
 	 * rd->read_lock is used to serialize the reads
 	 */
-	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+	WARN_ON(!mutex_is_locked(&submit->gpu->lock));
 
 	if (fmt) {
 		va_list args;
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index bd54c1412649..a2314b75962f 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -32,11 +32,11 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job)
 	pm_runtime_get_sync(&gpu->pdev->dev);
 
 	/* TODO move submit path over to using a per-ring lock.. */
-	mutex_lock(&gpu->dev->struct_mutex);
+	mutex_lock(&gpu->lock);
 
 	msm_gpu_submit(gpu, submit);
 
-	mutex_unlock(&gpu->dev->struct_mutex);
+	mutex_unlock(&gpu->lock);
 
 	pm_runtime_put(&gpu->pdev->dev);
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 4/5] drm/msm: Handle fence rollover
  2021-11-09 18:11 [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep Rob Clark
                   ` (2 preceding siblings ...)
  2021-11-09 18:11 ` [PATCH 3/5] drm/msm: Remove struct_mutex usage Rob Clark
@ 2021-11-09 18:11 ` Rob Clark
  2021-11-11 16:58   ` Akhil P Oommen
  2021-11-09 18:11 ` [PATCH 5/5] drm/msm: Add debugfs to disable hw err handling Rob Clark
  4 siblings, 1 reply; 9+ messages in thread
From: Rob Clark @ 2021-11-09 18:11 UTC (permalink / raw)
  To: dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Akhil P Oommen,
	Rob Clark, Rob Clark, Sean Paul, David Airlie, Daniel Vetter,
	open list

From: Rob Clark <robdclark@chromium.org>

Add some helpers for fence comparision, which handle rollover properly,
and stop open coding fence seqno comparisions.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/msm_fence.h | 12 ++++++++++++
 drivers/gpu/drm/msm/msm_gpu.c   |  6 +++---
 drivers/gpu/drm/msm/msm_gpu.h   |  2 +-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h
index 4783db528bcc..17ee3822b423 100644
--- a/drivers/gpu/drm/msm/msm_fence.h
+++ b/drivers/gpu/drm/msm/msm_fence.h
@@ -60,4 +60,16 @@ void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence);
 
 struct dma_fence * msm_fence_alloc(struct msm_fence_context *fctx);
 
+static inline bool
+fence_before(uint32_t a, uint32_t b)
+{
+   return (int32_t)(a - b) < 0;
+}
+
+static inline bool
+fence_after(uint32_t a, uint32_t b)
+{
+   return (int32_t)(a - b) > 0;
+}
+
 #endif
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 13de1241d595..0f78c2615272 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -172,7 +172,7 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
 
 	spin_lock_irqsave(&ring->submit_lock, flags);
 	list_for_each_entry(submit, &ring->submits, node) {
-		if (submit->seqno > fence)
+		if (fence_after(submit->seqno, fence))
 			break;
 
 		msm_update_fence(submit->ring->fctx,
@@ -509,7 +509,7 @@ static void hangcheck_handler(struct timer_list *t)
 	if (fence != ring->hangcheck_fence) {
 		/* some progress has been made.. ya! */
 		ring->hangcheck_fence = fence;
-	} else if (fence < ring->seqno) {
+	} else if (fence_before(fence, ring->seqno)) {
 		/* no progress and not done.. hung! */
 		ring->hangcheck_fence = fence;
 		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
@@ -523,7 +523,7 @@ static void hangcheck_handler(struct timer_list *t)
 	}
 
 	/* if still more pending work, reset the hangcheck timer: */
-	if (ring->seqno > ring->hangcheck_fence)
+	if (fence_after(ring->seqno, ring->hangcheck_fence))
 		hangcheck_timer_reset(gpu);
 
 	/* workaround for missing irq: */
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 0dcc31c27ac3..bd4e0024033e 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -258,7 +258,7 @@ static inline bool msm_gpu_active(struct msm_gpu *gpu)
 	for (i = 0; i < gpu->nr_rings; i++) {
 		struct msm_ringbuffer *ring = gpu->rb[i];
 
-		if (ring->seqno > ring->memptrs->fence)
+		if (fence_after(ring->seqno, ring->memptrs->fence))
 			return true;
 	}
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 5/5] drm/msm: Add debugfs to disable hw err handling
  2021-11-09 18:11 [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep Rob Clark
                   ` (3 preceding siblings ...)
  2021-11-09 18:11 ` [PATCH 4/5] drm/msm: Handle fence rollover Rob Clark
@ 2021-11-09 18:11 ` Rob Clark
  2021-11-11 17:14   ` Akhil P Oommen
  4 siblings, 1 reply; 9+ messages in thread
From: Rob Clark @ 2021-11-09 18:11 UTC (permalink / raw)
  To: dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Akhil P Oommen,
	Rob Clark, Rob Clark, Sean Paul, David Airlie, Daniel Vetter,
	AngeloGioacchino Del Regno, Konrad Dybcio, Yangtao Li,
	Marijn Suijten, Jonathan Marek, Dmitry Baryshkov,
	Sai Prakash Ranjan, Sharat Masetty, Douglas Anderson, open list

From: Rob Clark <robdclark@chromium.org>

Add a debugfs interface to ignore hw error irqs, in order to force
fallback to sw hangcheck mechanism.  Because the hw error detection is
pretty good on newer gens, we need this for igt tests to test the sw
hang detection.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 ++++++
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++
 drivers/gpu/drm/msm/msm_debugfs.c     | 3 +++
 drivers/gpu/drm/msm/msm_drv.h         | 9 +++++++++
 4 files changed, 22 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 6163990a4d09..ec8e043c9d38 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1252,6 +1252,7 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
 
 static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 {
+	struct msm_drm_private *priv = gpu->dev->dev_private;
 	u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
 
 	/*
@@ -1261,6 +1262,11 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 	gpu_write(gpu, REG_A5XX_RBBM_INT_CLEAR_CMD,
 		status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR);
 
+	if (priv->disable_err_irq) {
+		status &= A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS |
+			  A5XX_RBBM_INT_0_MASK_CP_SW;
+	}
+
 	/* Pass status to a5xx_rbbm_err_irq because we've already cleared it */
 	if (status & RBBM_ERROR_MASK)
 		a5xx_rbbm_err_irq(gpu, status);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 3d2da81cb2c9..8a2af3a27e33 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1373,10 +1373,14 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
 
 static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
 {
+	struct msm_drm_private *priv = gpu->dev->dev_private;
 	u32 status = gpu_read(gpu, REG_A6XX_RBBM_INT_0_STATUS);
 
 	gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status);
 
+	if (priv->disable_err_irq)
+		status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS;
+
 	if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
 		a6xx_fault_detect_irq(gpu);
 
diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index 6a99e8b5d25d..956b1efc3721 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -242,6 +242,9 @@ void msm_debugfs_init(struct drm_minor *minor)
 	debugfs_create_u32("hangcheck_period_ms", 0600, minor->debugfs_root,
 		&priv->hangcheck_period);
 
+	debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
+		&priv->disable_err_irq);
+
 	debugfs_create_file("shrink", S_IRWXU, minor->debugfs_root,
 		dev, &shrink_fops);
 
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 2943c21d9aac..a8da7a7efb84 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -246,6 +246,15 @@ struct msm_drm_private {
 
 	/* For hang detection, in ms */
 	unsigned int hangcheck_period;
+
+	/**
+	 * disable_err_irq:
+	 *
+	 * Disable handling of GPU hw error interrupts, to force fallback to
+	 * sw hangcheck timer.  Written (via debugfs) by igt tests to test
+	 * the sw hangcheck mechanism.
+	 */
+	bool disable_err_irq;
 };
 
 struct msm_format {
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/5] drm/msm: Drop priv->lastctx
  2021-11-09 18:11 ` [PATCH 2/5] drm/msm: Drop priv->lastctx Rob Clark
@ 2021-11-11 16:44   ` Akhil P Oommen
  0 siblings, 0 replies; 9+ messages in thread
From: Akhil P Oommen @ 2021-11-11 16:44 UTC (permalink / raw)
  To: Rob Clark, dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Rob Clark, Sean Paul,
	David Airlie, Daniel Vetter, Iskren Chernev, Dan Carpenter,
	AngeloGioacchino Del Regno, Konrad Dybcio, Dmitry Baryshkov,
	Marijn Suijten, Jonathan Marek, Sai Prakash Ranjan,
	Sharat Masetty, Douglas Anderson, Yangtao Li, open list

On 11/9/2021 11:41 PM, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> cur_ctx_seqno already does the same thing, but handles the edge cases
> where a refcnt'd context can live after lastclose.  So let's not have
> two ways to do the same thing.
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>   drivers/gpu/drm/msm/adreno/a2xx_gpu.c |  3 +--
>   drivers/gpu/drm/msm/adreno/a3xx_gpu.c |  3 +--
>   drivers/gpu/drm/msm/adreno/a4xx_gpu.c |  3 +--
>   drivers/gpu/drm/msm/adreno/a5xx_gpu.c |  8 +++-----
>   drivers/gpu/drm/msm/adreno/a6xx_gpu.c |  9 +++------
>   drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 10 ----------
>   drivers/gpu/drm/msm/msm_drv.c         |  6 ------
>   drivers/gpu/drm/msm/msm_drv.h         |  2 +-
>   drivers/gpu/drm/msm/msm_gpu.c         |  2 +-
>   drivers/gpu/drm/msm/msm_gpu.h         | 11 +++++++++++
>   10 files changed, 22 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c
> index bdc989183c64..22e8295a5e2b 100644
> --- a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c
> @@ -12,7 +12,6 @@ static bool a2xx_idle(struct msm_gpu *gpu);
>   
>   static void a2xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   {
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	struct msm_ringbuffer *ring = submit->ring;
>   	unsigned int i;
>   
> @@ -23,7 +22,7 @@ static void a2xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   			break;
>   		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
>   			/* ignore if there has not been a ctx switch: */
> -			if (priv->lastctx == submit->queue->ctx)
> +			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
>   				break;
>   			fallthrough;
>   		case MSM_SUBMIT_CMD_BUF:
> diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> index 8fb847c174ff..2e481e2692ba 100644
> --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> @@ -30,7 +30,6 @@ static bool a3xx_idle(struct msm_gpu *gpu);
>   
>   static void a3xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   {
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	struct msm_ringbuffer *ring = submit->ring;
>   	unsigned int i;
>   
> @@ -41,7 +40,7 @@ static void a3xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   			break;
>   		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
>   			/* ignore if there has not been a ctx switch: */
> -			if (priv->lastctx == submit->queue->ctx)
> +			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
>   				break;
>   			fallthrough;
>   		case MSM_SUBMIT_CMD_BUF:
> diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
> index a96ee79cc5e0..c5524d6e8705 100644
> --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
> @@ -24,7 +24,6 @@ static bool a4xx_idle(struct msm_gpu *gpu);
>   
>   static void a4xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   {
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	struct msm_ringbuffer *ring = submit->ring;
>   	unsigned int i;
>   
> @@ -35,7 +34,7 @@ static void a4xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   			break;
>   		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
>   			/* ignore if there has not been a ctx switch: */
> -			if (priv->lastctx == submit->queue->ctx)
> +			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
>   				break;
>   			fallthrough;
>   		case MSM_SUBMIT_CMD_BUF:
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index 5e2750eb3810..6163990a4d09 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -65,7 +65,6 @@ void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>   
>   static void a5xx_submit_in_rb(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   {
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	struct msm_ringbuffer *ring = submit->ring;
>   	struct msm_gem_object *obj;
>   	uint32_t *ptr, dwords;
> @@ -76,7 +75,7 @@ static void a5xx_submit_in_rb(struct msm_gpu *gpu, struct msm_gem_submit *submit
>   		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
>   			break;
>   		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
> -			if (priv->lastctx == submit->queue->ctx)
> +			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
>   				break;
>   			fallthrough;
>   		case MSM_SUBMIT_CMD_BUF:
> @@ -126,12 +125,11 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   {
>   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>   	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	struct msm_ringbuffer *ring = submit->ring;
>   	unsigned int i, ibs = 0;
>   
>   	if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) {
> -		priv->lastctx = NULL;
> +		gpu->cur_ctx_seqno = 0;
>   		a5xx_submit_in_rb(gpu, submit);
>   		return;
>   	}
> @@ -166,7 +164,7 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
>   			break;
>   		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
> -			if (priv->lastctx == submit->queue->ctx)
> +			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
>   				break;
>   			fallthrough;
>   		case MSM_SUBMIT_CMD_BUF:
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 33da25b81615..3d2da81cb2c9 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -106,7 +106,7 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
>   	u32 asid;
>   	u64 memptr = rbmemptr(ring, ttbr0);
>   
> -	if (ctx->seqno == a6xx_gpu->cur_ctx_seqno)
> +	if (ctx->seqno == a6xx_gpu->base.base.cur_ctx_seqno)
>   		return;
>   
>   	if (msm_iommu_pagetable_params(ctx->aspace->mmu, &ttbr, &asid))
> @@ -138,14 +138,11 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu,
>   
>   	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
>   	OUT_RING(ring, 0x31);
> -
> -	a6xx_gpu->cur_ctx_seqno = ctx->seqno;
>   }
>   
>   static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   {
>   	unsigned int index = submit->seqno % MSM_GPU_SUBMIT_STATS_COUNT;
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>   	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>   	struct msm_ringbuffer *ring = submit->ring;
> @@ -177,7 +174,7 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
>   			break;
>   		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
> -			if (priv->lastctx == submit->queue->ctx)
> +			if (gpu->cur_ctx_seqno == submit->queue->ctx->seqno)
>   				break;
>   			fallthrough;
>   		case MSM_SUBMIT_CMD_BUF:
> @@ -1081,7 +1078,7 @@ static int hw_init(struct msm_gpu *gpu)
>   	/* Always come up on rb 0 */
>   	a6xx_gpu->cur_ring = gpu->rb[0];
>   
> -	a6xx_gpu->cur_ctx_seqno = 0;
> +	gpu->cur_ctx_seqno = 0;
>   
>   	/* Enable the SQE_to start the CP engine */
>   	gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1);
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> index 8e5527c881b1..86e0a7c3fe6d 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> @@ -20,16 +20,6 @@ struct a6xx_gpu {
>   
>   	struct msm_ringbuffer *cur_ring;
>   
> -	/**
> -	 * cur_ctx_seqno:
> -	 *
> -	 * The ctx->seqno value of the context with current pgtables
> -	 * installed.  Tracked by seqno rather than pointer value to
> -	 * avoid dangling pointers, and cases where a ctx can be freed
> -	 * and a new one created with the same address.
> -	 */
> -	int cur_ctx_seqno;
> -
>   	struct a6xx_gmu gmu;
>   
>   	struct drm_gem_object *shadow_bo;
> diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
> index 7936e8d498dd..73e827641024 100644
> --- a/drivers/gpu/drm/msm/msm_drv.c
> +++ b/drivers/gpu/drm/msm/msm_drv.c
> @@ -752,14 +752,8 @@ static void context_close(struct msm_file_private *ctx)
>   
>   static void msm_postclose(struct drm_device *dev, struct drm_file *file)
>   {
> -	struct msm_drm_private *priv = dev->dev_private;
>   	struct msm_file_private *ctx = file->driver_priv;
>   
> -	mutex_lock(&dev->struct_mutex);
> -	if (ctx == priv->lastctx)
> -		priv->lastctx = NULL;
> -	mutex_unlock(&dev->struct_mutex);
> -
>   	context_close(ctx);
>   }
>   
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 69952b239384..2943c21d9aac 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -164,7 +164,7 @@ struct msm_drm_private {
>   
>   	/* when we have more than one 'msm_gpu' these need to be an array: */
>   	struct msm_gpu *gpu;
> -	struct msm_file_private *lastctx;
> +
>   	/* gpu is only set on open(), but we need this info earlier */
>   	bool is_a2xx;
>   	bool has_cached_coherent;
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 2c46cd968ac4..3dfc58e6498f 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -763,7 +763,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>   	mutex_unlock(&gpu->active_lock);
>   
>   	gpu->funcs->submit(gpu, submit);
> -	priv->lastctx = submit->queue->ctx;
> +	gpu->cur_ctx_seqno = submit->queue->ctx->seqno;
>   
>   	hangcheck_timer_reset(gpu);
>   }
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 59870095ea41..623ee416c568 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -144,6 +144,17 @@ struct msm_gpu {
>   	struct msm_ringbuffer *rb[MSM_GPU_MAX_RINGS];
>   	int nr_rings;
>   
> +	/**
> +	 * cur_ctx_seqno:
> +	 *
> +	 * The ctx->seqno value of the last context to submit rendering,
> +	 * and the one with current pgtables installed (for generations
> +	 * that support per-context pgtables).  Tracked by seqno rather
> +	 * than pointer value to avoid dangling pointers, and cases where
> +	 * a ctx can be freed and a new one created with the same address.
> +	 */
> +	int cur_ctx_seqno;
> +
>   	/*
>   	 * List of GEM active objects on this gpu.  Protected by
>   	 * msm_drm_private::mm_lock
> 

Reviewed-by: Akhil P Oommen <akhilpo@codeaurora.org>

-Akhil.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 4/5] drm/msm: Handle fence rollover
  2021-11-09 18:11 ` [PATCH 4/5] drm/msm: Handle fence rollover Rob Clark
@ 2021-11-11 16:58   ` Akhil P Oommen
  0 siblings, 0 replies; 9+ messages in thread
From: Akhil P Oommen @ 2021-11-11 16:58 UTC (permalink / raw)
  To: Rob Clark, dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Rob Clark, Sean Paul,
	David Airlie, Daniel Vetter, open list

On 11/9/2021 11:41 PM, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> Add some helpers for fence comparision, which handle rollover properly,
> and stop open coding fence seqno comparisions.
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>   drivers/gpu/drm/msm/msm_fence.h | 12 ++++++++++++
>   drivers/gpu/drm/msm/msm_gpu.c   |  6 +++---
>   drivers/gpu/drm/msm/msm_gpu.h   |  2 +-
>   3 files changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h
> index 4783db528bcc..17ee3822b423 100644
> --- a/drivers/gpu/drm/msm/msm_fence.h
> +++ b/drivers/gpu/drm/msm/msm_fence.h
> @@ -60,4 +60,16 @@ void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence);
>   
>   struct dma_fence * msm_fence_alloc(struct msm_fence_context *fctx);
>   
> +static inline bool
> +fence_before(uint32_t a, uint32_t b)
> +{
> +   return (int32_t)(a - b) < 0;

This is good enough when a and b have close values. And that is a good 
assumption for KMD generated seqno.

Reviewed-by: Akhil P Oommen <akhilpo@codeaurora.org>

-Akhil.

> +}
> +
> +static inline bool
> +fence_after(uint32_t a, uint32_t b)
> +{
> +   return (int32_t)(a - b) > 0;
> +}
> +
>   #endif
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 13de1241d595..0f78c2615272 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -172,7 +172,7 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>   
>   	spin_lock_irqsave(&ring->submit_lock, flags);
>   	list_for_each_entry(submit, &ring->submits, node) {
> -		if (submit->seqno > fence)
> +		if (fence_after(submit->seqno, fence))
>   			break;
>   
>   		msm_update_fence(submit->ring->fctx,
> @@ -509,7 +509,7 @@ static void hangcheck_handler(struct timer_list *t)
>   	if (fence != ring->hangcheck_fence) {
>   		/* some progress has been made.. ya! */
>   		ring->hangcheck_fence = fence;
> -	} else if (fence < ring->seqno) {
> +	} else if (fence_before(fence, ring->seqno)) {
>   		/* no progress and not done.. hung! */
>   		ring->hangcheck_fence = fence;
>   		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
> @@ -523,7 +523,7 @@ static void hangcheck_handler(struct timer_list *t)
>   	}
>   
>   	/* if still more pending work, reset the hangcheck timer: */
> -	if (ring->seqno > ring->hangcheck_fence)
> +	if (fence_after(ring->seqno, ring->hangcheck_fence))
>   		hangcheck_timer_reset(gpu);
>   
>   	/* workaround for missing irq: */
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 0dcc31c27ac3..bd4e0024033e 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -258,7 +258,7 @@ static inline bool msm_gpu_active(struct msm_gpu *gpu)
>   	for (i = 0; i < gpu->nr_rings; i++) {
>   		struct msm_ringbuffer *ring = gpu->rb[i];
>   
> -		if (ring->seqno > ring->memptrs->fence)
> +		if (fence_after(ring->seqno, ring->memptrs->fence))
>   			return true;
>   	}
>   
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 5/5] drm/msm: Add debugfs to disable hw err handling
  2021-11-09 18:11 ` [PATCH 5/5] drm/msm: Add debugfs to disable hw err handling Rob Clark
@ 2021-11-11 17:14   ` Akhil P Oommen
  0 siblings, 0 replies; 9+ messages in thread
From: Akhil P Oommen @ 2021-11-11 17:14 UTC (permalink / raw)
  To: Rob Clark, dri-devel
  Cc: freedreno, linux-arm-msm, Jordan Crouse, Rob Clark, Sean Paul,
	David Airlie, Daniel Vetter, AngeloGioacchino Del Regno,
	Konrad Dybcio, Yangtao Li, Marijn Suijten, Jonathan Marek,
	Dmitry Baryshkov, Sai Prakash Ranjan, Sharat Masetty,
	Douglas Anderson, open list

On 11/9/2021 11:41 PM, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> Add a debugfs interface to ignore hw error irqs, in order to force
> fallback to sw hangcheck mechanism.  Because the hw error detection is
> pretty good on newer gens, we need this for igt tests to test the sw
> hang detection.
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>   drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 ++++++
>   drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++
>   drivers/gpu/drm/msm/msm_debugfs.c     | 3 +++
>   drivers/gpu/drm/msm/msm_drv.h         | 9 +++++++++
>   4 files changed, 22 insertions(+)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index 6163990a4d09..ec8e043c9d38 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1252,6 +1252,7 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
>   
>   static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
>   {
> +	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
>   
>   	/*
> @@ -1261,6 +1262,11 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
>   	gpu_write(gpu, REG_A5XX_RBBM_INT_CLEAR_CMD,
>   		status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR);
>   
> +	if (priv->disable_err_irq) {
> +		status &= A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS |
> +			  A5XX_RBBM_INT_0_MASK_CP_SW;
> +	}
> +
>   	/* Pass status to a5xx_rbbm_err_irq because we've already cleared it */
>   	if (status & RBBM_ERROR_MASK)
>   		a5xx_rbbm_err_irq(gpu, status);
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 3d2da81cb2c9..8a2af3a27e33 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1373,10 +1373,14 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
>   
>   static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>   {
> +	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	u32 status = gpu_read(gpu, REG_A6XX_RBBM_INT_0_STATUS);
>   
>   	gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status);
>   
> +	if (priv->disable_err_irq)
> +		status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS;
> +
>   	if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
>   		a6xx_fault_detect_irq(gpu);
>   
> diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
> index 6a99e8b5d25d..956b1efc3721 100644
> --- a/drivers/gpu/drm/msm/msm_debugfs.c
> +++ b/drivers/gpu/drm/msm/msm_debugfs.c
> @@ -242,6 +242,9 @@ void msm_debugfs_init(struct drm_minor *minor)
>   	debugfs_create_u32("hangcheck_period_ms", 0600, minor->debugfs_root,
>   		&priv->hangcheck_period);
>   
> +	debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
> +		&priv->disable_err_irq);
> +
>   	debugfs_create_file("shrink", S_IRWXU, minor->debugfs_root,
>   		dev, &shrink_fops);
>   
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 2943c21d9aac..a8da7a7efb84 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -246,6 +246,15 @@ struct msm_drm_private {
>   
>   	/* For hang detection, in ms */
>   	unsigned int hangcheck_period;
> +
> +	/**
> +	 * disable_err_irq:
> +	 *
> +	 * Disable handling of GPU hw error interrupts, to force fallback to
> +	 * sw hangcheck timer.  Written (via debugfs) by igt tests to test
> +	 * the sw hangcheck mechanism.
> +	 */
> +	bool disable_err_irq;
>   };
>   
>   struct msm_format {
> 

Reviewed-by: Akhil P Oommen <akhilpo@codeaurora.org>

-Akhil.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-11-11 17:16 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-09 18:11 [PATCH 0/5] drm/msm: Cleanup and drm/sched tdr prep Rob Clark
2021-11-09 18:11 ` [PATCH 1/5] drm/msm: Remove unnecessary struct_mutex Rob Clark
2021-11-09 18:11 ` [PATCH 2/5] drm/msm: Drop priv->lastctx Rob Clark
2021-11-11 16:44   ` Akhil P Oommen
2021-11-09 18:11 ` [PATCH 3/5] drm/msm: Remove struct_mutex usage Rob Clark
2021-11-09 18:11 ` [PATCH 4/5] drm/msm: Handle fence rollover Rob Clark
2021-11-11 16:58   ` Akhil P Oommen
2021-11-09 18:11 ` [PATCH 5/5] drm/msm: Add debugfs to disable hw err handling Rob Clark
2021-11-11 17:14   ` Akhil P Oommen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).