* [PATCH 1/3] drm/amdgpu: Fix entities for disabled HW blocks.
@ 2019-01-29 10:20 Bas Nieuwenhuizen
From: Bas Nieuwenhuizen
  To: amd-gfx@lists.freedesktop.org; +Cc: Bas Nieuwenhuizen

If we have some disabled HW blocks (say VCN), then the rings are
not initialized. This results in entities that refer to uninitialized
run queues (rqs).

In normal usage this does not result in issues because userspace
generally knows to ignore the unsupported blocks, but e.g. setting
the priorities on all the entities resulted in a NULL access while
locking the rq spinlock.

This could probably also be triggered by actually submitting a job to
one of these blocks with less careful userspace.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 31 +++++++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d85184b5b35c..6f72ce785b32 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -169,9 +169,13 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
 		for (j = 0; j < num_rings; ++j)
 			rqs[j] = &rings[j]->sched.sched_rq[priority];
 
-		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
-			r = drm_sched_entity_init(&ctx->entities[i][j].entity,
-						  rqs, num_rings, &ctx->guilty);
+		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+			ctx->entities[i][j].enabled = rings[0]->adev != NULL;
+			if (ctx->entities[i][j].enabled) {
+				r = drm_sched_entity_init(&ctx->entities[i][j].entity,
+							  rqs, num_rings, &ctx->guilty);
+			}
+		}
 		if (r)
 			goto error_cleanup_entities;
 	}
@@ -180,7 +184,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
 
 error_cleanup_entities:
 	for (i = 0; i < num_entities; ++i)
-		drm_sched_entity_destroy(&ctx->entities[0][i].entity);
+		if (ctx->entities[0][i].enabled)
+			drm_sched_entity_destroy(&ctx->entities[0][i].entity);
 	kfree(ctx->entities[0]);
 
 error_free_fences:
@@ -229,6 +234,11 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
 		return -EINVAL;
 	}
 
+	if (!ctx->entities[hw_ip][ring].enabled) {
+		DRM_DEBUG("disabled ring: %d %d\n", hw_ip, ring);
+		return -EINVAL;
+	}
+
 	*entity = &ctx->entities[hw_ip][ring].entity;
 	return 0;
 }
@@ -279,7 +289,8 @@ static void amdgpu_ctx_do_release(struct kref *ref)
 		num_entities += amdgpu_ctx_num_entities[i];
 
 	for (i = 0; i < num_entities; i++)
-		drm_sched_entity_destroy(&ctx->entities[0][i].entity);
+		if (ctx->entities[0][i].enabled)
+			drm_sched_entity_destroy(&ctx->entities[0][i].entity);
 
 	amdgpu_ctx_fini(ref);
 }
@@ -505,7 +516,9 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
 	for (i = 0; i < num_entities; i++) {
 		struct drm_sched_entity *entity = &ctx->entities[0][i].entity;
 
-		drm_sched_entity_set_priority(entity, ctx_prio);
+
+			if (ctx->entities[0][i].enabled)
+			drm_sched_entity_set_priority(entity, ctx_prio);
 	}
 }
 
@@ -557,6 +570,9 @@ void amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr)
 		for (i = 0; i < num_entities; i++) {
 			struct drm_sched_entity *entity;
 
+			if (!ctx->entities[0][i].enabled)
+				continue;
+
 			entity = &ctx->entities[0][i].entity;
 			max_wait = drm_sched_entity_flush(entity, max_wait);
 		}
@@ -584,7 +600,8 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
 		}
 
 		for (i = 0; i < num_entities; i++)
-			drm_sched_entity_fini(&ctx->entities[0][i].entity);
+			if (ctx->entities[0][i].enabled)
+				drm_sched_entity_fini(&ctx->entities[0][i].entity);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index b3b012c0a7da..183a783aedd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -33,6 +33,7 @@ struct amdgpu_ctx_entity {
 	uint64_t		sequence;
 	struct dma_fence	**fences;
 	struct drm_sched_entity	entity;
+	bool                    enabled;
 };
 
 struct amdgpu_ctx {
-- 
2.20.1
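
[Editor's note: a minimal userspace sketch of the crash path described in
the commit message; it is not part of the thread. It assumes the libdrm
uapi headers (build with the libdrm include path and -ldrm), a device at
/dev/dri/card0, and DRM master privileges, since the sched ioctl is
restricted to the master.]

/*
 * Sketch: create a context, then override priorities on every context
 * of the target fd. Before this patch, the override walked entities
 * whose rqs pointed into schedulers of rings that were never
 * initialized (e.g. a fused-off VCN block), crashing on the rq lock.
 */
#include <fcntl.h>
#include <string.h>
#include <xf86drm.h>
#include <amdgpu_drm.h>

int main(void)
{
	int master = open("/dev/dri/card0", O_RDWR); /* must be DRM master */
	int victim = open("/dev/dri/card0", O_RDWR);
	union drm_amdgpu_ctx ctx;
	union drm_amdgpu_sched sched;

	if (master < 0 || victim < 0)
		return 1;

	/* A fresh context allocates entities for all HW blocks. */
	memset(&ctx, 0, sizeof(ctx));
	ctx.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
	if (drmIoctl(victim, DRM_IOCTL_AMDGPU_CTX, &ctx))
		return 1;

	/* Touch every entity of every context owned by 'victim'. */
	memset(&sched, 0, sizeof(sched));
	sched.in.op = AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE;
	sched.in.fd = victim;
	sched.in.priority = AMDGPU_CTX_PRIORITY_HIGH;
	return drmIoctl(master, DRM_IOCTL_AMDGPU_SCHED, &sched);
}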


* [PATCH 2/3] drm/amdgpu: Check if fd really is an amdgpu fd.
@ 2019-01-29 10:20 Bas Nieuwenhuizen
From: Bas Nieuwenhuizen
  To: amd-gfx@lists.freedesktop.org; +Cc: Bas Nieuwenhuizen

Otherwise we would interpret the file's private data as drm & amdgpu
data even when it is not, which could result in memory corruption.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   | 16 ++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c | 10 +++++++---
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d67f8b1dfe80..17290cdb8ed8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -411,6 +411,8 @@ struct amdgpu_fpriv {
 	struct amdgpu_ctx_mgr	ctx_mgr;
 };
 
+int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
+
 int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		  unsigned size, struct amdgpu_ib *ib);
 void amdgpu_ib_free(struct amdgpu_device *adev, struct amdgpu_ib *ib,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c806f984bcc5..90a520034c89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1176,6 +1176,22 @@ static const struct file_operations amdgpu_driver_kms_fops = {
 #endif
 };
 
+int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv)
+{
+	struct drm_file *file;
+
+	if (!filp)
+		return -EINVAL;
+
+	if (filp->f_op != &amdgpu_driver_kms_fops) {
+		return -EINVAL;
+	}
+
+	file = filp->private_data;
+	*fpriv = file->driver_priv;
+	return 0;
+}
+
 static bool
 amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned int pipe,
 				 bool in_vblank_irq, int *vpos, int *hpos,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
index 1cafe8d83a4d..0b70410488b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
@@ -54,16 +54,20 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
 						  enum drm_sched_priority priority)
 {
 	struct file *filp = fget(fd);
-	struct drm_file *file;
 	struct amdgpu_fpriv *fpriv;
 	struct amdgpu_ctx *ctx;
 	uint32_t id;
+	int r;
 
 	if (!filp)
 		return -EINVAL;
 
-	file = filp->private_data;
-	fpriv = file->driver_priv;
+	r = amdgpu_file_to_fpriv(filp, &fpriv);
+	if (r) {
+		fput(filp);
+		return r;
+	}
+
 	idr_for_each_entry(&fpriv->ctx_mgr.ctx_handles, ctx, id)
 		amdgpu_ctx_priority_override(ctx, priority);
 
-- 
2.20.1
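
[Editor's note: an illustrative check of the fix, not part of the thread;
same build assumptions as the sketch under patch 1. With this patch
applied, handing a non-amdgpu fd to the sched ioctl should fail cleanly
with EINVAL instead of the kernel misinterpreting foreign private_data.]

#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <xf86drm.h>
#include <amdgpu_drm.h>

int main(void)
{
	int master = open("/dev/dri/card0", O_RDWR); /* must be DRM master */
	int bogus = open("/dev/null", O_RDWR);       /* anything non-amdgpu */
	union drm_amdgpu_sched sched;
	int r;

	if (master < 0 || bogus < 0)
		return 1;

	memset(&sched, 0, sizeof(sched));
	sched.in.op = AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE;
	sched.in.fd = bogus;
	sched.in.priority = AMDGPU_CTX_PRIORITY_NORMAL;

	/* Expect -1/EINVAL from amdgpu_file_to_fpriv rejecting the fd. */
	r = drmIoctl(master, DRM_IOCTL_AMDGPU_SCHED, &sched);
	return (r == -1 && errno == EINVAL) ? 0 : 1;
}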


* [PATCH 3/3] drm/amdgpu: Add command to override the context priority.
@ 2019-01-29 10:20 Bas Nieuwenhuizen
From: Bas Nieuwenhuizen
  To: amd-gfx@lists.freedesktop.org; +Cc: Bas Nieuwenhuizen

Given a master fd we can then override the priority of the context
in another fd.

Christian recommended using these overrides instead of trying to
submit from a master fd, and this adds a way to override a single
context instead of the entire process, so we can upgrade a single
Vulkan queue rather than effectively the whole process.

The flags field is reused, as it was required to be 0 anyway, so
nothing used it. This is source-incompatible (due to the name change),
but ABI-compatible.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c | 44 ++++++++++++++++++++++-
 include/uapi/drm/amdgpu_drm.h             |  3 +-
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
index 0b70410488b6..78f5a42f36e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
@@ -76,6 +76,39 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int amdgpu_sched_context_priority_override(struct amdgpu_device *adev,
+						  int fd,
+						  unsigned ctx_id,
+						  enum drm_sched_priority priority)
+{
+	struct file *filp = fget(fd);
+	struct amdgpu_fpriv *fpriv;
+	struct amdgpu_ctx *ctx;
+	int r;
+
+	if (!filp)
+		return -EINVAL;
+
+	r = amdgpu_file_to_fpriv(filp, &fpriv);
+	if (r) {
+		fput(filp);
+		return r;
+	}
+
+	ctx = amdgpu_ctx_get(fpriv, ctx_id);
+
+	if (!ctx) {
+		fput(filp);
+		return -EINVAL;
+	}
+
+	amdgpu_ctx_priority_override(ctx, priority);
+	amdgpu_ctx_put(ctx);
+	fput(filp);
+
+	return 0;
+}
+
 int amdgpu_sched_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *filp)
 {
@@ -85,15 +118,24 @@ int amdgpu_sched_ioctl(struct drm_device *dev, void *data,
 	int r;
 
 	priority = amdgpu_to_sched_priority(args->in.priority);
-	if (args->in.flags || priority == DRM_SCHED_PRIORITY_INVALID)
+	if (priority == DRM_SCHED_PRIORITY_INVALID)
 		return -EINVAL;
 
+	if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
+		return -EINVAL;
+
 	switch (args->in.op) {
 	case AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE:
 		r = amdgpu_sched_process_priority_override(adev,
 							   args->in.fd,
 							   priority);
 		break;
+	case AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE:
+		r = amdgpu_sched_context_priority_override(adev,
+							   args->in.fd,
+							   args->in.ctx_id,
+							   priority);
+		break;
 	default:
 		DRM_ERROR("Invalid sched op specified: %d\n", args->in.op);
 		r = -EINVAL;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index faaad04814e4..30fa340790b2 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -275,13 +275,14 @@ union drm_amdgpu_vm {
 
 /* sched ioctl */
 #define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE	1
+#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE	2
 
 struct drm_amdgpu_sched_in {
 	/* AMDGPU_SCHED_OP_* */
 	__u32	op;
 	__u32	fd;
 	__s32	priority;
-	__u32	flags;
+	__u32	ctx_id;
 };
 
 union drm_amdgpu_sched {
-- 
2.20.1
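
[Editor's note: a hedged usage sketch for the new op, not part of the
thread; same headers as the earlier sketches. 'master_fd', 'other_fd'
and 'ctx_id' are hypothetical. The idea is that a privileged fd raises
the priority of one specific context, e.g. the one backing a single
Vulkan queue, rather than every context of the target process.]

static int upgrade_one_context(int master_fd, int other_fd, uint32_t ctx_id)
{
	union drm_amdgpu_sched sched;

	memset(&sched, 0, sizeof(sched));
	sched.in.op = AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE;
	sched.in.fd = other_fd;   /* fd that owns the context */
	sched.in.ctx_id = ctx_id; /* the one context to upgrade */
	sched.in.priority = AMDGPU_CTX_PRIORITY_HIGH;
	return drmIoctl(master_fd, DRM_IOCTL_AMDGPU_SCHED, &sched);
}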


* Re: [PATCH 1/3] drm/amdgpu: Fix entities for disabled HW blocks.
@ 2019-01-29 10:33 Christian König
From: Christian König
  To: Bas Nieuwenhuizen, amd-gfx@lists.freedesktop.org

Am 29.01.19 um 11:20 schrieb Bas Nieuwenhuizen:
> If we have some disabled HW blocks (say VCN), then the rings are
> not initialized. This results in entities that refer to uninitialized
> run queues (rqs).
>
> In normal usage this does not result in issues because userspace
> generally knows to ignore the unsupported blocks, but e.g. setting
> the priorities on all the entities resulted in a NULL access while
> locking the rq spinlock.
>
> This could probably also be triggered by actually submitting a job to
> one of these blocks with less careful userspace.

In general I agree that we need to improve the handling here. But this 
looks completely incorrect to me.

We should always initialize all entities, but only with rqs that are
initialized as well.

When this results in zero rqs, we can handle that gracefully later on.

Regards,
Christian.
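
[Editor's note: a rough sketch of the direction suggested above, built
only from the code visible in this patch ('rings', 'rqs', 'num_rings',
'priority', 'i', 'j' come from amdgpu_ctx_init); 'num_rqs' is a new
hypothetical local. Uninitialized rings are skipped when building the
rq list, and the entity is initialized with however many remain,
possibly zero.]

	unsigned num_rqs = 0;

	for (j = 0; j < num_rings; ++j) {
		if (!rings[j]->adev) /* ring of a disabled block, skip it */
			continue;
		rqs[num_rqs++] = &rings[j]->sched.sched_rq[priority];
	}

	for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
		r = drm_sched_entity_init(&ctx->entities[i][j].entity,
					  rqs, num_rqs, &ctx->guilty);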

> [snip]


* Re: [PATCH 1/3] drm/amdgpu: Fix entities for disabled HW blocks.
@ 2019-01-29 10:41 Bas Nieuwenhuizen
From: Bas Nieuwenhuizen
  To: Christian König; +Cc: amd-gfx mailing list

On Tue, Jan 29, 2019 at 11:33 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 29.01.19 um 11:20 schrieb Bas Nieuwenhuizen:
> > If we have some disabled HW blocks (say VCN), then the rings are
> > not initialized. This results in entities that refer to uninitialized
> > run queues (rqs).
> >
> > In normal usage this does not result in issues because userspace
> > generally knows to ignore the unsupported blocks, but e.g. setting
> > the priorities on all the entities resulted in a NULL access while
> > locking the rq spinlock.
> >
> > This could probably also be triggered by actually submitting a job to
> > one of these blocks with less careful userspace.
>
> In general I agree that we need to improve the handling here. But this
> looks completely incorrect to me.

In what sense is this incorrect?

I'm fencing off all access to entities which use an uninitialized ring
(and therefore rq) and do not initialize/deinitialize them.

>
> We should always initialize all entities, but only with rq which are
> initialized as well.
>
> When this results in zero rq then we can later on handle that gracefully.
>
> Regards,
> Christian.
>
> > [snip]
>

* Re: [PATCH 1/3] drm/amdgpu: Fix entities for disabled HW blocks.
@ 2019-01-29 12:33 Koenig, Christian
From: Koenig, Christian
  To: Bas Nieuwenhuizen; +Cc: amd-gfx mailing list

Am 29.01.19 um 11:41 schrieb Bas Nieuwenhuizen:
> On Tue, Jan 29, 2019 at 11:33 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 29.01.19 um 11:20 schrieb Bas Nieuwenhuizen:
>>> If we have some disabled HW blocks (say VCN), then the rings are
>>> not initialized. This results in entities that refer to uninitialized
>>> run queues (rqs).
>>>
>>> In normal usage this does not result in issues because userspace
>>> generally knows to ignore the unsupported blocks, but e.g. setting
>>> the priorities on all the entities resulted in a NULL access while
>>> locking the rq spinlock.
>>>
>>> This could probably also be triggered by actually submitting a job to
>>> one of these blocks with less careful userspace.
>> In general I agree that we need to improve the handling here. But this
>> looks completely incorrect to me.
> In what sense is this incorrect?
>
> I'm fencing of all access to entities which use an uninitialized ring
> (and therefore rq) and do not initialize/deinitialize them.

Ok, my fault, you don't seem to understand the problem here :) An
entity is not associated with one runqueue, but with multiple ones.

And right now it is perfectly normal that one or more of those are not
present. For example, most AMD GPUs have only one SDMA block with two
instances, but we add all of them to the rq list anyway.

We should probably stop doing that, and then make sure that entities
can also initialize properly with zero runqueues in the list,
returning an error if you try to use those.

Christian.
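
[Editor's note: illustrative only. Assuming the scheduler entity of this
era keeps its run queues in 'rq_list'/'num_rq_list', the graceful
handling at use time could then be a check along these lines in the
submission path.]

	if (!entity->num_rq_list) {
		DRM_DEBUG("entity has no usable run queues\n");
		return -ENOENT;
	}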

>
>> [snip]
