Rob Herring writes:
> This adds the initial driver for panfrost which supports Arm Mali
> Midgard and Bifrost family of GPUs. Currently, only the T860 and
> T760 Midgard GPUs have been tested.
>
> v2:
> - Add GPU reset on job hangs (Tomeu)
> - Add RuntimePM and devfreq support (Tomeu)
> - Fix T760 support (Tomeu)
> - Add a TODO file (Rob, Tomeu)
> - Support multiple in fences (Tomeu)
> - Drop support for shared fences (Tomeu)
> - Fill in MMU de-init (Rob)
> - Move register definitions back to single header (Rob)
> - Clean-up hardcoded job submit todos (Rob)
> - Implement feature setup based on features/issues (Rob)
> - Add remaining Midgard DT compatible strings (Rob)
>
> Cc: Maarten Lankhorst
> Cc: Maxime Ripard
> Cc: Sean Paul
> Cc: David Airlie
> Cc: Daniel Vetter
> Cc: Alyssa Rosenzweig
> Cc: Lyude Paul
> Cc: Eric Anholt
> Signed-off-by: Marty E. Plummer
> Signed-off-by: Tomeu Vizoso
> Signed-off-by: Rob Herring
> ---
> Neil, I've kept your reset support separate for now. Let me know if you
> prefer me to squash it or keep it separate.

> +static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> +{
> +	struct panfrost_device *pfdev = job->pfdev;
> +	unsigned long flags;
> +	u32 cfg;
> +	u64 jc_head = job->jc;
> +	int ret;
> +
> +	panfrost_devfreq_update_utilization(pfdev, js, false);
> +
> +	ret = pm_runtime_get_sync(pfdev->dev);
> +	if (ret < 0)
> +		return;
> +
> +	if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js))))
> +		goto end;
> +
> +	spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
> +
> +	job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
> +	job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
> +
> +	panfrost_job_write_affinity(pfdev, job->requirements, js);
> +
> +	/* start MMU, medium priority, cache clean/flush on end, clean/flush on
> +	 * start */
> +	// TODO: different address spaces
> +	cfg = JS_CONFIG_THREAD_PRI(8) |
> +	      JS_CONFIG_START_FLUSH_CLEAN_INVALIDATE |
> +	      JS_CONFIG_END_FLUSH_CLEAN_INVALIDATE;
> +
> +	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_FLUSH_REDUCTION))
> +		cfg |= JS_CONFIG_ENABLE_FLUSH_REDUCTION;
> +
> +	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_10649))
> +		cfg |= JS_CONFIG_START_MMU;
> +
> +	job_write(pfdev, JS_CONFIG_NEXT(js), cfg);
> +
> +	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_FLUSH_REDUCTION))
> +		job_write(pfdev, JS_FLUSH_ID_NEXT(js), job->flush_id);
> +
> +	/* GO ! */
> +	dev_dbg(pfdev->dev, "JS: Submitting atom %p to js[%d] with head=0x%llx",
> +		job, js, jc_head);
> +
> +	job_write(pfdev, JS_COMMAND_NEXT(js), JS_COMMAND_START);
> +
> +	spin_unlock_irqrestore(&pfdev->hwaccess_lock, flags);
> +
> +end:
> +	pm_runtime_mark_last_busy(pfdev->dev);
> +	pm_runtime_put_autosuspend(pfdev->dev);
> +}
> +
> +static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
> +					   int bo_count,
> +					   struct dma_fence **implicit_fences)
> +{
> +	int i;
> +
> +	for (i = 0; i < bo_count; i++)
> +		implicit_fences[i] = reservation_object_get_excl_rcu(bos[i]->resv);
> +}

This is a little bit dodgy for implicit sync if the BOs are shared as
dma-bufs to some other device that distinguishes between shared vs excl
reservations (a distinction I think is not worth it, but it's the
interface we have). If the other device has a read-only job, and you
submit a new job updating the BO, the other device may get the new
contents when it shouldn't. However, this is a very minor bug and I
don't think it should block things.
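Concretely, something like this hypothetical helper (just a sketch in my
head, not from the patch) could pull in the shared (reader) fences too,
with each returned fence then fed into the job's dependency list so a
write job waits for other devices' readers to finish:

	static int panfrost_acquire_all_fences(struct drm_gem_object *bo,
					       struct dma_fence **excl,
					       unsigned int *shared_count,
					       struct dma_fence ***shared)
	{
		/* Grabs the exclusive fence plus a copied array of the
		 * shared (reader) fences from the reservation object,
		 * instead of just the exclusive one.
		 */
		return reservation_object_get_fences_rcu(bo->resv, excl,
							 shared_count,
							 shared);
	}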
> +int panfrost_job_push(struct panfrost_job *job)
> +{
> +	struct panfrost_device *pfdev = job->pfdev;
> +	int slot = panfrost_job_get_slot(job);
> +	struct drm_sched_entity *entity = &job->file_priv->sched_entity[slot];
> +	struct ww_acquire_ctx acquire_ctx;
> +	int ret = 0;
> +
> +	mutex_lock(&pfdev->sched_lock);
> +
> +	ret = drm_gem_lock_reservations(job->bos, job->bo_count,
> +					&acquire_ctx);
> +	if (ret) {
> +		mutex_unlock(&pfdev->sched_lock);
> +		return ret;
> +	}
> +
> +	ret = drm_sched_job_init(&job->base, entity, NULL);
> +	if (ret) {
> +		mutex_unlock(&pfdev->sched_lock);
> +		goto unlock;
> +	}
> +
> +	job->render_done_fence = dma_fence_get(&job->base.s_fence->finished);
> +
> +	kref_get(&job->refcount); /* put by scheduler job completion */
> +
> +	drm_sched_entity_push_job(&job->base, entity);
> +
> +	mutex_unlock(&pfdev->sched_lock);
> +
> +	panfrost_acquire_object_fences(job->bos, job->bo_count,
> +				       job->implicit_fences);

I think your implicit fences need to be up above
drm_sched_entity_push_job(), since they might get referenced by the
scheduler any time after push_job.
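In other words, something in this order (a sketch only, reusing the
patch's own calls):

	/* Acquire the implicit fences while the reservations are still
	 * held and before the scheduler can see the job: once
	 * drm_sched_entity_push_job() returns, the scheduler may look at
	 * job->implicit_fences at any time.
	 */
	panfrost_acquire_object_fences(job->bos, job->bo_count,
				       job->implicit_fences);

	kref_get(&job->refcount); /* put by scheduler job completion */

	drm_sched_entity_push_job(&job->base, entity);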
> +	panfrost_attach_object_fences(job->bos, job->bo_count,
> +				      job->render_done_fence);
> +
> +unlock:
> +	drm_gem_unlock_reservations(job->bos, job->bo_count, &acquire_ctx);
> +
> +	return ret;
> +}
> +
> +static void panfrost_job_timedout(struct drm_sched_job *sched_job)
> +{
> +	struct panfrost_job *job = to_panfrost_job(sched_job);
> +	struct panfrost_device *pfdev = job->pfdev;
> +	int js = panfrost_job_get_slot(job);
> +	int i;
> +
> +	/*
> +	 * If the GPU managed to complete this jobs fence, the timeout is
> +	 * spurious. Bail out.
> +	 */
> +	if (dma_fence_is_signaled(job->done_fence))
> +		return;
> +
> +	dev_err(pfdev->dev, "gpu sched timeout, js=%d, status=0x%x, head=0x%x, tail=0x%x, sched_job=%p",
> +		js,
> +		job_read(pfdev, JS_STATUS(js)),
> +		job_read(pfdev, JS_HEAD_LO(js)),
> +		job_read(pfdev, JS_TAIL_LO(js)),
> +		sched_job);
> +
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_stop(&pfdev->js->queue[i].sched);
> +
> +	if(sched_job)
> +		drm_sched_increase_karma(sched_job);

Whitespace: should be "if (sched_job)".

> +
> +	//panfrost_core_dump(pfdev);

Might want to go over the driver for // comments that should be removed?

> diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
> new file mode 100644
> index 000000000000..508b9621d9db
> --- /dev/null
> +++ b/include/uapi/drm/panfrost_drm.h
> @@ -0,0 +1,140 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2014-2018 Broadcom
> + * Copyright © 2019 Collabora ltd.
> + */
> +#ifndef _PANFROST_DRM_H_
> +#define _PANFROST_DRM_H_
> +
> +#include "drm.h"
> +
> +#if defined(__cplusplus)
> +extern "C" {
> +#endif
> +
> +#define DRM_PANFROST_SUBMIT		0x00
> +#define DRM_PANFROST_WAIT_BO		0x01
> +#define DRM_PANFROST_CREATE_BO		0x02
> +#define DRM_PANFROST_MMAP_BO		0x03
> +#define DRM_PANFROST_GET_PARAM		0x04
> +#define DRM_PANFROST_GET_BO_OFFSET	0x05
> +
> +#define DRM_IOCTL_PANFROST_SUBMIT	DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
> +#define DRM_IOCTL_PANFROST_WAIT_BO	DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
> +#define DRM_IOCTL_PANFROST_CREATE_BO	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_CREATE_BO, struct drm_panfrost_create_bo)
> +#define DRM_IOCTL_PANFROST_MMAP_BO	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_MMAP_BO, struct drm_panfrost_mmap_bo)
> +#define DRM_IOCTL_PANFROST_GET_PARAM	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
> +#define DRM_IOCTL_PANFROST_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
> +
> +#define PANFROST_JD_REQ_FS (1 << 0)
> +/**
> + * struct drm_panfrost_submit - ioctl argument for submitting commands to the 3D
> + * engine.
> + *
> + * This asks the kernel to have the GPU execute a render command list.
> + */
> +struct drm_panfrost_submit {
> +
> +	/** Address to GPU mapping of job descriptor */
> +	__u64 jc;
> +
> +	/** An optional array of sync objects to wait on before starting this job. */
> +	__u64 in_syncs;
> +
> +	/** Number of sync objects to wait on before starting this job. */
> +	__u32 in_sync_count;
> +
> +	/** An optional sync object to place the completion fence in. */
> +	__u32 out_sync;
> +
> +	/** Pointer to a u32 array of the BOs that are referenced by the job. */
> +	__u64 bo_handles;
> +
> +	/** Number of BO handles passed in (size is that times 4). */
> +	__u32 bo_handle_count;
> +
> +	/** A combination of PANFROST_JD_REQ_* */
> +	__u32 requirements;
> +};
> +
> +/**
> + * struct drm_panfrost_wait_bo - ioctl argument for waiting for
> + * completion of the last DRM_PANFROST_SUBMIT_CL on a BO.
> + *
> + * This is useful for cases where multiple processes might be
> + * rendering to a BO and you want to wait for all rendering to be
> + * completed.
> + */
> +struct drm_panfrost_wait_bo {
> +	__u32 handle;
> +	__u32 pad;
> +	__s64 timeout_ns;	/* absolute */
> +};
> +
> +/**
> + * struct drm_panfrost_create_bo - ioctl argument for creating Panfrost BOs.
> + *
> + * There are currently no values for the flags argument, but it may be
> + * used in a future extension.
> + */
> +struct drm_panfrost_create_bo {
> +	__u32 size;
> +	__u32 flags;
> +	/** Returned GEM handle for the BO. */
> +	__u32 handle;
> +	/**
> +	 * Returned offset for the BO in the GPU address space. This offset
> +	 * is private to the DRM fd and is valid for the lifetime of the GEM
> +	 * handle.
> +	 *
> +	 * This offset value will always be nonzero, since various HW
> +	 * units treat 0 specially.
> +	 */
> +	__u64 offset;
> +};

I think you've got a u64 alignment issue here that needs explicit padding
so that you don't end up needing 32-vs-64 ioctl wrappers.
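Something like this is what I mean (a sketch of the fix, untested):
three __u32s followed by a __u64 puts the u64 at offset 12 on 32-bit but
offset 16 on 64-bit, so the struct sizes (and thus the ioctl numbers)
differ unless you pad explicitly:

	struct drm_panfrost_create_bo {
		__u32 size;
		__u32 flags;
		/** Returned GEM handle for the BO. */
		__u32 handle;
		/* Explicit pad so the following __u64 lands at offset 16
		 * on both 32-bit and 64-bit ABIs, keeping the layout
		 * identical and avoiding compat ioctl wrappers.
		 */
		__u32 pad;
		/** Returned offset for the BO in the GPU address space. */
		__u64 offset;
	};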
This and the implicit fence acquisition race are the two things I think
need fixing, then you can add:

Reviewed-by: Eric Anholt

The rest is just suggestions for later.

> +
> +/**
> + * struct drm_panfrost_mmap_bo - ioctl argument for mapping Panfrost BOs.
> + *
> + * This doesn't actually perform an mmap. Instead, it returns the
> + * offset you need to use in an mmap on the DRM device node. This
> + * means that tools like valgrind end up knowing about the mapped
> + * memory.
> + *
> + * There are currently no values for the flags argument, but it may be
> + * used in a future extension.
> + */
> +struct drm_panfrost_mmap_bo {
> +	/** Handle for the object being mapped. */
> +	__u32 handle;
> +	__u32 flags;
> +	/** offset into the drm node to use for subsequent mmap call. */
> +	__u64 offset;
> +};
> +
> +enum drm_panfrost_param {
> +	DRM_PANFROST_PARAM_GPU_ID,
> +};
> +
> +struct drm_panfrost_get_param {
> +	__u32 param;
> +	__u32 pad;
> +	__u64 value;
> +};
> +
> +/**
> + * Returns the offset for the BO in the GPU address space for this DRM fd.
> + * This is the same value returned by drm_panfrost_create_bo, if that was called
> + * from this DRM fd.
> + */
> +struct drm_panfrost_get_bo_offset {
> +	__u32 handle;
> +	__u32 pad;
> +	__u64 offset;
> +};
> +
> +#if defined(__cplusplus)
> +}
> +#endif
> +
> +#endif /* _PANFROST_DRM_H_ */
> --
> 2.19.1