* [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
From: Matthew Brost @ 2023-08-01 20:50 UTC
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
seems a bit odd, but let us explain the reasoning below.
1. In XE the submission order from multiple drm_sched_entity is not
guaranteed to match the completion order, even if targeting the same
hardware engine. This is because in XE we have a firmware scheduler, the
GuC, which is allowed to reorder, timeslice, and preempt submissions. If
a drm_gpu_scheduler is shared across multiple drm_sched_entity, the TDR
falls apart as the TDR expects submission order == completion order.
Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
problem.
2. In XE submissions are done via programming a ring buffer (circular
buffer), and a drm_gpu_scheduler provides a limit on the number of
in-flight jobs. If that limit is set to RING_SIZE / MAX_SIZE_PER_JOB, we
get flow control on the ring for free.
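
For illustration, with made-up numbers (a sketch only; the real sizes
are hardware and driver specific):

	/* Hypothetical values, for illustration only. */
	#define RING_SIZE		SZ_16K	/* bytes of ring space */
	#define MAX_SIZE_PER_JOB	SZ_1K	/* worst-case bytes per job */

	/* At most 16384 / 1024 = 16 jobs can be in flight at once, so
	 * submission can never overflow the ring. */
	drm_sched_init(&sched, &ops, wq, RING_SIZE / MAX_SIZE_PER_JOB, ...);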
A problem with this design is that currently a drm_gpu_scheduler uses a
kthread for submission / job cleanup. This doesn't scale if a large
number of drm_gpu_scheduler are used. To work around the scaling issue,
use a worker rather than a kthread for submission / job cleanup.
v2:
- (Rob Clark) Fix msm build
- Pass in run work queue
v3:
- (Boris) don't have loop in worker
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +-
drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +-
drivers/gpu/drm/lima/lima_sched.c | 2 +-
drivers/gpu/drm/msm/adreno/adreno_device.c | 6 +-
drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +-
drivers/gpu/drm/panfrost/panfrost_job.c | 2 +-
drivers/gpu/drm/scheduler/sched_main.c | 136 +++++++++++---------
drivers/gpu/drm/v3d/v3d_sched.c | 10 +-
include/drm/gpu_scheduler.h | 14 +-
10 files changed, 113 insertions(+), 89 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index f60753f97ac5..9c2a10aeb0b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
- kthread_park(ring->sched.thread);
+ drm_sched_run_wq_stop(&ring->sched);
}
seq_printf(m, "run ib test:\n");
@@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
- kthread_unpark(ring->sched.thread);
+ drm_sched_run_wq_start(&ring->sched);
}
up_write(&adev->reset_domain->sem);
@@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
ring = adev->rings[val];
- if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
+ if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
return -EINVAL;
/* the last preemption failed */
@@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
goto pro_end;
/* stop the scheduler */
- kthread_park(ring->sched.thread);
+ drm_sched_run_wq_stop(&ring->sched);
/* preempt the IB */
r = amdgpu_ring_preempt_ib(ring);
@@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
failure:
/* restart the scheduler */
- kthread_unpark(ring->sched.thread);
+ drm_sched_run_wq_start(&ring->sched);
up_read(&adev->reset_domain->sem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fac9312b1695..00c9c03c8f94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2364,7 +2364,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
break;
}
- r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+ r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
ring->num_hw_submission, amdgpu_job_hang_limit,
timeout, adev->reset_domain->wq,
ring->sched_score, ring->name,
@@ -4627,7 +4627,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
spin_lock(&ring->sched.job_list_lock);
@@ -4753,7 +4753,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
/*clear job fence from fence drv to avoid force_completion
@@ -5294,7 +5294,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5369,7 +5369,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
drm_sched_start(&ring->sched, true);
@@ -5696,7 +5696,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
drm_sched_stop(&ring->sched, NULL);
@@ -5824,7 +5824,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!ring || !ring->sched.ready)
continue;
drm_sched_start(&ring->sched, true);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 1ae87dfd19c4..8486a2923f1b 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -133,7 +133,7 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
{
int ret;
- ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
+ ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
msecs_to_jiffies(500), NULL, NULL,
dev_name(gpu->dev), gpu->dev);
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index ff003403fbbc..54f53bece27c 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -488,7 +488,7 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
INIT_WORK(&pipe->recover_work, lima_sched_recover_work);
- return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
+ return drm_sched_init(&pipe->base, &lima_sched_ops, NULL, 1,
lima_job_hang_limit,
msecs_to_jiffies(timeout), NULL,
NULL, name, pipe->ldev->dev);
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
index c5c4c93b3689..f76ce11a5384 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_device.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
@@ -662,7 +662,8 @@ static void suspend_scheduler(struct msm_gpu *gpu)
*/
for (i = 0; i < gpu->nr_rings; i++) {
struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
- kthread_park(sched->thread);
+
+ drm_sched_run_wq_stop(sched);
}
}
@@ -672,7 +673,8 @@ static void resume_scheduler(struct msm_gpu *gpu)
for (i = 0; i < gpu->nr_rings; i++) {
struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
- kthread_unpark(sched->thread);
+
+ drm_sched_run_wq_start(sched);
}
}
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index 57a8e9564540..5879fc262047 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -95,7 +95,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
/* currently managing hangcheck ourselves: */
sched_timeout = MAX_SCHEDULE_TIMEOUT;
- ret = drm_sched_init(&ring->sched, &msm_sched_ops,
+ ret = drm_sched_init(&ring->sched, &msm_sched_ops, NULL,
num_hw_submissions, 0, sched_timeout,
NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
if (ret) {
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index dbc597ab46fb..f48b07056a16 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -815,7 +815,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
js->queue[j].fence_context = dma_fence_context_alloc(1);
ret = drm_sched_init(&js->queue[j].sched,
- &panfrost_sched_ops,
+ &panfrost_sched_ops, NULL,
nentries, 0,
msecs_to_jiffies(JOB_TIMEOUT_MS),
pfdev->reset.wq,
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a18c8f5e8cc0..c3eed9e8062a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -44,7 +44,6 @@
* The jobs in a entity are always scheduled in the order that they were pushed.
*/
-#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/completion.h>
@@ -252,6 +251,47 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
}
+/**
+ * drm_sched_run_wq_stop - stop scheduler run worker
+ *
+ * @sched: scheduler instance to stop run worker
+ */
+void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched)
+{
+ WRITE_ONCE(sched->pause_run_wq, true);
+ cancel_work_sync(&sched->work_run);
+}
+EXPORT_SYMBOL(drm_sched_run_wq_stop);
+
+/**
+ * drm_sched_run_wq_start - start scheduler run worker
+ *
+ * @sched: scheduler instance to start run worker
+ */
+void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched)
+{
+ WRITE_ONCE(sched->pause_run_wq, false);
+ queue_work(sched->run_wq, &sched->work_run);
+}
+EXPORT_SYMBOL(drm_sched_run_wq_start);
+
+/**
+ * drm_sched_run_wq_queue - queue scheduler run worker
+ *
+ * @sched: scheduler instance to queue run worker
+ */
+static void drm_sched_run_wq_queue(struct drm_gpu_scheduler *sched)
+{
+ /*
+ * Try not to schedule work if pause_run_wq set but not the end of world
+ * if we do as either it will be cancelled by the above
+ * cancel_work_sync, or drm_sched_main turns into a NOP while
+ * pause_run_wq is set.
+ */
+ if (!READ_ONCE(sched->pause_run_wq))
+ queue_work(sched->run_wq, &sched->work_run);
+}
+
/**
* drm_sched_job_done - complete a job
* @s_job: pointer to the job which is done
@@ -271,7 +311,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
dma_fence_get(&s_fence->finished);
drm_sched_fence_finished(s_fence);
dma_fence_put(&s_fence->finished);
- wake_up_interruptible(&sched->wake_up_worker);
+ drm_sched_run_wq_queue(sched);
}
/**
@@ -434,7 +474,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
{
struct drm_sched_job *s_job, *tmp;
- kthread_park(sched->thread);
+ drm_sched_run_wq_stop(sched);
/*
* Reinsert back the bad job here - now it's safe as
@@ -547,7 +587,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
spin_unlock(&sched->job_list_lock);
}
- kthread_unpark(sched->thread);
+ drm_sched_run_wq_start(sched);
}
EXPORT_SYMBOL(drm_sched_start);
@@ -864,7 +904,7 @@ static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
{
if (drm_sched_ready(sched))
- wake_up_interruptible(&sched->wake_up_worker);
+ drm_sched_run_wq_queue(sched);
}
/**
@@ -974,61 +1014,42 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
}
EXPORT_SYMBOL(drm_sched_pick_best);
-/**
- * drm_sched_blocked - check if the scheduler is blocked
- *
- * @sched: scheduler instance
- *
- * Returns true if blocked, otherwise false.
- */
-static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
-{
- if (kthread_should_park()) {
- kthread_parkme();
- return true;
- }
-
- return false;
-}
-
/**
* drm_sched_main - main scheduler thread
*
* @param: scheduler instance
- *
- * Returns 0.
*/
-static int drm_sched_main(void *param)
+static void drm_sched_main(struct work_struct *w)
{
- struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
+ struct drm_gpu_scheduler *sched =
+ container_of(w, struct drm_gpu_scheduler, work_run);
+ struct drm_sched_entity *entity;
+ struct drm_sched_job *cleanup_job;
int r;
- sched_set_fifo_low(current);
+ if (READ_ONCE(sched->pause_run_wq))
+ return;
- while (!kthread_should_stop()) {
- struct drm_sched_entity *entity = NULL;
- struct drm_sched_fence *s_fence;
- struct drm_sched_job *sched_job;
- struct dma_fence *fence;
- struct drm_sched_job *cleanup_job = NULL;
+ cleanup_job = drm_sched_get_cleanup_job(sched);
+ entity = drm_sched_select_entity(sched);
- wait_event_interruptible(sched->wake_up_worker,
- (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
- (!drm_sched_blocked(sched) &&
- (entity = drm_sched_select_entity(sched))) ||
- kthread_should_stop());
+ if (!entity && !cleanup_job)
+ return; /* No more work */
- if (cleanup_job)
- sched->ops->free_job(cleanup_job);
+ if (cleanup_job)
+ sched->ops->free_job(cleanup_job);
- if (!entity)
- continue;
+ if (entity) {
+ struct dma_fence *fence;
+ struct drm_sched_fence *s_fence;
+ struct drm_sched_job *sched_job;
sched_job = drm_sched_entity_pop_job(entity);
-
if (!sched_job) {
complete_all(&entity->entity_idle);
- continue;
+ if (!cleanup_job)
+ return; /* No more work */
+ goto again;
}
s_fence = sched_job->s_fence;
@@ -1055,14 +1076,17 @@ static int drm_sched_main(void *param)
r);
} else {
if (IS_ERR(fence))
- dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
+ dma_fence_set_error(&s_fence->finished,
+ PTR_ERR(fence));
drm_sched_job_done(sched_job);
}
wake_up(&sched->job_scheduled);
}
- return 0;
+
+again:
+ drm_sched_run_wq_queue(sched);
}
/**
@@ -1070,6 +1094,7 @@ static int drm_sched_main(void *param)
*
* @sched: scheduler instance
* @ops: backend operations for this scheduler
+ * @run_wq: workqueue to use for run work. If NULL, the system_wq is used
* @hw_submission: number of hw submissions that can be in flight
* @hang_limit: number of times to allow a job to hang before dropping it
* @timeout: timeout value in jiffies for the scheduler
@@ -1083,14 +1108,16 @@ static int drm_sched_main(void *param)
*/
int drm_sched_init(struct drm_gpu_scheduler *sched,
const struct drm_sched_backend_ops *ops,
+ struct workqueue_struct *run_wq,
unsigned hw_submission, unsigned hang_limit,
long timeout, struct workqueue_struct *timeout_wq,
atomic_t *score, const char *name, struct device *dev)
{
- int i, ret;
+ int i;
sched->ops = ops;
sched->hw_submission_limit = hw_submission;
sched->name = name;
+ sched->run_wq = run_wq ? : system_wq;
sched->timeout = timeout;
sched->timeout_wq = timeout_wq ? : system_wq;
sched->hang_limit = hang_limit;
@@ -1099,23 +1126,15 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
drm_sched_rq_init(sched, &sched->sched_rq[i]);
- init_waitqueue_head(&sched->wake_up_worker);
init_waitqueue_head(&sched->job_scheduled);
INIT_LIST_HEAD(&sched->pending_list);
spin_lock_init(&sched->job_list_lock);
atomic_set(&sched->hw_rq_count, 0);
INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
+ INIT_WORK(&sched->work_run, drm_sched_main);
atomic_set(&sched->_score, 0);
atomic64_set(&sched->job_id_count, 0);
-
- /* Each scheduler will run on a seperate kernel thread */
- sched->thread = kthread_run(drm_sched_main, sched, sched->name);
- if (IS_ERR(sched->thread)) {
- ret = PTR_ERR(sched->thread);
- sched->thread = NULL;
- DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
- return ret;
- }
+ sched->pause_run_wq = false;
sched->ready = true;
return 0;
@@ -1134,8 +1153,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
struct drm_sched_entity *s_entity;
int i;
- if (sched->thread)
- kthread_stop(sched->thread);
+ drm_sched_run_wq_stop(sched);
for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
struct drm_sched_rq *rq = &sched->sched_rq[i];
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 06238e6d7f5c..38e092ea41e6 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -388,7 +388,7 @@ v3d_sched_init(struct v3d_dev *v3d)
int ret;
ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
- &v3d_bin_sched_ops,
+ &v3d_bin_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
NULL, "v3d_bin", v3d->drm.dev);
@@ -396,7 +396,7 @@ v3d_sched_init(struct v3d_dev *v3d)
return ret;
ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
- &v3d_render_sched_ops,
+ &v3d_render_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
NULL, "v3d_render", v3d->drm.dev);
@@ -404,7 +404,7 @@ v3d_sched_init(struct v3d_dev *v3d)
goto fail;
ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
- &v3d_tfu_sched_ops,
+ &v3d_tfu_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
NULL, "v3d_tfu", v3d->drm.dev);
@@ -413,7 +413,7 @@ v3d_sched_init(struct v3d_dev *v3d)
if (v3d_has_csd(v3d)) {
ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
- &v3d_csd_sched_ops,
+ &v3d_csd_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
NULL, "v3d_csd", v3d->drm.dev);
@@ -421,7 +421,7 @@ v3d_sched_init(struct v3d_dev *v3d)
goto fail;
ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
- &v3d_cache_clean_sched_ops,
+ &v3d_cache_clean_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
NULL, "v3d_cache_clean", v3d->drm.dev);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index c0586d832260..98fb5f85eba6 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -473,17 +473,16 @@ struct drm_sched_backend_ops {
* @timeout: the time after which a job is removed from the scheduler.
* @name: name of the ring for which this scheduler is being used.
* @sched_rq: priority wise array of run queues.
- * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
- * is ready to be scheduled.
* @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
* waits on this wait queue until all the scheduled jobs are
* finished.
* @hw_rq_count: the number of jobs currently in the hardware queue.
* @job_id_count: used to assign unique id to the each job.
+ * @run_wq: workqueue used to queue @work_run
* @timeout_wq: workqueue used to queue @work_tdr
+ * @work_run: schedules jobs and cleans up entities
* @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
* timeout interval is over.
- * @thread: the kthread on which the scheduler which run.
* @pending_list: the list of jobs which are currently in the job queue.
* @job_list_lock: lock to protect the pending_list.
* @hang_limit: once the hangs by a job crosses this limit then it is marked
@@ -492,6 +491,7 @@ struct drm_sched_backend_ops {
* @_score: score used when the driver doesn't provide one
* @ready: marks if the underlying HW is ready to work
* @free_guilty: A hit to time out handler to free the guilty job.
+ * @pause_run_wq: pause queuing of @work_run on @run_wq
* @dev: system &struct device
*
* One scheduler is implemented for each hardware ring.
@@ -502,13 +502,13 @@ struct drm_gpu_scheduler {
long timeout;
const char *name;
struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
- wait_queue_head_t wake_up_worker;
wait_queue_head_t job_scheduled;
atomic_t hw_rq_count;
atomic64_t job_id_count;
+ struct workqueue_struct *run_wq;
struct workqueue_struct *timeout_wq;
+ struct work_struct work_run;
struct delayed_work work_tdr;
- struct task_struct *thread;
struct list_head pending_list;
spinlock_t job_list_lock;
int hang_limit;
@@ -516,11 +516,13 @@ struct drm_gpu_scheduler {
atomic_t _score;
bool ready;
bool free_guilty;
+ bool pause_run_wq;
struct device *dev;
};
int drm_sched_init(struct drm_gpu_scheduler *sched,
const struct drm_sched_backend_ops *ops,
+ struct workqueue_struct *run_wq,
uint32_t hw_submission, unsigned hang_limit,
long timeout, struct workqueue_struct *timeout_wq,
atomic_t *score, const char *name, struct device *dev);
@@ -550,6 +552,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
void drm_sched_job_cleanup(struct drm_sched_job *job);
void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
+void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
+void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
--
2.34.1
* Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
From: Tvrtko Ursulin @ 2023-08-03 10:11 UTC
To: Matthew Brost, dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen, lina,
Liviu.Dudau, luben.tuikov, donald.robson, boris.brezillon,
christian.koenig, faith.ekstrand
On 01/08/2023 21:50, Matthew Brost wrote:
> In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> seems a bit odd, but let us explain the reasoning below.
>
> 1. In XE the submission order from multiple drm_sched_entity is not
> guaranteed to match the completion order, even if targeting the same
> hardware engine. This is because in XE we have a firmware scheduler, the
> GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> a drm_gpu_scheduler is shared across multiple drm_sched_entity, the TDR
> falls apart as the TDR expects submission order == completion order.
> Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> problem.
>
> 2. In XE submissions are done via programming a ring buffer (circular
> buffer), and a drm_gpu_scheduler provides a limit on the number of
> in-flight jobs. If that limit is set to RING_SIZE / MAX_SIZE_PER_JOB, we
> get flow control on the ring for free.
>
> A problem with this design is that currently a drm_gpu_scheduler uses a
> kthread for submission / job cleanup. This doesn't scale if a large
> number of drm_gpu_scheduler are used. To work around the scaling issue,
> use a worker rather than a kthread for submission / job cleanup.
>
> v2:
> - (Rob Clark) Fix msm build
> - Pass in run work queue
> v3:
> - (Boris) don't have loop in worker
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +-
> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +-
> drivers/gpu/drm/lima/lima_sched.c | 2 +-
> drivers/gpu/drm/msm/adreno/adreno_device.c | 6 +-
> drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +-
> drivers/gpu/drm/panfrost/panfrost_job.c | 2 +-
> drivers/gpu/drm/scheduler/sched_main.c | 136 +++++++++++---------
> drivers/gpu/drm/v3d/v3d_sched.c | 10 +-
> include/drm/gpu_scheduler.h | 14 +-
> 10 files changed, 113 insertions(+), 89 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index f60753f97ac5..9c2a10aeb0b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
> for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> struct amdgpu_ring *ring = adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
> - kthread_park(ring->sched.thread);
> + drm_sched_run_wq_stop(&ring->sched);
It would be good to split out adding of these wrappers (including adding
one for ring->sched.thread/ready) to a standalone preceding patch. That
way at least some mechanical changes to various drivers would be
separated from functional changes.

Also, perhaps do not have the wq in the name if it is not really needed
to be verbose with the underlying implementation like that? Something
like drm_sched_run/pause. Or even __drm_sched_start/stop, dunno, just an
idea.
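For example (just a sketch, names obviously open to bikeshedding):

	/* Wrappers a standalone preceding patch could add, keeping
	 * drivers away from scheduler internals: */
	bool drm_sched_submit_ready(struct drm_gpu_scheduler *sched);
	void drm_sched_pause(struct drm_gpu_scheduler *sched);
	void drm_sched_run(struct drm_gpu_scheduler *sched);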
> }
>
> seq_printf(m, "run ib test:\n");
> @@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
> for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> struct amdgpu_ring *ring = adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
> - kthread_unpark(ring->sched.thread);
> + drm_sched_run_wq_start(&ring->sched);
> }
>
> up_write(&adev->reset_domain->sem);
> @@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>
> ring = adev->rings[val];
>
> - if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
> + if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
> return -EINVAL;
>
> /* the last preemption failed */
> @@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> goto pro_end;
>
> /* stop the scheduler */
> - kthread_park(ring->sched.thread);
> + drm_sched_run_wq_stop(&ring->sched);
>
> /* preempt the IB */
> r = amdgpu_ring_preempt_ib(ring);
> @@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>
> failure:
> /* restart the scheduler */
> - kthread_unpark(ring->sched.thread);
> + drm_sched_run_wq_start(&ring->sched);
>
> up_read(&adev->reset_domain->sem);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index fac9312b1695..00c9c03c8f94 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2364,7 +2364,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
> break;
> }
>
> - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
> ring->num_hw_submission, amdgpu_job_hang_limit,
> timeout, adev->reset_domain->wq,
> ring->sched_score, ring->name,
> @@ -4627,7 +4627,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
>
> spin_lock(&ring->sched.job_list_lock);
> @@ -4753,7 +4753,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
>
> /*clear job fence from fence drv to avoid force_completion
> @@ -5294,7 +5294,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
>
> drm_sched_stop(&ring->sched, job ? &job->base : NULL);
> @@ -5369,7 +5369,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
>
> drm_sched_start(&ring->sched, true);
> @@ -5696,7 +5696,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
>
> drm_sched_stop(&ring->sched, NULL);
> @@ -5824,7 +5824,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = adev->rings[i];
>
> - if (!ring || !ring->sched.thread)
> + if (!ring || !ring->sched.ready)
> continue;
>
> drm_sched_start(&ring->sched, true);
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index 1ae87dfd19c4..8486a2923f1b 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -133,7 +133,7 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
> {
> int ret;
>
> - ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
> + ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
> etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> msecs_to_jiffies(500), NULL, NULL,
> dev_name(gpu->dev), gpu->dev);
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index ff003403fbbc..54f53bece27c 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -488,7 +488,7 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
>
> INIT_WORK(&pipe->recover_work, lima_sched_recover_work);
>
> - return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
> + return drm_sched_init(&pipe->base, &lima_sched_ops, NULL, 1,
> lima_job_hang_limit,
> msecs_to_jiffies(timeout), NULL,
> NULL, name, pipe->ldev->dev);
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
> index c5c4c93b3689..f76ce11a5384 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_device.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
> @@ -662,7 +662,8 @@ static void suspend_scheduler(struct msm_gpu *gpu)
> */
> for (i = 0; i < gpu->nr_rings; i++) {
> struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
> - kthread_park(sched->thread);
> +
> + drm_sched_run_wq_stop(sched);
> }
> }
>
> @@ -672,7 +673,8 @@ static void resume_scheduler(struct msm_gpu *gpu)
>
> for (i = 0; i < gpu->nr_rings; i++) {
> struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
> - kthread_unpark(sched->thread);
> +
> + drm_sched_run_wq_start(sched);
> }
> }
>
> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
> index 57a8e9564540..5879fc262047 100644
> --- a/drivers/gpu/drm/msm/msm_ringbuffer.c
> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
> @@ -95,7 +95,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
> /* currently managing hangcheck ourselves: */
> sched_timeout = MAX_SCHEDULE_TIMEOUT;
>
> - ret = drm_sched_init(&ring->sched, &msm_sched_ops,
> + ret = drm_sched_init(&ring->sched, &msm_sched_ops, NULL,
> num_hw_submissions, 0, sched_timeout,
> NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
> if (ret) {
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index dbc597ab46fb..f48b07056a16 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -815,7 +815,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
> js->queue[j].fence_context = dma_fence_context_alloc(1);
>
> ret = drm_sched_init(&js->queue[j].sched,
> - &panfrost_sched_ops,
> + &panfrost_sched_ops, NULL,
> nentries, 0,
> msecs_to_jiffies(JOB_TIMEOUT_MS),
> pfdev->reset.wq,
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index a18c8f5e8cc0..c3eed9e8062a 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -44,7 +44,6 @@
> * The jobs in a entity are always scheduled in the order that they were pushed.
> */
>
> -#include <linux/kthread.h>
> #include <linux/wait.h>
> #include <linux/sched.h>
> #include <linux/completion.h>
> @@ -252,6 +251,47 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
> }
>
> +/**
> + * drm_sched_run_wq_stop - stop scheduler run worker
> + *
> + * @sched: scheduler instance to stop run worker
> + */
> +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched)
> +{
> + WRITE_ONCE(sched->pause_run_wq, true);
> + cancel_work_sync(&sched->work_run);
> +}
> +EXPORT_SYMBOL(drm_sched_run_wq_stop);
> +
> +/**
> + * drm_sched_run_wq_start - start scheduler run worker
> + *
> + * @sched: scheduler instance to start run worker
> + */
> +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched)
> +{
> + WRITE_ONCE(sched->pause_run_wq, false);
> + queue_work(sched->run_wq, &sched->work_run);
> +}
> +EXPORT_SYMBOL(drm_sched_run_wq_start);
> +
> +/**
> + * drm_sched_run_wq_queue - queue scheduler run worker
> + *
> + * @sched: scheduler instance to queue run worker
> + */
> +static void drm_sched_run_wq_queue(struct drm_gpu_scheduler *sched)
> +{
> + /*
> + * Try not to schedule work if pause_run_wq set but not the end of world
> + * if we do as either it will be cancelled by the above
> + * cancel_work_sync, or drm_sched_main turns into a NOP while
> + * pause_run_wq is set.
> + */
> + if (!READ_ONCE(sched->pause_run_wq))
> + queue_work(sched->run_wq, &sched->work_run);
> +}
> +
> /**
> * drm_sched_job_done - complete a job
> * @s_job: pointer to the job which is done
> @@ -271,7 +311,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
> dma_fence_get(&s_fence->finished);
> drm_sched_fence_finished(s_fence);
> dma_fence_put(&s_fence->finished);
> - wake_up_interruptible(&sched->wake_up_worker);
> + drm_sched_run_wq_queue(sched);
> }
>
> /**
> @@ -434,7 +474,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
> {
> struct drm_sched_job *s_job, *tmp;
>
> - kthread_park(sched->thread);
> + drm_sched_run_wq_stop(sched);
>
> /*
> * Reinsert back the bad job here - now it's safe as
> @@ -547,7 +587,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
> spin_unlock(&sched->job_list_lock);
> }
>
> - kthread_unpark(sched->thread);
> + drm_sched_run_wq_start(sched);
> }
> EXPORT_SYMBOL(drm_sched_start);
>
> @@ -864,7 +904,7 @@ static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
> void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
> {
> if (drm_sched_ready(sched))
> - wake_up_interruptible(&sched->wake_up_worker);
> + drm_sched_run_wq_queue(sched);
> }
>
> /**
> @@ -974,61 +1014,42 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> }
> EXPORT_SYMBOL(drm_sched_pick_best);
>
> -/**
> - * drm_sched_blocked - check if the scheduler is blocked
> - *
> - * @sched: scheduler instance
> - *
> - * Returns true if blocked, otherwise false.
> - */
> -static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
> -{
> - if (kthread_should_park()) {
> - kthread_parkme();
> - return true;
> - }
> -
> - return false;
> -}
> -
> /**
> * drm_sched_main - main scheduler thread
> *
> * @param: scheduler instance
> - *
> - * Returns 0.
> */
> -static int drm_sched_main(void *param)
> +static void drm_sched_main(struct work_struct *w)
> {
> - struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
> + struct drm_gpu_scheduler *sched =
> + container_of(w, struct drm_gpu_scheduler, work_run);
> + struct drm_sched_entity *entity;
> + struct drm_sched_job *cleanup_job;
> int r;
>
> - sched_set_fifo_low(current);
> + if (READ_ONCE(sched->pause_run_wq))
> + return;
Is there a point to this check given the comment in drm_sched_run_wq_queue?
>
> - while (!kthread_should_stop()) {
> - struct drm_sched_entity *entity = NULL;
> - struct drm_sched_fence *s_fence;
> - struct drm_sched_job *sched_job;
> - struct dma_fence *fence;
> - struct drm_sched_job *cleanup_job = NULL;
> + cleanup_job = drm_sched_get_cleanup_job(sched);
> + entity = drm_sched_select_entity(sched);
>
> - wait_event_interruptible(sched->wake_up_worker,
> - (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
> - (!drm_sched_blocked(sched) &&
> - (entity = drm_sched_select_entity(sched))) ||
> - kthread_should_stop());
> + if (!entity && !cleanup_job)
> + return; /* No more work */
>
> - if (cleanup_job)
> - sched->ops->free_job(cleanup_job);
> + if (cleanup_job)
> + sched->ops->free_job(cleanup_job);
>
> - if (!entity)
> - continue;
> + if (entity) {
> + struct dma_fence *fence;
> + struct drm_sched_fence *s_fence;
> + struct drm_sched_job *sched_job;
>
> sched_job = drm_sched_entity_pop_job(entity);
> -
> if (!sched_job) {
> complete_all(&entity->entity_idle);
> - continue;
> + if (!cleanup_job)
> + return; /* No more work */
> + goto again;
The loop is gone, but now the worker re-arms itself, which is needed to
avoid starvation? Is that guaranteed to be effective by the wq contract?
> }
>
> s_fence = sched_job->s_fence;
> @@ -1055,14 +1076,17 @@ static int drm_sched_main(void *param)
> r);
> } else {
> if (IS_ERR(fence))
> - dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> + dma_fence_set_error(&s_fence->finished,
> + PTR_ERR(fence));
>
> drm_sched_job_done(sched_job);
> }
>
> wake_up(&sched->job_scheduled);
> }
> - return 0;
> +
> +again:
> + drm_sched_run_wq_queue(sched);
> }
>
> /**
> @@ -1070,6 +1094,7 @@ static int drm_sched_main(void *param)
> *
> * @sched: scheduler instance
> * @ops: backend operations for this scheduler
> + * @run_wq: workqueue to use for run work. If NULL, the system_wq is used
> * @hw_submission: number of hw submissions that can be in flight
> * @hang_limit: number of times to allow a job to hang before dropping it
> * @timeout: timeout value in jiffies for the scheduler
> @@ -1083,14 +1108,16 @@ static int drm_sched_main(void *param)
> */
> int drm_sched_init(struct drm_gpu_scheduler *sched,
> const struct drm_sched_backend_ops *ops,
> + struct workqueue_struct *run_wq,
> unsigned hw_submission, unsigned hang_limit,
> long timeout, struct workqueue_struct *timeout_wq,
> atomic_t *score, const char *name, struct device *dev)
> {
> - int i, ret;
> + int i;
> sched->ops = ops;
> sched->hw_submission_limit = hw_submission;
> sched->name = name;
> + sched->run_wq = run_wq ? : system_wq;
I still think it is not nice to implicitly move everyone over to the
shared system wq. Maybe even more so now with one-at-a-time execution,
since the effect on latency can be even greater.
Have you considered kthread_work as a backend? Maybe it would work to
have callers pass in a kthread_worker they create, or provide a
drm_sched helper to create one, which would then be passed to
drm_sched_init.
That would enable a per-driver kthread_worker, or a per-device one, or
whatever granularity each driver would want/need/desire.
driver init:

	struct drm_sched_worker *worker = drm_sched_create_worker(...);

queue/whatever init:

	drm_sched_init(..., worker, ...);
You could create one inside drm_sched_init if not passed in, which would
keep the behaviour for existing drivers more similar - they would still
have a 1:1 kthread context for their exclusive use.
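A minimal sketch of how that could look (untested, and
drm_sched_create_worker() plus the field names below are made up for
illustration):

	#include <linux/kthread.h>

	struct kthread_worker *drm_sched_create_worker(const char *name)
	{
		/* Spawns a dedicated kernel thread backing the worker. */
		return kthread_create_worker(0, "%s", name);
	}

	/* In drm_sched_init(), instead of INIT_WORK() + queue_work();
	 * drm_sched_main() would take a struct kthread_work * here. */
	kthread_init_work(&sched->kwork_run, drm_sched_main);
	kthread_queue_work(sched->kworker, &sched->kwork_run);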
And I *think* self-re-arming would be less problematic latency-wise,
since kthread_worker consumes everything queued without relinquishing
control, and the execution context would be guaranteed not to be shared
with random system stuff.
Regards,
Tvrtko
> sched->timeout = timeout;
> sched->timeout_wq = timeout_wq ? : system_wq;
> sched->hang_limit = hang_limit;
> @@ -1099,23 +1126,15 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
> drm_sched_rq_init(sched, &sched->sched_rq[i]);
>
> - init_waitqueue_head(&sched->wake_up_worker);
> init_waitqueue_head(&sched->job_scheduled);
> INIT_LIST_HEAD(&sched->pending_list);
> spin_lock_init(&sched->job_list_lock);
> atomic_set(&sched->hw_rq_count, 0);
> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> + INIT_WORK(&sched->work_run, drm_sched_main);
> atomic_set(&sched->_score, 0);
> atomic64_set(&sched->job_id_count, 0);
> -
> - /* Each scheduler will run on a seperate kernel thread */
> - sched->thread = kthread_run(drm_sched_main, sched, sched->name);
> - if (IS_ERR(sched->thread)) {
> - ret = PTR_ERR(sched->thread);
> - sched->thread = NULL;
> - DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
> - return ret;
> - }
> + sched->pause_run_wq = false;
>
> sched->ready = true;
> return 0;
> @@ -1134,8 +1153,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
> struct drm_sched_entity *s_entity;
> int i;
>
> - if (sched->thread)
> - kthread_stop(sched->thread);
> + drm_sched_run_wq_stop(sched);
>
> for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> struct drm_sched_rq *rq = &sched->sched_rq[i];
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 06238e6d7f5c..38e092ea41e6 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -388,7 +388,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> int ret;
>
> ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
> - &v3d_bin_sched_ops,
> + &v3d_bin_sched_ops, NULL,
> hw_jobs_limit, job_hang_limit,
> msecs_to_jiffies(hang_limit_ms), NULL,
> NULL, "v3d_bin", v3d->drm.dev);
> @@ -396,7 +396,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> return ret;
>
> ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
> - &v3d_render_sched_ops,
> + &v3d_render_sched_ops, NULL,
> hw_jobs_limit, job_hang_limit,
> msecs_to_jiffies(hang_limit_ms), NULL,
> NULL, "v3d_render", v3d->drm.dev);
> @@ -404,7 +404,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> goto fail;
>
> ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
> - &v3d_tfu_sched_ops,
> + &v3d_tfu_sched_ops, NULL,
> hw_jobs_limit, job_hang_limit,
> msecs_to_jiffies(hang_limit_ms), NULL,
> NULL, "v3d_tfu", v3d->drm.dev);
> @@ -413,7 +413,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>
> if (v3d_has_csd(v3d)) {
> ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
> - &v3d_csd_sched_ops,
> + &v3d_csd_sched_ops, NULL,
> hw_jobs_limit, job_hang_limit,
> msecs_to_jiffies(hang_limit_ms), NULL,
> NULL, "v3d_csd", v3d->drm.dev);
> @@ -421,7 +421,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> goto fail;
>
> ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
> - &v3d_cache_clean_sched_ops,
> + &v3d_cache_clean_sched_ops, NULL,
> hw_jobs_limit, job_hang_limit,
> msecs_to_jiffies(hang_limit_ms), NULL,
> NULL, "v3d_cache_clean", v3d->drm.dev);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index c0586d832260..98fb5f85eba6 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -473,17 +473,16 @@ struct drm_sched_backend_ops {
> * @timeout: the time after which a job is removed from the scheduler.
> * @name: name of the ring for which this scheduler is being used.
> * @sched_rq: priority wise array of run queues.
> - * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
> - * is ready to be scheduled.
> * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> * waits on this wait queue until all the scheduled jobs are
> * finished.
> * @hw_rq_count: the number of jobs currently in the hardware queue.
> * @job_id_count: used to assign unique id to the each job.
> + * @run_wq: workqueue used to queue @work_run
> * @timeout_wq: workqueue used to queue @work_tdr
> + * @work_run: schedules jobs and cleans up entities
> * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> * timeout interval is over.
> - * @thread: the kthread on which the scheduler which run.
> * @pending_list: the list of jobs which are currently in the job queue.
> * @job_list_lock: lock to protect the pending_list.
> * @hang_limit: once the hangs by a job crosses this limit then it is marked
> @@ -492,6 +491,7 @@ struct drm_sched_backend_ops {
> * @_score: score used when the driver doesn't provide one
> * @ready: marks if the underlying HW is ready to work
> * @free_guilty: A hit to time out handler to free the guilty job.
> + * @pause_run_wq: pause queuing of @work_run on @run_wq
> * @dev: system &struct device
> *
> * One scheduler is implemented for each hardware ring.
> @@ -502,13 +502,13 @@ struct drm_gpu_scheduler {
> long timeout;
> const char *name;
> struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> - wait_queue_head_t wake_up_worker;
> wait_queue_head_t job_scheduled;
> atomic_t hw_rq_count;
> atomic64_t job_id_count;
> + struct workqueue_struct *run_wq;
> struct workqueue_struct *timeout_wq;
> + struct work_struct work_run;
> struct delayed_work work_tdr;
> - struct task_struct *thread;
> struct list_head pending_list;
> spinlock_t job_list_lock;
> int hang_limit;
> @@ -516,11 +516,13 @@ struct drm_gpu_scheduler {
> atomic_t _score;
> bool ready;
> bool free_guilty;
> + bool pause_run_wq;
> struct device *dev;
> };
>
> int drm_sched_init(struct drm_gpu_scheduler *sched,
> const struct drm_sched_backend_ops *ops,
> + struct workqueue_struct *run_wq,
> uint32_t hw_submission, unsigned hang_limit,
> long timeout, struct workqueue_struct *timeout_wq,
> atomic_t *score, const char *name, struct device *dev);
> @@ -550,6 +552,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>
> void drm_sched_job_cleanup(struct drm_sched_job *job);
> void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
> void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
> void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
* Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
From: Matthew Brost @ 2023-08-03 14:43 UTC
To: Tvrtko Ursulin
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen, lina,
Liviu.Dudau, dri-devel, christian.koenig, luben.tuikov,
donald.robson, boris.brezillon, intel-xe, faith.ekstrand
On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote:
>
> On 01/08/2023 21:50, Matthew Brost wrote:
> > In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > seems a bit odd, but let us explain the reasoning below.
> >
> > 1. In XE the submission order from multiple drm_sched_entity is not
> > guaranteed to match the completion order, even if targeting the same
> > hardware engine. This is because in XE we have a firmware scheduler, the
> > GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> > a drm_gpu_scheduler is shared across multiple drm_sched_entity, the TDR
> > falls apart as the TDR expects submission order == completion order.
> > Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> > problem.
> >
> > 2. In XE submissions are done via programming a ring buffer (circular
> > buffer), and a drm_gpu_scheduler provides a limit on the number of
> > in-flight jobs. If that limit is set to RING_SIZE / MAX_SIZE_PER_JOB, we
> > get flow control on the ring for free.
> >
> > A problem with this design is that currently a drm_gpu_scheduler uses a
> > kthread for submission / job cleanup. This doesn't scale if a large
> > number of drm_gpu_scheduler are used. To work around the scaling issue,
> > use a worker rather than a kthread for submission / job cleanup.
> >
> > v2:
> > - (Rob Clark) Fix msm build
> > - Pass in run work queue
> > v3:
> > - (Boris) don't have loop in worker
> >
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +-
> > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +-
> > drivers/gpu/drm/lima/lima_sched.c | 2 +-
> > drivers/gpu/drm/msm/adreno/adreno_device.c | 6 +-
> > drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +-
> > drivers/gpu/drm/panfrost/panfrost_job.c | 2 +-
> > drivers/gpu/drm/scheduler/sched_main.c | 136 +++++++++++---------
> > drivers/gpu/drm/v3d/v3d_sched.c | 10 +-
> > include/drm/gpu_scheduler.h | 14 +-
> > 10 files changed, 113 insertions(+), 89 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > index f60753f97ac5..9c2a10aeb0b3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > @@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
> > for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> > struct amdgpu_ring *ring = adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > - kthread_park(ring->sched.thread);
> > + drm_sched_run_wq_stop(&ring->sched);
>
> It would be good to split out adding of these wrappers (including adding one
> for ring->sched.thread/ready) to a standalone preceding patch. That way at
> least some mechanical changes to various drivers would be separated from
> functional changes.
>
Sure.
> Also, perhaps do not have the wq in the name if it is not really needed to
> be verbose with the underlying implementation like that? Something like
> drm_sched_run/pause. Or even __drm_sched_start/stop, dunno, just an idea.
>
Sure.
> > }
> > seq_printf(m, "run ib test:\n");
> > @@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
> > for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> > struct amdgpu_ring *ring = adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > - kthread_unpark(ring->sched.thread);
> > + drm_sched_run_wq_start(&ring->sched);
> > }
> > up_write(&adev->reset_domain->sem);
> > @@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> > ring = adev->rings[val];
> > - if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
> > + if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
> > return -EINVAL;
> > /* the last preemption failed */
> > @@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> > goto pro_end;
> > /* stop the scheduler */
> > - kthread_park(ring->sched.thread);
> > + drm_sched_run_wq_stop(&ring->sched);
> > /* preempt the IB */
> > r = amdgpu_ring_preempt_ib(ring);
> > @@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> > failure:
> > /* restart the scheduler */
> > - kthread_unpark(ring->sched.thread);
> > + drm_sched_run_wq_start(&ring->sched);
> > up_read(&adev->reset_domain->sem);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index fac9312b1695..00c9c03c8f94 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2364,7 +2364,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
> > break;
> > }
> > - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> > + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
> > ring->num_hw_submission, amdgpu_job_hang_limit,
> > timeout, adev->reset_domain->wq,
> > ring->sched_score, ring->name,
> > @@ -4627,7 +4627,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > spin_lock(&ring->sched.job_list_lock);
> > @@ -4753,7 +4753,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > /*clear job fence from fence drv to avoid force_completion
> > @@ -5294,7 +5294,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = tmp_adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > drm_sched_stop(&ring->sched, job ? &job->base : NULL);
> > @@ -5369,7 +5369,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = tmp_adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > drm_sched_start(&ring->sched, true);
> > @@ -5696,7 +5696,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > drm_sched_stop(&ring->sched, NULL);
> > @@ -5824,7 +5824,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
> > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> > struct amdgpu_ring *ring = adev->rings[i];
> > - if (!ring || !ring->sched.thread)
> > + if (!ring || !ring->sched.ready)
> > continue;
> > drm_sched_start(&ring->sched, true);
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> > index 1ae87dfd19c4..8486a2923f1b 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> > @@ -133,7 +133,7 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
> > {
> > int ret;
> > - ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
> > + ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
> > etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> > msecs_to_jiffies(500), NULL, NULL,
> > dev_name(gpu->dev), gpu->dev);
> > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> > index ff003403fbbc..54f53bece27c 100644
> > --- a/drivers/gpu/drm/lima/lima_sched.c
> > +++ b/drivers/gpu/drm/lima/lima_sched.c
> > @@ -488,7 +488,7 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
> > INIT_WORK(&pipe->recover_work, lima_sched_recover_work);
> > - return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
> > + return drm_sched_init(&pipe->base, &lima_sched_ops, NULL, 1,
> > lima_job_hang_limit,
> > msecs_to_jiffies(timeout), NULL,
> > NULL, name, pipe->ldev->dev);
> > diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
> > index c5c4c93b3689..f76ce11a5384 100644
> > --- a/drivers/gpu/drm/msm/adreno/adreno_device.c
> > +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
> > @@ -662,7 +662,8 @@ static void suspend_scheduler(struct msm_gpu *gpu)
> > */
> > for (i = 0; i < gpu->nr_rings; i++) {
> > struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
> > - kthread_park(sched->thread);
> > +
> > + drm_sched_run_wq_stop(sched);
> > }
> > }
> > @@ -672,7 +673,8 @@ static void resume_scheduler(struct msm_gpu *gpu)
> > for (i = 0; i < gpu->nr_rings; i++) {
> > struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
> > - kthread_unpark(sched->thread);
> > +
> > + drm_sched_run_wq_start(sched);
> > }
> > }
> > diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
> > index 57a8e9564540..5879fc262047 100644
> > --- a/drivers/gpu/drm/msm/msm_ringbuffer.c
> > +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
> > @@ -95,7 +95,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
> > /* currently managing hangcheck ourselves: */
> > sched_timeout = MAX_SCHEDULE_TIMEOUT;
> > - ret = drm_sched_init(&ring->sched, &msm_sched_ops,
> > + ret = drm_sched_init(&ring->sched, &msm_sched_ops, NULL,
> > num_hw_submissions, 0, sched_timeout,
> > NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
> > if (ret) {
> > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> > index dbc597ab46fb..f48b07056a16 100644
> > --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> > @@ -815,7 +815,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
> > js->queue[j].fence_context = dma_fence_context_alloc(1);
> > ret = drm_sched_init(&js->queue[j].sched,
> > - &panfrost_sched_ops,
> > + &panfrost_sched_ops, NULL,
> > nentries, 0,
> > msecs_to_jiffies(JOB_TIMEOUT_MS),
> > pfdev->reset.wq,
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index a18c8f5e8cc0..c3eed9e8062a 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -44,7 +44,6 @@
> > * The jobs in a entity are always scheduled in the order that they were pushed.
> > */
> > -#include <linux/kthread.h>
> > #include <linux/wait.h>
> > #include <linux/sched.h>
> > #include <linux/completion.h>
> > @@ -252,6 +251,47 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
> > }
> > +/**
> > + * drm_sched_run_wq_stop - stop scheduler run worker
> > + *
> > + * @sched: scheduler instance to stop run worker
> > + */
> > +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched)
> > +{
> > + WRITE_ONCE(sched->pause_run_wq, true);
> > + cancel_work_sync(&sched->work_run);
> > +}
> > +EXPORT_SYMBOL(drm_sched_run_wq_stop);
> > +
> > +/**
> > + * drm_sched_run_wq_start - start scheduler run worker
> > + *
> > + * @sched: scheduler instance to start run worker
> > + */
> > +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched)
> > +{
> > + WRITE_ONCE(sched->pause_run_wq, false);
> > + queue_work(sched->run_wq, &sched->work_run);
> > +}
> > +EXPORT_SYMBOL(drm_sched_run_wq_start);
> > +
> > +/**
> > + * drm_sched_run_wq_queue - queue scheduler run worker
> > + *
> > + * @sched: scheduler instance to queue run worker
> > + */
> > +static void drm_sched_run_wq_queue(struct drm_gpu_scheduler *sched)
> > +{
> > + /*
> > + * Try not to schedule work if pause_run_wq set but not the end of world
> > + * if we do as either it will be cancelled by the above
> > + * cancel_work_sync, or drm_sched_main turns into a NOP while
> > + * pause_run_wq is set.
> > + */
> > + if (!READ_ONCE(sched->pause_run_wq))
> > + queue_work(sched->run_wq, &sched->work_run);
> > +}
> > +
> > /**
> > * drm_sched_job_done - complete a job
> > * @s_job: pointer to the job which is done
> > @@ -271,7 +311,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
> > dma_fence_get(&s_fence->finished);
> > drm_sched_fence_finished(s_fence);
> > dma_fence_put(&s_fence->finished);
> > - wake_up_interruptible(&sched->wake_up_worker);
> > + drm_sched_run_wq_queue(sched);
> > }
> > /**
> > @@ -434,7 +474,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
> > {
> > struct drm_sched_job *s_job, *tmp;
> > - kthread_park(sched->thread);
> > + drm_sched_run_wq_stop(sched);
> > /*
> > * Reinsert back the bad job here - now it's safe as
> > @@ -547,7 +587,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
> > spin_unlock(&sched->job_list_lock);
> > }
> > - kthread_unpark(sched->thread);
> > + drm_sched_run_wq_start(sched);
> > }
> > EXPORT_SYMBOL(drm_sched_start);
> > @@ -864,7 +904,7 @@ static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
> > void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
> > {
> > if (drm_sched_ready(sched))
> > - wake_up_interruptible(&sched->wake_up_worker);
> > + drm_sched_run_wq_queue(sched);
> > }
> > /**
> > @@ -974,61 +1014,42 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > }
> > EXPORT_SYMBOL(drm_sched_pick_best);
> > -/**
> > - * drm_sched_blocked - check if the scheduler is blocked
> > - *
> > - * @sched: scheduler instance
> > - *
> > - * Returns true if blocked, otherwise false.
> > - */
> > -static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
> > -{
> > - if (kthread_should_park()) {
> > - kthread_parkme();
> > - return true;
> > - }
> > -
> > - return false;
> > -}
> > -
> > /**
> > * drm_sched_main - main scheduler thread
> > *
> > * @param: scheduler instance
> > - *
> > - * Returns 0.
> > */
> > -static int drm_sched_main(void *param)
> > +static void drm_sched_main(struct work_struct *w)
> > {
> > - struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
> > + struct drm_gpu_scheduler *sched =
> > + container_of(w, struct drm_gpu_scheduler, work_run);
> > + struct drm_sched_entity *entity;
> > + struct drm_sched_job *cleanup_job;
> > int r;
> > - sched_set_fifo_low(current);
> > + if (READ_ONCE(sched->pause_run_wq))
> > + return;
>
> Is there a point to this check given the comment in drm_sched_run_wq_queue?
>
I think so.
> > - while (!kthread_should_stop()) {
> > - struct drm_sched_entity *entity = NULL;
> > - struct drm_sched_fence *s_fence;
> > - struct drm_sched_job *sched_job;
> > - struct dma_fence *fence;
> > - struct drm_sched_job *cleanup_job = NULL;
> > + cleanup_job = drm_sched_get_cleanup_job(sched);
> > + entity = drm_sched_select_entity(sched);
> > - wait_event_interruptible(sched->wake_up_worker,
> > - (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
> > - (!drm_sched_blocked(sched) &&
> > - (entity = drm_sched_select_entity(sched))) ||
> > - kthread_should_stop());
> > + if (!entity && !cleanup_job)
> > + return; /* No more work */
> > - if (cleanup_job)
> > - sched->ops->free_job(cleanup_job);
> > + if (cleanup_job)
> > + sched->ops->free_job(cleanup_job);
> > - if (!entity)
> > - continue;
> > + if (entity) {
> > + struct dma_fence *fence;
> > + struct drm_sched_fence *s_fence;
> > + struct drm_sched_job *sched_job;
> > sched_job = drm_sched_entity_pop_job(entity);
> > -
> > if (!sched_job) {
> > complete_all(&entity->entity_idle);
> > - continue;
> > + if (!cleanup_job)
> > + return; /* No more work */
> > + goto again;
>
> The loop is gone, but now it re-arms itself, which is needed to avoid
> starvation? Is it guaranteed to be effective by the wq contract?
>
Yea.
> > }
> > s_fence = sched_job->s_fence;
> > @@ -1055,14 +1076,17 @@ static int drm_sched_main(void *param)
> > r);
> > } else {
> > if (IS_ERR(fence))
> > - dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> > + dma_fence_set_error(&s_fence->finished,
> > + PTR_ERR(fence));
> > drm_sched_job_done(sched_job);
> > }
> > wake_up(&sched->job_scheduled);
> > }
> > - return 0;
> > +
> > +again:
> > + drm_sched_run_wq_queue(sched);
> > }
> > /**
> > @@ -1070,6 +1094,7 @@ static int drm_sched_main(void *param)
> > *
> > * @sched: scheduler instance
> > * @ops: backend operations for this scheduler
> > + * @run_wq: workqueue to use for run work. If NULL, the system_wq is used
> > * @hw_submission: number of hw submissions that can be in flight
> > * @hang_limit: number of times to allow a job to hang before dropping it
> > * @timeout: timeout value in jiffies for the scheduler
> > @@ -1083,14 +1108,16 @@ static int drm_sched_main(void *param)
> > */
> > int drm_sched_init(struct drm_gpu_scheduler *sched,
> > const struct drm_sched_backend_ops *ops,
> > + struct workqueue_struct *run_wq,
> > unsigned hw_submission, unsigned hang_limit,
> > long timeout, struct workqueue_struct *timeout_wq,
> > atomic_t *score, const char *name, struct device *dev)
> > {
> > - int i, ret;
> > + int i;
> > sched->ops = ops;
> > sched->hw_submission_limit = hw_submission;
> > sched->name = name;
> > + sched->run_wq = run_wq ? : system_wq;
>
> I still think it is not nice to implicitly move everyone over to the shared
> system wq. Maybe even more so now with one-at-a-time execution, since the
> effect on latency can be even greater.
>
No one that has a stake in this has pushed back that I can recall. Open
to feedback from stakeholders (maintainers of drivers that use the drm
scheduler). The i915 doesn't use the DRM scheduler last time I looked.
Has that changed?
> Have you considered kthread_work as a backend? Maybe it would work to have
> callers pass in a kthread_worker they create, or provide a drm_sched helper
> to create one, which would then be passed to drm_sched_init.
>
> That would enable per driver kthread_worker, or per device, or whatever
> granularity each driver would want/need/desire.
>
> driver init:
> struct drm_sched_worker = drm_sched_create_worker(...);
>
> queue/whatever init:
> drm_sched_init(.., worker, ...);
>
This idea doesn't seem to work for a variety of reasons. Will type it out
if needed but not going to spend time on this unless someone with a
stake raises this as an issue.
> You could create one inside drm_sched_init if not passed in, which would
> keep the behaviour for existing drivers more similar - they would still have
> a 1:1 kthread context for their exclusive use.
>
Part of the idea of a work queue is so a user can't directly create a
kthread via an IOCTL (XE_EXEC_QUEUE_CREATE). What you're suggesting exposes
this issue.
> And I *think* self-re-arming would be less problematic latency wise since
> kthread_worker consumes everything queued without relinquishing control and
> execution context would be guaranteed not to be shared with random system
> stuff.
>
So this is essentially so we can use a loop? Seems like a lot of effort for
what is pure speculation. Again, if a stakeholder raises an issue we can
address it then.
Matt
> Regards,
>
> Tvrtko
>
> > sched->timeout = timeout;
> > sched->timeout_wq = timeout_wq ? : system_wq;
> > sched->hang_limit = hang_limit;
> > @@ -1099,23 +1126,15 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
> > drm_sched_rq_init(sched, &sched->sched_rq[i]);
> > - init_waitqueue_head(&sched->wake_up_worker);
> > init_waitqueue_head(&sched->job_scheduled);
> > INIT_LIST_HEAD(&sched->pending_list);
> > spin_lock_init(&sched->job_list_lock);
> > atomic_set(&sched->hw_rq_count, 0);
> > INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > + INIT_WORK(&sched->work_run, drm_sched_main);
> > atomic_set(&sched->_score, 0);
> > atomic64_set(&sched->job_id_count, 0);
> > -
> > - /* Each scheduler will run on a seperate kernel thread */
> > - sched->thread = kthread_run(drm_sched_main, sched, sched->name);
> > - if (IS_ERR(sched->thread)) {
> > - ret = PTR_ERR(sched->thread);
> > - sched->thread = NULL;
> > - DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
> > - return ret;
> > - }
> > + sched->pause_run_wq = false;
> > sched->ready = true;
> > return 0;
> > @@ -1134,8 +1153,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
> > struct drm_sched_entity *s_entity;
> > int i;
> > - if (sched->thread)
> > - kthread_stop(sched->thread);
> > + drm_sched_run_wq_stop(sched);
> > for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > struct drm_sched_rq *rq = &sched->sched_rq[i];
> > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> > index 06238e6d7f5c..38e092ea41e6 100644
> > --- a/drivers/gpu/drm/v3d/v3d_sched.c
> > +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> > @@ -388,7 +388,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> > int ret;
> > ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
> > - &v3d_bin_sched_ops,
> > + &v3d_bin_sched_ops, NULL,
> > hw_jobs_limit, job_hang_limit,
> > msecs_to_jiffies(hang_limit_ms), NULL,
> > NULL, "v3d_bin", v3d->drm.dev);
> > @@ -396,7 +396,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> > return ret;
> > ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
> > - &v3d_render_sched_ops,
> > + &v3d_render_sched_ops, NULL,
> > hw_jobs_limit, job_hang_limit,
> > msecs_to_jiffies(hang_limit_ms), NULL,
> > NULL, "v3d_render", v3d->drm.dev);
> > @@ -404,7 +404,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> > goto fail;
> > ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
> > - &v3d_tfu_sched_ops,
> > + &v3d_tfu_sched_ops, NULL,
> > hw_jobs_limit, job_hang_limit,
> > msecs_to_jiffies(hang_limit_ms), NULL,
> > NULL, "v3d_tfu", v3d->drm.dev);
> > @@ -413,7 +413,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> > if (v3d_has_csd(v3d)) {
> > ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
> > - &v3d_csd_sched_ops,
> > + &v3d_csd_sched_ops, NULL,
> > hw_jobs_limit, job_hang_limit,
> > msecs_to_jiffies(hang_limit_ms), NULL,
> > NULL, "v3d_csd", v3d->drm.dev);
> > @@ -421,7 +421,7 @@ v3d_sched_init(struct v3d_dev *v3d)
> > goto fail;
> > ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
> > - &v3d_cache_clean_sched_ops,
> > + &v3d_cache_clean_sched_ops, NULL,
> > hw_jobs_limit, job_hang_limit,
> > msecs_to_jiffies(hang_limit_ms), NULL,
> > NULL, "v3d_cache_clean", v3d->drm.dev);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index c0586d832260..98fb5f85eba6 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -473,17 +473,16 @@ struct drm_sched_backend_ops {
> > * @timeout: the time after which a job is removed from the scheduler.
> > * @name: name of the ring for which this scheduler is being used.
> > * @sched_rq: priority wise array of run queues.
> > - * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
> > - * is ready to be scheduled.
> > * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> > * waits on this wait queue until all the scheduled jobs are
> > * finished.
> > * @hw_rq_count: the number of jobs currently in the hardware queue.
> > * @job_id_count: used to assign unique id to the each job.
> > + * @run_wq: workqueue used to queue @work_run
> > * @timeout_wq: workqueue used to queue @work_tdr
> > + * @work_run: schedules jobs and cleans up entities
> > * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > * timeout interval is over.
> > - * @thread: the kthread on which the scheduler which run.
> > * @pending_list: the list of jobs which are currently in the job queue.
> > * @job_list_lock: lock to protect the pending_list.
> > * @hang_limit: once the hangs by a job crosses this limit then it is marked
> > @@ -492,6 +491,7 @@ struct drm_sched_backend_ops {
> > * @_score: score used when the driver doesn't provide one
> > * @ready: marks if the underlying HW is ready to work
> > * @free_guilty: A hit to time out handler to free the guilty job.
> > + * @pause_run_wq: pause queuing of @work_run on @run_wq
> > * @dev: system &struct device
> > *
> > * One scheduler is implemented for each hardware ring.
> > @@ -502,13 +502,13 @@ struct drm_gpu_scheduler {
> > long timeout;
> > const char *name;
> > struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> > - wait_queue_head_t wake_up_worker;
> > wait_queue_head_t job_scheduled;
> > atomic_t hw_rq_count;
> > atomic64_t job_id_count;
> > + struct workqueue_struct *run_wq;
> > struct workqueue_struct *timeout_wq;
> > + struct work_struct work_run;
> > struct delayed_work work_tdr;
> > - struct task_struct *thread;
> > struct list_head pending_list;
> > spinlock_t job_list_lock;
> > int hang_limit;
> > @@ -516,11 +516,13 @@ struct drm_gpu_scheduler {
> > atomic_t _score;
> > bool ready;
> > bool free_guilty;
> > + bool pause_run_wq;
> > struct device *dev;
> > };
> > int drm_sched_init(struct drm_gpu_scheduler *sched,
> > const struct drm_sched_backend_ops *ops,
> > + struct workqueue_struct *run_wq,
> > uint32_t hw_submission, unsigned hang_limit,
> > long timeout, struct workqueue_struct *timeout_wq,
> > atomic_t *score, const char *name, struct device *dev);
> > @@ -550,6 +552,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
> > void drm_sched_job_cleanup(struct drm_sched_job *job);
> > void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> > +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> > +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
> > void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
> > void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
* Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
2023-08-03 14:43 ` Matthew Brost
@ 2023-08-03 14:56 ` Christian König
2023-08-03 15:19 ` Tvrtko Ursulin
2023-08-03 15:39 ` Tvrtko Ursulin
1 sibling, 1 reply; 24+ messages in thread
From: Christian König @ 2023-08-03 14:56 UTC (permalink / raw)
To: Matthew Brost, Tvrtko Ursulin
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen, lina,
Liviu.Dudau, dri-devel, luben.tuikov, donald.robson,
boris.brezillon, intel-xe, faith.ekstrand
On 03.08.23 at 16:43, Matthew Brost wrote:
> On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote:
>> On 01/08/2023 21:50, Matthew Brost wrote:
>> [SNIP]
>>> sched->ops = ops;
>>> sched->hw_submission_limit = hw_submission;
>>> sched->name = name;
>>> + sched->run_wq = run_wq ? : system_wq;
>> I still think it is not nice to implicitly move everyone over to the shared
>> system wq. Maybe even more so now with one-at-a-time execution, since the
>> effect on latency can be even greater.
>>
> No one that has a stake in this has pushed back that I can recall. Open
> to feedback from stakeholders (maintainers of drivers that use the drm
> scheduler).
No objections to using the system_wq here. Drivers can still pass in
their own or simply use system_highpri_wq instead.
In addition to that, the system_wq isn't single threaded; it will create
as many threads as needed to fully utilize all CPUs.
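
To make that concrete, here is a minimal sketch (not from this series; the
xyz names, ops and limits are placeholders) of a driver allocating its own
run queue instead of taking the system_wq fallback:

#include <linux/workqueue.h>
#include <drm/gpu_scheduler.h>

int xyz_sched_init(struct xyz_gpu *xyz)
{
        struct workqueue_struct *wq;
        int ret;

        /* Dedicated high-priority queue, so run work does not share an
         * execution context with unrelated system work. */
        wq = alloc_workqueue("xyz-sched-run", WQ_HIGHPRI, 0);
        if (!wq)
                return -ENOMEM;

        ret = drm_sched_init(&xyz->sched, &xyz_sched_ops, wq,
                             xyz_hw_jobs_limit, xyz_job_hang_limit,
                             msecs_to_jiffies(500), NULL, NULL,
                             "xyz-sched", xyz->dev);
        if (ret)
                destroy_workqueue(wq);
        return ret;
}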
> The i915 doesn't use the DRM scheduler last time I looked.
> Has that changed?
>
>> Have you considered kthread_work as a backend? Maybe it would work to have
>> callers pass in a kthread_worker they create, or provide a drm_sched helper
>> to create one, which would then be passed to drm_sched_init.
>>
>> That would enable per driver kthread_worker, or per device, or whatever
>> granularity each driver would want/need/desire.
>>
>> driver init:
>> struct drm_sched_worker = drm_sched_create_worker(...);
>>
>> queue/whatever init:
>> drm_sched_init(.., worker, ...);
>>
> This idea doesn't seem to work for a variety of reasons. Will type it out
> if needed but not going to spend time on this unless someone with a
> stake raises this as an issue.
Agree completely. kthread_work is for real time workers IIRC.
>
>> You could create one inside drm_sched_init if not passed in, which would
>> keep the behaviour for existing drivers more similar - they would still have
>> a 1:1 kthread context for their exclusive use.
>>
> Part of the idea of a work queue is so a user can't directly create a
> kthread via an IOCTL (XE_EXEC_QUEUE_CREATE). What you're suggesting exposes
> this issue.
Yeah, preventing that is indeed a very good idea.
>
>> And I *think* self-re-arming would be less problematic latency wise since
>> kthread_worker consumes everything queued without relinquishing control and
>> execution context would be guaranteed not to be shared with random system
>> stuff.
>>
>> So this is essentially so we can use a loop? Seems like a lot of effort for
>> what is pure speculation. Again, if a stakeholder raises an issue we can
>> address it then.
Instead of a loop, what you usually do in the worker is to submit one
item (if possible) and then re-queue yourself if there is more work to do.
This way you give others a chance to run as well and/or cancel the work etc...
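
In rough kernel C, with the xyz_* helpers being hypothetical, that pattern
looks like:

#include <linux/workqueue.h>

struct xyz_sched {
        struct workqueue_struct *wq;
        struct work_struct work_run;
};

static bool xyz_sched_run_one(struct xyz_sched *s);       /* hypothetical */
static bool xyz_sched_more_pending(struct xyz_sched *s);  /* hypothetical */

static void xyz_sched_work(struct work_struct *w)
{
        struct xyz_sched *s = container_of(w, struct xyz_sched, work_run);

        /* Process at most one item per invocation. */
        if (!xyz_sched_run_one(s))
                return; /* nothing was pending */

        /* Relinquish the workqueue context, then re-arm so pending work
         * still runs; a concurrent cancel_work_sync() can win in between,
         * which is the point. */
        if (xyz_sched_more_pending(s))
                queue_work(s->wq, &s->work_run);
}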
Christian.
>
> Matt
>
>> Regards,
>>
>> Tvrtko
>>
* Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
2023-08-03 14:56 ` Christian König
@ 2023-08-03 15:19 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2023-08-03 15:19 UTC (permalink / raw)
To: Christian König, Matthew Brost
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen, lina,
Liviu.Dudau, dri-devel, luben.tuikov, donald.robson,
boris.brezillon, intel-xe, faith.ekstrand
On 03/08/2023 15:56, Christian König wrote:
> Am 03.08.23 um 16:43 schrieb Matthew Brost:
>> On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote:
>>> On 01/08/2023 21:50, Matthew Brost wrote:
>>> [SNIP]
>>>> sched->ops = ops;
>>>> sched->hw_submission_limit = hw_submission;
>>>> sched->name = name;
>>>> + sched->run_wq = run_wq ? : system_wq;
>>> I still think it is not nice to implicitly move everyone over to the
>>> shared
>>> system wq. Maybe even more so now with one-at-a-time execution, since
>>> the effect on latency can be even greater.
>>>
>> No one that has a stake in this has pushed back that I can recall. Open
>> to feedback from stakeholders (maintainers of drivers that use the drm
>> scheduler).
>
> No objections to using the system_wq here. Drivers can still pass in
> their own or simply use system_highpri_wq instead.
>
> In addition to that, the system_wq isn't single threaded; it will create
> as many threads as needed to fully utilize all CPUs.
>
>> The i915 doesn't use the DRM scheduler last time I looked.
>> Has that changed?
>>> Have you considered kthread_work as a backend? Maybe it would work to
>>> have
>>> callers pass in a kthread_worker they create, or provide a drm_sched
>>> helper
>>> to create one, which would then be passed to drm_sched_init.
>>>
>>> That would enable per driver kthread_worker, or per device, or whatever
>>> granularity each driver would want/need/desire.
>>>
>>> driver init:
>>> struct drm_sched_worker = drm_sched_create_worker(...);
>>>
>>> queue/whatever init:
>>> drm_sched_init(.., worker, ...);
>>>
>> This idea doesn't seem to work for a variety of reasons. Will type it out
>> if needed but not going to spend time on this unless someone with a
>> stake raises this as an issue.
>
> Agree completely. kthread_work is for real time workers IIRC.
AFAIK it is indicated if one needs to tweak the kthread priority, but
that is not the only use case.
I am curious to know why the idea does not work for a variety of reasons.
>>> You could create one inside drm_sched_init if not passed in, which would
>>> keep the behaviour for existing drivers more similar - they would
>>> still have
>>> a 1:1 kthread context for their exclusive use.
>>>
>> Part of the idea of a work queue is so a user can't directly create a
>> kthread via an IOCTL (XE_EXEC_QUEUE_CREATE). What you're suggesting exposes
>> this issue.
>
> Yeah, preventing that is indeed a very good idea.
Nope, I wasn't suggesting that at all.
I was suggesting as many kthread_workers (these are threads) as the
implementation wants. Xe can create one per device. Someone else can
create one per hw engine, whatever.
One kthread_*work* per entity does not mean one thread per
XE_EXEC_QUEUE_CREATE. A kthread_work is just a unit of work executed by
the kthread_worker thread; the conceptual relationship is the same as
between a workqueue and a work item.
The difference is that it may work better for the single-shot re-arming
design, if regression in submission latency concerns any stakeholders.
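
A minimal sketch of that model (the xyz names are assumed, not an existing
API): one kthread_worker per device, with each scheduler contributing a
kthread_work that runs on it:

#include <linux/err.h>
#include <linux/kthread.h>

struct xyz_sched {
        struct kthread_work work_run;
};

struct xyz_device {
        struct kthread_worker *worker;
        struct xyz_sched sched[2];
};

static void xyz_sched_main(struct kthread_work *work);  /* hypothetical */

static int xyz_device_init_worker(struct xyz_device *xyz)
{
        /* One dedicated thread for the whole device. */
        xyz->worker = kthread_create_worker(0, "xyz-sched");
        if (IS_ERR(xyz->worker))
                return PTR_ERR(xyz->worker);

        /* Many works share that one thread; items queued with
         * kthread_queue_work() run back-to-back on it without giving up
         * the execution context in between. */
        kthread_init_work(&xyz->sched[0].work_run, xyz_sched_main);
        kthread_init_work(&xyz->sched[1].work_run, xyz_sched_main);
        return 0;
}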
>>> And I *think* self-re-arming would be less problematic latency wise
>>> since
>>> kthread_worker consumes everything queued without relinquishing
>>> control and
>>> execution context would be guaranteed not to be shared with random
>>> system
>>> stuff.
>>>
>> So this is essentially so we can use a loop? Seems like a lot of effort for
>> what is pure speculation. Again, if a stakeholder raises an issue we can
>> address it then.
>
> Instead of a loop, what you usually do in the worker is to submit one
> item (if possible) and then re-queue yourself if there is more work to do.
>
> This way you give others a chance to run as well and/or cancel the work
> etc...
Yeah, I was pointing out that a loop in the worker was bad months ago (or
more), so it is not about that. My point here is whether it can be done
better than silently converting everyone to the system_wq.
Hence my proposal is to *stay* closer to the thread semantics for
everyone and at the same time _allow_ the option of a custom
workqueue/whatever.
Where is the problem there?
Regards,
Tvrtko
* Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
2023-08-03 14:43 ` Matthew Brost
2023-08-03 14:56 ` Christian König
@ 2023-08-03 15:39 ` Tvrtko Ursulin
1 sibling, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2023-08-03 15:39 UTC (permalink / raw)
To: Matthew Brost
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen, lina,
Liviu.Dudau, dri-devel, christian.koenig, luben.tuikov,
donald.robson, boris.brezillon, intel-xe, faith.ekstrand
On 03/08/2023 15:43, Matthew Brost wrote:
> On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote:
>>
>> On 01/08/2023 21:50, Matthew Brost wrote:
>>> In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
>>> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
>>> seems a bit odd, but let us explain the reasoning below.
>>>
>>> 1. In XE the submission order from multiple drm_sched_entity is not
>>> guaranteed to match the completion order even if targeting the same
>>> hardware engine. This is because in XE we have a firmware scheduler, the
>>> GuC, which is allowed to reorder, timeslice, and preempt submissions. If
>>> using a shared drm_gpu_scheduler across multiple drm_sched_entity, the
>>> TDR falls apart as the TDR expects submission order == completion order.
>>> Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
>>> problem.
>>>
>>> 2. In XE submissions are done via programming a ring buffer (circular
>>> buffer), and a drm_gpu_scheduler provides a limit on the number of jobs;
>>> if the limit on the number of jobs is set to RING_SIZE / MAX_SIZE_PER_JOB,
>>> we get flow control on the ring for free.
>>>
>>> A problem with this design is that currently a drm_gpu_scheduler uses a
>>> kthread for submission / job cleanup. This doesn't scale if a large
>>> number of drm_gpu_schedulers are used. To work around the scaling issue,
>>> use a worker rather than a kthread for submission / job cleanup.
>>>
>>> v2:
>>> - (Rob Clark) Fix msm build
>>> - Pass in run work queue
>>> v3:
>>> - (Boris) don't have loop in worker
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +-
>>> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +-
>>> drivers/gpu/drm/lima/lima_sched.c | 2 +-
>>> drivers/gpu/drm/msm/adreno/adreno_device.c | 6 +-
>>> drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +-
>>> drivers/gpu/drm/panfrost/panfrost_job.c | 2 +-
>>> drivers/gpu/drm/scheduler/sched_main.c | 136 +++++++++++---------
>>> drivers/gpu/drm/v3d/v3d_sched.c | 10 +-
>>> include/drm/gpu_scheduler.h | 14 +-
>>> 10 files changed, 113 insertions(+), 89 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index f60753f97ac5..9c2a10aeb0b3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
>>> for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>>> struct amdgpu_ring *ring = adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> - kthread_park(ring->sched.thread);
>>> + drm_sched_run_wq_stop(&ring->sched);
>>
>> It would be good to split out adding of these wrappers (including adding one
>> for ring->sched.thread/ready) to a standalone preceding patch. That way at
>> least some mechanical changes to various drivers would be separated from
>> functional changes.
>>
>
> Sure.
>
>> Also, perhaps do not have the wq in the name if it is not really needed to
>> be verbose about the underlying implementation like that? Something like
>> drm_sched_run/pause would do. Or even __drm_sched_start/stop; dunno, just
>> an idea.
>>
>
> Sure.
>
>>> }
>>> seq_printf(m, "run ib test:\n");
>>> @@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
>>> for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>>> struct amdgpu_ring *ring = adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> - kthread_unpark(ring->sched.thread);
>>> + drm_sched_run_wq_start(&ring->sched);
>>> }
>>> up_write(&adev->reset_domain->sem);
>>> @@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>>> ring = adev->rings[val];
>>> - if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
>>> + if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
>>> return -EINVAL;
>>> /* the last preemption failed */
>>> @@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>>> goto pro_end;
>>> /* stop the scheduler */
>>> - kthread_park(ring->sched.thread);
>>> + drm_sched_run_wq_stop(&ring->sched);
>>> /* preempt the IB */
>>> r = amdgpu_ring_preempt_ib(ring);
>>> @@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>>> failure:
>>> /* restart the scheduler */
>>> - kthread_unpark(ring->sched.thread);
>>> + drm_sched_run_wq_start(&ring->sched);
>>> up_read(&adev->reset_domain->sem);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index fac9312b1695..00c9c03c8f94 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2364,7 +2364,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>>> break;
>>> }
>>> - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>>> + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
>>> ring->num_hw_submission, amdgpu_job_hang_limit,
>>> timeout, adev->reset_domain->wq,
>>> ring->sched_score, ring->name,
>>> @@ -4627,7 +4627,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> struct amdgpu_ring *ring = adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> spin_lock(&ring->sched.job_list_lock);
>>> @@ -4753,7 +4753,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> struct amdgpu_ring *ring = adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> /*clear job fence from fence drv to avoid force_completion
>>> @@ -5294,7 +5294,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> drm_sched_stop(&ring->sched, job ? &job->base : NULL);
>>> @@ -5369,7 +5369,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> drm_sched_start(&ring->sched, true);
>>> @@ -5696,7 +5696,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> struct amdgpu_ring *ring = adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> drm_sched_stop(&ring->sched, NULL);
>>> @@ -5824,7 +5824,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> struct amdgpu_ring *ring = adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> + if (!ring || !ring->sched.ready)
>>> continue;
>>> drm_sched_start(&ring->sched, true);
>>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> index 1ae87dfd19c4..8486a2923f1b 100644
>>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>> @@ -133,7 +133,7 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>>> {
>>> int ret;
>>> - ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>>> + ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
>>> etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
>>> msecs_to_jiffies(500), NULL, NULL,
>>> dev_name(gpu->dev), gpu->dev);
>>> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
>>> index ff003403fbbc..54f53bece27c 100644
>>> --- a/drivers/gpu/drm/lima/lima_sched.c
>>> +++ b/drivers/gpu/drm/lima/lima_sched.c
>>> @@ -488,7 +488,7 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
>>> INIT_WORK(&pipe->recover_work, lima_sched_recover_work);
>>> - return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
>>> + return drm_sched_init(&pipe->base, &lima_sched_ops, NULL, 1,
>>> lima_job_hang_limit,
>>> msecs_to_jiffies(timeout), NULL,
>>> NULL, name, pipe->ldev->dev);
>>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
>>> index c5c4c93b3689..f76ce11a5384 100644
>>> --- a/drivers/gpu/drm/msm/adreno/adreno_device.c
>>> +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
>>> @@ -662,7 +662,8 @@ static void suspend_scheduler(struct msm_gpu *gpu)
>>> */
>>> for (i = 0; i < gpu->nr_rings; i++) {
>>> struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
>>> - kthread_park(sched->thread);
>>> +
>>> + drm_sched_run_wq_stop(sched);
>>> }
>>> }
>>> @@ -672,7 +673,8 @@ static void resume_scheduler(struct msm_gpu *gpu)
>>> for (i = 0; i < gpu->nr_rings; i++) {
>>> struct drm_gpu_scheduler *sched = &gpu->rb[i]->sched;
>>> - kthread_unpark(sched->thread);
>>> +
>>> + drm_sched_run_wq_start(sched);
>>> }
>>> }
>>> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
>>> index 57a8e9564540..5879fc262047 100644
>>> --- a/drivers/gpu/drm/msm/msm_ringbuffer.c
>>> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
>>> @@ -95,7 +95,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
>>> /* currently managing hangcheck ourselves: */
>>> sched_timeout = MAX_SCHEDULE_TIMEOUT;
>>> - ret = drm_sched_init(&ring->sched, &msm_sched_ops,
>>> + ret = drm_sched_init(&ring->sched, &msm_sched_ops, NULL,
>>> num_hw_submissions, 0, sched_timeout,
>>> NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
>>> if (ret) {
>>> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
>>> index dbc597ab46fb..f48b07056a16 100644
>>> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
>>> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
>>> @@ -815,7 +815,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
>>> js->queue[j].fence_context = dma_fence_context_alloc(1);
>>> ret = drm_sched_init(&js->queue[j].sched,
>>> - &panfrost_sched_ops,
>>> + &panfrost_sched_ops, NULL,
>>> nentries, 0,
>>> msecs_to_jiffies(JOB_TIMEOUT_MS),
>>> pfdev->reset.wq,
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index a18c8f5e8cc0..c3eed9e8062a 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -44,7 +44,6 @@
>>> * The jobs in a entity are always scheduled in the order that they were pushed.
>>> */
>>> -#include <linux/kthread.h>
>>> #include <linux/wait.h>
>>> #include <linux/sched.h>
>>> #include <linux/completion.h>
>>> @@ -252,6 +251,47 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>> return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
>>> }
>>> +/**
>>> + * drm_sched_run_wq_stop - stop scheduler run worker
>>> + *
>>> + * @sched: scheduler instance to stop run worker
>>> + */
>>> +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched)
>>> +{
>>> + WRITE_ONCE(sched->pause_run_wq, true);
>>> + cancel_work_sync(&sched->work_run);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_run_wq_stop);
>>> +
>>> +/**
>>> + * drm_sched_run_wq_start - start scheduler run worker
>>> + *
>>> + * @sched: scheduler instance to start run worker
>>> + */
>>> +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched)
>>> +{
>>> + WRITE_ONCE(sched->pause_run_wq, false);
>>> + queue_work(sched->run_wq, &sched->work_run);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_run_wq_start);
>>> +
>>> +/**
>>> + * drm_sched_run_wq_queue - queue scheduler run worker
>>> + *
>>> + * @sched: scheduler instance to queue run worker
>>> + */
>>> +static void drm_sched_run_wq_queue(struct drm_gpu_scheduler *sched)
>>> +{
>>> + /*
>>> + * Try not to schedule work if pause_run_wq set but not the end of world
>>> + * if we do as either it will be cancelled by the above
>>> + * cancel_work_sync, or drm_sched_main turns into a NOP while
>>> + * pause_run_wq is set.
>>> + */
>>> + if (!READ_ONCE(sched->pause_run_wq))
>>> + queue_work(sched->run_wq, &sched->work_run);
>>> +}
>>> +
>>> /**
>>> * drm_sched_job_done - complete a job
>>> * @s_job: pointer to the job which is done
>>> @@ -271,7 +311,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
>>> dma_fence_get(&s_fence->finished);
>>> drm_sched_fence_finished(s_fence);
>>> dma_fence_put(&s_fence->finished);
>>> - wake_up_interruptible(&sched->wake_up_worker);
>>> + drm_sched_run_wq_queue(sched);
>>> }
>>> /**
>>> @@ -434,7 +474,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>> {
>>> struct drm_sched_job *s_job, *tmp;
>>> - kthread_park(sched->thread);
>>> + drm_sched_run_wq_stop(sched);
>>> /*
>>> * Reinsert back the bad job here - now it's safe as
>>> @@ -547,7 +587,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>> spin_unlock(&sched->job_list_lock);
>>> }
>>> - kthread_unpark(sched->thread);
>>> + drm_sched_run_wq_start(sched);
>>> }
>>> EXPORT_SYMBOL(drm_sched_start);
>>> @@ -864,7 +904,7 @@ static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
>>> {
>>> if (drm_sched_ready(sched))
>>> - wake_up_interruptible(&sched->wake_up_worker);
>>> + drm_sched_run_wq_queue(sched);
>>> }
>>> /**
>>> @@ -974,61 +1014,42 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>> }
>>> EXPORT_SYMBOL(drm_sched_pick_best);
>>> -/**
>>> - * drm_sched_blocked - check if the scheduler is blocked
>>> - *
>>> - * @sched: scheduler instance
>>> - *
>>> - * Returns true if blocked, otherwise false.
>>> - */
>>> -static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>>> -{
>>> - if (kthread_should_park()) {
>>> - kthread_parkme();
>>> - return true;
>>> - }
>>> -
>>> - return false;
>>> -}
>>> -
>>> /**
>>> * drm_sched_main - main scheduler thread
>>> *
>>> * @param: scheduler instance
>>> - *
>>> - * Returns 0.
>>> */
>>> -static int drm_sched_main(void *param)
>>> +static void drm_sched_main(struct work_struct *w)
>>> {
>>> - struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
>>> + struct drm_gpu_scheduler *sched =
>>> + container_of(w, struct drm_gpu_scheduler, work_run);
>>> + struct drm_sched_entity *entity;
>>> + struct drm_sched_job *cleanup_job;
>>> int r;
>>> - sched_set_fifo_low(current);
>>> + if (READ_ONCE(sched->pause_run_wq))
>>> + return;
>>
>> Is there a point to this check given the comment in drm_sched_run_wq_queue?
>>
>
> I think so.
What is it? The helper which arms it already checks the paused flag, and
the comment says it is okay if it waits for it to exit.
>>> - while (!kthread_should_stop()) {
>>> - struct drm_sched_entity *entity = NULL;
>>> - struct drm_sched_fence *s_fence;
>>> - struct drm_sched_job *sched_job;
>>> - struct dma_fence *fence;
>>> - struct drm_sched_job *cleanup_job = NULL;
>>> + cleanup_job = drm_sched_get_cleanup_job(sched);
>>> + entity = drm_sched_select_entity(sched);
>>> - wait_event_interruptible(sched->wake_up_worker,
>>> - (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
>>> - (!drm_sched_blocked(sched) &&
>>> - (entity = drm_sched_select_entity(sched))) ||
>>> - kthread_should_stop());
>>> + if (!entity && !cleanup_job)
>>> + return; /* No more work */
>>> - if (cleanup_job)
>>> - sched->ops->free_job(cleanup_job);
>>> + if (cleanup_job)
>>> + sched->ops->free_job(cleanup_job);
>>> - if (!entity)
>>> - continue;
>>> + if (entity) {
>>> + struct dma_fence *fence;
>>> + struct drm_sched_fence *s_fence;
>>> + struct drm_sched_job *sched_job;
>>> sched_job = drm_sched_entity_pop_job(entity);
>>> -
>>> if (!sched_job) {
>>> complete_all(&entity->entity_idle);
>>> - continue;
>>> + if (!cleanup_job)
>>> + return; /* No more work */
>>> + goto again;
>>
>> The loop is gone, but now it re-arms itself, which is needed to avoid
>> starvation? Is it guaranteed to be effective by the wq contract?
>>
>
> Yea.
What exactly is the requirement for it to effectively avoid starvation?
That the same work item is not executed back-to-back? That the wq must be
ordered, or what?
>>> }
>>> s_fence = sched_job->s_fence;
>>> @@ -1055,14 +1076,17 @@ static int drm_sched_main(void *param)
>>> r);
>>> } else {
>>> if (IS_ERR(fence))
>>> - dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>> + dma_fence_set_error(&s_fence->finished,
>>> + PTR_ERR(fence));
>>> drm_sched_job_done(sched_job);
>>> }
>>> wake_up(&sched->job_scheduled);
>>> }
>>> - return 0;
>>> +
>>> +again:
>>> + drm_sched_run_wq_queue(sched);
>>> }
>>> /**
>>> @@ -1070,6 +1094,7 @@ static int drm_sched_main(void *param)
>>> *
>>> * @sched: scheduler instance
>>> * @ops: backend operations for this scheduler
>>> + * @run_wq: workqueue to use for run work. If NULL, the system_wq is used
>>> * @hw_submission: number of hw submissions that can be in flight
>>> * @hang_limit: number of times to allow a job to hang before dropping it
>>> * @timeout: timeout value in jiffies for the scheduler
>>> @@ -1083,14 +1108,16 @@ static int drm_sched_main(void *param)
>>> */
>>> int drm_sched_init(struct drm_gpu_scheduler *sched,
>>> const struct drm_sched_backend_ops *ops,
>>> + struct workqueue_struct *run_wq,
>>> unsigned hw_submission, unsigned hang_limit,
>>> long timeout, struct workqueue_struct *timeout_wq,
>>> atomic_t *score, const char *name, struct device *dev)
>>> {
>>> - int i, ret;
>>> + int i;
>>> sched->ops = ops;
>>> sched->hw_submission_limit = hw_submission;
>>> sched->name = name;
>>> + sched->run_wq = run_wq ? : system_wq;
>>
>> I still think it is not nice to implicitly move everyone over to the shared
>> system wq. Maybe even more so now with one-at-a-time execution, since the
>> effect on latency can be even greater.
>>
>
> No one that has a stake in this has pushed back that I can recall. Open
> to feedback from stakeholders (maintainers of drivers that use the drm
> scheduler). The i915 doesn't use the DRM scheduler last time I looked.
> Has that changed?
Matt, I am not pushing back but commenting and discussing. My interest
is for the stack, including Xe, to work well.
>> Have you considered kthread_work as a backend? Maybe it would work to have
>> callers pass in a kthread_worker they create, or provide a drm_sched helper
>> to create one, which would then be passed to drm_sched_init.
>>
>> That would enable per driver kthread_worker, or per device, or whatever
>> granularity each driver would want/need/desire.
>>
>> driver init:
>> struct drm_sched_worker = drm_sched_create_worker(...);
>>
>> queue/whatever init:
>> drm_sched_init(.., worker, ...);
>>
>
> This idea doesn't seem to work for a variety of reasons. Will type it out
> if needed but not going to spend time on this unless someone with a
> stake raises this as an issue.
Maybe I did not explain this well enough, or maybe you did not really
read it all, given the slightly knee-jerk "not a stakeholder" reaction.
I will try again.
With what I wrote you could invisibly switch the underlying
implementation from kthread_worker/work to workqueue/workitem, and in
many aspects it behaves exactly the same as your design.
I was not implying a kthread per context/queue/entity!
There are two main differences in what I wrote:
1) Abstract out the mechanism.
2) Provide a private execution context by default.
And a third difference which kthread_worker brings over the workqueue:
3) Stay closer to the current kthread solution in terms of submission
latencies when multiple jobs are signalled at once.
So it is really similar to your design. Just discussing those few details.
Regards,
Tvrtko
>> You could create one inside drm_sched_init if not passed in, which would
>> keep the behaviour for existing drivers more similar - they would still have
>> a 1:1 kthread context for their exclusive use.
>>
>
> Part of the idea of a work queue is so a user can't directly create a
> kthread via an IOCTL (XE_EXEC_QUEUE_CREATE). What you're suggesting exposes
> this issue.
>
>> And I *think* self-re-arming would be less problematic latency wise since
>> kthread_worker consumes everything queued without relinquishing control and
>> execution context would be guaranteed not to be shared with random system
>> stuff.
>>
>
> So this is essentially so we can use a loop? Seems like a lot of effort for
> what is pure speculation. Again, if a stakeholder raises an issue we can
> address it then.
>
> Matt
>
>> Regards,
>>
>> Tvrtko
>>
>>> sched->timeout = timeout;
>>> sched->timeout_wq = timeout_wq ? : system_wq;
>>> sched->hang_limit = hang_limit;
>>> @@ -1099,23 +1126,15 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>> for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
>>> drm_sched_rq_init(sched, &sched->sched_rq[i]);
>>> - init_waitqueue_head(&sched->wake_up_worker);
>>> init_waitqueue_head(&sched->job_scheduled);
>>> INIT_LIST_HEAD(&sched->pending_list);
>>> spin_lock_init(&sched->job_list_lock);
>>> atomic_set(&sched->hw_rq_count, 0);
>>> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>> + INIT_WORK(&sched->work_run, drm_sched_main);
>>> atomic_set(&sched->_score, 0);
>>> atomic64_set(&sched->job_id_count, 0);
>>> -
>>> - /* Each scheduler will run on a seperate kernel thread */
>>> - sched->thread = kthread_run(drm_sched_main, sched, sched->name);
>>> - if (IS_ERR(sched->thread)) {
>>> - ret = PTR_ERR(sched->thread);
>>> - sched->thread = NULL;
>>> - DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
>>> - return ret;
>>> - }
>>> + sched->pause_run_wq = false;
>>> sched->ready = true;
>>> return 0;
>>> @@ -1134,8 +1153,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
>>> struct drm_sched_entity *s_entity;
>>> int i;
>>> - if (sched->thread)
>>> - kthread_stop(sched->thread);
>>> + drm_sched_run_wq_stop(sched);
>>> for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>> struct drm_sched_rq *rq = &sched->sched_rq[i];
>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
>>> index 06238e6d7f5c..38e092ea41e6 100644
>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>>> @@ -388,7 +388,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>>> int ret;
>>> ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
>>> - &v3d_bin_sched_ops,
>>> + &v3d_bin_sched_ops, NULL,
>>> hw_jobs_limit, job_hang_limit,
>>> msecs_to_jiffies(hang_limit_ms), NULL,
>>> NULL, "v3d_bin", v3d->drm.dev);
>>> @@ -396,7 +396,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>>> return ret;
>>> ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
>>> - &v3d_render_sched_ops,
>>> + &v3d_render_sched_ops, NULL,
>>> hw_jobs_limit, job_hang_limit,
>>> msecs_to_jiffies(hang_limit_ms), NULL,
>>> NULL, "v3d_render", v3d->drm.dev);
>>> @@ -404,7 +404,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>>> goto fail;
>>> ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
>>> - &v3d_tfu_sched_ops,
>>> + &v3d_tfu_sched_ops, NULL,
>>> hw_jobs_limit, job_hang_limit,
>>> msecs_to_jiffies(hang_limit_ms), NULL,
>>> NULL, "v3d_tfu", v3d->drm.dev);
>>> @@ -413,7 +413,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>>> if (v3d_has_csd(v3d)) {
>>> ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
>>> - &v3d_csd_sched_ops,
>>> + &v3d_csd_sched_ops, NULL,
>>> hw_jobs_limit, job_hang_limit,
>>> msecs_to_jiffies(hang_limit_ms), NULL,
>>> NULL, "v3d_csd", v3d->drm.dev);
>>> @@ -421,7 +421,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>>> goto fail;
>>> ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
>>> - &v3d_cache_clean_sched_ops,
>>> + &v3d_cache_clean_sched_ops, NULL,
>>> hw_jobs_limit, job_hang_limit,
>>> msecs_to_jiffies(hang_limit_ms), NULL,
>>> NULL, "v3d_cache_clean", v3d->drm.dev);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index c0586d832260..98fb5f85eba6 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -473,17 +473,16 @@ struct drm_sched_backend_ops {
>>> * @timeout: the time after which a job is removed from the scheduler.
>>> * @name: name of the ring for which this scheduler is being used.
>>> * @sched_rq: priority wise array of run queues.
>>> - * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
>>> - * is ready to be scheduled.
>>> * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>>> * waits on this wait queue until all the scheduled jobs are
>>> * finished.
>>> * @hw_rq_count: the number of jobs currently in the hardware queue.
>>> * @job_id_count: used to assign unique id to the each job.
>>> + * @run_wq: workqueue used to queue @work_run
>>> * @timeout_wq: workqueue used to queue @work_tdr
>>> + * @work_run: schedules jobs and cleans up entities
>>> * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>> * timeout interval is over.
>>> - * @thread: the kthread on which the scheduler which run.
>>> * @pending_list: the list of jobs which are currently in the job queue.
>>> * @job_list_lock: lock to protect the pending_list.
>>> * @hang_limit: once the hangs by a job crosses this limit then it is marked
>>> @@ -492,6 +491,7 @@ struct drm_sched_backend_ops {
>>> * @_score: score used when the driver doesn't provide one
>>> * @ready: marks if the underlying HW is ready to work
>>> * @free_guilty: A hit to time out handler to free the guilty job.
>>> + * @pause_run_wq: pause queuing of @work_run on @run_wq
>>> * @dev: system &struct device
>>> *
>>> * One scheduler is implemented for each hardware ring.
>>> @@ -502,13 +502,13 @@ struct drm_gpu_scheduler {
>>> long timeout;
>>> const char *name;
>>> struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>> - wait_queue_head_t wake_up_worker;
>>> wait_queue_head_t job_scheduled;
>>> atomic_t hw_rq_count;
>>> atomic64_t job_id_count;
>>> + struct workqueue_struct *run_wq;
>>> struct workqueue_struct *timeout_wq;
>>> + struct work_struct work_run;
>>> struct delayed_work work_tdr;
>>> - struct task_struct *thread;
>>> struct list_head pending_list;
>>> spinlock_t job_list_lock;
>>> int hang_limit;
>>> @@ -516,11 +516,13 @@ struct drm_gpu_scheduler {
>>> atomic_t _score;
>>> bool ready;
>>> bool free_guilty;
>>> + bool pause_run_wq;
>>> struct device *dev;
>>> };
>>> int drm_sched_init(struct drm_gpu_scheduler *sched,
>>> const struct drm_sched_backend_ops *ops,
>>> + struct workqueue_struct *run_wq,
>>> uint32_t hw_submission, unsigned hang_limit,
>>> long timeout, struct workqueue_struct *timeout_wq,
>>> atomic_t *score, const char *name, struct device *dev);
>>> @@ -550,6 +552,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>>> void drm_sched_job_cleanup(struct drm_sched_job *job);
>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>>> +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
>>> +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
>>> void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>>> void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>>> void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
* [PATCH 2/8] drm/sched: Move schedule policy to scheduler / entity
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
2023-08-01 20:50 ` [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread Matthew Brost
@ 2023-08-01 20:50 ` Matthew Brost
2023-08-01 20:50 ` [PATCH 3/8] drm/sched: Add DRM_SCHED_POLICY_SINGLE_ENTITY scheduling policy Matthew Brost
` (5 subsequent siblings)
7 siblings, 0 replies; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:50 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
Rather than a global modparam for scheduling policy, move the scheduling
policy to the scheduler / entity so the user can control each scheduler /
entity policy.
v2:
- s/DRM_SCHED_POLICY_MAX/DRM_SCHED_POLICY_COUNT (Luben)
- Only include policy in scheduler (Luben)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
drivers/gpu/drm/etnaviv/etnaviv_sched.c | 3 ++-
drivers/gpu/drm/lima/lima_sched.c | 3 ++-
drivers/gpu/drm/msm/msm_ringbuffer.c | 3 ++-
drivers/gpu/drm/panfrost/panfrost_job.c | 3 ++-
drivers/gpu/drm/scheduler/sched_entity.c | 24 ++++++++++++++++++----
drivers/gpu/drm/scheduler/sched_main.c | 23 +++++++++++++++------
drivers/gpu/drm/v3d/v3d_sched.c | 15 +++++++++-----
include/drm/gpu_scheduler.h | 20 ++++++++++++------
9 files changed, 70 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 00c9c03c8f94..4df0fca5a74c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2368,6 +2368,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
ring->num_hw_submission, amdgpu_job_hang_limit,
timeout, adev->reset_domain->wq,
ring->sched_score, ring->name,
+ DRM_SCHED_POLICY_DEFAULT,
adev->dev);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 8486a2923f1b..61204a3f8b0b 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -136,7 +136,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops, NULL,
etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
msecs_to_jiffies(500), NULL, NULL,
- dev_name(gpu->dev), gpu->dev);
+ dev_name(gpu->dev), DRM_SCHED_POLICY_DEFAULT,
+ gpu->dev);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index 54f53bece27c..33042ba6ae93 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -491,7 +491,8 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
return drm_sched_init(&pipe->base, &lima_sched_ops, NULL, 1,
lima_job_hang_limit,
msecs_to_jiffies(timeout), NULL,
- NULL, name, pipe->ldev->dev);
+ NULL, name, DRM_SCHED_POLICY_DEFAULT,
+ pipe->ldev->dev);
}
void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index 5879fc262047..f408a9097315 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -97,7 +97,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
ret = drm_sched_init(&ring->sched, &msm_sched_ops, NULL,
num_hw_submissions, 0, sched_timeout,
- NULL, NULL, to_msm_bo(ring->bo)->name, gpu->dev->dev);
+ NULL, NULL, to_msm_bo(ring->bo)->name,
+ DRM_SCHED_POLICY_DEFAULT, gpu->dev->dev);
if (ret) {
goto fail;
}
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index f48b07056a16..effa48b33dce 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -819,7 +819,8 @@ int panfrost_job_init(struct panfrost_device *pfdev)
nentries, 0,
msecs_to_jiffies(JOB_TIMEOUT_MS),
pfdev->reset.wq,
- NULL, "pan_js", pfdev->dev);
+ NULL, "pan_js", DRM_SCHED_POLICY_DEFAULT,
+ pfdev->dev);
if (ret) {
dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
goto err_sched;
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 15d04a0ec623..941ea8edead2 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -33,6 +33,20 @@
#define to_drm_sched_job(sched_job) \
container_of((sched_job), struct drm_sched_job, queue_node)
+static bool bad_policies(struct drm_gpu_scheduler **sched_list,
+ unsigned int num_sched_list)
+{
+ enum drm_sched_policy sched_policy = sched_list[0]->sched_policy;
+ unsigned int i;
+
+ /* All schedule policies must match */
+ for (i = 1; i < num_sched_list; ++i)
+ if (sched_policy != sched_list[i]->sched_policy)
+ return true;
+
+ return false;
+}
+
/**
* drm_sched_entity_init - Init a context entity used by scheduler when
* submit to HW ring.
@@ -62,7 +76,8 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
unsigned int num_sched_list,
atomic_t *guilty)
{
- if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
+ if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])) ||
+ bad_policies(sched_list, num_sched_list))
return -EINVAL;
memset(entity, 0, sizeof(struct drm_sched_entity));
@@ -440,7 +455,7 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
* Update the entity's location in the min heap according to
* the timestamp of the next job, if any.
*/
- if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) {
+ if (entity->rq->sched->sched_policy == DRM_SCHED_POLICY_FIFO) {
struct drm_sched_job *next;
next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
@@ -506,7 +521,8 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
{
struct drm_sched_entity *entity = sched_job->entity;
- bool first;
+ bool first, fifo = entity->rq->sched->sched_policy ==
+ DRM_SCHED_POLICY_FIFO;
trace_drm_sched_job(sched_job, entity);
atomic_inc(entity->rq->sched->score);
@@ -528,7 +544,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
drm_sched_rq_add_entity(entity->rq, entity);
spin_unlock(&entity->rq_lock);
- if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
+ if (fifo)
drm_sched_rq_update_fifo(entity, sched_job->submit_ts);
drm_sched_wakeup(entity->rq->sched);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c3eed9e8062a..27989345889d 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -62,14 +62,14 @@
#define to_drm_sched_job(sched_job) \
container_of((sched_job), struct drm_sched_job, queue_node)
-int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
+int default_drm_sched_policy = DRM_SCHED_POLICY_FIFO;
/**
* DOC: sched_policy (int)
* Used to override default entities scheduling policy in a run queue.
*/
-MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
-module_param_named(sched_policy, drm_sched_policy, int, 0444);
+MODULE_PARM_DESC(sched_policy, "Specify the default scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
+module_param_named(sched_policy, default_drm_sched_policy, int, 0444);
static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
const struct rb_node *b)
@@ -173,7 +173,7 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
if (rq->current_entity == entity)
rq->current_entity = NULL;
- if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
+ if (rq->sched->sched_policy == DRM_SCHED_POLICY_FIFO)
drm_sched_rq_remove_fifo_locked(entity);
spin_unlock(&rq->lock);
@@ -925,7 +925,7 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
/* Kernel run queue has higher priority than normal run queue*/
for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
- entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
+ entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
if (entity)
@@ -1102,6 +1102,7 @@ static void drm_sched_main(struct work_struct *w)
* used
* @score: optional score atomic shared with other schedulers
* @name: name used for debugging
+ * @sched_policy: schedule policy
* @dev: target &struct device
*
* Return 0 on success, otherwise error code.
@@ -1111,9 +1112,15 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
struct workqueue_struct *run_wq,
unsigned hw_submission, unsigned hang_limit,
long timeout, struct workqueue_struct *timeout_wq,
- atomic_t *score, const char *name, struct device *dev)
+ atomic_t *score, const char *name,
+ enum drm_sched_policy sched_policy,
+ struct device *dev)
{
int i;
+
+ if (sched_policy >= DRM_SCHED_POLICY_COUNT)
+ return -EINVAL;
+
sched->ops = ops;
sched->hw_submission_limit = hw_submission;
sched->name = name;
@@ -1123,6 +1130,10 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
sched->hang_limit = hang_limit;
sched->score = score ? score : &sched->_score;
sched->dev = dev;
+ if (sched_policy == DRM_SCHED_POLICY_DEFAULT)
+ sched->sched_policy = default_drm_sched_policy;
+ else
+ sched->sched_policy = sched_policy;
for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
drm_sched_rq_init(sched, &sched->sched_rq[i]);
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 38e092ea41e6..5e3fe77fa991 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -391,7 +391,8 @@ v3d_sched_init(struct v3d_dev *v3d)
&v3d_bin_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
- NULL, "v3d_bin", v3d->drm.dev);
+ NULL, "v3d_bin", DRM_SCHED_POLICY_DEFAULT,
+ v3d->drm.dev);
if (ret)
return ret;
@@ -399,7 +400,8 @@ v3d_sched_init(struct v3d_dev *v3d)
&v3d_render_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
- NULL, "v3d_render", v3d->drm.dev);
+ NULL, "v3d_render", DRM_SCHED_POLICY_DEFAULT,
+ v3d->drm.dev);
if (ret)
goto fail;
@@ -407,7 +409,8 @@ v3d_sched_init(struct v3d_dev *v3d)
&v3d_tfu_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
- NULL, "v3d_tfu", v3d->drm.dev);
+ NULL, "v3d_tfu", DRM_SCHED_POLICY_DEFAULT,
+ v3d->drm.dev);
if (ret)
goto fail;
@@ -416,7 +419,8 @@ v3d_sched_init(struct v3d_dev *v3d)
&v3d_csd_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
- NULL, "v3d_csd", v3d->drm.dev);
+ NULL, "v3d_csd", DRM_SCHED_POLICY_DEFAULT,
+ v3d->drm.dev);
if (ret)
goto fail;
@@ -424,7 +428,8 @@ v3d_sched_init(struct v3d_dev *v3d)
&v3d_cache_clean_sched_ops, NULL,
hw_jobs_limit, job_hang_limit,
msecs_to_jiffies(hang_limit_ms), NULL,
- NULL, "v3d_cache_clean", v3d->drm.dev);
+ NULL, "v3d_cache_clean",
+ DRM_SCHED_POLICY_DEFAULT, v3d->drm.dev);
if (ret)
goto fail;
}
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 98fb5f85eba6..7474142daca6 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -72,11 +72,15 @@ enum drm_sched_priority {
DRM_SCHED_PRIORITY_UNSET = -2
};
-/* Used to chose between FIFO and RR jobs scheduling */
-extern int drm_sched_policy;
-
-#define DRM_SCHED_POLICY_RR 0
-#define DRM_SCHED_POLICY_FIFO 1
+/* Used to choose the default scheduling policy */
+extern int default_drm_sched_policy;
+
+enum drm_sched_policy {
+ DRM_SCHED_POLICY_DEFAULT,
+ DRM_SCHED_POLICY_RR,
+ DRM_SCHED_POLICY_FIFO,
+ DRM_SCHED_POLICY_COUNT,
+};
/**
* struct drm_sched_entity - A wrapper around a job queue (typically
@@ -489,6 +493,7 @@ struct drm_sched_backend_ops {
* guilty and it will no longer be considered for scheduling.
* @score: score to help loadbalancer pick a idle sched
* @_score: score used when the driver doesn't provide one
+ * @sched_policy: Schedule policy for scheduler
* @ready: marks if the underlying HW is ready to work
* @free_guilty: A hit to time out handler to free the guilty job.
* @pause_run_wq: pause queuing of @work_run on @run_wq
@@ -514,6 +519,7 @@ struct drm_gpu_scheduler {
int hang_limit;
atomic_t *score;
atomic_t _score;
+ enum drm_sched_policy sched_policy;
bool ready;
bool free_guilty;
bool pause_run_wq;
@@ -525,7 +531,9 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
struct workqueue_struct *run_wq,
uint32_t hw_submission, unsigned hang_limit,
long timeout, struct workqueue_struct *timeout_wq,
- atomic_t *score, const char *name, struct device *dev);
+ atomic_t *score, const char *name,
+ enum drm_sched_policy sched_policy,
+ struct device *dev);
void drm_sched_fini(struct drm_gpu_scheduler *sched);
int drm_sched_job_init(struct drm_sched_job *job,
--
2.34.1
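With this patch the scheduling policy becomes a per-scheduler property passed
at init time, with DRM_SCHED_POLICY_DEFAULT falling back to the module-wide
default_drm_sched_policy. A minimal sketch of a caller that opts in to FIFO
explicitly follows; my_ring, my_sched_ops, and their fields are illustrative
names, only the drm_sched_init() signature is from the patch:

/* Request FIFO for this scheduler regardless of the sched_policy
 * module parameter; DRM_SCHED_POLICY_DEFAULT would fall back to
 * default_drm_sched_policy instead. */
static int my_ring_init(struct my_ring *ring, struct device *dev)
{
	return drm_sched_init(&ring->sched, &my_sched_ops,
			      NULL /* run_wq: falls back to system_wq */,
			      ring->num_hw_submission, 0 /* hang_limit */,
			      msecs_to_jiffies(500) /* timeout */,
			      NULL /* timeout_wq */, NULL /* score */,
			      ring->name, DRM_SCHED_POLICY_FIFO, dev);
}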
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 3/8] drm/sched: Add DRM_SCHED_POLICY_SINGLE_ENTITY scheduling policy
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
2023-08-01 20:50 ` [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread Matthew Brost
2023-08-01 20:50 ` [PATCH 2/8] drm/sched: Move schedule policy to scheduler / entity Matthew Brost
@ 2023-08-01 20:50 ` Matthew Brost
2023-08-03 8:50 ` Christian König
2023-08-01 20:50 ` [PATCH 4/8] drm/sched: Add generic scheduler message interface Matthew Brost
` (4 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:50 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
DRM_SCHED_POLICY_SINGLE_ENTITY creates a 1 to 1 relationship between
scheduler and entity. No priorities or run queues are used in this mode,
which is intended for devices with firmware schedulers.
v2:
- Drop sched / rq union (Luben)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
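A minimal usage sketch of this mode, assuming the drm_sched_init() and
drm_sched_entity_init() signatures from this series (the exec queue
structure, ops, and workqueue names are illustrative):

/* One scheduler per firmware-managed queue, bound to exactly one
 * entity. A second entity, or num_sched_list != 1, makes
 * drm_sched_entity_init() return -EINVAL. */
static int my_queue_init(struct my_exec_queue *q, struct device *dev)
{
	struct drm_gpu_scheduler *sched_list[] = { &q->sched };
	int err;

	err = drm_sched_init(&q->sched, &my_fw_sched_ops, my_submit_wq,
			     q->num_hw_submission, 0, q->timeout,
			     NULL, NULL, q->name,
			     DRM_SCHED_POLICY_SINGLE_ENTITY, dev);
	if (err)
		return err;

	/* The priority is unused in this mode: no run queues are
	 * initialized and jobs are dequeued directly from the entity. */
	err = drm_sched_entity_init(&q->entity, DRM_SCHED_PRIORITY_NORMAL,
				    sched_list, 1, NULL);
	if (err)
		drm_sched_fini(&q->sched);

	return err;
}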
---
drivers/gpu/drm/scheduler/sched_entity.c | 69 ++++++++++++++++++------
drivers/gpu/drm/scheduler/sched_fence.c | 2 +-
drivers/gpu/drm/scheduler/sched_main.c | 62 ++++++++++++++++++---
include/drm/gpu_scheduler.h | 8 +++
4 files changed, 118 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 941ea8edead2..59c1ca578256 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -83,6 +83,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
memset(entity, 0, sizeof(struct drm_sched_entity));
INIT_LIST_HEAD(&entity->list);
entity->rq = NULL;
+ entity->single_sched = NULL;
entity->guilty = guilty;
entity->num_sched_list = num_sched_list;
entity->priority = priority;
@@ -90,8 +91,17 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
entity->last_scheduled = NULL;
RB_CLEAR_NODE(&entity->rb_tree_node);
- if(num_sched_list)
- entity->rq = &sched_list[0]->sched_rq[entity->priority];
+ if (num_sched_list) {
+ if (sched_list[0]->sched_policy !=
+ DRM_SCHED_POLICY_SINGLE_ENTITY) {
+ entity->rq = &sched_list[0]->sched_rq[entity->priority];
+ } else {
+ if (num_sched_list != 1 || sched_list[0]->single_entity)
+ return -EINVAL;
+ sched_list[0]->single_entity = entity;
+ entity->single_sched = sched_list[0];
+ }
+ }
init_completion(&entity->entity_idle);
@@ -124,7 +134,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
struct drm_gpu_scheduler **sched_list,
unsigned int num_sched_list)
{
- WARN_ON(!num_sched_list || !sched_list);
+ WARN_ON(!num_sched_list || !sched_list ||
+ !!entity->single_sched);
entity->sched_list = sched_list;
entity->num_sched_list = num_sched_list;
@@ -194,13 +205,15 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
{
struct drm_sched_job *job;
struct dma_fence *prev;
+ bool single_entity = !!entity->single_sched;
- if (!entity->rq)
+ if (!entity->rq && !single_entity)
return;
spin_lock(&entity->rq_lock);
entity->stopped = true;
- drm_sched_rq_remove_entity(entity->rq, entity);
+ if (!single_entity)
+ drm_sched_rq_remove_entity(entity->rq, entity);
spin_unlock(&entity->rq_lock);
/* Make sure this entity is not used by the scheduler at the moment */
@@ -222,6 +235,20 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
dma_fence_put(prev);
}
+/**
+ * drm_sched_entity_to_scheduler - Get the GPU scheduler for an entity
+ * @entity: scheduler entity
+ *
+ * Returns the GPU scheduler for the entity
+ */
+struct drm_gpu_scheduler *
+drm_sched_entity_to_scheduler(struct drm_sched_entity *entity)
+{
+ bool single_entity = !!entity->single_sched;
+
+ return single_entity ? entity->single_sched : entity->rq->sched;
+}
+
/**
* drm_sched_entity_flush - Flush a context entity
*
@@ -239,11 +266,12 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
struct drm_gpu_scheduler *sched;
struct task_struct *last_user;
long ret = timeout;
+ bool single_entity = !!entity->single_sched;
- if (!entity->rq)
+ if (!entity->rq && !single_entity)
return 0;
- sched = entity->rq->sched;
+ sched = drm_sched_entity_to_scheduler(entity);
/**
* The client will not queue more IBs during this fini, consume existing
* queued IBs or discard them on SIGKILL
@@ -336,7 +364,7 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
container_of(cb, struct drm_sched_entity, cb);
drm_sched_entity_clear_dep(f, cb);
- drm_sched_wakeup(entity->rq->sched);
+ drm_sched_wakeup(drm_sched_entity_to_scheduler(entity));
}
/**
@@ -350,6 +378,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
void drm_sched_entity_set_priority(struct drm_sched_entity *entity,
enum drm_sched_priority priority)
{
+ WARN_ON(!!entity->single_sched);
+
spin_lock(&entity->rq_lock);
entity->priority = priority;
spin_unlock(&entity->rq_lock);
@@ -362,7 +392,7 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority);
*/
static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
{
- struct drm_gpu_scheduler *sched = entity->rq->sched;
+ struct drm_gpu_scheduler *sched = drm_sched_entity_to_scheduler(entity);
struct dma_fence *fence = entity->dependency;
struct drm_sched_fence *s_fence;
@@ -455,7 +485,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
* Update the entity's location in the min heap according to
* the timestamp of the next job, if any.
*/
- if (entity->rq->sched->sched_policy == DRM_SCHED_POLICY_FIFO) {
+ if (drm_sched_entity_to_scheduler(entity)->sched_policy ==
+ DRM_SCHED_POLICY_FIFO) {
struct drm_sched_job *next;
next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
@@ -472,6 +503,8 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
struct drm_gpu_scheduler *sched;
struct drm_sched_rq *rq;
+ WARN_ON(!!entity->single_sched);
+
/* single possible engine and already selected */
if (!entity->sched_list)
return;
@@ -521,17 +554,22 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
{
struct drm_sched_entity *entity = sched_job->entity;
- bool first, fifo = entity->rq->sched->sched_policy ==
- DRM_SCHED_POLICY_FIFO;
+ bool single_entity = !!entity->single_sched;
+ bool first;
trace_drm_sched_job(sched_job, entity);
- atomic_inc(entity->rq->sched->score);
+ if (!single_entity)
+ atomic_inc(entity->rq->sched->score);
WRITE_ONCE(entity->last_user, current->group_leader);
first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
sched_job->submit_ts = ktime_get();
/* first job wakes up scheduler */
if (first) {
+ struct drm_gpu_scheduler *sched =
+ drm_sched_entity_to_scheduler(entity);
+ bool fifo = sched->sched_policy == DRM_SCHED_POLICY_FIFO;
+
/* Add the entity to the run queue */
spin_lock(&entity->rq_lock);
if (entity->stopped) {
@@ -541,13 +579,14 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
return;
}
- drm_sched_rq_add_entity(entity->rq, entity);
+ if (!single_entity)
+ drm_sched_rq_add_entity(entity->rq, entity);
spin_unlock(&entity->rq_lock);
if (fifo)
drm_sched_rq_update_fifo(entity, sched_job->submit_ts);
- drm_sched_wakeup(entity->rq->sched);
+ drm_sched_wakeup(sched);
}
}
EXPORT_SYMBOL(drm_sched_entity_push_job);
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
index fe9c6468e440..d7cfc0441885 100644
--- a/drivers/gpu/drm/scheduler/sched_fence.c
+++ b/drivers/gpu/drm/scheduler/sched_fence.c
@@ -213,7 +213,7 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
{
unsigned seq;
- fence->sched = entity->rq->sched;
+ fence->sched = drm_sched_entity_to_scheduler(entity);
seq = atomic_inc_return(&entity->fence_seq);
dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
&fence->lock, entity->fence_context, seq);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 27989345889d..2597fb298733 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -32,7 +32,8 @@
* backend operations to the scheduler like submitting a job to hardware run queue,
* returning the dependencies of a job etc.
*
- * The organisation of the scheduler is the following:
+ * The organisation of the scheduler is the following for scheduling policies
+ * DRM_SCHED_POLICY_RR and DRM_SCHED_POLICY_FIFO:
*
* 1. Each hw run queue has one scheduler
* 2. Each scheduler has multiple run queues with different priorities
@@ -42,6 +43,22 @@
* the hardware.
*
* The jobs in a entity are always scheduled in the order that they were pushed.
+ * The organisation of the scheduler is the following for scheduling policy
+ * DRM_SCHED_POLICY_SINGLE_ENTITY:
+ *
+ * 1. One to one relationship between scheduler and entity
+ * 2. No priorities implemented per scheduler (single job queue)
+ * 3. No run queues in the scheduler; jobs are directly dequeued from the entity
+ * 4. The entity maintains a queue of jobs that will be scheduled on the
+ * hardware
+ *
+ * The jobs in an entity are always scheduled in the order that they were pushed
+ * regardless of scheduling policy.
+ *
+ * A policy of DRM_SCHED_POLICY_RR or DRM_SCHED_POLICY_FIFO is expected to be used
+ * when the KMD is scheduling directly on the hardware while a scheduling policy
+ * of DRM_SCHED_POLICY_SINGLE_ENTITY is expected to be used when there is a
+ * firmware scheduler.
*/
#include <linux/wait.h>
@@ -92,6 +109,8 @@ static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *enti
void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)
{
+ WARN_ON(!!entity->single_sched);
+
/*
* Both locks need to be grabbed, one to protect from entity->rq change
* for entity from within concurrent drm_sched_entity_select_rq and the
@@ -122,6 +141,8 @@ void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)
static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
struct drm_sched_rq *rq)
{
+ WARN_ON(sched->sched_policy == DRM_SCHED_POLICY_SINGLE_ENTITY);
+
spin_lock_init(&rq->lock);
INIT_LIST_HEAD(&rq->entities);
rq->rb_tree_root = RB_ROOT_CACHED;
@@ -140,6 +161,8 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
struct drm_sched_entity *entity)
{
+ WARN_ON(!!entity->single_sched);
+
if (!list_empty(&entity->list))
return;
@@ -162,6 +185,8 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
struct drm_sched_entity *entity)
{
+ WARN_ON(!!entity->single_sched);
+
if (list_empty(&entity->list))
return;
@@ -667,7 +692,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
struct drm_sched_entity *entity,
void *owner)
{
- if (!entity->rq)
+ if (!entity->rq && !entity->single_sched)
return -ENOENT;
job->entity = entity;
@@ -700,13 +725,16 @@ void drm_sched_job_arm(struct drm_sched_job *job)
{
struct drm_gpu_scheduler *sched;
struct drm_sched_entity *entity = job->entity;
+ bool single_entity = !!entity->single_sched;
BUG_ON(!entity);
- drm_sched_entity_select_rq(entity);
- sched = entity->rq->sched;
+ if (!single_entity)
+ drm_sched_entity_select_rq(entity);
+ sched = drm_sched_entity_to_scheduler(entity);
job->sched = sched;
- job->s_priority = entity->rq - sched->sched_rq;
+ if (!single_entity)
+ job->s_priority = entity->rq - sched->sched_rq;
job->id = atomic64_inc_return(&sched->job_id_count);
drm_sched_fence_init(job->s_fence, job->entity);
@@ -923,6 +951,13 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
if (!drm_sched_ready(sched))
return NULL;
+ if (sched->single_entity) {
+ if (drm_sched_entity_is_ready(sched->single_entity))
+ return sched->single_entity;
+
+ return NULL;
+ }
+
/* Kernel run queue has higher priority than normal run queue*/
for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
@@ -1122,6 +1157,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
return -EINVAL;
sched->ops = ops;
+ sched->single_entity = NULL;
sched->hw_submission_limit = hw_submission;
sched->name = name;
sched->run_wq = run_wq ? : system_wq;
@@ -1134,7 +1170,9 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
sched->sched_policy = default_drm_sched_policy;
else
sched->sched_policy = sched_policy;
- for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
+ for (i = DRM_SCHED_PRIORITY_MIN; sched_policy !=
+ DRM_SCHED_POLICY_SINGLE_ENTITY && i < DRM_SCHED_PRIORITY_COUNT;
+ i++)
drm_sched_rq_init(sched, &sched->sched_rq[i]);
init_waitqueue_head(&sched->job_scheduled);
@@ -1166,7 +1204,15 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
drm_sched_run_wq_stop(sched);
- for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
+ if (sched->single_entity) {
+ spin_lock(&sched->single_entity->rq_lock);
+ sched->single_entity->stopped = true;
+ spin_unlock(&sched->single_entity->rq_lock);
+ }
+
+ for (i = DRM_SCHED_PRIORITY_COUNT - 1; sched->sched_policy !=
+ DRM_SCHED_POLICY_SINGLE_ENTITY && i >= DRM_SCHED_PRIORITY_MIN;
+ i--) {
struct drm_sched_rq *rq = &sched->sched_rq[i];
if (!rq)
@@ -1210,6 +1256,8 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
struct drm_sched_entity *entity;
struct drm_gpu_scheduler *sched = bad->sched;
+ WARN_ON(sched->sched_policy == DRM_SCHED_POLICY_SINGLE_ENTITY);
+
/* don't change @bad's karma if it's from KERNEL RQ,
* because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
* corrupt but keep in mind that kernel jobs always considered good.
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 7474142daca6..df1993dd44ae 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -79,6 +79,7 @@ enum drm_sched_policy {
DRM_SCHED_POLICY_DEFAULT,
DRM_SCHED_POLICY_RR,
DRM_SCHED_POLICY_FIFO,
+ DRM_SCHED_POLICY_SINGLE_ENTITY,
DRM_SCHED_POLICY_COUNT,
};
@@ -112,6 +113,9 @@ struct drm_sched_entity {
*/
struct drm_sched_rq *rq;
+ /** @single_sched: Single scheduler */
+ struct drm_gpu_scheduler *single_sched;
+
/**
* @sched_list:
*
@@ -473,6 +477,7 @@ struct drm_sched_backend_ops {
* struct drm_gpu_scheduler - scheduler instance-specific data
*
* @ops: backend operations provided by the driver.
+ * @single_entity: Single entity for the scheduler
* @hw_submission_limit: the max size of the hardware queue.
* @timeout: the time after which a job is removed from the scheduler.
* @name: name of the ring for which this scheduler is being used.
@@ -503,6 +508,7 @@ struct drm_sched_backend_ops {
*/
struct drm_gpu_scheduler {
const struct drm_sched_backend_ops *ops;
+ struct drm_sched_entity *single_entity;
uint32_t hw_submission_limit;
long timeout;
const char *name;
@@ -584,6 +590,8 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
struct drm_gpu_scheduler **sched_list,
unsigned int num_sched_list,
atomic_t *guilty);
+struct drm_gpu_scheduler *
+drm_sched_entity_to_scheduler(struct drm_sched_entity *entity);
long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout);
void drm_sched_entity_fini(struct drm_sched_entity *entity);
void drm_sched_entity_destroy(struct drm_sched_entity *entity);
--
2.34.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 3/8] drm/sched: Add DRM_SCHED_POLICY_SINGLE_ENTITY scheduling policy
2023-08-01 20:50 ` [PATCH 3/8] drm/sched: Add DRM_SCHED_POLICY_SINGLE_ENTITY scheduling policy Matthew Brost
@ 2023-08-03 8:50 ` Christian König
0 siblings, 0 replies; 24+ messages in thread
From: Christian König @ 2023-08-03 8:50 UTC (permalink / raw)
To: Matthew Brost, dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, luben.tuikov, lina, donald.robson, boris.brezillon,
faith.ekstrand
Am 01.08.23 um 22:50 schrieb Matthew Brost:
> DRM_SCHED_POLICY_SINGLE_ENTITY creates a 1 to 1 relationship between
> scheduler and entity. No priorities or run queues are used in this mode,
> which is intended for devices with firmware schedulers.
>
> v2:
> - Drop sched / rq union (Luben)
I think we might rather completely remove the runqueues. Essentially we
only needed them for the old round robin scheduling.
For Luben's rb-tree priority based scheduling we can just give kernel
submissions a reasonable boost instead.
Luben, what do you think?
Regards,
Christian.
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
` (2 preceding siblings ...)
2023-08-01 20:50 ` [PATCH 3/8] drm/sched: Add DRM_SCHED_POLICY_SINGLE_ENTITY scheduling policy Matthew Brost
@ 2023-08-01 20:50 ` Matthew Brost
2023-08-03 8:53 ` Christian König
2023-08-01 20:51 ` [PATCH 5/8] drm/sched: Add drm_sched_start_timeout_unlocked helper Matthew Brost
` (3 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:50 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
Add a generic scheduler message interface which sends messages to the
backend from the drm_gpu_scheduler main submission thread. The idea is
that some of these messages modify state in a drm_sched_entity which is
also modified during submission. By scheduling these messages and
submission in the same thread there is no race when changing state in
the drm_sched_entity.
This interface will be used in XE, the new Intel GPU driver, to clean up,
suspend, resume, and change scheduling properties of a drm_sched_entity.
The interface is designed to be generic and extensible, with only the
backend understanding the messages.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
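A hypothetical sketch of how a backend might use this interface (the
opcodes, the exec queue type, and the helpers are illustrative; struct
drm_sched_msg, drm_sched_add_msg() and the process_msg hook are from this
patch):

enum my_msg_opcode {
	MY_MSG_CLEANUP,
	MY_MSG_SUSPEND,
};

/* Called from the scheduler submission thread, so it cannot race
 * with job submission; frees the dynamically allocated message. */
static void my_process_msg(struct drm_sched_msg *msg)
{
	struct my_exec_queue *q = msg->private_data;

	switch (msg->opcode) {
	case MY_MSG_CLEANUP:
		my_exec_queue_cleanup(q);
		break;
	case MY_MSG_SUSPEND:
		my_exec_queue_suspend(q);
		break;
	}
	kfree(msg);
}

/* Caller side: queue an operation to run in the submission thread. */
static int my_exec_queue_cleanup_async(struct my_exec_queue *q)
{
	struct drm_sched_msg *msg = kzalloc(sizeof(*msg), GFP_KERNEL);

	if (!msg)
		return -ENOMEM;

	msg->opcode = MY_MSG_CLEANUP;
	msg->private_data = q;
	drm_sched_add_msg(&q->sched, msg);
	return 0;
}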
---
drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
include/drm/gpu_scheduler.h | 29 +++++++++++++-
2 files changed, 78 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 2597fb298733..84821a124ca2 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
}
EXPORT_SYMBOL(drm_sched_pick_best);
+/**
+ * drm_sched_add_msg - add scheduler message
+ *
+ * @sched: scheduler instance
+ * @msg: message to be added
+ *
+ * Can and will pass jobs waiting on dependencies or in a runnable queue.
+ * Message processing will stop if the scheduler run wq is stopped and will
+ * resume when the run wq is started.
+ */
+void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
+ struct drm_sched_msg *msg)
+{
+ spin_lock(&sched->job_list_lock);
+ list_add_tail(&msg->link, &sched->msgs);
+ spin_unlock(&sched->job_list_lock);
+
+ drm_sched_run_wq_queue(sched);
+}
+EXPORT_SYMBOL(drm_sched_add_msg);
+
+/**
+ * drm_sched_get_msg - get scheduler message
+ *
+ * @sched: scheduler instance
+ *
+ * Returns NULL or message
+ */
+static struct drm_sched_msg *
+drm_sched_get_msg(struct drm_gpu_scheduler *sched)
+{
+ struct drm_sched_msg *msg;
+
+ spin_lock(&sched->job_list_lock);
+ msg = list_first_entry_or_null(&sched->msgs,
+ struct drm_sched_msg, link);
+ if (msg)
+ list_del(&msg->link);
+ spin_unlock(&sched->job_list_lock);
+
+ return msg;
+}
+
/**
* drm_sched_main - main scheduler thread
*
@@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
container_of(w, struct drm_gpu_scheduler, work_run);
struct drm_sched_entity *entity;
struct drm_sched_job *cleanup_job;
+ struct drm_sched_msg *msg;
int r;
if (READ_ONCE(sched->pause_run_wq))
@@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
cleanup_job = drm_sched_get_cleanup_job(sched);
entity = drm_sched_select_entity(sched);
+ msg = drm_sched_get_msg(sched);
- if (!entity && !cleanup_job)
+ if (!entity && !cleanup_job && !msg)
return; /* No more work */
if (cleanup_job)
sched->ops->free_job(cleanup_job);
+ if (msg)
+ sched->ops->process_msg(msg);
if (entity) {
struct dma_fence *fence;
@@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
sched_job = drm_sched_entity_pop_job(entity);
if (!sched_job) {
complete_all(&entity->entity_idle);
- if (!cleanup_job)
+ if (!cleanup_job && !msg)
return; /* No more work */
goto again;
}
@@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
init_waitqueue_head(&sched->job_scheduled);
INIT_LIST_HEAD(&sched->pending_list);
+ INIT_LIST_HEAD(&sched->msgs);
spin_lock_init(&sched->job_list_lock);
atomic_set(&sched->hw_rq_count, 0);
INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index df1993dd44ae..267bd060d178 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
DRM_GPU_SCHED_STAT_ENODEV,
};
+/**
+ * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
+ * message
+ *
+ * Generic enough for backend defined messages; the backend can expand if needed.
+ */
+struct drm_sched_msg {
+ /** @link: list link into the gpu scheduler list of messages */
+ struct list_head link;
+ /**
+ * @private_data: opaque pointer to message private data (backend defined)
+ */
+ void *private_data;
+ /** @opcode: opcode of message (backend defined) */
+ unsigned int opcode;
+};
+
/**
* struct drm_sched_backend_ops - Define the backend operations
* called by the scheduler
@@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
* and it's time to clean it up.
*/
void (*free_job)(struct drm_sched_job *sched_job);
+
+ /**
+ * @process_msg: Process a message. Allowed to block, it is this
+ * function's responsibility to free message if dynamically allocated.
+ */
+ void (*process_msg)(struct drm_sched_msg *msg);
};
/**
@@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
* @timeout: the time after which a job is removed from the scheduler.
* @name: name of the ring for which this scheduler is being used.
* @sched_rq: priority wise array of run queues.
+ * @msgs: list of messages to be processed in @work_run
* @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
* waits on this wait queue until all the scheduled jobs are
* finished.
@@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
* @job_id_count: used to assign unique id to the each job.
* @run_wq: workqueue used to queue @work_run
* @timeout_wq: workqueue used to queue @work_tdr
- * @work_run: schedules jobs and cleans up entities
+ * @work_run: schedules jobs, cleans up jobs, and processes messages
* @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
* timeout interval is over.
* @pending_list: the list of jobs which are currently in the job queue.
@@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
long timeout;
const char *name;
struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
+ struct list_head msgs;
wait_queue_head_t job_scheduled;
atomic_t hw_rq_count;
atomic64_t job_id_count;
@@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
void drm_sched_job_cleanup(struct drm_sched_job *job);
void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
+void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
+ struct drm_sched_msg *msg);
void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
--
2.34.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-01 20:50 ` [PATCH 4/8] drm/sched: Add generic scheduler message interface Matthew Brost
@ 2023-08-03 8:53 ` Christian König
2023-08-03 8:58 ` Daniel Vetter
0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2023-08-03 8:53 UTC (permalink / raw)
To: Matthew Brost, dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, luben.tuikov, lina, donald.robson, boris.brezillon,
faith.ekstrand
Am 01.08.23 um 22:50 schrieb Matthew Brost:
> Add a generic scheduler message interface which sends messages to the
> backend from the drm_gpu_scheduler main submission thread. The idea is
> that some of these messages modify state in a drm_sched_entity which is
> also modified during submission. By scheduling these messages and
> submission in the same thread there is no race when changing state in
> the drm_sched_entity.
>
> This interface will be used in XE, the new Intel GPU driver, to clean up,
> suspend, resume, and change scheduling properties of a drm_sched_entity.
>
> The interface is designed to be generic and extensible, with only the
> backend understanding the messages.
I'm still strongly opposed to this.
If you need this functionality then let the drivers decide which
workqueue the scheduler should use.
When you then create a single threaded workqueue you can just submit work
to it and serialize this with the scheduler work.
This way we wouldn't duplicate this core kernel function inside the
scheduler.
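A rough sketch of this alternative, assuming the run_wq parameter from
patch 1 and the single entity policy from patch 3 (the exec queue structure
and helper names are illustrative):

/* One ordered workqueue per queue: at most one work item runs at a
 * time, so scheduler work and driver operations are serialized by
 * construction, with no scheduler-side message list. */
static int my_queue_init(struct my_exec_queue *q, struct device *dev)
{
	int err;

	q->submit_wq = alloc_ordered_workqueue("my-submit", 0);
	if (!q->submit_wq)
		return -ENOMEM;

	err = drm_sched_init(&q->sched, &my_fw_sched_ops, q->submit_wq,
			     q->num_hw_submission, 0, q->timeout, NULL,
			     NULL, q->name, DRM_SCHED_POLICY_SINGLE_ENTITY,
			     dev);
	if (err) {
		destroy_workqueue(q->submit_wq);
		return err;
	}

	INIT_WORK(&q->cleanup_work, my_cleanup_worker);
	return 0;
}

/* A driver operation that must not race with submission is then just
 * another work item on the same ordered workqueue. */
static void my_queue_cleanup_async(struct my_exec_queue *q)
{
	queue_work(q->submit_wq, &q->cleanup_work);
}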
Regards,
Christian.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-03 8:53 ` Christian König
@ 2023-08-03 8:58 ` Daniel Vetter
2023-08-03 9:35 ` Christian König
0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2023-08-03 8:58 UTC (permalink / raw)
To: Christian König
Cc: Matthew Brost, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, robdclark, intel-xe, faith.ekstrand
On Thu, 3 Aug 2023 at 10:53, Christian König <christian.koenig@amd.com> wrote:
>
> Am 01.08.23 um 22:50 schrieb Matthew Brost:
> > Add a generic scheduler message interface which sends messages to the
> > backend from the drm_gpu_scheduler main submission thread. The idea is
> > that some of these messages modify state in a drm_sched_entity which is
> > also modified during submission. By scheduling these messages and
> > submission in the same thread there is no race when changing state in
> > the drm_sched_entity.
> >
> > This interface will be used in XE, the new Intel GPU driver, to clean up,
> > suspend, resume, and change scheduling properties of a drm_sched_entity.
> >
> > The interface is designed to be generic and extensible, with only the
> > backend understanding the messages.
>
> I'm still strongly opposed to this.
>
> If you need this functionality then let the drivers decide which
> workqueue the scheduler should use.
>
> When you then create a single threaded workqueue you can just submit work
> to it and serialize this with the scheduler work.
>
> This way we wouldn't duplicate this core kernel function inside the
> scheduler.
Yeah that's essentially the design we picked for the tdr workers,
where some drivers have requirements that all tdr work must be done on
the same thread (because of cross-engine coordination issues). But
that would require that we rework the scheduler as a pile of
self-submitting work items, and I'm not sure that actually fits all
that well into the core workqueue interfaces either.
Worst case I think this isn't a dead-end and can be refactored to
internally use the workqueue services, with the new functions here
just being dumb wrappers until everyone is converted over. So it
doesn't look like an expensive mistake, if it turns out to be a
mistake.
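As a rough sketch of what such a dumb wrapper might look like (purely
hypothetical: this series keeps a sched->msgs list instead, and the work
and sched members of drm_sched_msg shown here do not exist in it):

    static void drm_sched_process_msg_work(struct work_struct *w)
    {
            struct drm_sched_msg *msg =
                    container_of(w, struct drm_sched_msg, work);

            msg->sched->ops->process_msg(msg);
    }

    void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
                           struct drm_sched_msg *msg)
    {
            msg->sched = sched;
            INIT_WORK(&msg->work, drm_sched_process_msg_work);
            queue_work(sched->run_wq, &msg->work);
    }

    /* Note: this only serializes messages with run_job()/free_job() when
     * run_wq is an ordered workqueue, which is the crux of the debate. */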
-Daniel
> Regards,
> Christian.
>
> >
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> > drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
> > include/drm/gpu_scheduler.h | 29 +++++++++++++-
> > 2 files changed, 78 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index 2597fb298733..84821a124ca2 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > }
> > EXPORT_SYMBOL(drm_sched_pick_best);
> >
> > +/**
> > + * drm_sched_add_msg - add scheduler message
> > + *
> > + * @sched: scheduler instance
> > + * @msg: message to be added
> > + *
> > + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
> > + * Message processing will stop if the scheduler run wq is stopped and resume
> > + * when the run wq is started.
> > + */
> > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > + struct drm_sched_msg *msg)
> > +{
> > + spin_lock(&sched->job_list_lock);
> > + list_add_tail(&msg->link, &sched->msgs);
> > + spin_unlock(&sched->job_list_lock);
> > +
> > + drm_sched_run_wq_queue(sched);
> > +}
> > +EXPORT_SYMBOL(drm_sched_add_msg);
> > +
> > +/**
> > + * drm_sched_get_msg - get scheduler message
> > + *
> > + * @sched: scheduler instance
> > + *
> > + * Returns NULL or message
> > + */
> > +static struct drm_sched_msg *
> > +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
> > +{
> > + struct drm_sched_msg *msg;
> > +
> > + spin_lock(&sched->job_list_lock);
> > + msg = list_first_entry_or_null(&sched->msgs,
> > + struct drm_sched_msg, link);
> > + if (msg)
> > + list_del(&msg->link);
> > + spin_unlock(&sched->job_list_lock);
> > +
> > + return msg;
> > +}
> > +
> > /**
> > * drm_sched_main - main scheduler thread
> > *
> > @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
> > container_of(w, struct drm_gpu_scheduler, work_run);
> > struct drm_sched_entity *entity;
> > struct drm_sched_job *cleanup_job;
> > + struct drm_sched_msg *msg;
> > int r;
> >
> > if (READ_ONCE(sched->pause_run_wq))
> > @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
> >
> > cleanup_job = drm_sched_get_cleanup_job(sched);
> > entity = drm_sched_select_entity(sched);
> > + msg = drm_sched_get_msg(sched);
> >
> > - if (!entity && !cleanup_job)
> > + if (!entity && !cleanup_job && !msg)
> > return; /* No more work */
> >
> > if (cleanup_job)
> > sched->ops->free_job(cleanup_job);
> > + if (msg)
> > + sched->ops->process_msg(msg);
> >
> > if (entity) {
> > struct dma_fence *fence;
> > @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
> > sched_job = drm_sched_entity_pop_job(entity);
> > if (!sched_job) {
> > complete_all(&entity->entity_idle);
> > - if (!cleanup_job)
> > + if (!cleanup_job && !msg)
> > return; /* No more work */
> > goto again;
> > }
> > @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >
> > init_waitqueue_head(&sched->job_scheduled);
> > INIT_LIST_HEAD(&sched->pending_list);
> > + INIT_LIST_HEAD(&sched->msgs);
> > spin_lock_init(&sched->job_list_lock);
> > atomic_set(&sched->hw_rq_count, 0);
> > INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index df1993dd44ae..267bd060d178 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
> > DRM_GPU_SCHED_STAT_ENODEV,
> > };
> >
> > +/**
> > + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
> > + * message
> > + *
> > + * Generic enough for backend defined messages, backend can expand if needed.
> > + */
> > +struct drm_sched_msg {
> > + /** @link: list link into the gpu scheduler list of messages */
> > + struct list_head link;
> > + /**
> > + * @private_data: opaque pointer to message private data (backend defined)
> > + */
> > + void *private_data;
> > + /** @opcode: opcode of message (backend defined) */
> > + unsigned int opcode;
> > +};
> > +
> > /**
> > * struct drm_sched_backend_ops - Define the backend operations
> > * called by the scheduler
> > @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
> > * and it's time to clean it up.
> > */
> > void (*free_job)(struct drm_sched_job *sched_job);
> > +
> > + /**
> > + * @process_msg: Process a message. Allowed to block, it is this
> > + * function's responsibility to free message if dynamically allocated.
> > + */
> > + void (*process_msg)(struct drm_sched_msg *msg);
> > };
> >
> > /**
> > @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
> > * @timeout: the time after which a job is removed from the scheduler.
> > * @name: name of the ring for which this scheduler is being used.
> > * @sched_rq: priority wise array of run queues.
> > + * @msgs: list of messages to be processed in @work_run
> > * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> > * waits on this wait queue until all the scheduled jobs are
> > * finished.
> > @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
> > * @job_id_count: used to assign unique id to the each job.
> > * @run_wq: workqueue used to queue @work_run
> > * @timeout_wq: workqueue used to queue @work_tdr
> > - * @work_run: schedules jobs and cleans up entities
> > + * @work_run: schedules jobs, cleans up jobs, and processes messages
> > * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > * timeout interval is over.
> > * @pending_list: the list of jobs which are currently in the job queue.
> > @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
> > long timeout;
> > const char *name;
> > struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> > + struct list_head msgs;
> > wait_queue_head_t job_scheduled;
> > atomic_t hw_rq_count;
> > atomic64_t job_id_count;
> > @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
> >
> > void drm_sched_job_cleanup(struct drm_sched_job *job);
> > void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > + struct drm_sched_msg *msg);
> > void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> > void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-03 8:58 ` Daniel Vetter
@ 2023-08-03 9:35 ` Christian König
2023-08-04 8:50 ` Daniel Vetter
0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2023-08-03 9:35 UTC (permalink / raw)
To: Daniel Vetter
Cc: Matthew Brost, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, robdclark, intel-xe, faith.ekstrand
Am 03.08.23 um 10:58 schrieb Daniel Vetter:
> On Thu, 3 Aug 2023 at 10:53, Christian König <christian.koenig@amd.com> wrote:
>> Am 01.08.23 um 22:50 schrieb Matthew Brost:
>>> Add a generic scheduler message interface which sends messages to the
>>> backend from the drm_gpu_scheduler main submission thread. The idea is
>>> that some of these messages modify state in drm_sched_entity which is
>>> also modified during submission. By scheduling these messages and
>>> submission in the same thread there is no race when changing state in
>>> drm_sched_entity.
>>>
>>> This interface will be used in XE, the new Intel GPU driver, to clean up,
>>> suspend, resume, and change scheduling properties of a drm_sched_entity.
>>>
>>> The interface is designed to be generic and extendable with only the
>>> backend understanding the messages.
>> I still strongly frown on this.
>>
>> If you need this functionality then let the drivers decide which
>> runqueue the scheduler should use.
>>
>> When you then create a single threaded runqueue you can just submit work
>> to it and serialize this with the scheduler work.
>>
>> This way we wouldn't duplicate this core kernel function inside the
>> scheduler.
> Yeah that's essentially the design we picked for the tdr workers,
> where some drivers have requirements that all tdr work must be done on
> the same thread (because of cross-engine coordination issues). But
> that would require that we rework the scheduler as a pile of
> self-submitting work items, and I'm not sure that actually fits all
> that well into the core workqueue interfaces either.
There were already patches floating around which did exactly that.
Last time I checked those were actually looking pretty good.
In addition to the message passing advantage, the real big issue with the
scheduler and 1 to 1 mapping is that we create a kernel thread for each
instance, which results in tons of overhead.
Just using a work item which is submitted to a work queue completely
avoids that.
Regards,
Christian.
>
> Worst case I think this isn't a dead-end and can be refactored to
> internally use the workqueue services, with the new functions here
> just being dumb wrappers until everyone is converted over. So it
> doesn't look like an expensive mistake, if it turns out to be a
> mistake.
> -Daniel
>
>
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>> drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
>>> include/drm/gpu_scheduler.h | 29 +++++++++++++-
>>> 2 files changed, 78 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 2597fb298733..84821a124ca2 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>> }
>>> EXPORT_SYMBOL(drm_sched_pick_best);
>>>
>>> +/**
>>> + * drm_sched_add_msg - add scheduler message
>>> + *
>>> + * @sched: scheduler instance
>>> + * @msg: message to be added
>>> + *
>>> + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
>>> + * Message processing will stop if the scheduler run wq is stopped and resume
>>> + * when the run wq is started.
>>> + */
>>> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
>>> + struct drm_sched_msg *msg)
>>> +{
>>> + spin_lock(&sched->job_list_lock);
>>> + list_add_tail(&msg->link, &sched->msgs);
>>> + spin_unlock(&sched->job_list_lock);
>>> +
>>> + drm_sched_run_wq_queue(sched);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_add_msg);
>>> +
>>> +/**
>>> + * drm_sched_get_msg - get scheduler message
>>> + *
>>> + * @sched: scheduler instance
>>> + *
>>> + * Returns NULL or message
>>> + */
>>> +static struct drm_sched_msg *
>>> +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
>>> +{
>>> + struct drm_sched_msg *msg;
>>> +
>>> + spin_lock(&sched->job_list_lock);
>>> + msg = list_first_entry_or_null(&sched->msgs,
>>> + struct drm_sched_msg, link);
>>> + if (msg)
>>> + list_del(&msg->link);
>>> + spin_unlock(&sched->job_list_lock);
>>> +
>>> + return msg;
>>> +}
>>> +
>>> /**
>>> * drm_sched_main - main scheduler thread
>>> *
>>> @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
>>> container_of(w, struct drm_gpu_scheduler, work_run);
>>> struct drm_sched_entity *entity;
>>> struct drm_sched_job *cleanup_job;
>>> + struct drm_sched_msg *msg;
>>> int r;
>>>
>>> if (READ_ONCE(sched->pause_run_wq))
>>> @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
>>>
>>> cleanup_job = drm_sched_get_cleanup_job(sched);
>>> entity = drm_sched_select_entity(sched);
>>> + msg = drm_sched_get_msg(sched);
>>>
>>> - if (!entity && !cleanup_job)
>>> + if (!entity && !cleanup_job && !msg)
>>> return; /* No more work */
>>>
>>> if (cleanup_job)
>>> sched->ops->free_job(cleanup_job);
>>> + if (msg)
>>> + sched->ops->process_msg(msg);
>>>
>>> if (entity) {
>>> struct dma_fence *fence;
>>> @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
>>> sched_job = drm_sched_entity_pop_job(entity);
>>> if (!sched_job) {
>>> complete_all(&entity->entity_idle);
>>> - if (!cleanup_job)
>>> + if (!cleanup_job && !msg)
>>> return; /* No more work */
>>> goto again;
>>> }
>>> @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>
>>> init_waitqueue_head(&sched->job_scheduled);
>>> INIT_LIST_HEAD(&sched->pending_list);
>>> + INIT_LIST_HEAD(&sched->msgs);
>>> spin_lock_init(&sched->job_list_lock);
>>> atomic_set(&sched->hw_rq_count, 0);
>>> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index df1993dd44ae..267bd060d178 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
>>> DRM_GPU_SCHED_STAT_ENODEV,
>>> };
>>>
>>> +/**
>>> + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
>>> + * message
>>> + *
>>> + * Generic enough for backend defined messages, backend can expand if needed.
>>> + */
>>> +struct drm_sched_msg {
>>> + /** @link: list link into the gpu scheduler list of messages */
>>> + struct list_head link;
>>> + /**
>>> + * @private_data: opaque pointer to message private data (backend defined)
>>> + */
>>> + void *private_data;
>>> + /** @opcode: opcode of message (backend defined) */
>>> + unsigned int opcode;
>>> +};
>>> +
>>> /**
>>> * struct drm_sched_backend_ops - Define the backend operations
>>> * called by the scheduler
>>> @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
>>> * and it's time to clean it up.
>>> */
>>> void (*free_job)(struct drm_sched_job *sched_job);
>>> +
>>> + /**
>>> + * @process_msg: Process a message. Allowed to block, it is this
>>> + * function's responsibility to free message if dynamically allocated.
>>> + */
>>> + void (*process_msg)(struct drm_sched_msg *msg);
>>> };
>>>
>>> /**
>>> @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
>>> * @timeout: the time after which a job is removed from the scheduler.
>>> * @name: name of the ring for which this scheduler is being used.
>>> * @sched_rq: priority wise array of run queues.
>>> + * @msgs: list of messages to be processed in @work_run
>>> * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>>> * waits on this wait queue until all the scheduled jobs are
>>> * finished.
>>> @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
>>> * @job_id_count: used to assign unique id to the each job.
>>> * @run_wq: workqueue used to queue @work_run
>>> * @timeout_wq: workqueue used to queue @work_tdr
>>> - * @work_run: schedules jobs and cleans up entities
>>> + * @work_run: schedules jobs, cleans up jobs, and processes messages
>>> * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>> * timeout interval is over.
>>> * @pending_list: the list of jobs which are currently in the job queue.
>>> @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
>>> long timeout;
>>> const char *name;
>>> struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>> + struct list_head msgs;
>>> wait_queue_head_t job_scheduled;
>>> atomic_t hw_rq_count;
>>> atomic64_t job_id_count;
>>> @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>>>
>>> void drm_sched_job_cleanup(struct drm_sched_job *job);
>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>>> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
>>> + struct drm_sched_msg *msg);
>>> void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
>>> void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
>>> void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-03 9:35 ` Christian König
@ 2023-08-04 8:50 ` Daniel Vetter
2023-08-04 14:13 ` Matthew Brost
0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2023-08-04 8:50 UTC (permalink / raw)
To: Christian König
Cc: Matthew Brost, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, robdclark, intel-xe, faith.ekstrand
On Thu, Aug 03, 2023 at 11:35:30AM +0200, Christian König wrote:
> Am 03.08.23 um 10:58 schrieb Daniel Vetter:
> > On Thu, 3 Aug 2023 at 10:53, Christian König <christian.koenig@amd.com> wrote:
> > > Am 01.08.23 um 22:50 schrieb Matthew Brost:
> > > > Add a generic scheduler message interface which sends messages to the
> > > > backend from the drm_gpu_scheduler main submission thread. The idea is
> > > > that some of these messages modify state in drm_sched_entity which is
> > > > also modified during submission. By scheduling these messages and
> > > > submission in the same thread there is no race when changing state in
> > > > drm_sched_entity.
> > > >
> > > > This interface will be used in XE, the new Intel GPU driver, to clean up,
> > > > suspend, resume, and change scheduling properties of a drm_sched_entity.
> > > >
> > > > The interface is designed to be generic and extendable with only the
> > > > backend understanding the messages.
> > > I still strongly frown on this.
> > >
> > > If you need this functionality then let the drivers decide which
> > > runqueue the scheduler should use.
> > >
> > > When you then create a single threaded runqueue you can just submit work
> > > to it and serialize this with the scheduler work.
> > >
> > > This way we wouldn't duplicate this core kernel function inside the
> > > scheduler.
> > Yeah that's essentially the design we picked for the tdr workers,
> > where some drivers have requirements that all tdr work must be done on
> > the same thread (because of cross-engine coordination issues). But
> > that would require that we rework the scheduler as a pile of
> > self-submitting work items, and I'm not sure that actually fits all
> > that well into the core workqueue interfaces either.
>
> There were already patches floating around which did exactly that.
>
> Last time I checked those were actually looking pretty good.
>
> In addition to the message passing advantage, the real big issue with the
> scheduler and 1 to 1 mapping is that we create a kernel thread for each
> instance, which results in tons of overhead.
>
> Just using a work item which is submitted to a work queue completely avoids
> that.
Hm I should have read the entire series first, since that does the
conversion still. Apologies for the confusion, and yeah we should be able
to just submit other work to the same wq with the first patch? And so
hand-rolling this infra here isn't needed at all?
Or what am I missing?
> Regards,
> Christian.
>
> >
> > Worst case I think this isn't a dead-end and can be refactored to
> > internally use the workqueue services, with the new functions here
> > just being dumb wrappers until everyone is converted over. So it
> > doesn't look like an expensive mistake, if it turns out to be a
> > mistake.
> > -Daniel
> >
> >
> > > Regards,
> > > Christian.
> > >
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > > drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
> > > > include/drm/gpu_scheduler.h | 29 +++++++++++++-
> > > > 2 files changed, 78 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > index 2597fb298733..84821a124ca2 100644
> > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > }
> > > > EXPORT_SYMBOL(drm_sched_pick_best);
> > > >
> > > > +/**
> > > > + * drm_sched_add_msg - add scheduler message
> > > > + *
> > > > + * @sched: scheduler instance
> > > > + * @msg: message to be added
> > > > + *
> > > > + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
> > > > + * Message processing will stop if the scheduler run wq is stopped and resume
> > > > + * when the run wq is started.
> > > > + */
> > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > + struct drm_sched_msg *msg)
> > > > +{
> > > > + spin_lock(&sched->job_list_lock);
> > > > + list_add_tail(&msg->link, &sched->msgs);
> > > > + spin_unlock(&sched->job_list_lock);
> > > > +
> > > > + drm_sched_run_wq_queue(sched);
> > > > +}
> > > > +EXPORT_SYMBOL(drm_sched_add_msg);
> > > > +
> > > > +/**
> > > > + * drm_sched_get_msg - get scheduler message
> > > > + *
> > > > + * @sched: scheduler instance
> > > > + *
> > > > + * Returns NULL or message
> > > > + */
> > > > +static struct drm_sched_msg *
> > > > +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > + struct drm_sched_msg *msg;
> > > > +
> > > > + spin_lock(&sched->job_list_lock);
> > > > + msg = list_first_entry_or_null(&sched->msgs,
> > > > + struct drm_sched_msg, link);
> > > > + if (msg)
> > > > + list_del(&msg->link);
> > > > + spin_unlock(&sched->job_list_lock);
> > > > +
> > > > + return msg;
> > > > +}
> > > > +
> > > > /**
> > > > * drm_sched_main - main scheduler thread
> > > > *
> > > > @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > container_of(w, struct drm_gpu_scheduler, work_run);
> > > > struct drm_sched_entity *entity;
> > > > struct drm_sched_job *cleanup_job;
> > > > + struct drm_sched_msg *msg;
> > > > int r;
> > > >
> > > > if (READ_ONCE(sched->pause_run_wq))
> > > > @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
> > > >
> > > > cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > entity = drm_sched_select_entity(sched);
> > > > + msg = drm_sched_get_msg(sched);
> > > >
> > > > - if (!entity && !cleanup_job)
> > > > + if (!entity && !cleanup_job && !msg)
> > > > return; /* No more work */
> > > >
> > > > if (cleanup_job)
> > > > sched->ops->free_job(cleanup_job);
> > > > + if (msg)
> > > > + sched->ops->process_msg(msg);
> > > >
> > > > if (entity) {
> > > > struct dma_fence *fence;
> > > > @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > sched_job = drm_sched_entity_pop_job(entity);
> > > > if (!sched_job) {
> > > > complete_all(&entity->entity_idle);
> > > > - if (!cleanup_job)
> > > > + if (!cleanup_job && !msg)
> > > > return; /* No more work */
> > > > goto again;
> > > > }
> > > > @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > >
> > > > init_waitqueue_head(&sched->job_scheduled);
> > > > INIT_LIST_HEAD(&sched->pending_list);
> > > > + INIT_LIST_HEAD(&sched->msgs);
> > > > spin_lock_init(&sched->job_list_lock);
> > > > atomic_set(&sched->hw_rq_count, 0);
> > > > INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > index df1993dd44ae..267bd060d178 100644
> > > > --- a/include/drm/gpu_scheduler.h
> > > > +++ b/include/drm/gpu_scheduler.h
> > > > @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
> > > > DRM_GPU_SCHED_STAT_ENODEV,
> > > > };
> > > >
> > > > +/**
> > > > + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
> > > > + * message
> > > > + *
> > > > + * Generic enough for backend defined messages, backend can expand if needed.
> > > > + */
> > > > +struct drm_sched_msg {
> > > > + /** @link: list link into the gpu scheduler list of messages */
> > > > + struct list_head link;
> > > > + /**
> > > > + * @private_data: opaque pointer to message private data (backend defined)
> > > > + */
> > > > + void *private_data;
> > > > + /** @opcode: opcode of message (backend defined) */
> > > > + unsigned int opcode;
> > > > +};
> > > > +
> > > > /**
> > > > * struct drm_sched_backend_ops - Define the backend operations
> > > > * called by the scheduler
> > > > @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
> > > > * and it's time to clean it up.
> > > > */
> > > > void (*free_job)(struct drm_sched_job *sched_job);
> > > > +
> > > > + /**
> > > > + * @process_msg: Process a message. Allowed to block, it is this
> > > > + * function's responsibility to free message if dynamically allocated.
> > > > + */
> > > > + void (*process_msg)(struct drm_sched_msg *msg);
> > > > };
> > > >
> > > > /**
> > > > @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
> > > > * @timeout: the time after which a job is removed from the scheduler.
> > > > * @name: name of the ring for which this scheduler is being used.
> > > > * @sched_rq: priority wise array of run queues.
> > > > + * @msgs: list of messages to be processed in @work_run
> > > > * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> > > > * waits on this wait queue until all the scheduled jobs are
> > > > * finished.
> > > > @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
> > > > * @job_id_count: used to assign unique id to the each job.
> > > > * @run_wq: workqueue used to queue @work_run
> > > > * @timeout_wq: workqueue used to queue @work_tdr
> > > > - * @work_run: schedules jobs and cleans up entities
> > > > + * @work_run: schedules jobs, cleans up jobs, and processes messages
> > > > * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > * timeout interval is over.
> > > > * @pending_list: the list of jobs which are currently in the job queue.
> > > > @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
> > > > long timeout;
> > > > const char *name;
> > > > struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> > > > + struct list_head msgs;
> > > > wait_queue_head_t job_scheduled;
> > > > atomic_t hw_rq_count;
> > > > atomic64_t job_id_count;
> > > > @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
> > > >
> > > > void drm_sched_job_cleanup(struct drm_sched_job *job);
> > > > void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > + struct drm_sched_msg *msg);
> > > > void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> > > > void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> > > > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
> >
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-04 8:50 ` Daniel Vetter
@ 2023-08-04 14:13 ` Matthew Brost
2023-08-07 15:46 ` Christian König
0 siblings, 1 reply; 24+ messages in thread
From: Matthew Brost @ 2023-08-04 14:13 UTC (permalink / raw)
To: Daniel Vetter
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, intel-xe, luben.tuikov, lina,
donald.robson, boris.brezillon, Christian König,
faith.ekstrand
On Fri, Aug 04, 2023 at 10:50:36AM +0200, Daniel Vetter wrote:
> On Thu, Aug 03, 2023 at 11:35:30AM +0200, Christian König wrote:
> > Am 03.08.23 um 10:58 schrieb Daniel Vetter:
> > > On Thu, 3 Aug 2023 at 10:53, Christian König <christian.koenig@amd.com> wrote:
> > > > Am 01.08.23 um 22:50 schrieb Matthew Brost:
> > > > > Add a generic scheduler message interface which sends messages to the
> > > > > backend from the drm_gpu_scheduler main submission thread. The idea is
> > > > > that some of these messages modify state in drm_sched_entity which is
> > > > > also modified during submission. By scheduling these messages and
> > > > > submission in the same thread there is no race when changing state in
> > > > > drm_sched_entity.
> > > > >
> > > > > This interface will be used in XE, the new Intel GPU driver, to clean up,
> > > > > suspend, resume, and change scheduling properties of a drm_sched_entity.
> > > > >
> > > > > The interface is designed to be generic and extendable with only the
> > > > > backend understanding the messages.
Christian / Daniel - I've read both of your comments and am having a hard
time parsing them. I do not really understand the issue with this patch
or exactly what is being suggested instead. Let's try to work through
this.
> > > > I still strongly frown on this.
> > > >
> > > > If you need this functionality then let the drivers decide which
> > > > runqueue the scheduler should use.
What do you mean by runqueue here? Do you mean 'struct
workqueue_struct'? The scheduler in this context is 'struct
drm_gpu_scheduler', right?
Yes, we have added this functionality in the first patch.
> > > >
> > > > When you then create a single threaded runqueue you can just submit work
> > > > to it and serialize this with the scheduler work.
> > > >
We don't want to use a single threaded workqueue_struct in Xe, we want
to use the system_wq as run_job() can be executed in parallel across
multiple entities (or drm_gpu_schedulers, as in Xe we have a 1 to 1
relationship between entity and scheduler). What we want is, at per
entity / scheduler granularity, to be able to communicate a message
synchronously into the backend (run_job / free_job not executing,
scheduler execution not paused for a reset).
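As a rough illustration of that synchronous usage (the Xe-side names here
are made up; only drm_sched_add_msg() and struct drm_sched_msg are from
this series), the caller can embed a completion in the message's private
data and have the backend's process_msg() complete it:

    #include <linux/completion.h>

    struct xe_sched_msg_data {
            struct completion done;
            void *payload;
    };

    static void xe_send_msg_sync(struct drm_gpu_scheduler *sched,
                                 unsigned int opcode, void *payload)
    {
            struct xe_sched_msg_data data = { .payload = payload };
            struct drm_sched_msg msg = {
                    .opcode = opcode,
                    .private_data = &data,
            };

            init_completion(&data.done);
            drm_sched_add_msg(sched, &msg);
            /* process_msg() is expected to call complete() on done */
            wait_for_completion(&data.done);
    }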
If I'm understanding what you're suggesting, in Xe we'd create an ordered
workqueue_struct per drm_gpu_scheduler and then queue messages on the
ordered workqueue_struct? This seems pretty messy to me as now we have
open coded a solution bypassing the scheduler, every drm_gpu_scheduler
creates its own workqueue_struct, and we'd also have to open code the
pausing of these messages for resets too.
IMO this is a pretty clean solution that follows the pattern of cleanup
jobs already in place.
> > > > This way we wouldn't duplicate this core kernel function inside the
> > > > scheduler.
> > > Yeah that's essentially the design we picked for the tdr workers,
> > > where some drivers have requirements that all tdr work must be done on
> > > the same thread (because of cross-engine coordination issues). But
> > > that would require that we rework the scheduler as a pile of
> > > self-submitting work items, and I'm not sure that actually fits all
> > > that well into the core workqueue interfaces either.
For ordering between TDRs firing on different drm_gpu_schedulers and
larger external resets (GT resets in Xe), an ordered workqueue_struct
makes sense. Here we are talking about ordering jobs and messages within
a single drm_gpu_scheduler. Using the main execution thread to do that
ordering makes sense in my opinion.
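A minimal sketch of that TDR-side arrangement (the gt pointer and wq name
are hypothetical Xe details, not from this series):

    /* One ordered wq per GT orders all per-scheduler TDR work items. */
    gt->tdr_wq = alloc_ordered_workqueue("xe-gt-tdr", 0);

    /* Each scheduler on the GT then passes gt->tdr_wq as the timeout_wq
     * argument of drm_sched_init(), so drm_sched_job_timedout() work
     * from different schedulers never runs concurrently, while in-band
     * job/message ordering stays with the scheduler itself. */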
> >
> > There were already patches floating around which did exactly that.
> >
> > Last time I checked those were actually looking pretty good.
> >
Link to patches for reference.
> > In addition to the message passing advantage, the real big issue with the
> > scheduler and 1 to 1 mapping is that we create a kernel thread for each
> > instance, which results in tons of overhead.
The first patch in the series switches from a kthread to a work queue;
that is still a good idea.
> >
> > Just using a work item which is submitted to a work queue completely avoids
> > that.
>
> Hm I should have read the entire series first, since that does the
> conversion still. Apologies for the confusion, and yeah we should be able
> to just submit other work to the same wq with the first patch? And so
> hand-rolling this infra here isn't needed at all?
>
I wouldn't call this hand rolling, rather it follows the pattern already
in place.
Matt
> Or what am I missing?
>
> > Regards,
> > Christian.
> >
> > >
> > > Worst case I think this isn't a dead-end and can be refactored to
> > > internally use the workqueue services, with the new functions here
> > > just being dumb wrappers until everyone is converted over. So it
> > > doesn't look like an expensive mistake, if it turns out to be a
> > > mistake.
> > > -Daniel
> > >
> > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > ---
> > > > > drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
> > > > > include/drm/gpu_scheduler.h | 29 +++++++++++++-
> > > > > 2 files changed, 78 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > index 2597fb298733..84821a124ca2 100644
> > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > }
> > > > > EXPORT_SYMBOL(drm_sched_pick_best);
> > > > >
> > > > > +/**
> > > > > + * drm_sched_add_msg - add scheduler message
> > > > > + *
> > > > > + * @sched: scheduler instance
> > > > > + * @msg: message to be added
> > > > > + *
> > > > > + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
> > > > > + * Message processing will stop if the scheduler run wq is stopped and resume
> > > > > + * when the run wq is started.
> > > > > + */
> > > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > > + struct drm_sched_msg *msg)
> > > > > +{
> > > > > + spin_lock(&sched->job_list_lock);
> > > > > + list_add_tail(&msg->link, &sched->msgs);
> > > > > + spin_unlock(&sched->job_list_lock);
> > > > > +
> > > > > + drm_sched_run_wq_queue(sched);
> > > > > +}
> > > > > +EXPORT_SYMBOL(drm_sched_add_msg);
> > > > > +
> > > > > +/**
> > > > > + * drm_sched_get_msg - get scheduler message
> > > > > + *
> > > > > + * @sched: scheduler instance
> > > > > + *
> > > > > + * Returns NULL or message
> > > > > + */
> > > > > +static struct drm_sched_msg *
> > > > > +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
> > > > > +{
> > > > > + struct drm_sched_msg *msg;
> > > > > +
> > > > > + spin_lock(&sched->job_list_lock);
> > > > > + msg = list_first_entry_or_null(&sched->msgs,
> > > > > + struct drm_sched_msg, link);
> > > > > + if (msg)
> > > > > + list_del(&msg->link);
> > > > > + spin_unlock(&sched->job_list_lock);
> > > > > +
> > > > > + return msg;
> > > > > +}
> > > > > +
> > > > > /**
> > > > > * drm_sched_main - main scheduler thread
> > > > > *
> > > > > @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > container_of(w, struct drm_gpu_scheduler, work_run);
> > > > > struct drm_sched_entity *entity;
> > > > > struct drm_sched_job *cleanup_job;
> > > > > + struct drm_sched_msg *msg;
> > > > > int r;
> > > > >
> > > > > if (READ_ONCE(sched->pause_run_wq))
> > > > > @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
> > > > >
> > > > > cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > entity = drm_sched_select_entity(sched);
> > > > > + msg = drm_sched_get_msg(sched);
> > > > >
> > > > > - if (!entity && !cleanup_job)
> > > > > + if (!entity && !cleanup_job && !msg)
> > > > > return; /* No more work */
> > > > >
> > > > > if (cleanup_job)
> > > > > sched->ops->free_job(cleanup_job);
> > > > > + if (msg)
> > > > > + sched->ops->process_msg(msg);
> > > > >
> > > > > if (entity) {
> > > > > struct dma_fence *fence;
> > > > > @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > sched_job = drm_sched_entity_pop_job(entity);
> > > > > if (!sched_job) {
> > > > > complete_all(&entity->entity_idle);
> > > > > - if (!cleanup_job)
> > > > > + if (!cleanup_job && !msg)
> > > > > return; /* No more work */
> > > > > goto again;
> > > > > }
> > > > > @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > >
> > > > > init_waitqueue_head(&sched->job_scheduled);
> > > > > INIT_LIST_HEAD(&sched->pending_list);
> > > > > + INIT_LIST_HEAD(&sched->msgs);
> > > > > spin_lock_init(&sched->job_list_lock);
> > > > > atomic_set(&sched->hw_rq_count, 0);
> > > > > INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > index df1993dd44ae..267bd060d178 100644
> > > > > --- a/include/drm/gpu_scheduler.h
> > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
> > > > > DRM_GPU_SCHED_STAT_ENODEV,
> > > > > };
> > > > >
> > > > > +/**
> > > > > + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
> > > > > + * message
> > > > > + *
> > > > > + * Generic enough for backend defined messages, backend can expand if needed.
> > > > > + */
> > > > > +struct drm_sched_msg {
> > > > > + /** @link: list link into the gpu scheduler list of messages */
> > > > > + struct list_head link;
> > > > > + /**
> > > > > + * @private_data: opaque pointer to message private data (backend defined)
> > > > > + */
> > > > > + void *private_data;
> > > > > + /** @opcode: opcode of message (backend defined) */
> > > > > + unsigned int opcode;
> > > > > +};
> > > > > +
> > > > > /**
> > > > > * struct drm_sched_backend_ops - Define the backend operations
> > > > > * called by the scheduler
> > > > > @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
> > > > > * and it's time to clean it up.
> > > > > */
> > > > > void (*free_job)(struct drm_sched_job *sched_job);
> > > > > +
> > > > > + /**
> > > > > + * @process_msg: Process a message. Allowed to block, it is this
> > > > > + * function's responsibility to free message if dynamically allocated.
> > > > > + */
> > > > > + void (*process_msg)(struct drm_sched_msg *msg);
> > > > > };
> > > > >
> > > > > /**
> > > > > @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
> > > > > * @timeout: the time after which a job is removed from the scheduler.
> > > > > * @name: name of the ring for which this scheduler is being used.
> > > > > * @sched_rq: priority wise array of run queues.
> > > > > + * @msgs: list of messages to be processed in @work_run
> > > > > * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> > > > > * waits on this wait queue until all the scheduled jobs are
> > > > > * finished.
> > > > > @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
> > > > > * @job_id_count: used to assign unique id to the each job.
> > > > > * @run_wq: workqueue used to queue @work_run
> > > > > * @timeout_wq: workqueue used to queue @work_tdr
> > > > > - * @work_run: schedules jobs and cleans up entities
> > > > > + * @work_run: schedules jobs, cleans up jobs, and processes messages
> > > > > * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > * timeout interval is over.
> > > > > * @pending_list: the list of jobs which are currently in the job queue.
> > > > > @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
> > > > > long timeout;
> > > > > const char *name;
> > > > > struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> > > > > + struct list_head msgs;
> > > > > wait_queue_head_t job_scheduled;
> > > > > atomic_t hw_rq_count;
> > > > > atomic64_t job_id_count;
> > > > > @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
> > > > >
> > > > > void drm_sched_job_cleanup(struct drm_sched_job *job);
> > > > > void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> > > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > > + struct drm_sched_msg *msg);
> > > > > void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> > > > > void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> > > > > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
> > >
> >
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-04 14:13 ` Matthew Brost
@ 2023-08-07 15:46 ` Christian König
2023-08-08 14:06 ` Matthew Brost
0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2023-08-07 15:46 UTC (permalink / raw)
To: Matthew Brost, Daniel Vetter
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, intel-xe, faith.ekstrand
Am 04.08.23 um 16:13 schrieb Matthew Brost:
> [SNIP]
> > Christian / Daniel - I've read both of your comments and am having a hard
> > time parsing them. I do not really understand the issue with this patch
> or exactly what is being suggested instead. Let's try to work through
> this.
>
>>>>> I still strongly frown on this.
>>>>>
>>>>> If you need this functionality then let the drivers decide which
>>>>> runqueue the scheduler should use.
> What do you mean by runqueue here? Do you mean 'struct
> workqueue_struct'? The scheduler in this context is 'struct
> drm_gpu_scheduler', right?
Sorry for the confusing wording, your understanding is correct.
> Yes, we have added this functionality in the first patch.
>
>>>>> When you then create a single threaded runqueue you can just submit work
>>>>> to it and serialize this with the scheduler work.
>>>>>
> We don't want to use a single threaded workqueue_struct in Xe, we want
> to use the system_wq as run_job() can be executed in parallel across
> multiple entities (or drm_gpu_schedulers, as in Xe we have a 1 to 1
> relationship between entity and scheduler). What we want is, at per
> entity / scheduler granularity, to be able to communicate a message
> synchronously into the backend (run_job / free_job not executing,
> scheduler execution not paused for a reset).
>
> If I'm understanding what you're suggesting, in Xe we'd create an ordered
> workqueue_struct per drm_gpu_scheduler and then queue messages on the
> ordered workqueue_struct?
Yes, correct.
> This seems pretty messy to me as now we have
> open coded a solution bypassing the scheduler, every drm_gpu_scheduler
> creates its own workqueue_struct, and we'd also have to open code the
> pausing of these messages for resets too.
>
> IMO this is a pretty clean solution that follows the pattern of cleanup
> jobs already in place.
Yeah, exactly, that's the point. Moving the job cleanup into the
scheduler thread is seen as a very, very bad idea by me.
And I really don't want to exercise that again for different use cases.
>
>>>>> This way we wouldn't duplicate this core kernel function inside the
>>>>> scheduler.
>>>> Yeah that's essentially the design we picked for the tdr workers,
>>>> where some drivers have requirements that all tdr work must be done on
>>>> the same thread (because of cross-engine coordination issues). But
>>>> that would require that we rework the scheduler as a pile of
>>>> self-submitting work items, and I'm not sure that actually fits all
>>>> that well into the core workqueue interfaces either.
> For ordering between TDRs firing on different drm_gpu_schedulers and
> larger external resets (GT resets in Xe), an ordered workqueue_struct
> makes sense. Here we are talking about ordering jobs and messages within
> a single drm_gpu_scheduler. Using the main execution thread to do that
> ordering makes sense in my opinion.
I completely disagree with that.
Take a look at how this came to be. This is a very, very ugly hack, and we
already had a hard time making lockdep understand the different fence
signaling dependencies with freeing the job, and I'm pretty sure that is
still not 100% correct.
>
>>> There were already patches floating around which did exactly that.
>>>
>>> Last time I checked those were actually looking pretty good.
>>>
> Link to patches for reference.
>
>>> In addition to the message passing advantage, the real big issue with the
>>> scheduler and 1 to 1 mapping is that we create a kernel thread for each
>>> instance, which results in tons of overhead.
> The first patch in the series switches from a kthread to a work queue;
> that is still a good idea.
This was the patch I was referring to. Sorry, I didn't remember that this
was in the same patch set.
>
>>> Just using a work item which is submitted to a work queue completely avoids
>>> that.
>> Hm I should have read the entire series first, since that does the
>> conversion still. Apologies for the confusion, and yeah we should be able
>> to just submit other work to the same wq with the first patch? And so
>> hand-rolling this infra here isn't needed at all?
>>
> I wouldn't call this hand rolling, rather it follows the pattern already
> in place.
Basically workqueues are the in-kernel infrastructure for exactly this
use case, and we are trying to re-create that here, which is usually a
rather bad idea.
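A minimal sketch of that pure-workqueue shape (hypothetical Xe names,
including the xe_handle_msg() handler; no new scheduler infrastructure
at all):

    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct xe_msg {
            struct work_struct work;
            unsigned int opcode;    /* backend defined */
            void *data;
    };

    static void xe_msg_fn(struct work_struct *w)
    {
            struct xe_msg *msg = container_of(w, struct xe_msg, work);

            xe_handle_msg(msg->opcode, msg->data);  /* made-up handler */
            kfree(msg);
    }

    /* Queued on the scheduler's ordered wq, the message serializes with
     * run_job()/free_job() without a sched->msgs list or process_msg op. */
    INIT_WORK(&msg->work, xe_msg_fn);
    queue_work(sched_ordered_wq, &msg->work);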
Regards,
Christian.
>
> Matt
>
>> Or what am I missing?
>>
>>> Regards,
>>> Christian.
>>>
>>>> Worst case I think this isn't a dead-end and can be refactored to
>>>> internally use the workqueue services, with the new functions here
>>>> just being dumb wrappers until everyone is converted over. So it
>>>> doesn't look like an expensive mistake, if it turns out to be a
>>>> mistake.
>>>> -Daniel
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
>>>>>> include/drm/gpu_scheduler.h | 29 +++++++++++++-
>>>>>> 2 files changed, 78 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> index 2597fb298733..84821a124ca2 100644
>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>> @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>>>> }
>>>>>> EXPORT_SYMBOL(drm_sched_pick_best);
>>>>>>
>>>>>> +/**
>>>>>> + * drm_sched_add_msg - add scheduler message
>>>>>> + *
>>>>>> + * @sched: scheduler instance
>>>>>> + * @msg: message to be added
>>>>>> + *
>>>>>> + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
>>>>>> + * Message processing will stop if the scheduler run wq is stopped and resume
>>>>>> + * when the run wq is started.
>>>>>> + */
>>>>>> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
>>>>>> + struct drm_sched_msg *msg)
>>>>>> +{
>>>>>> + spin_lock(&sched->job_list_lock);
>>>>>> + list_add_tail(&msg->link, &sched->msgs);
>>>>>> + spin_unlock(&sched->job_list_lock);
>>>>>> +
>>>>>> + drm_sched_run_wq_queue(sched);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_sched_add_msg);
>>>>>> +
>>>>>> +/**
>>>>>> + * drm_sched_get_msg - get scheduler message
>>>>>> + *
>>>>>> + * @sched: scheduler instance
>>>>>> + *
>>>>>> + * Returns NULL or message
>>>>>> + */
>>>>>> +static struct drm_sched_msg *
>>>>>> +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
>>>>>> +{
>>>>>> + struct drm_sched_msg *msg;
>>>>>> +
>>>>>> + spin_lock(&sched->job_list_lock);
>>>>>> + msg = list_first_entry_or_null(&sched->msgs,
>>>>>> + struct drm_sched_msg, link);
>>>>>> + if (msg)
>>>>>> + list_del(&msg->link);
>>>>>> + spin_unlock(&sched->job_list_lock);
>>>>>> +
>>>>>> + return msg;
>>>>>> +}
>>>>>> +
>>>>>> /**
>>>>>> * drm_sched_main - main scheduler thread
>>>>>> *
>>>>>> @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>> container_of(w, struct drm_gpu_scheduler, work_run);
>>>>>> struct drm_sched_entity *entity;
>>>>>> struct drm_sched_job *cleanup_job;
>>>>>> + struct drm_sched_msg *msg;
>>>>>> int r;
>>>>>>
>>>>>> if (READ_ONCE(sched->pause_run_wq))
>>>>>> @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>
>>>>>> cleanup_job = drm_sched_get_cleanup_job(sched);
>>>>>> entity = drm_sched_select_entity(sched);
>>>>>> + msg = drm_sched_get_msg(sched);
>>>>>>
>>>>>> - if (!entity && !cleanup_job)
>>>>>> + if (!entity && !cleanup_job && !msg)
>>>>>> return; /* No more work */
>>>>>>
>>>>>> if (cleanup_job)
>>>>>> sched->ops->free_job(cleanup_job);
>>>>>> + if (msg)
>>>>>> + sched->ops->process_msg(msg);
>>>>>>
>>>>>> if (entity) {
>>>>>> struct dma_fence *fence;
>>>>>> @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>> sched_job = drm_sched_entity_pop_job(entity);
>>>>>> if (!sched_job) {
>>>>>> complete_all(&entity->entity_idle);
>>>>>> - if (!cleanup_job)
>>>>>> + if (!cleanup_job && !msg)
>>>>>> return; /* No more work */
>>>>>> goto again;
>>>>>> }
>>>>>> @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>>>
>>>>>> init_waitqueue_head(&sched->job_scheduled);
>>>>>> INIT_LIST_HEAD(&sched->pending_list);
>>>>>> + INIT_LIST_HEAD(&sched->msgs);
>>>>>> spin_lock_init(&sched->job_list_lock);
>>>>>> atomic_set(&sched->hw_rq_count, 0);
>>>>>> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>>> index df1993dd44ae..267bd060d178 100644
>>>>>> --- a/include/drm/gpu_scheduler.h
>>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>>> @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
>>>>>> DRM_GPU_SCHED_STAT_ENODEV,
>>>>>> };
>>>>>>
>>>>>> +/**
>>>>>> + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
>>>>>> + * message
>>>>>> + *
>>>>>> + * Generic enough for backend defined messages, backend can expand if needed.
>>>>>> + */
>>>>>> +struct drm_sched_msg {
>>>>>> + /** @link: list link into the gpu scheduler list of messages */
>>>>>> + struct list_head link;
>>>>>> + /**
>>>>>> + * @private_data: opaque pointer to message private data (backend defined)
>>>>>> + */
>>>>>> + void *private_data;
>>>>>> + /** @opcode: opcode of message (backend defined) */
>>>>>> + unsigned int opcode;
>>>>>> +};
>>>>>> +
>>>>>> /**
>>>>>> * struct drm_sched_backend_ops - Define the backend operations
>>>>>> * called by the scheduler
>>>>>> @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
>>>>>> * and it's time to clean it up.
>>>>>> */
>>>>>> void (*free_job)(struct drm_sched_job *sched_job);
>>>>>> +
>>>>>> + /**
>>>>>> + * @process_msg: Process a message. Allowed to block, it is this
>>>>>> + * function's responsibility to free message if dynamically allocated.
>>>>>> + */
>>>>>> + void (*process_msg)(struct drm_sched_msg *msg);
>>>>>> };
>>>>>>
>>>>>> /**
>>>>>> @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
>>>>>> * @timeout: the time after which a job is removed from the scheduler.
>>>>>> * @name: name of the ring for which this scheduler is being used.
>>>>>> * @sched_rq: priority wise array of run queues.
>>>>>> + * @msgs: list of messages to be processed in @work_run
>>>>>> * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>>>>>> * waits on this wait queue until all the scheduled jobs are
>>>>>> * finished.
>>>>>> @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
>>>>>> * @job_id_count: used to assign unique id to the each job.
>>>>>> * @run_wq: workqueue used to queue @work_run
>>>>>> * @timeout_wq: workqueue used to queue @work_tdr
>>>>>> - * @work_run: schedules jobs and cleans up entities
>>>>>> + * @work_run: schedules jobs, cleans up jobs, and processes messages
>>>>>> * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>>>> * timeout interval is over.
>>>>>> * @pending_list: the list of jobs which are currently in the job queue.
>>>>>> @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
>>>>>> long timeout;
>>>>>> const char *name;
>>>>>> struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>>>>> + struct list_head msgs;
>>>>>> wait_queue_head_t job_scheduled;
>>>>>> atomic_t hw_rq_count;
>>>>>> atomic64_t job_id_count;
>>>>>> @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>>>>>>
>>>>>> void drm_sched_job_cleanup(struct drm_sched_job *job);
>>>>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>>>>>> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
>>>>>> + struct drm_sched_msg *msg);
>>>>>> void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
>>>>>> void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
>>>>>> void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-07 15:46 ` Christian König
@ 2023-08-08 14:06 ` Matthew Brost
2023-08-08 14:14 ` Christian König
0 siblings, 1 reply; 24+ messages in thread
From: Matthew Brost @ 2023-08-08 14:06 UTC (permalink / raw)
To: Christian König
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, intel-xe, faith.ekstrand
On Mon, Aug 07, 2023 at 05:46:16PM +0200, Christian König wrote:
> Am 04.08.23 um 16:13 schrieb Matthew Brost:
> > [SNIP]
> > Christian / Daniel - I've read both of your comments and am having a hard
> > time parsing them. I do not really understand the issue with this patch
> > or exactly what is being suggested instead. Let's try to work through
> > this.
> >
> > > > > > I still strongly frown on this.
> > > > > >
> > > > > > If you need this functionality then let the drivers decide which
> > > > > > runqueue the scheduler should use.
> > What do you mean by runqueue here? Do you mean 'struct
> > workqueue_struct'? The scheduler in this context is 'struct
> > drm_gpu_scheduler', right?
>
> Sorry for the confusing wording, your understanding is correct.
>
> > Yes, we have added this functionality in the first patch.
> >
> > > > > > When you then create a single threaded runqueue you can just submit work
> > > > > > to it and serialize this with the scheduler work.
> > > > > >
> > We don't want to use a single threaded workqueue_struct in Xe, we want
> > to use the system_wq as run_job() can be executed in parallel across
> > multiple entities (or drm_gpu_schedulers, as in Xe we have a 1 to 1
> > relationship between entity and scheduler). What we want is, at per
> > entity / scheduler granularity, to be able to communicate a message
> > synchronously into the backend (run_job / free_job not executing,
> > scheduler execution not paused for a reset).
> >
> > If I'm understanding what you're suggesting, in Xe we'd create an ordered
> > workqueue_struct per drm_gpu_scheduler and then queue messages on the
> > ordered workqueue_struct?
>
> Yes, correct.
>
> > This seems pretty messy to me as now we have
> > open coded a solution bypassing the scheduler, every drm_gpu_scheduler
> > creates its own workqueue_struct, and we'd also have to open code the
> > pausing of these messages for resets too.
> >
> > IMO this is a pretty clean solution that follows the pattern of cleanup
> > jobs already in place.
>
> Yeah, exactly, that's the point. Moving the job cleanup into the scheduler
> thread is seen as a very, very bad idea by me.
>
> And I really don't want to exercise that again for different use cases.
>
> >
> > > > > > This way we wouldn't duplicate this core kernel function inside the
> > > > > > scheduler.
> > > > > Yeah that's essentially the design we picked for the tdr workers,
> > > > > where some drivers have requirements that all tdr work must be done on
> > > > > the same thread (because of cross-engine coordination issues). But
> > > > > that would require that we rework the scheduler as a pile of
> > > > > self-submitting work items, and I'm not sure that actually fits all
> > > > > that well into the core workqueue interfaces either.
> > For ordering between TDRs firing on different drm_gpu_schedulers and
> > larger external resets (GT resets in Xe), an ordered workqueue_struct
> > makes sense. Here we are talking about ordering jobs and messages within
> > a single drm_gpu_scheduler. Using the main execution thread to do that
> > ordering makes sense in my opinion.
>
> I completely disagree with that.
>
> Take a look at how this came to be. This is a very, very ugly hack, and we
> already had a hard time making lockdep understand the different fence
> signaling dependencies with freeing the job, and I'm pretty sure that is
> still not 100% correct.
>
> >
> > > > There were already patches floating around which did exactly that.
> > > >
> > > > Last time I checked those were actually looking pretty good.
> > > >
> > Link to patches for reference.
> >
> > > > In addition to the message passing advantage, the real big issue with the
> > > > scheduler and 1 to 1 mapping is that we create a kernel thread for each
> > > > instance, which results in tons of overhead.
> > The first patch in the series switches from a kthread to a work queue;
> > that is still a good idea.
>
> This was the patch I was referring to. Sorry, I didn't remember that this
> was in the same patch set.
>
> >
> > > > Just using a work item which is submitted to a work queue completely avoids
> > > > that.
> > > Hm, I should have read the entire series first, since it still does the
> > > conversion. Apologies for the confusion. And yeah, we should be able
> > > to just submit other work to the same wq with the first patch? And so
> > > hand-rolling this infra here isn't needed at all?
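> > >
> > > Something like this, I mean (a sketch only, assuming the run_wq
> > > argument that patch 1 adds to drm_sched_init(); driver_msg_work is a
> > > made-up driver work item):
> > >
> > > 	struct workqueue_struct *wq = alloc_ordered_workqueue("xe-sched", 0);
> > >
> > > 	/* wq is handed to drm_sched_init() as the run_wq from patch 1,
> > > 	 * so any driver work queued on it is serialized with the
> > > 	 * scheduler's submission / cleanup work:
> > > 	 */
> > > 	queue_work(wq, &driver_msg_work);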
> > >
> > I wouldn't call this hand rolling; rather, it follows the pattern
> > already in place.
>
> Basically, workqueues are the in-kernel infrastructure for exactly that
> use case; we are trying to re-create that here, and that is usually a
> rather bad idea.
>
Ok, let me play around with what this would look like in Xe. What you are
suggesting would be an ordered wq per scheduler, with a work item for run
job, a work item for job cleanup, and a work item for a message. That
might work, I suppose? The only issue I see is scaling, as this exposes
ordered-wq creation directly to an IOCTL. No idea if that is actually a
concern though.
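A rough sketch of that shape, to make sure we are talking about the same
thing (hypothetical names, not actual Xe code):

	static void xe_sched_run_job_work(struct work_struct *w);
	static void xe_sched_free_job_work(struct work_struct *w);

	struct xe_sched {
		struct workqueue_struct *wq;	/* ordered: one item at a time */
		struct work_struct run_job_w;
		struct work_struct free_job_w;
	};

	static int xe_sched_create(struct xe_sched *sched, const char *name)
	{
		sched->wq = alloc_ordered_workqueue("%s", 0, name);
		if (!sched->wq)
			return -ENOMEM;

		INIT_WORK(&sched->run_job_w, xe_sched_run_job_work);
		INIT_WORK(&sched->free_job_w, xe_sched_free_job_work);
		return 0;
	}

	/* A message would then just be another work item queued on the same
	 * ordered wq, naturally serialized with run/free job processing.
	 */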
Matt
> Regards,
> Christian.
>
> >
> > Matt
> >
> > > Or what am I missing?
> > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > > Worst case I think this isn't a dead-end and can be refactored to
> > > > > internally use the workqueue services, with the new functions here
> > > > > just being dumb wrappers until everyone is converted over. So it
> > > > > doesn't look like an expensive mistake, if it turns out to be a
> > > > > mistake.
> > > > > -Daniel
> > > > >
> > > > >
> > > > > > Regards,
> > > > > > Christian.
> > > > > >
> > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > ---
> > > > > > > drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
> > > > > > > include/drm/gpu_scheduler.h | 29 +++++++++++++-
> > > > > > > 2 files changed, 78 insertions(+), 3 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > index 2597fb298733..84821a124ca2 100644
> > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > }
> > > > > > > EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > >
> > > > > > > +/**
> > > > > > > + * drm_sched_add_msg - add scheduler message
> > > > > > > + *
> > > > > > > + * @sched: scheduler instance
> > > > > > > + * @msg: message to be added
> > > > > > > + *
> > > > > > > + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
> > > > > > > + * Message processing will stop if the scheduler run wq is stopped and resume
> > > > > > > + * when the run wq is started.
> > > > > > > + */
> > > > > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > > > > + struct drm_sched_msg *msg)
> > > > > > > +{
> > > > > > > + spin_lock(&sched->job_list_lock);
> > > > > > > + list_add_tail(&msg->link, &sched->msgs);
> > > > > > > + spin_unlock(&sched->job_list_lock);
> > > > > > > +
> > > > > > > + drm_sched_run_wq_queue(sched);
> > > > > > > +}
> > > > > > > +EXPORT_SYMBOL(drm_sched_add_msg);
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * drm_sched_get_msg - get scheduler message
> > > > > > > + *
> > > > > > > + * @sched: scheduler instance
> > > > > > > + *
> > > > > > > + * Returns NULL or message
> > > > > > > + */
> > > > > > > +static struct drm_sched_msg *
> > > > > > > +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
> > > > > > > +{
> > > > > > > + struct drm_sched_msg *msg;
> > > > > > > +
> > > > > > > + spin_lock(&sched->job_list_lock);
> > > > > > > + msg = list_first_entry_or_null(&sched->msgs,
> > > > > > > + struct drm_sched_msg, link);
> > > > > > > + if (msg)
> > > > > > > + list_del(&msg->link);
> > > > > > > + spin_unlock(&sched->job_list_lock);
> > > > > > > +
> > > > > > > + return msg;
> > > > > > > +}
> > > > > > > +
> > > > > > > /**
> > > > > > > * drm_sched_main - main scheduler thread
> > > > > > > *
> > > > > > > @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > container_of(w, struct drm_gpu_scheduler, work_run);
> > > > > > > struct drm_sched_entity *entity;
> > > > > > > struct drm_sched_job *cleanup_job;
> > > > > > > + struct drm_sched_msg *msg;
> > > > > > > int r;
> > > > > > >
> > > > > > > if (READ_ONCE(sched->pause_run_wq))
> > > > > > > @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > >
> > > > > > > cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > entity = drm_sched_select_entity(sched);
> > > > > > > + msg = drm_sched_get_msg(sched);
> > > > > > >
> > > > > > > - if (!entity && !cleanup_job)
> > > > > > > + if (!entity && !cleanup_job && !msg)
> > > > > > > return; /* No more work */
> > > > > > >
> > > > > > > if (cleanup_job)
> > > > > > > sched->ops->free_job(cleanup_job);
> > > > > > > + if (msg)
> > > > > > > + sched->ops->process_msg(msg);
> > > > > > >
> > > > > > > if (entity) {
> > > > > > > struct dma_fence *fence;
> > > > > > > @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > if (!sched_job) {
> > > > > > > complete_all(&entity->entity_idle);
> > > > > > > - if (!cleanup_job)
> > > > > > > + if (!cleanup_job && !msg)
> > > > > > > return; /* No more work */
> > > > > > > goto again;
> > > > > > > }
> > > > > > > @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > >
> > > > > > > init_waitqueue_head(&sched->job_scheduled);
> > > > > > > INIT_LIST_HEAD(&sched->pending_list);
> > > > > > > + INIT_LIST_HEAD(&sched->msgs);
> > > > > > > spin_lock_init(&sched->job_list_lock);
> > > > > > > atomic_set(&sched->hw_rq_count, 0);
> > > > > > > INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > index df1993dd44ae..267bd060d178 100644
> > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
> > > > > > > DRM_GPU_SCHED_STAT_ENODEV,
> > > > > > > };
> > > > > > >
> > > > > > > +/**
> > > > > > > + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
> > > > > > > + * message
> > > > > > > + *
> > > > > > > + * Generic enough for backend defined messages, backend can expand if needed.
> > > > > > > + */
> > > > > > > +struct drm_sched_msg {
> > > > > > > + /** @link: list link into the gpu scheduler list of messages */
> > > > > > > + struct list_head link;
> > > > > > > + /**
> > > > > > > + * @private_data: opaque pointer to message private data (backend defined)
> > > > > > > + */
> > > > > > > + void *private_data;
> > > > > > > + /** @opcode: opcode of message (backend defined) */
> > > > > > > + unsigned int opcode;
> > > > > > > +};
> > > > > > > +
> > > > > > > /**
> > > > > > > * struct drm_sched_backend_ops - Define the backend operations
> > > > > > > * called by the scheduler
> > > > > > > @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
> > > > > > > * and it's time to clean it up.
> > > > > > > */
> > > > > > > void (*free_job)(struct drm_sched_job *sched_job);
> > > > > > > +
> > > > > > > + /**
> > > > > > > + * @process_msg: Process a message. Allowed to block, it is this
> > > > > > > + * function's responsibility to free message if dynamically allocated.
> > > > > > > + */
> > > > > > > + void (*process_msg)(struct drm_sched_msg *msg);
> > > > > > > };
> > > > > > >
> > > > > > > /**
> > > > > > > @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
> > > > > > > * @timeout: the time after which a job is removed from the scheduler.
> > > > > > > * @name: name of the ring for which this scheduler is being used.
> > > > > > > * @sched_rq: priority wise array of run queues.
> > > > > > > + * @msgs: list of messages to be processed in @work_run
> > > > > > > * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> > > > > > > * waits on this wait queue until all the scheduled jobs are
> > > > > > > * finished.
> > > > > > > @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
> > > > > > > * @job_id_count: used to assign unique id to the each job.
> > > > > > > * @run_wq: workqueue used to queue @work_run
> > > > > > > * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > - * @work_run: schedules jobs and cleans up entities
> > > > > > > + * @work_run: schedules jobs, cleans up jobs, and processes messages
> > > > > > > * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > * timeout interval is over.
> > > > > > > * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
> > > > > > > long timeout;
> > > > > > > const char *name;
> > > > > > > struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> > > > > > > + struct list_head msgs;
> > > > > > > wait_queue_head_t job_scheduled;
> > > > > > > atomic_t hw_rq_count;
> > > > > > > atomic64_t job_id_count;
> > > > > > > @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
> > > > > > >
> > > > > > > void drm_sched_job_cleanup(struct drm_sched_job *job);
> > > > > > > void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> > > > > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > > > > + struct drm_sched_msg *msg);
> > > > > > > void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> > > > > > > void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> > > > > > > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
>
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-08 14:06 ` Matthew Brost
@ 2023-08-08 14:14 ` Christian König
2023-08-09 14:36 ` Matthew Brost
0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2023-08-08 14:14 UTC (permalink / raw)
To: Matthew Brost
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, intel-xe, faith.ekstrand
On 08.08.23 at 16:06, Matthew Brost wrote:
> [SNIP]
>> Basically, workqueues are the in-kernel infrastructure for exactly that
>> use case; we are trying to re-create that here, and that is usually a
>> rather bad idea.
>>
> Ok, let me play around with what this would look like in Xe. What you are
> suggesting would be an ordered wq per scheduler, with a work item for run
> job, a work item for job cleanup, and a work item for a message. That
> might work, I suppose? The only issue I see is scaling, as this exposes
> ordered-wq creation directly to an IOCTL. No idea if that is actually a
> concern though.
That's a very good question I can't answer off hand either.
But from the history of work queues I know that they were invented to
reduce the overhead/costs of having many kernel threads.
So my educated guess is that you probably won't find anything better at
the moment. If work queues indeed don't match this use case, then we need
to figure out how to improve them or find a different solution.
Christian.
>
> Matt
>
>> Regards,
>> Christian.
>>
>>> Matt
>>>
>>>> Or what am I missing?
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Worst case I think this isn't a dead-end and can be refactored to
>>>>>> internally use the workqueue services, with the new functions here
>>>>>> just being dumb wrappers until everyone is converted over. So it
>>>>>> doesn't look like an expensive mistake, if it turns out to be a
>>>>>> mistake.
>>>>>> -Daniel
>>>>>>
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>>>> ---
>>>>>>>> drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
>>>>>>>> include/drm/gpu_scheduler.h | 29 +++++++++++++-
>>>>>>>> 2 files changed, 78 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> index 2597fb298733..84821a124ca2 100644
>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>>>>>> }
>>>>>>>> EXPORT_SYMBOL(drm_sched_pick_best);
>>>>>>>>
>>>>>>>> +/**
>>>>>>>> + * drm_sched_add_msg - add scheduler message
>>>>>>>> + *
>>>>>>>> + * @sched: scheduler instance
>>>>>>>> + * @msg: message to be added
>>>>>>>> + *
>>>>>>>> + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
>>>>>>>> + * Message processing will stop if the scheduler run wq is stopped and resume
>>>>>>>> + * when the run wq is started.
>>>>>>>> + */
>>>>>>>> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
>>>>>>>> + struct drm_sched_msg *msg)
>>>>>>>> +{
>>>>>>>> + spin_lock(&sched->job_list_lock);
>>>>>>>> + list_add_tail(&msg->link, &sched->msgs);
>>>>>>>> + spin_unlock(&sched->job_list_lock);
>>>>>>>> +
>>>>>>>> + drm_sched_run_wq_queue(sched);
>>>>>>>> +}
>>>>>>>> +EXPORT_SYMBOL(drm_sched_add_msg);
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * drm_sched_get_msg - get scheduler message
>>>>>>>> + *
>>>>>>>> + * @sched: scheduler instance
>>>>>>>> + *
>>>>>>>> + * Returns NULL or message
>>>>>>>> + */
>>>>>>>> +static struct drm_sched_msg *
>>>>>>>> +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
>>>>>>>> +{
>>>>>>>> + struct drm_sched_msg *msg;
>>>>>>>> +
>>>>>>>> + spin_lock(&sched->job_list_lock);
>>>>>>>> + msg = list_first_entry_or_null(&sched->msgs,
>>>>>>>> + struct drm_sched_msg, link);
>>>>>>>> + if (msg)
>>>>>>>> + list_del(&msg->link);
>>>>>>>> + spin_unlock(&sched->job_list_lock);
>>>>>>>> +
>>>>>>>> + return msg;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> /**
>>>>>>>> * drm_sched_main - main scheduler thread
>>>>>>>> *
>>>>>>>> @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>> container_of(w, struct drm_gpu_scheduler, work_run);
>>>>>>>> struct drm_sched_entity *entity;
>>>>>>>> struct drm_sched_job *cleanup_job;
>>>>>>>> + struct drm_sched_msg *msg;
>>>>>>>> int r;
>>>>>>>>
>>>>>>>> if (READ_ONCE(sched->pause_run_wq))
>>>>>>>> @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>>
>>>>>>>> cleanup_job = drm_sched_get_cleanup_job(sched);
>>>>>>>> entity = drm_sched_select_entity(sched);
>>>>>>>> + msg = drm_sched_get_msg(sched);
>>>>>>>>
>>>>>>>> - if (!entity && !cleanup_job)
>>>>>>>> + if (!entity && !cleanup_job && !msg)
>>>>>>>> return; /* No more work */
>>>>>>>>
>>>>>>>> if (cleanup_job)
>>>>>>>> sched->ops->free_job(cleanup_job);
>>>>>>>> + if (msg)
>>>>>>>> + sched->ops->process_msg(msg);
>>>>>>>>
>>>>>>>> if (entity) {
>>>>>>>> struct dma_fence *fence;
>>>>>>>> @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>> sched_job = drm_sched_entity_pop_job(entity);
>>>>>>>> if (!sched_job) {
>>>>>>>> complete_all(&entity->entity_idle);
>>>>>>>> - if (!cleanup_job)
>>>>>>>> + if (!cleanup_job && !msg)
>>>>>>>> return; /* No more work */
>>>>>>>> goto again;
>>>>>>>> }
>>>>>>>> @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>>>>>
>>>>>>>> init_waitqueue_head(&sched->job_scheduled);
>>>>>>>> INIT_LIST_HEAD(&sched->pending_list);
>>>>>>>> + INIT_LIST_HEAD(&sched->msgs);
>>>>>>>> spin_lock_init(&sched->job_list_lock);
>>>>>>>> atomic_set(&sched->hw_rq_count, 0);
>>>>>>>> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>>>>> index df1993dd44ae..267bd060d178 100644
>>>>>>>> --- a/include/drm/gpu_scheduler.h
>>>>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>>>>> @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
>>>>>>>> DRM_GPU_SCHED_STAT_ENODEV,
>>>>>>>> };
>>>>>>>>
>>>>>>>> +/**
>>>>>>>> + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
>>>>>>>> + * message
>>>>>>>> + *
>>>>>>>> + * Generic enough for backend defined messages, backend can expand if needed.
>>>>>>>> + */
>>>>>>>> +struct drm_sched_msg {
>>>>>>>> + /** @link: list link into the gpu scheduler list of messages */
>>>>>>>> + struct list_head link;
>>>>>>>> + /**
>>>>>>>> + * @private_data: opaque pointer to message private data (backend defined)
>>>>>>>> + */
>>>>>>>> + void *private_data;
>>>>>>>> + /** @opcode: opcode of message (backend defined) */
>>>>>>>> + unsigned int opcode;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> /**
>>>>>>>> * struct drm_sched_backend_ops - Define the backend operations
>>>>>>>> * called by the scheduler
>>>>>>>> @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
>>>>>>>> * and it's time to clean it up.
>>>>>>>> */
>>>>>>>> void (*free_job)(struct drm_sched_job *sched_job);
>>>>>>>> +
>>>>>>>> + /**
>>>>>>>> + * @process_msg: Process a message. Allowed to block, it is this
>>>>>>>> + * function's responsibility to free message if dynamically allocated.
>>>>>>>> + */
>>>>>>>> + void (*process_msg)(struct drm_sched_msg *msg);
>>>>>>>> };
>>>>>>>>
>>>>>>>> /**
>>>>>>>> @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
>>>>>>>> * @timeout: the time after which a job is removed from the scheduler.
>>>>>>>> * @name: name of the ring for which this scheduler is being used.
>>>>>>>> * @sched_rq: priority wise array of run queues.
>>>>>>>> + * @msgs: list of messages to be processed in @work_run
>>>>>>>> * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>>>>>>>> * waits on this wait queue until all the scheduled jobs are
>>>>>>>> * finished.
>>>>>>>> @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
>>>>>>>> * @job_id_count: used to assign unique id to the each job.
>>>>>>>> * @run_wq: workqueue used to queue @work_run
>>>>>>>> * @timeout_wq: workqueue used to queue @work_tdr
>>>>>>>> - * @work_run: schedules jobs and cleans up entities
>>>>>>>> + * @work_run: schedules jobs, cleans up jobs, and processes messages
>>>>>>>> * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>>>>>> * timeout interval is over.
>>>>>>>> * @pending_list: the list of jobs which are currently in the job queue.
>>>>>>>> @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
>>>>>>>> long timeout;
>>>>>>>> const char *name;
>>>>>>>> struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>>>>>>> + struct list_head msgs;
>>>>>>>> wait_queue_head_t job_scheduled;
>>>>>>>> atomic_t hw_rq_count;
>>>>>>>> atomic64_t job_id_count;
>>>>>>>> @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>>>>>>>>
>>>>>>>> void drm_sched_job_cleanup(struct drm_sched_job *job);
>>>>>>>> void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>>>>>>>> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
>>>>>>>> + struct drm_sched_msg *msg);
>>>>>>>> void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
>>>>>>>> void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
>>>>>>>> void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>>>> --
>>>> Daniel Vetter
>>>> Software Engineer, Intel Corporation
>>>> http://blog.ffwll.ch
* Re: [PATCH 4/8] drm/sched: Add generic scheduler message interface
2023-08-08 14:14 ` Christian König
@ 2023-08-09 14:36 ` Matthew Brost
0 siblings, 0 replies; 24+ messages in thread
From: Matthew Brost @ 2023-08-09 14:36 UTC (permalink / raw)
To: Christian König
Cc: robdclark, thomas.hellstrom, sarah.walker, ketil.johnsen,
Liviu.Dudau, dri-devel, luben.tuikov, lina, donald.robson,
boris.brezillon, intel-xe, faith.ekstrand
On Tue, Aug 08, 2023 at 04:14:55PM +0200, Christian König wrote:
> On 08.08.23 at 16:06, Matthew Brost wrote:
> > [SNIP]
> > > Basically, workqueues are the in-kernel infrastructure for exactly that
> > > use case; we are trying to re-create that here, and that is usually a
> > > rather bad idea.
> > >
> > Ok, let me play around with what this would look like in Xe. What you are
> > suggesting would be an ordered wq per scheduler, with a work item for run
> > job, a work item for job cleanup, and a work item for a message. That
> > might work, I suppose? The only issue I see is scaling, as this exposes
> > ordered-wq creation directly to an IOCTL. No idea if that is actually a
> > concern though.
>
> That's a very good question I can't answer off hand either.
>
> But from the history of work queues I know that they were invented to
> reduce the overhead/costs of having many kernel threads.
>
> So my educated guess is that you probably won't find anything better at
> the moment. If work queues indeed don't match this use case, then we need
> to figure out how to improve them or find a different solution.
>
I looked at the workqueue code and think workqueue creation is decoupled
from kthread creation in most cases, so I think this fits.
Hacked together a quick PoC of this on top of Xe; it seems to work, and I
like how the code looks too. I need to clean it up a bit and run some perf
tests, but it looks promising. Hopefully it all comes together and I can
get another spin of this series out fairly soon.
Matt
> Christian.
>
> >
> > Matt
> >
> > > Regards,
> > > Christian.
> > >
> > > > Matt
> > > >
> > > > > Or what am I missing?
> > > > >
> > > > > > Regards,
> > > > > > Christian.
> > > > > >
> > > > > > > Worst case I think this isn't a dead-end and can be refactored to
> > > > > > > internally use the workqueue services, with the new functions here
> > > > > > > just being dumb wrappers until everyone is converted over. So it
> > > > > > > doesn't look like an expensive mistake, if it turns out to be a
> > > > > > > mistake.
> > > > > > > -Daniel
> > > > > > >
> > > > > > >
> > > > > > > > Regards,
> > > > > > > > Christian.
> > > > > > > >
> > > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > ---
> > > > > > > > > drivers/gpu/drm/scheduler/sched_main.c | 52 +++++++++++++++++++++++++-
> > > > > > > > > include/drm/gpu_scheduler.h | 29 +++++++++++++-
> > > > > > > > > 2 files changed, 78 insertions(+), 3 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > index 2597fb298733..84821a124ca2 100644
> > > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > @@ -1049,6 +1049,49 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > > > }
> > > > > > > > > EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > > > >
> > > > > > > > > +/**
> > > > > > > > > + * drm_sched_add_msg - add scheduler message
> > > > > > > > > + *
> > > > > > > > > + * @sched: scheduler instance
> > > > > > > > > + * @msg: message to be added
> > > > > > > > > + *
> > > > > > > > > > + * Can and will pass any jobs waiting on dependencies or in a runnable queue.
> > > > > > > > > > + * Message processing will stop if the scheduler run wq is stopped and resume
> > > > > > > > > > + * when the run wq is started.
> > > > > > > > > + */
> > > > > > > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > > > > > > + struct drm_sched_msg *msg)
> > > > > > > > > +{
> > > > > > > > > + spin_lock(&sched->job_list_lock);
> > > > > > > > > + list_add_tail(&msg->link, &sched->msgs);
> > > > > > > > > + spin_unlock(&sched->job_list_lock);
> > > > > > > > > +
> > > > > > > > > + drm_sched_run_wq_queue(sched);
> > > > > > > > > +}
> > > > > > > > > +EXPORT_SYMBOL(drm_sched_add_msg);
> > > > > > > > > +
> > > > > > > > > +/**
> > > > > > > > > + * drm_sched_get_msg - get scheduler message
> > > > > > > > > + *
> > > > > > > > > + * @sched: scheduler instance
> > > > > > > > > + *
> > > > > > > > > + * Returns NULL or message
> > > > > > > > > + */
> > > > > > > > > +static struct drm_sched_msg *
> > > > > > > > > +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
> > > > > > > > > +{
> > > > > > > > > + struct drm_sched_msg *msg;
> > > > > > > > > +
> > > > > > > > > + spin_lock(&sched->job_list_lock);
> > > > > > > > > + msg = list_first_entry_or_null(&sched->msgs,
> > > > > > > > > + struct drm_sched_msg, link);
> > > > > > > > > + if (msg)
> > > > > > > > > + list_del(&msg->link);
> > > > > > > > > + spin_unlock(&sched->job_list_lock);
> > > > > > > > > +
> > > > > > > > > + return msg;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > /**
> > > > > > > > > * drm_sched_main - main scheduler thread
> > > > > > > > > *
> > > > > > > > > @@ -1060,6 +1103,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > container_of(w, struct drm_gpu_scheduler, work_run);
> > > > > > > > > struct drm_sched_entity *entity;
> > > > > > > > > struct drm_sched_job *cleanup_job;
> > > > > > > > > + struct drm_sched_msg *msg;
> > > > > > > > > int r;
> > > > > > > > >
> > > > > > > > > if (READ_ONCE(sched->pause_run_wq))
> > > > > > > > > @@ -1067,12 +1111,15 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > >
> > > > > > > > > cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > > > entity = drm_sched_select_entity(sched);
> > > > > > > > > + msg = drm_sched_get_msg(sched);
> > > > > > > > >
> > > > > > > > > - if (!entity && !cleanup_job)
> > > > > > > > > + if (!entity && !cleanup_job && !msg)
> > > > > > > > > return; /* No more work */
> > > > > > > > >
> > > > > > > > > if (cleanup_job)
> > > > > > > > > sched->ops->free_job(cleanup_job);
> > > > > > > > > + if (msg)
> > > > > > > > > + sched->ops->process_msg(msg);
> > > > > > > > >
> > > > > > > > > if (entity) {
> > > > > > > > > struct dma_fence *fence;
> > > > > > > > > @@ -1082,7 +1129,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > > > if (!sched_job) {
> > > > > > > > > complete_all(&entity->entity_idle);
> > > > > > > > > - if (!cleanup_job)
> > > > > > > > > + if (!cleanup_job && !msg)
> > > > > > > > > return; /* No more work */
> > > > > > > > > goto again;
> > > > > > > > > }
> > > > > > > > > @@ -1177,6 +1224,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > >
> > > > > > > > > init_waitqueue_head(&sched->job_scheduled);
> > > > > > > > > INIT_LIST_HEAD(&sched->pending_list);
> > > > > > > > > + INIT_LIST_HEAD(&sched->msgs);
> > > > > > > > > spin_lock_init(&sched->job_list_lock);
> > > > > > > > > atomic_set(&sched->hw_rq_count, 0);
> > > > > > > > > INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > > > index df1993dd44ae..267bd060d178 100644
> > > > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > > > @@ -394,6 +394,23 @@ enum drm_gpu_sched_stat {
> > > > > > > > > DRM_GPU_SCHED_STAT_ENODEV,
> > > > > > > > > };
> > > > > > > > >
> > > > > > > > > +/**
> > > > > > > > > + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
> > > > > > > > > + * message
> > > > > > > > > + *
> > > > > > > > > + * Generic enough for backend defined messages, backend can expand if needed.
> > > > > > > > > + */
> > > > > > > > > +struct drm_sched_msg {
> > > > > > > > > + /** @link: list link into the gpu scheduler list of messages */
> > > > > > > > > + struct list_head link;
> > > > > > > > > + /**
> > > > > > > > > + * @private_data: opaque pointer to message private data (backend defined)
> > > > > > > > > + */
> > > > > > > > > + void *private_data;
> > > > > > > > > + /** @opcode: opcode of message (backend defined) */
> > > > > > > > > + unsigned int opcode;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > /**
> > > > > > > > > * struct drm_sched_backend_ops - Define the backend operations
> > > > > > > > > * called by the scheduler
> > > > > > > > > @@ -471,6 +488,12 @@ struct drm_sched_backend_ops {
> > > > > > > > > * and it's time to clean it up.
> > > > > > > > > */
> > > > > > > > > void (*free_job)(struct drm_sched_job *sched_job);
> > > > > > > > > +
> > > > > > > > > + /**
> > > > > > > > > + * @process_msg: Process a message. Allowed to block, it is this
> > > > > > > > > + * function's responsibility to free message if dynamically allocated.
> > > > > > > > > + */
> > > > > > > > > + void (*process_msg)(struct drm_sched_msg *msg);
> > > > > > > > > };
> > > > > > > > >
> > > > > > > > > /**
> > > > > > > > > @@ -482,6 +505,7 @@ struct drm_sched_backend_ops {
> > > > > > > > > * @timeout: the time after which a job is removed from the scheduler.
> > > > > > > > > * @name: name of the ring for which this scheduler is being used.
> > > > > > > > > * @sched_rq: priority wise array of run queues.
> > > > > > > > > + * @msgs: list of messages to be processed in @work_run
> > > > > > > > > * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
> > > > > > > > > * waits on this wait queue until all the scheduled jobs are
> > > > > > > > > * finished.
> > > > > > > > > @@ -489,7 +513,7 @@ struct drm_sched_backend_ops {
> > > > > > > > > * @job_id_count: used to assign unique id to the each job.
> > > > > > > > > * @run_wq: workqueue used to queue @work_run
> > > > > > > > > * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > > > - * @work_run: schedules jobs and cleans up entities
> > > > > > > > > + * @work_run: schedules jobs, cleans up jobs, and processes messages
> > > > > > > > > * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > > > * timeout interval is over.
> > > > > > > > > * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > > > @@ -513,6 +537,7 @@ struct drm_gpu_scheduler {
> > > > > > > > > long timeout;
> > > > > > > > > const char *name;
> > > > > > > > > struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT];
> > > > > > > > > + struct list_head msgs;
> > > > > > > > > wait_queue_head_t job_scheduled;
> > > > > > > > > atomic_t hw_rq_count;
> > > > > > > > > atomic64_t job_id_count;
> > > > > > > > > @@ -566,6 +591,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
> > > > > > > > >
> > > > > > > > > void drm_sched_job_cleanup(struct drm_sched_job *job);
> > > > > > > > > void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> > > > > > > > > +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> > > > > > > > > + struct drm_sched_msg *msg);
> > > > > > > > > void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> > > > > > > > > void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
> > > > > > > > > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
> > > > > --
> > > > > Daniel Vetter
> > > > > Software Engineer, Intel Corporation
> > > > > http://blog.ffwll.ch
>
* [PATCH 5/8] drm/sched: Add drm_sched_start_timeout_unlocked helper
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
` (3 preceding siblings ...)
2023-08-01 20:50 ` [PATCH 4/8] drm/sched: Add generic scheduler message interface Matthew Brost
@ 2023-08-01 20:51 ` Matthew Brost
2023-08-01 20:51 ` [PATCH 6/8] drm/sched: Start run wq before TDR in drm_sched_start Matthew Brost
` (2 subsequent siblings)
7 siblings, 0 replies; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:51 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
Also add a lockdep assert to drm_sched_start_timeout.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/scheduler/sched_main.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 84821a124ca2..be963d68a733 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -360,11 +360,20 @@ static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
*/
static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
{
+ lockdep_assert_held(&sched->job_list_lock);
+
if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
!list_empty(&sched->pending_list))
queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
}
+static void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
+{
+ spin_lock(&sched->job_list_lock);
+ drm_sched_start_timeout(sched);
+ spin_unlock(&sched->job_list_lock);
+}
+
/**
* drm_sched_fault - immediately start timeout handler
*
@@ -476,11 +485,8 @@ static void drm_sched_job_timedout(struct work_struct *work)
spin_unlock(&sched->job_list_lock);
}
- if (status != DRM_GPU_SCHED_STAT_ENODEV) {
- spin_lock(&sched->job_list_lock);
- drm_sched_start_timeout(sched);
- spin_unlock(&sched->job_list_lock);
- }
+ if (status != DRM_GPU_SCHED_STAT_ENODEV)
+ drm_sched_start_timeout_unlocked(sched);
}
/**
@@ -606,11 +612,8 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
drm_sched_job_done(s_job);
}
- if (full_recovery) {
- spin_lock(&sched->job_list_lock);
- drm_sched_start_timeout(sched);
- spin_unlock(&sched->job_list_lock);
- }
+ if (full_recovery)
+ drm_sched_start_timeout_unlocked(sched);
drm_sched_run_wq_start(sched);
}
--
2.34.1
* [PATCH 6/8] drm/sched: Start run wq before TDR in drm_sched_start
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
` (4 preceding siblings ...)
2023-08-01 20:51 ` [PATCH 5/8] drm/sched: Add drm_sched_start_timeout_unlocked helper Matthew Brost
@ 2023-08-01 20:51 ` Matthew Brost
2023-08-01 20:51 ` [PATCH 7/8] drm/sched: Submit job before starting TDR Matthew Brost
2023-08-01 20:51 ` [PATCH 8/8] drm/sched: Add helper to set TDR timeout Matthew Brost
7 siblings, 0 replies; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:51 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
If the TDR is set to a very small value, it can fire before the run wq is
started in drm_sched_start. The run wq is expected to be running when the
TDR fires; fix this ordering so that expectation is always met.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/scheduler/sched_main.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index be963d68a733..2e404a6542ad 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -612,10 +612,10 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
drm_sched_job_done(s_job);
}
+ drm_sched_run_wq_start(sched);
+
if (full_recovery)
drm_sched_start_timeout_unlocked(sched);
-
- drm_sched_run_wq_start(sched);
}
EXPORT_SYMBOL(drm_sched_start);
--
2.34.1
* [PATCH 7/8] drm/sched: Submit job before starting TDR
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
` (5 preceding siblings ...)
2023-08-01 20:51 ` [PATCH 6/8] drm/sched: Start run wq before TDR in drm_sched_start Matthew Brost
@ 2023-08-01 20:51 ` Matthew Brost
2023-08-01 20:51 ` [PATCH 8/8] drm/sched: Add helper to set TDR timeout Matthew Brost
7 siblings, 0 replies; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:51 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
If the TDR is set to a very small value, it can fire before a job is
submitted in drm_sched_main. The job should always be submitted before
the TDR fires; fix this ordering.
v2:
- Add to pending list before run_job, start TDR after (Luben, Boris)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/scheduler/sched_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 2e404a6542ad..9573f13f8459 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -445,7 +445,6 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
spin_lock(&sched->job_list_lock);
list_add_tail(&s_job->list, &sched->pending_list);
- drm_sched_start_timeout(sched);
spin_unlock(&sched->job_list_lock);
}
@@ -1146,6 +1145,7 @@ static void drm_sched_main(struct work_struct *w)
fence = sched->ops->run_job(sched_job);
complete_all(&entity->entity_idle);
drm_sched_fence_scheduled(s_fence);
+ drm_sched_start_timeout_unlocked(sched);
if (!IS_ERR_OR_NULL(fence)) {
drm_sched_fence_set_parent(s_fence, fence);
--
2.34.1
* [PATCH 8/8] drm/sched: Add helper to set TDR timeout
2023-08-01 20:50 [PATCH 0/8] DRM scheduler changes for Xe Matthew Brost
` (6 preceding siblings ...)
2023-08-01 20:51 ` [PATCH 7/8] drm/sched: Submit job before starting TDR Matthew Brost
@ 2023-08-01 20:51 ` Matthew Brost
7 siblings, 0 replies; 24+ messages in thread
From: Matthew Brost @ 2023-08-01 20:51 UTC (permalink / raw)
To: dri-devel, intel-xe
Cc: robdclark, thomas.hellstrom, Matthew Brost, sarah.walker,
ketil.johnsen, Liviu.Dudau, luben.tuikov, lina, donald.robson,
boris.brezillon, christian.koenig, faith.ekstrand
Add a helper to set the TDR timeout and restart the TDR with the new
timeout value. This will be used in Xe, the new Intel GPU driver, to
trigger the TDR to clean up a drm_sched_entity that encounters errors.
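A hypothetical caller, for illustration (only drm_sched_set_timeout() is
from this patch; the surrounding context is made up):

	/* Entity hit an unrecoverable error: restart the TDR with a
	 * minimal timeout so cleanup runs (almost) immediately.
	 */
	static void xe_trigger_tdr(struct drm_gpu_scheduler *sched)
	{
		drm_sched_set_timeout(sched, 1);	/* one jiffy */
	}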
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/scheduler/sched_main.c | 18 ++++++++++++++++++
include/drm/gpu_scheduler.h | 1 +
2 files changed, 19 insertions(+)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 9573f13f8459..19ec0cb5caee 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -374,6 +374,24 @@ static void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
spin_unlock(&sched->job_list_lock);
}
+/**
+ * drm_sched_set_timeout - set timeout for reset worker
+ *
+ * @sched: scheduler instance to set and (re)-start the worker for
+ * @timeout: timeout period
+ *
+ * Set and (re)-start the timeout for the given scheduler.
+ */
+void drm_sched_set_timeout(struct drm_gpu_scheduler *sched, long timeout)
+{
+ spin_lock(&sched->job_list_lock);
+ sched->timeout = timeout;
+ cancel_delayed_work(&sched->work_tdr);
+ drm_sched_start_timeout(sched);
+ spin_unlock(&sched->job_list_lock);
+}
+EXPORT_SYMBOL(drm_sched_set_timeout);
+
/**
* drm_sched_fault - immediately start timeout handler
*
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 267bd060d178..f4af856aebd9 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -589,6 +589,7 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
struct drm_gpu_scheduler **sched_list,
unsigned int num_sched_list);
+void drm_sched_set_timeout(struct drm_gpu_scheduler *sched, long timeout);
void drm_sched_job_cleanup(struct drm_sched_job *job);
void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
--
2.34.1