Boris Brezillon writes:

> The HVS block is supposed to fill the pixelvalve FIFOs fast enough to
> meet the requested framerate. The problem is, the HVS and memory bus
> bandwidths are limited, and if we don't take these limitations into
> account we might end up with HVS underflow errors.
>
> This patch is trying to model the per-plane HVS and memory bus bandwidth
> consumption and take a decision at atomic_check() time whether the
> estimated load will fit in the HVS and membus budget.
>
> Note that we take an extra margin on the memory bus consumption to let
> the system run smoothly when other blocks are doing heavy use of the
> memory bus. Same goes for the HVS limit, except the margin is smaller in
> this case, since the HVS is not used by external components.
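For a sense of scale (back-of-the-envelope numbers of mine, not from the
patch): a fullscreen 1920x1080 ARGB8888 plane at 60Hz costs about

    1920 * 1080 pixels * 4 bytes * 60 Hz ~= 498 MB/s

so the 1.5 GB/s memory bus budget below is roughly three such planes, and
the 240M cycles/s HVS budget (at 4 pixels/cycle, unscaled) is about seven
of them.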
> Signed-off-by: Boris Brezillon
> ---
> Changes in v2:
> - Remove an unused var in vc4_load_tracker_atomic_check()
> ---
>  drivers/gpu/drm/vc4/vc4_drv.c   |   1 +
>  drivers/gpu/drm/vc4/vc4_drv.h   |  11 ++++
>  drivers/gpu/drm/vc4/vc4_kms.c   | 103 +++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/vc4/vc4_plane.c |  60 +++++++++++++++++++
>  4 files changed, 174 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
> index f6f5cd80c04d..7195a0bcceb3 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.c
> +++ b/drivers/gpu/drm/vc4/vc4_drv.c
> @@ -313,6 +313,7 @@ static void vc4_drm_unbind(struct device *dev)
>
>         drm_mode_config_cleanup(drm);
>
> +       drm_atomic_private_obj_fini(&vc4->load_tracker);
>         drm_atomic_private_obj_fini(&vc4->ctm_manager);
>
>         drm_dev_put(drm);
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index a6c67c65ffbc..11369da944b6 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -200,6 +200,7 @@ struct vc4_dev {
>
>         struct drm_modeset_lock ctm_state_lock;
>         struct drm_private_obj ctm_manager;
> +       struct drm_private_obj load_tracker;
>  };
>
>  static inline struct vc4_dev *
> @@ -369,6 +370,16 @@ struct vc4_plane_state {
>          * to enable background color fill.
>          */
>         bool needs_bg_fill;
> +
> +       /* Load of this plane on the HVS block. The load is expressed in HVS
> +        * cycles/sec.
> +        */
> +       u64 hvs_load;
> +
> +       /* Memory bandwidth needed for this plane. This is expressed in
> +        * bytes/sec.
> +        */
> +       u64 membus_load;
>  };
>
>  static inline struct vc4_plane_state *
> diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c
> index 8e0183b1e8bb..b905a19c1470 100644
> --- a/drivers/gpu/drm/vc4/vc4_kms.c
> +++ b/drivers/gpu/drm/vc4/vc4_kms.c
> @@ -34,6 +34,18 @@ static struct vc4_ctm_state *to_vc4_ctm_state(struct drm_private_state *priv)
>         return container_of(priv, struct vc4_ctm_state, base);
>  }
>
> +struct vc4_load_tracker_state {
> +       struct drm_private_state base;
> +       u64 hvs_load;
> +       u64 membus_load;
> +};
> +
> +static struct vc4_load_tracker_state *
> +to_vc4_load_tracker_state(struct drm_private_state *priv)
> +{
> +       return container_of(priv, struct vc4_load_tracker_state, base);
> +}
> +
>  static struct vc4_ctm_state *vc4_get_ctm_state(struct drm_atomic_state *state,
>                                                struct drm_private_obj *manager)
>  {
> @@ -383,6 +395,81 @@ vc4_ctm_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
>         return 0;
>  }
>
> +static int vc4_load_tracker_atomic_check(struct drm_atomic_state *state)
> +{
> +       struct drm_plane_state *old_plane_state, *new_plane_state;
> +       struct vc4_dev *vc4 = to_vc4_dev(state->dev);
> +       struct vc4_load_tracker_state *load_state;
> +       struct drm_private_state *priv_state;
> +       struct drm_plane *plane;
> +       int i;
> +
> +       priv_state = drm_atomic_get_private_obj_state(state,
> +                                                     &vc4->load_tracker);
> +       if (IS_ERR(priv_state))
> +               return PTR_ERR(priv_state);
> +
> +       load_state = to_vc4_load_tracker_state(priv_state);
> +       for_each_oldnew_plane_in_state(state, plane, old_plane_state,
> +                                      new_plane_state, i) {
> +               struct vc4_plane_state *vc4_plane_state;
> +
> +               if (old_plane_state->fb && old_plane_state->crtc) {
> +                       vc4_plane_state = to_vc4_plane_state(old_plane_state);
> +                       load_state->membus_load -= vc4_plane_state->membus_load;
> +                       load_state->hvs_load -= vc4_plane_state->hvs_load;
> +               }
> +
> +               if (new_plane_state->fb && new_plane_state->crtc) {
> +                       vc4_plane_state = to_vc4_plane_state(new_plane_state);
> +                       load_state->membus_load += vc4_plane_state->membus_load;
> +                       load_state->hvs_load += vc4_plane_state->hvs_load;
> +               }
> +       }
> +
> +       /* The abolsute limit is 2Gbyte/sec, but let's take a margin to let

"absolute"

> +        * the system work when other blocks are accessing the memory.
> +        */
> +       if (load_state->membus_load > SZ_1G + SZ_512M)
> +               return -ENOSPC;
> +
> +       /* HVS clock is supposed to run @ 250Mhz, let's take a margin and
> +        * consider the maximum number of cycles is 240M.
> +        */
> +       if (load_state->hvs_load > 240000000ULL)
> +               return -ENOSPC;

Some day we should probably be using the HVS's actual clock from DT if
available instead of a hardcoded number here.  Good enough for now.

> +static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
> +                                          struct drm_private_state *state)
> +{
> +       struct vc4_load_tracker_state *load_state;
> +
> +       load_state = to_vc4_load_tracker_state(state);
> +       kfree(load_state);
> +}

Optional: just kfree(state) for simplicity.
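A minimal sketch of that simplification (untested; it relies on 'base'
being the first member of vc4_load_tracker_state, so both pointers name
the same allocation):

static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
                                           struct drm_private_state *state)
{
        /* 'state' is embedded at the start of vc4_load_tracker_state,
         * so freeing the base pointer frees the whole object.
         */
        kfree(state);
}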
> +static void vc4_plane_calc_load(struct drm_plane_state *state)
> +{
> +       unsigned int hvs_load_shift, vrefresh, i;
> +       struct drm_framebuffer *fb = state->fb;
> +       struct vc4_plane_state *vc4_state;
> +       struct drm_crtc_state *crtc_state;
> +       unsigned int vscale_factor;
> +
> +       vc4_state = to_vc4_plane_state(state);
> +       crtc_state = drm_atomic_get_existing_crtc_state(state->state,
> +                                                       state->crtc);
> +       vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
> +
> +       /* The HVS is able to process 2 pixels/cycle when scaling the source,
> +        * 4 pixels/cycle otherwise.
> +        * Alpha blending step seems to be pipelined and it's always operating
> +        * at 4 pixels/cycle, so the limiting aspect here seems to be the
> +        * scaler block.
> +        * HVS load is expressed in clk-cycles/sec (AKA Hz).
> +        */
> +       if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
> +           vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
> +           vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
> +           vc4_state->y_scaling[1] != VC4_SCALING_NONE)
> +               hvs_load_shift = 1;
> +       else
> +               hvs_load_shift = 2;
> +
> +       vc4_state->membus_load = 0;
> +       vc4_state->hvs_load = 0;
> +       for (i = 0; i < fb->format->num_planes; i++) {
> +               unsigned long pixels_load;

I'm scared any time I see longs.  Do you want 32 or 64 bits here?

> +               /* Even if the bandwidth/plane required for a single frame is
> +                *
> +                * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
> +                *
> +                * when downscaling, we have to read more pixels per line in
> +                * the time frame reserved for a single line, so the bandwidth
> +                * demand can be punctually higher. To account for that, we
> +                * calculate the down-scaling factor and multiply the plane
> +                * load by this number. We're likely over-estimating the read
> +                * demand, but that's better than under-estimating it.
> +                */
> +               vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
> +                                            vc4_state->crtc_h);
> +               pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
> +                             vscale_factor;

If we're upscaling (common for video, right?), aren't we under-counting
the cost?  You need to scale/colorspace-convert crtc_w * crtc_h pixels
at 2 pixels per cycle.  (Rough sketch of what I mean at the end of this
mail.)

Overall, this is simpler than I expected it to be, and looks really
promising.  Thanks for working on it!

> +               vc4_state->membus_load += fb->format->cpp[i] * pixels_load;
> +               vc4_state->hvs_load += pixels_load;
> +       }
> +
> +       vc4_state->hvs_load *= vrefresh;
> +       vc4_state->hvs_load >>= hvs_load_shift;
> +       vc4_state->membus_load *= vrefresh;
> +}
> +
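Rough sketch of the upscaling accounting I was asking about above, as a
replacement for the body of the loop in vc4_plane_calc_load() (untested,
and only an approximation: it charges the full output size once per FB
plane, so it over-counts a bit for planar YUV, erring on the safe side).
It also sizes pixels_load explicitly at 64 bits, which is one way to
answer the 32-vs-64 question:

for (i = 0; i < fb->format->num_planes; i++) {
        /* Fixed-size type: 'long' is 32-bit on arm32 and 64-bit on
         * arm64, and this gets accumulated into u64 fields anyway.
         */
        u64 pixels_load;

        /* Reads are still driven by the source size, scaled up by the
         * downscale factor as in the patch...
         */
        vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
                                     vc4_state->crtc_h);
        pixels_load = (u64)vc4_state->src_w[i] * vc4_state->src_h[i] *
                      vscale_factor;
        vc4_state->membus_load += fb->format->cpp[i] * pixels_load;

        /* ... but when upscaling the scaler/CSC still has to produce
         * crtc_w * crtc_h output pixels, so charge the HVS with
         * whichever of input and output is larger.
         */
        vc4_state->hvs_load += max_t(u64, pixels_load,
                                     (u64)vc4_state->crtc_w *
                                     vc4_state->crtc_h);
}

The vrefresh multiply and the hvs_load_shift at the end of the function
would stay as in the patch.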