Boris Brezillon <boris.brezillon@free-electrons.com> writes:

> On Tue, 17 Oct 2017 17:16:29 -0700
> Eric Anholt <eric@anholt.net> wrote:
>
>> Boris Brezillon <boris.brezillon@free-electrons.com> writes:
>> 
>> > This ioctl will allow us to purge inactive userspace buffers when the
>> > system is running out of contiguous memory.
>> >
>> > For now, the purge logic is rather dumb in that it does not try to
>> > release only the amount of BO needed to meet the last CMA alloc request
>> > but instead purges all objects placed in the purgeable pool as soon as
>> > we experience a CMA allocation failure.
>> >
>> > Note that the in-kernel BO cache is always purged before the purgeable
>> > cache because those objects are known to be unused while objects marked
>> > as purgeable by a userspace application/library might have to be
>> > restored when they are marked back as unpurgeable, which can be
>> > expensive.
>> >
>> > Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>  
>> 
>> Looking good!  Just a couple things I'd like to sort out before we
>> merge.
>> 
>> > ---
>> > Hello,
>> >
>> > Updates to libdrm, mesa and igt making use of this kernel feature can
>> > be found on my github repos [1][2][3].
>> >
>> > There's currently no debugfs hook to manually force a purge, but this
>> > is being discussed and might be added later on.
>> >
>> > Regards,
>> >
>> > Boris
>> >
>> > [1]https://github.com/bbrezillon/drm/tree/vc4/purgeable
>> > [2]https://github.com/bbrezillon/mesa/tree/vc4/purgeable
>> > [3]https://github.com/bbrezillon/igt/tree/vc4/purgeable
>> >
>> > Changes in v3:
>> > - Use %zd formatters when printing size_t values
>> > - rework detection of BOs that do not support the MADV ioctl
>> > - replace DRM_ERROR by DRM_DEBUG in the ioctl path
>> > - do not explicitly memzero purged BO mem
>> > - check that pad is set to zero in the madv ioctl function
>> >
>> > Changes in v2:
>> > - Do not re-allocate BO's memory after when WILLNEED is asked after a purge
>> > - Add purgeable/purged stats to debugfs and dumpstats
>> > - Pad the drm_vc4_gem_madvise to make it 64-bit aligned
>> > - Forbid madvise calls on internal BO objects (everything that is not
>> >   meant to be exposed to userspace)
>> > - Do not increment/decrement usecnt on internal BOs (they cannot be marked
>> >   purgeable, so doing that is useless and error prone)
>> > - Fix a few bugs related to usecnt refcounting
>> > ---
>> >  drivers/gpu/drm/vc4/vc4_bo.c    | 292 ++++++++++++++++++++++++++++++++++++++--
>> >  drivers/gpu/drm/vc4/vc4_drv.c   |  10 +-
>> >  drivers/gpu/drm/vc4/vc4_drv.h   |  30 +++++
>> >  drivers/gpu/drm/vc4/vc4_gem.c   | 156 ++++++++++++++++++++-
>> >  drivers/gpu/drm/vc4/vc4_plane.c |  20 +++
>> >  include/uapi/drm/vc4_drm.h      |  19 +++
>> >  6 files changed, 512 insertions(+), 15 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
>> > index 3afdbf4bc10b..e4d13bbef54f 100644
>> > --- a/drivers/gpu/drm/vc4/vc4_bo.c
>> > +++ b/drivers/gpu/drm/vc4/vc4_bo.c
>> > @@ -42,6 +42,8 @@ static bool is_user_label(int label)
>> >  
>> >  static void vc4_bo_stats_dump(struct vc4_dev *vc4)
>> >  {
>> > +	size_t purgeable_size, purged_size;
>> > +	unsigned int npurgeable, npurged;
>> >  	int i;
>> >  
>> >  	for (i = 0; i < vc4->num_labels; i++) {
>> > @@ -53,6 +55,21 @@ static void vc4_bo_stats_dump(struct vc4_dev *vc4)
>> >  			 vc4->bo_labels[i].size_allocated / 1024,
>> >  			 vc4->bo_labels[i].num_allocated);
>> >  	}
>> > +
>> > +	mutex_lock(&vc4->purgeable.lock);
>> > +	npurgeable = vc4->purgeable.num;
>> > +	purgeable_size = vc4->purgeable.size;
>> > +	purged_size = vc4->purgeable.purged_size;
>> > +	npurged = vc4->purgeable.purged_num;
>> > +	mutex_unlock(&vc4->purgeable.lock);
>> > +
>> > +	if (npurgeable)
>> > +		DRM_INFO("%30s: %6zdkb BOs (%d)\n", "userspace BO cache",
>> > +			 purgeable_size / 1024, npurgeable);
>> > +
>> > +	if (npurged)
>> > +		DRM_INFO("%30s: %6zdkb BOs (%d)\n", "total purged BO",
>> > +			 purged_size / 1024, npurged);
>> >  }
>> >  
>> >  #ifdef CONFIG_DEBUG_FS
>> > @@ -61,6 +78,8 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
>> >  	struct drm_info_node *node = (struct drm_info_node *)m->private;
>> >  	struct drm_device *dev = node->minor->dev;
>> >  	struct vc4_dev *vc4 = to_vc4_dev(dev);
>> > +	size_t purgeable_size, purged_size;
>> > +	unsigned int npurgeable, npurged;
>> >  	int i;
>> >  
>> >  	mutex_lock(&vc4->bo_lock);
>> > @@ -75,6 +94,21 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
>> >  	}
>> >  	mutex_unlock(&vc4->bo_lock);
>> >  
>> > +	mutex_lock(&vc4->purgeable.lock);
>> > +	npurgeable = vc4->purgeable.num;
>> > +	purgeable_size = vc4->purgeable.size;
>> > +	purged_size = vc4->purgeable.purged_size;
>> > +	npurged = vc4->purgeable.purged_num;
>> > +	mutex_unlock(&vc4->purgeable.lock);
>> > +
>> > +	if (npurgeable)
>> > +		seq_printf(m, "%30s: %6dkb BOs (%d)\n", "userspace BO cache",
>> > +			   purgeable_size / 1024, npurgeable);
>> > +
>> > +	if (npurged)
>> > +		seq_printf(m, "%30s: %6dkb BOs (%d)\n", "total purged BO",
>> > +			   purged_size / 1024, npurged);  
>> 
>> I think you could just do these seq_printfs under the lock, instead of
>> taking a snapshot ahead of time.  Same for the dump.
>
> The reason I initially did it like that is because printing a message
> on the console can take a non-negligible amount of time if it's a
> serial console. That's less of a problem for the seq_printfs.
>
> Anyway, I can go for your solution if you prefer.

If we're printing this stuff to the console, we're already in a lot of
trouble :) It doesn't need to be fast.

>> > +static void vc4_bo_remove_from_purgeable_pool_locked(struct vc4_bo *bo)
>> > +{
>> > +	struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
>> > +
>> > +	/* list_del_init() is used here because the caller might release
>> > +	 * the purgeable lock in order to acquire the madv one and update the
>> > +	 * madv status.
>> > +	 * During this short period of time a user might decide to mark
>> > +	 * the BO as unpurgeable, and if bo->madv is set to
>> > +	 * VC4_MADV_DONTNEED it will try to remove the BO from the
>> > +	 * purgeable list which will fail if the ->next/prev fields
>> > +	 * are set to LIST_POISON1/LIST_POISON2 (which is what
>> > +	 * list_del() does).
>> > +	 * Re-initializing the list element guarantees that list_del()
>> > +	 * will work correctly even if it's a NOP.
>> > +	 */
>> > +	list_del_init(&bo->size_head);
>> > +	vc4->purgeable.num--;
>> > +	vc4->purgeable.size -= bo->base.base.size;
>> > +}
>> > +
>> > +void vc4_bo_remove_from_purgeable_pool(struct vc4_bo *bo)
>> > +{
>> > +	struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
>> > +
>> > +	mutex_lock(&vc4->purgeable.lock);
>> > +	vc4_bo_remove_from_purgeable_pool_locked(bo);
>> > +	mutex_unlock(&vc4->purgeable.lock);
>> > +}
>> > +
>> > +static void vc4_bo_purge(struct drm_gem_object *obj)
>> > +{
>> > +	struct vc4_bo *bo = to_vc4_bo(obj);
>> > +	struct drm_device *dev = obj->dev;
>> > +
>> > +	WARN_ON(!mutex_is_locked(&bo->madv_lock));
>> > +	WARN_ON(bo->madv != VC4_MADV_DONTNEED);
>> > +
>> > +	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
>> > +
>> > +	dma_free_wc(dev->dev, obj->size, bo->base.vaddr, bo->base.paddr);
>> > +	bo->base.vaddr = NULL;
>> > +	bo->madv = __VC4_MADV_PURGED;
>> > +}
>> > +
>> > +static void vc4_bo_userspace_cache_purge(struct drm_device *dev)
>> > +{
>> > +	struct vc4_dev *vc4 = to_vc4_dev(dev);
>> > +
>> > +	mutex_lock(&vc4->purgeable.lock);
>> > +	while (!list_empty(&vc4->purgeable.list)) {
>> > +		struct vc4_bo *bo = list_first_entry(&vc4->purgeable.list,
>> > +						     struct vc4_bo, size_head);
>> > +		struct drm_gem_object *obj = &bo->base.base;
>> > +		size_t purged_size = 0;
>> > +
>> > +		vc4_bo_remove_from_purgeable_pool_locked(bo);
>> > +
>> > +		/* Release the purgeable lock while we're purging the BO so
>> > +		 * that other people can continue inserting things in the
>> > +		 * purgeable pool without having to wait for all BOs to be
>> > +		 * purged.
>> > +		 */
>> > +		mutex_unlock(&vc4->purgeable.lock);
>> > +		mutex_lock(&bo->madv_lock);
>> > +
>> > +		/* Since we released the purgeable pool lock before acquiring
>> > +		 * the BO madv one, vc4_gem_madvise_ioctl() may have changed
>> > +		 * the bo->madv status. In this case, just skip this entry.
>> > +		 */
>> > +		if (bo->madv == VC4_MADV_DONTNEED) {
>> > +			purged_size = bo->base.base.size;
>> > +			vc4_bo_purge(obj);
>> > +		}  
>> 
>> Possible ugly race here?
>> 
>> thread 1               thread 2
>> ----------------------------------------------------
>> Idle my BO
>> Mark DONTNEED
>>                        Start purge
>>                        pull BO off of purgeable list
>> Mark NEED
>> Do a submit CL
>> Mark DONTNEED
>>                        purge the BO
>> 
>
> Crap! You're right. I didn't go that far in my analysis.
>
>> Can we plug that by checking the refcount along with checking that we're
>> DONTNEED?
>
> That should solve the problem you're describing above, but I see another
> problem:
>
> thread 1                      thread 2
> ----------------------------------------------------
> Idle my BO
> Mark DONTNEED
>                               Start purge
>                               pull BO off of purgeable list
> Mark NEED
> Mark DONTNEED
>  (BO gets back in the list)
>                               purge the BO
>                                (ouch! we purged a BO that is still in
>                                 the purgeable list)
>
>
> We'd need something like:
>
> 		/* Since we released the purgeable pool lock before
> 		 * acquiring the BO madv one, vc4_gem_madvise_ioctl()
> 		 * may have changed the bo->madv status in the meantime
> 		 * and may have re-used the BO. Before purging the BO we
> 		 * need to make sure
> 		 * - it is still marked as DONTNEED
> 		 * - it has not been re-inserted in the purgeable list
> 		 * - it is not used by HW blocks
> 		 * If one of these conditions is not true, just skip the
> 		 * entry.
> 		 */
> 		if (bo->madv == VC4_MADV_DONTNEED &&
> 		    list_empty(&bo->size_list) &&
> 		    !refcount_read(&bo->usecnt)) {
> 			purged_size = bo->base.base.size;
> 			vc4_bo_purge(obj);
> 		}

Sounds good.  If we can get the respin by tomorrow morning my time, I
think we can get it merged for 4.15.

>> > @@ -639,9 +670,6 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
>> >   * The command validator needs to reference BOs by their index within
>> >   * the submitted job's BO list.  This does the validation of the job's
>> >   * BO list and reference counting for the lifetime of the job.
>> > - *
>> > - * Note that this function doesn't need to unreference the BOs on
>> > - * failure, because that will happen at vc4_complete_exec() time.
>> >   */
>> >  static int
>> >  vc4_cl_lookup_bos(struct drm_device *dev,
>> > @@ -693,16 +721,47 @@ vc4_cl_lookup_bos(struct drm_device *dev,
>> >  			DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
>> >  				  i, handles[i]);
>> >  			ret = -EINVAL;
>> > -			spin_unlock(&file_priv->table_lock);
>> > -			goto fail;
>> > +			break;
>> >  		}
>> > +
>> >  		drm_gem_object_get(bo);
>> >  		exec->bo[i] = (struct drm_gem_cma_object *)bo;
>> >  	}
>> >  	spin_unlock(&file_priv->table_lock);
>> >  
>> > +	if (ret)
>> > +		goto fail_put_bo;
>> > +
>> > +	for (i = 0; i < exec->bo_count; i++) {
>> > +		ret = vc4_bo_inc_usecnt(to_vc4_bo(&exec->bo[i]->base));
>> > +		if (ret)
>> > +			goto fail_dec_usecnt;
>> > +	}
>> > +
>> > +	kvfree(handles);
>> > +	return 0;
>> > +
>> > +fail_dec_usecnt:
>> > +	/* Decrease usecnt on acquired objects.
>> > +	 * We cannot rely on  vc4_complete_exec() to release resources here,
>> > +	 * because vc4_complete_exec() has no information about which BO has
>> > +	 * had its ->usecnt incremented.
>> > +	 * To make things easier we just free everything explicitly and set
>> > +	 * exec->bo to NULL so that vc4_complete_exec() skips the 'BO release'
>> > +	 * step.
>> > +	 */
>> > +	for (i-- ; i >= 0; i--)
>> > +		vc4_bo_dec_usecnt(to_vc4_bo(&exec->bo[i]->base));
>> > +
>> > +fail_put_bo:
>> > +	/* Release any reference to acquired objects. */
>> > +	for (i = 0; i < exec->bo_count && exec->bo[i]; i++)
>> > +		drm_gem_object_put(&exec->bo[i]->base);  
>> 
>> I think this should be drm_gem_object_put_unlocked() -- we're not
>> holding struct_mutex, and don't use it in this driver.
>
> Right, I'll change that.
>
>> You can safely
>> call it with NULL, but I'm fine either way.
>
> But I can't dereference a NULL pointer: I'm passing &exec->bo[i]->base
> to drm_gem_object_put(), and if exec->bo[i] is NULL => PANIC!

Ah, the assumption was that base is the first element, so
&((vc4_bo)NULL)->base == NULL.