Re: [PATCH 1/5] vfio: Delete vfio_get/put_group from vfio_iommu_group_notifier()

From: Alex Williamson <alex.williamson@redhat.com>
To: Jason Gunthorpe <jgg@nvidia.com>
Cc: Cornelia Huck <cohuck@redhat.com>,
	kvm@vger.kernel.org, Christoph Hellwig <hch@lst.de>,
	"Tian, Kevin" <kevin.tian@intel.com>,
	Liu Yi L <yi.l.liu@intel.com>
Subject: Re: [PATCH 1/5] vfio: Delete vfio_get/put_group from vfio_iommu_group_notifier()
Date: Mon, 4 Oct 2021 16:25:32 -0600	[thread overview]
Message-ID: <20211004162532.3b59ed06.alex.williamson@redhat.com> (raw)
In-Reply-To: <1-v1-fba989159158+2f9b-vfio_group_cdev_jgg@nvidia.com>

On Fri,  1 Oct 2021 20:22:20 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> iommu_group_register_notifier()/iommu_group_unregister_notifier() are
> built using a blocking_notifier_chain which integrates a rwsem. The
> notifier function cannot be running outside its registration.
> 
> When considering how the notifier function interacts with create/destroy
> of the group there are two fringe cases, the notifier starts before
> list_add(&vfio.group_list) and the notifier runs after the kref
> becomes 0.
> 
> Prior to vfio_create_group() unlocking and returning we have
>    container_users == 0
>    device_list == empty
> And this cannot change until the mutex is unlocked.
> 
> After the kref goes to zero we must also have
>    container_users == 0
>    device_list == empty
> 
> Both are required because they are balanced operations and a 0 kref means
> some caller became unbalanced. Add the missing assertion that
> container_users must be zero as well.
> 
> These two facts are important because when checking each operation we see:
> 
> - IOMMU_GROUP_NOTIFY_ADD_DEVICE
>    Empty device_list avoids the WARN_ON in vfio_group_nb_add_dev()
>    0 container_users ends the call
> - IOMMU_GROUP_NOTIFY_BOUND_DRIVER
>    0 container_users ends the call
> 
> Finally, we have IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER, which only deletes
> items from the unbound list. During creation this list is empty, during
> kref == 0 nothing can read this list, and it will be freed soon.
> 
> Since the vfio_group_release() doesn't hold the appropriate lock to
> manipulate the unbound_list and could race with the notifier, move the
> cleanup to directly before the kfree.
> 
> This allows deleting all of the deferred group put code.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/vfio/vfio.c | 89 +++++----------------------------------------
>  1 file changed, 9 insertions(+), 80 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 08b27b64f0f935..32a53cb3598524 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -324,12 +324,20 @@ static void vfio_container_put(struct vfio_container *container)
>  
>  static void vfio_group_unlock_and_free(struct vfio_group *group)
>  {
> +	struct vfio_unbound_dev *unbound, *tmp;
> +
>  	mutex_unlock(&vfio.group_lock);
>  	/*
>  	 * Unregister outside of lock.  A spurious callback is harmless now
>  	 * that the group is no longer in vfio.group_list.
>  	 */

This comment is indirectly referencing the vfio_group_try_get() in the
notifier callback, but as you describe in the commit log, it's actually
the container_users value that prevents this from racing group release
now.  Otherwise, tricky but looks good.  Thanks,

Alex

>  	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
> +
> +	list_for_each_entry_safe(unbound, tmp,
> +				 &group->unbound_list, unbound_next) {
> +		list_del(&unbound->unbound_next);
> +		kfree(unbound);
> +	}
>  	kfree(group);
>  }
>  
> @@ -361,13 +369,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
>  
>  	group->nb.notifier_call = vfio_iommu_group_notifier;
>  
> -	/*
> -	 * blocking notifiers acquire a rwsem around registering and hold
> -	 * it around callback.  Therefore, need to register outside of
> -	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
> -	 * do anything unless it can find the group in vfio.group_list, so
> -	 * no harm in registering early.
> -	 */
>  	ret = iommu_group_register_notifier(iommu_group, &group->nb);
>  	if (ret) {
>  		kfree(group);
> @@ -415,18 +416,12 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
>  static void vfio_group_release(struct kref *kref)
>  {
>  	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
> -	struct vfio_unbound_dev *unbound, *tmp;
>  	struct iommu_group *iommu_group = group->iommu_group;
>  
>  	WARN_ON(!list_empty(&group->device_list));
> +	WARN_ON(atomic_read(&group->container_users));
>  	WARN_ON(group->notifier.head);
>  
> -	list_for_each_entry_safe(unbound, tmp,
> -				 &group->unbound_list, unbound_next) {
> -		list_del(&unbound->unbound_next);
> -		kfree(unbound);
> -	}
> -
>  	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
>  	list_del(&group->vfio_next);
>  	vfio_free_group_minor(group->minor);
> @@ -439,61 +434,12 @@ static void vfio_group_put(struct vfio_group *group)
>  	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
>  }
>  
> -struct vfio_group_put_work {
> -	struct work_struct work;
> -	struct vfio_group *group;
> -};
> -
> -static void vfio_group_put_bg(struct work_struct *work)
> -{
> -	struct vfio_group_put_work *do_work;
> -
> -	do_work = container_of(work, struct vfio_group_put_work, work);
> -
> -	vfio_group_put(do_work->group);
> -	kfree(do_work);
> -}
> -
> -static void vfio_group_schedule_put(struct vfio_group *group)
> -{
> -	struct vfio_group_put_work *do_work;
> -
> -	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
> -	if (WARN_ON(!do_work))
> -		return;
> -
> -	INIT_WORK(&do_work->work, vfio_group_put_bg);
> -	do_work->group = group;
> -	schedule_work(&do_work->work);
> -}
> -
>  /* Assume group_lock or group reference is held */
>  static void vfio_group_get(struct vfio_group *group)
>  {
>  	kref_get(&group->kref);
>  }
>  
> -/*
> - * Not really a try as we will sleep for mutex, but we need to make
> - * sure the group pointer is valid under lock and get a reference.
> - */
> -static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
> -{
> -	struct vfio_group *target = group;
> -
> -	mutex_lock(&vfio.group_lock);
> -	list_for_each_entry(group, &vfio.group_list, vfio_next) {
> -		if (group == target) {
> -			vfio_group_get(group);
> -			mutex_unlock(&vfio.group_lock);
> -			return group;
> -		}
> -	}
> -	mutex_unlock(&vfio.group_lock);
> -
> -	return NULL;
> -}
> -
>  static
>  struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
>  {
> @@ -691,14 +637,6 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
>  	struct device *dev = data;
>  	struct vfio_unbound_dev *unbound;
>  
> -	/*
> -	 * Need to go through a group_lock lookup to get a reference or we
> -	 * risk racing a group being removed.  Ignore spurious notifies.
> -	 */
> -	group = vfio_group_try_get(group);
> -	if (!group)
> -		return NOTIFY_OK;
> -
>  	switch (action) {
>  	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
>  		vfio_group_nb_add_dev(group, dev);
> @@ -749,15 +687,6 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
>  		mutex_unlock(&group->unbound_lock);
>  		break;
>  	}
> -
> -	/*
> -	 * If we're the last reference to the group, the group will be
> -	 * released, which includes unregistering the iommu group notifier.
> -	 * We hold a read-lock on that notifier list, unregistering needs
> -	 * a write-lock... deadlock.  Release our reference asynchronously
> -	 * to avoid that situation.
> -	 */
> -	vfio_group_schedule_put(group);
>  	return NOTIFY_OK;
>  }
>