All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Zeng, Oak" <oak.zeng@intel.com>
To: "Brost, Matthew" <matthew.brost@intel.com>,
	"intel-xe@lists.freedesktop.org" <intel-xe@lists.freedesktop.org>
Cc: "Brost, Matthew" <matthew.brost@intel.com>
Subject: RE: [PATCH v4 01/30] drm/xe: Lock all gpuva ops during VM bind IOCTL
Date: Sun, 10 Mar 2024 17:44:00 +0000	[thread overview]
Message-ID: <SA1PR11MB6991E7ECA4A2ADD89DA05C1792252@SA1PR11MB6991.namprd11.prod.outlook.com> (raw)
In-Reply-To: <20240308050806.577176-2-matthew.brost@intel.com>



> -----Original Message-----
> From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew
> Brost
> Sent: Friday, March 8, 2024 12:08 AM
> To: intel-xe@lists.freedesktop.org
> Cc: Brost, Matthew <matthew.brost@intel.com>
> Subject: [PATCH v4 01/30] drm/xe: Lock all gpuva ops during VM bind IOCTL
> 
> Lock all gpuva ops

Can we have better wording here? It would be clearer to say "Lock all BOs used in gpuva ops".

Or maybe: "Lock the ops by locking and validating all BOs used in the ops."

> and validate all BOs in a single step during the VM
> bind IOCTL. This help with the transition to making all gpuva ops in a
> VM bind IOCTL a single atomic job.

Can you also explain why you want the bind to be an atomic job?

My guess is that a bind IOCTL can end up as a series of operations; if some (but not all) of those operations fail in the middle, it is hard to revert the operations that already succeeded before the failure.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_vm.c | 142 ++++++++++++++++++++++++++-----------
>  1 file changed, 101 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 643b3701a738..3b5dc6de07f7 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -413,19 +413,23 @@ int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
> 
>  #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
> 
> -static void xe_vm_kill(struct xe_vm *vm)
> +static void xe_vm_kill(struct xe_vm *vm, bool unlocked)
>  {
>  	struct xe_exec_queue *q;
> 
>  	lockdep_assert_held(&vm->lock);
> 
> -	xe_vm_lock(vm, false);
> +	if (unlocked)
> +		xe_vm_lock(vm, false);
> +

Can you explain why we need xe_vm_lock in the first place here? 

My understanding is that xe_vm_lock protects GPU page table updates. The kill function below eventually calls into guc_exec_queue_kill, where I don't see any page table operations. So I doubt whether we need the lock in the first place.

>  	vm->flags |= XE_VM_FLAG_BANNED;
>  	trace_xe_vm_kill(vm);
> 
>  	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
>  		q->ops->kill(q);
> -	xe_vm_unlock(vm);
> +
> +	if (unlocked)
> +		xe_vm_unlock(vm);
> 
>  	/* TODO: Inform user the VM is banned */
>  }
> @@ -621,7 +625,7 @@ static void preempt_rebind_work_func(struct work_struct
> *w)
> 
>  	if (err) {
>  		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
> -		xe_vm_kill(vm);
> +		xe_vm_kill(vm, true);
>  	}
>  	up_write(&vm->lock);
> 
> @@ -1831,17 +1835,9 @@ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma
> *vma, struct xe_exec_queue
>  		      u32 num_syncs, bool immediate, bool first_op,
>  		      bool last_op)
>  {
> -	int err;
> -
>  	xe_vm_assert_held(vm);
>  	xe_bo_assert_held(bo);
> 
> -	if (bo && immediate) {
> -		err = xe_bo_validate(bo, vm, true);
> -		if (err)
> -			return err;
> -	}
> -
>  	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
>  			    last_op);
>  }
> @@ -2488,17 +2484,12 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm,
> struct xe_exec_queue *q,
>  	return 0;
>  }
> 
> -static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> -		      struct xe_vma *vma, struct xe_vma_op *op)
> +static int op_execute(struct xe_vm *vm, struct xe_vma *vma,
> +		      struct xe_vma_op *op)
>  {
>  	int err;
> 
>  	lockdep_assert_held_write(&vm->lock);
> -
> -	err = xe_vm_prepare_vma(exec, vma, 1);
> -	if (err)
> -		return err;
> -
>  	xe_vm_assert_held(vm);
>  	xe_bo_assert_held(xe_vma_bo(vma));
> 
> @@ -2579,19 +2570,10 @@ static int op_execute(struct drm_exec *exec, struct
> xe_vm *vm,
>  static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
>  			       struct xe_vma_op *op)
>  {
> -	struct drm_exec exec;
>  	int err;
> 
>  retry_userptr:
> -	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
> -	drm_exec_until_all_locked(&exec) {
> -		err = op_execute(&exec, vm, vma, op);
> -		drm_exec_retry_on_contention(&exec);
> -		if (err)
> -			break;
> -	}
> -	drm_exec_fini(&exec);
> -
> +	err = op_execute(vm, vma, op);
>  	if (err == -EAGAIN) {
>  		lockdep_assert_held_write(&vm->lock);
> 
> @@ -2756,29 +2738,107 @@ static void vm_bind_ioctl_ops_unwind(struct xe_vm
> *vm,
>  	}
>  }
> 
> +static int vma_lock(struct drm_exec *exec, struct xe_vma *vma, bool validate)
> +{
> +	struct xe_bo *bo = xe_vma_bo(vma);
> +	int err = 0;
> +
> +	if (bo) {
> +		if (!bo->vm)
> +			err = drm_exec_prepare_obj(exec, &bo->ttm.base, 1);
> +		if (!err && validate)
> +			err = xe_bo_validate(bo, xe_vma_vm(vma), true);
> +	}
> +
> +	return err;
> +}
> +
> +static int op_lock(struct drm_exec *exec, struct xe_vm *vm,
> +		   struct xe_vma_op *op)
> +{
> +	int err = 0;
> +
> +	switch (op->base.op) {
> +	case DRM_GPUVA_OP_MAP:
> +		err = vma_lock(exec, op->map.vma, !xe_vm_in_fault_mode(vm));
> +		break;
> +	case DRM_GPUVA_OP_REMAP:
> +		err = vma_lock(exec, gpuva_to_vma(op->base.remap.unmap->va),
> +			       false);
> +		if (!err && op->remap.prev)
> +			err = vma_lock(exec, op->remap.prev, true);
> +		if (!err && op->remap.next)
> +			err = vma_lock(exec, op->remap.next, true);
> +		break;
> +	case DRM_GPUVA_OP_UNMAP:
> +		err = vma_lock(exec, gpuva_to_vma(op->base.unmap.va), false);
> +		break;
> +	case DRM_GPUVA_OP_PREFETCH:
> +		err = vma_lock(exec, gpuva_to_vma(op->base.prefetch.va), true);
> +		break;
> +	default:
> +		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> +	}
> +
> +	return err;
> +}
> +
> +static int vm_bind_ioctl_ops_lock(struct drm_exec *exec,
> +				  struct xe_vm *vm,
> +				  struct list_head *ops_list)
> +{
> +	struct xe_vma_op *op;
> +	int err;
> +
> +	err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), 1);
> +	if (err)
> +		return err;
> +
> +	list_for_each_entry(op, ops_list, link) {
> +		err = op_lock(exec, vm, op);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}
> +
>  static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
>  				     struct list_head *ops_list)
>  {
> +	struct drm_exec exec;
>  	struct xe_vma_op *op, *next;
>  	int err;
> 
>  	lockdep_assert_held_write(&vm->lock);
> 
> -	list_for_each_entry_safe(op, next, ops_list, link) {
> -		err = xe_vma_op_execute(vm, op);
> -		if (err) {
> -			drm_warn(&vm->xe->drm, "VM op(%d) failed with %d",
> -				 op->base.op, err);
> -			/*
> -			 * FIXME: Killing VM rather than proper error handling
> -			 */
> -			xe_vm_kill(vm);
> -			return -ENOSPC;
> +	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
> +		      DRM_EXEC_IGNORE_DUPLICATES, 0);
> +	drm_exec_until_all_locked(&exec) {
> +		err = vm_bind_ioctl_ops_lock(&exec, vm, ops_list);
> +		drm_exec_retry_on_contention(&exec);
> +		if (err)
> +			goto unlock;
> +

Do the ops below need to be inside the drm_exec_until_all_locked loop? Once all objects are locked, you can close the drm_exec_until_all_locked loop and then perform the ops below outside of it.

Oak 

> +		list_for_each_entry_safe(op, next, ops_list, link) {
> +			err = xe_vma_op_execute(vm, op);
> +			if (err) {
> +				drm_warn(&vm->xe->drm, "VM op(%d) failed
> with %d",
> +					 op->base.op, err);
> +				/*
> +				 * FIXME: Killing VM rather than proper error
> handling
> +				 */
> +				xe_vm_kill(vm, false);
> +				err = -ENOSPC;
> +				goto unlock;
> +			}
> +			xe_vma_op_cleanup(vm, op);
>  		}
> -		xe_vma_op_cleanup(vm, op);
>  	}
> 
> -	return 0;
> +unlock:
> +	drm_exec_fini(&exec);
> +	return err;
>  }
> 
>  #define SUPPORTED_FLAGS	(DRM_XE_VM_BIND_FLAG_NULL | \
> --
> 2.34.1


  reply	other threads:[~2024-03-10 17:44 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-08  5:07 [PATCH v4 00/30] Refactor VM bind code Matthew Brost
2024-03-08  5:07 ` [PATCH v4 01/30] drm/xe: Lock all gpuva ops during VM bind IOCTL Matthew Brost
2024-03-10 17:44   ` Zeng, Oak [this message]
2024-03-11 19:48     ` Matthew Brost
2024-03-11 22:02       ` Zeng, Oak
2024-03-12  1:29         ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 02/30] drm/xe: Add ops_execute function which returns a fence Matthew Brost
2024-03-22 16:11   ` Zeng, Oak
2024-03-22 17:31     ` Matthew Brost
2024-03-22 19:39       ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 03/30] drm/xe: Move migrate to prefetch to op_lock function Matthew Brost
2024-03-22 17:06   ` Zeng, Oak
2024-03-22 17:36     ` Matthew Brost
2024-03-22 19:45       ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 04/30] drm/xe: Add struct xe_vma_ops abstraction Matthew Brost
2024-03-22 17:13   ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 05/30] drm/xe: Update xe_vm_rebind to use dummy VMA operations Matthew Brost
2024-03-22 21:23   ` Zeng, Oak
2024-03-22 22:51     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 06/30] drm/xe: Simplify VM bind IOCTL error handling and cleanup Matthew Brost
2024-03-25 16:03   ` Zeng, Oak
2024-03-26 18:46     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 07/30] drm/xe: Update pagefaults to use dummy VMA operations Matthew Brost
2024-03-08  5:07 ` [PATCH v4 08/30] drm/xe: s/xe_tile_migrate_engine/xe_tile_migrate_exec_queue Matthew Brost
2024-03-25 16:05   ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 09/30] drm/xe: Add some members to xe_vma_ops Matthew Brost
2024-03-25 16:10   ` Zeng, Oak
2024-03-26 18:47     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 10/30] drm/xe: Add vm_bind_ioctl_ops_install_fences helper Matthew Brost
2024-03-25 16:51   ` Zeng, Oak
2024-03-25 19:34     ` Matthew Brost
2024-03-25 19:44       ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 11/30] drm/xe: Move setting last fence to vm_bind_ioctl_ops_install_fences Matthew Brost
2024-03-25 17:02   ` Zeng, Oak
2024-03-25 19:35     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 12/30] drm/xe: Move ufence check to op_lock Matthew Brost
2024-03-25 20:37   ` Zeng, Oak
2024-03-26 18:49     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 13/30] drm/xe: Move ufence add to vm_bind_ioctl_ops_install_fences Matthew Brost
2024-03-25 20:54   ` Zeng, Oak
2024-03-26 18:54     ` Matthew Brost
2024-03-26 20:59       ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 14/30] drm/xe: Add xe_gt_tlb_invalidation_range and convert PT layer to use this Matthew Brost
2024-03-25 21:35   ` Zeng, Oak
2024-03-26 18:57     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 15/30] drm/xe: Add xe_vm_pgtable_update_op to xe_vma_ops Matthew Brost
2024-03-25 21:58   ` Zeng, Oak
2024-03-26 19:05     ` Matthew Brost
2024-03-27  1:29       ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 16/30] drm/xe: Use ordered WQ for TLB invalidation fences Matthew Brost
2024-03-25 22:30   ` Zeng, Oak
2024-03-26 19:10     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 17/30] drm/xe: Delete PT update selftest Matthew Brost
2024-03-25 22:31   ` Zeng, Oak
2024-03-08  5:07 ` [PATCH v4 18/30] drm/xe: Convert multiple bind ops into single job Matthew Brost
2024-03-27  2:40   ` Zeng, Oak
2024-03-27 19:26     ` Matthew Brost
2024-03-08  5:07 ` [PATCH v4 19/30] drm/xe: Remove old functions defs in xe_pt.h Matthew Brost
2024-03-08  5:07 ` [PATCH v4 20/30] drm/xe: Update PT layer with better error handling Matthew Brost
2024-03-08  5:07 ` [PATCH v4 21/30] drm/xe: Update xe_vm_rebind to return int Matthew Brost
2024-03-08  5:07 ` [PATCH v4 22/30] drm/xe: Move vma rebinding to the drm_exec locking loop Matthew Brost
2024-03-08  5:07 ` [PATCH v4 23/30] drm/xe: Update VM trace events Matthew Brost
2024-03-08  5:08 ` [PATCH v4 24/30] drm/xe: Update clear / populate arguments Matthew Brost
2024-03-08  5:08 ` [PATCH v4 25/30] drm/xe: Add __xe_migrate_update_pgtables_cpu helper Matthew Brost
2024-03-08  5:08 ` [PATCH v4 26/30] drm/xe: CPU binds for jobs Matthew Brost
2024-03-08  5:08 ` [PATCH v4 27/30] drm/xe: Don't use migrate exec queue for page fault binds Matthew Brost
2024-03-08  5:08 ` [PATCH v4 28/30] drm/xe: Add VM bind IOCTL error injection Matthew Brost
2024-03-08  5:08 ` [PATCH v4 29/30] drm/xe/guc: Assert time'd out jobs are not from a VM exec queue Matthew Brost
2024-03-08  5:08 ` [PATCH v4 30/30] drm/xe: Add PT exec queues Matthew Brost
2024-03-08  5:42 ` ✓ CI.Patch_applied: success for Refactor VM bind code (rev5) Patchwork
2024-03-08  5:43 ` ✗ CI.checkpatch: warning " Patchwork
2024-03-08  5:44 ` ✓ CI.KUnit: success " Patchwork
2024-03-08  5:55 ` ✓ CI.Build: " Patchwork
2024-03-08  5:55 ` ✗ CI.Hooks: failure " Patchwork
2024-03-08  5:56 ` ✓ CI.checksparse: success " Patchwork
2024-03-08  6:26 ` ✗ CI.BAT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=SA1PR11MB6991E7ECA4A2ADD89DA05C1792252@SA1PR11MB6991.namprd11.prod.outlook.com \
    --to=oak.zeng@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.