All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] drm/amdkfd: Fix error handling in criu_checkpoint
@ 2022-11-03 23:12 ` Felix Kuehling
  0 siblings, 0 replies; 3+ messages in thread
From: Felix Kuehling @ 2022-11-03 23:12 UTC (permalink / raw)
  To: amd-gfx; +Cc: linux-kernel, Jann Horn, Rajneesh Bhardwaj

Checkpoint BOs last. That way we don't need to close dmabuf FDs if
something else fails later. This avoids problematic access to user mode
memory in the error handling code path.

criu_checkpoint_bos has its own error handling and cleanup that does not
depend on access to user memory.

In the private data, keep BOs before the remaining objects. This is
necessary to restore things in the correct order as restoring events
depends on the events-page BO being restored first.

Fixes: be072b06c739 ("drm/amdkfd: CRIU export BOs as prime dmabuf objects")
Reported-by: Jann Horn <jannh@google.com>
CC: Rajneesh Bhardwaj <Rajneesh.Bhardwaj@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

---

v3: Keep order of private data and restore order the same.
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 34 +++++++++++-------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 5feaba6a77de..6d291aa6386b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1950,7 +1950,7 @@ static int criu_checkpoint(struct file *filep,
 {
 	int ret;
 	uint32_t num_devices, num_bos, num_objects;
-	uint64_t priv_size, priv_offset = 0;
+	uint64_t priv_size, priv_offset = 0, bo_priv_offset;
 
 	if (!args->devices || !args->bos || !args->priv_data)
 		return -EINVAL;
@@ -1994,38 +1994,34 @@ static int criu_checkpoint(struct file *filep,
 	if (ret)
 		goto exit_unlock;
 
-	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
-			    (uint8_t __user *)args->priv_data, &priv_offset);
-	if (ret)
-		goto exit_unlock;
+	/* Leave room for BOs in the private data. They need to be restored
+	 * before events, but we checkpoint them last to simplify the error
+	 * handling.
+	 */
+	bo_priv_offset = priv_offset;
+	priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
 
 	if (num_objects) {
 		ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
 						 &priv_offset);
 		if (ret)
-			goto close_bo_fds;
+			goto exit_unlock;
 
 		ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
 						 &priv_offset);
 		if (ret)
-			goto close_bo_fds;
+			goto exit_unlock;
 
 		ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
 		if (ret)
-			goto close_bo_fds;
+			goto exit_unlock;
 	}
 
-close_bo_fds:
-	if (ret) {
-		/* If IOCTL returns err, user assumes all FDs opened in criu_dump_bos are closed */
-		uint32_t i;
-		struct kfd_criu_bo_bucket *bo_buckets = (struct kfd_criu_bo_bucket *) args->bos;
-
-		for (i = 0; i < num_bos; i++) {
-			if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
-				close_fd(bo_buckets[i].dmabuf_fd);
-		}
-	}
+	/* This must be the last thing in this function that can fail.
+	 * Otherwise we leak dmabuf file descriptors.
+	 */
+	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
+			   (uint8_t __user *)args->priv_data, &bo_priv_offset);
 
 exit_unlock:
 	mutex_unlock(&p->mutex);
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH v3] drm/amdkfd: Fix error handling in criu_checkpoint
@ 2022-11-03 23:12 ` Felix Kuehling
  0 siblings, 0 replies; 3+ messages in thread
From: Felix Kuehling @ 2022-11-03 23:12 UTC (permalink / raw)
  To: amd-gfx; +Cc: Rajneesh Bhardwaj, linux-kernel, Jann Horn

Checkpoint BOs last. That way we don't need to close dmabuf FDs if
something else fails later. This avoids problematic access to user mode
memory in the error handling code path.

criu_checkpoint_bos has its own error handling and cleanup that does not
depend on access to user memory.

In the private data, keep BOs before the remaining objects. This is
necessary to restore things in the correct order as restoring events
depends on the events-page BO being restored first.

Fixes: be072b06c739 ("drm/amdkfd: CRIU export BOs as prime dmabuf objects")
Reported-by: Jann Horn <jannh@google.com>
CC: Rajneesh Bhardwaj <Rajneesh.Bhardwaj@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

---

v3: Keep order of private data and restore order the same.
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 34 +++++++++++-------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 5feaba6a77de..6d291aa6386b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1950,7 +1950,7 @@ static int criu_checkpoint(struct file *filep,
 {
 	int ret;
 	uint32_t num_devices, num_bos, num_objects;
-	uint64_t priv_size, priv_offset = 0;
+	uint64_t priv_size, priv_offset = 0, bo_priv_offset;
 
 	if (!args->devices || !args->bos || !args->priv_data)
 		return -EINVAL;
@@ -1994,38 +1994,34 @@ static int criu_checkpoint(struct file *filep,
 	if (ret)
 		goto exit_unlock;
 
-	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
-			    (uint8_t __user *)args->priv_data, &priv_offset);
-	if (ret)
-		goto exit_unlock;
+	/* Leave room for BOs in the private data. They need to be restored
+	 * before events, but we checkpoint them last to simplify the error
+	 * handling.
+	 */
+	bo_priv_offset = priv_offset;
+	priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
 
 	if (num_objects) {
 		ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
 						 &priv_offset);
 		if (ret)
-			goto close_bo_fds;
+			goto exit_unlock;
 
 		ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
 						 &priv_offset);
 		if (ret)
-			goto close_bo_fds;
+			goto exit_unlock;
 
 		ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
 		if (ret)
-			goto close_bo_fds;
+			goto exit_unlock;
 	}
 
-close_bo_fds:
-	if (ret) {
-		/* If IOCTL returns err, user assumes all FDs opened in criu_dump_bos are closed */
-		uint32_t i;
-		struct kfd_criu_bo_bucket *bo_buckets = (struct kfd_criu_bo_bucket *) args->bos;
-
-		for (i = 0; i < num_bos; i++) {
-			if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
-				close_fd(bo_buckets[i].dmabuf_fd);
-		}
-	}
+	/* This must be the last thing in this function that can fail.
+	 * Otherwise we leak dmabuf file descriptors.
+	 */
+	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
+			   (uint8_t __user *)args->priv_data, &bo_priv_offset);
 
 exit_unlock:
 	mutex_unlock(&p->mutex);
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v3] drm/amdkfd: Fix error handling in criu_checkpoint
  2022-11-03 23:12 ` Felix Kuehling
  (?)
@ 2022-11-04  0:07 ` Bhardwaj, Rajneesh
  -1 siblings, 0 replies; 3+ messages in thread
From: Bhardwaj, Rajneesh @ 2022-11-04  0:07 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx; +Cc: linux-kernel, Jann Horn

This one is more elegant. Looks good to me!

Reviewed-and-tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>

On 11/3/2022 7:12 PM, Felix Kuehling wrote:
> Checkpoint BOs last. That way we don't need to close dmabuf FDs if
> something else fails later. This avoids problematic access to user mode
> memory in the error handling code path.
>
> criu_checkpoint_bos has its own error handling and cleanup that does not
> depend on access to user memory.
>
> In the private data, keep BOs before the remaining objects. This is
> necessary to restore things in the correct order as restoring events
> depends on the events-page BO being restored first.
>
> Fixes: be072b06c739 ("drm/amdkfd: CRIU export BOs as prime dmabuf objects")
> Reported-by: Jann Horn <jannh@google.com>
> CC: Rajneesh Bhardwaj <Rajneesh.Bhardwaj@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
> ---
>
> v3: Keep order of private data and restore order the same.
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 34 +++++++++++-------------
>   1 file changed, 15 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 5feaba6a77de..6d291aa6386b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1950,7 +1950,7 @@ static int criu_checkpoint(struct file *filep,
>   {
>   	int ret;
>   	uint32_t num_devices, num_bos, num_objects;
> -	uint64_t priv_size, priv_offset = 0;
> +	uint64_t priv_size, priv_offset = 0, bo_priv_offset;
>   
>   	if (!args->devices || !args->bos || !args->priv_data)
>   		return -EINVAL;
> @@ -1994,38 +1994,34 @@ static int criu_checkpoint(struct file *filep,
>   	if (ret)
>   		goto exit_unlock;
>   
> -	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> -			    (uint8_t __user *)args->priv_data, &priv_offset);
> -	if (ret)
> -		goto exit_unlock;
> +	/* Leave room for BOs in the private data. They need to be restored
> +	 * before events, but we checkpoint them last to simplify the error
> +	 * handling.
> +	 */
> +	bo_priv_offset = priv_offset;
> +	priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
>   
>   	if (num_objects) {
>   		ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
>   						 &priv_offset);
>   		if (ret)
> -			goto close_bo_fds;
> +			goto exit_unlock;
>   
>   		ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
>   						 &priv_offset);
>   		if (ret)
> -			goto close_bo_fds;
> +			goto exit_unlock;
>   
>   		ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
>   		if (ret)
> -			goto close_bo_fds;
> +			goto exit_unlock;
>   	}
>   
> -close_bo_fds:
> -	if (ret) {
> -		/* If IOCTL returns err, user assumes all FDs opened in criu_dump_bos are closed */
> -		uint32_t i;
> -		struct kfd_criu_bo_bucket *bo_buckets = (struct kfd_criu_bo_bucket *) args->bos;
> -
> -		for (i = 0; i < num_bos; i++) {
> -			if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
> -				close_fd(bo_buckets[i].dmabuf_fd);
> -		}
> -	}
> +	/* This must be the last thing in this function that can fail.
> +	 * Otherwise we leak dmabuf file descriptors.
> +	 */
> +	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> +			   (uint8_t __user *)args->priv_data, &bo_priv_offset);
>   
>   exit_unlock:
>   	mutex_unlock(&p->mutex);

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-11-04  0:07 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-03 23:12 [PATCH v3] drm/amdkfd: Fix error handling in criu_checkpoint Felix Kuehling
2022-11-03 23:12 ` Felix Kuehling
2022-11-04  0:07 ` Bhardwaj, Rajneesh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.