Re: [PATCH] drm/amdgpu: improve debug VRAM access performance using sdma

From: "Christian König" <christian.koenig@amd.com>
To: "Kim, Jonathan" <Jonathan.Kim@amd.com>,
	"amd-gfx@lists.freedesktop.org" <amd-gfx@lists.freedesktop.org>
Cc: "Kuehling, Felix" <Felix.Kuehling@amd.com>
Subject: Re: [PATCH] drm/amdgpu: improve debug VRAM access performance using sdma
Date: Wed, 12 Jan 2022 16:12:39 +0100	[thread overview]
Message-ID: <08f4a6e7-9084-def3-43a7-bfb19c5e649d@amd.com> (raw)
In-Reply-To: <DM6PR12MB31631B4FF1C6F80EC5B3F58685529@DM6PR12MB3163.namprd12.prod.outlook.com>

Yeah, that's basically my fault.

I haven't even worked myself through all the mails which piled up during 
the xmas break :(

Christian.

Am 12.01.22 um 15:21 schrieb Kim, Jonathan:
> [Public]
>
> Thanks Christian.  I've already merged based on Felix's review.
> I'll send your suggested cleanup for review out soon.
>
> Jon
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: January 12, 2022 2:33 AM
>> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: improve debug VRAM access
>> performance using sdma
>>
>> Am 04.01.22 um 20:12 schrieb Jonathan Kim:
>>> For better performance during VRAM access for debugged processes, do
>>> read/write copies over SDMA.
>>>
>>> In order to fulfill post mortem debugging on a broken device, fallback
>>> to stable MMIO access when gpu recovery is disabled or when job
>>> submission time outs are set to max.  Failed SDMA access should
>>> automatically fall back to MMIO access.
>>>
>>> Use a pre-allocated GTT bounce buffer pre-mapped into GART to avoid
>>> page-table updates and TLB flushes on access.
>>>
>>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 78
>> +++++++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  5 +-
>>>    2 files changed, 82 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 367abed1d6e6..512df4c09772 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -48,6 +48,7 @@
>>>    #include <drm/ttm/ttm_range_manager.h>
>>>
>>>    #include <drm/amdgpu_drm.h>
>>> +#include <drm/drm_drv.h>
>>>
>>>    #include "amdgpu.h"
>>>    #include "amdgpu_object.h"
>>> @@ -1429,6 +1430,70 @@ static void
>> amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos,
>>>      }
>>>    }
>>>
>>> +static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object
>> *bo,
>>> +                                   unsigned long offset, void *buf, int
>> len, int write) {
>>> +   struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
>>> +   struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
>>> +   struct amdgpu_job *job;
>>> +   struct dma_fence *fence;
>>> +   uint64_t src_addr, dst_addr;
>>> +   unsigned int num_dw;
>>> +   int r, idx;
>>> +
>>> +   if (len != PAGE_SIZE)
>>> +           return -EINVAL;
>>> +
>>> +   if (!adev->mman.sdma_access_ptr)
>>> +           return -EACCES;
>>> +
>>> +   r = drm_dev_enter(adev_to_drm(adev), &idx);
>>> +   if (r)
>>> +           return r;
>>> +
>>> +   if (write)
>>> +           memcpy(adev->mman.sdma_access_ptr, buf, len);
>>> +
>>> +   num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
>>> +   r = amdgpu_job_alloc_with_ib(adev, num_dw * 4,
>> AMDGPU_IB_POOL_DELAYED, &job);
>>> +   if (r)
>>> +           goto out;
>>> +
>>> +   src_addr = write ? amdgpu_bo_gpu_offset(adev-
>>> mman.sdma_access_bo) :
>>> +                   amdgpu_bo_gpu_offset(abo);
>>> +   dst_addr = write ? amdgpu_bo_gpu_offset(abo) :
>>> +                   amdgpu_bo_gpu_offset(adev-
>>> mman.sdma_access_bo);
>> I suggest to write this as
>>
>> src_addr = a;
>> dst_addr = b;
>> if (write)
>>       swap(src_addr, dst_addr);
>>
>> This way we are not duplicating getting the different offsets.
>>
>>> +   amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
>>> +PAGE_SIZE, false);
>>> +
>>> +   amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job-
>>> ibs[0]);
>>> +   WARN_ON(job->ibs[0].length_dw > num_dw);
>>> +
>>> +   r = amdgpu_job_submit(job, &adev->mman.entity,
>> AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
>>> +   if (r) {
>>> +           amdgpu_job_free(job);
>>> +           goto out;
>>> +   }
>>> +
>>> +   if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout))
>>> +           r = -ETIMEDOUT;
>>> +   dma_fence_put(fence);
>>> +
>>> +   if (!(r || write))
>>> +           memcpy(buf, adev->mman.sdma_access_ptr, len);
>>> +out:
>>> +   drm_dev_exit(idx);
>>> +   return r;
>>> +}
>>> +
>>> +static inline bool amdgpu_ttm_allow_post_mortem_debug(struct
>>> +amdgpu_device *adev) {
>>> +   return amdgpu_gpu_recovery == 0 ||
>>> +           adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
>>> +           adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
>>> +           adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
>>> +           adev->video_timeout == MAX_SCHEDULE_TIMEOUT; }
>> This should probably be inside amdgpu_device.c
>>
>>> +
>>>    /**
>>>     * amdgpu_ttm_access_memory - Read or Write memory that backs a
>> buffer object.
>>>     *
>>> @@ -1453,6 +1518,10 @@ static int amdgpu_ttm_access_memory(struct
>> ttm_buffer_object *bo,
>>>      if (bo->resource->mem_type != TTM_PL_VRAM)
>>>              return -EIO;
>>>
>>> +   if (!amdgpu_ttm_allow_post_mortem_debug(adev) &&
>>> +                   !amdgpu_ttm_access_memory_sdma(bo, offset,
>> buf, len, write))
>>> +           return len;
>>> +
>>>      amdgpu_res_first(bo->resource, offset, len, &cursor);
>>>      while (cursor.remaining) {
>>>              size_t count, size = cursor.size;
>>> @@ -1793,6 +1862,12 @@ int amdgpu_ttm_init(struct amdgpu_device
>> *adev)
>>>              return r;
>>>      }
>>>
>>> +   if (amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
>>> +                           AMDGPU_GEM_DOMAIN_GTT,
>>> +                           &adev->mman.sdma_access_bo, NULL,
>>> +                           adev->mman.sdma_access_ptr))
>>> +           DRM_WARN("Debug VRAM access will use slowpath MM
>> access\n");
>>> +
>>>      return 0;
>>>    }
>>>
>>> @@ -1823,6 +1898,9 @@ void amdgpu_ttm_fini(struct amdgpu_device
>> *adev)
>>>      ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_OA);
>>>      ttm_device_fini(&adev->mman.bdev);
>>>      adev->mman.initialized = false;
>>> +   if (adev->mman.sdma_access_ptr)
>> You can drop that if. Free functions can usually take a NULL pointer.
>>
>> Apart from those nit picks looks good to me as well.
>>
>> Regards,
>> Christian.
>>
>>> +           amdgpu_bo_free_kernel(&adev->mman.sdma_access_bo,
>> NULL,
>>> +                                   &adev->mman.sdma_access_ptr);
>>>      DRM_INFO("amdgpu: ttm finalized\n");
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 91a087f9dc7c..b0116c4a768f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -98,6 +98,10 @@ struct amdgpu_mman {
>>>      u64             fw_vram_usage_size;
>>>      struct amdgpu_bo        *fw_vram_usage_reserved_bo;
>>>      void            *fw_vram_usage_va;
>>> +
>>> +   /* PAGE_SIZE'd BO for process memory r/w over SDMA. */
>>> +   struct amdgpu_bo        *sdma_access_bo;
>>> +   void                    *sdma_access_ptr;
>>>    };
>>>
>>>    struct amdgpu_copy_mem {
>>> @@ -193,5 +197,4 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct
>> amdgpu_device *adev, struct ttm_tt *ttm,
>>>    int amdgpu_ttm_evict_resources(struct amdgpu_device *adev, int
>>> mem_type);
>>>
>>>    void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev);
>>> -
>>>    #endif