* [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4 @ 2019-04-30 3:16 Evan Quan [not found] ` <20190430031619.7906-1-evan.quan-5C7GfCeVMHo@public.gmane.org> 0 siblings, 1 reply; 4+ messages in thread From: Evan Quan @ 2019-04-30 3:16 UTC (permalink / raw) To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW Cc: Alexander.Deucher-5C7GfCeVMHo, Wentao.Lou-5C7GfCeVMHo, Evan Quan, michel.daenzer-5C7GfCeVMHo, Christian.Koenig-5C7GfCeVMHo Every ring type can have its own timeout setting. - V2: update lockup_timeout parameter format and cosmetic fixes - V3: invalidate 0 and negative values - V4: update lockup_timeout parameter format Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e Signed-off-by: Evan Quan <evan.quan@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 79 ++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 35 ++++++++-- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- 5 files changed, 121 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f6965b9403eb..c9b44b8c1969 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority; extern int amdgpu_hw_i2c; extern int amdgpu_pcie_gen2; extern int amdgpu_msi; -extern int amdgpu_lockup_timeout; extern int amdgpu_dpm; extern int amdgpu_fw_load_type; extern int amdgpu_aspm; @@ -428,6 +427,7 @@ struct amdgpu_fpriv { }; int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv); +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev); int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned size, struct amdgpu_ib *ib); @@ -1001,6 +1001,11 @@ struct amdgpu_device { struct work_struct xgmi_reset_work; bool in_baco_reset; + + long gfx_timeout; + long sdma_timeout; + long video_timeout; + 
long compute_timeout; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 80bf604019b1..b11af38a0238 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) * Validates certain module parameters and updates * the associated values used by the driver (all asics). */ -static void amdgpu_device_check_arguments(struct amdgpu_device *adev) +static int amdgpu_device_check_arguments(struct amdgpu_device *adev) { + int ret = 0; + if (amdgpu_sched_jobs < 4) { dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", amdgpu_sched_jobs); @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_vram_page_split = 1024; } - if (amdgpu_lockup_timeout == 0) { - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n"); - amdgpu_lockup_timeout = 10000; + ret = amdgpu_device_get_job_timeout_settings(adev); + if (ret) { + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); + return ret; } adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96); + + return ret; } /** @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->lock_reset); mutex_init(&adev->virt.dpm_mutex); - amdgpu_device_check_arguments(adev); + r = amdgpu_device_check_arguments(adev); + if (r) + return r; spin_lock_init(&adev->mmio_idx_lock); spin_lock_init(&adev->smc_idx_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 71df27cd03de..609c7af8a3f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -83,6 +83,8 @@ #define AMDGPU_VERSION 
"19.10.9.418" +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 + int amdgpu_vram_limit = 0; int amdgpu_vis_vram_limit = 0; int amdgpu_gart_size = -1; /* auto */ @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0; int amdgpu_hw_i2c = 0; int amdgpu_pcie_gen2 = -1; int amdgpu_msi = -1; -int amdgpu_lockup_timeout = 10000; +char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; int amdgpu_dpm = -1; int amdgpu_fw_load_type = -1; int amdgpu_aspm = -1; @@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(msi, amdgpu_msi, int, 0444); /** - * DOC: lockup_timeout (int) - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be adjusted to 10000. - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 10000. + * DOC: lockup_timeout (string) + * Set GPU scheduler timeout value in ms. + * + * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is, one or + * multiple values can be specified. 0 and negative values are treated as invalid and are + * replaced by the default timeout. + * - With one value specified, the setting will apply to all non-compute jobs. + * - With multiple values specified, the first one will be for GFX. The second one is for Compute. + * And the third and fourth ones are for SDMA and Video. + * By default (with no lockup_timeout settings), the timeout for all non-compute (GFX, SDMA and Video) + * jobs is 10000, and there is no timeout enforced on compute jobs.
*/ -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > 0 (default 10000)"); -module_param_named(lockup_timeout, amdgpu_lockup_timeout, int, 0444); +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms (default: 10000 for non-compute jobs and no timeout for compute jobs), " + "format is [Non-Compute] or [GFX,Compute,SDMA,Video]"); +module_param_string(lockup_timeout, amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); /** * DOC: dpm (int) @@ -1307,6 +1318,62 @@ int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv) return 0; } +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) +{ + char *input = amdgpu_lockup_timeout; + char *timeout_setting = NULL; + int index = 0; + long timeout; + int ret = 0; + + /* + * By default timeout for non compute jobs is 10000. + * And there is no timeout enforced on compute jobs. + */ + adev->gfx_timeout = adev->sdma_timeout = adev->video_timeout = 10000; + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; + + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { + while ((timeout_setting = strsep(&input, ",")) && + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { + ret = kstrtol(timeout_setting, 0, &timeout); + if (ret) + return ret; + + /* Invalidate 0 and negative values */ + if (timeout <= 0) { + index++; + continue; + } + + switch (index++) { + case 0: + adev->gfx_timeout = timeout; + break; + case 1: + adev->compute_timeout = timeout; + break; + case 2: + adev->sdma_timeout = timeout; + break; + case 3: + adev->video_timeout = timeout; + break; + default: + break; + } + } + /* + * There is only one value specified and + * it should apply to all non-compute jobs. 
+ */ + if (index == 1) + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; + } + + return ret; +} + static bool amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned int pipe, bool in_vblank_irq, int *vpos, int *hpos, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index d19ad34bef75..16b7e3a22e89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -436,9 +436,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, unsigned num_hw_submission) { + struct amdgpu_device *adev = ring->adev; long timeout; int r; + if (!adev) + return -EINVAL; + /* Check that num_hw_submission is a power of two */ if ((num_hw_submission & (num_hw_submission - 1)) != 0) return -EINVAL; @@ -465,12 +469,31 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, /* No need to setup the GPU scheduler for KIQ ring */ if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) { - /* for non-sriov case, no timeout enforce on compute ring */ - if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) - && !amdgpu_sriov_vf(ring->adev)) - timeout = MAX_SCHEDULE_TIMEOUT; - else - timeout = msecs_to_jiffies(amdgpu_lockup_timeout); + switch (ring->funcs->type) { + case AMDGPU_RING_TYPE_GFX: + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_COMPUTE: + /* + * For the non-SR-IOV case, no timeout is + * enforced on the compute ring by default, unless + * the user specifies a timeout for the compute ring.
+ * + * For the SR-IOV case, always use the same + * timeout as the gfx ring + */ + if (!amdgpu_sriov_vf(ring->adev)) + timeout = adev->compute_timeout; + else + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_SDMA: + timeout = adev->sdma_timeout; + break; + default: + timeout = adev->video_timeout; + break; + } r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, num_hw_submission, amdgpu_job_hang_limit, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 8dbad496b29f..089952a1e6b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -343,7 +343,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) - && amdgpu_lockup_timeout == MAX_SCHEDULE_TIMEOUT) + && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) amdgpu_device_gpu_recover(adev, NULL); } -- 2.21.0 _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply related [flat|nested] 4+ messages in thread
[parent not found: <20190430031619.7906-1-evan.quan-5C7GfCeVMHo@public.gmane.org>]
* Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4 [not found] ` <20190430031619.7906-1-evan.quan-5C7GfCeVMHo@public.gmane.org> @ 2019-04-30 3:20 ` Alex Deucher [not found] ` <CADnq5_Mw1h2gjQH671wyunQb8u4GshDH0Q8h5AqoC7d6ds7Oyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 0 siblings, 1 reply; 4+ messages in thread From: Alex Deucher @ 2019-04-30 3:20 UTC (permalink / raw) To: Evan Quan Cc: Deucher, Alexander, wentalou, Daenzer, Michel, Christian Koenig, amd-gfx list On Mon, Apr 29, 2019 at 11:16 PM Evan Quan <evan.quan@amd.com> wrote: > > Every ring type can have its own timeout setting. > > - V2: update lockup_timeout parameter format and cosmetic fixes > - V3: invalidate 0 and negative values > - V4: update lockup_timeout parameter format > > Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e > Signed-off-by: Evan Quan <evan.quan@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 79 ++++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 35 ++++++++-- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- > 5 files changed, 121 insertions(+), 19 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index f6965b9403eb..c9b44b8c1969 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority; > extern int amdgpu_hw_i2c; > extern int amdgpu_pcie_gen2; > extern int amdgpu_msi; > -extern int amdgpu_lockup_timeout; > extern int amdgpu_dpm; > extern int amdgpu_fw_load_type; > extern int amdgpu_aspm; > @@ -428,6 +427,7 @@ struct amdgpu_fpriv { > }; > > int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv); > +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev); > > int amdgpu_ib_get(struct amdgpu_device 
*adev, struct amdgpu_vm *vm, > unsigned size, struct amdgpu_ib *ib); > @@ -1001,6 +1001,11 @@ struct amdgpu_device { > struct work_struct xgmi_reset_work; > > bool in_baco_reset; > + > + long gfx_timeout; > + long sdma_timeout; > + long video_timeout; > + long compute_timeout; > }; > > static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 80bf604019b1..b11af38a0238 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) > * Validates certain module parameters and updates > * the associated values used by the driver (all asics). > */ > -static void amdgpu_device_check_arguments(struct amdgpu_device *adev) > +static int amdgpu_device_check_arguments(struct amdgpu_device *adev) > { > + int ret = 0; > + > if (amdgpu_sched_jobs < 4) { > dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", > amdgpu_sched_jobs); > @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct amdgpu_device *adev) > amdgpu_vram_page_split = 1024; > } > > - if (amdgpu_lockup_timeout == 0) { > - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n"); > - amdgpu_lockup_timeout = 10000; > + ret = amdgpu_device_get_job_timeout_settings(adev); > + if (ret) { > + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); > + return ret; > } > > adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); > amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96); > + > + return ret; > } > > /** > @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, > mutex_init(&adev->lock_reset); > mutex_init(&adev->virt.dpm_mutex); > > - amdgpu_device_check_arguments(adev); > + r = amdgpu_device_check_arguments(adev); > + if (r) > + return r; > > 
spin_lock_init(&adev->mmio_idx_lock); > spin_lock_init(&adev->smc_idx_lock); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 71df27cd03de..609c7af8a3f0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -83,6 +83,8 @@ > > #define AMDGPU_VERSION "19.10.9.418" > > +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 > + > int amdgpu_vram_limit = 0; > int amdgpu_vis_vram_limit = 0; > int amdgpu_gart_size = -1; /* auto */ > @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0; > int amdgpu_hw_i2c = 0; > int amdgpu_pcie_gen2 = -1; > int amdgpu_msi = -1; > -int amdgpu_lockup_timeout = 10000; > +char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; > int amdgpu_dpm = -1; > int amdgpu_fw_load_type = -1; > int amdgpu_aspm = -1; > @@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); > module_param_named(msi, amdgpu_msi, int, 0444); > > /** > - * DOC: lockup_timeout (int) > - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be adjusted to 10000. > - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 10000. > + * DOC: lockup_timeout (string) > + * Set GPU scheduler timeout value in ms. > + * > + * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is there can be one or > + * multiple values specified. 0 and negative values are invalidated. They will be adjusted > + * to default timeout. > + * - With one value specified, the setting will apply to all non-compute jobs. > + * - With multiple values specified, the first one will be for GFX. The second one is for Compute. > + * And the third and fourth ones are for SDMA and Video. > + * By default(with no lockup_timeout settings), the timeout for all non-compute(GFX, SDMA and Video) > + * jobs is 10000. And there is no timeout enforced on compute jobs. 
> */ > -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > 0 (default 10000)"); > -module_param_named(lockup_timeout, amdgpu_lockup_timeout, int, 0444); > +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms (default: 10000 for non-compute jobs and no timeout for compute jobs), " > + "format is [Non-Compute] or [GFX,Compute,SDMA,Video]"); > +module_param_string(lockup_timeout, amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); > > /** > * DOC: dpm (int) > @@ -1307,6 +1318,62 @@ int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv) > return 0; > } > > +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) > +{ > + char *input = amdgpu_lockup_timeout; > + char *timeout_setting = NULL; > + int index = 0; > + long timeout; > + int ret = 0; > + > + /* > + * By default timeout for non compute jobs is 10000. > + * And there is no timeout enforced on compute jobs. > + */ > + adev->gfx_timeout = adev->sdma_timeout = adev->video_timeout = 10000; > + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; > + > + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { > + while ((timeout_setting = strsep(&input, ",")) && > + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { > + ret = kstrtol(timeout_setting, 0, &timeout); > + if (ret) > + return ret; > + > + /* Invalidate 0 and negative values */ > + if (timeout <= 0) { > + index++; > + continue; > + } > + > + switch (index++) { > + case 0: > + adev->gfx_timeout = timeout; > + break; > + case 1: > + adev->compute_timeout = timeout; > + break; > + case 2: > + adev->sdma_timeout = timeout; > + break; > + case 3: > + adev->video_timeout = timeout; > + break; > + default: > + break; > + } > + } > + /* > + * There is only one value specified and > + * it should apply to all non-compute jobs. 
> + */ > + if (index == 1) > + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; > + } > + > + return ret; > +} > + > static bool > amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned int pipe, > bool in_vblank_irq, int *vpos, int *hpos, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > index d19ad34bef75..16b7e3a22e89 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > @@ -436,9 +436,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, > int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, > unsigned num_hw_submission) > { > + struct amdgpu_device *adev = ring->adev; > long timeout; > int r; > > + if (!adev) > + return -EINVAL; > + > /* Check that num_hw_submission is a power of two */ > if ((num_hw_submission & (num_hw_submission - 1)) != 0) > return -EINVAL; > @@ -465,12 +469,31 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, > > /* No need to setup the GPU scheduler for KIQ ring */ > if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) { > - /* for non-sriov case, no timeout enforce on compute ring */ > - if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) > - && !amdgpu_sriov_vf(ring->adev)) > - timeout = MAX_SCHEDULE_TIMEOUT; > - else > - timeout = msecs_to_jiffies(amdgpu_lockup_timeout); > + switch (ring->funcs->type) { > + case AMDGPU_RING_TYPE_GFX: > + timeout = adev->gfx_timeout; > + break; > + case AMDGPU_RING_TYPE_COMPUTE: > + /* > + * For non-sriov case, no timeout enforce > + * on compute ring by default. Unless user > + * specifies a timeout for compute ring. 
> + * > + * For sriov case, always use the timeout > + * as gfx ring > + */ > + if (!amdgpu_sriov_vf(ring->adev)) > + timeout = adev->compute_timeout; > + else > + timeout = adev->gfx_timeout; > + break; > + case AMDGPU_RING_TYPE_SDMA: > + timeout = adev->sdma_timeout; > + break; > + default: > + timeout = adev->video_timeout; > + break; > + } > > r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, > num_hw_submission, amdgpu_job_hang_limit, > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 8dbad496b29f..089952a1e6b0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -343,7 +343,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > > /* Trigger recovery for world switch failure if no TDR */ > if (amdgpu_device_should_recover_gpu(adev) > - && amdgpu_lockup_timeout == MAX_SCHEDULE_TIMEOUT) > + && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) > amdgpu_device_gpu_recover(adev, NULL); > } > > -- > 2.21.0 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 4+ messages in thread
[parent not found: <CADnq5_Mw1h2gjQH671wyunQb8u4GshDH0Q8h5AqoC7d6ds7Oyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4 [not found] ` <CADnq5_Mw1h2gjQH671wyunQb8u4GshDH0Q8h5AqoC7d6ds7Oyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2019-04-30 9:15 ` Christian König [not found] ` <92d820e4-1c93-ed76-5abd-c07cfdfebee7-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 0 siblings, 1 reply; 4+ messages in thread From: Christian König @ 2019-04-30 9:15 UTC (permalink / raw) To: Alex Deucher, Evan Quan Cc: Deucher, Alexander, wentalou, Daenzer, Michel, Christian Koenig, amd-gfx list Am 30.04.19 um 05:20 schrieb Alex Deucher: > On Mon, Apr 29, 2019 at 11:16 PM Evan Quan <evan.quan@amd.com> wrote: >> Every ring type can have its own timeout setting. >> >> - V2: update lockup_timeout parameter format and cosmetic fixes >> - V3: invalidate 0 and negative values >> - V4: update lockup_timeout parameter format >> >> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e >> Signed-off-by: Evan Quan <evan.quan@amd.com> > Reviewed-by: Alex Deucher <alexander.deucher@amd.com> One more issue below, with that fixed the patch is Reviewed-by: Christian König <christian.koenig@amd.com> > >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 79 ++++++++++++++++++++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 35 ++++++++-- >> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- >> 5 files changed, 121 insertions(+), 19 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index f6965b9403eb..c9b44b8c1969 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority; >> extern int amdgpu_hw_i2c; >> extern int amdgpu_pcie_gen2; >> extern int amdgpu_msi; >> -extern int amdgpu_lockup_timeout; >> extern int amdgpu_dpm; >> extern int amdgpu_fw_load_type; >> extern int amdgpu_aspm; >> @@ -428,6 +427,7 @@ 
struct amdgpu_fpriv { >> }; >> >> int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv); >> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev); >> >> int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm, >> unsigned size, struct amdgpu_ib *ib); >> @@ -1001,6 +1001,11 @@ struct amdgpu_device { >> struct work_struct xgmi_reset_work; >> >> bool in_baco_reset; >> + >> + long gfx_timeout; >> + long sdma_timeout; >> + long video_timeout; >> + long compute_timeout; >> }; >> >> static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 80bf604019b1..b11af38a0238 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) >> * Validates certain module parameters and updates >> * the associated values used by the driver (all asics). 
>> */ >> -static void amdgpu_device_check_arguments(struct amdgpu_device *adev) >> +static int amdgpu_device_check_arguments(struct amdgpu_device *adev) >> { >> + int ret = 0; >> + >> if (amdgpu_sched_jobs < 4) { >> dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", >> amdgpu_sched_jobs); >> @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct amdgpu_device *adev) >> amdgpu_vram_page_split = 1024; >> } >> >> - if (amdgpu_lockup_timeout == 0) { >> - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n"); >> - amdgpu_lockup_timeout = 10000; >> + ret = amdgpu_device_get_job_timeout_settings(adev); >> + if (ret) { >> + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); >> + return ret; >> } >> >> adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); >> amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96); >> + >> + return ret; >> } >> >> /** >> @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, >> mutex_init(&adev->lock_reset); >> mutex_init(&adev->virt.dpm_mutex); >> >> - amdgpu_device_check_arguments(adev); >> + r = amdgpu_device_check_arguments(adev); >> + if (r) >> + return r; >> >> spin_lock_init(&adev->mmio_idx_lock); >> spin_lock_init(&adev->smc_idx_lock); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index 71df27cd03de..609c7af8a3f0 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -83,6 +83,8 @@ >> >> #define AMDGPU_VERSION "19.10.9.418" >> >> +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 >> + >> int amdgpu_vram_limit = 0; >> int amdgpu_vis_vram_limit = 0; >> int amdgpu_gart_size = -1; /* auto */ >> @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0; >> int amdgpu_hw_i2c = 0; >> int amdgpu_pcie_gen2 = -1; >> int amdgpu_msi = -1; >> -int amdgpu_lockup_timeout = 10000; >> +char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; >> int 
amdgpu_dpm = -1; >> int amdgpu_fw_load_type = -1; >> int amdgpu_aspm = -1; >> @@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); >> module_param_named(msi, amdgpu_msi, int, 0444); >> >> /** >> - * DOC: lockup_timeout (int) >> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be adjusted to 10000. >> - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 10000. >> + * DOC: lockup_timeout (string) >> + * Set GPU scheduler timeout value in ms. >> + * >> + * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is there can be one or >> + * multiple values specified. 0 and negative values are invalidated. They will be adjusted >> + * to default timeout. >> + * - With one value specified, the setting will apply to all non-compute jobs. >> + * - With multiple values specified, the first one will be for GFX. The second one is for Compute. >> + * And the third and fourth ones are for SDMA and Video. >> + * By default(with no lockup_timeout settings), the timeout for all non-compute(GFX, SDMA and Video) >> + * jobs is 10000. And there is no timeout enforced on compute jobs. 
>> */ >> -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > 0 (default 10000)"); >> -module_param_named(lockup_timeout, amdgpu_lockup_timeout, int, 0444); >> +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms (default: 10000 for non-compute jobs and no timeout for compute jobs), " >> + "format is [Non-Compute] or [GFX,Compute,SDMA,Video]"); >> +module_param_string(lockup_timeout, amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); >> >> /** >> * DOC: dpm (int) >> @@ -1307,6 +1318,62 @@ int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv) >> return 0; >> } >> >> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) >> +{ >> + char *input = amdgpu_lockup_timeout; >> + char *timeout_setting = NULL; >> + int index = 0; >> + long timeout; >> + int ret = 0; >> + >> + /* >> + * By default timeout for non compute jobs is 10000. >> + * And there is no timeout enforced on compute jobs. >> + */ >> + adev->gfx_timeout = adev->sdma_timeout = adev->video_timeout = 10000; >> + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; >> + >> + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { >> + while ((timeout_setting = strsep(&input, ",")) && >> + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { >> + ret = kstrtol(timeout_setting, 0, &timeout); >> + if (ret) >> + return ret; >> + >> + /* Invalidate 0 and negative values */ >> + if (timeout <= 0) { >> + index++; >> + continue; >> + } Negative values are perfectly valid and just mean infinite timeout. Take a look at the msecs_to_jiffies() implementation. Christian. 
>> + >> + switch (index++) { >> + case 0: >> + adev->gfx_timeout = timeout; >> + break; >> + case 1: >> + adev->compute_timeout = timeout; >> + break; >> + case 2: >> + adev->sdma_timeout = timeout; >> + break; >> + case 3: >> + adev->video_timeout = timeout; >> + break; >> + default: >> + break; >> + } >> + } >> + /* >> + * There is only one value specified and >> + * it should apply to all non-compute jobs. >> + */ >> + if (index == 1) >> + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; >> + } >> + >> + return ret; >> +} >> + >> static bool >> amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned int pipe, >> bool in_vblank_irq, int *vpos, int *hpos, >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> index d19ad34bef75..16b7e3a22e89 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> @@ -436,9 +436,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, >> int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, >> unsigned num_hw_submission) >> { >> + struct amdgpu_device *adev = ring->adev; >> long timeout; >> int r; >> >> + if (!adev) >> + return -EINVAL; >> + >> /* Check that num_hw_submission is a power of two */ >> if ((num_hw_submission & (num_hw_submission - 1)) != 0) >> return -EINVAL; >> @@ -465,12 +469,31 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, >> >> /* No need to setup the GPU scheduler for KIQ ring */ >> if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) { >> - /* for non-sriov case, no timeout enforce on compute ring */ >> - if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) >> - && !amdgpu_sriov_vf(ring->adev)) >> - timeout = MAX_SCHEDULE_TIMEOUT; >> - else >> - timeout = msecs_to_jiffies(amdgpu_lockup_timeout); >> + switch (ring->funcs->type) { >> + case AMDGPU_RING_TYPE_GFX: >> + timeout = adev->gfx_timeout; >> + break; >> + case AMDGPU_RING_TYPE_COMPUTE: >> + /* >> + * For 
non-sriov case, no timeout enforce >> + * on compute ring by default. Unless user >> + * specifies a timeout for compute ring. >> + * >> + * For sriov case, always use the timeout >> + * as gfx ring >> + */ >> + if (!amdgpu_sriov_vf(ring->adev)) >> + timeout = adev->compute_timeout; >> + else >> + timeout = adev->gfx_timeout; >> + break; >> + case AMDGPU_RING_TYPE_SDMA: >> + timeout = adev->sdma_timeout; >> + break; >> + default: >> + timeout = adev->video_timeout; >> + break; >> + } >> >> r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, >> num_hw_submission, amdgpu_job_hang_limit, >> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> index 8dbad496b29f..089952a1e6b0 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >> @@ -343,7 +343,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) >> >> /* Trigger recovery for world switch failure if no TDR */ >> if (amdgpu_device_should_recover_gpu(adev) >> - && amdgpu_lockup_timeout == MAX_SCHEDULE_TIMEOUT) >> + && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) >> amdgpu_device_gpu_recover(adev, NULL); >> } >> >> -- >> 2.21.0 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 4+ messages in thread
[parent not found: <92d820e4-1c93-ed76-5abd-c07cfdfebee7-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* RE: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4 [not found] ` <92d820e4-1c93-ed76-5abd-c07cfdfebee7-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> @ 2019-05-05 14:25 ` Quan, Evan 0 siblings, 0 replies; 4+ messages in thread From: Quan, Evan @ 2019-05-05 14:25 UTC (permalink / raw) To: Koenig, Christian, Alex Deucher Cc: Deucher, Alexander, Lou, Wentao, Daenzer, Michel, amd-gfx list Thanks! Just sent out a new patch to address this as original patch was already committed. Regards, Evan > -----Original Message----- > From: Christian König <ckoenig.leichtzumerken@gmail.com> > Sent: 2019年4月30日 17:15 > To: Alex Deucher <alexdeucher@gmail.com>; Quan, Evan > <Evan.Quan@amd.com> > Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Lou, Wentao > <Wentao.Lou@amd.com>; Daenzer, Michel <Michel.Daenzer@amd.com>; > Koenig, Christian <Christian.Koenig@amd.com>; amd-gfx list <amd- > gfx@lists.freedesktop.org> > Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for > every ring type V4 > > [CAUTION: External Email] > > Am 30.04.19 um 05:20 schrieb Alex Deucher: > > On Mon, Apr 29, 2019 at 11:16 PM Evan Quan <evan.quan@amd.com> > wrote: > >> Every ring type can have its own timeout setting. 
> >> > >> - V2: update lockup_timeout parameter format and cosmetic fixes > >> - V3: invalidate 0 and negative values > >> - V4: update lockup_timeout parameter format > >> > >> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e > >> Signed-off-by: Evan Quan <evan.quan@amd.com> > > Reviewed-by: Alex Deucher <alexander.deucher@amd.com> > > One more issue below, with that fixed the patch is Reviewed-by: > Christian König <christian.koenig@amd.com> > > > > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 79 > ++++++++++++++++++++-- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 35 ++++++++-- > >> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- > >> 5 files changed, 121 insertions(+), 19 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> index f6965b9403eb..c9b44b8c1969 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority; > >> extern int amdgpu_hw_i2c; > >> extern int amdgpu_pcie_gen2; > >> extern int amdgpu_msi; > >> -extern int amdgpu_lockup_timeout; > >> extern int amdgpu_dpm; > >> extern int amdgpu_fw_load_type; > >> extern int amdgpu_aspm; > >> @@ -428,6 +427,7 @@ struct amdgpu_fpriv { > >> }; > >> > >> int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv > >> **fpriv); > >> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device > >> +*adev); > >> > >> int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm > *vm, > >> unsigned size, struct amdgpu_ib *ib); @@ -1001,6 > >> +1001,11 @@ struct amdgpu_device { > >> struct work_struct xgmi_reset_work; > >> > >> bool in_baco_reset; > >> + > >> + long gfx_timeout; > >> + long sdma_timeout; > >> + long video_timeout; > >> + long compute_timeout; > >> }; > >> > >> static inline struct amdgpu_device 
*amdgpu_ttm_adev(struct > >> ttm_bo_device *bdev) diff --git > >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> index 80bf604019b1..b11af38a0238 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> @@ -912,8 +912,10 @@ static void > amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) > >> * Validates certain module parameters and updates > >> * the associated values used by the driver (all asics). > >> */ > >> -static void amdgpu_device_check_arguments(struct amdgpu_device > >> *adev) > >> +static int amdgpu_device_check_arguments(struct amdgpu_device > *adev) > >> { > >> + int ret = 0; > >> + > >> if (amdgpu_sched_jobs < 4) { > >> dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", > >> amdgpu_sched_jobs); @@ -958,13 +960,16 @@ > >> static void amdgpu_device_check_arguments(struct amdgpu_device > *adev) > >> amdgpu_vram_page_split = 1024; > >> } > >> > >> - if (amdgpu_lockup_timeout == 0) { > >> - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to > 10000\n"); > >> - amdgpu_lockup_timeout = 10000; > >> + ret = amdgpu_device_get_job_timeout_settings(adev); > >> + if (ret) { > >> + dev_err(adev->dev, "invalid lockup_timeout parameter > syntax\n"); > >> + return ret; > >> } > >> > >> adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, > amdgpu_fw_load_type); > >> amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96); > >> + > >> + return ret; > >> } > >> > >> /** > >> @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device > *adev, > >> mutex_init(&adev->lock_reset); > >> mutex_init(&adev->virt.dpm_mutex); > >> > >> - amdgpu_device_check_arguments(adev); > >> + r = amdgpu_device_check_arguments(adev); > >> + if (r) > >> + return r; > >> > >> spin_lock_init(&adev->mmio_idx_lock); > >> spin_lock_init(&adev->smc_idx_lock); > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> index 71df27cd03de..609c7af8a3f0 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> @@ -83,6 +83,8 @@ > >> > >> #define AMDGPU_VERSION "19.10.9.418" > >> > >> +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 > >> + > >> int amdgpu_vram_limit = 0; > >> int amdgpu_vis_vram_limit = 0; > >> int amdgpu_gart_size = -1; /* auto */ @@ -95,7 +97,7 @@ int > >> amdgpu_disp_priority = 0; > >> int amdgpu_hw_i2c = 0; > >> int amdgpu_pcie_gen2 = -1; > >> int amdgpu_msi = -1; > >> -int amdgpu_lockup_timeout = 10000; > >> +char > amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; > >> int amdgpu_dpm = -1; > >> int amdgpu_fw_load_type = -1; > >> int amdgpu_aspm = -1; > >> @@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = > enable, 0 = disable, -1 = auto)"); > >> module_param_named(msi, amdgpu_msi, int, 0444); > >> > >> /** > >> - * DOC: lockup_timeout (int) > >> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be > adjusted to 10000. > >> - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The > default is 10000. > >> + * DOC: lockup_timeout (string) > >> + * Set GPU scheduler timeout value in ms. > >> + * > >> + * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. > That > >> + is there can be one or > >> + * multiple values specified. 0 and negative values are invalidated. > >> + They will be adjusted > >> + * to default timeout. > >> + * - With one value specified, the setting will apply to all non-compute > jobs. > >> + * - With multiple values specified, the first one will be for GFX. The > second one is for Compute. > >> + * And the third and fourth ones are for SDMA and Video. > >> + * By default(with no lockup_timeout settings), the timeout for all > >> + non-compute(GFX, SDMA and Video) > >> + * jobs is 10000. And there is no timeout enforced on compute jobs. 
> >> */ > >> -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > 0 > >> (default 10000)"); -module_param_named(lockup_timeout, > >> amdgpu_lockup_timeout, int, 0444); > >> +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > (default: 10000 for non-compute jobs and no timeout for compute jobs), " > >> + "format is [Non-Compute] or > >> +[GFX,Compute,SDMA,Video]"); module_param_string(lockup_timeout, > >> +amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); > >> > >> /** > >> * DOC: dpm (int) > >> @@ -1307,6 +1318,62 @@ int amdgpu_file_to_fpriv(struct file *filp, struct > amdgpu_fpriv **fpriv) > >> return 0; > >> } > >> > >> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device > >> +*adev) { > >> + char *input = amdgpu_lockup_timeout; > >> + char *timeout_setting = NULL; > >> + int index = 0; > >> + long timeout; > >> + int ret = 0; > >> + > >> + /* > >> + * By default timeout for non compute jobs is 10000. > >> + * And there is no timeout enforced on compute jobs. > >> + */ > >> + adev->gfx_timeout = adev->sdma_timeout = adev->video_timeout > = 10000; > >> + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; > >> + > >> + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { > >> + while ((timeout_setting = strsep(&input, ",")) && > >> + strnlen(timeout_setting, > AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { > >> + ret = kstrtol(timeout_setting, 0, &timeout); > >> + if (ret) > >> + return ret; > >> + > >> + /* Invalidate 0 and negative values */ > >> + if (timeout <= 0) { > >> + index++; > >> + continue; > >> + } > > Negative values are perfectly valid and just mean infinite timeout. Take a > look at the msecs_to_jiffies() implementation. > > Christian. 
> > >> + > >> + switch (index++) { > >> + case 0: > >> + adev->gfx_timeout = timeout; > >> + break; > >> + case 1: > >> + adev->compute_timeout = timeout; > >> + break; > >> + case 2: > >> + adev->sdma_timeout = timeout; > >> + break; > >> + case 3: > >> + adev->video_timeout = timeout; > >> + break; > >> + default: > >> + break; > >> + } > >> + } > >> + /* > >> + * There is only one value specified and > >> + * it should apply to all non-compute jobs. > >> + */ > >> + if (index == 1) > >> + adev->sdma_timeout = adev->video_timeout = adev- > >gfx_timeout; > >> + } > >> + > >> + return ret; > >> +} > >> + > >> static bool > >> amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned > int pipe, > >> bool in_vblank_irq, int *vpos, int > >> *hpos, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> index d19ad34bef75..16b7e3a22e89 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> @@ -436,9 +436,13 @@ int amdgpu_fence_driver_start_ring(struct > amdgpu_ring *ring, > >> int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, > >> unsigned num_hw_submission) > >> { > >> + struct amdgpu_device *adev = ring->adev; > >> long timeout; > >> int r; > >> > >> + if (!adev) > >> + return -EINVAL; > >> + > >> /* Check that num_hw_submission is a power of two */ > >> if ((num_hw_submission & (num_hw_submission - 1)) != 0) > >> return -EINVAL; > >> @@ -465,12 +469,31 @@ int amdgpu_fence_driver_init_ring(struct > >> amdgpu_ring *ring, > >> > >> /* No need to setup the GPU scheduler for KIQ ring */ > >> if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) { > >> - /* for non-sriov case, no timeout enforce on compute ring */ > >> - if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) > >> - && !amdgpu_sriov_vf(ring->adev)) > >> - timeout = MAX_SCHEDULE_TIMEOUT; > >> - else > >> - timeout = msecs_to_jiffies(amdgpu_lockup_timeout); > >> + switch 
(ring->funcs->type) { > >> + case AMDGPU_RING_TYPE_GFX: > >> + timeout = adev->gfx_timeout; > >> + break; > >> + case AMDGPU_RING_TYPE_COMPUTE: > >> + /* > >> + * For non-sriov case, no timeout enforce > >> + * on compute ring by default. Unless user > >> + * specifies a timeout for compute ring. > >> + * > >> + * For sriov case, always use the timeout > >> + * as gfx ring > >> + */ > >> + if (!amdgpu_sriov_vf(ring->adev)) > >> + timeout = adev->compute_timeout; > >> + else > >> + timeout = adev->gfx_timeout; > >> + break; > >> + case AMDGPU_RING_TYPE_SDMA: > >> + timeout = adev->sdma_timeout; > >> + break; > >> + default: > >> + timeout = adev->video_timeout; > >> + break; > >> + } > >> > >> r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, > >> num_hw_submission, > >> amdgpu_job_hang_limit, diff --git > >> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> index 8dbad496b29f..089952a1e6b0 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> @@ -343,7 +343,7 @@ static void xgpu_ai_mailbox_flr_work(struct > >> work_struct *work) > >> > >> /* Trigger recovery for world switch failure if no TDR */ > >> if (amdgpu_device_should_recover_gpu(adev) > >> - && amdgpu_lockup_timeout == MAX_SCHEDULE_TIMEOUT) > >> + && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) > >> amdgpu_device_gpu_recover(adev, NULL); > >> } > >> > >> -- > >> 2.21.0 > >> > >> _______________________________________________ > >> amd-gfx mailing list > >> amd-gfx@lists.freedesktop.org > >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > _______________________________________________ > > amd-gfx mailing list > > amd-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 4+ messages in 
thread
end of thread, other threads:[~2019-05-05 14:25 UTC | newest] Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2019-04-30 3:16 [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4 Evan Quan [not found] ` <20190430031619.7906-1-evan.quan-5C7GfCeVMHo@public.gmane.org> 2019-04-30 3:20 ` Alex Deucher [not found] ` <CADnq5_Mw1h2gjQH671wyunQb8u4GshDH0Q8h5AqoC7d6ds7Oyg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 2019-04-30 9:15 ` Christian König [not found] ` <92d820e4-1c93-ed76-5abd-c07cfdfebee7-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> 2019-05-05 14:25 ` Quan, Evan
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.