linux-arm-msm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
@ 2020-10-08 17:09 Akhil P Oommen
  2020-10-08 17:09 ` [PATCH 2/2] drm/msm: Add support for GPU cooling Akhil P Oommen
  2020-10-09 15:05 ` [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Doug Anderson
  0 siblings, 2 replies; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-08 17:09 UTC (permalink / raw)
  To: freedreno, robh, robdclark
  Cc: dri-devel, linux-arm-msm, linux-kernel, jcrouse, mka

Add cooling-cells property and the cooling maps for the gpu tzones
to support GPU cooling.

Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
---
 arch/arm64/boot/dts/qcom/sc7180.dtsi | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi
index d46b383..40d6a28 100644
--- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
@@ -2,7 +2,7 @@
 /*
  * SC7180 SoC device tree source
  *
- * Copyright (c) 2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2019-20, The Linux Foundation. All rights reserved.
  */
 
 #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
@@ -1885,6 +1885,7 @@
 			iommus = <&adreno_smmu 0>;
 			operating-points-v2 = <&gpu_opp_table>;
 			qcom,gmu = <&gmu>;
+			#cooling-cells = <2>;
 
 			interconnects = <&gem_noc MASTER_GFX3D &mc_virt SLAVE_EBI1>;
 			interconnect-names = "gfx-mem";
@@ -3825,16 +3826,16 @@
 		};
 
 		gpuss0-thermal {
-			polling-delay-passive = <0>;
+			polling-delay-passive = <100>;
 			polling-delay = <0>;
 
 			thermal-sensors = <&tsens0 13>;
 
 			trips {
 				gpuss0_alert0: trip-point0 {
-					temperature = <90000>;
+					temperature = <95000>;
 					hysteresis = <2000>;
-					type = "hot";
+					type = "passive";
 				};
 
 				gpuss0_crit: gpuss0_crit {
@@ -3843,19 +3844,26 @@
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpuss0_alert0>;
+					cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		gpuss1-thermal {
-			polling-delay-passive = <0>;
+			polling-delay-passive = <100>;
 			polling-delay = <0>;
 
 			thermal-sensors = <&tsens0 14>;
 
 			trips {
 				gpuss1_alert0: trip-point0 {
-					temperature = <90000>;
+					temperature = <95000>;
 					hysteresis = <2000>;
-					type = "hot";
+					type = "passive";
 				};
 
 				gpuss1_crit: gpuss1_crit {
@@ -3864,6 +3872,13 @@
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpuss0_alert0>;
+					cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		aoss1-thermal {
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] drm/msm: Add support for GPU cooling
  2020-10-08 17:09 [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Akhil P Oommen
@ 2020-10-08 17:09 ` Akhil P Oommen
  2020-10-09 18:36   ` [2/2] " mka
  2020-10-09 15:05 ` [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Doug Anderson
  1 sibling, 1 reply; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-08 17:09 UTC (permalink / raw)
  To: freedreno, robh, robdclark
  Cc: dri-devel, linux-arm-msm, linux-kernel, jcrouse, mka

Register GPU as a devfreq cooling device so that it can be passively
cooled by the thermal framework.

Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
 drivers/gpu/drm/msm/msm_gpu.h |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 55d1648..93ffd66 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -14,6 +14,7 @@
 #include <generated/utsrelease.h>
 #include <linux/string_helpers.h>
 #include <linux/devfreq.h>
+#include <linux/devfreq_cooling.h>
 #include <linux/devcoredump.h>
 #include <linux/sched/task.h>
 
@@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
 	if (IS_ERR(gpu->devfreq.devfreq)) {
 		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
 		gpu->devfreq.devfreq = NULL;
+		return;
 	}
 
 	devfreq_suspend_device(gpu->devfreq.devfreq);
+
+	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
+			gpu->devfreq.devfreq);
+	if (IS_ERR(gpu->cooling)) {
+		DRM_DEV_ERROR(&gpu->pdev->dev,
+				"Couldn't register GPU cooling device\n");
+		gpu->cooling = NULL;
+	}
 }
 
 static int enable_pwrrail(struct msm_gpu *gpu)
@@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 
 	msm_devfreq_init(gpu);
 
-
 	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
 
 	if (gpu->aspace == NULL)
@@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
 		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
 		msm_gem_address_space_put(gpu->aspace);
 	}
+
+	devfreq_cooling_unregister(gpu->cooling);
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fd..9a8f20d 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -147,6 +147,8 @@ struct msm_gpu {
 	struct msm_gpu_state *crashstate;
 	/* True if the hardware supports expanded apriv (a650 and newer) */
 	bool hw_apriv;
+
+	struct thermal_cooling_device *cooling;
 };
 
 static inline struct msm_gpu *dev_to_gpu(struct device *dev)
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
  2020-10-08 17:09 [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Akhil P Oommen
  2020-10-08 17:09 ` [PATCH 2/2] drm/msm: Add support for GPU cooling Akhil P Oommen
@ 2020-10-09 15:05 ` Doug Anderson
  2020-10-09 16:57   ` Matthias Kaehlcke
  1 sibling, 1 reply; 15+ messages in thread
From: Doug Anderson @ 2020-10-09 15:05 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: freedreno, Rob Herring, Rob Clark, dri-devel, linux-arm-msm,
	LKML, Jordan Crouse, Matthias Kaehlcke

Hi,

On Thu, Oct 8, 2020 at 10:10 AM Akhil P Oommen <akhilpo@codeaurora.org> wrote:
>
> Add cooling-cells property and the cooling maps for the gpu tzones
> to support GPU cooling.
>
> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> ---
>  arch/arm64/boot/dts/qcom/sc7180.dtsi | 29 ++++++++++++++++++++++-------
>  1 file changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi
> index d46b383..40d6a28 100644
> --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
> +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
> @@ -2,7 +2,7 @@
>  /*
>   * SC7180 SoC device tree source
>   *
> - * Copyright (c) 2019, The Linux Foundation. All rights reserved.
> + * Copyright (c) 2019-20, The Linux Foundation. All rights reserved.
>   */
>
>  #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
> @@ -1885,6 +1885,7 @@
>                         iommus = <&adreno_smmu 0>;
>                         operating-points-v2 = <&gpu_opp_table>;
>                         qcom,gmu = <&gmu>;
> +                       #cooling-cells = <2>;

Presumably we should add this to the devicetree bindings, too?


>                         interconnects = <&gem_noc MASTER_GFX3D &mc_virt SLAVE_EBI1>;
>                         interconnect-names = "gfx-mem";
> @@ -3825,16 +3826,16 @@
>                 };
>
>                 gpuss0-thermal {
> -                       polling-delay-passive = <0>;
> +                       polling-delay-passive = <100>;

Why did you make this change?  I'm pretty sure that we _don't_ want
this since we're using interrupts for the thermal sensor.  See commit
22337b91022d ("arm64: dts: qcom: sc7180: Changed polling mode in
Thermal-zones node").


>                         polling-delay = <0>;
>
>                         thermal-sensors = <&tsens0 13>;
>
>                         trips {
>                                 gpuss0_alert0: trip-point0 {
> -                                       temperature = <90000>;
> +                                       temperature = <95000>;
>                                         hysteresis = <2000>;
> -                                       type = "hot";
> +                                       type = "passive";

Matthias probably knows better, but I wonder if we should be making
two passive trip levels like we do with CPU.  IIRC this is important
if someone wants to be able to use this with IPA.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
  2020-10-09 15:05 ` [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Doug Anderson
@ 2020-10-09 16:57   ` Matthias Kaehlcke
  2020-10-14 13:29     ` Akhil P Oommen
  0 siblings, 1 reply; 15+ messages in thread
From: Matthias Kaehlcke @ 2020-10-09 16:57 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Akhil P Oommen, freedreno, Rob Herring, Rob Clark, dri-devel,
	linux-arm-msm, LKML, Jordan Crouse

On Fri, Oct 09, 2020 at 08:05:10AM -0700, Doug Anderson wrote:
> Hi,
> 
> On Thu, Oct 8, 2020 at 10:10 AM Akhil P Oommen <akhilpo@codeaurora.org> wrote:
> >
> > Add cooling-cells property and the cooling maps for the gpu tzones
> > to support GPU cooling.
> >
> > Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> > ---
> >  arch/arm64/boot/dts/qcom/sc7180.dtsi | 29 ++++++++++++++++++++++-------
> >  1 file changed, 22 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > index d46b383..40d6a28 100644
> > --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > @@ -2,7 +2,7 @@
> >  /*
> >   * SC7180 SoC device tree source
> >   *
> > - * Copyright (c) 2019, The Linux Foundation. All rights reserved.
> > + * Copyright (c) 2019-20, The Linux Foundation. All rights reserved.
> >   */
> >
> >  #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
> > @@ -1885,6 +1885,7 @@
> >                         iommus = <&adreno_smmu 0>;
> >                         operating-points-v2 = <&gpu_opp_table>;
> >                         qcom,gmu = <&gmu>;
> > +                       #cooling-cells = <2>;
> 
> Presumably we should add this to the devicetree bindings, too?
> 
> 
> >                         interconnects = <&gem_noc MASTER_GFX3D &mc_virt SLAVE_EBI1>;
> >                         interconnect-names = "gfx-mem";
> > @@ -3825,16 +3826,16 @@
> >                 };
> >
> >                 gpuss0-thermal {
> > -                       polling-delay-passive = <0>;
> > +                       polling-delay-passive = <100>;
> 
> Why did you make this change?  I'm pretty sure that we _don't_ want
> this since we're using interrupts for the thermal sensor.  See commit
> 22337b91022d ("arm64: dts: qcom: sc7180: Changed polling mode in
> Thermal-zones node").

I was going to ask the same, this shouldn't be needed.

> >                         polling-delay = <0>;
> >
> >                         thermal-sensors = <&tsens0 13>;
> >
> >                         trips {
> >                                 gpuss0_alert0: trip-point0 {
> > -                                       temperature = <90000>;
> > +                                       temperature = <95000>;
> >                                         hysteresis = <2000>;
> > -                                       type = "hot";
> > +                                       type = "passive";
> 
> Matthias probably knows better, but I wonder if we should be making
> two passive trip levels like we do with CPU.  IIRC this is important
> if someone wants to be able to use this with IPA.

Yes, please introduce a second trip point and make both of them
'passive'.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-08 17:09 ` [PATCH 2/2] drm/msm: Add support for GPU cooling Akhil P Oommen
@ 2020-10-09 18:36   ` mka
  2020-10-12 13:33     ` Akhil P Oommen
  0 siblings, 1 reply; 15+ messages in thread
From: mka @ 2020-10-09 18:36 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: freedreno, robh, robdclark, linux-arm-msm, dri-devel, linux-kernel

Hi Akhil,

On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
> Register GPU as a devfreq cooling device so that it can be passively
> cooled by the thermal framework.
> 
> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
>  drivers/gpu/drm/msm/msm_gpu.h |  2 ++
>  2 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 55d1648..93ffd66 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -14,6 +14,7 @@
>  #include <generated/utsrelease.h>
>  #include <linux/string_helpers.h>
>  #include <linux/devfreq.h>
> +#include <linux/devfreq_cooling.h>
>  #include <linux/devcoredump.h>
>  #include <linux/sched/task.h>
>  
> @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
>  	if (IS_ERR(gpu->devfreq.devfreq)) {
>  		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
>  		gpu->devfreq.devfreq = NULL;
> +		return;
>  	}
>  
>  	devfreq_suspend_device(gpu->devfreq.devfreq);
> +
> +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
> +			gpu->devfreq.devfreq);
> +	if (IS_ERR(gpu->cooling)) {
> +		DRM_DEV_ERROR(&gpu->pdev->dev,
> +				"Couldn't register GPU cooling device\n");
> +		gpu->cooling = NULL;
> +	}
>  }
>  
>  static int enable_pwrrail(struct msm_gpu *gpu)
> @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>  
>  	msm_devfreq_init(gpu);
>  
> -
>  	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
>  
>  	if (gpu->aspace == NULL)
> @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
>  		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
>  		msm_gem_address_space_put(gpu->aspace);
>  	}
> +
> +	devfreq_cooling_unregister(gpu->cooling);

Resources should be released in reverse order, otherwise the cooling device
could use resources that have already been freed.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-09 18:36   ` [2/2] " mka
@ 2020-10-12 13:33     ` Akhil P Oommen
  2020-10-12 17:40       ` mka
  0 siblings, 1 reply; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-12 13:33 UTC (permalink / raw)
  To: mka; +Cc: freedreno, robh, robdclark, linux-arm-msm, dri-devel, linux-kernel

On 10/10/2020 12:06 AM, mka@chromium.org wrote:
> Hi Akhil,
> 
> On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
>> Register GPU as a devfreq cooling device so that it can be passively
>> cooled by the thermal framework.
>>
>> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
>>   drivers/gpu/drm/msm/msm_gpu.h |  2 ++
>>   2 files changed, 14 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index 55d1648..93ffd66 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -14,6 +14,7 @@
>>   #include <generated/utsrelease.h>
>>   #include <linux/string_helpers.h>
>>   #include <linux/devfreq.h>
>> +#include <linux/devfreq_cooling.h>
>>   #include <linux/devcoredump.h>
>>   #include <linux/sched/task.h>
>>   
>> @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
>>   	if (IS_ERR(gpu->devfreq.devfreq)) {
>>   		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
>>   		gpu->devfreq.devfreq = NULL;
>> +		return;
>>   	}
>>   
>>   	devfreq_suspend_device(gpu->devfreq.devfreq);
>> +
>> +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
>> +			gpu->devfreq.devfreq);
>> +	if (IS_ERR(gpu->cooling)) {
>> +		DRM_DEV_ERROR(&gpu->pdev->dev,
>> +				"Couldn't register GPU cooling device\n");
>> +		gpu->cooling = NULL;
>> +	}
>>   }
>>   
>>   static int enable_pwrrail(struct msm_gpu *gpu)
>> @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>   
>>   	msm_devfreq_init(gpu);
>>   
>> -
>>   	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
>>   
>>   	if (gpu->aspace == NULL)
>> @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
>>   		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
>>   		msm_gem_address_space_put(gpu->aspace);
>>   	}
>> +
>> +	devfreq_cooling_unregister(gpu->cooling);
> 
> Resources should be released in reverse order, otherwise the cooling device
> could use resources that have already been freed.
> Why do you think this is not the correct order? If you are thinking 
about devfreq struct, it is managed device resource.

-Akhil

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-12 13:33     ` Akhil P Oommen
@ 2020-10-12 17:40       ` mka
  2020-10-13 13:53         ` Akhil P Oommen
  0 siblings, 1 reply; 15+ messages in thread
From: mka @ 2020-10-12 17:40 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: freedreno, robh, robdclark, linux-arm-msm, dri-devel, linux-kernel

On Mon, Oct 12, 2020 at 07:03:51PM +0530, Akhil P Oommen wrote:
> On 10/10/2020 12:06 AM, mka@chromium.org wrote:
> > Hi Akhil,
> > 
> > On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
> > > Register GPU as a devfreq cooling device so that it can be passively
> > > cooled by the thermal framework.
> > > 
> > > Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> > > ---
> > >   drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
> > >   drivers/gpu/drm/msm/msm_gpu.h |  2 ++
> > >   2 files changed, 14 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> > > index 55d1648..93ffd66 100644
> > > --- a/drivers/gpu/drm/msm/msm_gpu.c
> > > +++ b/drivers/gpu/drm/msm/msm_gpu.c
> > > @@ -14,6 +14,7 @@
> > >   #include <generated/utsrelease.h>
> > >   #include <linux/string_helpers.h>
> > >   #include <linux/devfreq.h>
> > > +#include <linux/devfreq_cooling.h>
> > >   #include <linux/devcoredump.h>
> > >   #include <linux/sched/task.h>
> > > @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
> > >   	if (IS_ERR(gpu->devfreq.devfreq)) {
> > >   		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
> > >   		gpu->devfreq.devfreq = NULL;
> > > +		return;
> > >   	}
> > >   	devfreq_suspend_device(gpu->devfreq.devfreq);
> > > +
> > > +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
> > > +			gpu->devfreq.devfreq);
> > > +	if (IS_ERR(gpu->cooling)) {
> > > +		DRM_DEV_ERROR(&gpu->pdev->dev,
> > > +				"Couldn't register GPU cooling device\n");
> > > +		gpu->cooling = NULL;
> > > +	}
> > >   }
> > >   static int enable_pwrrail(struct msm_gpu *gpu)
> > > @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
> > >   	msm_devfreq_init(gpu);
> > > -
> > >   	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
> > >   	if (gpu->aspace == NULL)
> > > @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
> > >   		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
> > >   		msm_gem_address_space_put(gpu->aspace);
> > >   	}
> > > +
> > > +	devfreq_cooling_unregister(gpu->cooling);
> > 
> > Resources should be released in reverse order, otherwise the cooling device
> > could use resources that have already been freed.
> > Why do you think this is not the correct order? If you are thinking
> about devfreq struct, it is managed device resource.

I did not check specifically if changing the frequency really uses any of the
resources that are released previously, In any case it's not a good idea to
allow other parts of the kernel to use a half initialized/torn down device.
Even if it isn't a problem today someone could change the driver to use any
of these resources (or add a new one) in a frequency change, without even
thinking about the cooling device, just (rightfully) asuming that things are
set up and torn down in a sane order.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-12 17:40       ` mka
@ 2020-10-13 13:53         ` Akhil P Oommen
  2020-10-13 17:40           ` mka
  0 siblings, 1 reply; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-13 13:53 UTC (permalink / raw)
  To: mka; +Cc: linux-arm-msm, linux-kernel, dri-devel, freedreno

On 10/12/2020 11:10 PM, mka@chromium.org wrote:
> On Mon, Oct 12, 2020 at 07:03:51PM +0530, Akhil P Oommen wrote:
>> On 10/10/2020 12:06 AM, mka@chromium.org wrote:
>>> Hi Akhil,
>>>
>>> On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
>>>> Register GPU as a devfreq cooling device so that it can be passively
>>>> cooled by the thermal framework.
>>>>
>>>> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
>>>> ---
>>>>    drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
>>>>    drivers/gpu/drm/msm/msm_gpu.h |  2 ++
>>>>    2 files changed, 14 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>>>> index 55d1648..93ffd66 100644
>>>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>>>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>>>> @@ -14,6 +14,7 @@
>>>>    #include <generated/utsrelease.h>
>>>>    #include <linux/string_helpers.h>
>>>>    #include <linux/devfreq.h>
>>>> +#include <linux/devfreq_cooling.h>
>>>>    #include <linux/devcoredump.h>
>>>>    #include <linux/sched/task.h>
>>>> @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
>>>>    	if (IS_ERR(gpu->devfreq.devfreq)) {
>>>>    		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
>>>>    		gpu->devfreq.devfreq = NULL;
>>>> +		return;
>>>>    	}
>>>>    	devfreq_suspend_device(gpu->devfreq.devfreq);
>>>> +
>>>> +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
>>>> +			gpu->devfreq.devfreq);
>>>> +	if (IS_ERR(gpu->cooling)) {
>>>> +		DRM_DEV_ERROR(&gpu->pdev->dev,
>>>> +				"Couldn't register GPU cooling device\n");
>>>> +		gpu->cooling = NULL;
>>>> +	}
>>>>    }
>>>>    static int enable_pwrrail(struct msm_gpu *gpu)
>>>> @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>>>    	msm_devfreq_init(gpu);
>>>> -
Will remove this unintended change.
>>>>    	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
>>>>    	if (gpu->aspace == NULL)
>>>> @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
>>>>    		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
>>>>    		msm_gem_address_space_put(gpu->aspace);
>>>>    	}
>>>> +
>>>> +	devfreq_cooling_unregister(gpu->cooling);
>>>
>>> Resources should be released in reverse order, otherwise the cooling device
>>> could use resources that have already been freed.
>>> Why do you think this is not the correct order? If you are thinking
>> about devfreq struct, it is managed device resource.
> 
> I did not check specifically if changing the frequency really uses any of the
> resources that are released previously, In any case it's not a good idea to
> allow other parts of the kernel to use a half initialized/torn down device.
> Even if it isn't a problem today someone could change the driver to use any
> of these resources (or add a new one) in a frequency change, without even
> thinking about the cooling device, just (rightfully) asuming that things are
> set up and torn down in a sane order.
'sane order' relative to what specifically here? Should we worry about 
freq change at this point because we have already disabled gpu runtime 
pm and devfreq?

-Akhil.
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 


-Akhil.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-13 13:53         ` Akhil P Oommen
@ 2020-10-13 17:40           ` mka
  2020-10-13 19:21             ` Akhil P Oommen
  0 siblings, 1 reply; 15+ messages in thread
From: mka @ 2020-10-13 17:40 UTC (permalink / raw)
  To: Akhil P Oommen; +Cc: linux-arm-msm, linux-kernel, dri-devel, freedreno

On Tue, Oct 13, 2020 at 07:23:34PM +0530, Akhil P Oommen wrote:
> On 10/12/2020 11:10 PM, mka@chromium.org wrote:
> > On Mon, Oct 12, 2020 at 07:03:51PM +0530, Akhil P Oommen wrote:
> > > On 10/10/2020 12:06 AM, mka@chromium.org wrote:
> > > > Hi Akhil,
> > > > 
> > > > On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
> > > > > Register GPU as a devfreq cooling device so that it can be passively
> > > > > cooled by the thermal framework.
> > > > > 
> > > > > Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> > > > > ---
> > > > >    drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
> > > > >    drivers/gpu/drm/msm/msm_gpu.h |  2 ++
> > > > >    2 files changed, 14 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> > > > > index 55d1648..93ffd66 100644
> > > > > --- a/drivers/gpu/drm/msm/msm_gpu.c
> > > > > +++ b/drivers/gpu/drm/msm/msm_gpu.c
> > > > > @@ -14,6 +14,7 @@
> > > > >    #include <generated/utsrelease.h>
> > > > >    #include <linux/string_helpers.h>
> > > > >    #include <linux/devfreq.h>
> > > > > +#include <linux/devfreq_cooling.h>
> > > > >    #include <linux/devcoredump.h>
> > > > >    #include <linux/sched/task.h>
> > > > > @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
> > > > >    	if (IS_ERR(gpu->devfreq.devfreq)) {
> > > > >    		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
> > > > >    		gpu->devfreq.devfreq = NULL;
> > > > > +		return;
> > > > >    	}
> > > > >    	devfreq_suspend_device(gpu->devfreq.devfreq);
> > > > > +
> > > > > +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
> > > > > +			gpu->devfreq.devfreq);
> > > > > +	if (IS_ERR(gpu->cooling)) {
> > > > > +		DRM_DEV_ERROR(&gpu->pdev->dev,
> > > > > +				"Couldn't register GPU cooling device\n");
> > > > > +		gpu->cooling = NULL;
> > > > > +	}
> > > > >    }
> > > > >    static int enable_pwrrail(struct msm_gpu *gpu)
> > > > > @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
> > > > >    	msm_devfreq_init(gpu);
> > > > > -
> Will remove this unintended change.
> > > > >    	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
> > > > >    	if (gpu->aspace == NULL)
> > > > > @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
> > > > >    		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
> > > > >    		msm_gem_address_space_put(gpu->aspace);
> > > > >    	}
> > > > > +
> > > > > +	devfreq_cooling_unregister(gpu->cooling);
> > > > 
> > > > Resources should be released in reverse order, otherwise the cooling device
> > > > could use resources that have already been freed.
> > > > Why do you think this is not the correct order? If you are thinking
> > > about devfreq struct, it is managed device resource.
> > 
> > I did not check specifically if changing the frequency really uses any of the
> > resources that are released previously, In any case it's not a good idea to
> > allow other parts of the kernel to use a half initialized/torn down device.
> > Even if it isn't a problem today someone could change the driver to use any
> > of these resources (or add a new one) in a frequency change, without even
> > thinking about the cooling device, just (rightfully) asuming that things are
> > set up and torn down in a sane order.
> 'sane order' relative to what specifically here? Should we worry about freq
> change at this point because we have already disabled gpu runtime pm and
> devfreq?

GPU runtime PM and the devfreq being disabled is not evident from the context
of the function. You are probably right that it's not a problem in practice,
but why give reason for doubts in the first place if this could be avoided
by following a common practice?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-13 17:40           ` mka
@ 2020-10-13 19:21             ` Akhil P Oommen
  2020-10-14  1:09               ` mka
  0 siblings, 1 reply; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-13 19:21 UTC (permalink / raw)
  To: mka; +Cc: linux-arm-msm, freedreno, linux-kernel, dri-devel

On 10/13/2020 11:10 PM, mka@chromium.org wrote:
> On Tue, Oct 13, 2020 at 07:23:34PM +0530, Akhil P Oommen wrote:
>> On 10/12/2020 11:10 PM, mka@chromium.org wrote:
>>> On Mon, Oct 12, 2020 at 07:03:51PM +0530, Akhil P Oommen wrote:
>>>> On 10/10/2020 12:06 AM, mka@chromium.org wrote:
>>>>> Hi Akhil,
>>>>>
>>>>> On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
>>>>>> Register GPU as a devfreq cooling device so that it can be passively
>>>>>> cooled by the thermal framework.
>>>>>>
>>>>>> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
>>>>>> ---
>>>>>>     drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
>>>>>>     drivers/gpu/drm/msm/msm_gpu.h |  2 ++
>>>>>>     2 files changed, 14 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>>>>>> index 55d1648..93ffd66 100644
>>>>>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>>>>>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>>>>>> @@ -14,6 +14,7 @@
>>>>>>     #include <generated/utsrelease.h>
>>>>>>     #include <linux/string_helpers.h>
>>>>>>     #include <linux/devfreq.h>
>>>>>> +#include <linux/devfreq_cooling.h>
>>>>>>     #include <linux/devcoredump.h>
>>>>>>     #include <linux/sched/task.h>
>>>>>> @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
>>>>>>     	if (IS_ERR(gpu->devfreq.devfreq)) {
>>>>>>     		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
>>>>>>     		gpu->devfreq.devfreq = NULL;
>>>>>> +		return;
>>>>>>     	}
>>>>>>     	devfreq_suspend_device(gpu->devfreq.devfreq);
>>>>>> +
>>>>>> +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
>>>>>> +			gpu->devfreq.devfreq);
>>>>>> +	if (IS_ERR(gpu->cooling)) {
>>>>>> +		DRM_DEV_ERROR(&gpu->pdev->dev,
>>>>>> +				"Couldn't register GPU cooling device\n");
>>>>>> +		gpu->cooling = NULL;
>>>>>> +	}
>>>>>>     }
>>>>>>     static int enable_pwrrail(struct msm_gpu *gpu)
>>>>>> @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>>>>>     	msm_devfreq_init(gpu);
>>>>>> -
>> Will remove this unintended change.
>>>>>>     	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
>>>>>>     	if (gpu->aspace == NULL)
>>>>>> @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
>>>>>>     		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
>>>>>>     		msm_gem_address_space_put(gpu->aspace);
>>>>>>     	}
>>>>>> +
>>>>>> +	devfreq_cooling_unregister(gpu->cooling);
>>>>>
>>>>> Resources should be released in reverse order, otherwise the cooling device
>>>>> could use resources that have already been freed.
>>>>> Why do you think this is not the correct order? If you are thinking
>>>> about devfreq struct, it is managed device resource.
>>>
>>> I did not check specifically if changing the frequency really uses any of the
>>> resources that are released previously, In any case it's not a good idea to
>>> allow other parts of the kernel to use a half initialized/torn down device.
>>> Even if it isn't a problem today someone could change the driver to use any
>>> of these resources (or add a new one) in a frequency change, without even
>>> thinking about the cooling device, just (rightfully) asuming that things are
>>> set up and torn down in a sane order.
>> 'sane order' relative to what specifically here? Should we worry about freq
>> change at this point because we have already disabled gpu runtime pm and
>> devfreq?
> 
> GPU runtime PM and the devfreq being disabled is not evident from the context
> of the function. You are probably right that it's not a problem in practice,
> but why give reason for doubts in the first place if this could be avoided
> by following a common practice?
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
Other option I see is to create a managed device resource (devm) version 
of the devfreq_cooling_register API and use that. Is that what you are 
trying to suggest?

-Akhil.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [2/2] drm/msm: Add support for GPU cooling
  2020-10-13 19:21             ` Akhil P Oommen
@ 2020-10-14  1:09               ` mka
  0 siblings, 0 replies; 15+ messages in thread
From: mka @ 2020-10-14  1:09 UTC (permalink / raw)
  To: Akhil P Oommen; +Cc: linux-arm-msm, freedreno, linux-kernel, dri-devel

On Wed, Oct 14, 2020 at 12:51:55AM +0530, Akhil P Oommen wrote:
> On 10/13/2020 11:10 PM, mka@chromium.org wrote:
> > On Tue, Oct 13, 2020 at 07:23:34PM +0530, Akhil P Oommen wrote:
> > > On 10/12/2020 11:10 PM, mka@chromium.org wrote:
> > > > On Mon, Oct 12, 2020 at 07:03:51PM +0530, Akhil P Oommen wrote:
> > > > > On 10/10/2020 12:06 AM, mka@chromium.org wrote:
> > > > > > Hi Akhil,
> > > > > > 
> > > > > > On Thu, Oct 08, 2020 at 10:39:07PM +0530, Akhil P Oommen wrote:
> > > > > > > Register GPU as a devfreq cooling device so that it can be passively
> > > > > > > cooled by the thermal framework.
> > > > > > > 
> > > > > > > Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> > > > > > > ---
> > > > > > >     drivers/gpu/drm/msm/msm_gpu.c | 13 ++++++++++++-
> > > > > > >     drivers/gpu/drm/msm/msm_gpu.h |  2 ++
> > > > > > >     2 files changed, 14 insertions(+), 1 deletion(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> > > > > > > index 55d1648..93ffd66 100644
> > > > > > > --- a/drivers/gpu/drm/msm/msm_gpu.c
> > > > > > > +++ b/drivers/gpu/drm/msm/msm_gpu.c
> > > > > > > @@ -14,6 +14,7 @@
> > > > > > >     #include <generated/utsrelease.h>
> > > > > > >     #include <linux/string_helpers.h>
> > > > > > >     #include <linux/devfreq.h>
> > > > > > > +#include <linux/devfreq_cooling.h>
> > > > > > >     #include <linux/devcoredump.h>
> > > > > > >     #include <linux/sched/task.h>
> > > > > > > @@ -107,9 +108,18 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
> > > > > > >     	if (IS_ERR(gpu->devfreq.devfreq)) {
> > > > > > >     		DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n");
> > > > > > >     		gpu->devfreq.devfreq = NULL;
> > > > > > > +		return;
> > > > > > >     	}
> > > > > > >     	devfreq_suspend_device(gpu->devfreq.devfreq);
> > > > > > > +
> > > > > > > +	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
> > > > > > > +			gpu->devfreq.devfreq);
> > > > > > > +	if (IS_ERR(gpu->cooling)) {
> > > > > > > +		DRM_DEV_ERROR(&gpu->pdev->dev,
> > > > > > > +				"Couldn't register GPU cooling device\n");
> > > > > > > +		gpu->cooling = NULL;
> > > > > > > +	}
> > > > > > >     }
> > > > > > >     static int enable_pwrrail(struct msm_gpu *gpu)
> > > > > > > @@ -926,7 +936,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
> > > > > > >     	msm_devfreq_init(gpu);
> > > > > > > -
> > > Will remove this unintended change.
> > > > > > >     	gpu->aspace = gpu->funcs->create_address_space(gpu, pdev);
> > > > > > >     	if (gpu->aspace == NULL)
> > > > > > > @@ -1005,4 +1014,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
> > > > > > >     		gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu);
> > > > > > >     		msm_gem_address_space_put(gpu->aspace);
> > > > > > >     	}
> > > > > > > +
> > > > > > > +	devfreq_cooling_unregister(gpu->cooling);
> > > > > > 
> > > > > > Resources should be released in reverse order, otherwise the cooling device
> > > > > > could use resources that have already been freed.
> > > > > > Why do you think this is not the correct order? If you are thinking
> > > > > about devfreq struct, it is managed device resource.
> > > > 
> > > > I did not check specifically if changing the frequency really uses any of the
> > > > resources that are released previously, In any case it's not a good idea to
> > > > allow other parts of the kernel to use a half initialized/torn down device.
> > > > Even if it isn't a problem today someone could change the driver to use any
> > > > of these resources (or add a new one) in a frequency change, without even
> > > > thinking about the cooling device, just (rightfully) asuming that things are
> > > > set up and torn down in a sane order.
> > > 'sane order' relative to what specifically here? Should we worry about freq
> > > change at this point because we have already disabled gpu runtime pm and
> > > devfreq?
> > 
> > GPU runtime PM and the devfreq being disabled is not evident from the context
> > of the function. You are probably right that it's not a problem in practice,
> > but why give reason for doubts in the first place if this could be avoided
> > by following a common practice?
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> > 
> Other option I see is to create a managed device resource (devm) version of
> the devfreq_cooling_register API and use that. Is that what you are trying
> to suggest?

No, I was not thinking about a devm version, just manual reverse removal.

Actually you can even argue the you are using the right order, I saw the
ring buffer and the address space are actually initialized after
msm_devfreq_init(). That strikes me a bit odd, I guess the
devfreq_suspend_device() in msm_devfreq_init() is supposed to prevent the
devfreq from being active, however that is potentially racy, it could become
active right after being created. I would have expected the devfreq to be
created when everything else is ready, but I don't know this driver well,
nor am I a devfreq expert, maybe there is a good reason for it ...

In summary, the order you are using is consistent with the what the driver
currently does, which might not be entirely correct, but that is beyond the
scope of this patch.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
  2020-10-09 16:57   ` Matthias Kaehlcke
@ 2020-10-14 13:29     ` Akhil P Oommen
  2020-10-14 18:37       ` manafm
  0 siblings, 1 reply; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-14 13:29 UTC (permalink / raw)
  To: Matthias Kaehlcke, Doug Anderson, manafm
  Cc: linux-arm-msm, LKML, dri-devel, freedreno

On 10/9/2020 10:27 PM, Matthias Kaehlcke wrote:
> On Fri, Oct 09, 2020 at 08:05:10AM -0700, Doug Anderson wrote:
>> Hi,
>>
>> On Thu, Oct 8, 2020 at 10:10 AM Akhil P Oommen <akhilpo@codeaurora.org> wrote:
>>>
>>> Add cooling-cells property and the cooling maps for the gpu tzones
>>> to support GPU cooling.
>>>
>>> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
>>> ---
>>>   arch/arm64/boot/dts/qcom/sc7180.dtsi | 29 ++++++++++++++++++++++-------
>>>   1 file changed, 22 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>> index d46b383..40d6a28 100644
>>> --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>> +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>> @@ -2,7 +2,7 @@
>>>   /*
>>>    * SC7180 SoC device tree source
>>>    *
>>> - * Copyright (c) 2019, The Linux Foundation. All rights reserved.
>>> + * Copyright (c) 2019-20, The Linux Foundation. All rights reserved.
>>>    */
>>>
>>>   #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
>>> @@ -1885,6 +1885,7 @@
>>>                          iommus = <&adreno_smmu 0>;
>>>                          operating-points-v2 = <&gpu_opp_table>;
>>>                          qcom,gmu = <&gmu>;
>>> +                       #cooling-cells = <2>;
>>
>> Presumably we should add this to the devicetree bindings, too?
Yes, thanks for catching this. Will update in the next patch.

>>
>>
>>>                          interconnects = <&gem_noc MASTER_GFX3D &mc_virt SLAVE_EBI1>;
>>>                          interconnect-names = "gfx-mem";
>>> @@ -3825,16 +3826,16 @@
>>>                  };
>>>
>>>                  gpuss0-thermal {
>>> -                       polling-delay-passive = <0>;
>>> +                       polling-delay-passive = <100>;
>>
>> Why did you make this change?  I'm pretty sure that we _don't_ want
>> this since we're using interrupts for the thermal sensor.  See commit
>> 22337b91022d ("arm64: dts: qcom: sc7180: Changed polling mode in
>> Thermal-zones node").
> 
> I was going to ask the same, this shouldn't be needed.
> 
>>>                          polling-delay = <0>;
>>>
>>>                          thermal-sensors = <&tsens0 13>;
>>>
>>>                          trips {
>>>                                  gpuss0_alert0: trip-point0 {
>>> -                                       temperature = <90000>;
>>> +                                       temperature = <95000>;
>>>                                          hysteresis = <2000>;
>>> -                                       type = "hot";
>>> +                                       type = "passive";
>>
>> Matthias probably knows better, but I wonder if we should be making
>> two passive trip levels like we do with CPU.  IIRC this is important
>> if someone wants to be able to use this with IPA.
> 
> Yes, please introduce a second trip point and make both of them
> 'passive'.
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
Adding Manaf here.

-Akhil.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
  2020-10-14 13:29     ` Akhil P Oommen
@ 2020-10-14 18:37       ` manafm
  2020-10-15 22:19         ` Matthias Kaehlcke
  0 siblings, 1 reply; 15+ messages in thread
From: manafm @ 2020-10-14 18:37 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: Matthias Kaehlcke, Doug Anderson, linux-arm-msm, LKML, dri-devel,
	freedreno

On 2020-10-14 18:59, Akhil P Oommen wrote:
> On 10/9/2020 10:27 PM, Matthias Kaehlcke wrote:
>> On Fri, Oct 09, 2020 at 08:05:10AM -0700, Doug Anderson wrote:
>>> Hi,
>>> 
>>> On Thu, Oct 8, 2020 at 10:10 AM Akhil P Oommen 
>>> <akhilpo@codeaurora.org> wrote:
>>>> 
>>>> Add cooling-cells property and the cooling maps for the gpu tzones
>>>> to support GPU cooling.
>>>> 
>>>> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
>>>> ---
>>>>   arch/arm64/boot/dts/qcom/sc7180.dtsi | 29 
>>>> ++++++++++++++++++++++-------
>>>>   1 file changed, 22 insertions(+), 7 deletions(-)
>>>> 
>>>> diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi 
>>>> b/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>> index d46b383..40d6a28 100644
>>>> --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>> +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>> @@ -2,7 +2,7 @@
>>>>   /*
>>>>    * SC7180 SoC device tree source
>>>>    *
>>>> - * Copyright (c) 2019, The Linux Foundation. All rights reserved.
>>>> + * Copyright (c) 2019-20, The Linux Foundation. All rights 
>>>> reserved.
>>>>    */
>>>> 
>>>>   #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
>>>> @@ -1885,6 +1885,7 @@
>>>>                          iommus = <&adreno_smmu 0>;
>>>>                          operating-points-v2 = <&gpu_opp_table>;
>>>>                          qcom,gmu = <&gmu>;
>>>> +                       #cooling-cells = <2>;
>>> 
>>> Presumably we should add this to the devicetree bindings, too?
> Yes, thanks for catching this. Will update in the next patch.
> 
>>> 
>>> 
>>>>                          interconnects = <&gem_noc MASTER_GFX3D 
>>>> &mc_virt SLAVE_EBI1>;
>>>>                          interconnect-names = "gfx-mem";
>>>> @@ -3825,16 +3826,16 @@
>>>>                  };
>>>> 
>>>>                  gpuss0-thermal {
>>>> -                       polling-delay-passive = <0>;
>>>> +                       polling-delay-passive = <100>;
>>> 
>>> Why did you make this change?  I'm pretty sure that we _don't_ want
>>> this since we're using interrupts for the thermal sensor.  See commit
>>> 22337b91022d ("arm64: dts: qcom: sc7180: Changed polling mode in
>>> Thermal-zones node").
>> 
>> I was going to ask the same, this shouldn't be needed.
As per our understanding unlike "polling-delay",  this delay property is 
intended to activate polling thread on post trip threshold violation and 
  it is irrespective of sensor is capable for trip interrupt or not.
This polling is more of governor related. Below are the few references 
from Documentation/code which tells polling-delay-passive is needed for 
IPA for better IPA performance.

As per Power allocator documentations

1. 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/Documentation/driver-api/thermal/power_allocator.rst?h=v5.4.71#n264

"The power allocator governor's PID controller works best if there is a
periodic tick.  If you have a driver that calls
`thermal_zone_device_update()` (or anything that ends up calling the
governor's `throttle()` function) repetitively, the governor response
won't be very good.  Note that this is not particular to this
governor, step-wise will also misbehave if you call its throttle()
faster than the normal thermal framework tick (due to interrupts for
example) as it will overreact"

2. In Power allocator code, when  switch_on/control trip temp violation, 
it is enabling passive counter to activate passive polling @ 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/thermal/power_allocator.c?h=v5.4.71#n634

3. while calculating derivative term, it is using passive_delay @
  
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/thermal/power_allocator.c?h=v5.4.71#n243

4. Sensor interrupt will work if temperature is fluctuating between 
trip_temp and hysteresis. But say a case where we are not enabling 
polling-delay-passive. In this case if  current temperature > 
control_temp trip(2nd passive trip) and
  temperature trend is still raising, then sensor high trip will be 
disabled (OR configured for critical trip threshold). No more trip 
interrupt from sensor until it reaches critical trip or falls below 
control_temp hysteresis.
  How  the governor re-evaluate its next mitigation without passive 
polling thread  here ?

I think the same is required for CPU thermal zone as well.
>> 
>>>>                          polling-delay = <0>;
>>>> 
>>>>                          thermal-sensors = <&tsens0 13>;
>>>> 
>>>>                          trips {
>>>>                                  gpuss0_alert0: trip-point0 {
>>>> -                                       temperature = <90000>;
>>>> +                                       temperature = <95000>;
>>>>                                          hysteresis = <2000>;
>>>> -                                       type = "hot";
>>>> +                                       type = "passive";
>>> 
>>> Matthias probably knows better, but I wonder if we should be making
>>> two passive trip levels like we do with CPU.  IIRC this is important
>>> if someone wants to be able to use this with IPA.
>> 
>> Yes, please introduce a second trip point and make both of them
>> 'passive'.
>> 
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>> 
> Adding Manaf here.
> 
> -Akhil.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
  2020-10-14 18:37       ` manafm
@ 2020-10-15 22:19         ` Matthias Kaehlcke
  2020-10-16 13:46           ` Akhil P Oommen
  0 siblings, 1 reply; 15+ messages in thread
From: Matthias Kaehlcke @ 2020-10-15 22:19 UTC (permalink / raw)
  To: manafm
  Cc: Akhil P Oommen, Doug Anderson, linux-arm-msm, LKML, dri-devel,
	freedreno, Rajeshwari, Amit Kucheria

Hi,

On Thu, Oct 15, 2020 at 12:07:01AM +0530, manafm@codeaurora.org wrote:
> On 2020-10-14 18:59, Akhil P Oommen wrote:
> > On 10/9/2020 10:27 PM, Matthias Kaehlcke wrote:
> > > On Fri, Oct 09, 2020 at 08:05:10AM -0700, Doug Anderson wrote:
> > > > Hi,
> > > > 
> > > > On Thu, Oct 8, 2020 at 10:10 AM Akhil P Oommen
> > > > <akhilpo@codeaurora.org> wrote:
> > > > > 
> > > > > Add cooling-cells property and the cooling maps for the gpu tzones
> > > > > to support GPU cooling.
> > > > > 
> > > > > Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
> > > > > ---
> > > > >   arch/arm64/boot/dts/qcom/sc7180.dtsi | 29
> > > > > ++++++++++++++++++++++-------
> > > > >   1 file changed, 22 insertions(+), 7 deletions(-)
> > > > > 
> > > > > diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > > > > b/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > > > > index d46b383..40d6a28 100644
> > > > > --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > > > > +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
> > > > > @@ -2,7 +2,7 @@
> > > > >   /*
> > > > >    * SC7180 SoC device tree source
> > > > >    *
> > > > > - * Copyright (c) 2019, The Linux Foundation. All rights reserved.
> > > > > + * Copyright (c) 2019-20, The Linux Foundation. All rights
> > > > > reserved.
> > > > >    */
> > > > > 
> > > > >   #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
> > > > > @@ -1885,6 +1885,7 @@
> > > > >                          iommus = <&adreno_smmu 0>;
> > > > >                          operating-points-v2 = <&gpu_opp_table>;
> > > > >                          qcom,gmu = <&gmu>;
> > > > > +                       #cooling-cells = <2>;
> > > > 
> > > > Presumably we should add this to the devicetree bindings, too?
> > Yes, thanks for catching this. Will update in the next patch.
> > 
> > > > 
> > > > 
> > > > >                          interconnects = <&gem_noc
> > > > > MASTER_GFX3D &mc_virt SLAVE_EBI1>;
> > > > >                          interconnect-names = "gfx-mem";
> > > > > @@ -3825,16 +3826,16 @@
> > > > >                  };
> > > > > 
> > > > >                  gpuss0-thermal {
> > > > > -                       polling-delay-passive = <0>;
> > > > > +                       polling-delay-passive = <100>;
> > > > 
> > > > Why did you make this change?  I'm pretty sure that we _don't_ want
> > > > this since we're using interrupts for the thermal sensor.  See commit
> > > > 22337b91022d ("arm64: dts: qcom: sc7180: Changed polling mode in
> > > > Thermal-zones node").
> > > 
> > > I was going to ask the same, this shouldn't be needed.
> As per our understanding unlike "polling-delay",  this delay property is
> intended to activate polling thread on post trip threshold violation and  it
> is irrespective of sensor is capable for trip interrupt or not.
> This polling is more of governor related. Below are the few references from
> Documentation/code which tells polling-delay-passive is needed for IPA for
> better IPA performance.
> 
> As per Power allocator documentations
> 
> 1. https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/Documentation/driver-api/thermal/power_allocator.rst?h=v5.4.71#n264
> 
> "The power allocator governor's PID controller works best if there is a
> periodic tick.  If you have a driver that calls
> `thermal_zone_device_update()` (or anything that ends up calling the
> governor's `throttle()` function) repetitively, the governor response
> won't be very good.  Note that this is not particular to this
> governor, step-wise will also misbehave if you call its throttle()
> faster than the normal thermal framework tick (due to interrupts for
> example) as it will overreact"
> 
> 2. In Power allocator code, when  switch_on/control trip temp violation, it
> is enabling passive counter to activate passive polling @ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/thermal/power_allocator.c?h=v5.4.71#n634
> 
> 3. while calculating derivative term, it is using passive_delay @
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/thermal/power_allocator.c?h=v5.4.71#n243
> 
> 4. Sensor interrupt will work if temperature is fluctuating between
> trip_temp and hysteresis. But say a case where we are not enabling
> polling-delay-passive. In this case if  current temperature > control_temp
> trip(2nd passive trip) and
>  temperature trend is still raising, then sensor high trip will be disabled
> (OR configured for critical trip threshold). No more trip interrupt from
> sensor until it reaches critical trip or falls below control_temp
> hysteresis.
>  How  the governor re-evaluate its next mitigation without passive polling
> thread  here ?
> 
> I think the same is required for CPU thermal zone as well.

Thanks for the explication and pointers!

I ran some tests to re-confirm. For that I lowered the trip point temperatures
of CPU6 to 60/70, to make it easier to trigger throttling without necessarily
affecting the other CPUs. Further I enabled tracing for the events 'thermal_temperature',
'thermal_zone_trip' and 'thermal_power_allocator'. With that I ran a CPU
intensive task on CPU6.

Without polling-delay the trace log looks like this:

  irq/40-c263000.-157   [000] ....    48.035986: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=57800 temp=60000
  irq/40-c263000.-157   [000] ....    48.036029: thermal_power_allocator_pid: thermal_zone_id=6 err=10000 err_integral=0 p=2402 i=0 d=0 output=1776
  irq/40-c263000.-157   [000] ....    48.036036: thermal_power_allocator: thermal_zone_id=6 req_power={{0x96}} total_req_power=150 granted_power={{0x6f0}} total_granted_power=1776 power_range=1776 max_allocatable_power=1776 current_temperature=60000 delta_temperature=10000
  irq/40-c263000.-157   [000] ....    52.480888: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=60000 temp=70000
  irq/40-c263000.-157   [000] ....    52.480925: thermal_power_allocator_pid: thermal_zone_id=6 err=0 err_integral=0 p=0 i=0 d=0 output=1202
  irq/40-c263000.-157   [000] ....    52.480931: thermal_power_allocator: thermal_zone_id=6 req_power={{0x45e}} total_req_power=1118 granted_power={{0x4b2}} total_granted_power=1202 power_range=1202 max_allocatable_power=1776 current_temperature=70000 delta_temperature=0

i.e. power_allocator only acts on the sensor interrupts at the trip points

It looks different with a non-zero value for 'polling-delay-passive':

  irq/40-c263000.-156   [000] ....   104.501777: thermal_power_allocator: thermal_zone_id=6 req_power={{0x331}} total_req_power=817 granted_power={{0x591}} total_granted_power=1425 power_range=1425 max_allocatable_power=1776 current_temperature=67800 delta_temperature=2200
  irq/40-c263000.-156   [000] ....   104.523073: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=67800 temp=70000
  irq/40-c263000.-156   [000] ....   104.523121: thermal_power_allocator_pid: thermal_zone_id=6 err=0 err_integral=-31200 p=0 i=-305 d=0 output=897
  irq/40-c263000.-156   [000] ....   104.523148: thermal_power_allocator: thermal_zone_id=6 req_power={{0x406}} total_req_power=1030 granted_power={{0x381}} total_granted_power=897 power_range=897 max_allocatable_power=1776 current_temperature=70000 delta_temperature=0
  irq/40-c263000.-156   [000] ....   104.608566: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=70000 temp=67800
  irq/40-c263000.-156   [000] ....   104.608612: thermal_power_allocator_pid: thermal_zone_id=6 err=2200 err_integral=-31200 p=528 i=-305 d=0 output=1425
  irq/40-c263000.-156   [000] ....   104.608642: thermal_power_allocator: thermal_zone_id=6 req_power={{0x331}} total_req_power=817 granted_power={{0x591}} total_granted_power=1425 power_range=1425 max_allocatable_power=1776 current_temperature=67800 delta_temperature=2200
  irq/40-c263000.-156   [000] ....   104.630863: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=67800 temp=70000
  irq/40-c263000.-156   [000] ....   104.630907: thermal_power_allocator_pid: thermal_zone_id=6 err=0 err_integral=-31200 p=0 i=-305 d=0 output=897
  irq/40-c263000.-156   [000] ....   104.630932: thermal_power_allocator: thermal_zone_id=6 req_power={{0x3f4}} total_req_power=1012 granted_power={{0x381}} total_granted_power=897 power_range=897 max_allocatable_power=1776 current_temperature=70000 delta_temperature=0
  irq/40-c263000.-156   [000] ....   104.687495: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=70000 temp=67800
  irq/40-c263000.-156   [000] ....   104.687541: thermal_power_allocator_pid: thermal_zone_id=6 err=2200 err_integral=-31200 p=528 i=-305 d=0 output=1425
  irq/40-c263000.-156   [000] ....   104.687567: thermal_power_allocator: thermal_zone_id=6 req_power={{0x338}} total_req_power=824 granted_power={{0x591}} total_granted_power=1425 power_range=1425 max_allocatable_power=1776 current_temperature=67800 delta_temperature=2200
  irq/40-c263000.-156   [000] ....   104.711664: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=67800 temp=70000

So it seems indeed the 'polling-delay-passive' is needed for better reactivity,
and it should also be re-added to the other thermal zones.

On a different note I first tried something similar on the GPU, the trip points
triggered, however there was no reaction from power_allocator, the reason is
that there is no power information for the GPU (num power actors = 0). It seems
it doesn' make sense to use IPA as long as there is no energy model (even if it
worked by using the lowest frequency as 'estimated power' throttling would
likely be overly aggressive). Since the trip point configuration for IPA and
'step_wise' (and probably others) is somewhat incompatible (IPA aims for a
temperature around the 2nd trip point, 'step_wise' interprets the first trip
point as limit) I think it makes sense to continue with a single trip point
for now.

Thanks

Matthias

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support
  2020-10-15 22:19         ` Matthias Kaehlcke
@ 2020-10-16 13:46           ` Akhil P Oommen
  0 siblings, 0 replies; 15+ messages in thread
From: Akhil P Oommen @ 2020-10-16 13:46 UTC (permalink / raw)
  To: Matthias Kaehlcke, manafm
  Cc: Doug Anderson, linux-arm-msm, Amit Kucheria, LKML, Rajeshwari,
	dri-devel, freedreno

On 10/16/2020 3:49 AM, Matthias Kaehlcke wrote:
> Hi,
> 
> On Thu, Oct 15, 2020 at 12:07:01AM +0530, manafm@codeaurora.org wrote:
>> On 2020-10-14 18:59, Akhil P Oommen wrote:
>>> On 10/9/2020 10:27 PM, Matthias Kaehlcke wrote:
>>>> On Fri, Oct 09, 2020 at 08:05:10AM -0700, Doug Anderson wrote:
>>>>> Hi,
>>>>>
>>>>> On Thu, Oct 8, 2020 at 10:10 AM Akhil P Oommen
>>>>> <akhilpo@codeaurora.org> wrote:
>>>>>>
>>>>>> Add cooling-cells property and the cooling maps for the gpu tzones
>>>>>> to support GPU cooling.
>>>>>>
>>>>>> Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
>>>>>> ---
>>>>>>    arch/arm64/boot/dts/qcom/sc7180.dtsi | 29
>>>>>> ++++++++++++++++++++++-------
>>>>>>    1 file changed, 22 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>>>> b/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>>>> index d46b383..40d6a28 100644
>>>>>> --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>>>> +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
>>>>>> @@ -2,7 +2,7 @@
>>>>>>    /*
>>>>>>     * SC7180 SoC device tree source
>>>>>>     *
>>>>>> - * Copyright (c) 2019, The Linux Foundation. All rights reserved.
>>>>>> + * Copyright (c) 2019-20, The Linux Foundation. All rights
>>>>>> reserved.
>>>>>>     */
>>>>>>
>>>>>>    #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
>>>>>> @@ -1885,6 +1885,7 @@
>>>>>>                           iommus = <&adreno_smmu 0>;
>>>>>>                           operating-points-v2 = <&gpu_opp_table>;
>>>>>>                           qcom,gmu = <&gmu>;
>>>>>> +                       #cooling-cells = <2>;
>>>>>
>>>>> Presumably we should add this to the devicetree bindings, too?
>>> Yes, thanks for catching this. Will update in the next patch.
>>>
>>>>>
>>>>>
>>>>>>                           interconnects = <&gem_noc
>>>>>> MASTER_GFX3D &mc_virt SLAVE_EBI1>;
>>>>>>                           interconnect-names = "gfx-mem";
>>>>>> @@ -3825,16 +3826,16 @@
>>>>>>                   };
>>>>>>
>>>>>>                   gpuss0-thermal {
>>>>>> -                       polling-delay-passive = <0>;
>>>>>> +                       polling-delay-passive = <100>;
>>>>>
>>>>> Why did you make this change?  I'm pretty sure that we _don't_ want
>>>>> this since we're using interrupts for the thermal sensor.  See commit
>>>>> 22337b91022d ("arm64: dts: qcom: sc7180: Changed polling mode in
>>>>> Thermal-zones node").
>>>>
>>>> I was going to ask the same, this shouldn't be needed.
>> As per our understanding unlike "polling-delay",  this delay property is
>> intended to activate polling thread on post trip threshold violation and  it
>> is irrespective of sensor is capable for trip interrupt or not.
>> This polling is more of governor related. Below are the few references from
>> Documentation/code which tells polling-delay-passive is needed for IPA for
>> better IPA performance.
>>
>> As per Power allocator documentations
>>
>> 1. https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/Documentation/driver-api/thermal/power_allocator.rst?h=v5.4.71#n264
>>
>> "The power allocator governor's PID controller works best if there is a
>> periodic tick.  If you have a driver that calls
>> `thermal_zone_device_update()` (or anything that ends up calling the
>> governor's `throttle()` function) repetitively, the governor response
>> won't be very good.  Note that this is not particular to this
>> governor, step-wise will also misbehave if you call its throttle()
>> faster than the normal thermal framework tick (due to interrupts for
>> example) as it will overreact"
>>
>> 2. In Power allocator code, when  switch_on/control trip temp violation, it
>> is enabling passive counter to activate passive polling @ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/thermal/power_allocator.c?h=v5.4.71#n634
>>
>> 3. while calculating derivative term, it is using passive_delay @
>> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/thermal/power_allocator.c?h=v5.4.71#n243
>>
>> 4. Sensor interrupt will work if temperature is fluctuating between
>> trip_temp and hysteresis. But say a case where we are not enabling
>> polling-delay-passive. In this case if  current temperature > control_temp
>> trip(2nd passive trip) and
>>   temperature trend is still raising, then sensor high trip will be disabled
>> (OR configured for critical trip threshold). No more trip interrupt from
>> sensor until it reaches critical trip or falls below control_temp
>> hysteresis.
>>   How  the governor re-evaluate its next mitigation without passive polling
>> thread  here ?
>>
>> I think the same is required for CPU thermal zone as well.
> 
> Thanks for the explication and pointers!
> 
> I ran some tests to re-confirm. For that I lowered the trip point temperatures
> of CPU6 to 60/70, to make it easier to trigger throttling without necessarily
> affecting the other CPUs. Further I enabled tracing for the events 'thermal_temperature',
> 'thermal_zone_trip' and 'thermal_power_allocator'. With that I ran a CPU
> intensive task on CPU6.
> 
> Without polling-delay the trace log looks like this:
> 
>    irq/40-c263000.-157   [000] ....    48.035986: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=57800 temp=60000
>    irq/40-c263000.-157   [000] ....    48.036029: thermal_power_allocator_pid: thermal_zone_id=6 err=10000 err_integral=0 p=2402 i=0 d=0 output=1776
>    irq/40-c263000.-157   [000] ....    48.036036: thermal_power_allocator: thermal_zone_id=6 req_power={{0x96}} total_req_power=150 granted_power={{0x6f0}} total_granted_power=1776 power_range=1776 max_allocatable_power=1776 current_temperature=60000 delta_temperature=10000
>    irq/40-c263000.-157   [000] ....    52.480888: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=60000 temp=70000
>    irq/40-c263000.-157   [000] ....    52.480925: thermal_power_allocator_pid: thermal_zone_id=6 err=0 err_integral=0 p=0 i=0 d=0 output=1202
>    irq/40-c263000.-157   [000] ....    52.480931: thermal_power_allocator: thermal_zone_id=6 req_power={{0x45e}} total_req_power=1118 granted_power={{0x4b2}} total_granted_power=1202 power_range=1202 max_allocatable_power=1776 current_temperature=70000 delta_temperature=0
> 
> i.e. power_allocator only acts on the sensor interrupts at the trip points
> 
> It looks different with a non-zero value for 'polling-delay-passive':
> 
>    irq/40-c263000.-156   [000] ....   104.501777: thermal_power_allocator: thermal_zone_id=6 req_power={{0x331}} total_req_power=817 granted_power={{0x591}} total_granted_power=1425 power_range=1425 max_allocatable_power=1776 current_temperature=67800 delta_temperature=2200
>    irq/40-c263000.-156   [000] ....   104.523073: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=67800 temp=70000
>    irq/40-c263000.-156   [000] ....   104.523121: thermal_power_allocator_pid: thermal_zone_id=6 err=0 err_integral=-31200 p=0 i=-305 d=0 output=897
>    irq/40-c263000.-156   [000] ....   104.523148: thermal_power_allocator: thermal_zone_id=6 req_power={{0x406}} total_req_power=1030 granted_power={{0x381}} total_granted_power=897 power_range=897 max_allocatable_power=1776 current_temperature=70000 delta_temperature=0
>    irq/40-c263000.-156   [000] ....   104.608566: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=70000 temp=67800
>    irq/40-c263000.-156   [000] ....   104.608612: thermal_power_allocator_pid: thermal_zone_id=6 err=2200 err_integral=-31200 p=528 i=-305 d=0 output=1425
>    irq/40-c263000.-156   [000] ....   104.608642: thermal_power_allocator: thermal_zone_id=6 req_power={{0x331}} total_req_power=817 granted_power={{0x591}} total_granted_power=1425 power_range=1425 max_allocatable_power=1776 current_temperature=67800 delta_temperature=2200
>    irq/40-c263000.-156   [000] ....   104.630863: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=67800 temp=70000
>    irq/40-c263000.-156   [000] ....   104.630907: thermal_power_allocator_pid: thermal_zone_id=6 err=0 err_integral=-31200 p=0 i=-305 d=0 output=897
>    irq/40-c263000.-156   [000] ....   104.630932: thermal_power_allocator: thermal_zone_id=6 req_power={{0x3f4}} total_req_power=1012 granted_power={{0x381}} total_granted_power=897 power_range=897 max_allocatable_power=1776 current_temperature=70000 delta_temperature=0
>    irq/40-c263000.-156   [000] ....   104.687495: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=70000 temp=67800
>    irq/40-c263000.-156   [000] ....   104.687541: thermal_power_allocator_pid: thermal_zone_id=6 err=2200 err_integral=-31200 p=528 i=-305 d=0 output=1425
>    irq/40-c263000.-156   [000] ....   104.687567: thermal_power_allocator: thermal_zone_id=6 req_power={{0x338}} total_req_power=824 granted_power={{0x591}} total_granted_power=1425 power_range=1425 max_allocatable_power=1776 current_temperature=67800 delta_temperature=2200
>    irq/40-c263000.-156   [000] ....   104.711664: thermal_temperature: thermal_zone=cpu6-thermal id=6 temp_prev=67800 temp=70000
> 
> So it seems indeed the 'polling-delay-passive' is needed for better reactivity,
> and it should also be re-added to the other thermal zones.
> 
> On a different note I first tried something similar on the GPU, the trip points
> triggered, however there was no reaction from power_allocator, the reason is
> that there is no power information for the GPU (num power actors = 0). It seems
> it doesn' make sense to use IPA as long as there is no energy model (even if it
> worked by using the lowest frequency as 'estimated power' throttling would
> likely be overly aggressive). Since the trip point configuration for IPA and
> 'step_wise' (and probably others) is somewhat incompatible (IPA aims for a
> temperature around the 2nd trip point, 'step_wise' interprets the first trip
> point as limit) I think it makes sense to continue with a single trip point
> for now.
> 
> Thanks
> 
> Matthias
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
Looks like we have consensus here. I will spin another patchset.
Manaf will share a separate patch to fix the CPU tzones.

-Akhil.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2020-10-16 13:48 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-08 17:09 [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Akhil P Oommen
2020-10-08 17:09 ` [PATCH 2/2] drm/msm: Add support for GPU cooling Akhil P Oommen
2020-10-09 18:36   ` [2/2] " mka
2020-10-12 13:33     ` Akhil P Oommen
2020-10-12 17:40       ` mka
2020-10-13 13:53         ` Akhil P Oommen
2020-10-13 17:40           ` mka
2020-10-13 19:21             ` Akhil P Oommen
2020-10-14  1:09               ` mka
2020-10-09 15:05 ` [PATCH 1/2] arm64: dts: qcom: sc7180: Add gpu cooling support Doug Anderson
2020-10-09 16:57   ` Matthias Kaehlcke
2020-10-14 13:29     ` Akhil P Oommen
2020-10-14 18:37       ` manafm
2020-10-15 22:19         ` Matthias Kaehlcke
2020-10-16 13:46           ` Akhil P Oommen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).