linux-pm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] thermal: sun8i: Be loud when probe fails
@ 2020-07-08 10:55 Ondrej Jirman
  2020-07-08 11:03 ` Russell King - ARM Linux admin
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Ondrej Jirman @ 2020-07-08 10:55 UTC (permalink / raw)
  To: linux-sunxi
  Cc: Ondrej Jirman, Vasily Khoruzhick, Yangtao Li, Zhang Rui,
	Daniel Lezcano, Amit Kucheria, Maxime Ripard, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

I noticed several mobile Linux distributions failing to enable the
thermal regulation correctly, because the kernel is silent
when thermal driver fails to probe. Add enough error reporting
to debug issues and warn users in case thermal sensor is failing
to probe.

Failing to notify users means, that SoC can easily overheat under
load.

Signed-off-by: Ondrej Jirman <megous@megous.com>
---
 drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
index 74d73be16496..9065e79ae743 100644
--- a/drivers/thermal/sun8i_thermal.c
+++ b/drivers/thermal/sun8i_thermal.c
@@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
 
 	calcell = devm_nvmem_cell_get(dev, "calibration");
 	if (IS_ERR(calcell)) {
+		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
+			PTR_ERR(calcell));
+
 		if (PTR_ERR(calcell) == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
+
 		/*
 		 * Even if the external calibration data stored in sid is
 		 * not accessible, the THS hardware can still work, although
@@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
 	caldata = nvmem_cell_read(calcell, &callen);
 	if (IS_ERR(caldata)) {
 		ret = PTR_ERR(caldata);
+		dev_err(dev, "Failed to read calibration data (%d)\n",
+			ret);
 		goto out;
 	}
 
@@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
 		return PTR_ERR(base);
 
 	tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
-	if (IS_ERR(tmdev->regmap))
+	if (IS_ERR(tmdev->regmap)) {
+		dev_err(dev, "Failed to init regmap (%ld)\n",
+			PTR_ERR(tmdev->regmap));
 		return PTR_ERR(tmdev->regmap);
+	}
 
 	if (tmdev->chip->has_bus_clk_reset) {
 		tmdev->reset = devm_reset_control_get(dev, NULL);
-		if (IS_ERR(tmdev->reset))
+		if (IS_ERR(tmdev->reset)) {
+			dev_err(dev, "Failed to get reset (%ld)\n",
+				PTR_ERR(tmdev->reset));
 			return PTR_ERR(tmdev->reset);
+		}
 
 		tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
-		if (IS_ERR(tmdev->bus_clk))
+		if (IS_ERR(tmdev->bus_clk)) {
+			dev_err(dev, "Failed to get bus clock (%ld)\n",
+				PTR_ERR(tmdev->bus_clk));
 			return PTR_ERR(tmdev->bus_clk);
+		}
 	}
 
 	if (tmdev->chip->has_mod_clk) {
 		tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
-		if (IS_ERR(tmdev->mod_clk))
+		if (IS_ERR(tmdev->mod_clk)) {
+			dev_err(dev, "Failed to get mod clock (%ld)\n",
+				PTR_ERR(tmdev->mod_clk));
 			return PTR_ERR(tmdev->mod_clk);
+		}
 	}
 
 	ret = reset_control_deassert(tmdev->reset);
@@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
 							     i,
 							     &tmdev->sensor[i],
 							     &ths_ops);
-		if (IS_ERR(tmdev->sensor[i].tzd))
+		if (IS_ERR(tmdev->sensor[i].tzd)) {
+			dev_err(tmdev->dev,
+				"Failed to register sensor %d (%ld)\n",
+				i, PTR_ERR(tmdev->sensor[i].tzd));
 			return PTR_ERR(tmdev->sensor[i].tzd);
+		}
 
 		if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
 			dev_warn(tmdev->dev,
@@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
 
 	ret = sun8i_ths_resource_init(tmdev);
 	if (ret)
-		return ret;
+		goto err_out;
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return irq;
+	if (irq < 0) {
+		ret = irq;
+		goto err_out;
+	}
 
 	ret = tmdev->chip->init(tmdev);
 	if (ret)
-		return ret;
+		goto err_out;
 
 	ret = sun8i_ths_register(tmdev);
 	if (ret)
-		return ret;
+		goto err_out;
 
 	/*
 	 * Avoid entering the interrupt handler, the thermal device is not
@@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
 	ret = devm_request_threaded_irq(dev, irq, NULL,
 					sun8i_irq_thread,
 					IRQF_ONESHOT, "ths", tmdev);
-	if (ret)
-		return ret;
+	if (ret) {
+		dev_err(dev, "Failed to request irq (%d)\n", ret);
+		goto err_out;
+	}
 
+	dev_info(dev, "Thermal sensor ready!\n");
 	return 0;
+
+err_out:
+	dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
+	return ret;
 }
 
 static int sun8i_ths_remove(struct platform_device *pdev)
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 10:55 [PATCH] thermal: sun8i: Be loud when probe fails Ondrej Jirman
@ 2020-07-08 11:03 ` Russell King - ARM Linux admin
  2020-07-08 11:10   ` Ondřej Jirman
  2020-07-20  7:55   ` Icenowy Zheng
  2020-07-08 11:55 ` Frank Lee
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 17+ messages in thread
From: Russell King - ARM Linux admin @ 2020-07-08 11:03 UTC (permalink / raw)
  To: Ondrej Jirman
  Cc: linux-sunxi, Amit Kucheria, open list:ALLWINNER THERMAL DRIVER,
	Yangtao Li, Daniel Lezcano, open list, Maxime Ripard,
	Vasily Khoruzhick, Chen-Yu Tsai, Zhang Rui,
	moderated list:ARM/Allwinner sunXi SoC support

On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> I noticed several mobile Linux distributions failing to enable the
> thermal regulation correctly, because the kernel is silent
> when thermal driver fails to probe. Add enough error reporting
> to debug issues and warn users in case thermal sensor is failing
> to probe.
> 
> Failing to notify users means, that SoC can easily overheat under
> load.
> 
> Signed-off-by: Ondrej Jirman <megous@megous.com>
> ---
>  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> index 74d73be16496..9065e79ae743 100644
> --- a/drivers/thermal/sun8i_thermal.c
> +++ b/drivers/thermal/sun8i_thermal.c
> @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>  
>  	calcell = devm_nvmem_cell_get(dev, "calibration");
>  	if (IS_ERR(calcell)) {
> +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> +			PTR_ERR(calcell));

Consider using:

		dev_err(dev, "Failed to get calibration nvmem cell (%pe)\n",
			calcell);

which means the kernel can print the symbolic errno value.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 11:03 ` Russell King - ARM Linux admin
@ 2020-07-08 11:10   ` Ondřej Jirman
  2020-07-20  7:55   ` Icenowy Zheng
  1 sibling, 0 replies; 17+ messages in thread
From: Ondřej Jirman @ 2020-07-08 11:10 UTC (permalink / raw)
  To: Russell King - ARM Linux admin
  Cc: linux-sunxi, Amit Kucheria, open list:ALLWINNER THERMAL DRIVER,
	Yangtao Li, Daniel Lezcano, open list, Maxime Ripard,
	Vasily Khoruzhick, Chen-Yu Tsai, Zhang Rui,
	moderated list:ARM/Allwinner sunXi SoC support

On Wed, Jul 08, 2020 at 12:03:01PM +0100, Russell King - ARM Linux admin wrote:
> On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> > 
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> > 
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >  
> >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> >  	if (IS_ERR(calcell)) {
> > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +			PTR_ERR(calcell));
> 
> Consider using:
> 
> 		dev_err(dev, "Failed to get calibration nvmem cell (%pe)\n",
> 			calcell);
> 
> which means the kernel can print the symbolic errno value.

Thank you, I'll change it in v2. :)

regards,
	o.

> -- 
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 10:55 [PATCH] thermal: sun8i: Be loud when probe fails Ondrej Jirman
  2020-07-08 11:03 ` Russell King - ARM Linux admin
@ 2020-07-08 11:55 ` Frank Lee
  2020-07-08 13:21   ` Ondřej Jirman
  2020-07-08 13:33   ` Ondřej Jirman
  2020-07-08 12:25 ` Maxime Ripard
  2020-07-08 13:29 ` Maxime Ripard
  3 siblings, 2 replies; 17+ messages in thread
From: Frank Lee @ 2020-07-08 11:55 UTC (permalink / raw)
  To: Ondrej Jirman
  Cc: linux-sunxi, Vasily Khoruzhick, Zhang Rui, Daniel Lezcano,
	Amit Kucheria, Maxime Ripard, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

HI Ondrej,
On Wed, Jul 8, 2020 at 6:55 PM Ondrej Jirman <megous@megous.com> wrote:
>
> I noticed several mobile Linux distributions failing to enable the
> thermal regulation correctly, because the kernel is silent
> when thermal driver fails to probe. Add enough error reporting
> to debug issues and warn users in case thermal sensor is failing
> to probe.
>
> Failing to notify users means, that SoC can easily overheat under
> load.
>
> Signed-off-by: Ondrej Jirman <megous@megous.com>
> ---
>  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> index 74d73be16496..9065e79ae743 100644
> --- a/drivers/thermal/sun8i_thermal.c
> +++ b/drivers/thermal/sun8i_thermal.c
> @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>
>         calcell = devm_nvmem_cell_get(dev, "calibration");
>         if (IS_ERR(calcell)) {
> +               dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> +                       PTR_ERR(calcell));
> +
>                 if (PTR_ERR(calcell) == -EPROBE_DEFER)
>                         return -EPROBE_DEFER;
> +
>                 /*
>                  * Even if the external calibration data stored in sid is
>                  * not accessible, the THS hardware can still work, although
> @@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>         caldata = nvmem_cell_read(calcell, &callen);
>         if (IS_ERR(caldata)) {
>                 ret = PTR_ERR(caldata);
> +               dev_err(dev, "Failed to read calibration data (%d)\n",
> +                       ret);
>                 goto out;
>         }
>
> @@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
>                 return PTR_ERR(base);
>
>         tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
> -       if (IS_ERR(tmdev->regmap))
> +       if (IS_ERR(tmdev->regmap)) {
> +               dev_err(dev, "Failed to init regmap (%ld)\n",
> +                       PTR_ERR(tmdev->regmap));
>                 return PTR_ERR(tmdev->regmap);
> +       }
>
>         if (tmdev->chip->has_bus_clk_reset) {
>                 tmdev->reset = devm_reset_control_get(dev, NULL);
> -               if (IS_ERR(tmdev->reset))
> +               if (IS_ERR(tmdev->reset)) {
> +                       dev_err(dev, "Failed to get reset (%ld)\n",
> +                               PTR_ERR(tmdev->reset));
>                         return PTR_ERR(tmdev->reset);
> +               }
>
>                 tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
> -               if (IS_ERR(tmdev->bus_clk))
> +               if (IS_ERR(tmdev->bus_clk)) {
> +                       dev_err(dev, "Failed to get bus clock (%ld)\n",
> +                               PTR_ERR(tmdev->bus_clk));
>                         return PTR_ERR(tmdev->bus_clk);
> +               }
>         }
>
>         if (tmdev->chip->has_mod_clk) {
>                 tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
> -               if (IS_ERR(tmdev->mod_clk))
> +               if (IS_ERR(tmdev->mod_clk)) {
> +                       dev_err(dev, "Failed to get mod clock (%ld)\n",
> +                               PTR_ERR(tmdev->mod_clk));
>                         return PTR_ERR(tmdev->mod_clk);
> +               }
>         }
>
>         ret = reset_control_deassert(tmdev->reset);
> @@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
>                                                              i,
>                                                              &tmdev->sensor[i],
>                                                              &ths_ops);
> -               if (IS_ERR(tmdev->sensor[i].tzd))
> +               if (IS_ERR(tmdev->sensor[i].tzd)) {
> +                       dev_err(tmdev->dev,
> +                               "Failed to register sensor %d (%ld)\n",
> +                               i, PTR_ERR(tmdev->sensor[i].tzd));
>                         return PTR_ERR(tmdev->sensor[i].tzd);
> +               }
>
>                 if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
>                         dev_warn(tmdev->dev,
> @@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>
>         ret = sun8i_ths_resource_init(tmdev);
>         if (ret)
> -               return ret;
> +               goto err_out;
>
>         irq = platform_get_irq(pdev, 0);
> -       if (irq < 0)
> -               return irq;
> +       if (irq < 0) {
> +               ret = irq;
> +               goto err_out;
> +       }
>
>         ret = tmdev->chip->init(tmdev);
>         if (ret)
> -               return ret;
> +               goto err_out;
>
>         ret = sun8i_ths_register(tmdev);
>         if (ret)
> -               return ret;
> +               goto err_out;
>
>         /*
>          * Avoid entering the interrupt handler, the thermal device is not
> @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>         ret = devm_request_threaded_irq(dev, irq, NULL,
>                                         sun8i_irq_thread,
>                                         IRQF_ONESHOT, "ths", tmdev);
> -       if (ret)
> -               return ret;
> +       if (ret) {
> +               dev_err(dev, "Failed to request irq (%d)\n", ret);
> +               goto err_out;
> +       }
>
> +       dev_info(dev, "Thermal sensor ready!\n");
>         return 0;
> +
> +err_out:
> +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);

When the driver fails, there will be this print. Isn't it superfluous
for you to add these?

sun8i-thermal: probe of 5070400.thermal-sensor failed with error


Yangtao

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 10:55 [PATCH] thermal: sun8i: Be loud when probe fails Ondrej Jirman
  2020-07-08 11:03 ` Russell King - ARM Linux admin
  2020-07-08 11:55 ` Frank Lee
@ 2020-07-08 12:25 ` Maxime Ripard
  2020-07-08 13:29   ` Ondřej Jirman
  2020-07-08 13:29 ` Maxime Ripard
  3 siblings, 1 reply; 17+ messages in thread
From: Maxime Ripard @ 2020-07-08 12:25 UTC (permalink / raw)
  To: Ondrej Jirman
  Cc: linux-sunxi, Vasily Khoruzhick, Yangtao Li, Zhang Rui,
	Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

Hi,

On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> I noticed several mobile Linux distributions failing to enable the
> thermal regulation correctly, because the kernel is silent
> when thermal driver fails to probe. Add enough error reporting
> to debug issues and warn users in case thermal sensor is failing
> to probe.
> 
> Failing to notify users means, that SoC can easily overheat under
> load.
> 
> Signed-off-by: Ondrej Jirman <megous@megous.com>
> ---
>  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> index 74d73be16496..9065e79ae743 100644
> --- a/drivers/thermal/sun8i_thermal.c
> +++ b/drivers/thermal/sun8i_thermal.c
> @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>  
>  	calcell = devm_nvmem_cell_get(dev, "calibration");
>  	if (IS_ERR(calcell)) {
> +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> +			PTR_ERR(calcell));
> +
>  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
>  			return -EPROBE_DEFER;
> +

The rest of the patch makes sense, but we should probably put the error
message after the EPROBE_DEFER return so that we don't print any extra
noise that isn't necessarily useful

Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 11:55 ` Frank Lee
@ 2020-07-08 13:21   ` Ondřej Jirman
  2020-07-08 13:42     ` Robin Murphy
  2020-07-08 13:33   ` Ondřej Jirman
  1 sibling, 1 reply; 17+ messages in thread
From: Ondřej Jirman @ 2020-07-08 13:21 UTC (permalink / raw)
  To: Frank Lee
  Cc: linux-sunxi, Vasily Khoruzhick, Zhang Rui, Daniel Lezcano,
	Amit Kucheria, Maxime Ripard, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Wed, Jul 08, 2020 at 07:55:40PM +0800, Frank Lee wrote:
> HI Ondrej,
> On Wed, Jul 8, 2020 at 6:55 PM Ondrej Jirman <megous@megous.com> wrote:
> >
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> >
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> >
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >
> >         calcell = devm_nvmem_cell_get(dev, "calibration");
> >         if (IS_ERR(calcell)) {
> > +               dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +                       PTR_ERR(calcell));
> > +
> >                 if (PTR_ERR(calcell) == -EPROBE_DEFER)
> >                         return -EPROBE_DEFER;
> > +
> >                 /*
> >                  * Even if the external calibration data stored in sid is
> >                  * not accessible, the THS hardware can still work, although
> > @@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >         caldata = nvmem_cell_read(calcell, &callen);
> >         if (IS_ERR(caldata)) {
> >                 ret = PTR_ERR(caldata);
> > +               dev_err(dev, "Failed to read calibration data (%d)\n",
> > +                       ret);
> >                 goto out;
> >         }
> >
> > @@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
> >                 return PTR_ERR(base);
> >
> >         tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
> > -       if (IS_ERR(tmdev->regmap))
> > +       if (IS_ERR(tmdev->regmap)) {
> > +               dev_err(dev, "Failed to init regmap (%ld)\n",
> > +                       PTR_ERR(tmdev->regmap));
> >                 return PTR_ERR(tmdev->regmap);
> > +       }
> >
> >         if (tmdev->chip->has_bus_clk_reset) {
> >                 tmdev->reset = devm_reset_control_get(dev, NULL);
> > -               if (IS_ERR(tmdev->reset))
> > +               if (IS_ERR(tmdev->reset)) {
> > +                       dev_err(dev, "Failed to get reset (%ld)\n",
> > +                               PTR_ERR(tmdev->reset));
> >                         return PTR_ERR(tmdev->reset);
> > +               }
> >
> >                 tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
> > -               if (IS_ERR(tmdev->bus_clk))
> > +               if (IS_ERR(tmdev->bus_clk)) {
> > +                       dev_err(dev, "Failed to get bus clock (%ld)\n",
> > +                               PTR_ERR(tmdev->bus_clk));
> >                         return PTR_ERR(tmdev->bus_clk);
> > +               }
> >         }
> >
> >         if (tmdev->chip->has_mod_clk) {
> >                 tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
> > -               if (IS_ERR(tmdev->mod_clk))
> > +               if (IS_ERR(tmdev->mod_clk)) {
> > +                       dev_err(dev, "Failed to get mod clock (%ld)\n",
> > +                               PTR_ERR(tmdev->mod_clk));
> >                         return PTR_ERR(tmdev->mod_clk);
> > +               }
> >         }
> >
> >         ret = reset_control_deassert(tmdev->reset);
> > @@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
> >                                                              i,
> >                                                              &tmdev->sensor[i],
> >                                                              &ths_ops);
> > -               if (IS_ERR(tmdev->sensor[i].tzd))
> > +               if (IS_ERR(tmdev->sensor[i].tzd)) {
> > +                       dev_err(tmdev->dev,
> > +                               "Failed to register sensor %d (%ld)\n",
> > +                               i, PTR_ERR(tmdev->sensor[i].tzd));
> >                         return PTR_ERR(tmdev->sensor[i].tzd);
> > +               }
> >
> >                 if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
> >                         dev_warn(tmdev->dev,
> > @@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >
> >         ret = sun8i_ths_resource_init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         irq = platform_get_irq(pdev, 0);
> > -       if (irq < 0)
> > -               return irq;
> > +       if (irq < 0) {
> > +               ret = irq;
> > +               goto err_out;
> > +       }
> >
> >         ret = tmdev->chip->init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         ret = sun8i_ths_register(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         /*
> >          * Avoid entering the interrupt handler, the thermal device is not
> > @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >         ret = devm_request_threaded_irq(dev, irq, NULL,
> >                                         sun8i_irq_thread,
> >                                         IRQF_ONESHOT, "ths", tmdev);
> > -       if (ret)
> > -               return ret;
> > +       if (ret) {
> > +               dev_err(dev, "Failed to request irq (%d)\n", ret);
> > +               goto err_out;
> > +       }
> >
> > +       dev_info(dev, "Thermal sensor ready!\n");
> >         return 0;
> > +
> > +err_out:
> > +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
> 
> When the driver fails, there will be this print. Isn't it superfluous
> for you to add these?
> 
> sun8i-thermal: probe of 5070400.thermal-sensor failed with error

There's no such failure message in the case I investigated, which is
EPROBE_DEFER failure waiting for nvmem driver that never loads,
because it's not configured by the user to build.

regards,
	o.

> 
> Yangtao

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 12:25 ` Maxime Ripard
@ 2020-07-08 13:29   ` Ondřej Jirman
  2020-07-08 13:36     ` Maxime Ripard
  0 siblings, 1 reply; 17+ messages in thread
From: Ondřej Jirman @ 2020-07-08 13:29 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: linux-sunxi, Vasily Khoruzhick, Yangtao Li, Zhang Rui,
	Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

Hello Maxime,

On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> Hi,
> 
> On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> > 
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> > 
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >  
> >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> >  	if (IS_ERR(calcell)) {
> > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +			PTR_ERR(calcell));
> > +
> >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> >  			return -EPROBE_DEFER;
> > +
> 
> The rest of the patch makes sense, but we should probably put the error
> message after the EPROBE_DEFER return so that we don't print any extra
> noise that isn't necessarily useful

I thought about that, but in this case this would have helped, see my other
e-mail. Though lack of "probe success" message may be enough for me, to
debug the issue, I'm not sure the user will notice that a message is missing, while
he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.

And people run several distros for 3-4 months without anyone noticing any
issues and that thermal regulation doesn't work. So it seems that lack of a
success message is not enough.

Other solution may be to select CONFIG_NVMEM_SUNXI_SID if this driver
is enabled. That may get rid of this error scenario of waiting infinitely
for calibration data with EPROBE_DEFER. And other potential EPROBE_DEFER sources
will probably be quite visible even without this driver telling the user.
So this message may not be necessary in that case.

thank you and regards,
	o.

> Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 10:55 [PATCH] thermal: sun8i: Be loud when probe fails Ondrej Jirman
                   ` (2 preceding siblings ...)
  2020-07-08 12:25 ` Maxime Ripard
@ 2020-07-08 13:29 ` Maxime Ripard
  3 siblings, 0 replies; 17+ messages in thread
From: Maxime Ripard @ 2020-07-08 13:29 UTC (permalink / raw)
  To: Ondrej Jirman
  Cc: linux-sunxi, Vasily Khoruzhick, Yangtao Li, Zhang Rui,
	Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>  	ret = devm_request_threaded_irq(dev, irq, NULL,
>  					sun8i_irq_thread,
>  					IRQF_ONESHOT, "ths", tmdev);
> -	if (ret)
> -		return ret;
> +	if (ret) {
> +		dev_err(dev, "Failed to request irq (%d)\n", ret);
> +		goto err_out;
> +	}
>  
> +	dev_info(dev, "Thermal sensor ready!\n");
>  	return 0;

I missed that in my first mail, but I'm not sure we want to print
anything on success. This doesn't bring any value and that will only
make it harder to find errors in other drivers.

Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 11:55 ` Frank Lee
  2020-07-08 13:21   ` Ondřej Jirman
@ 2020-07-08 13:33   ` Ondřej Jirman
  1 sibling, 0 replies; 17+ messages in thread
From: Ondřej Jirman @ 2020-07-08 13:33 UTC (permalink / raw)
  To: Frank Lee
  Cc: linux-sunxi, Vasily Khoruzhick, Zhang Rui, Daniel Lezcano,
	Amit Kucheria, Maxime Ripard, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Wed, Jul 08, 2020 at 07:55:40PM +0800, Frank Lee wrote:
> HI Ondrej,
> On Wed, Jul 8, 2020 at 6:55 PM Ondrej Jirman <megous@megous.com> wrote:
> >
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> >
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> >
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >
> >         calcell = devm_nvmem_cell_get(dev, "calibration");
> >         if (IS_ERR(calcell)) {
> > +               dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +                       PTR_ERR(calcell));
> > +
> >                 if (PTR_ERR(calcell) == -EPROBE_DEFER)
> >                         return -EPROBE_DEFER;
> > +
> >                 /*
> >                  * Even if the external calibration data stored in sid is
> >                  * not accessible, the THS hardware can still work, although
> > @@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >         caldata = nvmem_cell_read(calcell, &callen);
> >         if (IS_ERR(caldata)) {
> >                 ret = PTR_ERR(caldata);
> > +               dev_err(dev, "Failed to read calibration data (%d)\n",
> > +                       ret);
> >                 goto out;
> >         }
> >
> > @@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
> >                 return PTR_ERR(base);
> >
> >         tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
> > -       if (IS_ERR(tmdev->regmap))
> > +       if (IS_ERR(tmdev->regmap)) {
> > +               dev_err(dev, "Failed to init regmap (%ld)\n",
> > +                       PTR_ERR(tmdev->regmap));
> >                 return PTR_ERR(tmdev->regmap);
> > +       }
> >
> >         if (tmdev->chip->has_bus_clk_reset) {
> >                 tmdev->reset = devm_reset_control_get(dev, NULL);
> > -               if (IS_ERR(tmdev->reset))
> > +               if (IS_ERR(tmdev->reset)) {
> > +                       dev_err(dev, "Failed to get reset (%ld)\n",
> > +                               PTR_ERR(tmdev->reset));
> >                         return PTR_ERR(tmdev->reset);
> > +               }
> >
> >                 tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
> > -               if (IS_ERR(tmdev->bus_clk))
> > +               if (IS_ERR(tmdev->bus_clk)) {
> > +                       dev_err(dev, "Failed to get bus clock (%ld)\n",
> > +                               PTR_ERR(tmdev->bus_clk));
> >                         return PTR_ERR(tmdev->bus_clk);
> > +               }
> >         }
> >
> >         if (tmdev->chip->has_mod_clk) {
> >                 tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
> > -               if (IS_ERR(tmdev->mod_clk))
> > +               if (IS_ERR(tmdev->mod_clk)) {
> > +                       dev_err(dev, "Failed to get mod clock (%ld)\n",
> > +                               PTR_ERR(tmdev->mod_clk));
> >                         return PTR_ERR(tmdev->mod_clk);
> > +               }
> >         }
> >
> >         ret = reset_control_deassert(tmdev->reset);
> > @@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
> >                                                              i,
> >                                                              &tmdev->sensor[i],
> >                                                              &ths_ops);
> > -               if (IS_ERR(tmdev->sensor[i].tzd))
> > +               if (IS_ERR(tmdev->sensor[i].tzd)) {
> > +                       dev_err(tmdev->dev,
> > +                               "Failed to register sensor %d (%ld)\n",
> > +                               i, PTR_ERR(tmdev->sensor[i].tzd));
> >                         return PTR_ERR(tmdev->sensor[i].tzd);
> > +               }
> >
> >                 if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
> >                         dev_warn(tmdev->dev,
> > @@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >
> >         ret = sun8i_ths_resource_init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         irq = platform_get_irq(pdev, 0);
> > -       if (irq < 0)
> > -               return irq;
> > +       if (irq < 0) {
> > +               ret = irq;
> > +               goto err_out;
> > +       }
> >
> >         ret = tmdev->chip->init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         ret = sun8i_ths_register(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         /*
> >          * Avoid entering the interrupt handler, the thermal device is not
> > @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >         ret = devm_request_threaded_irq(dev, irq, NULL,
> >                                         sun8i_irq_thread,
> >                                         IRQF_ONESHOT, "ths", tmdev);
> > -       if (ret)
> > -               return ret;
> > +       if (ret) {
> > +               dev_err(dev, "Failed to request irq (%d)\n", ret);
> > +               goto err_out;
> > +       }
> >
> > +       dev_info(dev, "Thermal sensor ready!\n");
> >         return 0;
> > +
> > +err_out:
> > +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
> 
> When the driver fails, there will be this print. Isn't it superfluous
> for you to add these?
> 
> sun8i-thermal: probe of 5070400.thermal-sensor failed with error

Thinking more about it. You're right. This is probably only shown for
non-EPROBE_DEFER errors. So this printk is superfluous, since EPROBE_DEFER
from nvmem/sid is already shown elsewhere in this patch.

thanks,
	o.

> 
> Yangtao

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 13:29   ` Ondřej Jirman
@ 2020-07-08 13:36     ` Maxime Ripard
  2020-07-08 13:44       ` Ondřej Jirman
  0 siblings, 1 reply; 17+ messages in thread
From: Maxime Ripard @ 2020-07-08 13:36 UTC (permalink / raw)
  To: Ondřej Jirman, linux-sunxi, Vasily Khoruzhick, Yangtao Li,
	Zhang Rui, Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Wed, Jul 08, 2020 at 03:29:24PM +0200, Ondřej Jirman wrote:
> Hello Maxime,
> 
> On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> > Hi,
> > 
> > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > I noticed several mobile Linux distributions failing to enable the
> > > thermal regulation correctly, because the kernel is silent
> > > when thermal driver fails to probe. Add enough error reporting
> > > to debug issues and warn users in case thermal sensor is failing
> > > to probe.
> > > 
> > > Failing to notify users means, that SoC can easily overheat under
> > > load.
> > > 
> > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > ---
> > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > > index 74d73be16496..9065e79ae743 100644
> > > --- a/drivers/thermal/sun8i_thermal.c
> > > +++ b/drivers/thermal/sun8i_thermal.c
> > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> > >  
> > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > >  	if (IS_ERR(calcell)) {
> > > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > > +			PTR_ERR(calcell));
> > > +
> > >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> > >  			return -EPROBE_DEFER;
> > > +
> > 
> > The rest of the patch makes sense, but we should probably put the error
> > message after the EPROBE_DEFER return so that we don't print any extra
> > noise that isn't necessarily useful
> 
> I thought about that, but in this case this would have helped, see my other
> e-mail. Though lack of "probe success" message may be enough for me, to
> debug the issue, I'm not sure the user will notice that a message is missing, while
> he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.

Yeah, but on the other hand, we regularly have people that come up and
ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
wasn't there on the first attempt but was there on the second) is a
cause of concern or not.

> And people run several distros for 3-4 months without anyone noticing any
> issues and that thermal regulation doesn't work. So it seems that lack of a
> success message is not enough.

I understand what the issue is, but do you really expect phone users to
monitor the kernel logs every time they boot their phone to see if the
thermal throttling is enabled?

If anything, it looks like a distro problem, and the notification /
policy to deal with that should be implemented in userspace.

> Other solution may be to select CONFIG_NVMEM_SUNXI_SID if this driver
> is enabled. That may get rid of this error scenario of waiting infinitely
> for calibration data with EPROBE_DEFER. And other potential EPROBE_DEFER sources
> will probably be quite visible even without this driver telling the user.
> So this message may not be necessary in that case.

That would only partially solve your issue. If the nvmem driver doesn't
load for some reason, you would end up in a similar situation.

Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 13:21   ` Ondřej Jirman
@ 2020-07-08 13:42     ` Robin Murphy
  0 siblings, 0 replies; 17+ messages in thread
From: Robin Murphy @ 2020-07-08 13:42 UTC (permalink / raw)
  To: Ondřej Jirman, Frank Lee, linux-sunxi, Vasily Khoruzhick,
	Zhang Rui, Daniel Lezcano, Amit Kucheria, Maxime Ripard,
	Chen-Yu Tsai, open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On 2020-07-08 14:21, Ondřej Jirman wrote:
[...]
>>> @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>>>          ret = devm_request_threaded_irq(dev, irq, NULL,
>>>                                          sun8i_irq_thread,
>>>                                          IRQF_ONESHOT, "ths", tmdev);
>>> -       if (ret)
>>> -               return ret;
>>> +       if (ret) {
>>> +               dev_err(dev, "Failed to request irq (%d)\n", ret);
>>> +               goto err_out;
>>> +       }
>>>
>>> +       dev_info(dev, "Thermal sensor ready!\n");
>>>          return 0;
>>> +
>>> +err_out:
>>> +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
>>
>> When the driver fails, there will be this print. Isn't it superfluous
>> for you to add these?
>>
>> sun8i-thermal: probe of 5070400.thermal-sensor failed with error
> 
> There's no such failure message in the case I investigated, which is
> EPROBE_DEFER failure waiting for nvmem driver that never loads,
> because it's not configured by the user to build.

Ah, in that case this was a bit misleading, since "probe failure" isn't 
really the problem at all. As it happens, there's a whole other 
discussion ongoing around making probe deferral issues easier to debug:

https://lore.kernel.org/linux-arm-kernel/20200626100103.18879-1-a.hajda@samsung.com/

Robin.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 13:36     ` Maxime Ripard
@ 2020-07-08 13:44       ` Ondřej Jirman
  2020-07-08 13:57         ` Maxime Ripard
  0 siblings, 1 reply; 17+ messages in thread
From: Ondřej Jirman @ 2020-07-08 13:44 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: linux-sunxi, Vasily Khoruzhick, Yangtao Li, Zhang Rui,
	Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Wed, Jul 08, 2020 at 03:36:54PM +0200, Maxime Ripard wrote:
> On Wed, Jul 08, 2020 at 03:29:24PM +0200, Ondřej Jirman wrote:
> > Hello Maxime,
> > 
> > On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> > > Hi,
> > > 
> > > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > > I noticed several mobile Linux distributions failing to enable the
> > > > thermal regulation correctly, because the kernel is silent
> > > > when thermal driver fails to probe. Add enough error reporting
> > > > to debug issues and warn users in case thermal sensor is failing
> > > > to probe.
> > > > 
> > > > Failing to notify users means, that SoC can easily overheat under
> > > > load.
> > > > 
> > > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > > ---
> > > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> > > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > > > index 74d73be16496..9065e79ae743 100644
> > > > --- a/drivers/thermal/sun8i_thermal.c
> > > > +++ b/drivers/thermal/sun8i_thermal.c
> > > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> > > >  
> > > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > > >  	if (IS_ERR(calcell)) {
> > > > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > > > +			PTR_ERR(calcell));
> > > > +
> > > >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> > > >  			return -EPROBE_DEFER;
> > > > +
> > > 
> > > The rest of the patch makes sense, but we should probably put the error
> > > message after the EPROBE_DEFER return so that we don't print any extra
> > > noise that isn't necessarily useful
> > 
> > I thought about that, but in this case this would have helped, see my other
> > e-mail. Though lack of "probe success" message may be enough for me, to
> > debug the issue, I'm not sure the user will notice that a message is missing, while
> > he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.
> 
> Yeah, but on the other hand, we regularly have people that come up and
> ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> wasn't there on the first attempt but was there on the second) is a
> cause of concern or not.

That's why I also added a success message, to distinguish this case. 

> > And people run several distros for 3-4 months without anyone noticing any
> > issues and that thermal regulation doesn't work. So it seems that lack of a
> > success message is not enough.
> 
> I understand what the issue is, but do you really expect phone users to
> monitor the kernel logs every time they boot their phone to see if the
> thermal throttling is enabled?

Not phone users, but people making their own kernels/distributions. Those people
monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
(despite the complaints of overheating by their users).

So I thought some warning may be in order, so that distro people more easily
notice they have misconfigured the kernel or sometging.

End users really don't care about dmesg.

regards,
	o.

> If anything, it looks like a distro problem, and the notification /
> policy to deal with that should be implemented in userspace.
> 
> > Other solution may be to select CONFIG_NVMEM_SUNXI_SID if this driver
> > is enabled. That may get rid of this error scenario of waiting infinitely
> > for calibration data with EPROBE_DEFER. And other potential EPROBE_DEFER sources
> > will probably be quite visible even without this driver telling the user.
> > So this message may not be necessary in that case.
> 
> That would only partially solve your issue. If the nvmem driver doesn't
> load for some reason, you would end up in a similar situation.
> 
> Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 13:44       ` Ondřej Jirman
@ 2020-07-08 13:57         ` Maxime Ripard
  2020-07-12 23:29           ` Ondřej Jirman
  0 siblings, 1 reply; 17+ messages in thread
From: Maxime Ripard @ 2020-07-08 13:57 UTC (permalink / raw)
  To: Ondřej Jirman, linux-sunxi, Vasily Khoruzhick, Yangtao Li,
	Zhang Rui, Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Wed, Jul 08, 2020 at 03:44:41PM +0200, Ondřej Jirman wrote:
> On Wed, Jul 08, 2020 at 03:36:54PM +0200, Maxime Ripard wrote:
> > On Wed, Jul 08, 2020 at 03:29:24PM +0200, Ondřej Jirman wrote:
> > > Hello Maxime,
> > > 
> > > On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> > > > Hi,
> > > > 
> > > > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > > > I noticed several mobile Linux distributions failing to enable the
> > > > > thermal regulation correctly, because the kernel is silent
> > > > > when thermal driver fails to probe. Add enough error reporting
> > > > > to debug issues and warn users in case thermal sensor is failing
> > > > > to probe.
> > > > > 
> > > > > Failing to notify users means, that SoC can easily overheat under
> > > > > load.
> > > > > 
> > > > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > > > ---
> > > > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> > > > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > > > > index 74d73be16496..9065e79ae743 100644
> > > > > --- a/drivers/thermal/sun8i_thermal.c
> > > > > +++ b/drivers/thermal/sun8i_thermal.c
> > > > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> > > > >  
> > > > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > > > >  	if (IS_ERR(calcell)) {
> > > > > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > > > > +			PTR_ERR(calcell));
> > > > > +
> > > > >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> > > > >  			return -EPROBE_DEFER;
> > > > > +
> > > > 
> > > > The rest of the patch makes sense, but we should probably put the error
> > > > message after the EPROBE_DEFER return so that we don't print any extra
> > > > noise that isn't necessarily useful
> > > 
> > > I thought about that, but in this case this would have helped, see my other
> > > e-mail. Though lack of "probe success" message may be enough for me, to
> > > debug the issue, I'm not sure the user will notice that a message is missing, while
> > > he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.
> > 
> > Yeah, but on the other hand, we regularly have people that come up and
> > ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> > wasn't there on the first attempt but was there on the second) is a
> > cause of concern or not.
> 
> That's why I also added a success message, to distinguish this case. 

That doesn't really help though. We have plenty of drivers that have
some sort of success message and people will still ask about that error
message earlier.

> > > And people run several distros for 3-4 months without anyone noticing any
> > > issues and that thermal regulation doesn't work. So it seems that lack of a
> > > success message is not enough.
> > 
> > I understand what the issue is, but do you really expect phone users to
> > monitor the kernel logs every time they boot their phone to see if the
> > thermal throttling is enabled?
> 
> Not phone users, but people making their own kernels/distributions. Those people
> monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
> (despite the complaints of overheating by their users).
> 
> So I thought some warning may be in order, so that distro people more easily
> notice they have misconfigured the kernel or sometging.

I mean, then there's nothing we can do to properly address that then.

The configuration system is a gun, we can point at the target, but
anyone is definitely free to shot themself in the foot.

You would have exactly the same result if you left the thermal driver
disabled, or if you didn't have cpufreq support.

Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 13:57         ` Maxime Ripard
@ 2020-07-12 23:29           ` Ondřej Jirman
  2020-07-23 15:20             ` Maxime Ripard
  0 siblings, 1 reply; 17+ messages in thread
From: Ondřej Jirman @ 2020-07-12 23:29 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: linux-sunxi, Vasily Khoruzhick, Yangtao Li, Zhang Rui,
	Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

Hi Maxime,

On Wed, Jul 08, 2020 at 03:57:48PM +0200, Maxime Ripard wrote:
> On Wed, Jul 08, 2020 at 03:44:41PM +0200, Ondřej Jirman wrote:
> > >

[...]

> > > Yeah, but on the other hand, we regularly have people that come up and
> > > ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> > > wasn't there on the first attempt but was there on the second) is a
> > > cause of concern or not.
> > 
> > That's why I also added a success message, to distinguish this case. 
> 
> That doesn't really help though. We have plenty of drivers that have
> some sort of success message and people will still ask about that error
> message earlier.
> 
> > > > And people run several distros for 3-4 months without anyone noticing any
> > > > issues and that thermal regulation doesn't work. So it seems that lack of a
> > > > success message is not enough.
> > > 
> > > I understand what the issue is, but do you really expect phone users to
> > > monitor the kernel logs every time they boot their phone to see if the
> > > thermal throttling is enabled?
> > 
> > Not phone users, but people making their own kernels/distributions. Those people
> > monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
> > (despite the complaints of overheating by their users).
> > 
> > So I thought some warning may be in order, so that distro people more easily
> > notice they have misconfigured the kernel or sometging.
> 
> I mean, then there's nothing we can do to properly address that then.
> 
> The configuration system is a gun, we can point at the target, but
> anyone is definitely free to shot themself in the foot.
> 
> You would have exactly the same result if you left the thermal driver
> disabled, or if you didn't have cpufreq support.

Right. Though I hope there's some middle ground. I mean all of those dev_err
in error paths of many drivers are there mostly to help debugging stuff.

And even though I was part of this driver's development, it took me quite
some time to figure out it was the missing sunxi-sid driver causing the issue,
with complete silence from the driver.

Maybe this can/will be solved at another level entirely, like having a device
core report devices probes that failed with EPROBE_DEFER some time after
the boot finished and modules had a chance to load, instead of immediately
for each probe retry.

regards,
	o.

> Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-08 11:03 ` Russell King - ARM Linux admin
  2020-07-08 11:10   ` Ondřej Jirman
@ 2020-07-20  7:55   ` Icenowy Zheng
  2020-07-20  8:28     ` Russell King - ARM Linux admin
  1 sibling, 1 reply; 17+ messages in thread
From: Icenowy Zheng @ 2020-07-20  7:55 UTC (permalink / raw)
  To: Russell King - ARM Linux admin, Ondrej Jirman
  Cc: Amit Kucheria, open list:ALLWINNER THERMAL DRIVER, Yangtao Li,
	Daniel Lezcano, open list, Maxime Ripard, Vasily Khoruzhick,
	linux-sunxi, Zhang Rui, Chen-Yu Tsai,
	moderated list:ARM/Allwinner sunXi SoC support

在 2020-07-08星期三的 12:03 +0100,Russell King - ARM Linux admin写道:
> On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> > 
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> > 
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++---
> > ----
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/thermal/sun8i_thermal.c
> > b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct
> > ths_device *tmdev)
> >  
> >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> >  	if (IS_ERR(calcell)) {
> > +		dev_err(dev, "Failed to get calibration nvmem cell
> > (%ld)\n",
> > +			PTR_ERR(calcell));
> 
> Consider using:
> 
> 		dev_err(dev, "Failed to get calibration nvmem cell
> (%pe)\n",
> 			calcell);
> 
> which means the kernel can print the symbolic errno value.

Oh interesting format here.

When we need to deal with a int return value, is it "%e"?

Thanks

> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-20  7:55   ` Icenowy Zheng
@ 2020-07-20  8:28     ` Russell King - ARM Linux admin
  0 siblings, 0 replies; 17+ messages in thread
From: Russell King - ARM Linux admin @ 2020-07-20  8:28 UTC (permalink / raw)
  To: Icenowy Zheng
  Cc: Ondrej Jirman, Amit Kucheria, open list:ALLWINNER THERMAL DRIVER,
	Yangtao Li, Daniel Lezcano, open list, Maxime Ripard,
	Vasily Khoruzhick, linux-sunxi, Zhang Rui, Chen-Yu Tsai,
	moderated list:ARM/Allwinner sunXi SoC support

On Mon, Jul 20, 2020 at 03:55:26PM +0800, Icenowy Zheng wrote:
> 在 2020-07-08星期三的 12:03 +0100,Russell King - ARM Linux admin写道:
> > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > I noticed several mobile Linux distributions failing to enable the
> > > thermal regulation correctly, because the kernel is silent
> > > when thermal driver fails to probe. Add enough error reporting
> > > to debug issues and warn users in case thermal sensor is failing
> > > to probe.
> > > 
> > > Failing to notify users means, that SoC can easily overheat under
> > > load.
> > > 
> > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > ---
> > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++---
> > > ----
> > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/drivers/thermal/sun8i_thermal.c
> > > b/drivers/thermal/sun8i_thermal.c
> > > index 74d73be16496..9065e79ae743 100644
> > > --- a/drivers/thermal/sun8i_thermal.c
> > > +++ b/drivers/thermal/sun8i_thermal.c
> > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct
> > > ths_device *tmdev)
> > >  
> > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > >  	if (IS_ERR(calcell)) {
> > > +		dev_err(dev, "Failed to get calibration nvmem cell
> > > (%ld)\n",
> > > +			PTR_ERR(calcell));
> > 
> > Consider using:
> > 
> > 		dev_err(dev, "Failed to get calibration nvmem cell
> > (%pe)\n",
> > 			calcell);
> > 
> > which means the kernel can print the symbolic errno value.
> 
> Oh interesting format here.
> 
> When we need to deal with a int return value, is it "%e"?

No, because that will lose the ability for the compiler to check the
format string and arguments correspond.  All the extensions are
documented at Documentation/core-api/printk-formats.rst.

Use %pe and ERR_PTR(...) to print an integer -ve errno return value.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] thermal: sun8i: Be loud when probe fails
  2020-07-12 23:29           ` Ondřej Jirman
@ 2020-07-23 15:20             ` Maxime Ripard
  0 siblings, 0 replies; 17+ messages in thread
From: Maxime Ripard @ 2020-07-23 15:20 UTC (permalink / raw)
  To: Ondřej Jirman, linux-sunxi, Vasily Khoruzhick, Yangtao Li,
	Zhang Rui, Daniel Lezcano, Amit Kucheria, Chen-Yu Tsai,
	open list:ALLWINNER THERMAL DRIVER,
	moderated list:ARM/Allwinner sunXi SoC support, open list

On Mon, Jul 13, 2020 at 01:29:42AM +0200, Ondřej Jirman wrote:
> Hi Maxime,
> 
> On Wed, Jul 08, 2020 at 03:57:48PM +0200, Maxime Ripard wrote:
> > On Wed, Jul 08, 2020 at 03:44:41PM +0200, Ondřej Jirman wrote:
> > > >
> 
> [...]
> 
> > > > Yeah, but on the other hand, we regularly have people that come up and
> > > > ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> > > > wasn't there on the first attempt but was there on the second) is a
> > > > cause of concern or not.
> > > 
> > > That's why I also added a success message, to distinguish this case. 
> > 
> > That doesn't really help though. We have plenty of drivers that have
> > some sort of success message and people will still ask about that error
> > message earlier.
> > 
> > > > > And people run several distros for 3-4 months without anyone noticing any
> > > > > issues and that thermal regulation doesn't work. So it seems that lack of a
> > > > > success message is not enough.
> > > > 
> > > > I understand what the issue is, but do you really expect phone users to
> > > > monitor the kernel logs every time they boot their phone to see if the
> > > > thermal throttling is enabled?
> > > 
> > > Not phone users, but people making their own kernels/distributions. Those people
> > > monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
> > > (despite the complaints of overheating by their users).
> > > 
> > > So I thought some warning may be in order, so that distro people more easily
> > > notice they have misconfigured the kernel or sometging.
> > 
> > I mean, then there's nothing we can do to properly address that then.
> > 
> > The configuration system is a gun, we can point at the target, but
> > anyone is definitely free to shot themself in the foot.
> > 
> > You would have exactly the same result if you left the thermal driver
> > disabled, or if you didn't have cpufreq support.
> 
> Right. Though I hope there's some middle ground. I mean all of those dev_err
> in error paths of many drivers are there mostly to help debugging stuff.

Adding all the error messages you have in that patch seems like a good
middle ground to me, and we could definitely use more of them in some
other drivers (like the USB PHY)

> And even though I was part of this driver's development, it took me quite
> some time to figure out it was the missing sunxi-sid driver causing the issue,
> with complete silence from the driver.
> 
> Maybe this can/will be solved at another level entirely, like having a device
> core report devices probes that failed with EPROBE_DEFER some time after
> the boot finished and modules had a chance to load, instead of immediately
> for each probe retry.

The thing is that there's never a point in time where "all the modules
had a chance to load". If you're loading the modules on demand and have
an hotpluggable bus, that might happen after a second or after a year,
you can't say.

The actual fix for this would be to use the on demand probing that seems
to be in the works and avoid EPROBE_DEFER entirely, but that probably
won't happen in a near future.

Maxime

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-07-23 15:21 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-08 10:55 [PATCH] thermal: sun8i: Be loud when probe fails Ondrej Jirman
2020-07-08 11:03 ` Russell King - ARM Linux admin
2020-07-08 11:10   ` Ondřej Jirman
2020-07-20  7:55   ` Icenowy Zheng
2020-07-20  8:28     ` Russell King - ARM Linux admin
2020-07-08 11:55 ` Frank Lee
2020-07-08 13:21   ` Ondřej Jirman
2020-07-08 13:42     ` Robin Murphy
2020-07-08 13:33   ` Ondřej Jirman
2020-07-08 12:25 ` Maxime Ripard
2020-07-08 13:29   ` Ondřej Jirman
2020-07-08 13:36     ` Maxime Ripard
2020-07-08 13:44       ` Ondřej Jirman
2020-07-08 13:57         ` Maxime Ripard
2020-07-12 23:29           ` Ondřej Jirman
2020-07-23 15:20             ` Maxime Ripard
2020-07-08 13:29 ` Maxime Ripard

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).