All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm"
@ 2017-10-03 19:14 Eddie James
  2017-10-04  4:38 ` Joel Stanley
  0 siblings, 1 reply; 5+ messages in thread
From: Eddie James @ 2017-10-03 19:14 UTC (permalink / raw)
  To: openbmc; +Cc: joel, Edward A. James

From: "Edward A. James" <eajames@us.ibm.com>

This reverts commit e55423ee10a5057338d24383c00e813436a126ea.

Apologies for pushing this up so early... Userspace applications aren't
ready for this change. The hwmon polling application cannot accept EGAIN
yet, and we can't be returning apparent errors if the sensor is
temporarily unavailable.

Signed-off-by: Eddie James <eajames@us.ibm.com>
---
 drivers/hwmon/occ/common.c | 38 +++-----------------------------------
 1 file changed, 3 insertions(+), 35 deletions(-)

diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c
index 8ffb556..34002fb 100644
--- a/drivers/hwmon/occ/common.c
+++ b/drivers/hwmon/occ/common.c
@@ -19,10 +19,6 @@
 #define OCC_EXT_STAT_MEM_THROTTLE	0x20
 #define OCC_EXT_STAT_QUICK_DROP		0x10
 
-#define OCC_TEMP_SENSOR_FAULT		0xFF
-
-#define OCC_FRU_TYPE_VRM		3
-
 atomic_t occ_num_occs = ATOMIC_INIT(0);
 
 struct temp_sensor_1 {
@@ -381,23 +377,11 @@ static ssize_t occ_show_temp_2(struct device *dev,
 		val = get_unaligned_be32(&temp->sensor_id);
 		break;
 	case 1:
-		val = temp->value;
-		if (val == OCC_TEMP_SENSOR_FAULT)
-			return -EREMOTEIO;
-
-		if (temp->fru_type != OCC_FRU_TYPE_VRM) {
-			if (val == 0)
-				return -EAGAIN;
-
-			val *= 1000;
-		}
+		val = temp->value * 1000;
 		break;
 	case 2:
 		val = temp->fru_type;
 		break;
-	case 3:
-		val = temp->value == OCC_TEMP_SENSOR_FAULT;
-		break;
 	}
 
 	return snprintf(buf, PAGE_SIZE - 1, "%u\n", val);
@@ -796,7 +780,6 @@ static ssize_t occ_show_extended(struct device *dev,
 int occ_setup_sensor_attrs(struct occ *occ)
 {
 	unsigned int i, s;
-	struct temp_sensor_2 *temp;
 	struct device *dev = occ->bus_dev;
 	struct occ_sensors *sensors = &occ->sensors;
 	struct occ_attribute *attr;
@@ -816,7 +799,7 @@ int occ_setup_sensor_attrs(struct occ *occ)
 		occ->num_attrs += (sensors->temp.num_sensors * 2);
 		break;
 	case 2:
-		occ->num_attrs += (sensors->temp.num_sensors * 4);
+		occ->num_attrs += (sensors->temp.num_sensors * 3);
 		show_temp = occ_show_temp_2;
 		break;
 	default:
@@ -888,22 +871,13 @@ int occ_setup_sensor_attrs(struct occ *occ)
 
 	for (i = 0; i < sensors->temp.num_sensors; ++i) {
 		s = i + 1;
-		temp = ((struct temp_sensor_2 *)sensors->temp.data) + i;
 
 		snprintf(attr->name, sizeof(attr->name), "temp%d_label", s);
 		attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL,
 					     0, i);
 		attr++;
 
-		if (sensors->temp.version > 1 &&
-		    temp->fru_type == OCC_FRU_TYPE_VRM) {
-			snprintf(attr->name, sizeof(attr->name), "temp%d_alarm",
-				 s);
-		} else {
-			snprintf(attr->name, sizeof(attr->name), "temp%d_input",
-				 s);
-		}
-
+		snprintf(attr->name, sizeof(attr->name), "temp%d_input", s);
 		attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL,
 					     1, i);
 		attr++;
@@ -914,12 +888,6 @@ int occ_setup_sensor_attrs(struct occ *occ)
 			attr->sensor = OCC_INIT_ATTR(attr->name, 0444,
 						     show_temp, NULL, 2, i);
 			attr++;
-
-			snprintf(attr->name, sizeof(attr->name), "temp%d_fault",
-				 s);
-			attr->sensor = OCC_INIT_ATTR(attr->name, 0444,
-						     show_temp, NULL, 3, i);
-			attr++;
 		}
 	}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm"
  2017-10-03 19:14 [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm" Eddie James
@ 2017-10-04  4:38 ` Joel Stanley
  2017-10-04 15:04   ` Eddie James
  0 siblings, 1 reply; 5+ messages in thread
From: Joel Stanley @ 2017-10-04  4:38 UTC (permalink / raw)
  To: Eddie James, Brad Bishop; +Cc: OpenBMC Maillist, Edward A. James

On Wed, Oct 4, 2017 at 4:44 AM, Eddie James <eajames@linux.vnet.ibm.com> wrote:
> From: "Edward A. James" <eajames@us.ibm.com>
>
> This reverts commit e55423ee10a5057338d24383c00e813436a126ea.
>
> Apologies for pushing this up so early... Userspace applications aren't
> ready for this change. The hwmon polling application cannot accept EGAIN
> yet, and we can't be returning apparent errors if the sensor is
> temporarily unavailable.

Can you go into some more detail here please.

It looks like these changes are within the hwmon ABI, so userspace
that supports hwmon devices shouldn't break.

Cheers,

Joel

>
> Signed-off-by: Eddie James <eajames@us.ibm.com>
> ---
>  drivers/hwmon/occ/common.c | 38 +++-----------------------------------
>  1 file changed, 3 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c
> index 8ffb556..34002fb 100644
> --- a/drivers/hwmon/occ/common.c
> +++ b/drivers/hwmon/occ/common.c
> @@ -19,10 +19,6 @@
>  #define OCC_EXT_STAT_MEM_THROTTLE      0x20
>  #define OCC_EXT_STAT_QUICK_DROP                0x10
>
> -#define OCC_TEMP_SENSOR_FAULT          0xFF
> -
> -#define OCC_FRU_TYPE_VRM               3
> -
>  atomic_t occ_num_occs = ATOMIC_INIT(0);
>
>  struct temp_sensor_1 {
> @@ -381,23 +377,11 @@ static ssize_t occ_show_temp_2(struct device *dev,
>                 val = get_unaligned_be32(&temp->sensor_id);
>                 break;
>         case 1:
> -               val = temp->value;
> -               if (val == OCC_TEMP_SENSOR_FAULT)
> -                       return -EREMOTEIO;
> -
> -               if (temp->fru_type != OCC_FRU_TYPE_VRM) {
> -                       if (val == 0)
> -                               return -EAGAIN;
> -
> -                       val *= 1000;
> -               }
> +               val = temp->value * 1000;
>                 break;
>         case 2:
>                 val = temp->fru_type;
>                 break;
> -       case 3:
> -               val = temp->value == OCC_TEMP_SENSOR_FAULT;
> -               break;
>         }
>
>         return snprintf(buf, PAGE_SIZE - 1, "%u\n", val);
> @@ -796,7 +780,6 @@ static ssize_t occ_show_extended(struct device *dev,
>  int occ_setup_sensor_attrs(struct occ *occ)
>  {
>         unsigned int i, s;
> -       struct temp_sensor_2 *temp;
>         struct device *dev = occ->bus_dev;
>         struct occ_sensors *sensors = &occ->sensors;
>         struct occ_attribute *attr;
> @@ -816,7 +799,7 @@ int occ_setup_sensor_attrs(struct occ *occ)
>                 occ->num_attrs += (sensors->temp.num_sensors * 2);
>                 break;
>         case 2:
> -               occ->num_attrs += (sensors->temp.num_sensors * 4);
> +               occ->num_attrs += (sensors->temp.num_sensors * 3);
>                 show_temp = occ_show_temp_2;
>                 break;
>         default:
> @@ -888,22 +871,13 @@ int occ_setup_sensor_attrs(struct occ *occ)
>
>         for (i = 0; i < sensors->temp.num_sensors; ++i) {
>                 s = i + 1;
> -               temp = ((struct temp_sensor_2 *)sensors->temp.data) + i;
>
>                 snprintf(attr->name, sizeof(attr->name), "temp%d_label", s);
>                 attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL,
>                                              0, i);
>                 attr++;
>
> -               if (sensors->temp.version > 1 &&
> -                   temp->fru_type == OCC_FRU_TYPE_VRM) {
> -                       snprintf(attr->name, sizeof(attr->name), "temp%d_alarm",
> -                                s);
> -               } else {
> -                       snprintf(attr->name, sizeof(attr->name), "temp%d_input",
> -                                s);
> -               }
> -
> +               snprintf(attr->name, sizeof(attr->name), "temp%d_input", s);
>                 attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL,
>                                              1, i);
>                 attr++;
> @@ -914,12 +888,6 @@ int occ_setup_sensor_attrs(struct occ *occ)
>                         attr->sensor = OCC_INIT_ATTR(attr->name, 0444,
>                                                      show_temp, NULL, 2, i);
>                         attr++;
> -
> -                       snprintf(attr->name, sizeof(attr->name), "temp%d_fault",
> -                                s);
> -                       attr->sensor = OCC_INIT_ATTR(attr->name, 0444,
> -                                                    show_temp, NULL, 3, i);
> -                       attr++;
>                 }
>         }
>
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm"
  2017-10-04  4:38 ` Joel Stanley
@ 2017-10-04 15:04   ` Eddie James
  2017-10-05  2:01     ` Brad Bishop
  0 siblings, 1 reply; 5+ messages in thread
From: Eddie James @ 2017-10-04 15:04 UTC (permalink / raw)
  To: Joel Stanley, Brad Bishop; +Cc: Edward A. James, OpenBMC Maillist



On 10/03/2017 11:38 PM, Joel Stanley wrote:
> On Wed, Oct 4, 2017 at 4:44 AM, Eddie James <eajames@linux.vnet.ibm.com> wrote:
>> From: "Edward A. James" <eajames@us.ibm.com>
>>
>> This reverts commit e55423ee10a5057338d24383c00e813436a126ea.
>>
>> Apologies for pushing this up so early... Userspace applications aren't
>> ready for this change. The hwmon polling application cannot accept EGAIN
>> yet, and we can't be returning apparent errors if the sensor is
>> temporarily unavailable.
> Can you go into some more detail here please.
>
> It looks like these changes are within the hwmon ABI, so userspace
> that supports hwmon devices shouldn't break.

Sure. With this change, the driver returns -EAGAIN when reading 
temperature sensors that show 0 temperature. Temperature 0 is how the 
OCC indicates that the sensor is not available at the moment. So, we 
should retry in a bit. But the hwmon application currently sees any 
negative errno as an error, and will not retry on -EAGAIN. So now we get 
errors when we don't want them (for now), basically.

Thanks,
Eddie

>
> Cheers,
>
> Joel
>
>> Signed-off-by: Eddie James <eajames@us.ibm.com>
>> ---
>>   drivers/hwmon/occ/common.c | 38 +++-----------------------------------
>>   1 file changed, 3 insertions(+), 35 deletions(-)
>>
>> diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c
>> index 8ffb556..34002fb 100644
>> --- a/drivers/hwmon/occ/common.c
>> +++ b/drivers/hwmon/occ/common.c
>> @@ -19,10 +19,6 @@
>>   #define OCC_EXT_STAT_MEM_THROTTLE      0x20
>>   #define OCC_EXT_STAT_QUICK_DROP                0x10
>>
>> -#define OCC_TEMP_SENSOR_FAULT          0xFF
>> -
>> -#define OCC_FRU_TYPE_VRM               3
>> -
>>   atomic_t occ_num_occs = ATOMIC_INIT(0);
>>
>>   struct temp_sensor_1 {
>> @@ -381,23 +377,11 @@ static ssize_t occ_show_temp_2(struct device *dev,
>>                  val = get_unaligned_be32(&temp->sensor_id);
>>                  break;
>>          case 1:
>> -               val = temp->value;
>> -               if (val == OCC_TEMP_SENSOR_FAULT)
>> -                       return -EREMOTEIO;
>> -
>> -               if (temp->fru_type != OCC_FRU_TYPE_VRM) {
>> -                       if (val == 0)
>> -                               return -EAGAIN;
>> -
>> -                       val *= 1000;
>> -               }
>> +               val = temp->value * 1000;
>>                  break;
>>          case 2:
>>                  val = temp->fru_type;
>>                  break;
>> -       case 3:
>> -               val = temp->value == OCC_TEMP_SENSOR_FAULT;
>> -               break;
>>          }
>>
>>          return snprintf(buf, PAGE_SIZE - 1, "%u\n", val);
>> @@ -796,7 +780,6 @@ static ssize_t occ_show_extended(struct device *dev,
>>   int occ_setup_sensor_attrs(struct occ *occ)
>>   {
>>          unsigned int i, s;
>> -       struct temp_sensor_2 *temp;
>>          struct device *dev = occ->bus_dev;
>>          struct occ_sensors *sensors = &occ->sensors;
>>          struct occ_attribute *attr;
>> @@ -816,7 +799,7 @@ int occ_setup_sensor_attrs(struct occ *occ)
>>                  occ->num_attrs += (sensors->temp.num_sensors * 2);
>>                  break;
>>          case 2:
>> -               occ->num_attrs += (sensors->temp.num_sensors * 4);
>> +               occ->num_attrs += (sensors->temp.num_sensors * 3);
>>                  show_temp = occ_show_temp_2;
>>                  break;
>>          default:
>> @@ -888,22 +871,13 @@ int occ_setup_sensor_attrs(struct occ *occ)
>>
>>          for (i = 0; i < sensors->temp.num_sensors; ++i) {
>>                  s = i + 1;
>> -               temp = ((struct temp_sensor_2 *)sensors->temp.data) + i;
>>
>>                  snprintf(attr->name, sizeof(attr->name), "temp%d_label", s);
>>                  attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL,
>>                                               0, i);
>>                  attr++;
>>
>> -               if (sensors->temp.version > 1 &&
>> -                   temp->fru_type == OCC_FRU_TYPE_VRM) {
>> -                       snprintf(attr->name, sizeof(attr->name), "temp%d_alarm",
>> -                                s);
>> -               } else {
>> -                       snprintf(attr->name, sizeof(attr->name), "temp%d_input",
>> -                                s);
>> -               }
>> -
>> +               snprintf(attr->name, sizeof(attr->name), "temp%d_input", s);
>>                  attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL,
>>                                               1, i);
>>                  attr++;
>> @@ -914,12 +888,6 @@ int occ_setup_sensor_attrs(struct occ *occ)
>>                          attr->sensor = OCC_INIT_ATTR(attr->name, 0444,
>>                                                       show_temp, NULL, 2, i);
>>                          attr++;
>> -
>> -                       snprintf(attr->name, sizeof(attr->name), "temp%d_fault",
>> -                                s);
>> -                       attr->sensor = OCC_INIT_ATTR(attr->name, 0444,
>> -                                                    show_temp, NULL, 3, i);
>> -                       attr++;
>>                  }
>>          }
>>
>> --
>> 1.8.3.1
>>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm"
  2017-10-04 15:04   ` Eddie James
@ 2017-10-05  2:01     ` Brad Bishop
  2017-10-05  3:53       ` Joel Stanley
  0 siblings, 1 reply; 5+ messages in thread
From: Brad Bishop @ 2017-10-05  2:01 UTC (permalink / raw)
  To: Eddie James; +Cc: Joel Stanley, Edward A. James, OpenBMC Maillist


> On Oct 4, 2017, at 11:04 AM, Eddie James <eajames@linux.vnet.ibm.com> wrote:
> 
> 
> 
> On 10/03/2017 11:38 PM, Joel Stanley wrote:
>> On Wed, Oct 4, 2017 at 4:44 AM, Eddie James <eajames@linux.vnet.ibm.com> wrote:
>>> From: "Edward A. James" <eajames@us.ibm.com>
>>> 
>>> This reverts commit e55423ee10a5057338d24383c00e813436a126ea.
>>> 
>>> Apologies for pushing this up so early... Userspace applications aren't
>>> ready for this change. The hwmon polling application cannot accept EGAIN
>>> yet, and we can't be returning apparent errors if the sensor is
>>> temporarily unavailable.
>> Can you go into some more detail here please.
>> 
>> It looks like these changes are within the hwmon ABI, so userspace
>> that supports hwmon devices shouldn't break.
> 
> Sure. With this change, the driver returns -EAGAIN when reading temperature sensors that show 0 temperature. Temperature 0 is how the OCC indicates that the sensor is not available at the moment. So, we should retry in a bit. But the hwmon application currently sees any negative errno as an error,

Actually the userspace retries for a configurable period on eio, etimedout, 
ebadmsg, eagain, and exnio.  It also exits cleanly on enoent.

If any of these errnos don’t go away after the configured number of retry
attempts, an error is logged.  Any other errnos and an error is logged
immediately. If this list needs improvement I’d love to hear about it.

ebadmsg and enxio are observed when i2c devices are unplugged with transfers
in various stages of flight.  They occur just before the driver is unbound
after the presence gpio toggle on these devices is noticed and processed
(killing the hwmon userspace daemon cleanly).

eio, etimedout seem to be bus type errors that appear somewhat infrequently,
but occur nevertheless.

we all know what eagain means…

It isn’t so much that the hwmon userspace doesn’t support this change,
its just the resulting behavior difference at the other end of the
hwmon abi <-> dbus api translation is not optimal, right now.

1 - Without this change, the hwmon userspace reads a value of 0 out of
the sysfs attribute and happily reports that as the temp at the DBus level.

With this change, the hwmon userspace would read the attribute, get an
error and retry for a bit.  After a number of retries, the error is
logged and the sensor is marked faulted at a dbus level.

The issue is the consumer of the sensor at the dbus level.  Today, the
dbus consumer happens to translate the sensor value of zero into the
desired behavior.  If we submit this change the sensor will be put
into faulted state and the consuming application doesn’t have support
for that yet.

We have open issues to enhance the application to support faulted sensors,
its just not implemented yet.

I hope this helps a decision to be made.

-brad


> and will not retry on -EAGAIN. So now we get errors when we don't want them (for now), basically.
> 
> Thanks,
> Eddie

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm"
  2017-10-05  2:01     ` Brad Bishop
@ 2017-10-05  3:53       ` Joel Stanley
  0 siblings, 0 replies; 5+ messages in thread
From: Joel Stanley @ 2017-10-05  3:53 UTC (permalink / raw)
  To: Brad Bishop; +Cc: Eddie James, Edward A. James, OpenBMC Maillist

On Thu, Oct 5, 2017 at 11:31 AM, Brad Bishop
<bradleyb@fuzziesquirrel.com> wrote:
>
>> On Oct 4, 2017, at 11:04 AM, Eddie James <eajames@linux.vnet.ibm.com> wrote:
>>
>>
>>
>> On 10/03/2017 11:38 PM, Joel Stanley wrote:
>>> On Wed, Oct 4, 2017 at 4:44 AM, Eddie James <eajames@linux.vnet.ibm.com> wrote:
>>>> From: "Edward A. James" <eajames@us.ibm.com>
>>>>
>>>> This reverts commit e55423ee10a5057338d24383c00e813436a126ea.
>>>>
>>>> Apologies for pushing this up so early... Userspace applications aren't
>>>> ready for this change. The hwmon polling application cannot accept EGAIN
>>>> yet, and we can't be returning apparent errors if the sensor is
>>>> temporarily unavailable.
>>> Can you go into some more detail here please.
>>>
>>> It looks like these changes are within the hwmon ABI, so userspace
>>> that supports hwmon devices shouldn't break.
>>
>> Sure. With this change, the driver returns -EAGAIN when reading temperature sensors that show 0 temperature. Temperature 0 is how the OCC indicates that the sensor is not available at the moment. So, we should retry in a bit. But the hwmon application currently sees any negative errno as an error,
>
> Actually the userspace retries for a configurable period on eio, etimedout,
> ebadmsg, eagain, and exnio.  It also exits cleanly on enoent.
>
> If any of these errnos don’t go away after the configured number of retry
> attempts, an error is logged.  Any other errnos and an error is logged
> immediately. If this list needs improvement I’d love to hear about it.
>
> ebadmsg and enxio are observed when i2c devices are unplugged with transfers
> in various stages of flight.  They occur just before the driver is unbound
> after the presence gpio toggle on these devices is noticed and processed
> (killing the hwmon userspace daemon cleanly).
>
> eio, etimedout seem to be bus type errors that appear somewhat infrequently,
> but occur nevertheless.
>
> we all know what eagain means…
>
> It isn’t so much that the hwmon userspace doesn’t support this change,
> its just the resulting behavior difference at the other end of the
> hwmon abi <-> dbus api translation is not optimal, right now.
>
> 1 - Without this change, the hwmon userspace reads a value of 0 out of
> the sysfs attribute and happily reports that as the temp at the DBus level.
>
> With this change, the hwmon userspace would read the attribute, get an
> error and retry for a bit.  After a number of retries, the error is
> logged and the sensor is marked faulted at a dbus level.
>
> The issue is the consumer of the sensor at the dbus level.  Today, the
> dbus consumer happens to translate the sensor value of zero into the
> desired behavior.  If we submit this change the sensor will be put
> into faulted state and the consuming application doesn’t have support
> for that yet.
>
> We have open issues to enhance the application to support faulted sensors,
> its just not implemented yet.
>
> I hope this helps a decision to be made.

Thanks for the background. In general we want to write and test our
userspace such that it can cope with all of the return codes that the
kernel may throw at it. I look forward to helping us in this area in
the future.

We will take this patch as a once-off to keep the OCC train moving.
I've added your notes into the commit message.

Applied to dev-4.10.

Cheers,

Joel

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-10-05  3:53 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-03 19:14 [PATCH linux dev-4.10] Revert "drivers/hwmon/occ: Add temperature fault attribute and VRM temp alarm" Eddie James
2017-10-04  4:38 ` Joel Stanley
2017-10-04 15:04   ` Eddie James
2017-10-05  2:01     ` Brad Bishop
2017-10-05  3:53       ` Joel Stanley

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.