All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC net-next v2 1/3] devlink: move health state to uAPI
@ 2021-03-11  3:26 Jakub Kicinski
  2021-03-11  3:26 ` [RFC net-next v2 2/3] devlink: health: add remediation type Jakub Kicinski
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-11  3:26 UTC (permalink / raw)
  To: netdev
  Cc: jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski

Move the health states into uAPI, so applications can use them.

Note that we need to change the name of the enum because
user space is likely already defining the same values.
E.g. iproute2 does.

Use this opportunity to shorten the names.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/broadcom/bnxt/bnxt_devlink.c  |  4 ++--
 .../ethernet/mellanox/mlx5/core/en/health.c    |  4 ++--
 include/net/devlink.h                          |  7 +------
 include/uapi/linux/devlink.h                   | 12 ++++++++++++
 net/core/devlink.c                             | 18 +++++++++---------
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 64381be935a8..cafc98ab4b5e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -252,9 +252,9 @@ void bnxt_dl_health_status_update(struct bnxt *bp, bool healthy)
 	u8 state;
 
 	if (healthy)
-		state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
+		state = DL_HEALTH_STATE_HEALTHY;
 	else
-		state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+		state = DL_HEALTH_STATE_ERROR;
 
 	if (health->fatal)
 		devlink_health_reporter_state_update(health->fw_fatal_reporter,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c
index 84e501e057b4..c526e31e562c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c
@@ -151,10 +151,10 @@ void mlx5e_health_channels_update(struct mlx5e_priv *priv)
 {
 	if (priv->tx_reporter)
 		devlink_health_reporter_state_update(priv->tx_reporter,
-						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
+						     DL_HEALTH_STATE_HEALTHY);
 	if (priv->rx_reporter)
 		devlink_health_reporter_state_update(priv->rx_reporter,
-						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
+						     DL_HEALTH_STATE_HEALTHY);
 }
 
 int mlx5e_health_sq_to_ready(struct mlx5_core_dev *mdev, struct net_device *dev, u32 sqn)
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 853420db5d32..b424328af658 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -656,11 +656,6 @@ struct devlink_port_region_ops {
 struct devlink_fmsg;
 struct devlink_health_reporter;
 
-enum devlink_health_reporter_state {
-	DEVLINK_HEALTH_REPORTER_STATE_HEALTHY,
-	DEVLINK_HEALTH_REPORTER_STATE_ERROR,
-};
-
 /**
  * struct devlink_health_reporter_ops - Reporter operations
  * @name: reporter name
@@ -1675,7 +1670,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 			  const char *msg, void *priv_ctx);
 void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
-				     enum devlink_health_reporter_state state);
+				     enum devlink_health_state state);
 void
 devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter);
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index f6008b2fa60f..41a6ea3b2256 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -608,4 +608,16 @@ enum devlink_port_fn_opstate {
 	DEVLINK_PORT_FN_OPSTATE_ATTACHED,
 };
 
+/**
+ * enum devlink_health_state - indicates the state of a health reporter
+ * @DL_HEALTH_STATE_HEALTHY: fully operational, working state
+ * @DL_HEALTH_STATE_ERROR: error state, running health reporter's recovery
+ *			may fix the issue, otherwise user needs to try
+ *			power cycling or other forms of reset
+ */
+enum devlink_health_state {
+	DL_HEALTH_STATE_HEALTHY,
+	DL_HEALTH_STATE_ERROR,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 737b61c2976e..8e4e4bd7bb36 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -6346,7 +6346,7 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
 {
 	int err;
 
-	if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+	if (reporter->health_state == DL_HEALTH_STATE_HEALTHY)
 		return 0;
 
 	if (!reporter->ops->recover)
@@ -6357,7 +6357,7 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
 		return err;
 
 	devlink_health_reporter_recovery_done(reporter);
-	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
+	reporter->health_state = DL_HEALTH_STATE_HEALTHY;
 	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
 
 	return 0;
@@ -6416,7 +6416,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
 int devlink_health_report(struct devlink_health_reporter *reporter,
 			  const char *msg, void *priv_ctx)
 {
-	enum devlink_health_reporter_state prev_health_state;
+	enum devlink_health_state prev_health_state;
 	struct devlink *devlink = reporter->devlink;
 	unsigned long recover_ts_threshold;
 
@@ -6425,14 +6425,14 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 	trace_devlink_health_report(devlink, reporter->ops->name, msg);
 	reporter->error_count++;
 	prev_health_state = reporter->health_state;
-	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+	reporter->health_state = DL_HEALTH_STATE_ERROR;
 	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
 
 	/* abort if the previous error wasn't recovered */
 	recover_ts_threshold = reporter->last_recovery_ts +
 			       msecs_to_jiffies(reporter->graceful_period);
 	if (reporter->auto_recover &&
-	    (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
+	    (prev_health_state != DL_HEALTH_STATE_HEALTHY ||
 	     (reporter->last_recovery_ts && reporter->recovery_count &&
 	      time_is_after_jiffies(recover_ts_threshold)))) {
 		trace_devlink_health_recover_aborted(devlink,
@@ -6443,7 +6443,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 		return -ECANCELED;
 	}
 
-	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+	reporter->health_state = DL_HEALTH_STATE_ERROR;
 
 	if (reporter->auto_dump) {
 		mutex_lock(&reporter->dump_lock);
@@ -6520,10 +6520,10 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
 
 void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
-				     enum devlink_health_reporter_state state)
+				     enum devlink_health_state state)
 {
-	if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY &&
-		    state != DEVLINK_HEALTH_REPORTER_STATE_ERROR))
+	if (WARN_ON(state != DL_HEALTH_STATE_HEALTHY &&
+		    state != DL_HEALTH_STATE_ERROR))
 		return;
 
 	if (reporter->health_state == state)
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC net-next v2 2/3] devlink: health: add remediation type
  2021-03-11  3:26 [RFC net-next v2 1/3] devlink: move health state to uAPI Jakub Kicinski
@ 2021-03-11  3:26 ` Jakub Kicinski
  2021-03-11  7:48   ` Jiri Pirko
  2021-03-11 14:32   ` Eran Ben Elisha
  2021-03-11  3:26 ` [RFC net-next v2 3/3] devlink: add more failure modes Jakub Kicinski
  2021-03-11  7:47 ` [RFC net-next v2 1/3] devlink: move health state to uAPI Jiri Pirko
  2 siblings, 2 replies; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-11  3:26 UTC (permalink / raw)
  To: netdev
  Cc: jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski

Currently devlink health does not give the user any clear information
about what kind of remediation the ->recover callback will perform.
This makes it difficult to understand the impact of enabling
auto-remediation, and the severity of the error itself.

To allow users to make a more informed decision, add a new remediation
type attribute.

Note that we only allow one remediation type per reporter; this
is intentional. devlink health is not built for mixing issues
of different severity into one reporter, since it only maintains
one dump (of the first event) and a single error counter.
Nudging vendors towards categorizing issues beyond coarse
groups is an added bonus.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h        |  2 ++
 include/uapi/linux/devlink.h | 25 +++++++++++++++++++++++++
 net/core/devlink.c           |  7 ++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b424328af658..72b37769761f 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -659,6 +659,7 @@ struct devlink_health_reporter;
 /**
  * struct devlink_health_reporter_ops - Reporter operations
  * @name: reporter name
+ * @remedy: severity of the remediation required
  * @recover: callback to recover from reported error
  *           if priv_ctx is NULL, run a full recover
  * @dump: callback to dump an object
@@ -669,6 +670,7 @@ struct devlink_health_reporter;
 
 struct devlink_health_reporter_ops {
 	char *name;
+	enum devlink_health_remedy remedy;
 	int (*recover)(struct devlink_health_reporter *reporter,
 		       void *priv_ctx, struct netlink_ext_ack *extack);
 	int (*dump)(struct devlink_health_reporter *reporter,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 41a6ea3b2256..8cd1508b525b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -534,6 +534,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_RELOAD_ACTION_STATS,       /* nested */
 
 	DEVLINK_ATTR_PORT_PCI_SF_NUMBER,	/* u32 */
+
+	DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,	/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
@@ -620,4 +623,26 @@ enum devlink_health_state {
 	DL_HEALTH_STATE_ERROR,
 };
 
+/**
+ * enum devlink_health_remedy - severity of remediation procedure
+ * @DL_HEALTH_REMEDY_NONE: transient error, no remediation required
+ * @DL_HEALTH_REMEDY_KICK: device stalled, processing will be re-triggered
+ * @DL_HEALTH_REMEDY_COMP_RESET: associated device component (e.g. device queue)
+ *			will be reset
+ * @DL_HEALTH_REMEDY_RESET: full device reset, will result in temporary
+ *			unavailability of the device, device configuration
+ *			should not be lost
+ * @DL_HEALTH_REMEDY_REINIT: device will be reinitialized and configuration lost
+ *
+ * Used in %DEVLINK_ATTR_HEALTH_REPORTER_REMEDY, categorizes the health reporter
+ * by the severity of the remediation.
+ */
+enum devlink_health_remedy {
+	DL_HEALTH_REMEDY_NONE = 1,
+	DL_HEALTH_REMEDY_KICK,
+	DL_HEALTH_REMEDY_COMP_RESET,
+	DL_HEALTH_REMEDY_RESET,
+	DL_HEALTH_REMEDY_REINIT,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8e4e4bd7bb36..09d77d43ff63 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -6095,7 +6095,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
 {
 	struct devlink_health_reporter *reporter;
 
-	if (WARN_ON(graceful_period && !ops->recover))
+	if (WARN_ON(graceful_period && !ops->recover) ||
+	    WARN_ON(ops->recover && !ops->remedy))
 		return ERR_PTR(-EINVAL);
 
 	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
@@ -6265,6 +6266,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
 	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
 			   reporter->ops->name))
 		goto reporter_nest_cancel;
+	if (reporter->ops->remedy &&
+	    nla_put_u32(msg, DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,
+			reporter->ops->remedy))
+		goto reporter_nest_cancel;
 	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
 		       reporter->health_state))
 		goto reporter_nest_cancel;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC net-next v2 3/3] devlink: add more failure modes
  2021-03-11  3:26 [RFC net-next v2 1/3] devlink: move health state to uAPI Jakub Kicinski
  2021-03-11  3:26 ` [RFC net-next v2 2/3] devlink: health: add remediation type Jakub Kicinski
@ 2021-03-11  3:26 ` Jakub Kicinski
  2021-03-11 14:23   ` Eran Ben Elisha
  2021-03-11  7:47 ` [RFC net-next v2 1/3] devlink: move health state to uAPI Jiri Pirko
  2 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-11  3:26 UTC (permalink / raw)
  To: netdev
  Cc: jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski

>> Pending vendors adding the right reporters. <<

Extend the applicability of devlink health reporters
beyond what can be locally remedied. Add failure modes
which require re-flashing the NVM image or HW changes.

The expectation is that the driver will call
devlink_health_reporter_state_update() to put hardware
health reporters into a bad state.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/devlink.h | 7 +++++++
 net/core/devlink.c           | 3 +--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 8cd1508b525b..f623bbc63489 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -617,10 +617,17 @@ enum devlink_port_fn_opstate {
  * @DL_HEALTH_STATE_ERROR: error state, running health reporter's recovery
  *			may fix the issue, otherwise user needs to try
  *			power cycling or other forms of reset
+ * @DL_HEALTH_STATE_BAD_IMAGE: device's non-volatile memory needs
+ *			to be re-written, usually due to block corruption
+ * @DL_HEALTH_STATE_BAD_HW: hardware errors detected, device, host
+ *			or the connection between the two may be at fault
  */
 enum devlink_health_state {
 	DL_HEALTH_STATE_HEALTHY,
 	DL_HEALTH_STATE_ERROR,
+
+	DL_HEALTH_STATE_BAD_IMAGE,
+	DL_HEALTH_STATE_BAD_HW,
 };
 
 /**
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 09d77d43ff63..4a9fa6288a4a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -6527,8 +6527,7 @@ void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 				     enum devlink_health_state state)
 {
-	if (WARN_ON(state != DL_HEALTH_STATE_HEALTHY &&
-		    state != DL_HEALTH_STATE_ERROR))
+	if (WARN_ON(state > DL_HEALTH_STATE_BAD_HW))
 		return;
 
 	if (reporter->health_state == state)
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 1/3] devlink: move health state to uAPI
  2021-03-11  3:26 [RFC net-next v2 1/3] devlink: move health state to uAPI Jakub Kicinski
  2021-03-11  3:26 ` [RFC net-next v2 2/3] devlink: health: add remediation type Jakub Kicinski
  2021-03-11  3:26 ` [RFC net-next v2 3/3] devlink: add more failure modes Jakub Kicinski
@ 2021-03-11  7:47 ` Jiri Pirko
  2021-03-11 16:46   ` Jakub Kicinski
  2 siblings, 1 reply; 13+ messages in thread
From: Jiri Pirko @ 2021-03-11  7:47 UTC (permalink / raw)
  To: f242ed68-d31b-527d-562f-c5a35123861a
  Cc: netdev, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski

Thu, Mar 11, 2021 at 04:26:11AM CET, kuba@kernel.org wrote:
>Move the health states into uAPI, so applications can use them.
>
>Note that we need to change the name of the enum because
>user space is likely already defining the same values.
>E.g. iproute2 does.
>
>Use this opportunity to shorten the names.
>
>Signed-off-by: Jakub Kicinski <kuba@kernel.org>
>---
> .../net/ethernet/broadcom/bnxt/bnxt_devlink.c  |  4 ++--
> .../ethernet/mellanox/mlx5/core/en/health.c    |  4 ++--
> include/net/devlink.h                          |  7 +------
> include/uapi/linux/devlink.h                   | 12 ++++++++++++
> net/core/devlink.c                             | 18 +++++++++---------
> 5 files changed, 26 insertions(+), 19 deletions(-)
>
>diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
>index 64381be935a8..cafc98ab4b5e 100644
>--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
>+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
>@@ -252,9 +252,9 @@ void bnxt_dl_health_status_update(struct bnxt *bp, bool healthy)
> 	u8 state;
> 
> 	if (healthy)
>-		state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
>+		state = DL_HEALTH_STATE_HEALTHY;
> 	else
>-		state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
>+		state = DL_HEALTH_STATE_ERROR;

I don't like the inconsistencies in the uapi (DL/DEVLINK). Can't we
stick with "DEVLINK" prefix for all, which is what we got so far?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 2/3] devlink: health: add remediation type
  2021-03-11  3:26 ` [RFC net-next v2 2/3] devlink: health: add remediation type Jakub Kicinski
@ 2021-03-11  7:48   ` Jiri Pirko
  2021-03-11 14:32   ` Eran Ben Elisha
  1 sibling, 0 replies; 13+ messages in thread
From: Jiri Pirko @ 2021-03-11  7:48 UTC (permalink / raw)
  To: f242ed68-d31b-527d-562f-c5a35123861a
  Cc: netdev, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski

Thu, Mar 11, 2021 at 04:26:12AM CET, kuba@kernel.org wrote:
>Currently devlink health does not give user any clear information
>of what kind of remediation ->recover callback will perform. This
>makes it difficult to understand the impact of enabling auto-
>-remediation, and the severity of the error itself.
>
>To allow users to make more informed decision add a new remediation
>type attribute.
>
>Note that we only allow one remediation type per reporter, this
>is intentional. devlink health is not built for mixing issues
>of different severity into one reporter since it only maintains
>one dump, of the first event and a single error counter.
>Nudging vendors towards categorizing issues beyond coarse
>groups is an added bonus.
>
>Signed-off-by: Jakub Kicinski <kuba@kernel.org>
>---
> include/net/devlink.h        |  2 ++
> include/uapi/linux/devlink.h | 25 +++++++++++++++++++++++++
> net/core/devlink.c           |  7 ++++++-
> 3 files changed, 33 insertions(+), 1 deletion(-)
>
>diff --git a/include/net/devlink.h b/include/net/devlink.h
>index b424328af658..72b37769761f 100644
>--- a/include/net/devlink.h
>+++ b/include/net/devlink.h
>@@ -659,6 +659,7 @@ struct devlink_health_reporter;
> /**
>  * struct devlink_health_reporter_ops - Reporter operations
>  * @name: reporter name
>+ * remedy: severity of the remediation required
>  * @recover: callback to recover from reported error
>  *           if priv_ctx is NULL, run a full recover
>  * @dump: callback to dump an object
>@@ -669,6 +670,7 @@ struct devlink_health_reporter;
> 
> struct devlink_health_reporter_ops {
> 	char *name;
>+	enum devlink_health_remedy remedy;
> 	int (*recover)(struct devlink_health_reporter *reporter,
> 		       void *priv_ctx, struct netlink_ext_ack *extack);
> 	int (*dump)(struct devlink_health_reporter *reporter,
>diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
>index 41a6ea3b2256..8cd1508b525b 100644
>--- a/include/uapi/linux/devlink.h
>+++ b/include/uapi/linux/devlink.h
>@@ -534,6 +534,9 @@ enum devlink_attr {
> 	DEVLINK_ATTR_RELOAD_ACTION_STATS,       /* nested */
> 
> 	DEVLINK_ATTR_PORT_PCI_SF_NUMBER,	/* u32 */
>+
>+	DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,	/* u32 */
>+
> 	/* add new attributes above here, update the policy in devlink.c */
> 
> 	__DEVLINK_ATTR_MAX,
>@@ -620,4 +623,26 @@ enum devlink_health_state {
> 	DL_HEALTH_STATE_ERROR,
> };
> 
>+/**
>+ * enum devlink_health_reporter_remedy - severity of remediation procedure
>+ * @DL_HEALTH_REMEDY_NONE: transient error, no remediation required
>+ * @DL_HEALTH_REMEDY_KICK: device stalled, processing will be re-triggered
>+ * @DL_HEALTH_REMEDY_COMP_RESET: associated device component (e.g. device queue)
>+ *			will be reset
>+ * @DL_HEALTH_REMEDY_RESET: full device reset, will result in temporary
>+ *			unavailability of the device, device configuration
>+ *			should not be lost
>+ * @DL_HEALTH_REMEDY_REINIT: device will be reinitialized and configuration lost
>+ *
>+ * Used in %DEVLINK_ATTR_HEALTH_REPORTER_REMEDY, categorizes the health reporter
>+ * by the severity of the remediation.
>+ */
>+enum devlink_health_remedy {
>+	DL_HEALTH_REMEDY_NONE = 1,
>+	DL_HEALTH_REMEDY_KICK,
>+	DL_HEALTH_REMEDY_COMP_RESET,
>+	DL_HEALTH_REMEDY_RESET,
>+	DL_HEALTH_REMEDY_REINIT,

It is nice if enum name and values are consistent:
enum something {
	SOMETHING_*


>+};
>+
> #endif /* _UAPI_LINUX_DEVLINK_H_ */
>diff --git a/net/core/devlink.c b/net/core/devlink.c
>index 8e4e4bd7bb36..09d77d43ff63 100644
>--- a/net/core/devlink.c
>+++ b/net/core/devlink.c
>@@ -6095,7 +6095,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
> {
> 	struct devlink_health_reporter *reporter;
> 
>-	if (WARN_ON(graceful_period && !ops->recover))
>+	if (WARN_ON(graceful_period && !ops->recover) ||
>+	    WARN_ON(ops->recover && !ops->remedy))
> 		return ERR_PTR(-EINVAL);
> 
> 	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
>@@ -6265,6 +6266,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
> 	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
> 			   reporter->ops->name))
> 		goto reporter_nest_cancel;
>+	if (reporter->ops->remedy &&
>+	    nla_put_u32(msg, DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,
>+			reporter->ops->remedy))
>+		goto reporter_nest_cancel;
> 	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
> 		       reporter->health_state))
> 		goto reporter_nest_cancel;
>-- 
>2.29.2
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 3/3] devlink: add more failure modes
  2021-03-11  3:26 ` [RFC net-next v2 3/3] devlink: add more failure modes Jakub Kicinski
@ 2021-03-11 14:23   ` Eran Ben Elisha
  2021-03-11 16:49     ` Jakub Kicinski
  0 siblings, 1 reply; 13+ messages in thread
From: Eran Ben Elisha @ 2021-03-11 14:23 UTC (permalink / raw)
  To: f242ed68-d31b-527d-562f-c5a35123861a, netdev
  Cc: jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski, Aya Levin,
	Moshe Shemesh



On 3/11/2021 5:26 AM, Jakub Kicinski wrote:
>>> Pending vendors adding the right reporters. <<

Would you like Nvidia to reply with the remedy per reporter or to 
actually prepare the patch?

> 
> Extend the applicability of devlink health reporters
> beyond what can be locally remedied. Add failure modes
> which require re-flashing the NVM image or HW changes.
> 
> The expectation is that driver will call
> devlink_health_reporter_state_update() to put hardware
> health reporters into bad state.
> 
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
>   include/uapi/linux/devlink.h | 7 +++++++
>   net/core/devlink.c           | 3 +--
>   2 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> index 8cd1508b525b..f623bbc63489 100644
> --- a/include/uapi/linux/devlink.h
> +++ b/include/uapi/linux/devlink.h
> @@ -617,10 +617,17 @@ enum devlink_port_fn_opstate {
>    * @DL_HEALTH_STATE_ERROR: error state, running health reporter's recovery
>    *			may fix the issue, otherwise user needs to try
>    *			power cycling or other forms of reset
> + * @DL_HEALTH_STATE_BAD_IMAGE: device's non-volatile memory needs
> + *			to be re-written, usually due to block corruption
> + * @DL_HEALTH_STATE_BAD_HW: hardware errors detected, device, host
> + *			or the connection between the two may be at fault
>    */
>   enum devlink_health_state {
>   	DL_HEALTH_STATE_HEALTHY,
>   	DL_HEALTH_STATE_ERROR,
> +
> +	DL_HEALTH_STATE_BAD_IMAGE,
> +	DL_HEALTH_STATE_BAD_HW,
>   };
>   
>   /**
> diff --git a/net/core/devlink.c b/net/core/devlink.c
> index 09d77d43ff63..4a9fa6288a4a 100644
> --- a/net/core/devlink.c
> +++ b/net/core/devlink.c
> @@ -6527,8 +6527,7 @@ void
>   devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
>   				     enum devlink_health_state state)
>   {
> -	if (WARN_ON(state != DL_HEALTH_STATE_HEALTHY &&
> -		    state != DL_HEALTH_STATE_ERROR))
> +	if (WARN_ON(state > DL_HEALTH_STATE_BAD_HW))
>   		return;
>   
>   	if (reporter->health_state == state)
> 

devlink_health_reporter_recover() requires an update as well.
something like:

@@ -6346,8 +6346,15 @@ devlink_health_reporter_recover(struct 
devlink_health_reporter *reporter,
  {
         int err;

-   if (reporter->health_state == DL_HEALTH_STATE_HEALTHY)
+ switch (reporter->health_state) {
+ case DL_HEALTH_STATE_HEALTHY:
                 return 0;
+ case DL_HEALTH_STATE_ERROR:
+         break;
+ case DL_HEALTH_STATE_BAD_IMAGE:
+ case DL_HEALTH_STATE_BAD_HW:
+         return -EOPNOTSUPP;
+ }

         if (!reporter->ops->recover)
                 return -EOPNOTSUPP;


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 2/3] devlink: health: add remediation type
  2021-03-11  3:26 ` [RFC net-next v2 2/3] devlink: health: add remediation type Jakub Kicinski
  2021-03-11  7:48   ` Jiri Pirko
@ 2021-03-11 14:32   ` Eran Ben Elisha
  2021-03-11 16:45     ` Jakub Kicinski
  1 sibling, 1 reply; 13+ messages in thread
From: Eran Ben Elisha @ 2021-03-11 14:32 UTC (permalink / raw)
  To: netdev
  Cc: jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe, Jakub Kicinski



On 3/11/2021 5:26 AM, Jakub Kicinski wrote:
> Currently devlink health does not give user any clear information
> of what kind of remediation ->recover callback will perform. This
> makes it difficult to understand the impact of enabling auto-
> -remediation, and the severity of the error itself.
> 
> To allow users to make more informed decision add a new remediation
> type attribute.
> 
> Note that we only allow one remediation type per reporter, this
> is intentional. devlink health is not built for mixing issues
> of different severity into one reporter since it only maintains
> one dump, of the first event and a single error counter.
> Nudging vendors towards categorizing issues beyond coarse
> groups is an added bonus.
> 
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
>   include/net/devlink.h        |  2 ++
>   include/uapi/linux/devlink.h | 25 +++++++++++++++++++++++++
>   net/core/devlink.c           |  7 ++++++-
>   3 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/devlink.h b/include/net/devlink.h
> index b424328af658..72b37769761f 100644
> --- a/include/net/devlink.h
> +++ b/include/net/devlink.h
> @@ -659,6 +659,7 @@ struct devlink_health_reporter;
>   /**
>    * struct devlink_health_reporter_ops - Reporter operations
>    * @name: reporter name
> + * remedy: severity of the remediation required
>    * @recover: callback to recover from reported error
>    *           if priv_ctx is NULL, run a full recover
>    * @dump: callback to dump an object
> @@ -669,6 +670,7 @@ struct devlink_health_reporter;
>   
>   struct devlink_health_reporter_ops {
>   	char *name;
> +	enum devlink_health_remedy remedy;
>   	int (*recover)(struct devlink_health_reporter *reporter,
>   		       void *priv_ctx, struct netlink_ext_ack *extack);
>   	int (*dump)(struct devlink_health_reporter *reporter,
> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> index 41a6ea3b2256..8cd1508b525b 100644
> --- a/include/uapi/linux/devlink.h
> +++ b/include/uapi/linux/devlink.h
> @@ -534,6 +534,9 @@ enum devlink_attr {
>   	DEVLINK_ATTR_RELOAD_ACTION_STATS,       /* nested */
>   
>   	DEVLINK_ATTR_PORT_PCI_SF_NUMBER,	/* u32 */
> +
> +	DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,	/* u32 */
> +
>   	/* add new attributes above here, update the policy in devlink.c */
>   
>   	__DEVLINK_ATTR_MAX,
> @@ -620,4 +623,26 @@ enum devlink_health_state {
>   	DL_HEALTH_STATE_ERROR,
>   };
>   
> +/**
> + * enum devlink_health_reporter_remedy - severity of remediation procedure
> + * @DL_HEALTH_REMEDY_NONE: transient error, no remediation required
> + * @DL_HEALTH_REMEDY_KICK: device stalled, processing will be re-triggered
> + * @DL_HEALTH_REMEDY_COMP_RESET: associated device component (e.g. device queue)
> + *			will be reset
> + * @DL_HEALTH_REMEDY_RESET: full device reset, will result in temporary
> + *			unavailability of the device, device configuration
> + *			should not be lost
> + * @DL_HEALTH_REMEDY_REINIT: device will be reinitialized and configuration lost
> + *
> + * Used in %DEVLINK_ATTR_HEALTH_REPORTER_REMEDY, categorizes the health reporter
> + * by the severity of the remediation.
> + */
> +enum devlink_health_remedy {
> +	DL_HEALTH_REMEDY_NONE = 1,

What is the reason zero is skipped?

> +	DL_HEALTH_REMEDY_KICK,
> +	DL_HEALTH_REMEDY_COMP_RESET,
> +	DL_HEALTH_REMEDY_RESET,
> +	DL_HEALTH_REMEDY_REINIT,
> +};
> +
>   #endif /* _UAPI_LINUX_DEVLINK_H_ */
> diff --git a/net/core/devlink.c b/net/core/devlink.c
> index 8e4e4bd7bb36..09d77d43ff63 100644
> --- a/net/core/devlink.c
> +++ b/net/core/devlink.c
> @@ -6095,7 +6095,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
>   {
>   	struct devlink_health_reporter *reporter;
>   
> -	if (WARN_ON(graceful_period && !ops->recover))
> +	if (WARN_ON(graceful_period && !ops->recover) ||
> +	    WARN_ON(ops->recover && !ops->remedy))

It allows drivers to set recover callback and report DL_HEALTH_REMEDY_NONE.
Defining DL_HEALTH_REMEDY_NONE = 0  would make this if clause to catch it.

>   		return ERR_PTR(-EINVAL);
>   
>   	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
> @@ -6265,6 +6266,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
>   	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
>   			   reporter->ops->name))
>   		goto reporter_nest_cancel;
> +	if (reporter->ops->remedy &&
> +	    nla_put_u32(msg, DEVLINK_ATTR_HEALTH_REPORTER_REMEDY,
> +			reporter->ops->remedy))
> +		goto reporter_nest_cancel;
>   	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
>   		       reporter->health_state))
>   		goto reporter_nest_cancel;
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 2/3] devlink: health: add remediation type
  2021-03-11 14:32   ` Eran Ben Elisha
@ 2021-03-11 16:45     ` Jakub Kicinski
  0 siblings, 0 replies; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-11 16:45 UTC (permalink / raw)
  To: Eran Ben Elisha
  Cc: netdev, jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, eranbe

On Thu, 11 Mar 2021 16:32:44 +0200 Eran Ben Elisha wrote:
> > +/**
> > + * enum devlink_health_reporter_remedy - severity of remediation procedure
> > + * @DL_HEALTH_REMEDY_NONE: transient error, no remediation required
> > + * @DL_HEALTH_REMEDY_KICK: device stalled, processing will be re-triggered
> > + * @DL_HEALTH_REMEDY_COMP_RESET: associated device component (e.g. device queue)
> > + *			will be reset
> > + * @DL_HEALTH_REMEDY_RESET: full device reset, will result in temporary
> > + *			unavailability of the device, device configuration
> > + *			should not be lost
> > + * @DL_HEALTH_REMEDY_REINIT: device will be reinitialized and configuration lost
> > + *
> > + * Used in %DEVLINK_ATTR_HEALTH_REPORTER_REMEDY, categorizes the health reporter
> > + * by the severity of the remediation.
> > + */
> > +enum devlink_health_remedy {
> > +	DL_HEALTH_REMEDY_NONE = 1,  
> 
> What is the reason zero is skipped?
> 
> > +	DL_HEALTH_REMEDY_KICK,
> > +	DL_HEALTH_REMEDY_COMP_RESET,
> > +	DL_HEALTH_REMEDY_RESET,
> > +	DL_HEALTH_REMEDY_REINIT,
> > +};
> > +
> >   #endif /* _UAPI_LINUX_DEVLINK_H_ */
> > diff --git a/net/core/devlink.c b/net/core/devlink.c
> > index 8e4e4bd7bb36..09d77d43ff63 100644
> > --- a/net/core/devlink.c
> > +++ b/net/core/devlink.c
> > @@ -6095,7 +6095,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
> >   {
> >   	struct devlink_health_reporter *reporter;
> >   
> > -	if (WARN_ON(graceful_period && !ops->recover))
> > +	if (WARN_ON(graceful_period && !ops->recover) ||
> > +	    WARN_ON(ops->recover && !ops->remedy))  
> 
> It allows drivers to set recover callback and report DL_HEALTH_REMEDY_NONE.
> Defining DL_HEALTH_REMEDY_NONE = 0  would make this if clause to catch it.

I was intending for "none" to mean no remediation from the driver side.
E.g. device sees bad descriptor and tosses it away. 

That's different from cases where remediation is fully manual.

I will improve the kdoc.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 1/3] devlink: move health state to uAPI
  2021-03-11  7:47 ` [RFC net-next v2 1/3] devlink: move health state to uAPI Jiri Pirko
@ 2021-03-11 16:46   ` Jakub Kicinski
  2021-03-12 19:56     ` Keller, Jacob E
  0 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-11 16:46 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: f242ed68-d31b-527d-562f-c5a35123861a, netdev, saeedm,
	andrew.gospodarek, jacob.e.keller, guglielmo.morandin, eugenem,
	eranbe

On Thu, 11 Mar 2021 08:47:34 +0100 Jiri Pirko wrote:
> Thu, Mar 11, 2021 at 04:26:11AM CET, kuba@kernel.org wrote:
> >Move the health states into uAPI, so applications can use them.
> >
> >Note that we need to change the name of the enum because
> >user space is likely already defining the same values.
> >E.g. iproute2 does.
> >
> >Use this opportunity to shorten the names.
> >
> >Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> >---
> > .../net/ethernet/broadcom/bnxt/bnxt_devlink.c  |  4 ++--
> > .../ethernet/mellanox/mlx5/core/en/health.c    |  4 ++--
> > include/net/devlink.h                          |  7 +------
> > include/uapi/linux/devlink.h                   | 12 ++++++++++++
> > net/core/devlink.c                             | 18 +++++++++---------
> > 5 files changed, 26 insertions(+), 19 deletions(-)
> >
> >diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> >index 64381be935a8..cafc98ab4b5e 100644
> >--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> >+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> >@@ -252,9 +252,9 @@ void bnxt_dl_health_status_update(struct bnxt *bp, bool healthy)
> > 	u8 state;
> > 
> > 	if (healthy)
> >-		state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
> >+		state = DL_HEALTH_STATE_HEALTHY;
> > 	else
> >-		state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
> >+		state = DL_HEALTH_STATE_ERROR;  
> 
> I don't like the inconsistencies in the uapi (DL/DEVLINK). Can't we
> stick with "DEVLINK" prefix for all, which is what we got so far?

Sure, but you have seen the previous discussion about the length of
devlink names, right? I'm not the only one who thinks this is a
counterproductive rule.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 3/3] devlink: add more failure modes
  2021-03-11 14:23   ` Eran Ben Elisha
@ 2021-03-11 16:49     ` Jakub Kicinski
  2021-03-14 12:33       ` Eran Ben Elisha
  0 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-11 16:49 UTC (permalink / raw)
  To: Eran Ben Elisha
  Cc: f242ed68-d31b-527d-562f-c5a35123861a, netdev, jiri, saeedm,
	andrew.gospodarek, jacob.e.keller, guglielmo.morandin, eugenem,
	eranbe, Aya Levin, Moshe Shemesh

On Thu, 11 Mar 2021 16:23:09 +0200 Eran Ben Elisha wrote:
> On 3/11/2021 5:26 AM, Jakub Kicinski wrote:
> >>> Pending vendors adding the right reporters. <<  
> 
> Would you like Nvidia to reply with the remedy per reporter or to 
> actually prepare the patch?

You mean the patch adding .remedy? If you can that'd be helpful.

Or do you have HW error reporters to add?

> > Extend the applicability of devlink health reporters
> > beyond what can be locally remedied. Add failure modes
> > which require re-flashing the NVM image or HW changes.
> > 
> > The expectation is that driver will call
> > devlink_health_reporter_state_update() to put hardware
> > health reporters into bad state.
> > 
> > Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> > ---
> >   include/uapi/linux/devlink.h | 7 +++++++
> >   net/core/devlink.c           | 3 +--
> >   2 files changed, 8 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> > index 8cd1508b525b..f623bbc63489 100644
> > --- a/include/uapi/linux/devlink.h
> > +++ b/include/uapi/linux/devlink.h
> > @@ -617,10 +617,17 @@ enum devlink_port_fn_opstate {
> >    * @DL_HEALTH_STATE_ERROR: error state, running health reporter's recovery
> >    *			may fix the issue, otherwise user needs to try
> >    *			power cycling or other forms of reset
> > + * @DL_HEALTH_STATE_BAD_IMAGE: device's non-volatile memory needs
> > + *			to be re-written, usually due to block corruption
> > + * @DL_HEALTH_STATE_BAD_HW: hardware errors detected, device, host
> > + *			or the connection between the two may be at fault
> >    */
> >   enum devlink_health_state {
> >   	DL_HEALTH_STATE_HEALTHY,
> >   	DL_HEALTH_STATE_ERROR,
> > +
> > +	DL_HEALTH_STATE_BAD_IMAGE,
> > +	DL_HEALTH_STATE_BAD_HW,
> >   };
> >   
> >   /**
> > diff --git a/net/core/devlink.c b/net/core/devlink.c
> > index 09d77d43ff63..4a9fa6288a4a 100644
> > --- a/net/core/devlink.c
> > +++ b/net/core/devlink.c
> > @@ -6527,8 +6527,7 @@ void
> >   devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
> >   				     enum devlink_health_state state)
> >   {
> > -	if (WARN_ON(state != DL_HEALTH_STATE_HEALTHY &&
> > -		    state != DL_HEALTH_STATE_ERROR))
> > +	if (WARN_ON(state > DL_HEALTH_STATE_BAD_HW))
> >   		return;
> >   
> >   	if (reporter->health_state == state)
> >   
> 
> devlink_health_reporter_recover() requires an update as well.
> something like:
> 
> @@ -6346,8 +6346,15 @@ devlink_health_reporter_recover(struct 
> devlink_health_reporter *reporter,
>   {
>          int err;
> 
> -   if (reporter->health_state == DL_HEALTH_STATE_HEALTHY)
> + switch (reporter->health_state) {
> + case DL_HEALTH_STATE_HEALTHY:
>                  return 0;
> + case DL_HEALTH_STATE_ERROR:
> +         break;
> + case DL_HEALTH_STATE_BAD_IMAGE:
> + case DL_HEALTH_STATE_BAD_HW:
> +         return -EOPNOTSUPP;
> + }
> 
>          if (!reporter->ops->recover)
>                  return -EOPNOTSUPP;
> 

Thanks!

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC net-next v2 1/3] devlink: move health state to uAPI
  2021-03-11 16:46   ` Jakub Kicinski
@ 2021-03-12 19:56     ` Keller, Jacob E
  0 siblings, 0 replies; 13+ messages in thread
From: Keller, Jacob E @ 2021-03-12 19:56 UTC (permalink / raw)
  To: Jakub Kicinski, Jiri Pirko
  Cc: f242ed68-d31b-527d-562f-c5a35123861a, netdev, saeedm,
	andrew.gospodarek, guglielmo.morandin, eugenem, eranbe



> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Thursday, March 11, 2021 8:47 AM
> To: Jiri Pirko <jiri@resnulli.us>
> Cc: f242ed68-d31b-527d-562f-c5a35123861a@intel.com;
> netdev@vger.kernel.org; saeedm@nvidia.com;
> andrew.gospodarek@broadcom.com; Keller, Jacob E <jacob.e.keller@intel.com>;
> guglielmo.morandin@broadcom.com; eugenem@fb.com;
> eranbe@mellanox.com
> Subject: Re: [RFC net-next v2 1/3] devlink: move health state to uAPI
> 
> On Thu, 11 Mar 2021 08:47:34 +0100 Jiri Pirko wrote:
> > Thu, Mar 11, 2021 at 04:26:11AM CET, kuba@kernel.org wrote:
> > >Move the health states into uAPI, so applications can use them.
> > >
> > >Note that we need to change the name of the enum because
> > >user space is likely already defining the same values.
> > >E.g. iproute2 does.
> > >
> > >Use this opportunity to shorten the names.
> > >
> > >Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> > >---
> > > .../net/ethernet/broadcom/bnxt/bnxt_devlink.c  |  4 ++--
> > > .../ethernet/mellanox/mlx5/core/en/health.c    |  4 ++--
> > > include/net/devlink.h                          |  7 +------
> > > include/uapi/linux/devlink.h                   | 12 ++++++++++++
> > > net/core/devlink.c                             | 18 +++++++++---------
> > > 5 files changed, 26 insertions(+), 19 deletions(-)
> > >
> > >diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> > >index 64381be935a8..cafc98ab4b5e 100644
> > >--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> > >+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
> > >@@ -252,9 +252,9 @@ void bnxt_dl_health_status_update(struct bnxt *bp,
> bool healthy)
> > > 	u8 state;
> > >
> > > 	if (healthy)
> > >-		state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
> > >+		state = DL_HEALTH_STATE_HEALTHY;
> > > 	else
> > >-		state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
> > >+		state = DL_HEALTH_STATE_ERROR;
> >
> > I don't like the inconsistencies in the uapi (DL/DEVLINK). Can't we
> > stick with "DEVLINK" prefix for all, which is what we got so far?
> 
> Sure, but you have seen the previous discussion about the length of
> devlink names, right? I'm not the only one who thinks this is a
> counterproductive rule.

I'd like to see us shorten the names where possible. I do think we should be consistent in how we do it. I like DL_, but it would be nice if we could get "DL_HEALTH_" for all health related ones, and so on, working towards shortening across the board over time?

I also didn't mind the "DLH_" that you used in another spot, though that could get us into trouble eventually once two features start with the same letter...

Thanks,
Jake

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 3/3] devlink: add more failure modes
  2021-03-11 16:49     ` Jakub Kicinski
@ 2021-03-14 12:33       ` Eran Ben Elisha
  2021-03-15 17:06         ` Jakub Kicinski
  0 siblings, 1 reply; 13+ messages in thread
From: Eran Ben Elisha @ 2021-03-14 12:33 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, Aya Levin, Moshe Shemesh



On 3/11/2021 6:49 PM, Jakub Kicinski wrote:
> On Thu, 11 Mar 2021 16:23:09 +0200 Eran Ben Elisha wrote:
>> On 3/11/2021 5:26 AM, Jakub Kicinski wrote:
>>>>> Pending vendors adding the right reporters. <<
>> Would you like Nvidia to reply with the remedy per reporter or to
>> actually prepare the patch?
> You mean the patch adding .remedy? If you can that'd be helpful.
> 
> Or do you have HW error reporters to add?
> 

I meant a patch to add .remedy to existing mlx5* reporters to be part of 
your series.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC net-next v2 3/3] devlink: add more failure modes
  2021-03-14 12:33       ` Eran Ben Elisha
@ 2021-03-15 17:06         ` Jakub Kicinski
  0 siblings, 0 replies; 13+ messages in thread
From: Jakub Kicinski @ 2021-03-15 17:06 UTC (permalink / raw)
  To: Eran Ben Elisha
  Cc: netdev, jiri, saeedm, andrew.gospodarek, jacob.e.keller,
	guglielmo.morandin, eugenem, Aya Levin, Moshe Shemesh

On Sun, 14 Mar 2021 14:33:10 +0200 Eran Ben Elisha wrote:
> On 3/11/2021 6:49 PM, Jakub Kicinski wrote:
> > On Thu, 11 Mar 2021 16:23:09 +0200 Eran Ben Elisha wrote:  
> >> Would you like Nvidia to reply with the remedy per reporter or to
> >> actually prepare the patch?  
> > You mean the patch adding .remedy? If you can that'd be helpful.
> > 
> > Or do you have HW error reporters to add?
> 
> I meant a patch to add .remedy to existing mlx5* reporters to be part of 
> your series.

After talking some more with the HW health team the series appears less
necessary than I thought. I'm putting it on hold for now, sorry.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-03-15 17:07 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-11  3:26 [RFC net-next v2 1/3] devlink: move health state to uAPI Jakub Kicinski
2021-03-11  3:26 ` [RFC net-next v2 2/3] devlink: health: add remediation type Jakub Kicinski
2021-03-11  7:48   ` Jiri Pirko
2021-03-11 14:32   ` Eran Ben Elisha
2021-03-11 16:45     ` Jakub Kicinski
2021-03-11  3:26 ` [RFC net-next v2 3/3] devlink: add more failure modes Jakub Kicinski
2021-03-11 14:23   ` Eran Ben Elisha
2021-03-11 16:49     ` Jakub Kicinski
2021-03-14 12:33       ` Eran Ben Elisha
2021-03-15 17:06         ` Jakub Kicinski
2021-03-11  7:47 ` [RFC net-next v2 1/3] devlink: move health state to uAPI Jiri Pirko
2021-03-11 16:46   ` Jakub Kicinski
2021-03-12 19:56     ` Keller, Jacob E

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.