From mboxrd@z Thu Jan 1 00:00:00 1970 From: Aya Levin Subject: [PATCH RFC iproute2-next] devlink: Add health command support Date: Mon, 31 Dec 2018 16:41:59 +0200 Message-ID: <1546267319-3117-1-git-send-email-ayal@mellanox.com> References: <1546266733-9512-1-git-send-email-eranbe@mellanox.com> Cc: Moshe Shemesh , Aya Levin , Eran Ben Elisha , Tal Alon , Ariel Almog To: David Ahern , netdev@vger.kernel.org, "David S. Miller" , Jiri Pirko Return-path: Received: from mail-il-dmz.mellanox.com ([193.47.165.129]:53542 "EHLO mellanox.co.il" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1725899AbeLaOmS (ORCPT ); Mon, 31 Dec 2018 09:42:18 -0500 In-Reply-To: <1546266733-9512-1-git-send-email-eranbe@mellanox.com> Sender: netdev-owner@vger.kernel.org List-ID: This patch adds support for the following commands: Devlink health show [DEV reporter REPORTER_NAME], Devlink health recover DEV reporter REPORTER_NAME, Devlink health diagnose DEV reporter REPORTER_NAME, Devlink health objdump show DEV reporter REPORTER_NAME, Devlink health objdump clear DEV reporter REPORTER_NAME, Devlink health set DEV reporter REPORTER_NAME NAME VALUE * Devlink health show command displays status and configuration info on specific reporter on a device or dump the info on all reporters on all devices. * Devlink health recover enables the user to initiate a recovery on a reporter. This operation will increment the recoveries counter displayed in the show command. * Devlink health diagnose enables the user to retrieve diagnostics data on a reporter on a device. The command's output is free text defined by the reporter. * Devlink health objdump show displays the last saved objdump. Devlink health saves a single objdump. If an objdump is not already stored by the Devlink for this reporter, Devlink generates a new objdump. The objdump can be generated automatically when a reporter reports on an error or by the user's request. Objdump output is defined by the reporter.
* Devlink health objdump clear, deletes the last saved objdump file. * Devlink health set, enables the user to configure: 1) grace_period [msec] time interval between auto recoveries. 2) auto_recover [true/false] whether the devlink should execute automatic recover on error. Examples: $devlink health show pci/0000:00:09.0 reporter TX pci/0000:00:09.0: name TX state healthy #err 0 #recover 1 last_objdump_ts N/A objdump_available false attributes: grace period 600 auto recover true $devlink health diagnose pci/0000:00:09.0 reporter TX SQ 0x9b: HW state: 1 stopped: 0 SQ 0x9f: HW state: 1 stopped: 0 SQ 0xa3: HW state: 1 stopped: 0 SQ 0xa7: HW state: 1 stopped: 0 SQ 0xab: HW state: 1 stopped: 0 $devlink health objdump show pci/0000:00:09.0 reporter TX TX dump data $devlink health objdump clear pci/0000:00:09.0 reporter TX $devlink health set pci/0000:00:09.0 reporter TX grace_period 3500 $devlink health set pci/0000:00:09.0 reporter TX auto_recover false Signed-off-by: Aya Levin --- devlink/devlink.c | 531 ++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/devlink.h | 29 +++ man/man8/devlink-health.8 | 175 ++++++++++++++ man/man8/devlink.8 | 7 +- 4 files changed, 739 insertions(+), 3 deletions(-) create mode 100644 man/man8/devlink-health.8 diff --git a/devlink/devlink.c b/devlink/devlink.c index 3651e90c1159..24dd852d8478 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -40,6 +40,11 @@ #define PARAM_CMODE_DRIVERINIT_STR "driverinit" #define PARAM_CMODE_PERMANENT_STR "permanent" +#define HEALTH_REPORTER_STATE_HEALTHY_STR "healthy" +#define HEALTH_REPORTER_STATE_ERROR_STR "error" +#define HEALTH_REPORTER_GRACE_PERIOD_STR "grace_period" +#define HEALTH_REPORTER_AUTO_RECOVER_STR "auto_recover" + static int g_new_line_count; #define pr_err(args...) 
fprintf(stderr, ##args) @@ -199,6 +204,10 @@ static void ifname_map_free(struct ifname_map *ifname_map) #define DL_OPT_REGION_SNAPSHOT_ID BIT(22) #define DL_OPT_REGION_ADDRESS BIT(23) #define DL_OPT_REGION_LENGTH BIT(24) +#define DL_OPT_HANDLE_HEALTH BIT(25) +#define DL_OPT_HEALTH_REPORTER_NAME BIT(26) +#define DL_OPT_HEALTH_REPORTER_DEV BIT(27) + struct dl_opts { uint32_t present; /* flags of present items */ @@ -230,6 +239,10 @@ struct dl_opts { uint32_t region_snapshot_id; uint64_t region_address; uint64_t region_length; + const char *reporter_name; + const char *reporter_param_name; + const char *reporter_param_value; + }; struct dl { @@ -959,7 +972,7 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required, if (err) return err; o_found |= handle_bit; - } else if (o_required & DL_OPT_HANDLE) { + } else if (o_all & DL_OPT_HANDLE) { /* plain handle, whether required or optional */ err = dl_argv_handle(dl, &opts->bus_name, &opts->dev_name); if (err) return err; @@ -1178,6 +1191,15 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required, if (err) return err; o_found |= DL_OPT_REGION_LENGTH; + } else if (dl_argv_match(dl, "reporter") && + (o_all & DL_OPT_HEALTH_REPORTER_NAME)) { + dl_arg_inc(dl); + err = dl_argv_str(dl, &opts->reporter_name); + if (err) + return err; + o_found |= DL_OPT_HEALTH_REPORTER_NAME; + o_found |= DL_OPT_HANDLE; + break; } else { pr_err("Unknown option \"%s\"\n", dl_argv(dl)); return -EINVAL; @@ -1298,6 +1320,12 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required, return -EINVAL; } + if ((o_required & DL_OPT_HEALTH_REPORTER_NAME) && + !(o_found & DL_OPT_HEALTH_REPORTER_NAME)) { + pr_err("Reporter name expected.\n"); + return -EINVAL; + } + return 0; } @@ -1382,6 +1410,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl) if (opts->present & DL_OPT_REGION_LENGTH) mnl_attr_put_u64(nlh, DEVLINK_ATTR_REGION_CHUNK_LEN, opts->region_length); + if (opts->present & DL_OPT_HEALTH_REPORTER_NAME) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
opts->reporter_name); } static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl, @@ -1513,6 +1544,8 @@ static void __pr_out_handle_start(struct dl *dl, struct nlattr **tb, __pr_out_newline(); __pr_out_indent_inc(); arr_last_handle_set(dl, bus_name, dev_name); + } else { + __pr_out_indent_inc(); } } else { pr_out("%s%s", buf, content ? ":" : ""); @@ -5557,11 +5590,501 @@ static int cmd_region(struct dl *dl) return -ENOENT; } +static int cmd_health_set_params(struct dl *dl) +{ + struct nlmsghdr *nlh; + uint64_t period; + bool auto_recover; + int err; + + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_SET, + NLM_F_REQUEST | NLM_F_ACK); + err = dl_argv_parse(dl, DL_OPT_HANDLE | + DL_OPT_HEALTH_REPORTER_NAME, 0); + if (err) + return err; + + err = dl_argv_str(dl, &dl->opts.reporter_param_name); + if (err) + return err; + err = dl_argv_str(dl, &dl->opts.reporter_param_value); + if (err) + return err; + dl_opts_put(nlh, dl); + + if (!strncmp(dl->opts.reporter_param_name, + HEALTH_REPORTER_GRACE_PERIOD_STR, strlen("grace"))) { /* the "grace" prefix selects grace_period */ + err = strtouint64_t(dl->opts.reporter_param_value, &period); + if (err) + goto err_param_value_parse; + mnl_attr_put_u64(nlh, DEVLINK_ATTR_HEALTH_REPORTER_PERIOD, + period); + } else if (!strncmp(dl->opts.reporter_param_name, + HEALTH_REPORTER_AUTO_RECOVER_STR, + strlen("auto"))) { /* the "auto" prefix selects auto_recover */ + err = strtobool(dl->opts.reporter_param_value, &auto_recover); + if (err) + goto err_param_value_parse; + mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC, + (uint8_t)auto_recover); + } else { + pr_err("Parameter name: %s is not supported\n", + dl->opts.reporter_param_name); + return -ENOTSUP; + } + + return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); + +err_param_value_parse: + pr_err("Value \"%s\" is not a number or not within range\n", + dl->opts.reporter_param_value); /* report the value parsed by this command */ + return err; +} + +static int cmd_health_objdump_clear(struct dl *dl) +{ + struct nlmsghdr *nlh; + int err; + + nlh = mnlg_msg_prepare(dl->nlg,
DEVLINK_CMD_HEALTH_REPORTER_OBJDUMP_CLEAR, + NLM_F_REQUEST | NLM_F_ACK); + + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | + DL_OPT_HEALTH_REPORTER_NAME, 0); + if (err) + return err; + + /* no dl_opts_put() here: dl_argv_parse_put() is expected to have filled nlh already */ + return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); +} + +static int health_value_show(struct dl *dl, int type, struct nlattr *nl_data) /* print one value by its MNL_TYPE_*; -EINVAL on unknown type */ +{ + const char *str; + uint8_t *data; + uint32_t len; + uint64_t u64; + uint32_t u32; + uint16_t u16; + uint8_t u8; + uint32_t i; + + switch (type) { + case MNL_TYPE_FLAG: + if (dl->json_output) + jsonw_string(dl->jw, nl_data ? "true" : "false"); + else + pr_out("%s ", nl_data ? "true" : "false"); + break; + case MNL_TYPE_U8: + u8 = mnl_attr_get_u8(nl_data); + if (dl->json_output) + jsonw_uint(dl->jw, u8); + else + pr_out("%u ", u8); + break; + case MNL_TYPE_U16: + u16 = mnl_attr_get_u16(nl_data); + if (dl->json_output) + jsonw_uint(dl->jw, u16); + else + pr_out("%u ", u16); + break; + case MNL_TYPE_U32: + u32 = mnl_attr_get_u32(nl_data); + if (dl->json_output) + jsonw_uint(dl->jw, u32); + else + pr_out("%u ", u32); + break; + case MNL_TYPE_U64: + u64 = mnl_attr_get_u64(nl_data); + if (dl->json_output) + jsonw_u64(dl->jw, u64); + else + pr_out("%llu ", (unsigned long long)u64); /* uint64_t is not unsigned long on every ABI */ + break; + case MNL_TYPE_STRING: + case MNL_TYPE_NUL_STRING: + str = mnl_attr_get_str(nl_data); + if (dl->json_output) + jsonw_string(dl->jw, str); + else + pr_out("%s ", str); + break; + case MNL_TYPE_BINARY: + len = mnl_attr_get_payload_len(nl_data); + data = mnl_attr_get_payload(nl_data); + i = 0; + while (i < len) { + if (dl->json_output) + jsonw_printf(dl->jw, "%d", data[i]); + else + pr_out("%02x ", data[i]); + i++; + } + break; + default: + return -EINVAL; + } + return MNL_CB_OK; +} + +static int health_object_pair_show(struct dl *dl, struct nlattr *nl) +{ + struct nlattr *nla_pair[DEVLINK_ATTR_MAX + 1] = {}; + struct nlattr *nla_value[DEVLINK_ATTR_MAX + 1] = {}; + struct nlattr *nla_value_data; + struct nlattr *nla_obj_pair; + struct nlattr *nla_object; + struct
nlattr *nla_array; + struct nlattr *nla_val; + int err, type; + const char *name; + + err = mnl_attr_parse_nested(nl, attr_cb, nla_pair); + if (err != MNL_CB_OK) + return -EINVAL; + + if (!nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME] || + !nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE]) + return -EINVAL; + + name = mnl_attr_get_str(nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME]); + if (dl->json_output) + jsonw_name(dl->jw, name); + else + pr_out("%s: ", name); + + nla_val = nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE]; + err = mnl_attr_parse_nested(nla_val, attr_cb, nla_value); + if (err != MNL_CB_OK) + return -EINVAL; + + if (!nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT] && + !nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY] && + !nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE]) + return -EINVAL; + + if (nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT]) { + nla_object = nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT]; + /*inside must be an object pair*/ + if (dl->json_output) + jsonw_start_object(dl->jw); + + mnl_attr_for_each_nested(nla_obj_pair, nla_object) { + err = health_object_pair_show(dl, nla_obj_pair); + if (err != MNL_CB_OK) + break; + } + if (dl->json_output) + jsonw_end_object(dl->jw); + } + if (nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE]) { + type = mnl_attr_get_u8(nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE]); + if (!nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA]) + return -EINVAL; + nla_value_data = nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA]; + /* print the value whatever its type and keep the result for the caller */ + err = health_value_show(dl, type, nla_value_data); + } + if (nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY]) { + if (dl->json_output) + jsonw_start_array(dl->jw); + nla_array = nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY]; + mnl_attr_for_each_nested(nla_object, nla_array) { + mnl_attr_for_each_nested(nla_obj_pair, nla_object) { + err = health_object_pair_show(dl, nla_obj_pair); + if (err !=
MNL_CB_OK) + break; + } + } + if (dl->json_output) + jsonw_end_array(dl->jw); + } + return err; +} + +static int cmd_health_object_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {}; + struct nlattr *nla_object; + struct nlattr *nla_object_pair; + struct dl *dl = data; + int err = MNL_CB_OK; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT]) + return MNL_CB_ERROR; + + mnl_attr_for_each(nla_object, nlh, sizeof(*genl)) { + mnl_attr_for_each_nested(nla_object_pair, nla_object) { + err = health_object_pair_show(dl, nla_object_pair); + if (err != MNL_CB_OK) + break; + } + if (!dl->json_output) + pr_out("\n"); + } + + return err; +} + +static int cmd_health_objdump_show(struct dl *dl) +{ + struct nlmsghdr *nlh; + int err; + + nlh = mnlg_msg_prepare(dl->nlg, + DEVLINK_CMD_HEALTH_REPORTER_OBJDUMP_GET, + NLM_F_REQUEST | NLM_F_ACK); + + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | + DL_OPT_HEALTH_REPORTER_NAME, 0); + if (err) + return err; + + if (dl->json_output) + jsonw_start_object(dl->jw); + + err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_health_object_cb, dl); + if (err && dl->json_output) { /* close the object on failure, but only touch dl->jw in JSON mode */ + jsonw_end_object(dl->jw); + return err; + } + + if (dl->json_output) + jsonw_end_object(dl->jw); + + return err; +} + +static int cmd_health_diagnose(struct dl *dl) +{ + struct nlmsghdr *nlh; + int err; + + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + NLM_F_REQUEST | NLM_F_ACK); + + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | + DL_OPT_HEALTH_REPORTER_NAME, 0); + if (err) + return err; + + if (dl->json_output) + jsonw_start_object(dl->jw); + + err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_health_object_cb, dl); + if (err && dl->json_output) { /* close the object on failure, but only touch dl->jw in JSON mode */ + jsonw_end_object(dl->jw); + return err; + } + + if (dl->json_output) + jsonw_end_object(dl->jw); + + return err; +} + +static int cmd_health_recover(struct dl *dl) +{ + struct nlmsghdr *nlh; + int err; +
+ nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + NLM_F_REQUEST | NLM_F_ACK); + + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | + DL_OPT_HEALTH_REPORTER_NAME, 0); + if (err) + return err; + + /* no dl_opts_put() here: dl_argv_parse_put() is expected to have filled nlh already */ + return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); +} + +static void pr_out_array_parameters_start(struct dl *dl, const char *name) +{ + if (dl->json_output) { + jsonw_name(dl->jw, name); + jsonw_start_array(dl->jw); + } else { + __pr_out_newline(); + pr_out("%s:", name); + __pr_out_indent_inc(); + __pr_out_newline(); + } +} + +static const char *health_state_name(uint8_t state) +{ + switch (state) { + case DEVLINK_HEALTH_REPORTER_STATE_HEALTHY: + return HEALTH_REPORTER_STATE_HEALTHY_STR; + case DEVLINK_HEALTH_REPORTER_STATE_ERROR: + return HEALTH_REPORTER_STATE_ERROR_STR; + default: return ""; + } +} + +static void pr_out_health(struct dl *dl, struct nlattr **tb) +{ + struct nlattr *hlt[DEVLINK_ATTR_MAX + 1] = {}; + enum devlink_health_reporter_state state; + char dump_time_date[80] = "N/A"; + bool auto_recover = false; + const struct nlattr *attr; + bool dmp = false; + struct timeval tv; + uint64_t jiffies; + struct tm *info; + int err; + + state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; + + err = mnl_attr_parse_nested(tb[DEVLINK_ATTR_HEALTH_REPORTER], attr_cb, + hlt); + if (err != MNL_CB_OK) + return; + + if (!hlt[DEVLINK_ATTR_HEALTH_REPORTER_NAME] || + !hlt[DEVLINK_ATTR_HEALTH_REPORTER_ERR] || + !hlt[DEVLINK_ATTR_HEALTH_REPORTER_RECOVER] || + !hlt[DEVLINK_ATTR_HEALTH_REPORTER_OBJDUMP_AVAIL] || + !hlt[DEVLINK_ATTR_HEALTH_REPORTER_STATE] || + !hlt[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC] || + !hlt[DEVLINK_ATTR_HEALTH_REPORTER_PERIOD]) + return; + + dmp = mnl_attr_get_u8(hlt[DEVLINK_ATTR_HEALTH_REPORTER_OBJDUMP_AVAIL]); + if (dmp) { + if (hlt[DEVLINK_ATTR_HEALTH_REPORTER_OBJDUMP_TS]) { + attr = hlt[DEVLINK_ATTR_HEALTH_REPORTER_OBJDUMP_TS]; + jiffies = mnl_attr_get_u64(attr); + __jiffies_to_tv(&tv, jiffies); + info =
localtime(&tv.tv_sec); + strftime(dump_time_date, 80, "%b %d %l:%M:%S", info); + } + } + + pr_out_handle_start_arr(dl, tb); + + pr_out_str(dl, "name", + mnl_attr_get_str(hlt[DEVLINK_ATTR_HEALTH_REPORTER_NAME])); + state = mnl_attr_get_u8(hlt[DEVLINK_ATTR_HEALTH_REPORTER_STATE]); + pr_out_str(dl, "state", health_state_name(state)); + pr_out_u64(dl, "#err", + mnl_attr_get_u64(hlt[DEVLINK_ATTR_HEALTH_REPORTER_ERR])); + pr_out_u64(dl, "#recover", + mnl_attr_get_u64(hlt[DEVLINK_ATTR_HEALTH_REPORTER_RECOVER])); + pr_out_str(dl, "last_objdump_ts", dump_time_date); + pr_out_bool(dl, "objdump_available", dmp); + pr_out_array_parameters_start(dl, "parameters"); + pr_out_entry_start(dl); + pr_out_u64(dl, "grace_period", + mnl_attr_get_u64(hlt[DEVLINK_ATTR_HEALTH_REPORTER_PERIOD])); + auto_recover = mnl_attr_get_u8(hlt[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC]); + pr_out_bool(dl, "auto_recover", auto_recover); + pr_out_entry_end(dl); + pr_out_array_end(dl); + + pr_out_handle_end(dl); +} + +static int cmd_health_show_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {}; + struct dl *dl = data; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME] || + !tb[DEVLINK_ATTR_HEALTH_REPORTER]) + return MNL_CB_ERROR; + + pr_out_health(dl, tb); + + return MNL_CB_OK; +} + +static int cmd_health_show(struct dl *dl) +{ + struct nlmsghdr *nlh; + uint16_t flags = NLM_F_REQUEST | NLM_F_ACK; + int err; + + if (dl_argc(dl) == 0) + flags |= NLM_F_DUMP; + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_GET, + flags); + + if (dl_argc(dl) > 0) { + err = dl_argv_parse_put(nlh, dl, 0, DL_OPT_HANDLE | + DL_OPT_HEALTH_REPORTER_NAME); + if (err) + return err; + } + pr_out_section_start(dl, "health"); + + err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_health_show_cb, dl); + if (err) + return err; + pr_out_section_end(dl); + return err; +} + 
+static void cmd_health_help(void) /* print the usage line of every health subcommand to stderr */ +{ + pr_err("Usage: devlink health show [ DEV reporter REPORTER_NAME ]\n"); + pr_err("Usage: devlink health recover DEV reporter REPORTER_NAME\n"); + pr_err("Usage: devlink health diagnose DEV reporter REPORTER_NAME\n"); + pr_err("Usage: devlink health objdump show DEV reporter REPORTER_NAME\n"); + pr_err("Usage: devlink health objdump clear DEV reporter REPORTER_NAME\n"); + pr_err("Usage: devlink health set DEV reporter REPORTER_NAME NAME VALUE\n"); +} + +static int cmd_health(struct dl *dl) +{ + if (dl_no_arg(dl)) { + return cmd_health_show(dl); + } else if (dl_argv_match(dl, "help")) { + cmd_health_help(); + return 0; + } else if (dl_argv_match(dl, "show")) { + dl_arg_inc(dl); + return cmd_health_show(dl); + } else if (dl_argv_match(dl, "recover")) { + dl_arg_inc(dl); + return cmd_health_recover(dl); + } else if (dl_argv_match(dl, "diagnose")) { + dl_arg_inc(dl); + return cmd_health_diagnose(dl); + } else if (dl_argv_match(dl, "objdump")) { + dl_arg_inc(dl); + if (dl_argv_match(dl, "show")) { + dl_arg_inc(dl); + return cmd_health_objdump_show(dl); + } else if (dl_argv_match(dl, "clear")) { + dl_arg_inc(dl); + return cmd_health_objdump_clear(dl); + } + } else if (dl_argv_match(dl, "set")) { + dl_arg_inc(dl); + return cmd_health_set_params(dl); + } + + pr_err("Command \"%s\" not found\n", dl_argv(dl)); + return -ENOENT; +} + static void help(void) { pr_err("Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }\n" " devlink [ -f[orce] ] -b[atch] filename\n" - "where OBJECT := { dev | port | sb | monitor | dpipe | resource | region }\n" + "where OBJECT := { dev | port | sb | monitor | dpipe | resource | region | health }\n" " OPTIONS := { -V[ersion] | -n[o-nice-names] | -j[son] | -p[retty] | -v[erbose] }\n"); } @@ -5594,7 +6117,11 @@ static int dl_cmd(struct dl *dl, int argc, char **argv) } else if (dl_argv_match(dl, "region")) { dl_arg_inc(dl); return cmd_region(dl); + } else if (dl_argv_match(dl, "health")) { + dl_arg_inc(dl);
+ return cmd_health(dl); } + pr_err("Object \"%s\" not found\n", dl_argv(dl)); return -ENOENT; } diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index d0a33d79dc22..2f3bbf39e887 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -89,6 +89,12 @@ enum devlink_command { DEVLINK_CMD_REGION_DEL, DEVLINK_CMD_REGION_READ, + DEVLINK_CMD_HEALTH_REPORTER_GET, + DEVLINK_CMD_HEALTH_REPORTER_SET, + DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + DEVLINK_CMD_HEALTH_REPORTER_OBJDUMP_GET, + DEVLINK_CMD_HEALTH_REPORTER_OBJDUMP_CLEAR, /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -168,6 +174,11 @@ enum devlink_param_fw_load_policy_value { DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, }; +enum devlink_health_reporter_state { + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY, + DEVLINK_HEALTH_REPORTER_STATE_ERROR, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -285,6 +296,24 @@ enum devlink_attr { DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */ DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME, /* string */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, /* u8 */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, /* dynamic */ + + DEVLINK_ATTR_HEALTH_REPORTER, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */ + DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */ + DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_OBJDUMP_AVAIL, /* u8 */ + DEVLINK_ATTR_HEALTH_REPORTER_OBJDUMP_TS, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_PERIOD, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC, /* u8 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/man/man8/devlink-health.8 b/man/man8/devlink-health.8 new file mode 100644 index 000000000000..19de4079ff67 --- /dev/null +++ b/man/man8/devlink-health.8 @@ -0,0 +1,175 @@ +.TH DEVLINK\-HEALTH 8 "27 Dec 2018" "iproute2" "Linux" +.SH NAME +devlink-health \- devlink health reporting and recovery +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B devlink +.RI "[ " OPTIONS " ]" +.B health +.RI " { " COMMAND " | " +.BR help " }" +.sp +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] } +.ti -8 +.BR "devlink health show" +.RI "[ " +.RI "" DEV "" +.BR " reporter " +.RI ""REPORTER " ] " +.ti -8 +.BR "devlink health recover" +.RI "" DEV "" +.BR "reporter" +.RI "" REPORTER "" +.ti -8 +.BR "devlink health diagnose" +.RI "" DEV "" +.BR "reporter" +.RI "" REPORTER "" +.ti -8 +.BR "devlink health objdump show" +.RI "" DEV "" +.BR "reporter" +.RI "" REPORTER "" +.ti -8 +.BR "devlink health objdump clear" 
+.RI "" DEV "" +.BR "reporter" +.RI "" REPORTER "" +.ti -8 +.BR "devlink health set" +.RI "" DEV "" +.BR "reporter" +.RI "" REPORTER "" +.RI "" NAME "" +.RI "" VALUE "" +.ti -8 +.B devlink health help +.SH "DESCRIPTION" +.SS devlink health show - Show status and configuration on all supported reporters on all devlink devices. +.PP +.I "DEV" +- specifies the devlink device. +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. +.SS devlink health recover - Initiate a recovery operation on a reporter. +This action performs a recovery and increases the recoveries counter on success. +.PP +.I "DEV" +- specifies the devlink device. +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. +.SS devlink health diagnose - Retrieve diagnostics data on a reporter. +.PP +.I "DEV" +- specifies the devlink device. +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. +.SS devlink health objdump show - Display the last saved objdump. +.PD 0 +.P +Devlink health saves a single objdump per reporter. If an objdump is +.P +not already stored by the Devlink, this command will generate a new +.P +objdump. The objdump can be generated either automatically when a +.P +reporter reports on an error or manually at the user's request. +.PD +.PP +.I "DEV" +- specifies the devlink device. +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. +.SS devlink health objdump clear - Delete the saved objdump. +Deleting the saved objdump enables generation of a new objdump on +.PD 0 +.P +the next "devlink health objdump show" command. +.PD +.PP +.I "DEV" +- specifies the devlink device. +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. +.SS devlink health set - Enable the user to configure: +.PD 0 +1) grace_period [msec] - Time interval between auto recoveries.
+.P +2) auto_recover [true/false] - Indicates whether the devlink should execute automatic recovery on error. +.PD +.PP +.I "DEV" +- specifies the devlink device. +.PP +.I "REPORTER" +- specifies the reporter's name registered on the devlink device. +.SH "EXAMPLES" +.PP +devlink health show +.RS 4 +pci/0000:00:09.0: + name TX state healthy #err 1 #recover 1 last_objdump_ts N/A objdump_available false + parameters: + grace_period 600 auto_recover true +.RE +.PP +devlink health recover pci/0000:00:09.0 reporter TX +.RS 4 +Initiate recovery on TX reporter registered on pci/0000:00:09.0. +.RE +.PP +devlink health diagnose pci/0000:00:09.0 reporter TX +.RS 4 +.PD 0 +SQ 0x9b: HW state: 1 stopped: 0 +.P +SQ 0x9f: HW state: 1 stopped: 0 +.P +SQ 0xa3: HW state: 1 stopped: 0 +.P +SQ 0xa7: HW state: 1 stopped: 0 +.P +SQ 0xab: HW state: 1 stopped: 0 +.PD +.RE +.PP +devlink health objdump show pci/0000:00:09.0 reporter TX +.RS 4 +Display the last saved objdump on TX reporter registered on pci/0000:00:09.0. +.RE +.PP +devlink health objdump clear pci/0000:00:09.0 reporter TX +.RS 4 +Delete saved objdump on TX reporter registered on pci/0000:00:09.0. +.RE +.PP +devlink health set pci/0000:00:09.0 reporter TX grace_period 3500 +.RS 4 +Set time interval between auto recoveries to a minimum of 3500 msec on +TX reporter registered on pci/0000:00:09.0. +.RE +.PP +devlink health set pci/0000:00:09.0 reporter TX auto_recover false +.RS 4 +Turn off auto recovery on TX reporter registered on pci/0000:00:09.0.
+.RE +.SH SEE ALSO +.BR devlink (8), +.BR devlink-dev (8), +.BR devlink-port (8), +.BR devlink-region (8), +.br + +.SH AUTHOR +Aya Levin diff --git a/man/man8/devlink.8 b/man/man8/devlink.8 index 8d527e7e1d60..13d4dcd908b3 100644 --- a/man/man8/devlink.8 +++ b/man/man8/devlink.8 @@ -7,7 +7,7 @@ devlink \- Devlink tool .in +8 .ti -8 .B devlink -.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource | region " } { " COMMAND " | " +.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource | region | health " } { " COMMAND " | " .BR help " }" .sp @@ -78,6 +78,10 @@ Turn on verbose output. .B region - devlink address region access +.TP +.B health +- devlink reporting and recovery + .SS .I COMMAND @@ -109,6 +113,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR devlink-sb (8), .BR devlink-resource (8), .BR devlink-region (8), +.BR devlink-health (8), .br .SH REPORTING BUGS -- 2.14.1