From mboxrd@z Thu Jan 1 00:00:00 1970 Content-Type: multipart/mixed; boundary="===============2112443911993127802==" MIME-Version: 1.0 From: Dave Jiang Subject: [Accel-config] Re: [PATCH 1/2] accel-config: Add APIs to get details of last error and associated devices Date: Mon, 12 Jul 2021 13:58:45 -0700 Message-ID: In-Reply-To: BYAPR11MB2535839DBF9F283B04C46F80ED159@BYAPR11MB2535.namprd11.prod.outlook.com To: accel-config@lists.01.org List-ID: --===============2112443911993127802== Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable On 7/12/2021 1:40 PM, Thomas, Ramesh wrote: > On Mon, Jul 12, 2021 at 01:26:52PM -0700, Dave Jiang wrote: >> On 7/9/2021 10:47 AM, ramesh.thomas(a)intel.com wrote: >>> From: Ramesh Thomas >>> >>> Capture a snapshot of error condition containing cmd_status and the >>> associated devices. Provide APIs to return the captured details. Handles >>> both hw and sw error codes stored in sysfs cmd_status field of devices. >>> Following APIs are added: >>> - accfg_ctx_get_last_error; >>> - accfg_ctx_get_last_error_str; >>> - accfg_ctx_get_last_error_device; >>> - accfg_ctx_get_last_error_wq; >>> - accfg_ctx_get_last_error_group; >>> - accfg_ctx_get_last_error_engine; >>> >>> Signed-off-by: Ramesh Thomas >>> --- >>> accfg/lib/libaccel-config.sym | 10 +++ >>> accfg/lib/libaccfg.c | 156 ++++++++++++++++++++++++++++++--= -- >>> accfg/lib/private.h | 9 ++ >>> accfg/libaccel_config.h | 9 +- >>> 4 files changed, 164 insertions(+), 20 deletions(-) >>> >>> diff --git a/accfg/lib/libaccel-config.sym b/accfg/lib/libaccel-config.= sym >>> index aecc98b..679c597 100644 >>> --- a/accfg/lib/libaccel-config.sym >>> +++ b/accfg/lib/libaccel-config.sym >>> @@ -149,3 +149,13 @@ LIBACCFG_9 { >>> global: >>> accfg_device_get_op_cap; >>> } LIBACCFG_8; >>> + >>> +LIBACCFG_10 { >>> +global: >>> + accfg_ctx_get_last_error; >>> + accfg_ctx_get_last_error_str; >>> + accfg_ctx_get_last_error_device; >>> + accfg_ctx_get_last_error_wq; >>> + accfg_ctx_get_last_error_group; >>> + accfg_ctx_get_last_error_engine; >>> +} LIBACCFG_9; >>> diff --git a/accfg/lib/libaccfg.c b/accfg/lib/libaccfg.c >>> index f53751e..00314b0 100644 >>> --- a/accfg/lib/libaccfg.c >>> +++ b/accfg/lib/libaccfg.c >>> @@ -70,6 +70,7 @@ ACCFG_EXPORT char *accfg_mdev_basenames[] =3D { >>> >>> enum { >>> ACCFG_CMD_STATUS_MAX =3D 0x45, >>> + ACCFG_CMD_STATUS_ERROR =3D 0x80010000, >>> }; >>> >>> const char *accfg_device_cmd_status[] =3D { >>> @@ -105,9 +106,43 @@ const char *accfg_device_cmd_status[] =3D { >>> [0x42] =3D "No interrupt handle available", >>> [0x43] =3D "No interrupt handles associated with the index", >>> [0x44] =3D "No revoked handles associted with the index", >>> - [ACCFG_CMD_STATUS_MAX] =3D "", >>> + [ACCFG_CMD_STATUS_MAX] =3D "Unknown error", >>> }; >>> >>> +const char *accfg_sw_cmd_status[] =3D { >>> + [0x01] =3D "Error reading cmd_status", >>> + [0x02] =3D "DMA device registration error", >>> + [0x03] =3D "wq error - group not set", >>> + [0x04] =3D "wq error - name not set", >>> + [0x05] =3D "wq error - no SVM support and needed", >>> + [0x06] =3D "wq error - threshold not set", >>> + [0x07] =3D "wq portal mapping error", >>> + [0x08] =3D "wq resource allocation error", >>> + [0x09] =3D "wq percpu counter allocation error", >>> + [0x0a] =3D "wq DMA channel registration error", >>> + [0x0b] =3D "wq char dev registration error", >>> + [0x0c] =3D "wq error - no shared wq support", >>> + [0x0d] =3D "wq error - no wqs configured", >>> + [0x0e] =3D "wq error - size not set", >>> +}; >>> + >>> +static void save_last_error(struct accfg_device *device, struct accfg_= wq *wq, >>> + struct accfg_group *group, struct accfg_engine *engine) >>> +{ >>> + struct accfg_ctx *ctx =3D device->ctx; >>> + >>> + if (!ctx->error_ctx) >>> + ctx->error_ctx =3D calloc(1, sizeof(struct accfg_error_ctx)= ); >>> + >>> + if (ctx->error_ctx) { >>> + ctx->error_ctx->cmd_status =3D accfg_device_get_cmd_status(= device); >>> + ctx->error_ctx->device =3D device; >>> + ctx->error_ctx->wq =3D wq; >>> + ctx->error_ctx->group =3D group; >>> + ctx->error_ctx->engine =3D engine; >>> + } >>> +} >>> + >>> static inline bool is_mdev_registered(struct accfg_device *device) >>> { >>> return device->mdev_path && !access(device->mdev_path, R_OK); >>> @@ -251,6 +286,7 @@ static void free_context(struct accfg_ctx *ctx) >>> >>> list_for_each_safe(&ctx->devices, device, _b, list) >>> free_device(device, &ctx->devices); >>> + free(ctx->error_ctx); >>> free(ctx); >>> } >>> >>> @@ -1082,6 +1118,7 @@ ACCFG_EXPORT int accfg_create_mdev(struct accfg_d= evice *device, >>> rc =3D sysfs_write_attr(ctx, mdev_path, uuid_str); >>> if (rc < 0) { >>> err(ctx, "create mdev failed %d\n", rc); >>> + save_last_error(device, NULL, NULL, NULL); >>> goto create_err; >>> } >>> >>> @@ -1105,8 +1142,10 @@ static int accfg_device_mdev_remove(struct accfg= _device *device, >>> uuid_unparse(mdev->uuid, uuid_str); >>> sprintf(mdev_path, "%s/%s/remove", device->mdev_path, uuid_str); >>> rc =3D sysfs_write_attr(ctx, mdev_path, "1"); >>> - if (rc < 0) >>> + if (rc < 0) { >>> + save_last_error(device, NULL, NULL, NULL); >>> return rc; >>> + } >>> >>> list_del(&mdev->list); >>> free(mdev); >>> @@ -1355,6 +1394,7 @@ ACCFG_EXPORT enum accfg_device_state accfg_device= _get_state( >>> err(ctx, "%s: sysfs_read_attr failed '%s': %s\n", >>> __func__, device->device_path, >>> strerror(errno)); >>> + save_last_error(device, NULL, NULL, NULL); >>> return ACCFG_DEVICE_UNKNOWN; >>> } >>> >>> @@ -1414,6 +1454,7 @@ ACCFG_EXPORT int accfg_device_get_clients(struct = accfg_device *device) >>> if (sysfs_read_attr(ctx, path, buf) < 0) { >>> err(ctx, "%s: retrieve clients failed '%s': %s\n", >>> __func__, path, strerror(errno)); >>> + save_last_error(device, NULL, NULL, NULL); >>> return -errno; >>> } >>> >>> @@ -1448,6 +1489,7 @@ ACCFG_EXPORT int accfg_device_set_token_limit(str= uct accfg_device >> *dev, int val) >>> if (sysfs_write_attr(ctx, path, buf) < 0) { >>> err(ctx, "%s: write failed: %s\n", >>> accfg_device_get_devname(dev), strerror(er= rno)); >>> + save_last_error(dev, NULL, NULL, NULL); >>> return -errno; >>> } >>> >>> @@ -1476,8 +1518,10 @@ ACCFG_EXPORT int accfg_device_is_active(struct a= ccfg_device *device) >>> return 0; >>> } >>> >>> - if (sysfs_read_attr(ctx, path, buf) < 0) >>> + if (sysfs_read_attr(ctx, path, buf) < 0) { >>> + save_last_error(device, NULL, NULL, NULL); >>> return 0; >>> + } >>> >>> if (strcmp(buf, "enabled") =3D=3D 0) >>> return 1; >>> @@ -1485,17 +1529,16 @@ ACCFG_EXPORT int accfg_device_is_active(struct = accfg_device *device) >>> return 0; >>> } >>> >>> -ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *devi= ce) >>> +ACCFG_EXPORT unsigned int accfg_device_get_cmd_status(struct accfg_dev= ice *device) >>> { >>> struct accfg_ctx *ctx; >>> long status; >>> char *path; >>> int len; >>> char buf[SYSFS_ATTR_SIZE], *end_ptr; >>> - int rc; >>> >>> if (!device) >>> - return -EINVAL; >>> + return ACCFG_CMD_STATUS_ERROR; >>> >>> ctx =3D accfg_device_get_ctx(device); >>> path =3D device->device_buf; >>> @@ -1504,31 +1547,96 @@ ACCFG_EXPORT int accfg_device_get_cmd_status(st= ruct accfg_device >> *device) >>> if (snprintf(path, len, "%s/cmd_status", device->device_path) >=3D= len) { >>> err(ctx, "%s: buffer too small!\n", >>> accfg_device_get_devname(device)); >>> - return -ENOMEM; >>> + return ACCFG_CMD_STATUS_ERROR; >>> } >>> >>> - rc =3D sysfs_read_attr(ctx, path, buf); >>> - if (rc < 0) >>> - return rc; >>> + if (sysfs_read_attr(ctx, path, buf)) >>> + return ACCFG_CMD_STATUS_ERROR; >>> >>> status =3D strtol(buf, &end_ptr, 0); >>> if (errno =3D=3D ERANGE || end_ptr =3D=3D buf) >>> - return -EIO; >>> + return ACCFG_CMD_STATUS_ERROR; >>> >>> - return (int)status; >>> + return (unsigned int)status; >>> } >>> >>> -ACCFG_EXPORT const char * accfg_device_get_cmd_status_str(struct accfg= _device *device) >>> +static const char *get_cmd_status_str(unsigned int status) >>> { >>> - int status; >>> + unsigned int sw_status; >>> >>> - status =3D accfg_device_get_cmd_status(device); >>> - if (status < 0 || status >=3D ACCFG_CMD_STATUS_MAX) >>> - return NULL; >>> + if (status & 0x80000000) { >>> + sw_status =3D (status & ~0x80000000) >> 16; >> >> Magic number. Do I need to add a mask define to the header? > Or add a comment stating device error codes are in the lower 16 bits and > sw error codes are in the higher bits with the highest bit set. Looks like I already defined the mask in the user header: IDXD_SCMD_SOFTERR_MASK I've also added a shift define. Just fetch djiang5/idxd-upstream-next again. --===============2112443911993127802==--