All of lore.kernel.org
 help / color / mirror / Atom feed
* [Accel-config] Re: [PATCH v2 1/3] accel-config: Add APIs to get details of last error and associated devices
@ 2021-07-13 16:53 Thomas, Ramesh
  0 siblings, 0 replies; 3+ messages in thread
From: Thomas, Ramesh @ 2021-07-13 16:53 UTC (permalink / raw)
  To: accel-config

[-- Attachment #1: Type: text/plain, Size: 17319 bytes --]

On Tue, Jul 13, 2021 at 08:32:18AM -0700, Dave Jiang wrote:
> 
> On 7/12/2021 7:39 PM, ramesh.thomas(a)intel.com wrote:
> > From: Ramesh Thomas <ramesh.thomas(a)intel.com>
> >
> > Capture a snapshot of error condition containing cmd_status and the
> > associated devices. Provide APIs to return the captured details. Handles
> > both hw and sw error codes stored in sysfs cmd_status field of devices.
> > Following APIs are added:
> > - accfg_ctx_get_last_error;
> > - accfg_ctx_get_last_error_str;
> > - accfg_ctx_get_last_error_device;
> > - accfg_ctx_get_last_error_wq;
> > - accfg_ctx_get_last_error_group;
> > - accfg_ctx_get_last_error_engine;
> >
> > Signed-off-by: Ramesh Thomas <ramesh.thomas(a)intel.com>
> > ---
> >   accfg/lib/libaccel-config.sym |   6 ++
> >   accfg/lib/libaccfg.c          | 160 ++++++++++++++++++++++++++++++----
> >   accfg/lib/private.h           |   9 ++
> >   accfg/libaccel_config.h       |   9 +-
> >   4 files changed, 164 insertions(+), 20 deletions(-)
> >
> > diff --git a/accfg/lib/libaccel-config.sym b/accfg/lib/libaccel-config.sym
> > index e9b54e2..f44a3ca 100644
> > --- a/accfg/lib/libaccel-config.sym
> > +++ b/accfg/lib/libaccel-config.sym
> > @@ -154,4 +154,10 @@ LIBACCFG_10 {
> >   global:
> >   	accfg_wq_get_ats_disable;
> >   	accfg_wq_set_ats_disable;
> > +	accfg_ctx_get_last_error;
> > +	accfg_ctx_get_last_error_str;
> > +	accfg_ctx_get_last_error_device;
> > +	accfg_ctx_get_last_error_wq;
> > +	accfg_ctx_get_last_error_group;
> > +	accfg_ctx_get_last_error_engine;
> >   } LIBACCFG_9;
> > diff --git a/accfg/lib/libaccfg.c b/accfg/lib/libaccfg.c
> > index bdfcdcf..6dbfbee 100644
> > --- a/accfg/lib/libaccfg.c
> > +++ b/accfg/lib/libaccfg.c
> > @@ -27,6 +27,7 @@
> >   #include <accfg/libaccel_config.h>
> >   #include <fnmatch.h>
> >   #include "private.h"
> > +#include <linux/idxd.h>
> >
> >   #define MDEV_POSTFIX "mdev_supported_types"
> >   #define IDXD_DRIVER_BIND_PATH "/sys/bus/dsa/drivers/idxd"
> > @@ -70,8 +71,12 @@ ACCFG_EXPORT char *accfg_mdev_basenames[] = {
> >
> >   enum {
> >   	ACCFG_CMD_STATUS_MAX = 0x45,
> > +	ACCFG_CMD_STATUS_ERROR = 0x80010000,
> >   };
> >
> > +#define SCMD_STAT(x) (((x) & ~IDXD_SCMD_SOFTERR_MASK) >> \
> > +		IDXD_SCMD_SOFTERR_SHIFT)
> > +
> >   const char *accfg_device_cmd_status[] = {
> >   	[0x0]   = "Successful completion",
> >   	[0x1]	= "Invalid command code",
> > @@ -105,9 +110,43 @@ const char *accfg_device_cmd_status[] = {
> >   	[0x42]	= "No interrupt handle available",
> >   	[0x43]	= "No interrupt handles associated with the index",
> >   	[0x44]	= "No revoked handles associted with the index",
> > -	[ACCFG_CMD_STATUS_MAX]	= "",
> > +	[ACCFG_CMD_STATUS_MAX]	= "Unknown error",
> > +};
> > +
> > +const char *accfg_sw_cmd_status[] = {
> > +	[0x01] = "Error reading cmd_status",
> > +	[SCMD_STAT(IDXD_SCMD_DEV_DMA_ERR)] = "DMA device registration error",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NO_GRP)] = "wq error - group not set",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NO_NAME)] = "wq error - name not set",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NO_SVM)] = "wq error - no SVM support and needed",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NO_THRESH)] = "wq error - threshold not set",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_PORTAL_ERR)] = "wq portal mapping error",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_RES_ALLOC_ERR)] = "wq resource allocation error",
> > +	[SCMD_STAT(IDXD_SCMD_PERCPU_ERR)] = "wq percpu counter allocation error",
> > +	[SCMD_STAT(IDXD_SCMD_DMA_CHAN_ERR)] = "wq DMA channel registration error",
> > +	[SCMD_STAT(IDXD_SCMD_CDEV_ERR)] = "wq char dev registration error",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NO_SWQ_SUPPORT)] = "wq error - no shared wq support",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NONE_CONFIGURED)] = "wq error - no wqs configured",
> > +	[SCMD_STAT(IDXD_SCMD_WQ_NO_SIZE)] = "wq error - size not set",
> 
> I wonder if we should add to some of these as "driver error" at the end
> so the user would understand not config related.

Do we need to report to user if they cannot do anything about it? You
can print driver errors in dmesg.

> 
> Also, I have also overloaded the hardware status such that the driver
> can report the same error to what spec defined except with the software
> flag set. I think we should cover those as well.

Those are included as part of the device errors. Since they are the same
codes as the device codes, the ones defined in the hw table are used by
masking off the high bit.

> 
> 
> >   };
> >
> > +static void save_last_error(struct accfg_device *device, struct accfg_wq *wq,
> > +		struct accfg_group *group, struct accfg_engine *engine)
> > +{
> > +	struct accfg_ctx *ctx = device->ctx;
> > +
> > +	if (!ctx->error_ctx)
> > +		ctx->error_ctx = calloc(1, sizeof(struct accfg_error_ctx));
> > +
> > +	if (ctx->error_ctx) {
> > +		ctx->error_ctx->cmd_status = accfg_device_get_cmd_status(device);
> > +		ctx->error_ctx->device = device;
> > +		ctx->error_ctx->wq = wq;
> > +		ctx->error_ctx->group = group;
> > +		ctx->error_ctx->engine = engine;
> > +	}
> > +}
> > +
> >   static inline bool is_mdev_registered(struct accfg_device *device)
> >   {
> >   	return device->mdev_path && !access(device->mdev_path, R_OK);
> > @@ -251,6 +290,7 @@ static void free_context(struct accfg_ctx *ctx)
> >
> >   	list_for_each_safe(&ctx->devices, device, _b, list)
> >   		free_device(device, &ctx->devices);
> > +	free(ctx->error_ctx);
> >   	free(ctx);
> >   }
> >
> > @@ -1083,6 +1123,7 @@ ACCFG_EXPORT int accfg_create_mdev(struct accfg_device *device,
> >   	rc = sysfs_write_attr(ctx, mdev_path, uuid_str);
> >   	if (rc < 0) {
> >   		err(ctx, "create mdev failed %d\n", rc);
> > +		save_last_error(device, NULL, NULL, NULL);
> >   		goto create_err;
> >   	}
> >
> > @@ -1106,8 +1147,10 @@ static int accfg_device_mdev_remove(struct accfg_device *device,
> >   	uuid_unparse(mdev->uuid, uuid_str);
> >   	sprintf(mdev_path, "%s/%s/remove", device->mdev_path, uuid_str);
> >   	rc = sysfs_write_attr(ctx, mdev_path, "1");
> > -	if (rc < 0)
> > +	if (rc < 0) {
> > +		save_last_error(device, NULL, NULL, NULL);
> >   		return rc;
> > +	}
> >
> >   	list_del(&mdev->list);
> >   	free(mdev);
> > @@ -1356,6 +1399,7 @@ ACCFG_EXPORT enum accfg_device_state accfg_device_get_state(
> >   		err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
> >   				__func__, device->device_path,
> >   				strerror(errno));
> > +		save_last_error(device, NULL, NULL, NULL);
> >   		return ACCFG_DEVICE_UNKNOWN;
> >   	}
> >
> > @@ -1415,6 +1459,7 @@ ACCFG_EXPORT int accfg_device_get_clients(struct accfg_device *device)
> >   	if (sysfs_read_attr(ctx, path, buf) < 0) {
> >   		err(ctx, "%s: retrieve clients failed '%s': %s\n",
> >   				__func__, path, strerror(errno));
> > +		save_last_error(device, NULL, NULL, NULL);
> >   		return -errno;
> >   	}
> >
> > @@ -1449,6 +1494,7 @@ ACCFG_EXPORT int accfg_device_set_token_limit(struct accfg_device
> *dev, int val)
> >   	if (sysfs_write_attr(ctx, path, buf) < 0) {
> >   		err(ctx, "%s: write failed: %s\n",
> >   				accfg_device_get_devname(dev), strerror(errno));
> > +		save_last_error(dev, NULL, NULL, NULL);
> >   		return -errno;
> >   	}
> >
> > @@ -1477,8 +1523,10 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device *device)
> >   		return 0;
> >   	}
> >
> > -	if (sysfs_read_attr(ctx, path, buf) < 0)
> > +	if (sysfs_read_attr(ctx, path, buf) < 0) {
> > +		save_last_error(device, NULL, NULL, NULL);
> >   		return 0;
> > +	}
> >
> >   	if (strcmp(buf, "enabled") == 0)
> >   		return 1;
> > @@ -1486,17 +1534,16 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device *device)
> >   	return 0;
> >   }
> >
> > -ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
> > +ACCFG_EXPORT unsigned int accfg_device_get_cmd_status(struct accfg_device *device)
> >   {
> >   	struct accfg_ctx *ctx;
> >   	long status;
> >   	char *path;
> >   	int len;
> >   	char buf[SYSFS_ATTR_SIZE], *end_ptr;
> > -	int rc;
> >
> >   	if (!device)
> > -		return -EINVAL;
> > +		return ACCFG_CMD_STATUS_ERROR;
> >
> >   	ctx = accfg_device_get_ctx(device);
> >   	path = device->device_buf;
> > @@ -1505,31 +1552,96 @@ ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device
> *device)
> >   	if (snprintf(path, len, "%s/cmd_status", device->device_path) >= len) {
> >   		err(ctx, "%s: buffer too small!\n",
> >   				accfg_device_get_devname(device));
> > -		return -ENOMEM;
> > +		return ACCFG_CMD_STATUS_ERROR;
> >   	}
> >
> > -	rc = sysfs_read_attr(ctx, path, buf);
> > -	if (rc < 0)
> > -		return rc;
> > +	if (sysfs_read_attr(ctx, path, buf))
> > +		return ACCFG_CMD_STATUS_ERROR;
> >
> >   	status = strtol(buf, &end_ptr, 0);
> >   	if (errno == ERANGE || end_ptr == buf)
> > -		return -EIO;
> > +		return ACCFG_CMD_STATUS_ERROR;
> >
> > -	return (int)status;
> > +	return (unsigned int)status;
> >   }
> >
> > -ACCFG_EXPORT const char * accfg_device_get_cmd_status_str(struct accfg_device *device)
> > +static const char *get_cmd_status_str(unsigned int status)
> >   {
> > -	int status;
> > +	unsigned int sw_status;
> >
> > -	status = accfg_device_get_cmd_status(device);
> > -	if (status < 0 || status >= ACCFG_CMD_STATUS_MAX)
> > -		return NULL;
> > +	if (status & IDXD_SCMD_SOFTERR_MASK) {
> > +		sw_status = SCMD_STAT(status);
> > +		if (sw_status) {
> > +			if (sw_status >= (sizeof(accfg_sw_cmd_status) /
> > +						sizeof(char *))) {
> > +				status = ACCFG_CMD_STATUS_MAX;
> > +				goto ext;
> > +			}
> > +			return accfg_sw_cmd_status[sw_status];
> > +		}
> > +		status &= ~IDXD_SCMD_SOFTERR_MASK;
> > +	}
> >
> > +	if (status > ACCFG_CMD_STATUS_MAX)
> > +		status = ACCFG_CMD_STATUS_MAX;
> > +
> > +ext:
> >   	return accfg_device_cmd_status[status];
> >   }
> >
> > +ACCFG_EXPORT const char *accfg_device_get_cmd_status_str(struct accfg_device *device)
> > +{
> > +	return get_cmd_status_str(accfg_device_get_cmd_status(device));
> > +}
> > +
> > +ACCFG_EXPORT unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx)
> > +{
> > +	if (ctx->error_ctx)
> > +		return ctx->error_ctx->cmd_status;
> > +
> > +	return 0;
> > +}
> > +
> > +ACCFG_EXPORT const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx)
> > +{
> > +	return get_cmd_status_str(accfg_ctx_get_last_error(ctx));
> > +}
> > +
> > +ACCFG_EXPORT struct accfg_device *accfg_ctx_get_last_error_device(
> > +		struct accfg_ctx *ctx)
> > +{
> > +	if (ctx->error_ctx)
> > +		return ctx->error_ctx->device;
> > +
> > +	return NULL;
> > +}
> > +
> > +ACCFG_EXPORT struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx)
> > +{
> > +	if (ctx->error_ctx)
> > +		return ctx->error_ctx->wq;
> > +
> > +	return NULL;
> > +}
> > +
> > +ACCFG_EXPORT struct accfg_group *accfg_ctx_get_last_error_group(
> > +		struct accfg_ctx *ctx)
> > +{
> > +	if (ctx->error_ctx)
> > +		return ctx->error_ctx->group;
> > +
> > +	return NULL;
> > +}
> > +
> > +ACCFG_EXPORT struct accfg_engine *accfg_ctx_get_last_error_engine(
> > +		struct accfg_ctx *ctx)
> > +{
> > +	if (ctx->error_ctx)
> > +		return ctx->error_ctx->engine;
> > +
> > +	return NULL;
> > +}
> > +
> >   /* Helper function to validate device type in the defined device array based on
> >    * device name */
> >   ACCFG_EXPORT int accfg_device_type_validate(const char *dev_name)
> > @@ -1633,6 +1745,7 @@ static int accfg_device_control(struct accfg_device *device,
> >   		rc = sysfs_write_attr(ctx, path, accfg_device_get_devname(device));
> >   		free(path);
> >   		if (rc < 0) {
> > +			save_last_error(device, NULL, NULL, NULL);
> >   			return rc;
> >   		}
> >   	}
> > @@ -1722,8 +1835,10 @@ ACCFG_EXPORT uint64_t accfg_group_get_available_size(
> >   		return ULLONG_MAX;
> >   	}
> >
> > -	if (sysfs_read_attr(ctx, path, buf) < 0)
> > +	if (sysfs_read_attr(ctx, path, buf) < 0) {
> > +		save_last_error(group->device, NULL, group, NULL);
> >   		return ULLONG_MAX;
> > +	}
> >
> >   	return strtoull(buf, NULL, 0);
> >   }
> > @@ -1749,6 +1864,7 @@ ACCFG_EXPORT int accfg_group_set_##field( \
> >   		err(ctx, "%s: write failed: %s\n", \
> >   				accfg_group_get_devname(group), \
> >   				strerror(errno)); \
> > +		save_last_error(group->device, NULL, group, NULL); \
> >   		return -errno; \
> >   	} \
> >   	group->field = val; \
> > @@ -1907,6 +2023,7 @@ ACCFG_EXPORT int accfg_wq_get_clients(struct accfg_wq *wq)
> >   	if (sysfs_read_attr(ctx, path, buf) < 0) {
> >   		err(ctx, "%s: retrieve clients failed: '%s': %s\n",
> >   				__func__, wq->wq_path, strerror(errno));
> > +		save_last_error(wq->device, wq, NULL, NULL);
> >   		return -errno;
> >   	}
> >
> > @@ -1994,6 +2111,7 @@ static int accfg_wq_control(struct accfg_wq *wq, enum
> accfg_control_flag flag,
> >   	if (path) {
> >   		rc = sysfs_write_attr(ctx, path, wq_name);
> >   		if (rc < 0) {
> > +			save_last_error(wq->device, wq, NULL, NULL);
> >   			free(path);
> >   			return rc;
> >   		}
> > @@ -2038,6 +2156,7 @@ ACCFG_EXPORT enum accfg_wq_state accfg_wq_get_state(struct accfg_wq
> *wq)
> >   	if (sysfs_read_attr(ctx, path, read_state) < 0) {
> >   		err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
> >   				__func__, wq->wq_path, strerror(errno));
> > +		save_last_error(wq->device, wq, NULL, NULL);
> >   		return ACCFG_WQ_UNKNOWN;
> >   	}
> >
> > @@ -2163,6 +2282,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
> >   		err(ctx, "%s: write failed: %s\n", \
> >   				accfg_wq_get_devname(wq), \
> >   				strerror(errno)); \
> > +		save_last_error(wq->device, wq, NULL, NULL); \
> >   		return -errno; \
> >   	} \
> >   	wq->field = val; \
> > @@ -2202,6 +2322,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
> >   		err(ctx, "%s: write failed: %s\n", \
> >   				accfg_wq_get_devname(wq), \
> >   				strerror(errno)); \
> > +		save_last_error(wq->device, wq, NULL, NULL); \
> >   		return -errno; \
> >   	} \
> >   	wq->field = val; \
> > @@ -2237,6 +2358,7 @@ ACCFG_EXPORT int accfg_wq_set_str_##field( \
> >   		err(ctx, "%s: write failed: %s\n", \
> >   				accfg_wq_get_devname(wq), \
> >   				strerror(errno)); \
> > +		save_last_error(wq->device, wq, NULL, NULL); \
> >   		return -errno; \
> >   	} \
> >   	if (wq->field) \
> > @@ -2275,6 +2397,7 @@ ACCFG_EXPORT int accfg_wq_set_str_type(struct accfg_wq *wq, const char
> *val)
> >   		err(ctx, "%s: write failed: %s\n",
> >   				accfg_wq_get_devname(wq),
> >   				strerror(errno));
> > +		save_last_error(wq->device, wq, NULL, NULL);
> >   		return -errno;
> >   	}
> >
> > @@ -2384,6 +2507,7 @@ ACCFG_EXPORT int accfg_engine_set_##field( \
> >   		err(ctx, "%s: write failed: %s\n", \
> >   				accfg_engine_get_devname(engine), \
> >   				strerror(errno)); \
> > +		save_last_error(engine->device, NULL, NULL, engine); \
> >   		return -errno; \
> >   	} \
> >   	engine->field = val; \
> > diff --git a/accfg/lib/private.h b/accfg/lib/private.h
> > index b28fd40..57aa1ed 100644
> > --- a/accfg/lib/private.h
> > +++ b/accfg/lib/private.h
> > @@ -131,6 +131,14 @@ struct accfg_wq {
> >
> >   #define ACCFG_EXPORT __attribute__ ((visibility("default")))
> >
> > +struct accfg_error_ctx {
> > +	unsigned int cmd_status;
> > +	struct accfg_device *device;
> > +	struct accfg_group *group;
> > +	struct accfg_wq *wq;
> > +	struct accfg_engine *engine;
> > +};
> > +
> >   /**
> >    * struct accfg_ctx - library user context to find device instances
> >    *
> > @@ -149,6 +157,7 @@ struct accfg_ctx {
> >   	uint64_t timeout;
> >   	void *private_data;
> >   	bool compat;
> > +	struct accfg_error_ctx *error_ctx;
> >   };
> >
> >   static inline int check_udev(struct udev *udev)
> > diff --git a/accfg/libaccel_config.h b/accfg/libaccel_config.h
> > index 8dfd6d9..a74571c 100644
> > --- a/accfg/libaccel_config.h
> > +++ b/accfg/libaccel_config.h
> > @@ -189,9 +189,14 @@ unsigned int accfg_device_get_version(struct accfg_device *device);
> >   int accfg_device_get_clients(struct accfg_device *device);
> >   int accfg_device_set_token_limit(struct accfg_device *dev, int val);
> >   int accfg_device_is_active(struct accfg_device *device);
> > -int accfg_device_get_cmd_status(struct accfg_device *device);
> > +unsigned int accfg_device_get_cmd_status(struct accfg_device *device);
> >   const char *accfg_device_get_cmd_status_str(struct accfg_device *device);
> > -
> > +unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx);
> > +const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx);
> > +struct accfg_device *accfg_ctx_get_last_error_device(struct accfg_ctx *ctx);
> > +struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx);
> > +struct accfg_group *accfg_ctx_get_last_error_group(struct accfg_ctx *ctx);
> > +struct accfg_engine *accfg_ctx_get_last_error_engine(struct accfg_ctx *ctx);
> >   struct accfg_device_mdev;
> >   struct accfg_device_mdev *accfg_device_first_mdev(struct accfg_device *device);
> >   struct accfg_device_mdev *accfg_device_next_mdev(struct accfg_device_mdev *mdev);


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [Accel-config] Re: [PATCH v2 1/3] accel-config: Add APIs to get details of last error and associated devices
@ 2021-07-13 17:29 Dave Jiang
  0 siblings, 0 replies; 3+ messages in thread
From: Dave Jiang @ 2021-07-13 17:29 UTC (permalink / raw)
  To: accel-config

[-- Attachment #1: Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: attachment.htm --]
[-- Type: text/html, Size: 19633 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [Accel-config] Re: [PATCH v2 1/3] accel-config: Add APIs to get details of last error and associated devices
@ 2021-07-13 15:32 Dave Jiang
  0 siblings, 0 replies; 3+ messages in thread
From: Dave Jiang @ 2021-07-13 15:32 UTC (permalink / raw)
  To: accel-config

[-- Attachment #1: Type: text/plain, Size: 16149 bytes --]


On 7/12/2021 7:39 PM, ramesh.thomas(a)intel.com wrote:
> From: Ramesh Thomas <ramesh.thomas(a)intel.com>
>
> Capture a snapshot of error condition containing cmd_status and the
> associated devices. Provide APIs to return the captured details. Handles
> both hw and sw error codes stored in sysfs cmd_status field of devices.
> Following APIs are added:
> - accfg_ctx_get_last_error;
> - accfg_ctx_get_last_error_str;
> - accfg_ctx_get_last_error_device;
> - accfg_ctx_get_last_error_wq;
> - accfg_ctx_get_last_error_group;
> - accfg_ctx_get_last_error_engine;
>
> Signed-off-by: Ramesh Thomas <ramesh.thomas(a)intel.com>
> ---
>   accfg/lib/libaccel-config.sym |   6 ++
>   accfg/lib/libaccfg.c          | 160 ++++++++++++++++++++++++++++++----
>   accfg/lib/private.h           |   9 ++
>   accfg/libaccel_config.h       |   9 +-
>   4 files changed, 164 insertions(+), 20 deletions(-)
>
> diff --git a/accfg/lib/libaccel-config.sym b/accfg/lib/libaccel-config.sym
> index e9b54e2..f44a3ca 100644
> --- a/accfg/lib/libaccel-config.sym
> +++ b/accfg/lib/libaccel-config.sym
> @@ -154,4 +154,10 @@ LIBACCFG_10 {
>   global:
>   	accfg_wq_get_ats_disable;
>   	accfg_wq_set_ats_disable;
> +	accfg_ctx_get_last_error;
> +	accfg_ctx_get_last_error_str;
> +	accfg_ctx_get_last_error_device;
> +	accfg_ctx_get_last_error_wq;
> +	accfg_ctx_get_last_error_group;
> +	accfg_ctx_get_last_error_engine;
>   } LIBACCFG_9;
> diff --git a/accfg/lib/libaccfg.c b/accfg/lib/libaccfg.c
> index bdfcdcf..6dbfbee 100644
> --- a/accfg/lib/libaccfg.c
> +++ b/accfg/lib/libaccfg.c
> @@ -27,6 +27,7 @@
>   #include <accfg/libaccel_config.h>
>   #include <fnmatch.h>
>   #include "private.h"
> +#include <linux/idxd.h>
>   
>   #define MDEV_POSTFIX "mdev_supported_types"
>   #define IDXD_DRIVER_BIND_PATH "/sys/bus/dsa/drivers/idxd"
> @@ -70,8 +71,12 @@ ACCFG_EXPORT char *accfg_mdev_basenames[] = {
>   
>   enum {
>   	ACCFG_CMD_STATUS_MAX = 0x45,
> +	ACCFG_CMD_STATUS_ERROR = 0x80010000,
>   };
>   
> +#define SCMD_STAT(x) (((x) & ~IDXD_SCMD_SOFTERR_MASK) >> \
> +		IDXD_SCMD_SOFTERR_SHIFT)
> +
>   const char *accfg_device_cmd_status[] = {
>   	[0x0]   = "Successful completion",
>   	[0x1]	= "Invalid command code",
> @@ -105,9 +110,43 @@ const char *accfg_device_cmd_status[] = {
>   	[0x42]	= "No interrupt handle available",
>   	[0x43]	= "No interrupt handles associated with the index",
>   	[0x44]	= "No revoked handles associted with the index",
> -	[ACCFG_CMD_STATUS_MAX]	= "",
> +	[ACCFG_CMD_STATUS_MAX]	= "Unknown error",
> +};
> +
> +const char *accfg_sw_cmd_status[] = {
> +	[0x01] = "Error reading cmd_status",
> +	[SCMD_STAT(IDXD_SCMD_DEV_DMA_ERR)] = "DMA device registration error",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NO_GRP)] = "wq error - group not set",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NO_NAME)] = "wq error - name not set",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NO_SVM)] = "wq error - no SVM support and needed",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NO_THRESH)] = "wq error - threshold not set",
> +	[SCMD_STAT(IDXD_SCMD_WQ_PORTAL_ERR)] = "wq portal mapping error",
> +	[SCMD_STAT(IDXD_SCMD_WQ_RES_ALLOC_ERR)] = "wq resource allocation error",
> +	[SCMD_STAT(IDXD_SCMD_PERCPU_ERR)] = "wq percpu counter allocation error",
> +	[SCMD_STAT(IDXD_SCMD_DMA_CHAN_ERR)] = "wq DMA channel registration error",
> +	[SCMD_STAT(IDXD_SCMD_CDEV_ERR)] = "wq char dev registration error",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NO_SWQ_SUPPORT)] = "wq error - no shared wq support",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NONE_CONFIGURED)] = "wq error - no wqs configured",
> +	[SCMD_STAT(IDXD_SCMD_WQ_NO_SIZE)] = "wq error - size not set",

I wonder if we should add to some of these as "driver error" at the end 
so the user would understand not config related.

Also, I have also overloaded the hardware status such that the driver 
can report the same error to what spec defined except with the software 
flag set. I think we should cover those as well.


>   };
>   
> +static void save_last_error(struct accfg_device *device, struct accfg_wq *wq,
> +		struct accfg_group *group, struct accfg_engine *engine)
> +{
> +	struct accfg_ctx *ctx = device->ctx;
> +
> +	if (!ctx->error_ctx)
> +		ctx->error_ctx = calloc(1, sizeof(struct accfg_error_ctx));
> +
> +	if (ctx->error_ctx) {
> +		ctx->error_ctx->cmd_status = accfg_device_get_cmd_status(device);
> +		ctx->error_ctx->device = device;
> +		ctx->error_ctx->wq = wq;
> +		ctx->error_ctx->group = group;
> +		ctx->error_ctx->engine = engine;
> +	}
> +}
> +
>   static inline bool is_mdev_registered(struct accfg_device *device)
>   {
>   	return device->mdev_path && !access(device->mdev_path, R_OK);
> @@ -251,6 +290,7 @@ static void free_context(struct accfg_ctx *ctx)
>   
>   	list_for_each_safe(&ctx->devices, device, _b, list)
>   		free_device(device, &ctx->devices);
> +	free(ctx->error_ctx);
>   	free(ctx);
>   }
>   
> @@ -1083,6 +1123,7 @@ ACCFG_EXPORT int accfg_create_mdev(struct accfg_device *device,
>   	rc = sysfs_write_attr(ctx, mdev_path, uuid_str);
>   	if (rc < 0) {
>   		err(ctx, "create mdev failed %d\n", rc);
> +		save_last_error(device, NULL, NULL, NULL);
>   		goto create_err;
>   	}
>   
> @@ -1106,8 +1147,10 @@ static int accfg_device_mdev_remove(struct accfg_device *device,
>   	uuid_unparse(mdev->uuid, uuid_str);
>   	sprintf(mdev_path, "%s/%s/remove", device->mdev_path, uuid_str);
>   	rc = sysfs_write_attr(ctx, mdev_path, "1");
> -	if (rc < 0)
> +	if (rc < 0) {
> +		save_last_error(device, NULL, NULL, NULL);
>   		return rc;
> +	}
>   
>   	list_del(&mdev->list);
>   	free(mdev);
> @@ -1356,6 +1399,7 @@ ACCFG_EXPORT enum accfg_device_state accfg_device_get_state(
>   		err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
>   				__func__, device->device_path,
>   				strerror(errno));
> +		save_last_error(device, NULL, NULL, NULL);
>   		return ACCFG_DEVICE_UNKNOWN;
>   	}
>   
> @@ -1415,6 +1459,7 @@ ACCFG_EXPORT int accfg_device_get_clients(struct accfg_device *device)
>   	if (sysfs_read_attr(ctx, path, buf) < 0) {
>   		err(ctx, "%s: retrieve clients failed '%s': %s\n",
>   				__func__, path, strerror(errno));
> +		save_last_error(device, NULL, NULL, NULL);
>   		return -errno;
>   	}
>   
> @@ -1449,6 +1494,7 @@ ACCFG_EXPORT int accfg_device_set_token_limit(struct accfg_device *dev, int val)
>   	if (sysfs_write_attr(ctx, path, buf) < 0) {
>   		err(ctx, "%s: write failed: %s\n",
>   				accfg_device_get_devname(dev), strerror(errno));
> +		save_last_error(dev, NULL, NULL, NULL);
>   		return -errno;
>   	}
>   
> @@ -1477,8 +1523,10 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device *device)
>   		return 0;
>   	}
>   
> -	if (sysfs_read_attr(ctx, path, buf) < 0)
> +	if (sysfs_read_attr(ctx, path, buf) < 0) {
> +		save_last_error(device, NULL, NULL, NULL);
>   		return 0;
> +	}
>   
>   	if (strcmp(buf, "enabled") == 0)
>   		return 1;
> @@ -1486,17 +1534,16 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device *device)
>   	return 0;
>   }
>   
> -ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
> +ACCFG_EXPORT unsigned int accfg_device_get_cmd_status(struct accfg_device *device)
>   {
>   	struct accfg_ctx *ctx;
>   	long status;
>   	char *path;
>   	int len;
>   	char buf[SYSFS_ATTR_SIZE], *end_ptr;
> -	int rc;
>   
>   	if (!device)
> -		return -EINVAL;
> +		return ACCFG_CMD_STATUS_ERROR;
>   
>   	ctx = accfg_device_get_ctx(device);
>   	path = device->device_buf;
> @@ -1505,31 +1552,96 @@ ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
>   	if (snprintf(path, len, "%s/cmd_status", device->device_path) >= len) {
>   		err(ctx, "%s: buffer too small!\n",
>   				accfg_device_get_devname(device));
> -		return -ENOMEM;
> +		return ACCFG_CMD_STATUS_ERROR;
>   	}
>   
> -	rc = sysfs_read_attr(ctx, path, buf);
> -	if (rc < 0)
> -		return rc;
> +	if (sysfs_read_attr(ctx, path, buf))
> +		return ACCFG_CMD_STATUS_ERROR;
>   
>   	status = strtol(buf, &end_ptr, 0);
>   	if (errno == ERANGE || end_ptr == buf)
> -		return -EIO;
> +		return ACCFG_CMD_STATUS_ERROR;
>   
> -	return (int)status;
> +	return (unsigned int)status;
>   }
>   
> -ACCFG_EXPORT const char * accfg_device_get_cmd_status_str(struct accfg_device *device)
> +static const char *get_cmd_status_str(unsigned int status)
>   {
> -	int status;
> +	unsigned int sw_status;
>   
> -	status = accfg_device_get_cmd_status(device);
> -	if (status < 0 || status >= ACCFG_CMD_STATUS_MAX)
> -		return NULL;
> +	if (status & IDXD_SCMD_SOFTERR_MASK) {
> +		sw_status = SCMD_STAT(status);
> +		if (sw_status) {
> +			if (sw_status >= (sizeof(accfg_sw_cmd_status) /
> +						sizeof(char *))) {
> +				status = ACCFG_CMD_STATUS_MAX;
> +				goto ext;
> +			}
> +			return accfg_sw_cmd_status[sw_status];
> +		}
> +		status &= ~IDXD_SCMD_SOFTERR_MASK;
> +	}
>   
> +	if (status > ACCFG_CMD_STATUS_MAX)
> +		status = ACCFG_CMD_STATUS_MAX;
> +
> +ext:
>   	return accfg_device_cmd_status[status];
>   }
>   
> +ACCFG_EXPORT const char *accfg_device_get_cmd_status_str(struct accfg_device *device)
> +{
> +	return get_cmd_status_str(accfg_device_get_cmd_status(device));
> +}
> +
> +ACCFG_EXPORT unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx)
> +{
> +	if (ctx->error_ctx)
> +		return ctx->error_ctx->cmd_status;
> +
> +	return 0;
> +}
> +
> +ACCFG_EXPORT const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx)
> +{
> +	return get_cmd_status_str(accfg_ctx_get_last_error(ctx));
> +}
> +
> +ACCFG_EXPORT struct accfg_device *accfg_ctx_get_last_error_device(
> +		struct accfg_ctx *ctx)
> +{
> +	if (ctx->error_ctx)
> +		return ctx->error_ctx->device;
> +
> +	return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx)
> +{
> +	if (ctx->error_ctx)
> +		return ctx->error_ctx->wq;
> +
> +	return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_group *accfg_ctx_get_last_error_group(
> +		struct accfg_ctx *ctx)
> +{
> +	if (ctx->error_ctx)
> +		return ctx->error_ctx->group;
> +
> +	return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_engine *accfg_ctx_get_last_error_engine(
> +		struct accfg_ctx *ctx)
> +{
> +	if (ctx->error_ctx)
> +		return ctx->error_ctx->engine;
> +
> +	return NULL;
> +}
> +
>   /* Helper function to validate device type in the defined device array based on
>    * device name */
>   ACCFG_EXPORT int accfg_device_type_validate(const char *dev_name)
> @@ -1633,6 +1745,7 @@ static int accfg_device_control(struct accfg_device *device,
>   		rc = sysfs_write_attr(ctx, path, accfg_device_get_devname(device));
>   		free(path);
>   		if (rc < 0) {
> +			save_last_error(device, NULL, NULL, NULL);
>   			return rc;
>   		}
>   	}
> @@ -1722,8 +1835,10 @@ ACCFG_EXPORT uint64_t accfg_group_get_available_size(
>   		return ULLONG_MAX;
>   	}
>   
> -	if (sysfs_read_attr(ctx, path, buf) < 0)
> +	if (sysfs_read_attr(ctx, path, buf) < 0) {
> +		save_last_error(group->device, NULL, group, NULL);
>   		return ULLONG_MAX;
> +	}
>   
>   	return strtoull(buf, NULL, 0);
>   }
> @@ -1749,6 +1864,7 @@ ACCFG_EXPORT int accfg_group_set_##field( \
>   		err(ctx, "%s: write failed: %s\n", \
>   				accfg_group_get_devname(group), \
>   				strerror(errno)); \
> +		save_last_error(group->device, NULL, group, NULL); \
>   		return -errno; \
>   	} \
>   	group->field = val; \
> @@ -1907,6 +2023,7 @@ ACCFG_EXPORT int accfg_wq_get_clients(struct accfg_wq *wq)
>   	if (sysfs_read_attr(ctx, path, buf) < 0) {
>   		err(ctx, "%s: retrieve clients failed: '%s': %s\n",
>   				__func__, wq->wq_path, strerror(errno));
> +		save_last_error(wq->device, wq, NULL, NULL);
>   		return -errno;
>   	}
>   
> @@ -1994,6 +2111,7 @@ static int accfg_wq_control(struct accfg_wq *wq, enum accfg_control_flag flag,
>   	if (path) {
>   		rc = sysfs_write_attr(ctx, path, wq_name);
>   		if (rc < 0) {
> +			save_last_error(wq->device, wq, NULL, NULL);
>   			free(path);
>   			return rc;
>   		}
> @@ -2038,6 +2156,7 @@ ACCFG_EXPORT enum accfg_wq_state accfg_wq_get_state(struct accfg_wq *wq)
>   	if (sysfs_read_attr(ctx, path, read_state) < 0) {
>   		err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
>   				__func__, wq->wq_path, strerror(errno));
> +		save_last_error(wq->device, wq, NULL, NULL);
>   		return ACCFG_WQ_UNKNOWN;
>   	}
>   
> @@ -2163,6 +2282,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
>   		err(ctx, "%s: write failed: %s\n", \
>   				accfg_wq_get_devname(wq), \
>   				strerror(errno)); \
> +		save_last_error(wq->device, wq, NULL, NULL); \
>   		return -errno; \
>   	} \
>   	wq->field = val; \
> @@ -2202,6 +2322,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
>   		err(ctx, "%s: write failed: %s\n", \
>   				accfg_wq_get_devname(wq), \
>   				strerror(errno)); \
> +		save_last_error(wq->device, wq, NULL, NULL); \
>   		return -errno; \
>   	} \
>   	wq->field = val; \
> @@ -2237,6 +2358,7 @@ ACCFG_EXPORT int accfg_wq_set_str_##field( \
>   		err(ctx, "%s: write failed: %s\n", \
>   				accfg_wq_get_devname(wq), \
>   				strerror(errno)); \
> +		save_last_error(wq->device, wq, NULL, NULL); \
>   		return -errno; \
>   	} \
>   	if (wq->field) \
> @@ -2275,6 +2397,7 @@ ACCFG_EXPORT int accfg_wq_set_str_type(struct accfg_wq *wq, const char *val)
>   		err(ctx, "%s: write failed: %s\n",
>   				accfg_wq_get_devname(wq),
>   				strerror(errno));
> +		save_last_error(wq->device, wq, NULL, NULL);
>   		return -errno;
>   	}
>   
> @@ -2384,6 +2507,7 @@ ACCFG_EXPORT int accfg_engine_set_##field( \
>   		err(ctx, "%s: write failed: %s\n", \
>   				accfg_engine_get_devname(engine), \
>   				strerror(errno)); \
> +		save_last_error(engine->device, NULL, NULL, engine); \
>   		return -errno; \
>   	} \
>   	engine->field = val; \
> diff --git a/accfg/lib/private.h b/accfg/lib/private.h
> index b28fd40..57aa1ed 100644
> --- a/accfg/lib/private.h
> +++ b/accfg/lib/private.h
> @@ -131,6 +131,14 @@ struct accfg_wq {
>   
>   #define ACCFG_EXPORT __attribute__ ((visibility("default")))
>   
> +struct accfg_error_ctx {
> +	unsigned int cmd_status;
> +	struct accfg_device *device;
> +	struct accfg_group *group;
> +	struct accfg_wq *wq;
> +	struct accfg_engine *engine;
> +};
> +
>   /**
>    * struct accfg_ctx - library user context to find device instances
>    *
> @@ -149,6 +157,7 @@ struct accfg_ctx {
>   	uint64_t timeout;
>   	void *private_data;
>   	bool compat;
> +	struct accfg_error_ctx *error_ctx;
>   };
>   
>   static inline int check_udev(struct udev *udev)
> diff --git a/accfg/libaccel_config.h b/accfg/libaccel_config.h
> index 8dfd6d9..a74571c 100644
> --- a/accfg/libaccel_config.h
> +++ b/accfg/libaccel_config.h
> @@ -189,9 +189,14 @@ unsigned int accfg_device_get_version(struct accfg_device *device);
>   int accfg_device_get_clients(struct accfg_device *device);
>   int accfg_device_set_token_limit(struct accfg_device *dev, int val);
>   int accfg_device_is_active(struct accfg_device *device);
> -int accfg_device_get_cmd_status(struct accfg_device *device);
> +unsigned int accfg_device_get_cmd_status(struct accfg_device *device);
>   const char *accfg_device_get_cmd_status_str(struct accfg_device *device);
> -
> +unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx);
> +const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx);
> +struct accfg_device *accfg_ctx_get_last_error_device(struct accfg_ctx *ctx);
> +struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx);
> +struct accfg_group *accfg_ctx_get_last_error_group(struct accfg_ctx *ctx);
> +struct accfg_engine *accfg_ctx_get_last_error_engine(struct accfg_ctx *ctx);
>   struct accfg_device_mdev;
>   struct accfg_device_mdev *accfg_device_first_mdev(struct accfg_device *device);
>   struct accfg_device_mdev *accfg_device_next_mdev(struct accfg_device_mdev *mdev);

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-07-13 17:29 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-13 16:53 [Accel-config] Re: [PATCH v2 1/3] accel-config: Add APIs to get details of last error and associated devices Thomas, Ramesh
  -- strict thread matches above, loose matches on Subject: below --
2021-07-13 17:29 Dave Jiang
2021-07-13 15:32 Dave Jiang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.