All of lore.kernel.org
 help / color / mirror / Atom feed
* [Accel-config] [PATCH v3 1/3] accel-config: Add APIs to get details of last error and associated devices
@ 2021-07-14 17:17 ramesh.thomas
  0 siblings, 0 replies; only message in thread
From: ramesh.thomas @ 2021-07-14 17:17 UTC (permalink / raw)
  To: accel-config

[-- Attachment #1: Type: text/plain, Size: 14846 bytes --]

From: Ramesh Thomas <ramesh.thomas(a)intel.com>

Capture a snapshot of error condition containing cmd_status and the
associated devices. Provide APIs to return the captured details. Handles
both hw and sw error codes stored in sysfs cmd_status field of devices.
Following APIs are added:
- accfg_ctx_get_last_error;
- accfg_ctx_get_last_error_str;
- accfg_ctx_get_last_error_device;
- accfg_ctx_get_last_error_wq;
- accfg_ctx_get_last_error_group;
- accfg_ctx_get_last_error_engine;

Signed-off-by: Ramesh Thomas <ramesh.thomas(a)intel.com>
---
 accfg/lib/libaccel-config.sym |   6 ++
 accfg/lib/libaccfg.c          | 160 ++++++++++++++++++++++++++++++----
 accfg/lib/private.h           |   9 ++
 accfg/libaccel_config.h       |   9 +-
 4 files changed, 164 insertions(+), 20 deletions(-)

diff --git a/accfg/lib/libaccel-config.sym b/accfg/lib/libaccel-config.sym
index e9b54e2..f44a3ca 100644
--- a/accfg/lib/libaccel-config.sym
+++ b/accfg/lib/libaccel-config.sym
@@ -154,4 +154,10 @@ LIBACCFG_10 {
 global:
 	accfg_wq_get_ats_disable;
 	accfg_wq_set_ats_disable;
+	accfg_ctx_get_last_error;
+	accfg_ctx_get_last_error_str;
+	accfg_ctx_get_last_error_device;
+	accfg_ctx_get_last_error_wq;
+	accfg_ctx_get_last_error_group;
+	accfg_ctx_get_last_error_engine;
 } LIBACCFG_9;
diff --git a/accfg/lib/libaccfg.c b/accfg/lib/libaccfg.c
index bdfcdcf..03dfc92 100644
--- a/accfg/lib/libaccfg.c
+++ b/accfg/lib/libaccfg.c
@@ -27,6 +27,7 @@
 #include <accfg/libaccel_config.h>
 #include <fnmatch.h>
 #include "private.h"
+#include <linux/idxd.h>
 
 #define MDEV_POSTFIX "mdev_supported_types"
 #define IDXD_DRIVER_BIND_PATH "/sys/bus/dsa/drivers/idxd"
@@ -70,8 +71,12 @@ ACCFG_EXPORT char *accfg_mdev_basenames[] = {
 
 enum {
 	ACCFG_CMD_STATUS_MAX = 0x45,
+	ACCFG_CMD_STATUS_ERROR = 0x80010000,
 };
 
+#define SCMD_STAT(x) (((x) & ~IDXD_SCMD_SOFTERR_MASK) >> \
+		IDXD_SCMD_SOFTERR_SHIFT)
+
 const char *accfg_device_cmd_status[] = {
 	[0x0]   = "Successful completion",
 	[0x1]	= "Invalid command code",
@@ -105,9 +110,43 @@ const char *accfg_device_cmd_status[] = {
 	[0x42]	= "No interrupt handle available",
 	[0x43]	= "No interrupt handles associated with the index",
 	[0x44]	= "No revoked handles associted with the index",
-	[ACCFG_CMD_STATUS_MAX]	= "",
+	[ACCFG_CMD_STATUS_MAX]	= "Unknown error",
+};
+
+const char *accfg_sw_cmd_status[] = {
+	[0x01] = "Error reading cmd_status (library error)",
+	[SCMD_STAT(IDXD_SCMD_DEV_DMA_ERR)] = "DMA device registration error (driver error)",
+	[SCMD_STAT(IDXD_SCMD_WQ_NO_GRP)] = "wq error - group not set",
+	[SCMD_STAT(IDXD_SCMD_WQ_NO_NAME)] = "wq error - name not set",
+	[SCMD_STAT(IDXD_SCMD_WQ_NO_SVM)] = "wq error - no SVM support and needed (driver error)",
+	[SCMD_STAT(IDXD_SCMD_WQ_NO_THRESH)] = "wq error - threshold not set",
+	[SCMD_STAT(IDXD_SCMD_WQ_PORTAL_ERR)] = "wq portal mapping error (driver error)",
+	[SCMD_STAT(IDXD_SCMD_WQ_RES_ALLOC_ERR)] = "wq resource allocation error (driver error)",
+	[SCMD_STAT(IDXD_SCMD_PERCPU_ERR)] = "wq percpu counter allocation error (driver error)",
+	[SCMD_STAT(IDXD_SCMD_DMA_CHAN_ERR)] = "wq DMA channel registration error (driver error)",
+	[SCMD_STAT(IDXD_SCMD_CDEV_ERR)] = "wq char dev registration error (driver error)",
+	[SCMD_STAT(IDXD_SCMD_WQ_NO_SWQ_SUPPORT)] = "wq error - no shared wq support (driver error)",
+	[SCMD_STAT(IDXD_SCMD_WQ_NONE_CONFIGURED)] = "wq error - no wqs configured",
+	[SCMD_STAT(IDXD_SCMD_WQ_NO_SIZE)] = "wq error - size not set",
 };
 
+static void save_last_error(struct accfg_device *device, struct accfg_wq *wq,
+		struct accfg_group *group, struct accfg_engine *engine)
+{
+	struct accfg_ctx *ctx = device->ctx;
+
+	if (!ctx->error_ctx)
+		ctx->error_ctx = calloc(1, sizeof(struct accfg_error_ctx));
+
+	if (ctx->error_ctx) {
+		ctx->error_ctx->cmd_status = accfg_device_get_cmd_status(device);
+		ctx->error_ctx->device = device;
+		ctx->error_ctx->wq = wq;
+		ctx->error_ctx->group = group;
+		ctx->error_ctx->engine = engine;
+	}
+}
+
 static inline bool is_mdev_registered(struct accfg_device *device)
 {
 	return device->mdev_path && !access(device->mdev_path, R_OK);
@@ -251,6 +290,7 @@ static void free_context(struct accfg_ctx *ctx)
 
 	list_for_each_safe(&ctx->devices, device, _b, list)
 		free_device(device, &ctx->devices);
+	free(ctx->error_ctx);
 	free(ctx);
 }
 
@@ -1083,6 +1123,7 @@ ACCFG_EXPORT int accfg_create_mdev(struct accfg_device *device,
 	rc = sysfs_write_attr(ctx, mdev_path, uuid_str);
 	if (rc < 0) {
 		err(ctx, "create mdev failed %d\n", rc);
+		save_last_error(device, NULL, NULL, NULL);
 		goto create_err;
 	}
 
@@ -1106,8 +1147,10 @@ static int accfg_device_mdev_remove(struct accfg_device *device,
 	uuid_unparse(mdev->uuid, uuid_str);
 	sprintf(mdev_path, "%s/%s/remove", device->mdev_path, uuid_str);
 	rc = sysfs_write_attr(ctx, mdev_path, "1");
-	if (rc < 0)
+	if (rc < 0) {
+		save_last_error(device, NULL, NULL, NULL);
 		return rc;
+	}
 
 	list_del(&mdev->list);
 	free(mdev);
@@ -1356,6 +1399,7 @@ ACCFG_EXPORT enum accfg_device_state accfg_device_get_state(
 		err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
 				__func__, device->device_path,
 				strerror(errno));
+		save_last_error(device, NULL, NULL, NULL);
 		return ACCFG_DEVICE_UNKNOWN;
 	}
 
@@ -1415,6 +1459,7 @@ ACCFG_EXPORT int accfg_device_get_clients(struct accfg_device *device)
 	if (sysfs_read_attr(ctx, path, buf) < 0) {
 		err(ctx, "%s: retrieve clients failed '%s': %s\n",
 				__func__, path, strerror(errno));
+		save_last_error(device, NULL, NULL, NULL);
 		return -errno;
 	}
 
@@ -1449,6 +1494,7 @@ ACCFG_EXPORT int accfg_device_set_token_limit(struct accfg_device *dev, int val)
 	if (sysfs_write_attr(ctx, path, buf) < 0) {
 		err(ctx, "%s: write failed: %s\n",
 				accfg_device_get_devname(dev), strerror(errno));
+		save_last_error(dev, NULL, NULL, NULL);
 		return -errno;
 	}
 
@@ -1477,8 +1523,10 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device *device)
 		return 0;
 	}
 
-	if (sysfs_read_attr(ctx, path, buf) < 0)
+	if (sysfs_read_attr(ctx, path, buf) < 0) {
+		save_last_error(device, NULL, NULL, NULL);
 		return 0;
+	}
 
 	if (strcmp(buf, "enabled") == 0)
 		return 1;
@@ -1486,17 +1534,16 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device *device)
 	return 0;
 }
 
-ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
+ACCFG_EXPORT unsigned int accfg_device_get_cmd_status(struct accfg_device *device)
 {
 	struct accfg_ctx *ctx;
 	long status;
 	char *path;
 	int len;
 	char buf[SYSFS_ATTR_SIZE], *end_ptr;
-	int rc;
 
 	if (!device)
-		return -EINVAL;
+		return ACCFG_CMD_STATUS_ERROR;
 
 	ctx = accfg_device_get_ctx(device);
 	path = device->device_buf;
@@ -1505,31 +1552,96 @@ ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
 	if (snprintf(path, len, "%s/cmd_status", device->device_path) >= len) {
 		err(ctx, "%s: buffer too small!\n",
 				accfg_device_get_devname(device));
-		return -ENOMEM;
+		return ACCFG_CMD_STATUS_ERROR;
 	}
 
-	rc = sysfs_read_attr(ctx, path, buf);
-	if (rc < 0)
-		return rc;
+	if (sysfs_read_attr(ctx, path, buf))
+		return ACCFG_CMD_STATUS_ERROR;
 
 	status = strtol(buf, &end_ptr, 0);
 	if (errno == ERANGE || end_ptr == buf)
-		return -EIO;
+		return ACCFG_CMD_STATUS_ERROR;
 
-	return (int)status;
+	return (unsigned int)status;
 }
 
-ACCFG_EXPORT const char * accfg_device_get_cmd_status_str(struct accfg_device *device)
+static const char *get_cmd_status_str(unsigned int status)
 {
-	int status;
+	unsigned int sw_status;
 
-	status = accfg_device_get_cmd_status(device);
-	if (status < 0 || status >= ACCFG_CMD_STATUS_MAX)
-		return NULL;
+	if (status & IDXD_SCMD_SOFTERR_MASK) {
+		sw_status = SCMD_STAT(status);
+		if (sw_status) {
+			if (sw_status >= (sizeof(accfg_sw_cmd_status) /
+						sizeof(char *))) {
+				status = ACCFG_CMD_STATUS_MAX;
+				goto ext;
+			}
+			return accfg_sw_cmd_status[sw_status];
+		}
+		status &= ~IDXD_SCMD_SOFTERR_MASK;
+	}
 
+	if (status > ACCFG_CMD_STATUS_MAX)
+		status = ACCFG_CMD_STATUS_MAX;
+
+ext:
 	return accfg_device_cmd_status[status];
 }
 
+ACCFG_EXPORT const char *accfg_device_get_cmd_status_str(struct accfg_device *device)
+{
+	return get_cmd_status_str(accfg_device_get_cmd_status(device));
+}
+
+ACCFG_EXPORT unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx)
+{
+	if (ctx->error_ctx)
+		return ctx->error_ctx->cmd_status;
+
+	return 0;
+}
+
+ACCFG_EXPORT const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx)
+{
+	return get_cmd_status_str(accfg_ctx_get_last_error(ctx));
+}
+
+ACCFG_EXPORT struct accfg_device *accfg_ctx_get_last_error_device(
+		struct accfg_ctx *ctx)
+{
+	if (ctx->error_ctx)
+		return ctx->error_ctx->device;
+
+	return NULL;
+}
+
+ACCFG_EXPORT struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx)
+{
+	if (ctx->error_ctx)
+		return ctx->error_ctx->wq;
+
+	return NULL;
+}
+
+ACCFG_EXPORT struct accfg_group *accfg_ctx_get_last_error_group(
+		struct accfg_ctx *ctx)
+{
+	if (ctx->error_ctx)
+		return ctx->error_ctx->group;
+
+	return NULL;
+}
+
+ACCFG_EXPORT struct accfg_engine *accfg_ctx_get_last_error_engine(
+		struct accfg_ctx *ctx)
+{
+	if (ctx->error_ctx)
+		return ctx->error_ctx->engine;
+
+	return NULL;
+}
+
 /* Helper function to validate device type in the defined device array based on
  * device name */
 ACCFG_EXPORT int accfg_device_type_validate(const char *dev_name)
@@ -1633,6 +1745,7 @@ static int accfg_device_control(struct accfg_device *device,
 		rc = sysfs_write_attr(ctx, path, accfg_device_get_devname(device));
 		free(path);
 		if (rc < 0) {
+			save_last_error(device, NULL, NULL, NULL);
 			return rc;
 		}
 	}
@@ -1722,8 +1835,10 @@ ACCFG_EXPORT uint64_t accfg_group_get_available_size(
 		return ULLONG_MAX;
 	}
 
-	if (sysfs_read_attr(ctx, path, buf) < 0)
+	if (sysfs_read_attr(ctx, path, buf) < 0) {
+		save_last_error(group->device, NULL, group, NULL);
 		return ULLONG_MAX;
+	}
 
 	return strtoull(buf, NULL, 0);
 }
@@ -1749,6 +1864,7 @@ ACCFG_EXPORT int accfg_group_set_##field( \
 		err(ctx, "%s: write failed: %s\n", \
 				accfg_group_get_devname(group), \
 				strerror(errno)); \
+		save_last_error(group->device, NULL, group, NULL); \
 		return -errno; \
 	} \
 	group->field = val; \
@@ -1907,6 +2023,7 @@ ACCFG_EXPORT int accfg_wq_get_clients(struct accfg_wq *wq)
 	if (sysfs_read_attr(ctx, path, buf) < 0) {
 		err(ctx, "%s: retrieve clients failed: '%s': %s\n",
 				__func__, wq->wq_path, strerror(errno));
+		save_last_error(wq->device, wq, NULL, NULL);
 		return -errno;
 	}
 
@@ -1994,6 +2111,7 @@ static int accfg_wq_control(struct accfg_wq *wq, enum accfg_control_flag flag,
 	if (path) {
 		rc = sysfs_write_attr(ctx, path, wq_name);
 		if (rc < 0) {
+			save_last_error(wq->device, wq, NULL, NULL);
 			free(path);
 			return rc;
 		}
@@ -2038,6 +2156,7 @@ ACCFG_EXPORT enum accfg_wq_state accfg_wq_get_state(struct accfg_wq *wq)
 	if (sysfs_read_attr(ctx, path, read_state) < 0) {
 		err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
 				__func__, wq->wq_path, strerror(errno));
+		save_last_error(wq->device, wq, NULL, NULL);
 		return ACCFG_WQ_UNKNOWN;
 	}
 
@@ -2163,6 +2282,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
 		err(ctx, "%s: write failed: %s\n", \
 				accfg_wq_get_devname(wq), \
 				strerror(errno)); \
+		save_last_error(wq->device, wq, NULL, NULL); \
 		return -errno; \
 	} \
 	wq->field = val; \
@@ -2202,6 +2322,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
 		err(ctx, "%s: write failed: %s\n", \
 				accfg_wq_get_devname(wq), \
 				strerror(errno)); \
+		save_last_error(wq->device, wq, NULL, NULL); \
 		return -errno; \
 	} \
 	wq->field = val; \
@@ -2237,6 +2358,7 @@ ACCFG_EXPORT int accfg_wq_set_str_##field( \
 		err(ctx, "%s: write failed: %s\n", \
 				accfg_wq_get_devname(wq), \
 				strerror(errno)); \
+		save_last_error(wq->device, wq, NULL, NULL); \
 		return -errno; \
 	} \
 	if (wq->field) \
@@ -2275,6 +2397,7 @@ ACCFG_EXPORT int accfg_wq_set_str_type(struct accfg_wq *wq, const char *val)
 		err(ctx, "%s: write failed: %s\n",
 				accfg_wq_get_devname(wq),
 				strerror(errno));
+		save_last_error(wq->device, wq, NULL, NULL);
 		return -errno;
 	}
 
@@ -2384,6 +2507,7 @@ ACCFG_EXPORT int accfg_engine_set_##field( \
 		err(ctx, "%s: write failed: %s\n", \
 				accfg_engine_get_devname(engine), \
 				strerror(errno)); \
+		save_last_error(engine->device, NULL, NULL, engine); \
 		return -errno; \
 	} \
 	engine->field = val; \
diff --git a/accfg/lib/private.h b/accfg/lib/private.h
index b28fd40..57aa1ed 100644
--- a/accfg/lib/private.h
+++ b/accfg/lib/private.h
@@ -131,6 +131,14 @@ struct accfg_wq {
 
 #define ACCFG_EXPORT __attribute__ ((visibility("default")))
 
+struct accfg_error_ctx {
+	unsigned int cmd_status;
+	struct accfg_device *device;
+	struct accfg_group *group;
+	struct accfg_wq *wq;
+	struct accfg_engine *engine;
+};
+
 /**
  * struct accfg_ctx - library user context to find device instances
  *
@@ -149,6 +157,7 @@ struct accfg_ctx {
 	uint64_t timeout;
 	void *private_data;
 	bool compat;
+	struct accfg_error_ctx *error_ctx;
 };
 
 static inline int check_udev(struct udev *udev)
diff --git a/accfg/libaccel_config.h b/accfg/libaccel_config.h
index c0b1f16..f8b9bf9 100644
--- a/accfg/libaccel_config.h
+++ b/accfg/libaccel_config.h
@@ -194,9 +194,14 @@ unsigned int accfg_device_get_version(struct accfg_device *device);
 int accfg_device_get_clients(struct accfg_device *device);
 int accfg_device_set_token_limit(struct accfg_device *dev, int val);
 int accfg_device_is_active(struct accfg_device *device);
-int accfg_device_get_cmd_status(struct accfg_device *device);
+unsigned int accfg_device_get_cmd_status(struct accfg_device *device);
 const char *accfg_device_get_cmd_status_str(struct accfg_device *device);
-
+unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx);
+const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx);
+struct accfg_device *accfg_ctx_get_last_error_device(struct accfg_ctx *ctx);
+struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx);
+struct accfg_group *accfg_ctx_get_last_error_group(struct accfg_ctx *ctx);
+struct accfg_engine *accfg_ctx_get_last_error_engine(struct accfg_ctx *ctx);
 struct accfg_device_mdev;
 struct accfg_device_mdev *accfg_device_first_mdev(struct accfg_device *device);
 struct accfg_device_mdev *accfg_device_next_mdev(struct accfg_device_mdev *mdev);
-- 
2.26.3

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2021-07-14 17:17 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-14 17:17 [Accel-config] [PATCH v3 1/3] accel-config: Add APIs to get details of last error and associated devices ramesh.thomas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.