All of lore.kernel.org
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] dmadev: introduce DMA device library
@ 2021-07-02 13:18 Chengwen Feng
  2021-07-02 13:59 ` Bruce Richardson
                   ` (29 more replies)
  0 siblings, 30 replies; 339+ messages in thread
From: Chengwen Feng @ 2021-07-02 13:18 UTC (permalink / raw)
  To: thomas, ferruh.yigit, bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

This patch introduces 'dmadevice' which is a generic type of DMA
device.

The APIs of the dmadev library expose some generic operations which can
enable configuration and I/O with the DMA devices.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 MAINTAINERS                  |   4 +
 config/rte_config.h          |   3 +
 lib/dmadev/meson.build       |   6 +
 lib/dmadev/rte_dmadev.c      | 438 +++++++++++++++++++++
 lib/dmadev/rte_dmadev.h      | 919 +++++++++++++++++++++++++++++++++++++++++++
 lib/dmadev/rte_dmadev_core.h |  98 +++++
 lib/dmadev/rte_dmadev_pmd.h  | 210 ++++++++++
 lib/dmadev/version.map       |  32 ++
 lib/meson.build              |   1 +
 9 files changed, 1711 insertions(+)
 create mode 100644 lib/dmadev/meson.build
 create mode 100644 lib/dmadev/rte_dmadev.c
 create mode 100644 lib/dmadev/rte_dmadev.h
 create mode 100644 lib/dmadev/rte_dmadev_core.h
 create mode 100644 lib/dmadev/rte_dmadev_pmd.h
 create mode 100644 lib/dmadev/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 4347555..2019783 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
 F: app/test/test_rawdev.c
 F: doc/guides/prog_guide/rawdev.rst
 
+Dma device API
+M: Chengwen Feng <fengchengwen@huawei.com>
+F: lib/dmadev/
+
 
 Memory Pool Drivers
 -------------------
diff --git a/config/rte_config.h b/config/rte_config.h
index 590903c..331a431 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -81,6 +81,9 @@
 /* rawdev defines */
 #define RTE_RAWDEV_MAX_DEVS 64
 
+/* dmadev defines */
+#define RTE_DMADEV_MAX_DEVS 64
+
 /* ip_fragmentation defines */
 #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
 #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
new file mode 100644
index 0000000..c918dae
--- /dev/null
+++ b/lib/dmadev/meson.build
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited.
+
+sources = files('rte_dmadev.c')
+headers = files('rte_dmadev.h', 'rte_dmadev_pmd.h')
+indirect_headers += files('rte_dmadev_core.h')
diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
new file mode 100644
index 0000000..a94e839
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.c
@@ -0,0 +1,438 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_dev.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_string_fns.h>
+
+#include "rte_dmadev.h"
+#include "rte_dmadev_pmd.h"
+
+struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
+
+uint16_t
+rte_dmadev_count(void)
+{
+	uint16_t count = 0;
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (rte_dmadevices[i].attached)
+			count++;
+	}
+
+	return count;
+}
+
+int
+rte_dmadev_get_dev_id(const char *name)
+{
+	uint16_t i;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
+		if ((strcmp(rte_dmadevices[i].name, name) == 0) &&
+		    (rte_dmadevices[i].attached == RTE_DMADEV_ATTACHED))
+			return i;
+
+	return -ENODEV;
+}
+
+int
+rte_dmadev_socket_id(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	return dev->socket_id;
+}
+
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
+
+	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
+	diag = (*dev->dev_ops->dev_info_get)(dev, dev_info);
+	if (diag != 0)
+		return diag;
+
+	dev_info->device = dev->device;
+	dev_info->driver_name = dev->driver_name;
+	dev_info->socket_id = dev->socket_id;
+
+	return 0;
+}
+
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
+
+	if (dev->started) {
+		RTE_DMADEV_LOG(ERR,
+		   "device %u must be stopped to allow configuration", dev_id);
+		return -EBUSY;
+	}
+
+	diag = (*dev->dev_ops->dev_configure)(dev, dev_conf);
+	if (diag != 0)
+		RTE_DMADEV_LOG(ERR, "device %u dev_configure failed, ret = %d",
+			       dev_id, diag);
+	else
+		dev->attached = 1;
+
+	return diag;
+}
+
+int
+rte_dmadev_start(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+	if (dev->started != 0) {
+		RTE_DMADEV_LOG(ERR, "device %u already started", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_start == NULL)
+		goto mark_started;
+
+	diag = (*dev->dev_ops->dev_start)(dev);
+	if (diag != 0)
+		return diag;
+
+mark_started:
+	dev->started = 1;
+	return 0;
+}
+
+int
+rte_dmadev_stop(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev->started == 0) {
+		RTE_DMADEV_LOG(ERR, "device %u already stopped", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_stop == NULL)
+		goto mark_stopped;
+
+	diag = (*dev->dev_ops->dev_stop)(dev);
+	if (diag != 0)
+		return diag;
+
+mark_stopped:
+	dev->started = 0;
+	return 0;
+}
+
+int
+rte_dmadev_close(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
+
+	/* Device must be stopped before it can be closed */
+	if (dev->started == 1) {
+		RTE_DMADEV_LOG(ERR, "device %u must be stopped before closing",
+			       dev_id);
+		return -EBUSY;
+	}
+
+	return (*dev->dev_ops->dev_close)(dev);
+}
+
+int
+rte_dmadev_reset(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
+
+	/* Reset is not dependent on state of the device */
+	return (*dev->dev_ops->dev_reset)(dev);
+}
+
+int
+rte_dmadev_queue_setup(uint16_t dev_id,
+		       const struct rte_dmadev_queue_conf *conf)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -ENOTSUP);
+
+	return (*dev->dev_ops->queue_setup)(dev, conf);
+}
+
+int
+rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP);
+
+	return (*dev->dev_ops->queue_release)(dev, vq_id);
+}
+
+int
+rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
+			  struct rte_dmadev_queue_info *info)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(info, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_info_get, -ENOTSUP);
+
+	memset(info, 0, sizeof(struct rte_dmadev_queue_info));
+	return (*dev->dev_ops->queue_info_get)(dev, vq_id, info);
+}
+
+int
+rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
+		     struct rte_dmadev_stats *stats)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
+
+	return (*dev->dev_ops->stats_get)(dev, vq_id, stats);
+}
+
+int
+rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
+
+	return (*dev->dev_ops->stats_reset)(dev, vq_id);
+}
+
+static int
+xstats_get_count(uint16_t dev_id)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
+
+	return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
+}
+
+int
+rte_dmadev_xstats_names_get(uint16_t dev_id,
+			    struct rte_dmadev_xstats_name *xstats_names,
+			    uint32_t size)
+{
+	struct rte_dmadev *dev;
+	int cnt_expected_entries;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	cnt_expected_entries = xstats_get_count(dev_id);
+
+	if (xstats_names == NULL || cnt_expected_entries < 0 ||
+	    (int)size < cnt_expected_entries || size == 0)
+		return cnt_expected_entries;
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
+	return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
+}
+
+int
+rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
+		      uint64_t values[], uint32_t n)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -ENOTSUP);
+
+	return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
+}
+
+int
+rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -ENOTSUP);
+
+	return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
+}
+
+int
+rte_dmadev_selftest(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
+
+	return (*dev->dev_ops->dev_selftest)(dev_id);
+}
+
+static inline uint16_t
+rte_dmadev_find_free_device_index(void)
+{
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (rte_dmadevices[i].attached == RTE_DMADEV_DETACHED)
+			return i;
+	}
+
+	return RTE_DMADEV_MAX_DEVS;
+}
+
+/*
+ * Claim a free slot in rte_dmadevices[] for a new DMA device.
+ *
+ * name must be non-NULL, not already used by an attached device, and short
+ * enough to be stored without truncation (a truncated name would no longer
+ * match the full name in the duplicate check, allowing the same name to be
+ * allocated twice). When dev_priv_size is non-zero, that many zeroed bytes
+ * of driver private data are allocated on socket_id.
+ *
+ * Returns the device slot, marked attached and stopped, or NULL on an
+ * invalid or duplicate name, when no slot is free, or when the private
+ * data allocation fails.
+ */
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id)
+{
+	struct rte_dmadev *dev;
+	uint16_t dev_id;
+
+	/* rte_dmadev_get_dev_id() reports NULL as an error (< 0), which the
+	 * duplicate check below would treat as "not found"; reject it here
+	 * before it can reach strlcpy().
+	 */
+	if (name == NULL) {
+		RTE_DMADEV_LOG(ERR, "device name must not be NULL");
+		return NULL;
+	}
+
+	if (strlen(name) >= RTE_DMADEV_NAME_MAX_LEN) {
+		RTE_DMADEV_LOG(ERR, "device name %s is too long", name);
+		return NULL;
+	}
+
+	if (rte_dmadev_get_dev_id(name) >= 0) {
+		RTE_DMADEV_LOG(ERR,
+			"device with name %s already allocated!", name);
+		return NULL;
+	}
+
+	dev_id = rte_dmadev_find_free_device_index();
+	if (dev_id == RTE_DMADEV_MAX_DEVS) {
+		RTE_DMADEV_LOG(ERR, "reached maximum number of DMA devices");
+		return NULL;
+	}
+
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev_priv_size > 0) {
+		dev->dev_private = rte_zmalloc_socket("dmadev private",
+				     dev_priv_size,
+				     RTE_CACHE_LINE_SIZE,
+				     socket_id);
+		if (dev->dev_private == NULL) {
+			RTE_DMADEV_LOG(ERR,
+				"unable to allocate memory for dmadev");
+			return NULL;
+		}
+	}
+
+	dev->dev_id = dev_id;
+	dev->socket_id = socket_id;
+	dev->started = 0;
+	strlcpy(dev->name, name, RTE_DMADEV_NAME_MAX_LEN);
+
+	/* Publish the slot last, once every other field is initialised. */
+	dev->attached = RTE_DMADEV_ATTACHED;
+
+	return dev;
+}
+
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev)
+{
+	int ret;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	ret = rte_dmadev_close(dev->dev_id);
+	if (ret != 0)
+		return ret;
+
+	if (dev->dev_private != NULL)
+		rte_free(dev->dev_private);
+
+	memset(dev, 0, sizeof(struct rte_dmadev));
+	dev->attached = RTE_DMADEV_DETACHED;
+
+	return 0;
+}
+
+RTE_LOG_REGISTER(libdmadev_logtype, lib.dmadev, INFO);
diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
new file mode 100644
index 0000000..f74fc6a
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.h
@@ -0,0 +1,919 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_H_
+#define _RTE_DMADEV_H_
+
+/**
+ * @file rte_dmadev.h
+ *
+ * RTE DMA (Direct Memory Access) device APIs.
+ *
+ * The generic DMA device diagram:
+ *
+ *            ------------     ------------
+ *            | HW-queue |     | HW-queue |
+ *            ------------     ------------
+ *                   \            /
+ *                    \          /
+ *                     \        /
+ *                  ----------------
+ *                  |dma-controller|
+ *                  ----------------
+ *
+ *   The DMA could have multiple HW-queues, each HW-queue could have multiple
+ *   capabilities, e.g. whether to support fill operation, supported DMA
+ *   transfer direction, etc.
+ *
+ * The DMA framework is built on the following abstraction model:
+ *
+ *     ------------    ------------
+ *     |virt-queue|    |virt-queue|
+ *     ------------    ------------
+ *            \           /
+ *             \         /
+ *              \       /
+ *            ------------     ------------
+ *            | HW-queue |     | HW-queue |
+ *            ------------     ------------
+ *                   \            /
+ *                    \          /
+ *                     \        /
+ *                     ----------
+ *                     | dmadev |
+ *                     ----------
+ *
+ *   a) The DMA operation request must be submitted to the virt queue, virt
+ *      queues must be created based on HW queues, the DMA device could have
+ *      multiple HW queues.
+ *   b) The virt queues on the same HW-queue could represent different contexts,
+ *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
+ *      transfer scenario, and create virt-queue-1 on the same HW-queue for
+ *      mem-to-dev transfer scenario.
+ *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
+ *         scenario as long as the corresponding driver supports it.
+ *
+ * The control plane APIs include configure/queue_setup/queue_release/start/
+ * stop/reset/close, in order to start device work, the call sequence must be
+ * as follows:
+ *     - rte_dmadev_configure()
+ *     - rte_dmadev_queue_setup()
+ *     - rte_dmadev_start()
+ *
+ * The dataplane APIs include two parts:
+ *   a) The first part is the submission of operation requests:
+ *        - rte_dmadev_copy()
+ *        - rte_dmadev_copy_sg() - scatter-gather form of copy
+ *        - rte_dmadev_fill()
+ *        - rte_dmadev_fill_sg() - scatter-gather form of fill
+ *        - rte_dmadev_fence()   - add a fence to force operation ordering
+ *        - rte_dmadev_perform() - issue doorbell to hardware
+ *      These APIs could work with different virt queues which have different
+ *      contexts.
+ *      The first four APIs are used to submit the operation request to the virt
+ *      queue, if the submission is successful, a cookie (as type
+ *      'dma_cookie_t') is returned, otherwise a negative number is returned.
+ *   b) The second part is to obtain the result of requests:
+ *        - rte_dmadev_completed()
+ *            - return the number of operation requests completed successfully.
+ *        - rte_dmadev_completed_fails()
+ *            - return the number of operation requests failed to complete.
+ *
+ * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
+ * information query and self-test capabilities.
+ *
+ * Regarding MT-safety of the dataplane APIs, there are two dimensions:
+ *   a) For one virt queue, the submit/completion API could be MT-safe,
+ *      e.g. one thread does the submit operation, another thread does the
+ *      completion operation.
+ *      If the driver supports it, it declares RTE_DMA_DEV_CAPA_MT_VQ.
+ *      If the driver doesn't support it, it's up to the application to
+ *      guarantee MT-safety.
+ *   b) For multiple virt queues on the same HW queue, e.g. one thread operates
+ *      on virt-queue-0, another thread operates on virt-queue-1.
+ *      If the driver supports it, it declares RTE_DMA_DEV_CAPA_MT_MVQ.
+ *      If the driver doesn't support it, it's up to the application to
+ *      guarantee MT-safety.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+#include <rte_compat.h>
+
+/**
+ * dma_cookie_t - an opaque DMA cookie
+ *
+ * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's an
+ * error code.
+ * When using cookies, comply with the following rules:
+ * a) Cookies for each virtual queue are independent.
+ * b) For a virt queue, the cookie is monotonically incremented; when it
+ *    reaches INT_MAX, it wraps back to zero.
+ * c) The initial cookie of a virt queue is zero, after the device is stopped or
+ *    reset, the virt queue's cookie needs to be reset to zero.
+ * Example:
+ *    step-1: start one dmadev
+ *    step-2: enqueue a copy operation, the cookie return is 0
+ *    step-3: enqueue a copy operation again, the cookie return is 1
+ *    ...
+ *    step-101: stop the dmadev
+ *    step-102: start the dmadev
+ *    step-103: enqueue a copy operation, the cookie return is 0
+ *    ...
+ */
+typedef int32_t dma_cookie_t;
+
+/**
+ * dma_scatterlist - can hold scatter DMA operation request
+ */
+struct dma_scatterlist {
+	void *src;
+	void *dst;
+	uint32_t length;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the total number of DMA devices that have been successfully
+ * initialised.
+ *
+ * @return
+ *   The total number of usable DMA devices.
+ */
+__rte_experimental
+uint16_t
+rte_dmadev_count(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the device identifier for the named DMA device.
+ *
+ * @param name
+ *   DMA device name to select the DMA device identifier.
+ *
+ * @return
+ *   Returns DMA device identifier on success.
+ *   - <0: Failure to find named DMA device.
+ */
+__rte_experimental
+int
+rte_dmadev_get_dev_id(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Return the NUMA socket to which a device is connected.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   The NUMA socket id to which the device is connected or
+ *   a default of zero if the socket could not be determined.
+ *   - -EINVAL: dev_id value is out of range.
+ */
+__rte_experimental
+int
+rte_dmadev_socket_id(uint16_t dev_id);
+
+/**
+ * The capabilities of a DMA device
+ */
+#define RTE_DMA_DEV_CAPA_M2M	(1ull << 0) /**< Support mem-to-mem transfer */
+#define RTE_DMA_DEV_CAPA_M2D	(1ull << 1) /**< Support mem-to-dev transfer */
+#define RTE_DMA_DEV_CAPA_D2M	(1ull << 2) /**< Support dev-to-mem transfer */
+#define RTE_DMA_DEV_CAPA_D2D	(1ull << 3) /**< Support dev-to-dev transfer */
+#define RTE_DMA_DEV_CAPA_COPY	(1ull << 4) /**< Support copy ops */
+#define RTE_DMA_DEV_CAPA_FILL	(1ull << 5) /**< Support fill ops */
+#define RTE_DMA_DEV_CAPA_SG	(1ull << 6) /**< Support scatter-gather ops */
+#define RTE_DMA_DEV_CAPA_FENCE	(1ull << 7) /**< Support fence ops */
+#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
+#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
+#define RTE_DMA_DEV_CAPA_MT_VQ	(1ull << 10) /**< Support MT-safe of one virt queue */
+#define RTE_DMA_DEV_CAPA_MT_MVQ	(1ull << 11) /**< Support MT-safe of multiple virt queues */
+
+/**
+ * A structure used to retrieve the contextual information of
+ * a DMA device
+ */
+struct rte_dmadev_info {
+	/**
+	 * Fields filled by framework
+	 */
+	struct rte_device *device; /**< Generic Device information */
+	const char *driver_name; /**< Device driver name */
+	int socket_id; /**< Socket ID where memory is allocated */
+
+	/**
+	 * Specification fields filled by driver
+	 */
+	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
+	uint16_t max_hw_queues; /**< Maximum number of HW queues. */
+	uint16_t max_vqs_per_hw_queue;
+	/**< Maximum number of virt queues to allocate per HW queue */
+	uint16_t max_desc;
+	/**< Maximum allowed number of virt queue descriptors */
+	uint16_t min_desc;
+	/**< Minimum allowed number of virt queue descriptors */
+
+	/**
+	 * Status fields filled by driver
+	 */
+	uint16_t nb_hw_queues; /**< Number of HW queues configured */
+	uint16_t nb_vqs; /**< Number of virt queues configured */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve the contextual information of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @param[out] dev_info
+ *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
+ *   contextual information of the device.
+ * @return
+ *   - =0: Success, driver updates the contextual information of the DMA device
+ *   - <0: Error code returned by the driver info get function.
+ *
+ */
+__rte_experimental
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
+
+/**
+ * dma_address_type
+ */
+enum dma_address_type {
+	DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
+	DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
+};
+
+/**
+ * A structure used to configure a DMA device.
+ */
+struct rte_dmadev_conf {
+	enum dma_address_type addr_type; /**< Address type to use */
+	uint16_t nb_hw_queues; /**< Number of HW queues to enable */
+	uint16_t max_vqs; /**< Maximum number of virt queues to use */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Configure a DMA device.
+ *
+ * This function must be invoked first before any other function in the
+ * API. This function can also be re-invoked when a device is in the
+ * stopped state.
+ *
+ * The caller may use rte_dmadev_info_get() to get the capability of each
+ * resources available for this DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device to configure.
+ * @param dev_conf
+ *   The DMA device configuration structure encapsulated into rte_dmadev_conf
+ *   object.
+ *
+ * @return
+ *   - =0: Success, device configured.
+ *   - <0: Error code returned by the driver configuration function.
+ */
+__rte_experimental
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Start a DMA device.
+ *
+ * The device start step is the last one and consists of setting the DMA
+ * to start accepting jobs.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device started.
+ *   - <0: Error code returned by the driver start function.
+ */
+__rte_experimental
+int
+rte_dmadev_start(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Stop a DMA device.
+ *
+ * The device can be restarted with a call to rte_dmadev_start()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device stopped.
+ *   - <0: Error code returned by the driver stop function.
+ */
+__rte_experimental
+int
+rte_dmadev_stop(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Close a DMA device.
+ *
+ * The device cannot be restarted after this call.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *  - =0: Successfully closing device
+ *  - <0: Failure to close device
+ */
+__rte_experimental
+int
+rte_dmadev_close(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset a DMA device.
+ *
+ * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
+ * sense similar to hard or soft reset.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Successful reset device.
+ *   - <0: Failure to reset device.
+ *   - (-ENOTSUP): If the device doesn't support this function.
+ */
+__rte_experimental
+int
+rte_dmadev_reset(uint16_t dev_id);
+
+/**
+ * dma_transfer_direction
+ */
+enum dma_transfer_direction {
+	DMA_MEM_TO_MEM,
+	DMA_MEM_TO_DEV,
+	DMA_DEV_TO_MEM,
+	DMA_DEV_TO_DEV,
+};
+
+/**
+ * A structure used to configure a DMA virt queue.
+ */
+struct rte_dmadev_queue_conf {
+	enum dma_transfer_direction direction;
+	/**< Associated transfer direction */
+	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
+	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
+	uint64_t dev_flags; /**< Device specific flags */
+	void *dev_ctx; /**< Device specific context */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Allocate and set up a virt queue.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param conf
+ *   The queue configuration structure encapsulated into rte_dmadev_queue_conf
+ *   object.
+ *
+ * @return
+ *   - >=0: Allocate virt queue success, it is virt queue id.
+ *   - <0: Error code returned by the driver queue setup function.
+ */
+__rte_experimental
+int
+rte_dmadev_queue_setup(uint16_t dev_id,
+		       const struct rte_dmadev_queue_conf *conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a virt queue.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue which is returned by queue setup.
+ *
+ * @return
+ *   - =0: Successful release the virt queue.
+ *   - <0: Error code returned by the driver queue release function.
+ */
+__rte_experimental
+int
+rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id);
+
+/**
+ * A structure used to retrieve information of a DMA virt queue.
+ */
+struct rte_dmadev_queue_info {
+	enum dma_transfer_direction direction;
+	/**< Associated transfer direction */
+	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
+	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
+	uint64_t dev_flags; /**< Device specific flags */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve information of a DMA virt queue.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue which is returned by queue setup.
+ * @param[out] info
+ *   The queue info structure encapsulated into rte_dmadev_queue_info object.
+ *
+ * @return
+ *   - =0: Successful retrieve information.
+ *   - <0: Error code returned by the driver queue info get function.
+ */
+__rte_experimental
+int
+rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
+			  struct rte_dmadev_queue_info *info);
+
+#include "rte_dmadev_core.h"
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a copy operation onto the DMA virt queue.
+ *
+ * This queues up a copy operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param src
+ *   The address of the source buffer.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the data to be copied.
+ * @param flags
+ *   Opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,
+		uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->copy)(dev, vq_id, src, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list copy operation onto the DMA virt queue.
+ *
+ * This queues up a scatter list copy operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param sg
+ *   The pointer of scatterlist.
+ * @param sg_len
+ *   The number of scatterlist elements.
+ * @param flags
+ *   Opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
+		   const struct dma_scatterlist *sg,
+		   uint32_t sg_len, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a fill operation onto the DMA virt queue
+ *
+ * This queues up a fill operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the destination buffer.
+ * @param flags
+ *   Opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
+		void *dst, uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list fill operation onto the DMA virt queue
+ *
+ * This queues up a scatter list fill operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param sg
+ *   The pointer of scatterlist.
+ * @param sg_len
+ *   The number of scatterlist elements.
+ * @param flags
+ *   Opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
+		   const struct dma_scatterlist *sg, uint32_t sg_len,
+		   uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->fill_sg)(dev, vq_id, pattern, sg, sg_len, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Add a fence to force ordering between operations
+ *
+ * This adds a fence to a sequence of operations to enforce ordering, such that
+ * all operations enqueued before the fence must be completed before operations
+ * after the fence.
+ * NOTE: Since this fence may be added as a flag to the last operation enqueued,
+ * this API may not function correctly when called immediately after an
+ * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ *
+ * @return
+ *   - =0: Successful add fence.
+ *   - <0: Failure to add fence.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
+{
+	struct rte_dmadev *dmadev = rte_dmadevices + dev_id;
+	return dmadev->fence(dmadev, vq_id);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger hardware to begin performing enqueued operations
+ *
+ * This API is used to write the "doorbell" to the hardware to trigger it
+ * to begin the operations previously enqueued by rte_dmadev_copy/fill()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ *
+ * @return
+ *   - =0: Successful trigger hardware.
+ *   - <0: Failure to trigger hardware.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
+{
+	struct rte_dmadev *dmadev = rte_dmadevices + dev_id;
+	return dmadev->perform(dmadev, vq_id);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that have been successfully completed.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param nb_cpls
+ *   The maximum number of completed operations that can be processed.
+ * @param[out] cookie
+ *   The last completed operation's cookie.
+ * @param[out] has_error
+ *   Reset to false here, then indicates whether a transfer error occurred.
+ *
+ * @return
+ *   The number of operations that successfully completed.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
+		     dma_cookie_t *cookie, bool *has_error)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	*has_error = false; /* clear the caller's flag, not the pointer */
+	return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that failed to complete.
+ * NOTE: This API should be used when rte_dmadev_completed has_error was set.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param nb_status
+ *   Indicates the size of status array.
+ * @param[out] status
+ *   The error code of operations that failed to complete.
+ * @param[out] cookie
+ *   The last failed completed operation's cookie.
+ *
+ * @return
+ *   The number of operations that failed to complete.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
+			   const uint16_t nb_status, uint32_t *status,
+			   dma_cookie_t *cookie)
+{
+	struct rte_dmadev *dmadev = rte_dmadevices + dev_id;
+	return dmadev->completed_fails(dmadev, vq_id, nb_status, status, cookie);
+}
+
+struct rte_dmadev_stats {
+	uint64_t enqueue_fail_count;
+	/**< Count of all operations which failed to be enqueued */
+	uint64_t enqueued_count;
+	/**< Count of all operations which were successfully enqueued */
+	uint64_t completed_fail_count;
+	/**< Count of all operations which failed to complete */
+	uint64_t completed_count;
+	/**< Count of all operations which completed successfully */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve basic statistics of one or all DMA virt queues.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue, -1 means all virt queues.
+ * @param[out] stats
+ *   The basic statistics structure encapsulated into rte_dmadev_stats
+ *   object.
+ *
+ * @return
+ *   - =0: Successful retrieve stats.
+ *   - <0: Failure to retrieve stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
+		     struct rte_dmadev_stats *stats);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset basic statistics of one or all DMA virt queues.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue, -1 means all virt queues.
+ *
+ * @return
+ *   - =0: Successfully reset stats.
+ *   - <0: Failure to reset stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
+
+/** Maximum name length for extended statistics counters */
+#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
+
+/**
+ * A name-key lookup element for extended statistics.
+ *
+ * This structure is used to map between names and ID numbers
+ * for extended dmadev statistics.
+ */
+struct rte_dmadev_xstats_name {
+	char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve names of extended statistics of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param[out] xstats_names
+ *   Block of memory to insert names into. Must be at least size in capacity.
+ *   If set to NULL, function returns required capacity.
+ * @param size
+ *   Capacity of xstats_names (number of names).
+ * @return
+ *   - positive value lower or equal to size: success. The return value
+ *     is the number of entries filled in the stats table.
+ *   - positive value higher than size: error, the given statistics table
+ *     is too small. The return value corresponds to the size that should
+ *     be given to succeed. The entries in the table are not valid and
+ *     shall not be used by the caller.
+ *   - negative value on error.
+ */
+__rte_experimental
+int
+rte_dmadev_xstats_names_get(uint16_t dev_id,
+			    struct rte_dmadev_xstats_name *xstats_names,
+			    uint32_t size);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve extended statistics of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param ids
+ *   The id numbers of the stats to get. The ids can be got from the stat
+ *   position in the stat list from rte_dmadev_xstats_names_get().
+ * @param[out] values
+ *   The values for each stats request by ID.
+ * @param n
+ *   The number of stats requested.
+ *
+ * @return
+ *   - positive value: number of stat entries filled into the values array.
+ *   - negative value on error.
+ */
+__rte_experimental
+int
+rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
+		      uint64_t values[], uint32_t n);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset the values of the xstats of the selected component in the device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param ids
+ *   Selects specific statistics to be reset. When NULL, all statistics
+ *   will be reset. If non-NULL, must point to array of at least
+ *   *nb_ids* size.
+ * @param nb_ids
+ *   The number of ids available from the *ids* array. Ignored when ids is NULL.
+ *
+ * @return
+ *   - zero: successfully reset the statistics to zero.
+ *   - negative value on error.
+ */
+__rte_experimental
+int
+rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger the dmadev self test.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - 0: Selftest successful.
+ *   - -ENOTSUP if the device doesn't support selftest
+ *   - other values < 0 on failure.
+ */
+__rte_experimental
+int
+rte_dmadev_selftest(uint16_t dev_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_H_ */
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
new file mode 100644
index 0000000..a3afea2
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_CORE_H_
+#define _RTE_DMADEV_CORE_H_
+
+/**
+ * @file
+ *
+ * RTE DMA Device internal header.
+ *
+ * This header contains internal data types. But they are still part of the
+ * public API because they are used by inline public functions.
+ */
+
+struct rte_dmadev;
+
+typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				      void *src, void *dst,
+				      uint32_t length, uint64_t flags);
+/**< @internal Function used to enqueue a copy operation. */
+
+typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
+					 const struct dma_scatterlist *sg,
+					 uint32_t sg_len, uint64_t flags);
+/**< @internal Function used to enqueue a scatter list copy operation. */
+
+typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				      uint64_t pattern, void *dst,
+				      uint32_t length, uint64_t flags);
+/**< @internal Function used to enqueue a fill operation. */
+
+typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
+			uint64_t pattern, const struct dma_scatterlist *sg,
+			uint32_t sg_len, uint64_t flags);
+/**< @internal Function used to enqueue a scatter list fill operation. */
+
+typedef int (*dmadev_fence_t)(struct rte_dmadev *dev, uint16_t vq_id);
+/**< @internal Function used to add a fence ordering between operations. */
+
+typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t vq_id);
+/**< @internal Function used to trigger hardware to begin performing. */
+
+typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				       const uint16_t nb_cpls,
+				       dma_cookie_t *cookie, bool *has_error);
+/**< @internal Function used to return number of successful completed operations */
+
+typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
+			uint16_t vq_id, const uint16_t nb_status,
+			uint32_t *status, dma_cookie_t *cookie);
+/**< @internal Function used to return number of failed completed operations */
+
+#define RTE_DMADEV_NAME_MAX_LEN	64 /**< Max length of name of DMA PMD */
+
+struct rte_dmadev_ops;
+
+/**
+ * The data structure associated with each DMA device.
+ */
+struct rte_dmadev {
+	/** Enqueue a copy operation onto the DMA device. */
+	dmadev_copy_t copy;
+	/** Enqueue a scatter list copy operation onto the DMA device. */
+	dmadev_copy_sg_t copy_sg;
+	/** Enqueue a fill operation onto the DMA device. */
+	dmadev_fill_t fill;
+	/** Enqueue a scatter list fill operation onto the DMA device. */
+	dmadev_fill_sg_t fill_sg;
+	/** Add a fence to force ordering between operations. */
+	dmadev_fence_t fence;
+	/** Trigger hardware to begin performing enqueued operations. */
+	dmadev_perform_t perform;
+	/** Returns the number of operations that successfully completed. */
+	dmadev_completed_t completed;
+	/** Returns the number of operations that failed to complete. */
+	dmadev_completed_fails_t completed_fails;
+
+	void *dev_private; /**< PMD-specific private data */
+	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
+
+	uint16_t dev_id; /**< Device ID for this instance */
+	int socket_id; /**< Socket ID where memory is allocated */
+	struct rte_device *device;
+	/**< Device info. supplied during device initialization */
+	const char *driver_name; /**< Driver info. supplied by probing */
+	char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
+
+	RTE_STD_C11
+	uint8_t attached : 1; /**< Flag indicating the device is attached */
+	uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
+
+} __rte_cache_aligned;
+
+extern struct rte_dmadev rte_dmadevices[];
+
+#endif /* _RTE_DMADEV_CORE_H_ */
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
new file mode 100644
index 0000000..ef03cf7
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_PMD_H_
+#define _RTE_DMADEV_PMD_H_
+
+/** @file
+ * RTE DMA PMD APIs
+ *
+ * @note
+ * Driver facing APIs for a DMA device. These are not to be called directly by
+ * any application.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include <rte_dev.h>
+#include <rte_log.h>
+#include <rte_common.h>
+
+#include "rte_dmadev.h"
+
+extern int libdmadev_logtype;
+
+#define RTE_DMADEV_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
+		__func__, ##args)
+
+/* Macros to check for valid device */
+#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
+	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
+		return retval; \
+	} \
+} while (0)
+
+#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
+	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
+		return; \
+	} \
+} while (0)
+
+#define RTE_DMADEV_DETACHED  0
+#define RTE_DMADEV_ATTACHED  1
+
+/**
+ * Validate if the DMA device index is a valid attached DMA device.
+ *
+ * @param dev_id
+ *   DMA device index.
+ *
+ * @return
+ *   - If the device index is valid (1) or not (0).
+ */
+static inline unsigned
+rte_dmadev_pmd_is_valid_dev(uint16_t dev_id)
+{
+	const struct rte_dmadev *slot;
+
+	/* Reject indexes at or above RTE_DMADEV_MAX_DEVS before any lookup. */
+	if (dev_id >= RTE_DMADEV_MAX_DEVS)
+		return 0;
+
+	slot = &rte_dmadevices[dev_id];
+
+	/* The index is valid only when the slot is flagged as attached. */
+	return slot->attached == RTE_DMADEV_ATTACHED;
+}
+
+/**
+ * Definitions of control-plane functions exported by a driver through the
+ * generic structure of type *rte_dmadev_ops* supplied in the *rte_dmadev*
+ * structure associated with a device.
+ */
+
+typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
+				 struct rte_dmadev_info *dev_info);
+/**< @internal Function used to get device information of a device. */
+
+typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
+				  const struct rte_dmadev_conf *dev_conf);
+/**< @internal Function used to configure a device. */
+
+typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
+/**< @internal Function used to start a configured device. */
+
+typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
+/**< @internal Function used to stop a configured device. */
+
+typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
+/**< @internal Function used to close a configured device. */
+
+typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
+/**< @internal Function used to reset a configured device. */
+
+typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
+				    const struct rte_dmadev_queue_conf *conf);
+/**< @internal Function used to allocate and set up a virt queue. */
+
+typedef int (*dmadev_queue_release_t)(struct rte_dmadev *dev, uint16_t vq_id);
+/**< @internal Function used to release a virt queue. */
+
+typedef int (*dmadev_queue_info_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				   struct rte_dmadev_queue_info *info);
+/**< @internal Function used to retrieve information of a virt queue. */
+
+typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
+				  struct rte_dmadev_stats *stats);
+/**< @internal Function used to retrieve basic statistics. */
+
+typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
+/**< @internal Function used to reset basic statistics. */
+
+typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev *dev,
+		struct rte_dmadev_xstats_name *xstats_names,
+		uint32_t size);
+/**< @internal Function used to get names of extended stats. */
+
+typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
+		const uint32_t ids[], uint64_t values[], uint32_t n);
+/**< @internal Function used to retrieve extended stats. */
+
+typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
+				     const uint32_t ids[], uint32_t nb_ids);
+/**< @internal Function used to reset extended stats. */
+
+typedef int (*dmadev_selftest_t)(uint16_t dev_id);
+/**< @internal Function used to start dmadev selftest. */
+
+/** DMA device operations function pointer table */
+struct rte_dmadev_ops {
+	/** Get device info. */
+	dmadev_info_get_t dev_info_get;
+	/** Configure device. */
+	dmadev_configure_t dev_configure;
+	/** Start device. */
+	dmadev_start_t dev_start;
+	/** Stop device. */
+	dmadev_stop_t dev_stop;
+	/** Close device. */
+	dmadev_close_t dev_close;
+	/** Reset device. */
+	dmadev_reset_t dev_reset;
+
+	/** Allocate and set up a virt queue. */
+	dmadev_queue_setup_t queue_setup;
+	/** Release a virt queue. */
+	dmadev_queue_release_t queue_release;
+	/** Retrieve information of a virt queue. */
+	dmadev_queue_info_t queue_info_get;
+
+	/** Get basic statistics. */
+	dmadev_stats_get_t stats_get;
+	/** Reset basic statistics. */
+	dmadev_stats_reset_t stats_reset;
+	/** Get names of extended stats. */
+	dmadev_xstats_get_names_t xstats_get_names;
+	/** Get extended statistics. */
+	dmadev_xstats_get_t xstats_get;
+	/** Reset extended statistics values. */
+	dmadev_xstats_reset_t xstats_reset;
+
+	/** Device selftest function. */
+	dmadev_selftest_t dev_selftest;
+};
+
+/**
+ * Allocates a new dmadev slot for a DMA device and returns the pointer
+ * to that slot for the driver to use.
+ *
+ * @param name
+ *   Unique identifier name for each device
+ * @param dev_private_size
+ *   Size of private data memory allocated within rte_dmadev object.
+ *   Set to 0 to disable internal memory allocation and allow for
+ *   self-allocation.
+ * @param socket_id
+ *   Socket to allocate resources on.
+ *
+ * @return
+ *   - NULL: Failure to allocate
+ *   - Other: The rte_dmadev structure pointer for the new device
+ */
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name, size_t dev_private_size,
+			int socket_id);
+
+/**
+ * Release the specified dmadev device.
+ *
+ * @param dev
+ *   The *dmadev* pointer is the address of the *rte_dmadev* structure.
+ *
+ * @return
+ *   - 0 on success, negative on error
+ */
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_PMD_H_ */
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
new file mode 100644
index 0000000..383b3ca
--- /dev/null
+++ b/lib/dmadev/version.map
@@ -0,0 +1,33 @@
+EXPERIMENTAL {
+	global:
+
+	rte_dmadev_count;
+	rte_dmadev_get_dev_id;
+	rte_dmadev_socket_id;
+	rte_dmadev_info_get;
+	rte_dmadev_configure;
+	rte_dmadev_start;
+	rte_dmadev_stop;
+	rte_dmadev_close;
+	rte_dmadev_reset;
+	rte_dmadev_queue_setup;
+	rte_dmadev_queue_release;
+	rte_dmadev_queue_info_get;
+	rte_dmadev_copy;
+	rte_dmadev_copy_sg;
+	rte_dmadev_fill;
+	rte_dmadev_fill_sg;
+	rte_dmadev_fence;
+	rte_dmadev_perform;
+	rte_dmadev_completed;
+	rte_dmadev_completed_fails;
+	rte_dmadev_stats_get;
+	rte_dmadev_stats_reset;
+	rte_dmadev_xstats_names_get;
+	rte_dmadev_xstats_get;
+	rte_dmadev_xstats_reset;
+	rte_dmadev_selftest;
+	rte_dmadevices;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index 1673ca4..68d239f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -60,6 +60,7 @@ libraries = [
         'bpf',
         'graph',
         'node',
+        'dmadev',
 ]
 
 if is_windows
-- 
2.8.1


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
@ 2021-07-02 13:59 ` Bruce Richardson
  2021-07-04  9:30 ` Jerin Jacob
                   ` (28 subsequent siblings)
  29 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-02 13:59 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: thomas, ferruh.yigit, jerinj, jerinjacobk, dev, mb, nipun.gupta,
	hemant.agrawal, maxime.coquelin, honnappa.nagarahalli,
	david.marchand, sburla, pkapoor, konstantin.ananyev, liangma

On Fri, Jul 02, 2021 at 09:18:11PM +0800, Chengwen Feng wrote:
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
Thanks for this new revision. We will try porting our driver
implementations under this API and see how it performs. We'll send on
feedback later based on that and based on code review.

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
  2021-07-02 13:59 ` Bruce Richardson
@ 2021-07-04  9:30 ` Jerin Jacob
  2021-07-05 10:52   ` Bruce Richardson
  2021-07-06  3:01   ` fengchengwen
  2021-07-04 14:57 ` Andrew Rybchenko
                   ` (27 subsequent siblings)
  29 siblings, 2 replies; 339+ messages in thread
From: Jerin Jacob @ 2021-07-04  9:30 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: Thomas Monjalon, Ferruh Yigit, Richardson, Bruce, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
>
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
>
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

Thanks for v1.

I would suggest finalizing lib/dmadev/rte_dmadev.h before doing the
implementation so that you don't need
to waste time on reworking the implementation.

Comments inline.

> ---
>  MAINTAINERS                  |   4 +
>  config/rte_config.h          |   3 +
>  lib/dmadev/meson.build       |   6 +
>  lib/dmadev/rte_dmadev.c      | 438 +++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 919 +++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h |  98 +++++
>  lib/dmadev/rte_dmadev_pmd.h  | 210 ++++++++++
>  lib/dmadev/version.map       |  32 ++

Missed updating doxygen. See doc/api/doxy-api.conf.in
Use meson -Denable_docs=true to verify the generated doxygen doc.

>  lib/meson.build              |   1 +
>  9 files changed, 1711 insertions(+)
>  create mode 100644 lib/dmadev/meson.build
>  create mode 100644 lib/dmadev/rte_dmadev.c
>  create mode 100644 lib/dmadev/rte_dmadev.h
>  create mode 100644 lib/dmadev/rte_dmadev_core.h
>  create mode 100644 lib/dmadev/rte_dmadev_pmd.h
>  create mode 100644 lib/dmadev/version.map
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..2019783 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
>

Add EXPERIMENTAL

> +Dma device API
> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
>

> new file mode 100644
> index 0000000..a94e839
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,438 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stdint.h>
> +
> +#include <rte_log.h>
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_string_fns.h>

Sort in alphabetical order.

> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];

# Please check have you missed any multiprocess angle.
lib/regexdev/rte_regexdev.c is latest device class implemented in dpdk and
please check *rte_regexdev_shared_data scheme.


# Missing dynamic log for this library.


> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..f74fc6a
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,919 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.

It would be nice to add other companies' names who have contributed to
the specification.

> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The generic DMA device diagram:
> + *
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                  ----------------
> + *                  |dma-controller|
> + *                  ----------------
> + *
> + *   The DMA could have multiple HW-queues, each HW-queue could have multiple
> + *   capabilities, e.g. whether to support fill operation, supported DMA
> + *   transfter direction and etc.

typo

> + *
> + * The DMA framework is built on the following abstraction model:
> + *
> + *     ------------    ------------
> + *     |virt-queue|    |virt-queue|
> + *     ------------    ------------
> + *            \           /
> + *             \         /
> + *              \       /
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                     ----------
> + *                     | dmadev |
> + *                     ----------

Continuing the discussion with @Morten Brørup , I think, we need to
finalize the model.

> + *   a) The DMA operation request must be submitted to the virt queue, virt
> + *      queues must be created based on HW queues, the DMA device could have
> + *      multiple HW queues.
> + *   b) The virt queues on the same HW-queue could represent different contexts,
> + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> + *      mem-to-dev transfer scenario.
> + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> + *         scenario as long as the corresponding driver supports.
> + *
> + * The control plane APIs include configure/queue_setup/queue_release/start/
> + * stop/reset/close, in order to start device work, the call sequence must be
> + * as follows:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_queue_setup()
> + *     - rte_dmadev_start()

Please add reconfigure behaviour etc, Please check the
lib/regexdev/rte_regexdev.h
introduction. I have added similar ones so you could reuse as much as possible.


> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virt queues which have different
> + *      contexts.
> + *      The first four APIs are used to submit the operation request to the virt
> + *      queue, if the submission is successful, a cookie (as type
> + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> + * information query and self-test capabilities.
> + *
> + * About the dataplane APIs MT-safe, there are two dimensions:
> + *   a) For one virt queue, the submit/completion API could be MT-safe,
> + *      e.g. one thread do submit operation, another thread do completion
> + *      operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.

From an application PoV it may not be good to write portable
applications. Please check
latest thread with @Morten Brørup

> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_compat.h>

Sort in alphabetical order.

> +
> +/**
> + * dma_cookie_t - an opaque DMA cookie

Since we are defining the behaviour is not opaque any more.
I think, it is better to call ring_idx or so.


> +#define RTE_DMA_DEV_CAPA_MT_MVQ (1ull << 11) /**< Support MT-safe of multiple virt queues */

Please add @see references for all symbols where they are used, so that one
can understand the full scope of
the symbols. See the example below.

#define RTE_REGEXDEV_CAPA_RUNTIME_COMPILATION_F (1ULL << 0)
/**< RegEx device does support compiling the rules at runtime unlike
 * loading only the pre-built rule database using
 * struct rte_regexdev_config::rule_db in rte_regexdev_configure()
 *
 * @see struct rte_regexdev_config::rule_db, rte_regexdev_configure()
 * @see struct rte_regexdev_info::regexdev_capa
 */

> + *
> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> + * code.
> + * When using cookies, comply with the following rules:
> + * a) Cookies for each virtual queue are independent.
> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> + *    the INT_MAX, it wraps back to zero.
> + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> + *    reset, the virt queue's cookie needs to be reset to zero.
> + * Example:
> + *    step-1: start one dmadev
> + *    step-2: enqueue a copy operation, the cookie return is 0
> + *    step-3: enqueue a copy operation again, the cookie return is 1
> + *    ...
> + *    step-101: stop the dmadev
> + *    step-102: start the dmadev
> + *    step-103: enqueue a copy operation, the cookie return is 0
> + *    ...
> + */

Good explanation.

> +typedef int32_t dma_cookie_t;


> +
> +/**
> + * dma_scatterlist - can hold scatter DMA operation request
> + */
> +struct dma_scatterlist {

I prefer to change scatterlist -> sg
i.e rte_dma_sg

> +       void *src;
> +       void *dst;
> +       uint32_t length;
> +};
> +

> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +       /**
> +        * Fields filled by framewok

Typo: "framewok" -> "framework".

> +        */
> +       struct rte_device *device; /**< Generic Device information */
> +       const char *driver_name; /**< Device driver name */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +
> +       /**
> +        * Specification fields filled by driver
> +        */
> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> +       uint16_t max_vqs_per_hw_queue;
> +       /**< Maximum number of virt queues to allocate per HW queue */
> +       uint16_t max_desc;
> +       /**< Maximum allowed number of virt queue descriptors */
> +       uint16_t min_desc;
> +       /**< Minimum allowed number of virt queue descriptors */

Please add max_nb_segs. i.e maximum number of segments supported.

> +
> +       /**
> +        * Status fields filled by driver
> +        */
> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> +       uint16_t nb_vqs; /**< Number of virt queues configured */
> +};
> +
> +
> +/**
> + * dma_address_type
> + */
> +enum dma_address_type {
> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> +};
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +       enum dma_address_type addr_type; /**< Address type to used */

I think, there are 3 kinds of limitations/capabilities.

When the system is configured as IOVA as VA
1) Device supports any VA address like memory from rte_malloc(),
rte_memzone(), malloc, stack memory
2) Device support only VA address from rte_malloc(), rte_memzone() i.e
memory backed by hugepage and added to DMA map.

When the system is configured as IOVA as PA
1) Devices support only PA addresses .

IMO, the above needs to be advertised as a capability, and the application needs
to align with that.
I don't think the application should request the driver to work in any of the modes.



> +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> +       uint16_t max_vqs; /**< Maximum number of virt queues to use */

You need to document what the maximum allowed value is, i.e. that it is based on
info_get(), and mention the corresponding field
in the info structure.


> +
> +/**
> + * dma_transfer_direction
> + */
> +enum dma_transfer_direction {

rte_dma_transfer_direction

> +       DMA_MEM_TO_MEM,
> +       DMA_MEM_TO_DEV,
> +       DMA_DEV_TO_MEM,
> +       DMA_DEV_TO_DEV,
> +};
> +
> +/**
> + * A structure used to configure a DMA virt queue.
> + */
> +struct rte_dmadev_queue_conf {
> +       enum dma_transfer_direction direction;


> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */

Use of this? Need more comments on this.
Since it is in slowpath, We can have non opaque names here based on
each driver capability.


> +       void *dev_ctx; /**< Device specific context */

Use of this? Need more comments on this.


Please add some good amount of reserved bits and have API to init this
structure for future ABI stability, say rte_dmadev_queue_config_init()
or so.


> +
> +/**
> + * A structure used to retrieve information of a DMA virt queue.
> + */
> +struct rte_dmadev_queue_info {
> +       enum dma_transfer_direction direction;

A queue may support all directions so I think it should be a bitfield.

> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */
> +};
> +

> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> +                  const struct dma_scatterlist *sg,
> +                  uint32_t sg_len, uint64_t flags)

I would like to change this to:
rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
rte_dma_sg *src, uint32_t nb_src,
const struct rte_dma_sg *dst, uint32_t nb_dst) or so, to allow use cases such as
a 30 MB src copy being split and written as 1 MB x 30 dst.



> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the DMA virt queue
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An opaque flags for this operation.

PLEASE REMOVE opaque stuff from fastpath it will be a pain for
application writers as
they need to write multiple combinations of fastpath. flags are OK, if
we have a valid
generic flag now to control the transfer behavior.


> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Add a fence to force ordering between operations
> + *
> + * This adds a fence to a sequence of operations to enforce ordering, such that
> + * all operations enqueued before the fence must be completed before operations
> + * after the fence.
> + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> + * this API may not function correctly when called immediately after an
> + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful add fence.
> + *   - <0: Failure to add fence.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fence)(dev, vq_id);
> +}

Since HW submission is in a queue(FIFO) the ordering is always
maintained. Right?
Could you share more details and use case of fence() from
driver/application PoV?


> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->perform)(dev, vq_id);
> +}

Since we have additional function call overhead in all the
applications for this scheme, I would like to understand
the use of doing this way vs enq does the doorbell implicitly from
driver/application PoV?


> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successful completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] cookie
> + *   The last completed operation's cookie.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *
> + * @return
> + *   The number of operations that successful completed.

successfully

> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> +                    dma_cookie_t *cookie, bool *has_error)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       has_error = false;
> +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);

It may be better to have cookie/ring_idx as third argument.

> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_status
> + *   Indicates the size  of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + * @param[out] cookie
> + *   The last failed completed operation's cookie.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> +                          const uint16_t nb_status, uint32_t *status,
> +                          dma_cookie_t *cookie)

IMO, it is better to move cookie/ring_idx to the third argument.
Why would it return an array of errors, since it is called after
rte_dmadev_completed() has set
has_error? Would it be better to change it to

rte_dmadev_error_status(uint16_t dev_id, uint16_t vq_id, dma_cookie_t
*cookie, uint32_t *status)

I also think we may need to define status as a bitmask, enumerate all
the combinations of error codes
of all the drivers, and return a string from the driver, similar to the
existing rte_flow_error.
See
struct rte_flow_error {
        enum rte_flow_error_type type; /**< Cause field and error types. */
        const void *cause; /**< Object responsible for the error. */
        const char *message; /**< Human-readable error message. */
};

> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> +}
> +
> +struct rte_dmadev_stats {
> +       uint64_t enqueue_fail_count;
> +       /**< Conut of all operations which failed enqueued */
> +       uint64_t enqueued_count;
> +       /**< Count of all operations which successful enqueued */
> +       uint64_t completed_fail_count;
> +       /**< Count of all operations which failed to complete */
> +       uint64_t completed_count;
> +       /**< Count of all operations which successful complete */
> +};

We need to have capability API to tell which items are
updated/supported by the driver.


> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..a3afea2
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types. But they are still part of the
> + * public API because they are used by inline public functions.
> + */
> +
> +struct rte_dmadev;
> +
> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +                                     void *src, void *dst,
> +                                     uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a copy operation. */

To avoid namespace conflict(as it is public API) use rte_


> +
> +/**
> + * The data structure associated with each DMA device.
> + */
> +struct rte_dmadev {
> +       /**< Enqueue a copy operation onto the DMA device. */
> +       dmadev_copy_t copy;
> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> +       dmadev_copy_sg_t copy_sg;
> +       /**< Enqueue a fill operation onto the DMA device. */
> +       dmadev_fill_t fill;
> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> +       dmadev_fill_sg_t fill_sg;
> +       /**< Add a fence to force ordering between operations. */
> +       dmadev_fence_t fence;
> +       /**< Trigger hardware to begin performing enqueued operations. */
> +       dmadev_perform_t perform;
> +       /**< Returns the number of operations that successful completed. */
> +       dmadev_completed_t completed;
> +       /**< Returns the number of operations that failed to complete. */
> +       dmadev_completed_fails_t completed_fails;

We need to limit fastpath items in 1 CL

> +
> +       void *dev_private; /**< PMD-specific private data */
> +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> +
> +       uint16_t dev_id; /**< Device ID for this instance */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +       struct rte_device *device;
> +       /**< Device info. supplied during device initialization */
> +       const char *driver_name; /**< Driver info. supplied by probing */
> +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> +
> +       RTE_STD_C11
> +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */

Add a couple of reserved fields for future ABI stability.

> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
  2021-07-02 13:59 ` Bruce Richardson
  2021-07-04  9:30 ` Jerin Jacob
@ 2021-07-04 14:57 ` Andrew Rybchenko
  2021-07-06  3:56   ` fengchengwen
  2021-07-04 15:21 ` Matan Azrad
                   ` (26 subsequent siblings)
  29 siblings, 1 reply; 339+ messages in thread
From: Andrew Rybchenko @ 2021-07-04 14:57 UTC (permalink / raw)
  To: Chengwen Feng, thomas, ferruh.yigit, bruce.richardson, jerinj,
	jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

On 7/2/21 4:18 PM, Chengwen Feng wrote:
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.

"This patch introduces ... " -> "Introduce ..."

> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

[snip]

> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..2019783 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
>  
> +Dma device API

Dma -> DMA

> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
>  
>  Memory Pool Drivers
>  -------------------

[snip]

> diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
> new file mode 100644
> index 0000000..a94e839
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,438 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stdint.h>
> +
> +#include <rte_log.h>
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_string_fns.h>
> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
> +
> +uint16_t
> +rte_dmadev_count(void)
> +{
> +	uint16_t count = 0;
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (rte_dmadevices[i].attached)
> +			count++;
> +	}
> +
> +	return count;
> +}
> +
> +int
> +rte_dmadev_get_dev_id(const char *name)
> +{
> +	uint16_t i;
> +
> +	if (name == NULL)
> +		return -EINVAL;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
> +		if ((strcmp(rte_dmadevices[i].name, name) == 0) &&
> +		    (rte_dmadevices[i].attached == RTE_DMADEV_ATTACHED))
> +			return i;
> +
> +	return -ENODEV;
> +}
> +
> +int
> +rte_dmadev_socket_id(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	return dev->socket_id;
> +}
> +
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
> +
> +	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
> +	diag = (*dev->dev_ops->dev_info_get)(dev, dev_info);
> +	if (diag != 0)
> +		return diag;
> +
> +	dev_info->device = dev->device;
> +	dev_info->driver_name = dev->driver_name;
> +	dev_info->socket_id = dev->socket_id;
> +
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
> +
> +	if (dev->started) {
> +		RTE_DMADEV_LOG(ERR,
> +		   "device %u must be stopped to allow configuration", dev_id);
> +		return -EBUSY;
> +	}
> +
> +	diag = (*dev->dev_ops->dev_configure)(dev, dev_conf);
> +	if (diag != 0)
> +		RTE_DMADEV_LOG(ERR, "device %u dev_configure failed, ret = %d",
> +			       dev_id, diag);
> +	else
> +		dev->attached = 1;
> +
> +	return diag;
> +}
> +
> +int
> +rte_dmadev_start(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +	if (dev->started != 0) {
> +		RTE_DMADEV_LOG(ERR, "device %u already started", dev_id);
> +		return 0;
> +	}
> +
> +	if (dev->dev_ops->dev_start == NULL)
> +		goto mark_started;
> +
> +	diag = (*dev->dev_ops->dev_start)(dev);
> +	if (diag != 0)
> +		return diag;
> +
> +mark_started:
> +	dev->started = 1;
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_stop(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (dev->started == 0) {
> +		RTE_DMADEV_LOG(ERR, "device %u already stopped", dev_id);
> +		return 0;
> +	}
> +
> +	if (dev->dev_ops->dev_stop == NULL)
> +		goto mark_stopped;
> +
> +	diag = (*dev->dev_ops->dev_stop)(dev);
> +	if (diag != 0)
> +		return diag;
> +
> +mark_stopped:
> +	dev->started = 0;
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_close(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
> +
> +	/* Device must be stopped before it can be closed */
> +	if (dev->started == 1) {
> +		RTE_DMADEV_LOG(ERR, "device %u must be stopped before closing",
> +			       dev_id);
> +		return -EBUSY;
> +	}
> +
> +	return (*dev->dev_ops->dev_close)(dev);
> +}
> +
> +int
> +rte_dmadev_reset(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
> +
> +	/* Reset is not dependent on state of the device */
> +	return (*dev->dev_ops->dev_reset)(dev);
> +}
> +
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +		       const struct rte_dmadev_queue_conf *conf)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -ENOTSUP);
> +
> +	return (*dev->dev_ops->queue_setup)(dev, conf);
> +}
> +
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP);
> +
> +	return (*dev->dev_ops->queue_release)(dev, vq_id);
> +}
> +
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +			  struct rte_dmadev_queue_info *info)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(info, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_info_get, -ENOTSUP);
> +
> +	memset(info, 0, sizeof(struct rte_dmadev_queue_info));
> +	return (*dev->dev_ops->queue_info_get)(dev, vq_id, info);
> +}
> +
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +		     struct rte_dmadev_stats *stats)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
> +
> +	return (*dev->dev_ops->stats_get)(dev, vq_id, stats);
> +}
> +
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
> +
> +	return (*dev->dev_ops->stats_reset)(dev, vq_id);
> +}
> +
> +static int
> +xstats_get_count(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
> +
> +	return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
> +}
> +
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +			    struct rte_dmadev_xstats_name *xstats_names,
> +			    uint32_t size)
> +{
> +	struct rte_dmadev *dev;
> +	int cnt_expected_entries;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	cnt_expected_entries = xstats_get_count(dev_id);
> +
> +	if (xstats_names == NULL || cnt_expected_entries < 0 ||
> +	    (int)size < cnt_expected_entries || size == 0)
> +		return cnt_expected_entries;
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
> +	return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
> +}
> +
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +		      uint64_t values[], uint32_t n)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -ENOTSUP);
> +
> +	return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
> +}
> +
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -ENOTSUP);
> +
> +	return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
> +}
> +
> +int
> +rte_dmadev_selftest(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
> +
> +	return (*dev->dev_ops->dev_selftest)(dev_id);
> +}
> +
> +static inline uint16_t
> +rte_dmadev_find_free_device_index(void)
> +{
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (rte_dmadevices[i].attached == RTE_DMADEV_DETACHED)
> +			return i;
> +	}
> +
> +	return RTE_DMADEV_MAX_DEVS;
> +}
> +
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id)
> +{
> +	struct rte_dmadev *dev;
> +	uint16_t dev_id;
> +
> +	if (rte_dmadev_get_dev_id(name) >= 0) {
> +		RTE_DMADEV_LOG(ERR,
> +			"device with name %s already allocated!", name);
> +		return NULL;
> +	}
> +
> +	dev_id = rte_dmadev_find_free_device_index();
> +	if (dev_id == RTE_DMADEV_MAX_DEVS) {
> +		RTE_DMADEV_LOG(ERR, "reached maximum number of DMA devices");
> +		return NULL;
> +	}
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (dev_priv_size > 0) {
> +		dev->dev_private = rte_zmalloc_socket("dmadev private",
> +				     dev_priv_size,
> +				     RTE_CACHE_LINE_SIZE,
> +				     socket_id);
> +		if (dev->dev_private == NULL) {
> +			RTE_DMADEV_LOG(ERR,
> +				"unable to allocate memory for dmadev");
> +			return NULL;
> +		}
> +	}
> +
> +	dev->dev_id = dev_id;
> +	dev->socket_id = socket_id;
> +	dev->started = 0;
> +	strlcpy(dev->name, name, RTE_DMADEV_NAME_MAX_LEN);
> +
> +	dev->attached = RTE_DMADEV_ATTACHED;
> +
> +	return dev;
> +}
> +
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev)
> +{
> +	int ret;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	ret = rte_dmadev_close(dev->dev_id);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (dev->dev_private != NULL)
> +		rte_free(dev->dev_private);
> +
> +	memset(dev, 0, sizeof(struct rte_dmadev));
> +	dev->attached = RTE_DMADEV_DETACHED;
> +
> +	return 0;
> +}
> +
> +RTE_LOG_REGISTER(libdmadev_logtype, lib.dmadev, INFO);
> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..f74fc6a
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,919 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The generic DMA device diagram:
> + *
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                  ----------------
> + *                  |dma-controller|
> + *                  ----------------
> + *
> + *   The DMA could have multiple HW-queues, each HW-queue could have multiple
> + *   capabilities, e.g. whether to support fill operation, supported DMA
> + *   transfter direction and etc.
> + *
> + * The DMA framework is built on the following abstraction model:
> + *
> + *     ------------    ------------
> + *     |virt-queue|    |virt-queue|
> + *     ------------    ------------

Do we really need "virt" here? "virt queue" could be
incorrectly associated with virtio spec.
IMHO it would be better w/o virt or full "virtual"
everywhere in the documentation.

> + *            \           /
> + *             \         /
> + *              \       /
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                     ----------
> + *                     | dmadev |
> + *                     ----------
> + *
> + *   a) The DMA operation request must be submitted to the virt queue, virt
> + *      queues must be created based on HW queues, the DMA device could have
> + *      multiple HW queues.

What defines the mapping of virtual queues to HW queues? Does
the API user see HW queues? If not, it should be kept transparent
and HW queues should simply be removed from the picture.

> + *   b) The virt queues on the same HW-queue could represent different contexts,
> + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> + *      mem-to-dev transfer scenario.
> + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> + *         scenario as long as the corresponding driver supports.
> + *
> + * The control plane APIs include configure/queue_setup/queue_release/start/
> + * stop/reset/close, in order to start device work, the call sequence must be
> + * as follows:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_queue_setup()
> + *     - rte_dmadev_start()
> + *
> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virt queues which have different
> + *      contexts.
> + *      The first four APIs are used to submit the operation request to the virt
> + *      queue, if the submission is successful, a cookie (as type
> + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> + * information query and self-test capabilities.
> + *
> + * About the dataplane APIs MT-safe, there are two dimensions:
> + *   a) For one virt queue, the submit/completion API could be MT-safe,
> + *      e.g. one thread do submit operation, another thread do completion
> + *      operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_compat.h>
> +
> +/**
> + * dma_cookie_t - an opaque DMA cookie
> + *
> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> + * code.
> + * When using cookies, comply with the following rules:
> + * a) Cookies for each virtual queue are independent.
> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> + *    the INT_MAX, it wraps back to zero.

INT32_MAX

> + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> + *    reset, the virt queue's cookie needs to be reset to zero.
> + * Example:
> + *    step-1: start one dmadev
> + *    step-2: enqueue a copy operation, the cookie return is 0
> + *    step-3: enqueue a copy operation again, the cookie return is 1
> + *    ...
> + *    step-101: stop the dmadev
> + *    step-102: start the dmadev
> + *    step-103: enqueue a copy operation, the cookie return is 0
> + *    ...
> + */
> +typedef int32_t dma_cookie_t;
> +
> +/**
> + * dma_scatterlist - can hold scatter DMA operation request
> + */
> +struct dma_scatterlist {
> +	void *src;
> +	void *dst;
> +	uint32_t length;
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the total number of DMA devices that have been successfully
> + * initialised.
> + *
> + * @return
> + *   The total number of usable DMA devices.
> + */
> +__rte_experimental
> +uint16_t
> +rte_dmadev_count(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the device identifier for the named DMA device.
> + *
> + * @param name
> + *   DMA device name to select the DMA device identifier.
> + *
> + * @return
> + *   Returns DMA device identifier on success.
> + *   - <0: Failure to find named DMA device.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_get_dev_id(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Return the NUMA socket to which a device is connected.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   The NUMA socket id to which the device is connected or
> + *   a default of zero if the socket could not be determined.
> + *   - -EINVAL: dev_id value is out of range.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_socket_id(uint16_t dev_id);

It should be rte_dmadev_numa_node(), I guess.

> +
> +/**
> + * The capabilities of a DMA device
> + */
> +#define RTE_DMA_DEV_CAPA_M2M	(1ull << 0) /**< Support mem-to-mem transfer */
> +#define RTE_DMA_DEV_CAPA_M2D	(1ull << 1) /**< Support mem-to-dev transfer */
> +#define RTE_DMA_DEV_CAPA_D2M	(1ull << 2) /**< Support dev-to-mem transfer */
> +#define RTE_DMA_DEV_CAPA_D2D	(1ull << 3) /**< Support dev-to-dev transfer */
> +#define RTE_DMA_DEV_CAPA_COPY	(1ull << 4) /**< Support copy ops */
> +#define RTE_DMA_DEV_CAPA_FILL	(1ull << 5) /**< Support fill ops */
> +#define RTE_DMA_DEV_CAPA_SG	(1ull << 6) /**< Support scatter-gather ops */
> +#define RTE_DMA_DEV_CAPA_FENCE	(1ull << 7) /**< Support fence ops */
> +#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
> +#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
> +#define RTE_DMA_DEV_CAPA_MT_VQ	(1ull << 10) /**< Support MT-safe of one virt queue */
> +#define RTE_DMA_DEV_CAPA_MT_MVQ	(1ull << 11) /**< Support MT-safe of multiple virt queues */

Above is very hard to read. Values should be aligned at the
same column.

> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +	/**
> +	 * Fields filled by framework
> +	 */

Doxygen has a way to document groups. Please, use it.

> +	struct rte_device *device; /**< Generic Device information */
> +	const char *driver_name; /**< Device driver name */
> +	int socket_id; /**< Socket ID where memory is allocated */
> +
> +	/**
> +	 * Specification fields filled by driver
> +	 */
> +	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +	uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> +	uint16_t max_vqs_per_hw_queue;
> +	/**< Maximum number of virt queues to allocate per HW queue */
> +	uint16_t max_desc;
> +	/**< Maximum allowed number of virt queue descriptors */
> +	uint16_t min_desc;
> +	/**< Minimum allowed number of virt queue descriptors */
> +
> +	/**
> +	 * Status fields filled by driver
> +	 */
> +	uint16_t nb_hw_queues; /**< Number of HW queues configured */
> +	uint16_t nb_vqs; /**< Number of virt queues configured */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve the contextual information of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @param[out] dev_info
> + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> + *   contextual information of the device.
> + * @return
> + *   - =0: Success, driver updates the contextual information of the DMA device
> + *   - <0: Error code returned by the driver info get function.
> + *
> + */
> +__rte_experimental
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
> +
> +/**
> + * dma_address_type
> + */
> +enum dma_address_type {
> +	DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> +	DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> +};
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +	enum dma_address_type addr_type; /**< Address type to used */
> +	uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> +	uint16_t max_vqs; /**< Maximum number of virt queues to use */

Is it the total or per HW queue? Please clarify in the
documentation.

> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Configure a DMA device.
> + *
> + * This function must be invoked first before any other function in the
> + * API. This function can also be re-invoked when a device is in the
> + * stopped state.
> + *
> + * The caller may use rte_dmadev_info_get() to get the capability of each
> + * resources available for this DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device to configure.
> + * @param dev_conf
> + *   The DMA device configuration structure encapsulated into rte_dmadev_conf
> + *   object.
> + *
> + * @return
> + *   - =0: Success, device configured.
> + *   - <0: Error code returned by the driver configuration function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Start a DMA device.
> + *
> + * The device start step is the last one and consists of setting the DMA
> + * to start accepting jobs.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device started.
> + *   - <0: Error code returned by the driver start function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_start(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Stop a DMA device.
> + *
> + * The device can be restarted with a call to rte_dmadev_start()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device stopped.
> + *   - <0: Error code returned by the driver stop function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stop(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Close a DMA device.
> + *
> + * The device cannot be restarted after this call.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *  - =0: Successfully closing device
> + *  - <0: Failure to close device
> + */
> +__rte_experimental
> +int
> +rte_dmadev_close(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset a DMA device.
> + *
> + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
> + * sense similar to hard or soft reset.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Successful reset device.
> + *   - <0: Failure to reset device.
> + *   - (-ENOTSUP): If the device doesn't support this function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_reset(uint16_t dev_id);
> +
> +/**
> + * dma_transfer_direction

Such comments must be avoided.

> + */
> +enum dma_transfer_direction {
> +	DMA_MEM_TO_MEM,
> +	DMA_MEM_TO_DEV,
> +	DMA_DEV_TO_MEM,
> +	DMA_DEV_TO_DEV,
> +};
> +
> +/**
> + * A structure used to configure a DMA virt queue.
> + */
> +struct rte_dmadev_queue_conf {
> +	enum dma_transfer_direction direction;
> +	/**< Associated transfer direction */

Please, put comments before the code if comment is in a
separate line.


> +	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */

How does the caller know which HW queue ID to use?
Which one should be chosen? Maybe the PMD should decide
which HW queue to use? Queue configuration can
provide hints to make the right choice: required
capabilities, relation to another virtual queue, etc.

> +	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +	uint64_t dev_flags; /**< Device specific flags */
> +	void *dev_ctx; /**< Device specific context */

Device-specific flags and context sound bad and add vendor
specifics to the API. If so, it could be very hard to switch from
vendor to vendor. Am I misunderstanding?

> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Allocate and set up a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param conf
> + *   The queue configuration structure encapsulated into rte_dmadev_queue_conf
> + *   object.
> + *
> + * @return
> + *   - >=0: Allocate virt queue success, it is virt queue id.
> + *   - <0: Error code returned by the driver queue setup function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +		       const struct rte_dmadev_queue_conf *conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue which return by queue setup.
> + *
> + * @return
> + *   - =0: Successful release the virt queue.
> + *   - <0: Error code returned by the driver queue release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id);
> +
> +/**
> + * A structure used to retrieve information of a DMA virt queue.
> + */
> +struct rte_dmadev_queue_info {
> +	enum dma_transfer_direction direction;
> +	/**< Associated transfer direction */

Please, put comments before the code if comment is in a
separate line.

> +	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> +	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +	uint64_t dev_flags; /**< Device specific flags */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve information of a DMA virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue which return by queue setup.
> + * @param[out] info
> + *   The queue info structure encapsulated into rte_dmadev_queue_info object.
> + *
> + * @return
> + *   - =0: Successful retrieve information.
> + *   - <0: Error code returned by the driver queue info get function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +			  struct rte_dmadev_queue_info *info);
> +
> +#include "rte_dmadev_core.h"
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a copy operation onto the DMA virt queue.
> + *
> + * This queues up a copy operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param src
> + *   The address of the source buffer.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the data to be copied.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,
> +		uint32_t length, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->copy)(dev, vq_id, src, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list copy operation onto the DMA virt queue.
> + *
> + * This queues up a scatter list copy operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> +		   const struct dma_scatterlist *sg,
> +		   uint32_t sg_len, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the DMA virt queue
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +		void *dst, uint32_t length, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list fill operation onto the DMA virt queue
> + *
> + * This queues up a scatter list fill operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +		   const struct dma_scatterlist *sg, uint32_t sg_len,
> +		   uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->fill_sg)(dev, vq_id, pattern, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Add a fence to force ordering between operations
> + *
> + * This adds a fence to a sequence of operations to enforce ordering, such that
> + * all operations enqueued before the fence must be completed before operations
> + * after the fence.
> + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> + * this API may not function correctly when called immediately after an
> + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful add fence.
> + *   - <0: Failure to add fence.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->fence)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->perform)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successful completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] cookie
> + *   The last completed operation's cookie.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *
> + * @return
> + *   The number of operations that successful completed.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> +		     dma_cookie_t *cookie, bool *has_error)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	has_error = false;
> +	return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_status
> + *   Indicates the size of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + * @param[out] cookie
> + *   The last failed completed operation's cookie.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> +			   const uint16_t nb_status, uint32_t *status,
> +			   dma_cookie_t *cookie)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> +}
> +
> +struct rte_dmadev_stats {
> +	uint64_t enqueue_fail_count;
> +	/**< Count of all operations which failed enqueued */

Please, put comments before the code.

> +	uint64_t enqueued_count;
> +	/**< Count of all operations which successful enqueued */
> +	uint64_t completed_fail_count;
> +	/**< Count of all operations which failed to complete */
> +	uint64_t completed_count;
> +	/**< Count of all operations which successful complete */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve basic statistics of a or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + * @param[out] stats
> + *   The basic statistics structure encapsulated into rte_dmadev_stats
> + *   object.
> + *
> + * @return
> + *   - =0: Successful retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +		     struct rte_dmadev_stats *stats);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset basic statistics of a or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + *
> + * @return
> + *   - =0: Successful reset stats.
> + *   - <0: Failure to reset stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
> +
> +/** Maximum name length for extended statistics counters */
> +#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
> +
> +/**
> + * A name-key lookup element for extended statistics.
> + *
> + * This structure is used to map between names and ID numbers
> + * for extended dmadev statistics.
> + */
> +struct rte_dmadev_xstats_name {
> +	char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve names of extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param[out] xstats_names
> + *   Block of memory to insert names into. Must be at least size in capacity.
> + *   If set to NULL, function returns required capacity.
> + * @param size
> + *   Capacity of xstats_names (number of names).
> + * @return
> + *   - positive value lower or equal to size: success. The return value
> + *     is the number of entries filled in the stats table.
> + *   - positive value higher than size: error, the given statistics table
> + *     is too small. The return value corresponds to the size that should
> + *     be given to succeed. The entries in the table are not valid and
> + *     shall not be used by the caller.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +			    struct rte_dmadev_xstats_name *xstats_names,
> +			    uint32_t size);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   The id numbers of the stats to get. The ids can be got from the stat
> + *   position in the stat list from rte_dmadev_xstats_names_get().
> + * @param[out] values
> + *   The values for each stats request by ID.
> + * @param n
> + *   The number of stats requested.
> + *
> + * @return
> + *   - positive value: number of stat entries filled into the values array.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +		      uint64_t values[], uint32_t n);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset the values of the xstats of the selected component in the device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   Selects specific statistics to be reset. When NULL, all statistics
> + *   will be reset. If non-NULL, must point to array of at least
> + *   *nb_ids* size.
> + * @param nb_ids
> + *   The number of ids available from the *ids* array. Ignored when ids is NULL.
> + *
> + * @return
> + *   - zero: successfully reset the statistics to zero.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger the dmadev self test.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - 0: Selftest successful.
> + *   - -ENOTSUP if the device doesn't support selftest
> + *   - other values < 0 on failure.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_selftest(uint16_t dev_id);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_H_ */
> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..a3afea2
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types. But they are still part of the
> + * public API because they are used by inline public functions.

Do we really want it? Anyway rte_dmadev must not be here.
Some sub-structure could be, but not entire rte_dmadev.

> + */
> +
> +struct rte_dmadev;
> +
> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				      void *src, void *dst,
> +				      uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a copy operation. */

Avoid comments after the code on a separate line. Move them to be
before the code.

> +
> +typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +					 const struct dma_scatterlist *sg,
> +					 uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list copy operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				      uint64_t pattern, void *dst,
> +				      uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a fill operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +			uint64_t pattern, const struct dma_scatterlist *sg,
> +			uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list fill operation. */
> +
> +typedef int (*dmadev_fence_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to add a fence ordering between operations. */
> +
> +typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to trigger hardware to begin performing. */
> +
> +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				       const uint16_t nb_cpls,
> +				       dma_cookie_t *cookie, bool *has_error);
> +/**< @internal Function used to return number of successful completed operations */
> +
> +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> +			uint16_t vq_id, const uint16_t nb_status,
> +			uint32_t *status, dma_cookie_t *cookie);
> +/**< @internal Function used to return number of failed completed operations */
> +
> +#define RTE_DMADEV_NAME_MAX_LEN	64 /**< Max length of name of DMA PMD */
> +
> +struct rte_dmadev_ops;
> +
> +/**
> + * The data structure associated with each DMA device.
> + */
> +struct rte_dmadev {
> +	/**< Enqueue a copy operation onto the DMA device. */

Comment before code should start from /** (not /**< ).

> +	dmadev_copy_t copy;
> +	/**< Enqueue a scatter list copy operation onto the DMA device. */
> +	dmadev_copy_sg_t copy_sg;
> +	/**< Enqueue a fill operation onto the DMA device. */
> +	dmadev_fill_t fill;
> +	/**< Enqueue a scatter list fill operation onto the DMA device. */
> +	dmadev_fill_sg_t fill_sg;
> +	/**< Add a fence to force ordering between operations. */
> +	dmadev_fence_t fence;
> +	/**< Trigger hardware to begin performing enqueued operations. */
> +	dmadev_perform_t perform;
> +	/**< Returns the number of operations that successful completed. */
> +	dmadev_completed_t completed;
> +	/**< Returns the number of operations that failed to complete. */
> +	dmadev_completed_fails_t completed_fails;
> +
> +	void *dev_private; /**< PMD-specific private data */
> +	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> +
> +	uint16_t dev_id; /**< Device ID for this instance */
> +	int socket_id; /**< Socket ID where memory is allocated */
> +	struct rte_device *device;
> +	/**< Device info. supplied during device initialization */

Please, put comments before the code if comment is in a
separate line.

> +	const char *driver_name; /**< Driver info. supplied by probing */
> +	char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> +
> +	RTE_STD_C11
> +	uint8_t attached : 1; /**< Flag indicating the device is attached */
> +	uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
> +#endif /* _RTE_DMADEV_CORE_H_ */
> diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h

Let's remove rte_ prefix from DPDK internal headers.

> new file mode 100644
> index 0000000..ef03cf7
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_pmd.h
> @@ -0,0 +1,210 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_PMD_H_
> +#define _RTE_DMADEV_PMD_H_
> +
> +/** @file
> + * RTE DMA PMD APIs
> + *
> + * @note
> + * Driver facing APIs for a DMA device. These are not to be called directly by
> + * any application.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <string.h>
> +
> +#include <rte_dev.h>
> +#include <rte_log.h>
> +#include <rte_common.h>
> +
> +#include "rte_dmadev.h"
> +
> +extern int libdmadev_logtype;
> +
> +#define RTE_DMADEV_LOG(level, fmt, args...) \

Do we need RTE_ prefix for internal API?

> +	rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
> +		__func__, ##args)
> +
> +/* Macros to check for valid device */
> +#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +		return retval; \
> +	} \
> +} while (0)
> +
> +#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +		return; \
> +	} \
> +} while (0)
> +
> +#define RTE_DMADEV_DETACHED  0
> +#define RTE_DMADEV_ATTACHED  1

Do we really need RTE_ prefix for internal defines?

> +
> +/**
> + * Validate if the DMA device index is a valid attached DMA device.
> + *
> + * @param dev_id
> + *   DMA device index.
> + *
> + * @return
> + *   - If the device index is valid (1) or not (0).
> + */
> +static inline unsigned

'unsigned int', but it sounds like the function should return
'bool'.

> +rte_dmadev_pmd_is_valid_dev(uint16_t dev_id)

Again, do we really need rte_ prefix for internal functions?

> +{
> +	struct rte_dmadev *dev;
> +
> +	if (dev_id >= RTE_DMADEV_MAX_DEVS)
> +		return 0;
> +
> +	dev = &rte_dmadevices[dev_id];
> +	if (dev->attached != RTE_DMADEV_ATTACHED)
> +		return 0;
> +	else
> +		return 1;
> +}
> +
> +/**
> + * Definitions of control-plane functions exported by a driver through the
> + * generic structure of type *rte_dmadev_ops* supplied in the *rte_dmadev*
> + * structure associated with a device.
> + */
> +
> +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +				 struct rte_dmadev_info *dev_info);
> +/**< @internal Function used to get device information of a device. */

Let's not use documentation after code on a separate line in
new code.

> +
> +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> +				  const struct rte_dmadev_conf *dev_conf);
> +/**< @internal Function used to configure a device. */
> +
> +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to start a configured device. */
> +
> +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to stop a configured device. */
> +
> +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to close a configured device. */
> +
> +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to reset a configured device. */
> +
> +typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
> +				    const struct rte_dmadev_queue_conf *conf);
> +/**< @internal Function used to allocate and set up a virt queue. */
> +
> +typedef int (*dmadev_queue_release_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to release a virt queue. */
> +
> +typedef int (*dmadev_queue_info_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				   struct rte_dmadev_queue_info *info);
> +/**< @internal Function used to retrieve information of a virt queue. */
> +
> +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
> +				  struct rte_dmadev_stats *stats);
> +/**< @internal Function used to retrieve basic statistics. */
> +
> +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
> +/**< @internal Function used to reset basic statistics. */
> +
> +typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev *dev,
> +		struct rte_dmadev_xstats_name *xstats_names,
> +		uint32_t size);
> +/**< @internal Function used to get names of extended stats. */
> +
> +typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
> +		const uint32_t ids[], uint64_t values[], uint32_t n);
> +/**< @internal Function used to retrieve extended stats. */
> +
> +typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
> +				     const uint32_t ids[], uint32_t nb_ids);
> +/**< @internal Function used to reset extended stats. */

Do we really need both stats and xstats from the very
beginning? I think it is better to start from just
generic stats and add xstats when it is really required.

> +
> +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> +/**< @internal Function used to start dmadev selftest. */
> +
> +/** DMA device operations function pointer table */
> +struct rte_dmadev_ops {

Do we need rte_ prefix for internal data types?

> +	/**< Get device info. */
> +	dmadev_info_get_t dev_info_get;
> +	/**< Configure device. */
> +	dmadev_configure_t dev_configure;
> +	/**< Start device. */
> +	dmadev_start_t dev_start;
> +	/**< Stop device. */
> +	dmadev_stop_t dev_stop;
> +	/**< Close device. */
> +	dmadev_close_t dev_close;
> +	/**< Reset device. */
> +	dmadev_reset_t dev_reset;
> +
> +	/**< Allocate and set up a virt queue. */
> +	dmadev_queue_setup_t queue_setup;
> +	/**< Release a virt queue. */
> +	dmadev_queue_release_t queue_release;
> +	/**< Retrieve information of a virt queue */
> +	dmadev_queue_info_t queue_info_get;
> +
> +	/**< Get basic statistics. */
> +	dmadev_stats_get_t stats_get;
> +	/**< Reset basic statistics. */
> +	dmadev_stats_reset_t stats_reset;
> +	/**< Get names of extended stats. */
> +	dmadev_xstats_get_names_t xstats_get_names;
> +	/**< Get extended statistics. */
> +	dmadev_xstats_get_t xstats_get;
> +	/**< Reset extended statistics values. */
> +	dmadev_xstats_reset_t xstats_reset;
> +
> +	/**< Device selftest function */
> +	dmadev_selftest_t dev_selftest;
> +};
> +
> +/**
> + * Allocates a new dmadev slot for an DMA device and returns the pointer
> + * to that slot for the driver to use.
> + *
> + * @param name
> + *   Unique identifier name for each device
> + * @param dev_private_size
> + *   Size of private data memory allocated within rte_dmadev object.
> + *   Set to 0 to disable internal memory allocation and allow for
> + *   self-allocation.
> + * @param socket_id
> + *   Socket to allocate resources on.

It should be numa_node. See recent mails from Thomas M.

> + *
> + * @return
> + *   - NULL: Failure to allocate
> + *   - Other: The rte_dmadev structure pointer for the new device
> + */
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_private_size,
> +			int socket_id);
> +
> +/**
> + * Release the specified dmadev device.

"dmadev device" sounds strange. Maybe "DMA device"?

> + *
> + * @param dev
> + *   The *dmadev* pointer is the address of the *rte_dmadev* structure.
> + *
> + * @return
> + *   - 0 on success, negative on error
> + */
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_PMD_H_ */

[snip]

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
                   ` (2 preceding siblings ...)
  2021-07-04 14:57 ` Andrew Rybchenko
@ 2021-07-04 15:21 ` Matan Azrad
  2021-07-06  6:25   ` fengchengwen
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                   ` (25 subsequent siblings)
  29 siblings, 1 reply; 339+ messages in thread
From: Matan Azrad @ 2021-07-04 15:21 UTC (permalink / raw)
  To: Chengwen Feng, NBU-Contact-Thomas Monjalon, ferruh.yigit,
	bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma



From: Chengwen Feng
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?

> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
>  MAINTAINERS                  |   4 +
>  config/rte_config.h          |   3 +
>  lib/dmadev/meson.build       |   6 +
>  lib/dmadev/rte_dmadev.c      | 438 +++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 919
> +++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h |  98 +++++
>  lib/dmadev/rte_dmadev_pmd.h  | 210 ++++++++++
>  lib/dmadev/version.map       |  32 ++
>  lib/meson.build              |   1 +
>  9 files changed, 1711 insertions(+)
>  create mode 100644 lib/dmadev/meson.build
>  create mode 100644 lib/dmadev/rte_dmadev.c
>  create mode 100644 lib/dmadev/rte_dmadev.h
>  create mode 100644 lib/dmadev/rte_dmadev_core.h
>  create mode 100644 lib/dmadev/rte_dmadev_pmd.h
>  create mode 100644 lib/dmadev/version.map
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..2019783 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
> 
> +Dma device API
> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
> 
>  Memory Pool Drivers
>  -------------------
> diff --git a/config/rte_config.h b/config/rte_config.h
> index 590903c..331a431 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -81,6 +81,9 @@
>  /* rawdev defines */
>  #define RTE_RAWDEV_MAX_DEVS 64
> 
> +/* dmadev defines */
> +#define RTE_DMADEV_MAX_DEVS 64
> +
>  /* ip_fragmentation defines */
>  #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
>  #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
> diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
> new file mode 100644
> index 0000000..c918dae
> --- /dev/null
> +++ b/lib/dmadev/meson.build
> @@ -0,0 +1,6 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2021 HiSilicon Limited.
> +
> +sources = files('rte_dmadev.c')
> +headers = files('rte_dmadev.h', 'rte_dmadev_pmd.h')
> +indirect_headers += files('rte_dmadev_core.h')
> diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
> new file mode 100644
> index 0000000..a94e839
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,438 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stdint.h>
> +
> +#include <rte_log.h>
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_string_fns.h>
> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
> +
> +uint16_t
> +rte_dmadev_count(void)
> +{
> +       uint16_t count = 0;
> +       uint16_t i;
> +
> +       for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +               if (rte_dmadevices[i].attached)
> +                       count++;
> +       }
> +
> +       return count;
> +}
> +
> +int
> +rte_dmadev_get_dev_id(const char *name)
> +{
> +       uint16_t i;
> +
> +       if (name == NULL)
> +               return -EINVAL;
> +
> +       for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
> +               if ((strcmp(rte_dmadevices[i].name, name) == 0) &&
> +                   (rte_dmadevices[i].attached == RTE_DMADEV_ATTACHED))
> +                       return i;
> +
> +       return -ENODEV;
> +}
> +
> +int
> +rte_dmadev_socket_id(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       dev = &rte_dmadevices[dev_id];
> +
> +       return dev->socket_id;
> +}
> +
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -
> ENOTSUP);
> +
> +       memset(dev_info, 0, sizeof(struct rte_dmadev_info));
> +       diag = (*dev->dev_ops->dev_info_get)(dev, dev_info);
> +       if (diag != 0)
> +               return diag;
> +
> +       dev_info->device = dev->device;
> +       dev_info->driver_name = dev->driver_name;
> +       dev_info->socket_id = dev->socket_id;
> +
> +       return 0;
> +}
> +
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf
> *dev_conf)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -
> ENOTSUP);
> +
> +       if (dev->started) {
> +               RTE_DMADEV_LOG(ERR,
> +                  "device %u must be stopped to allow configuration", dev_id);
> +               return -EBUSY;
> +       }
> +
> +       diag = (*dev->dev_ops->dev_configure)(dev, dev_conf);
> +       if (diag != 0)
> +               RTE_DMADEV_LOG(ERR, "device %u dev_configure failed, ret =
> %d",
> +                              dev_id, diag);
> +       else
> +               dev->attached = 1;
> +
> +       return diag;
> +}
> +
> +int
> +rte_dmadev_start(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +       if (dev->started != 0) {
> +               RTE_DMADEV_LOG(ERR, "device %u already started", dev_id);
> +               return 0;
> +       }
> +
> +       if (dev->dev_ops->dev_start == NULL)
> +               goto mark_started;
> +
> +       diag = (*dev->dev_ops->dev_start)(dev);
> +       if (diag != 0)
> +               return diag;
> +
> +mark_started:
> +       dev->started = 1;
> +       return 0;
> +}
> +
> +int
> +rte_dmadev_stop(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       if (dev->started == 0) {
> +               RTE_DMADEV_LOG(ERR, "device %u already stopped", dev_id);
> +               return 0;
> +       }
> +
> +       if (dev->dev_ops->dev_stop == NULL)
> +               goto mark_stopped;
> +
> +       diag = (*dev->dev_ops->dev_stop)(dev);
> +       if (diag != 0)
> +               return diag;
> +
> +mark_stopped:
> +       dev->started = 0;
> +       return 0;
> +}
> +
> +int
> +rte_dmadev_close(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
> +
> +       /* Device must be stopped before it can be closed */
> +       if (dev->started == 1) {
> +               RTE_DMADEV_LOG(ERR, "device %u must be stopped before
> closing",
> +                              dev_id);
> +               return -EBUSY;
> +       }
> +
> +       return (*dev->dev_ops->dev_close)(dev);
> +}
> +
> +int
> +rte_dmadev_reset(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
> +
> +       /* Reset is not dependent on state of the device */
> +       return (*dev->dev_ops->dev_reset)(dev);
> +}
> +
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +                      const struct rte_dmadev_queue_conf *conf)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->queue_setup)(dev, conf);
> +}
> +
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->queue_release)(dev, vq_id);
> +}
> +
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +                         struct rte_dmadev_queue_info *info)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(info, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_info_get, -
> ENOTSUP);
> +
> +       memset(info, 0, sizeof(struct rte_dmadev_queue_info));
> +       return (*dev->dev_ops->queue_info_get)(dev, vq_id, info);
> +}
> +
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +                    struct rte_dmadev_stats *stats)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
> +
> +       return (*dev->dev_ops->stats_get)(dev, vq_id, stats);
> +}
> +
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->stats_reset)(dev, vq_id);
> +}
> +
> +static int
> +xstats_get_count(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
> +}
> +
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +                           struct rte_dmadev_xstats_name *xstats_names,
> +                           uint32_t size)
> +{
> +       struct rte_dmadev *dev;
> +       int cnt_expected_entries;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       cnt_expected_entries = xstats_get_count(dev_id);
> +
> +       if (xstats_names == NULL || cnt_expected_entries < 0 ||
> +           (int)size < cnt_expected_entries || size == 0)
> +               return cnt_expected_entries;
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -
> ENOTSUP);
> +       return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
> +}
> +
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +                     uint64_t values[], uint32_t n)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
> +}
> +
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t
> nb_ids)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
> +}
> +
> +int
> +rte_dmadev_selftest(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->dev_selftest)(dev_id);
> +}
> +
> +static inline uint16_t
> +rte_dmadev_find_free_device_index(void)
> +{
> +       uint16_t i;
> +
> +       for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +               if (rte_dmadevices[i].attached == RTE_DMADEV_DETACHED)
> +                       return i;
> +       }
> +
> +       return RTE_DMADEV_MAX_DEVS;
> +}
> +
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int
> socket_id)
> +{
> +       struct rte_dmadev *dev;
> +       uint16_t dev_id;
> +
> +       if (rte_dmadev_get_dev_id(name) >= 0) {
> +               RTE_DMADEV_LOG(ERR,
> +                       "device with name %s already allocated!", name);
> +               return NULL;
> +       }
> +
> +       dev_id = rte_dmadev_find_free_device_index();
> +       if (dev_id == RTE_DMADEV_MAX_DEVS) {
> +               RTE_DMADEV_LOG(ERR, "reached maximum number of DMA
> devices");
> +               return NULL;
> +       }
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       if (dev_priv_size > 0) {
> +               dev->dev_private = rte_zmalloc_socket("dmadev private",
> +                                    dev_priv_size,
> +                                    RTE_CACHE_LINE_SIZE,
> +                                    socket_id);
> +               if (dev->dev_private == NULL) {
> +                       RTE_DMADEV_LOG(ERR,
> +                               "unable to allocate memory for dmadev");
> +                       return NULL;
> +               }
> +       }
> +
> +       dev->dev_id = dev_id;
> +       dev->socket_id = socket_id;
> +       dev->started = 0;
> +       strlcpy(dev->name, name, RTE_DMADEV_NAME_MAX_LEN);
> +
> +       dev->attached = RTE_DMADEV_ATTACHED;
> +
> +       return dev;
> +}
> +
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev)
> +{
> +       int ret;
> +
> +       if (dev == NULL)
> +               return -EINVAL;
> +
> +       ret = rte_dmadev_close(dev->dev_id);
> +       if (ret != 0)
> +               return ret;
> +
> +       if (dev->dev_private != NULL)
> +               rte_free(dev->dev_private);
> +
> +       memset(dev, 0, sizeof(struct rte_dmadev));
> +       dev->attached = RTE_DMADEV_DETACHED;
> +
> +       return 0;
> +}
> +
> +RTE_LOG_REGISTER(libdmadev_logtype, lib.dmadev, INFO);
> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..f74fc6a
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,919 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The generic DMA device diagram:
> + *
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                  ----------------
> + *                  |dma-controller|
> + *                  ----------------
> + *
> + *   The DMA could have multiple HW-queues, each HW-queue could have
> multiple
> + *   capabilities, e.g. whether to support fill operation, supported DMA
> + *   transfter direction and etc.
> + *
> + * The DMA framework is built on the following abstraction model:
> + *
> + *     ------------    ------------
> + *     |virt-queue|    |virt-queue|
> + *     ------------    ------------
> + *            \           /
> + *             \         /
> + *              \       /
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                     ----------
> + *                     | dmadev |
> + *                     ----------
> + *
> + *   a) The DMA operation request must be submitted to the virt queue, virt
> + *      queues must be created based on HW queues, the DMA device could
> have
> + *      multiple HW queues.
> + *   b) The virt queues on the same HW-queue could represent different
> contexts,
> + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> + *      mem-to-dev transfer scenario.
> + *   NOTE: user could also create multiple virt queues for mem-to-mem
> transfer
> + *         scenario as long as the corresponding driver supports.
> + *
> + * The control plane APIs include
> configure/queue_setup/queue_release/start/
> + * stop/reset/close, in order to start device work, the call sequence must be
> + * as follows:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_queue_setup()
> + *     - rte_dmadev_start()
> + *
> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_fence()   - add a fence force ordering between
> operations
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virt queues which have different
> + *      contexts.
> + *      The first four APIs are used to submit the operation request to the virt
> + *      queue, if the submission is successful, a cookie (as type
> + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest,
> provide
> + * information query and self-test capabilities.
> + *
> + * About the dataplane APIs MT-safe, there are two dimensions:
> + *   a) For one virt queue, the submit/completion API could be MT-safe,
> + *      e.g. one thread do submit operation, another thread do completion
> + *      operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> + *      operation on virt-queue-0, another thread do operation on virt-queue-
> 1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_compat.h>
> +
> +/**
> + * dma_cookie_t - an opaque DMA cookie
> + *
> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> + * code.
> + * When using cookies, comply with the following rules:
> + * a) Cookies for each virtual queue are independent.
> + * b) For a virt queue, the cookie are monotonically incremented, when it
> reach
> + *    the INT_MAX, it wraps back to zero.
> + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> + *    reset, the virt queue's cookie needs to be reset to zero.
> + * Example:
> + *    step-1: start one dmadev
> + *    step-2: enqueue a copy operation, the cookie return is 0
> + *    step-3: enqueue a copy operation again, the cookie return is 1
> + *    ...
> + *    step-101: stop the dmadev
> + *    step-102: start the dmadev
> + *    step-103: enqueue a copy operation, the cookie return is 0
> + *    ...
> + */
> +typedef int32_t dma_cookie_t;
> +
> +/**
> + * dma_scatterlist - can hold scatter DMA operation request
> + */
> +struct dma_scatterlist {
> +       void *src;
> +       void *dst;
> +       uint32_t length;
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the total number of DMA devices that have been successfully
> + * initialised.
> + *
> + * @return
> + *   The total number of usable DMA devices.
> + */
> +__rte_experimental
> +uint16_t
> +rte_dmadev_count(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the device identifier for the named DMA device.
> + *
> + * @param name
> + *   DMA device name to select the DMA device identifier.
> + *
> + * @return
> + *   Returns DMA device identifier on success.
> + *   - <0: Failure to find named DMA device.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_get_dev_id(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Return the NUMA socket to which a device is connected.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   The NUMA socket id to which the device is connected or
> + *   a default of zero if the socket could not be determined.
> + *   - -EINVAL: dev_id value is out of range.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_socket_id(uint16_t dev_id);
> +
> +/**
> + * The capabilities of a DMA device
> + */
> +#define RTE_DMA_DEV_CAPA_M2M   (1ull << 0) /**< Support mem-to-
> mem transfer */
> +#define RTE_DMA_DEV_CAPA_M2D   (1ull << 1) /**< Support mem-to-dev
> transfer */
> +#define RTE_DMA_DEV_CAPA_D2M   (1ull << 2) /**< Support dev-to-mem
> transfer */
> +#define RTE_DMA_DEV_CAPA_D2D   (1ull << 3) /**< Support dev-to-dev
> transfer */
> +#define RTE_DMA_DEV_CAPA_COPY  (1ull << 4) /**< Support copy ops */
> +#define RTE_DMA_DEV_CAPA_FILL  (1ull << 5) /**< Support fill ops */
> +#define RTE_DMA_DEV_CAPA_SG    (1ull << 6) /**< Support scatter-gather
> ops */
> +#define RTE_DMA_DEV_CAPA_FENCE (1ull << 7) /**< Support fence ops */
> +#define RTE_DMA_DEV_CAPA_IOVA  (1ull << 8) /**< Support IOVA as
> DMA address */
> +#define RTE_DMA_DEV_CAPA_VA    (1ull << 9) /**< Support VA as DMA
> address */
> +#define RTE_DMA_DEV_CAPA_MT_VQ (1ull << 10) /**< Support MT-safe
> of one virt queue */
> +#define RTE_DMA_DEV_CAPA_MT_MVQ        (1ull << 11) /**< Support MT-
> safe of multiple virt queues */
> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +       /**
> +        * Fields filled by framewok
> +        */
> +       struct rte_device *device; /**< Generic Device information */
> +       const char *driver_name; /**< Device driver name */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +
> +       /**
> +        * Specification fields filled by driver
> +        */
> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> +       uint16_t max_vqs_per_hw_queue;
> +       /**< Maximum number of virt queues to allocate per HW queue */
> +       uint16_t max_desc;
> +       /**< Maximum allowed number of virt queue descriptors */
> +       uint16_t min_desc;
> +       /**< Minimum allowed number of virt queue descriptors */
> +
> +       /**
> +        * Status fields filled by driver
> +        */
> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> +       uint16_t nb_vqs; /**< Number of virt queues configured */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve the contextual information of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @param[out] dev_info
> + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> + *   contextual information of the device.
> + * @return
> + *   - =0: Success, driver updates the contextual information of the DMA
> device
> + *   - <0: Error code returned by the driver info get function.
> + *
> + */
> +__rte_experimental
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info
> *dev_info);
> +
> +/**
> + * dma_address_type
> + */
> +enum dma_address_type {
> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> +};
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +       enum dma_address_type addr_type; /**< Address type to use */
> +       uint16_t nb_hw_queues; /**< Number of HW queues to enable */
> +       uint16_t max_vqs; /**< Maximum number of virt queues to use */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Configure a DMA device.
> + *
> + * This function must be invoked first before any other function in the
> + * API. This function can also be re-invoked when a device is in the
> + * stopped state.
> + *
> + * The caller may use rte_dmadev_info_get() to get the capability of each
> + * resources available for this DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device to configure.
> + * @param dev_conf
> + *   The DMA device configuration structure encapsulated into
> rte_dmadev_conf
> + *   object.
> + *
> + * @return
> + *   - =0: Success, device configured.
> + *   - <0: Error code returned by the driver configuration function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf
> *dev_conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Start a DMA device.
> + *
> + * The device start step is the last one and consists of setting the DMA
> + * to start accepting jobs.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device started.
> + *   - <0: Error code returned by the driver start function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_start(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Stop a DMA device.
> + *
> + * The device can be restarted with a call to rte_dmadev_start()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device stopped.
> + *   - <0: Error code returned by the driver stop function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stop(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Close a DMA device.
> + *
> + * The device cannot be restarted after this call.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *  - =0: Successfully closing device
> + *  - <0: Failure to close device
> + */
> +__rte_experimental
> +int
> +rte_dmadev_close(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset a DMA device.
> + *
> + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in
> the
> + * sense similar to hard or soft reset.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Successful reset device.
> + *   - <0: Failure to reset device.
> + *   - (-ENOTSUP): If the device doesn't support this function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_reset(uint16_t dev_id);
> +
> +/**
> + * dma_transfer_direction
> + */
> +enum dma_transfer_direction {
> +       DMA_MEM_TO_MEM,
> +       DMA_MEM_TO_DEV,
> +       DMA_DEV_TO_MEM,
> +       DMA_DEV_TO_DEV,
> +};
> +
> +/**
> + * A structure used to configure a DMA virt queue.
> + */
> +struct rte_dmadev_queue_conf {
> +       enum dma_transfer_direction direction;
> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt
> queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */
> +       void *dev_ctx; /**< Device specific context */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Allocate and set up a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param conf
> + *   The queue configuration structure encapsulated into
> rte_dmadev_queue_conf
> + *   object.
> + *
> + * @return
> + *   - >=0: Allocate virt queue success, it is virt queue id.
> + *   - <0: Error code returned by the driver queue setup function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +                      const struct rte_dmadev_queue_conf *conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of the virt queue, as returned by queue setup.
> + *
> + * @return
> + *   - =0: Successful release the virt queue.
> + *   - <0: Error code returned by the driver queue release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id);
> +
> +/**
> + * A structure used to retrieve information of a DMA virt queue.
> + */
> +struct rte_dmadev_queue_info {
> +       enum dma_transfer_direction direction;
> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt
> queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve information of a DMA virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of the virt queue, as returned by queue setup.
> + * @param[out] info
> + *   The queue info structure encapsulated into rte_dmadev_queue_info
> object.
> + *
> + * @return
> + *   - =0: Successful retrieve information.
> + *   - <0: Error code returned by the driver queue info get function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +                         struct rte_dmadev_queue_info *info);
> +
> +#include "rte_dmadev_core.h"
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a copy operation onto the DMA virt queue.
> + *
> + * This queues up a copy operation to be performed by hardware, but does
> not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param src
> + *   The address of the source buffer.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the data to be copied.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,

Did you consider also mbuf API usage for memory descriptor?

> +               uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->copy)(dev, vq_id, src, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list copy operation onto the DMA virt queue.
> + *
> + * This queues up a scatter list copy operation to be performed by
> hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> +                  const struct dma_scatterlist *sg,
> +                  uint32_t sg_len, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the DMA virt queue
> + *
> + * This queues up a fill operation to be performed by hardware, but does
> not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +               void *dst, uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list fill operation onto the DMA virt queue
> + *
> + * This queues up a scatter list fill operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +                  const struct dma_scatterlist *sg, uint32_t sg_len,
> +                  uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fill_sg)(dev, vq_id, pattern, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Add a fence to force ordering between operations
> + *
> + * This adds a fence to a sequence of operations to enforce ordering, such
> that
> + * all operations enqueued before the fence must be completed before
> operations
> + * after the fence.
> + * NOTE: Since this fence may be added as a flag to the last operation
> enqueued,
> + * this API may not function correctly when called immediately after an
> + * "rte_dmadev_perform" call i.e. before any new operations are
> enqueued.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful add fence.
> + *   - <0: Failure to add fence.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fence)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->perform)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successfully completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] cookie
> + *   The last completed operation's cookie.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *
> + * @return
> + *   The number of operations that successfully completed.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t
> nb_cpls,
> +                    dma_cookie_t *cookie, bool *has_error)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       has_error = false;
> +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API is to be called when rte_dmadev_completed has_error is
> set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_status
> + *   Indicates the size of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + * @param[out] cookie
> + *   The last failed completed operation's cookie.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> +                          const uint16_t nb_status, uint32_t *status,
> +                          dma_cookie_t *cookie)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> +}
> +
> +struct rte_dmadev_stats {
> +       uint64_t enqueue_fail_count;
> +       /**< Count of all operations which failed to be enqueued */
> +       uint64_t enqueued_count;
> +       /**< Count of all operations which successful enqueued */
> +       uint64_t completed_fail_count;
> +       /**< Count of all operations which failed to complete */
> +       uint64_t completed_count;
> +       /**< Count of all operations which successful complete */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve basic statistics of one or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + * @param[out] stats
> + *   The basic statistics structure encapsulated into rte_dmadev_stats
> + *   object.
> + *
> + * @return
> + *   - =0: Successful retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +                    struct rte_dmadev_stats *stats);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset basic statistics of one or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + *
> + * @return
> + *   - =0: Successful reset stats.
> + *   - <0: Failure to reset stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
> +
> +/** Maximum name length for extended statistics counters */
> +#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
> +
> +/**
> + * A name-key lookup element for extended statistics.
> + *
> + * This structure is used to map between names and ID numbers
> + * for extended dmadev statistics.
> + */
> +struct rte_dmadev_xstats_name {
> +       char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve names of extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param[out] xstats_names
> + *   Block of memory to insert names into. Must be at least size in capacity.
> + *   If set to NULL, function returns required capacity.
> + * @param size
> + *   Capacity of xstats_names (number of names).
> + * @return
> + *   - positive value lower or equal to size: success. The return value
> + *     is the number of entries filled in the stats table.
> + *   - positive value higher than size: error, the given statistics table
> + *     is too small. The return value corresponds to the size that should
> + *     be given to succeed. The entries in the table are not valid and
> + *     shall not be used by the caller.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +                           struct rte_dmadev_xstats_name *xstats_names,
> +                           uint32_t size);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   The id numbers of the stats to get. The ids can be got from the stat
> + *   position in the stat list from rte_dmadev_xstats_names_get().
> + * @param[out] values
> + *   The values for each stats request by ID.
> + * @param n
> + *   The number of stats requested.
> + *
> + * @return
> + *   - positive value: number of stat entries filled into the values array.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +                     uint64_t values[], uint32_t n);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset the values of the xstats of the selected component in the device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   Selects specific statistics to be reset. When NULL, all statistics
> + *   will be reset. If non-NULL, must point to array of at least
> + *   *nb_ids* size.
> + * @param nb_ids
> + *   The number of ids available from the *ids* array. Ignored when ids is
> NULL.
> + *
> + * @return
> + *   - zero: successfully reset the statistics to zero.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t
> nb_ids);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger the dmadev self test.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - 0: Selftest successful.
> + *   - -ENOTSUP if the device doesn't support selftest
> + *   - other values < 0 on failure.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_selftest(uint16_t dev_id);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_H_ */
> diff --git a/lib/dmadev/rte_dmadev_core.h
> b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..a3afea2
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types. But they are still part of the
> + * public API because they are used by inline public functions.
> + */
> +
> +struct rte_dmadev;
> +
> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                                     void *src, void *dst,
> +                                     uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a copy operation. */
> +
> +typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                                        const struct dma_scatterlist *sg,
> +                                        uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list copy operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t
> vq_id,
> +                                     uint64_t pattern, void *dst,
> +                                     uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a fill operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                       uint64_t pattern, const struct dma_scatterlist *sg,
> +                       uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list fill operation. */
> +
> +typedef int (*dmadev_fence_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to add a fence ordering between operations.
> */
> +
> +typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t
> vq_id);
> +/**< @internal Function used to trigger hardware to begin performing. */
> +
> +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                                      const uint16_t nb_cpls,
> +                                      dma_cookie_t *cookie, bool *has_error);
> +/**< @internal Function used to return number of successful completed
> operations */
> +
> +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> +                       uint16_t vq_id, const uint16_t nb_status,
> +                       uint32_t *status, dma_cookie_t *cookie);
> +/**< @internal Function used to return number of failed completed
> operations */
> +
> +#define RTE_DMADEV_NAME_MAX_LEN        64 /**< Max length of name
> of DMA PMD */
> +
> +struct rte_dmadev_ops;
> +
> +/**
> + * The data structure associated with each DMA device.
> + */
> +struct rte_dmadev {
> +       /**< Enqueue a copy operation onto the DMA device. */
> +       dmadev_copy_t copy;
> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> +       dmadev_copy_sg_t copy_sg;
> +       /**< Enqueue a fill operation onto the DMA device. */
> +       dmadev_fill_t fill;
> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> +       dmadev_fill_sg_t fill_sg;
> +       /**< Add a fence to force ordering between operations. */
> +       dmadev_fence_t fence;
> +       /**< Trigger hardware to begin performing enqueued operations. */
> +       dmadev_perform_t perform;
> +       /**< Returns the number of operations that successful completed. */
> +       dmadev_completed_t completed;
> +       /**< Returns the number of operations that failed to complete. */
> +       dmadev_completed_fails_t completed_fails;
> +
> +       void *dev_private; /**< PMD-specific private data */
> +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by
> PMD */
> +
> +       uint16_t dev_id; /**< Device ID for this instance */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +       struct rte_device *device;
> +       /**< Device info. supplied during device initialization */
> +       const char *driver_name; /**< Driver info. supplied by probing */
> +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> +
> +       RTE_STD_C11
> +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
> +#endif /* _RTE_DMADEV_CORE_H_ */
> diff --git a/lib/dmadev/rte_dmadev_pmd.h
> b/lib/dmadev/rte_dmadev_pmd.h
> new file mode 100644
> index 0000000..ef03cf7
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_pmd.h
> @@ -0,0 +1,210 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_PMD_H_
> +#define _RTE_DMADEV_PMD_H_
> +
> +/** @file
> + * RTE DMA PMD APIs
> + *
> + * @note
> + * Driver facing APIs for a DMA device. These are not to be called directly by
> + * any application.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <string.h>
> +
> +#include <rte_dev.h>
> +#include <rte_log.h>
> +#include <rte_common.h>
> +
> +#include "rte_dmadev.h"
> +
> +extern int libdmadev_logtype;
> +
> +#define RTE_DMADEV_LOG(level, fmt, args...) \
> +       rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
> +               __func__, ##args)
> +
> +/* Macros to check for valid device */
> +#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
> +       if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +               RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +               return retval; \
> +       } \
> +} while (0)
> +
> +#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
> +       if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +               RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +               return; \
> +       } \
> +} while (0)
> +
> +#define RTE_DMADEV_DETACHED  0
> +#define RTE_DMADEV_ATTACHED  1
> +
> +/**
> + * Validate if the DMA device index is a valid attached DMA device.
> + *
> + * @param dev_id
> + *   DMA device index.
> + *
> + * @return
> + *   - If the device index is valid (1) or not (0).
> + */
> +static inline unsigned
> +rte_dmadev_pmd_is_valid_dev(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       if (dev_id >= RTE_DMADEV_MAX_DEVS)
> +               return 0;
> +
> +       dev = &rte_dmadevices[dev_id];
> +       if (dev->attached != RTE_DMADEV_ATTACHED)
> +               return 0;
> +       else
> +               return 1;
> +}
> +
> +/**
> + * Definitions of control-plane functions exported by a driver through the
> + * generic structure of type *rte_dmadev_ops* supplied in the
> *rte_dmadev*
> + * structure associated with a device.
> + */
> +
> +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +                                struct rte_dmadev_info *dev_info);
> +/**< @internal Function used to get device information of a device. */
> +
> +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> +                                 const struct rte_dmadev_conf *dev_conf);
> +/**< @internal Function used to configure a device. */
> +
> +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to start a configured device. */
> +
> +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to stop a configured device. */
> +
> +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to close a configured device. */
> +
> +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to reset a configured device. */
> +
> +typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
> +                                   const struct rte_dmadev_queue_conf *conf);
> +/**< @internal Function used to allocate and set up a virt queue. */
> +
> +typedef int (*dmadev_queue_release_t)(struct rte_dmadev *dev,
> uint16_t vq_id);
> +/**< @internal Function used to release a virt queue. */
> +
> +typedef int (*dmadev_queue_info_t)(struct rte_dmadev *dev, uint16_t
> vq_id,
> +                                  struct rte_dmadev_queue_info *info);
> +/**< @internal Function used to retrieve information of a virt queue. */
> +
> +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
> +                                 struct rte_dmadev_stats *stats);
> +/**< @internal Function used to retrieve basic statistics. */
> +
> +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
> +/**< @internal Function used to reset basic statistics. */
> +
> +typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev
> *dev,
> +               struct rte_dmadev_xstats_name *xstats_names,
> +               uint32_t size);
> +/**< @internal Function used to get names of extended stats. */
> +
> +typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
> +               const uint32_t ids[], uint64_t values[], uint32_t n);
> +/**< @internal Function used to retrieve extended stats. */
> +
> +typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
> +                                    const uint32_t ids[], uint32_t nb_ids);
> +/**< @internal Function used to reset extended stats. */
> +
> +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> +/**< @internal Function used to start dmadev selftest. */
> +
> +/** DMA device operations function pointer table */
> +struct rte_dmadev_ops {
> +       /**< Get device info. */
> +       dmadev_info_get_t dev_info_get;
> +       /**< Configure device. */
> +       dmadev_configure_t dev_configure;
> +       /**< Start device. */
> +       dmadev_start_t dev_start;
> +       /**< Stop device. */
> +       dmadev_stop_t dev_stop;
> +       /**< Close device. */
> +       dmadev_close_t dev_close;
> +       /**< Reset device. */
> +       dmadev_reset_t dev_reset;
> +
> +       /**< Allocate and set up a virt queue. */
> +       dmadev_queue_setup_t queue_setup;
> +       /**< Release a virt queue. */
> +       dmadev_queue_release_t queue_release;
> +       /**< Retrieve information of a virt queue */
> +       dmadev_queue_info_t queue_info_get;
> +
> +       /**< Get basic statistics. */
> +       dmadev_stats_get_t stats_get;
> +       /**< Reset basic statistics. */
> +       dmadev_stats_reset_t stats_reset;
> +       /**< Get names of extended stats. */
> +       dmadev_xstats_get_names_t xstats_get_names;
> +       /**< Get extended statistics. */
> +       dmadev_xstats_get_t xstats_get;
> +       /**< Reset extended statistics values. */
> +       dmadev_xstats_reset_t xstats_reset;
> +
> +       /**< Device selftest function */
> +       dmadev_selftest_t dev_selftest;
> +};
> +
> +/**
> + * Allocates a new dmadev slot for a DMA device and returns the pointer
> + * to that slot for the driver to use.
> + *
> + * @param name
> + *   Unique identifier name for each device
> + * @param dev_private_size
> + *   Size of private data memory allocated within rte_dmadev object.
> + *   Set to 0 to disable internal memory allocation and allow for
> + *   self-allocation.
> + * @param socket_id
> + *   Socket to allocate resources on.
> + *
> + * @return
> + *   - NULL: Failure to allocate
> + *   - Other: The rte_dmadev structure pointer for the new device
> + */
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_private_size,
> +                       int socket_id);
> +
> +/**
> + * Release the specified dmadev device.
> + *
> + * @param dev
> + *   The *dmadev* pointer is the address of the *rte_dmadev* structure.
> + *
> + * @return
> + *   - 0 on success, negative on error
> + */
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_PMD_H_ */
> diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
> new file mode 100644
> index 0000000..383b3ca
> --- /dev/null
> +++ b/lib/dmadev/version.map
> @@ -0,0 +1,32 @@
> +EXPERIMENTAL {
> +       global:
> +
> +       rte_dmadev_count;
> +       rte_dmadev_get_dev_id;
> +       rte_dmadev_socket_id;
> +       rte_dmadev_info_get;
> +       rte_dmadev_configure;
> +       rte_dmadev_start;
> +       rte_dmadev_stop;
> +       rte_dmadev_close;
> +       rte_dmadev_reset;
> +       rte_dmadev_queue_setup;
> +       rte_dmadev_queue_release;
> +       rte_dmadev_queue_info_get;
> +       rte_dmadev_copy;
> +       rte_dmadev_copy_sg;
> +       rte_dmadev_fill;
> +       rte_dmadev_fill_sg;
> +       rte_dmadev_fence;
> +       rte_dmadev_perform;
> +       rte_dmadev_completed;
> +       rte_dmadev_completed_fails;
> +       rte_dmadev_stats_get;
> +       rte_dmadev_stats_reset;
> +       rte_dmadev_xstats_names_get;
> +       rte_dmadev_xstats_get;
> +       rte_dmadev_xstats_reset;
> +       rte_dmadev_selftest;
> +
> +       local: *;
> +};
> diff --git a/lib/meson.build b/lib/meson.build
> index 1673ca4..68d239f 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -60,6 +60,7 @@ libraries = [
>          'bpf',
>          'graph',
>          'node',
> +        'dmadev',
>  ]
> 
>  if is_windows
> --
> 2.8.1


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-04  9:30 ` Jerin Jacob
@ 2021-07-05 10:52   ` Bruce Richardson
  2021-07-05 11:12     ` Morten Brørup
                       ` (2 more replies)
  2021-07-06  3:01   ` fengchengwen
  1 sibling, 3 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-05 10:52 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> >
> > This patch introduces 'dmadevice' which is a generic type of DMA
> > device.
> >
> > The APIs of dmadev library exposes some generic operations which can
> > enable configuration and I/O with the DMA devices.
> >
> > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> 
> Thanks for v1.
> 
> I would suggest finalizing  lib/dmadev/rte_dmadev.h before doing the
> implementation so that you don't need
> to waste time on reworking the implementation.
> 

I actually like having the .c file available too. Before we lock down the
.h file and the API, I want to verify the performance of our drivers with
the implementation, and having a working .c file is obviously necessary for
that. So I appreciate having it as part of the RFC.

> Comments inline.
> 
> > ---
<snip>
> > + *
> > + * The DMA framework is built on the following abstraction model:
> > + *
> > + *     ------------    ------------
> > + *     |virt-queue|    |virt-queue|
> > + *     ------------    ------------
> > + *            \           /
> > + *             \         /
> > + *              \       /
> > + *            ------------     ------------
> > + *            | HW-queue |     | HW-queue |
> > + *            ------------     ------------
> > + *                   \            /
> > + *                    \          /
> > + *                     \        /
> > + *                     ----------
> > + *                     | dmadev |
> > + *                     ----------
> 
> Continuing the discussion with @Morten Brørup , I think, we need to
> finalize the model.
> 

+1 and the terminology with regards to queues and channels. With our ioat
hardware, each HW queue was called a channel for instance.

> > + *   a) The DMA operation request must be submitted to the virt queue, virt
> > + *      queues must be created based on HW queues, the DMA device could have
> > + *      multiple HW queues.
> > + *   b) The virt queues on the same HW-queue could represent different contexts,
> > + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> > + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> > + *      mem-to-dev transfer scenario.
> > + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> > + *         scenario as long as the corresponding driver supports.
> > + *
> > + * The control plane APIs include configure/queue_setup/queue_release/start/
> > + * stop/reset/close, in order to start device work, the call sequence must be
> > + * as follows:
> > + *     - rte_dmadev_configure()
> > + *     - rte_dmadev_queue_setup()
> > + *     - rte_dmadev_start()
> 
> Please add reconfigure behaviour etc, Please check the
> lib/regexdev/rte_regexdev.h
> introduction. I have added similar ones so you could reuse as much as possible.
> 
> 
> > + * The dataplane APIs include two parts:
> > + *   a) The first part is the submission of operation requests:
> > + *        - rte_dmadev_copy()
> > + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> > + *        - rte_dmadev_fill()
> > + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> > + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> > + *        - rte_dmadev_perform() - issue doorbell to hardware
> > + *      These APIs could work with different virt queues which have different
> > + *      contexts.
> > + *      The first four APIs are used to submit the operation request to the virt
> > + *      queue, if the submission is successful, a cookie (as type
> > + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> > + *   b) The second part is to obtain the result of requests:
> > + *        - rte_dmadev_completed()
> > + *            - return the number of operation requests completed successfully.
> > + *        - rte_dmadev_completed_fails()
> > + *            - return the number of operation requests failed to complete.
> > + *
> > + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> > + * information query and self-test capabilities.
> > + *
> > + * About the dataplane APIs MT-safe, there are two dimensions:
> > + *   a) For one virt queue, the submit/completion API could be MT-safe,
> > + *      e.g. one thread do submit operation, another thread do completion
> > + *      operation.
> > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> > + *      If driver don't support it, it's up to the application to guarantee
> > + *      MT-safe.
> > + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> > + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> > + *      If driver don't support it, it's up to the application to guarantee
> > + *      MT-safe.
> 
> From an application PoV it may not be good to write portable
> applications. Please check
> latest thread with @Morten Brørup
> 
> > + */
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include <rte_common.h>
> > +#include <rte_memory.h>
> > +#include <rte_errno.h>
> > +#include <rte_compat.h>
> 
> Sort in alphabetical order.
> 
> > +
> > +/**
> > + * dma_cookie_t - an opaque DMA cookie
> 
> Since we are defining the behaviour is not opaque any more.
> I think, it is better to call ring_idx or so.
> 

+1 for ring index. We don't need a separate type for it though, just
document the index as an unsigned return value.

> 
> > +#define RTE_DMA_DEV_CAPA_MT_MVQ (1ull << 11) /**< Support MT-safe of multiple virt queues */
> 
> Please add lots of @see for all symbols where it is being used. So that one
> can understand the full scope of
> symbols. See below example.
> 
> #define RTE_REGEXDEV_CAPA_RUNTIME_COMPILATION_F (1ULL << 0)
> /**< RegEx device does support compiling the rules at runtime unlike
>  * loading only the pre-built rule database using
>  * struct rte_regexdev_config::rule_db in rte_regexdev_configure()
>  *
>  * @see struct rte_regexdev_config::rule_db, rte_regexdev_configure()
>  * @see struct rte_regexdev_info::regexdev_capa
>  */
> 
> > + *
> > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > + * code.
> > + * When using cookies, comply with the following rules:
> > + * a) Cookies for each virtual queue are independent.
> > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > + *    the INT_MAX, it wraps back to zero.

I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
value, it means that we cannot use implicit wrap-around inside the CPU and
have to check for the INT_MAX value. Better to:
1. Specify that it wraps at UINT16_MAX which allows us to just use a
uint16_t internally and wrap-around automatically, or:
2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
drivers the flexibility at what value to wrap around.

> > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > + *    reset, the virt queue's cookie needs to be reset to zero.
> > + * Example:
> > + *    step-1: start one dmadev
> > + *    step-2: enqueue a copy operation, the cookie return is 0
> > + *    step-3: enqueue a copy operation again, the cookie return is 1
> > + *    ...
> > + *    step-101: stop the dmadev
> > + *    step-102: start the dmadev
> > + *    step-103: enqueue a copy operation, the cookie return is 0
> > + *    ...
> > + */
> 
> Good explanation.
> 
> > +typedef int32_t dma_cookie_t;
> 

As I mentioned before, I'd just remove this, and use regular int types,
with "ring_idx" as the name.

> 
> > +
> > +/**
> > + * dma_scatterlist - can hold scatter DMA operation request
> > + */
> > +struct dma_scatterlist {
> 
> I prefer to change scatterlist -> sg
> i.e rte_dma_sg
> 
> > +       void *src;
> > +       void *dst;
> > +       uint32_t length;
> > +};
> > +
> 
> > +
> > +/**
> > + * A structure used to retrieve the contextual information of
> > + * an DMA device
> > + */
> > +struct rte_dmadev_info {
> > +       /**
> > +        * Fields filled by framewok
> 
> typo.
> 
> > +        */
> > +       struct rte_device *device; /**< Generic Device information */
> > +       const char *driver_name; /**< Device driver name */
> > +       int socket_id; /**< Socket ID where memory is allocated */
> > +
> > +       /**
> > +        * Specification fields filled by driver
> > +        */
> > +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> > +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> > +       uint16_t max_vqs_per_hw_queue;
> > +       /**< Maximum number of virt queues to allocate per HW queue */
> > +       uint16_t max_desc;
> > +       /**< Maximum allowed number of virt queue descriptors */
> > +       uint16_t min_desc;
> > +       /**< Minimum allowed number of virt queue descriptors */
> 
> Please add max_nb_segs. i.e maximum number of segments supported.
> 
> > +
> > +       /**
> > +        * Status fields filled by driver
> > +        */
> > +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> > +       uint16_t nb_vqs; /**< Number of virt queues configured */
> > +};
> > +
> > +
> > +/**
> > + * dma_address_type
> > + */
> > +enum dma_address_type {
> > +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> > +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> > +};
> > +
> > +/**
> > + * A structure used to configure a DMA device.
> > + */
> > +struct rte_dmadev_conf {
> > +       enum dma_address_type addr_type; /**< Address type to used */
> 
> I think, there are 3 kinds of limitations/capabilities.
> 
> When the system is configured as IOVA as VA
> 1) Device supports any VA address like memory from rte_malloc(),
> rte_memzone(), malloc, stack memory
> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> memory backed by hugepage and added to DMA map.
> 
> When the system is configured as IOVA as PA
> 1) Devices support only PA addresses .
> 
> IMO, Above needs to be  advertised as capability and application needs
> to align with that
> and I dont think application requests the driver to work in any of the modes.
> 
> 

I don't think we need this level of detail for addressing capabilities.
Unless I'm missing something, the hardware should behave exactly as other
hardware does taking in iova's.  If the user wants to check whether virtual
addresses to pinned memory can be used directly, the user can call
"rte_eal_iova_mode". We can't have a situation where some hardware uses one
type of addresses and another hardware the other.

Therefore, the only additional addressing capability we should need to
report is that the hardware can use SVM/SVA and use virtual addresses not
in hugepage memory.

> 
> > +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> > +       uint16_t max_vqs; /**< Maximum number of virt queues to use */
> 
> You need to document what the max value allowed is, etc., i.e. it is based on
> info_get() and mention the field
> in info structure
> 
> 
> > +
> > +/**
> > + * dma_transfer_direction
> > + */
> > +enum dma_transfer_direction {
> 
> rte_dma_transfer_direction
> 
> > +       DMA_MEM_TO_MEM,
> > +       DMA_MEM_TO_DEV,
> > +       DMA_DEV_TO_MEM,
> > +       DMA_DEV_TO_DEV,
> > +};
> > +
> > +/**
> > + * A structure used to configure a DMA virt queue.
> > + */
> > +struct rte_dmadev_queue_conf {
> > +       enum dma_transfer_direction direction;
> 
> 
> > +       /**< Associated transfer direction */
> > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > +       uint64_t dev_flags; /**< Device specific flags */
> 
> Use of this? Need more comments on this.
> Since it is in slowpath, We can have non opaque names here based on
> each driver capability.
> 
> 
> > +       void *dev_ctx; /**< Device specific context */
> 
> Use of this? Need more comments on this.
> 

I think this should be dropped. We should not have any opaque
device-specific info in these structs, rather if a particular device needs
parameters we should call them out. Drivers for which it's not relevant can
ignore them (and report same in capability if necessary). Since this is not
a dataplane API, we aren't concerned too much about perf and can size the
struct appropriately.

> 
> Please add some good amount of reserved bits and have API to init this
> structure for future ABI stability, say rte_dmadev_queue_config_init()
> or so.
> 

I don't think that is necessary. Since the config struct is used only as
parameter to the config function, any changes to it can be managed by
versioning that single function. Padding would only be necessary if we had
an array of these config structs somewhere.

> 
> > +
> > +/**
> > + * A structure used to retrieve information of a DMA virt queue.
> > + */
> > +struct rte_dmadev_queue_info {
> > +       enum dma_transfer_direction direction;
> 
> A queue may support all directions so I think it should be a bitfield.
> 
> > +       /**< Associated transfer direction */
> > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > +       uint64_t dev_flags; /**< Device specific flags */
> > +};
> > +
> 
> > +__rte_experimental
> > +static inline dma_cookie_t
> > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > +                  const struct dma_scatterlist *sg,
> > +                  uint32_t sg_len, uint64_t flags)
> 
> I would like to change this as:
> rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> rte_dma_sg *src, uint32_t nb_src,
> const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> 
> 
> 
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a fill operation onto the DMA virt queue
> > + *
> > + * This queues up a fill operation to be performed by hardware, but does not
> > + * trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + * @param pattern
> > + *   The pattern to populate the destination buffer with.
> > + * @param dst
> > + *   The address of the destination buffer.
> > + * @param length
> > + *   The length of the destination buffer.
> > + * @param flags
> > + *   An opaque flags for this operation.
> 
> PLEASE REMOVE opaque stuff from fastpath it will be a pain for
> application writers as
> they need to write multiple combinations of fastpath. flags are OK, if
> we have a valid
> generic flag now to control the transfer behavior.
> 

+1. Flags need to be explicitly listed. If we don't have any flags for now,
we can specify that the value must be given as zero and it's for future
use.

> 
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Add a fence to force ordering between operations
> > + *
> > + * This adds a fence to a sequence of operations to enforce ordering, such that
> > + * all operations enqueued before the fence must be completed before operations
> > + * after the fence.
> > + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> > + * this API may not function correctly when called immediately after an
> > + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + *
> > + * @return
> > + *   - =0: Successful add fence.
> > + *   - <0: Failure to add fence.
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->fence)(dev, vq_id);
> > +}
> 
> Since HW submission is in a queue(FIFO) the ordering is always
> maintained. Right?
> Could you share more details and use case of fence() from
> driver/application PoV?
> 

There are different kinds of ordering to consider, ordering of completions
and the ordering of operations. While jobs are reported as completed to the
user in order, for performance, hardware may overlap individual jobs within
a burst (or even across bursts). Therefore, we need a fence operation to
inform hardware that one job should not be started until the other has
fully completed.

> 
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Trigger hardware to begin performing enqueued operations
> > + *
> > + * This API is used to write the "doorbell" to the hardware to trigger it
> > + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + *
> > + * @return
> > + *   - =0: Successful trigger hardware.
> > + *   - <0: Failure to trigger hardware.
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->perform)(dev, vq_id);
> > +}
> 
> Since we have additional function call overhead in all the
> applications for this scheme, I would like to understand
> the use of doing this way vs enq does the doorbell implicitly from
> driver/application PoV?
> 

In our benchmarks it's just faster. When we tested it, the overhead of the
function calls was noticeably less than the cost of building up the
parameter array(s) for passing the jobs in as a burst. [We don't see this
cost with things like NIC I/O since DPDK tends to already have the mbuf
fully populated before the TX call anyway.]

> 
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that have been successful completed.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + * @param nb_cpls
> > + *   The maximum number of completed operations that can be processed.
> > + * @param[out] cookie
> > + *   The last completed operation's cookie.
> > + * @param[out] has_error
> > + *   Indicates if there are transfer error.
> > + *
> > + * @return
> > + *   The number of operations that successful completed.
> 
> successfully
> 
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline uint16_t
> > +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> > +                    dma_cookie_t *cookie, bool *has_error)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       has_error = false;
> > +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> 
> It may be better to have cookie/ring_idx as third argument.
> 

No strong opinions here, but having it as in the code above means all
input parameters come before all output, which makes sense to me.

> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that failed to complete.
> > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + * @param nb_status
> > + *   Indicates the size  of status array.
> > + * @param[out] status
> > + *   The error code of operations that failed to complete.
> > + * @param[out] cookie
> > + *   The last failed completed operation's cookie.
> > + *
> > + * @return
> > + *   The number of operations that failed to complete.
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline uint16_t
> > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > +                          const uint16_t nb_status, uint32_t *status,
> > +                          dma_cookie_t *cookie)
> 
> IMO, it is better to move cookie/ring_idx at 3.
> Why it would return any array of errors? since it called after
> rte_dmadev_completed() has
> has_error. Is it better to change
> 
> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> *cookie,  uint32_t *status)
> 
> I also think, we may need to set status as bitmask and enumerate all
> the combination of error codes
> of all the driver and return string from driver existing rte_flow_error
> 
> See
> struct rte_flow_error {
>         enum rte_flow_error_type type; /**< Cause field and error types. */
>         const void *cause; /**< Object responsible for the error. */
>         const char *message; /**< Human-readable error message. */
> };
> 

I think we need a multi-return value API here, as we may add operations in
future which have non-error status values to return. The obvious case is
DMA engines which support "compare" operations. In that case a successful
compare (as in there were no DMA or HW errors) can return "equal" or
"not-equal" as statuses. For general "copy" operations, the faster
completion op can be used to just return successful values (and only call
this status version on error), while apps using those compare ops or a
mixture of copy and compare ops, would always use the slower one that
returns status values for each and every op..

The ioat APIs used 32-bit integer values for this status array so as to
allow e.g. 16-bits for error code and 16-bits for future status values. For
most operations there should be a fairly small set of things that can go
wrong, i.e. bad source address, bad destination address or invalid length.
Within that we may have a couple of specifics for why an address is bad,
but even so I don't think we need to start having multiple bit
combinations.

> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> > +}
> > +
> > +struct rte_dmadev_stats {
> > +       uint64_t enqueue_fail_count;
> > +       /**< Conut of all operations which failed enqueued */
> > +       uint64_t enqueued_count;
> > +       /**< Count of all operations which successful enqueued */
> > +       uint64_t completed_fail_count;
> > +       /**< Count of all operations which failed to complete */
> > +       uint64_t completed_count;
> > +       /**< Count of all operations which successful complete */
> > +};
> 
> We need to have capability API to tell which items are
> updated/supported by the driver.
> 

I also would remove the enqueue fail counts, since they are better counted
by the app. If a driver reports 20,000 failures we have no way of knowing
if that is 20,000 unique operations which failed to enqueue or a single
operation which failed to enqueue 20,000 times but succeeded on attempt
20,001.

> 
> > diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> > new file mode 100644
> > index 0000000..a3afea2
> > --- /dev/null
> > +++ b/lib/dmadev/rte_dmadev_core.h
> > @@ -0,0 +1,98 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright 2021 HiSilicon Limited.
> > + */
> > +
> > +#ifndef _RTE_DMADEV_CORE_H_
> > +#define _RTE_DMADEV_CORE_H_
> > +
> > +/**
> > + * @file
> > + *
> > + * RTE DMA Device internal header.
> > + *
> > + * This header contains internal data types. But they are still part of the
> > + * public API because they are used by inline public functions.
> > + */
> > +
> > +struct rte_dmadev;
> > +
> > +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> > +                                     void *src, void *dst,
> > +                                     uint32_t length, uint64_t flags);
> > +/**< @internal Function used to enqueue a copy operation. */
> 
> To avoid namespace conflict(as it is public API) use rte_
> 
> 
> > +
> > +/**
> > + * The data structure associated with each DMA device.
> > + */
> > +struct rte_dmadev {
> > +       /**< Enqueue a copy operation onto the DMA device. */
> > +       dmadev_copy_t copy;
> > +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> > +       dmadev_copy_sg_t copy_sg;
> > +       /**< Enqueue a fill operation onto the DMA device. */
> > +       dmadev_fill_t fill;
> > +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> > +       dmadev_fill_sg_t fill_sg;
> > +       /**< Add a fence to force ordering between operations. */
> > +       dmadev_fence_t fence;
> > +       /**< Trigger hardware to begin performing enqueued operations. */
> > +       dmadev_perform_t perform;
> > +       /**< Returns the number of operations that successful completed. */
> > +       dmadev_completed_t completed;
> > +       /**< Returns the number of operations that failed to complete. */
> > +       dmadev_completed_fails_t completed_fails;
> 
> We need to limit fastpath items in 1 CL
> 

I don't think that is going to be possible. I also would like to see
numbers to check if we benefit much from having these fastpath ops separate
from the regular ops.

> > +
> > +       void *dev_private; /**< PMD-specific private data */
> > +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> > +
> > +       uint16_t dev_id; /**< Device ID for this instance */
> > +       int socket_id; /**< Socket ID where memory is allocated */
> > +       struct rte_device *device;
> > +       /**< Device info. supplied during device initialization */
> > +       const char *driver_name; /**< Driver info. supplied by probing */
> > +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> > +
> > +       RTE_STD_C11
> > +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> > +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> 
> Add a couple of reserved fields for future ABI stability.
> 
> > +
> > +} __rte_cache_aligned;
> > +
> > +extern struct rte_dmadev rte_dmadevices[];
> > +

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-05 10:52   ` Bruce Richardson
@ 2021-07-05 11:12     ` Morten Brørup
  2021-07-05 13:44       ` Bruce Richardson
  2021-07-05 15:55     ` Jerin Jacob
  2021-07-06  8:20     ` fengchengwen
  2 siblings, 1 reply; 339+ messages in thread
From: Morten Brørup @ 2021-07-05 11:12 UTC (permalink / raw)
  To: Bruce Richardson, Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Nipun Gupta, Hemant Agrawal, Maxime Coquelin,
	Honnappa Nagarahalli, David Marchand, Satananda Burla,
	Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Monday, 5 July 2021 12.53
> 
> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng
> <fengchengwen@huawei.com> wrote:
> > >
> > > +
> > > +/**
> > > + * The data structure associated with each DMA device.
> > > + */
> > > +struct rte_dmadev {
> > > +       /**< Enqueue a copy operation onto the DMA device. */
> > > +       dmadev_copy_t copy;
> > > +       /**< Enqueue a scatter list copy operation onto the DMA
> device. */
> > > +       dmadev_copy_sg_t copy_sg;
> > > +       /**< Enqueue a fill operation onto the DMA device. */
> > > +       dmadev_fill_t fill;
> > > +       /**< Enqueue a scatter list fill operation onto the DMA
> device. */
> > > +       dmadev_fill_sg_t fill_sg;
> > > +       /**< Add a fence to force ordering between operations. */
> > > +       dmadev_fence_t fence;
> > > +       /**< Trigger hardware to begin performing enqueued
> operations. */
> > > +       dmadev_perform_t perform;
> > > +       /**< Returns the number of operations that successful
> completed. */
> > > +       dmadev_completed_t completed;
> > > +       /**< Returns the number of operations that failed to
> complete. */
> > > +       dmadev_completed_fails_t completed_fails;
> >
> > We need to limit fastpath items in 1 CL
> >
> 
> I don't think that is going to be possible. I also would like to see
> numbers to check if we benefit much from having these fastpath ops
> separate
> from the regular ops.

The fastpath ops may not fit into 1 cache line, but it is a good design practice to separate hot data from cold data, and I do consider the fastpath function pointers hot and configuration function pointers cold.

The important point is keeping all the fastpath ops (of a dmadevice) together and spanning as few cache lines as possible.

Configuration ops and other slow data may follow in the same structure; that should make no performance difference. It might make a difference for memory consumption if the other data are very large and not dynamically allocated, as we are discussing regarding ethdev.


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-05 11:12     ` Morten Brørup
@ 2021-07-05 13:44       ` Bruce Richardson
  0 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-05 13:44 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Jerin Jacob, Chengwen Feng, Thomas Monjalon, Ferruh Yigit,
	Jerin Jacob, dpdk-dev, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Mon, Jul 05, 2021 at 01:12:54PM +0200, Morten Brørup wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Monday, 5 July 2021 12.53
> > 
> > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng
> > <fengchengwen@huawei.com> wrote:
> > > >
> > > > +
> > > > +/**
> > > > + * The data structure associated with each DMA device.
> > > > + */
> > > > +struct rte_dmadev {
> > > > +       /**< Enqueue a copy operation onto the DMA device. */
> > > > +       dmadev_copy_t copy;
> > > > +       /**< Enqueue a scatter list copy operation onto the DMA
> > device. */
> > > > +       dmadev_copy_sg_t copy_sg;
> > > > +       /**< Enqueue a fill operation onto the DMA device. */
> > > > +       dmadev_fill_t fill;
> > > > +       /**< Enqueue a scatter list fill operation onto the DMA
> > device. */
> > > > +       dmadev_fill_sg_t fill_sg;
> > > > +       /**< Add a fence to force ordering between operations. */
> > > > +       dmadev_fence_t fence;
> > > > +       /**< Trigger hardware to begin performing enqueued
> > operations. */
> > > > +       dmadev_perform_t perform;
> > > > +       /**< Returns the number of operations that successful
> > completed. */
> > > > +       dmadev_completed_t completed;
> > > > +       /**< Returns the number of operations that failed to
> > complete. */
> > > > +       dmadev_completed_fails_t completed_fails;
> > >
> > > We need to limit fastpath items in 1 CL
> > >
> > 
> > I don't think that is going to be possible. I also would like to see
> > numbers to check if we benefit much from having these fastpath ops
> > separate
> > from the regular ops.
> 
> The fastpath ops may not fit into 1 cache line, but it is a good design practice to separate hot data from cold data, and I do consider the fastpath function pointers hot and configuration function pointers cold.
> 
> The important point is keeping all the fastpath ops (of a dmadevice) together and spanning as few cache lines as possible.
> 
> Configuration ops and other slow data may follow in the same structure; that should make no performance difference. It might make a difference for memory consumption if the other data are very large and not dynamically allocated, as we are discussing regarding ethdev.
> 

Yes, I agree if it can be done, it should be.

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-05 10:52   ` Bruce Richardson
  2021-07-05 11:12     ` Morten Brørup
@ 2021-07-05 15:55     ` Jerin Jacob
  2021-07-05 17:16       ` Bruce Richardson
  2021-07-06  8:20     ` fengchengwen
  2 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-05 15:55 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

 need

On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > >
> > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > device.
> > >
> > > The APIs of dmadev library exposes some generic operations which can
> > > enable configuration and I/O with the DMA devices.
> > >
> > > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> >
> > Thanks for v1.
> >
> > I would suggest finalizing  lib/dmadev/rte_dmadev.h before doing the
> > implementation so that you don't need
> > to waste time on rewoking the implementation.
> >
>
> I actually like having the .c file available too. Before we lock down the
> .h file and the API, I want to verify the performance of our drivers with
> the implementation, and having a working .c file is obviously necessary for
> that. So I appreciate having it as part of the RFC.

Ack.

>
> > Comments inline.
> >
> > > ---
> <snip>
> > > + *
> > > + * The DMA framework is built on the following abstraction model:
> > > + *
> > > + *     ------------    ------------
> > > + *     |virt-queue|    |virt-queue|
> > > + *     ------------    ------------
> > > + *            \           /
> > > + *             \         /
> > > + *              \       /
> > > + *            ------------     ------------
> > > + *            | HW-queue |     | HW-queue |
> > > + *            ------------     ------------
> > > + *                   \            /
> > > + *                    \          /
> > > + *                     \        /
> > > + *                     ----------
> > > + *                     | dmadev |
> > > + *                     ----------
> >
> > Continuing the discussion with @Morten Brørup , I think, we need to
> > finalize the model.
> >
>
> +1 and the terminology with regards to queues and channels. With our ioat
> hardware, each HW queue was called a channel for instance.

Looks like <dmadev> <> <channel> can cover all the use cases, if the
HW has more than
1 queues it can be exposed as separate dmadev dev.


>
> > > + *   a) The DMA operation request must be submitted to the virt queue, virt
> > > + *      queues must be created based on HW queues, the DMA device could have
> > > + *      multiple HW queues.
> > > + *   b) The virt queues on the same HW-queue could represent different contexts,
> > > + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> > > + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> > > + *      mem-to-dev transfer scenario.
> > > + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> > > + *         scenario as long as the corresponding driver supports.
> > > + *
> > > + * The control plane APIs include configure/queue_setup/queue_release/start/
> > > + * stop/reset/close, in order to start device work, the call sequence must be
> > > + * as follows:
> > > + *     - rte_dmadev_configure()
> > > + *     - rte_dmadev_queue_setup()
> > > + *     - rte_dmadev_start()
> >
> > Please add reconfigure behaviour etc, Please check the
> > lib/regexdev/rte_regexdev.h
> > introduction. I have added similar ones so you could reuse as much as possible.
> >
> >
> > > + * The dataplane APIs include two parts:
> > > + *   a) The first part is the submission of operation requests:
> > > + *        - rte_dmadev_copy()
> > > + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> > > + *        - rte_dmadev_fill()
> > > + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> > > + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> > > + *        - rte_dmadev_perform() - issue doorbell to hardware
> > > + *      These APIs could work with different virt queues which have different
> > > + *      contexts.
> > > + *      The first four APIs are used to submit the operation request to the virt
> > > + *      queue, if the submission is successful, a cookie (as type
> > > + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> > > + *   b) The second part is to obtain the result of requests:
> > > + *        - rte_dmadev_completed()
> > > + *            - return the number of operation requests completed successfully.
> > > + *        - rte_dmadev_completed_fails()
> > > + *            - return the number of operation requests failed to complete.
> > > + *
> > > + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> > > + * information query and self-test capabilities.
> > > + *
> > > + * About the dataplane APIs MT-safe, there are two dimensions:
> > > + *   a) For one virt queue, the submit/completion API could be MT-safe,
> > > + *      e.g. one thread do submit operation, another thread do completion
> > > + *      operation.
> > > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> > > + *      If driver don't support it, it's up to the application to guarantee
> > > + *      MT-safe.
> > > + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> > > + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> > > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> > > + *      If driver don't support it, it's up to the application to guarantee
> > > + *      MT-safe.
> >
> > From an application PoV it may not be good to write portable
> > applications. Please check
> > latest thread with @Morten Brørup
> >
> > > + */
> > > +
> > > +#ifdef __cplusplus
> > > +extern "C" {
> > > +#endif
> > > +
> > > +#include <rte_common.h>
> > > +#include <rte_memory.h>
> > > +#include <rte_errno.h>
> > > +#include <rte_compat.h>
> >
> > Sort in alphabetical order.
> >
> > > +
> > > +/**
> > > + * dma_cookie_t - an opaque DMA cookie
> >
> > Since we are defining the behaviour is not opaque any more.
> > I think, it is better to call ring_idx or so.
> >
>
> +1 for ring index. We don't need a separate type for it though, just
> document the index as an unsigned return value.
>
> >
> > > +#define RTE_DMA_DEV_CAPA_MT_MVQ (1ull << 11) /**< Support MT-safe of multiple virt queues */
> >
> > Please lot of @see for all symbols where it is being used. So that one
> > can understand the full scope of
> > symbols. See below example.
> >
> > #define RTE_REGEXDEV_CAPA_RUNTIME_COMPILATION_F (1ULL << 0)
> > /**< RegEx device does support compiling the rules at runtime unlike
> >  * loading only the pre-built rule database using
> >  * struct rte_regexdev_config::rule_db in rte_regexdev_configure()
> >  *
> >  * @see struct rte_regexdev_config::rule_db, rte_regexdev_configure()
> >  * @see struct rte_regexdev_info::regexdev_capa
> >  */
> >
> > > + *
> > > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > > + * code.
> > > + * When using cookies, comply with the following rules:
> > > + * a) Cookies for each virtual queue are independent.
> > > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > > + *    the INT_MAX, it wraps back to zero.
>
> I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> value, it means that we cannot use implicit wrap-around inside the CPU and
> have to check for the INT_MAX value. Better to:
> 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> uint16_t internally and wrap-around automatically, or:
> 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> drivers the flexibility at what value to wrap around.

I think (2) is better than (1). I think it is even better to wrap around at the number of
descriptors configured in dev_configure() (we can make this a power of 2).


>
> > > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > > + *    reset, the virt queue's cookie needs to be reset to zero.
> > > + * Example:
> > > + *    step-1: start one dmadev
> > > + *    step-2: enqueue a copy operation, the cookie return is 0
> > > + *    step-3: enqueue a copy operation again, the cookie return is 1
> > > + *    ...
> > > + *    step-101: stop the dmadev
> > > + *    step-102: start the dmadev
> > > + *    step-103: enqueue a copy operation, the cookie return is 0
> > > + *    ...
> > > + */
> >
> > Good explanation.
> >
> > > +typedef int32_t dma_cookie_t;
> >
>
> As I mentioned before, I'd just remove this, and use regular int types,
> with "ring_idx" as the name.

+1

>
> >
> > > +
> > > +/**
> > > + * dma_scatterlist - can hold scatter DMA operation request
> > > + */
> > > +struct dma_scatterlist {
> >
> > I prefer to change scatterlist -> sg
> > i.e rte_dma_sg
> >
> > > +       void *src;
> > > +       void *dst;
> > > +       uint32_t length;
> > > +};
> > > +
> >
> > > +
> > > +/**
> > > + * A structure used to retrieve the contextual information of
> > > + * an DMA device
> > > + */
> > > +struct rte_dmadev_info {
> > > +       /**
> > > +        * Fields filled by framewok
> >
> > typo.
> >
> > > +        */
> > > +       struct rte_device *device; /**< Generic Device information */
> > > +       const char *driver_name; /**< Device driver name */
> > > +       int socket_id; /**< Socket ID where memory is allocated */
> > > +
> > > +       /**
> > > +        * Specification fields filled by driver
> > > +        */
> > > +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> > > +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> > > +       uint16_t max_vqs_per_hw_queue;
> > > +       /**< Maximum number of virt queues to allocate per HW queue */
> > > +       uint16_t max_desc;
> > > +       /**< Maximum allowed number of virt queue descriptors */
> > > +       uint16_t min_desc;
> > > +       /**< Minimum allowed number of virt queue descriptors */
> >
> > Please add max_nb_segs. i.e maximum number of segments supported.
> >
> > > +
> > > +       /**
> > > +        * Status fields filled by driver
> > > +        */
> > > +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> > > +       uint16_t nb_vqs; /**< Number of virt queues configured */
> > > +};
> > > + i
> > > +
> > > +/**
> > > + * dma_address_type
> > > + */
> > > +enum dma_address_type {
> > > +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> > > +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> > > +};
> > > +
> > > +/**
> > > + * A structure used to configure a DMA device.
> > > + */
> > > +struct rte_dmadev_conf {
> > > +       enum dma_address_type addr_type; /**< Address type to used */
> >
> > I think, there are 3 kinds of limitations/capabilities.
> >
> > When the system is configured as IOVA as VA
> > 1) Device supports any VA address like memory from rte_malloc(),
> > rte_memzone(), malloc, stack memory
> > 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> > memory backed by hugepage and added to DMA map.
> >
> > When the system is configured as IOVA as PA
> > 1) Devices support only PA addresses .
> >
> > IMO, Above needs to be  advertised as capability and application needs
> > to align with that
> > and I dont think application requests the driver to work in any of the modes.
> >
> >
>
> I don't think we need this level of detail for addressing capabilities.
> Unless I'm missing something, the hardware should behave exactly as other
> hardware does taking in iova's.  If the user wants to check whether virtual
> addresses to pinned memory can be used directly, the user can call
> "rte_eal_iova_mode". We can't have a situation where some hardware uses one
> type of addresses and another hardware the other.
>
> Therefore, the only additional addressing capability we should need to
> report is that the hardware can use SVM/SVA and use virtual addresses not
> in hugepage memory.

+1.


>
> >
> > > +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> > > +       uint16_t max_vqs; /**< Maximum number of virt queues to use */
> >
> > You need to what is max value allowed etc i.e it is based on
> > info_get() and mention the field
> > in info structure
> >
> >
> > > +
> > > +/**
> > > + * dma_transfer_direction
> > > + */
> > > +enum dma_transfer_direction {
> >
> > rte_dma_transter_direction
> >
> > > +       DMA_MEM_TO_MEM,
> > > +       DMA_MEM_TO_DEV,
> > > +       DMA_DEV_TO_MEM,
> > > +       DMA_DEV_TO_DEV,
> > > +};
> > > +
> > > +/**
> > > + * A structure used to configure a DMA virt queue.
> > > + */
> > > +struct rte_dmadev_queue_conf {
> > > +       enum dma_transfer_direction direction;
> >
> >
> > > +       /**< Associated transfer direction */
> > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > +       uint64_t dev_flags; /**< Device specific flags */
> >
> > Use of this? Need more comments on this.
> > Since it is in slowpath, We can have non opaque names here based on
> > each driver capability.
> >
> >
> > > +       void *dev_ctx; /**< Device specific context */
> >
> > Use of this ? Need more comment ont this.
> >
>
> I think this should be dropped. We should not have any opaque
> device-specific info in these structs, rather if a particular device needs
> parameters we should call them out. Drivers for which it's not relevant can
> ignore them (and report same in capability if necessary). Since this is not
> a dataplane API, we aren't concerned too much about perf and can size the
> struct appropriately.
>
> >
> > Please add some good amount of reserved bits and have API to init this
> > structure for future ABI stability, say rte_dmadev_queue_config_init()
> > or so.
> >
>
> I don't think that is necessary. Since the config struct is used only as
> parameter to the config function, any changes to it can be managed by
> versioning that single function. Padding would only be necessary if we had
> an array of these config structs somewhere.

OK.

For some reason, the versioning API looks ugly to me in code, whereas keeping
some rsvd fields together with an init function looks cleaner to me.

But I agree, function versioning works in this case. No need to find another API
if it is not general DPDK API practice.

In other libraries, I have seen such an _init function that can be used
for this as well as for filling in default values
(in some cases the implementation's default values are not zero),
so that the application can avoid a memset of the param structure.
I added rte_event_queue_default_conf_get() to the eventdev spec for this.
Added rte_event_queue_default_conf_get() in eventdev spec for this.

No strong opinion on this.



>
> >
> > > +
> > > +/**
> > > + * A structure used to retrieve information of a DMA virt queue.
> > > + */
> > > +struct rte_dmadev_queue_info {
> > > +       enum dma_transfer_direction direction;
> >
> > A queue may support all directions so I think it should be a bitfield.
> >
> > > +       /**< Associated transfer direction */
> > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > +       uint64_t dev_flags; /**< Device specific flags */
> > > +};
> > > +
> >
> > > +__rte_experimental
> > > +static inline dma_cookie_t
> > > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > > +                  const struct dma_scatterlist *sg,
> > > +                  uint32_t sg_len, uint64_t flags)
> >
> > I would like to change this as:
> > rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> > rte_dma_sg *src, uint32_t nb_src,
> > const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> > src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> >
> >
> >
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> > > +}
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Enqueue a fill operation onto the DMA virt queue
> > > + *
> > > + * This queues up a fill operation to be performed by hardware, but does not
> > > + * trigger hardware to begin that operation.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + * @param pattern
> > > + *   The pattern to populate the destination buffer with.
> > > + * @param dst
> > > + *   The address of the destination buffer.
> > > + * @param length
> > > + *   The length of the destination buffer.
> > > + * @param flags
> > > + *   An opaque flags for this operation.
> >
> > PLEASE REMOVE opaque stuff from fastpath it will be a pain for
> > application writers as
> > they need to write multiple combinations of fastpath. flags are OK, if
> > we have a valid
> > generic flag now to control the transfer behavior.
> >
>
> +1. Flags need to be explicitly listed. If we don't have any flags for now,
> we can specify that the value must be given as zero and it's for future
> use.

OK.

>
> >
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Add a fence to force ordering between operations
> > > + *
> > > + * This adds a fence to a sequence of operations to enforce ordering, such that
> > > + * all operations enqueued before the fence must be completed before operations
> > > + * after the fence.
> > > + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> > > + * this API may not function correctly when called immediately after an
> > > + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + *
> > > + * @return
> > > + *   - =0: Successful add fence.
> > > + *   - <0: Failure to add fence.
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline int
> > > +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->fence)(dev, vq_id);
> > > +}
> >
> > Since HW submission is in a queue(FIFO) the ordering is always
> > maintained. Right?
> > Could you share more details and use case of fence() from
> > driver/application PoV?
> >
>
> There are different kinds of ordering to consider, ordering of completions
> and the ordering of operations. While jobs are reported as completed to the
> user in order, for performance hardware, may overlap individual jobs within
> a burst (or even across bursts). Therefore, we need a fence operation to
> inform hardware that one job should not be started until the other has
> fully completed.

Got it. In order to save space in the first CL for fastpath (saving 8B
for the pointer) and to avoid
function call overhead, can we use one bit of the flags of the op functions to
enable the fence?

>
> >
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Trigger hardware to begin performing enqueued operations
> > > + *
> > > + * This API is used to write the "doorbell" to the hardware to trigger it
> > > + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + *
> > > + * @return
> > > + *   - =0: Successful trigger hardware.
> > > + *   - <0: Failure to trigger hardware.
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline int
> > > +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->perform)(dev, vq_id);
> > > +}
> >
> > Since we have additional function call overhead in all the
> > applications for this scheme, I would like to understand
> > the use of doing this way vs enq does the doorbell implicitly from
> > driver/application PoV?
> >
>
> In our benchmarks it's just faster. When we tested it, the overhead of the
> function calls was noticably less than the cost of building up the
> parameter array(s) for passing the jobs in as a burst. [We don't see this
> cost with things like NIC I/O since DPDK tends to already have the mbuf
> fully populated before the TX call anyway.]

OK. I agree with stack population.

My question was more about doing an implicit doorbell update on enqueue. Is a doorbell write
costly in other HW compared to a function call? In our HW, it is just a write of
the number of instructions written in a register.

Also, we need to again access the internal PMD memory structure to find
where to write etc if it is a separate function.


>
> >
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Returns the number of operations that have been successful completed.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + * @param nb_cpls
> > > + *   The maximum number of completed operations that can be processed.
> > > + * @param[out] cookie
> > > + *   The last completed operation's cookie.
> > > + * @param[out] has_error
> > > + *   Indicates if there are transfer error.
> > > + *
> > > + * @return
> > > + *   The number of operations that successful completed.
> >
> > successfully
> >
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline uint16_t
> > > +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> > > +                    dma_cookie_t *cookie, bool *has_error)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       has_error = false;
> > > +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> >
> > It may be better to have cookie/ring_idx as third argument.
> >
>
> No strong opinions here, but having it as in the code above means all
> input parameters come before all output, which makes sense to me.

+1

>
> > > +}
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Returns the number of operations that failed to complete.
> > > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > > + * @param nb_status
> > > + *   Indicates the size  of status array.
> > > + * @param[out] status
> > > + *   The error code of operations that failed to complete.
> > > + * @param[out] cookie
> > > + *   The last failed completed operation's cookie.
> > > + *
> > > + * @return
> > > + *   The number of operations that failed to complete.
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline uint16_t
> > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > > +                          const uint16_t nb_status, uint32_t *status,
> > > +                          dma_cookie_t *cookie)
> >
> > IMO, it is better to move cookie/rind_idx at 3.
> > Why it would return any array of errors? since it called after
> > rte_dmadev_completed() has
> > has_error. Is it better to change
> >
> > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> > *cookie,  uint32_t *status)
> >
> > I also think, we may need to set status as bitmask and enumerate all
> > the combination of error codes
> > of all the driver and return string from driver existing rte_flow_error
> >
> > See
> > struct rte_flow_error {
> >         enum rte_flow_error_type type; /**< Cause field and error types. */
> >         const void *cause; /**< Object responsible for the error. */
> >         const char *message; /**< Human-readable error message. */
> > };
> >
>
> I think we need a multi-return value API here, as we may add operations in
> future which have non-error status values to return. The obvious case is
> DMA engines which support "compare" operations. In that case a successful
> compare (as in there were no DMA or HW errors) can return "equal" or
> "not-equal" as statuses. For general "copy" operations, the faster
> completion op can be used to just return successful values (and only call
> this status version on error), while apps using those compare ops or a
> mixture of copy and compare ops, would always use the slower one that
> returns status values for each and every op..
>
> The ioat APIs used 32-bit integer values for this status array so as to
> allow e.g. 16-bits for error code and 16-bits for future status values. For
> most operations there should be a fairly small set of things that can go
> wrong, i.e. bad source address, bad destination address or invalid length.
> Within that we may have a couple of specifics for why an address is bad,
> but even so I don't think we need to start having multiple bit
> combinations.

OK. What is the purpose of the error status? Is it for the application to print it, or
does the application need to take any action based on specific error requests?

If the former is in scope, then we need to define standard enum values
for the errors, right?
i.e. uint32_t *status needs to change to enum rte_dma_error or so.



>
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> > > +}
> > > +
> > > +struct rte_dmadev_stats {
> > > +       uint64_t enqueue_fail_count;
> > > +       /**< Conut of all operations which failed enqueued */
> > > +       uint64_t enqueued_count;
> > > +       /**< Count of all operations which successful enqueued */
> > > +       uint64_t completed_fail_count;
> > > +       /**< Count of all operations which failed to complete */
> > > +       uint64_t completed_count;
> > > +       /**< Count of all operations which successful complete */
> > > +};
> >
> > We need to have capability API to tell which items are
> > updated/supported by the driver.
> >
>
> I also would remove the enqueue fail counts, since they are better counted
> by the app. If a driver reports 20,000 failures we have no way of knowing
> if that is 20,000 unique operations which failed to enqueue or a single
> operation which failed to enqueue 20,000 times but succeeded on attempt
> 20,001.
>
> >
> > > diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> > > new file mode 100644
> > > index 0000000..a3afea2
> > > --- /dev/null
> > > +++ b/lib/dmadev/rte_dmadev_core.h
> > > @@ -0,0 +1,98 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright 2021 HiSilicon Limited.
> > > + */
> > > +
> > > +#ifndef _RTE_DMADEV_CORE_H_
> > > +#define _RTE_DMADEV_CORE_H_
> > > +
> > > +/**
> > > + * @file
> > > + *
> > > + * RTE DMA Device internal header.
> > > + *
> > > + * This header contains internal data types. But they are still part of the
> > > + * public API because they are used by inline public functions.
> > > + */
> > > +
> > > +struct rte_dmadev;
> > > +
> > > +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> > > +                                     void *src, void *dst,
> > > +                                     uint32_t length, uint64_t flags);
> > > +/**< @internal Function used to enqueue a copy operation. */
> >
> > To avoid namespace conflict(as it is public API) use rte_
> >
> >
> > > +
> > > +/**
> > > + * The data structure associated with each DMA device.
> > > + */
> > > +struct rte_dmadev {
> > > +       /**< Enqueue a copy operation onto the DMA device. */
> > > +       dmadev_copy_t copy;
> > > +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> > > +       dmadev_copy_sg_t copy_sg;
> > > +       /**< Enqueue a fill operation onto the DMA device. */
> > > +       dmadev_fill_t fill;
> > > +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> > > +       dmadev_fill_sg_t fill_sg;
> > > +       /**< Add a fence to force ordering between operations. */
> > > +       dmadev_fence_t fence;
> > > +       /**< Trigger hardware to begin performing enqueued operations. */
> > > +       dmadev_perform_t perform;
> > > +       /**< Returns the number of operations that successful completed. */
> > > +       dmadev_completed_t completed;
> > > +       /**< Returns the number of operations that failed to complete. */
> > > +       dmadev_completed_fails_t completed_fails;
> >
> > We need to limit fastpath items in 1 CL
> >
>
> I don't think that is going to be possible. I also would like to see
> numbers to check if we benefit much from having these fastpath ops separate
> from the regular ops.
>
> > > +
> > > +       void *dev_private; /**< PMD-specific private data */
> > > +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> > > +
> > > +       uint16_t dev_id; /**< Device ID for this instance */
> > > +       int socket_id; /**< Socket ID where memory is allocated */
> > > +       struct rte_device *device;
> > > +       /**< Device info. supplied during device initialization */
> > > +       const char *driver_name; /**< Driver info. supplied by probing */
> > > +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> > > +
> > > +       RTE_STD_C11
> > > +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> > > +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> >
> > Add a couple of reserved fields for future ABI stability.
> >
> > > +
> > > +} __rte_cache_aligned;
> > > +
> > > +extern struct rte_dmadev rte_dmadevices[];
> > > +

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-05 15:55     ` Jerin Jacob
@ 2021-07-05 17:16       ` Bruce Richardson
  2021-07-07  8:08         ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-05 17:16 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> 
> On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > >
> > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > device.
<snip>
> >
> > +1 and the terminology with regards to queues and channels. With our ioat
> > hardware, each HW queue was called a channel for instance.
> 
> Looks like <dmadev> <> <channel> can cover all the use cases, if the
> HW has more than
> 1 queues it can be exposed as separate dmadev dev.
> 

Fine for me.

However, just to confirm that Morten's suggestion of using a
(device-specific void *) channel pointer rather than dev_id + channel_id
pair of parameters won't work for you? You can't store a pointer or dev
index in the channel struct in the driver?

> 
<snip>
> > > > + *
> > > > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > > > + * code.
> > > > + * When using cookies, comply with the following rules:
> > > > + * a) Cookies for each virtual queue are independent.
> > > > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > > > + *    the INT_MAX, it wraps back to zero.
> >
> > I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> > value, it means that we cannot use implicit wrap-around inside the CPU and
> > have to check for the INT_MAX value. Better to:
> > 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> > uint16_t internally and wrap-around automatically, or:
> > 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> > drivers the flexibility at what value to wrap around.
> 
> I think, (2) better than 1. I think, even better to wrap around the number of
> descriptors configured in dev_configure()(We cake make this as the power of 2),
> 

Interesting, I hadn't really considered that before. My only concern
would be if an app wants to keep values in the app ring for a while after
they have been returned from dmadev. I thought it easier to have the full
16-bit counter value returned to the user to give the most flexibility,
given that going from that to any power-of-2 ring size smaller is a trivial
operation.

Overall, while my ideal situation is to always have a 0..UINT16_MAX return
value from the function, I can live with your suggestion of wrapping at
ring_size, since drivers will likely do that internally anyway.
I think wrapping at INT32_MAX is too awkward and will be error prone since
we can't rely on hardware automatically wrapping to zero, nor on the driver
having pre-masked the value.

> >
> > > > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > > > + *    reset, the virt queue's cookie needs to be reset to zero.
<snip>
> > >
> > > Please add some good amount of reserved bits and have API to init this
> > > structure for future ABI stability, say rte_dmadev_queue_config_init()
> > > or so.
> > >
> >
> > I don't think that is necessary. Since the config struct is used only as
> > parameter to the config function, any changes to it can be managed by
> > versioning that single function. Padding would only be necessary if we had
> > an array of these config structs somewhere.
> 
> OK.
> 
> For some reason, the versioning API looks ugly to me in code instead of keeping
> some rsvd fields look cool to me with init function.
> 
> But I agree. function versioning works in this case. No need to find other API
> if tt is not general DPDK API practice.
> 

The one thing I would suggest instead of the padding is for the internal
APIs, to pass the struct size through, since we can't version those - and
for padding we can't know whether any replaced padding should be used or
not. Specifically:

	typedef int (*rte_dmadev_configure_t)(struct rte_dmadev *dev, struct
			rte_dmadev_conf *cfg, size_t cfg_size);

but for the public function:

	int
	rte_dmadev_configure(struct rte_dmadev *dev, struct
			rte_dmadev_conf *cfg)
	{
		...
		ret = dev->ops.configure(dev, cfg, sizeof(*cfg));
		...
	}

Then if we change the structure and version the config API, the driver can
tell from the size what struct version it is and act accordingly. Without
that, each time the struct changed, we'd have to add a new function pointer
to the device ops.

> In other libraries, I have seen such _init or function that can use
> for this as well as filling default value
> in some cases implementation values is not zero).
> So that application can avoid memset for param structure.
> Added rte_event_queue_default_conf_get() in eventdev spec for this.
> 

I think that would largely have the same issues, unless it returned a
pointer to data inside the driver - and which therefore could not be
modified. Alternatively it would mean that the memory would have been
allocated in the driver and we would need to ensure proper cleanup
functions were called to free memory afterwards. Supporting having the
config parameter as a local variable I think makes things a lot easier.

> No strong opinion on this.
> 
> 
> 
> >
> > >
> > > > +
> > > > +/**
> > > > + * A structure used to retrieve information of a DMA virt queue.
> > > > + */
> > > > +struct rte_dmadev_queue_info {
> > > > +       enum dma_transfer_direction direction;
> > >
> > > A queue may support all directions so I think it should be a bitfield.
> > >
> > > > +       /**< Associated transfer direction */
> > > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > > +       uint64_t dev_flags; /**< Device specific flags */
> > > > +};
> > > > +
> > >
> > > > +__rte_experimental
> > > > +static inline dma_cookie_t
> > > > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > > > +                  const struct dma_scatterlist *sg,
> > > > +                  uint32_t sg_len, uint64_t flags)
> > >
> > > I would like to change this as:
> > > rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> > > rte_dma_sg *src, uint32_t nb_src,
> > > const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> > > src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> > >

Out of interest, do you see much benefit (and in what way) from having the
scatter-gather support? Unlike sending 5 buffers in one packet rather than
5 buffers in 5 packets to a NIC, copying an array of memory in one op vs
multiple is functionally identical.

> > >
> > >
<snip>
> Got it. In order to save space if first CL size for fastpath(Saving 8B
> for the pointer) and to avoid
> function overhead, Can we use one bit of flags of op function to
> enable the fence?
> 

The original ioat implementation did exactly that. However, I then
discovered that because a fence logically belongs between two operations,
does the fence flag on an operation mean "don't do any jobs after this
until this job has completed" or does it mean "don't start this job until
all previous jobs have completed". [Or theoretically does it mean both :-)]
Naturally, some hardware does it the former way (i.e. fence flag goes on
last op before fence), while other hardware the latter way (i.e. fence flag
goes on first op after the fence). Therefore, since fencing is about
ordering *between* two (sets of) jobs, I decided that it should do exactly
that and go between two jobs, so there is no ambiguity!

However, I'm happy enough to switch to having a fence flag, but I think if
we do that, it should be put in the "first job after fence" case, because
it is always easier to modify a previously written job if we need to, than
to save the flag for a future one.

Alternatively, if we keep the fence as a separate function, I'm happy
enough for it not to be on the same cacheline as the "hot" operations,
since fencing will always introduce a small penalty anyway.

> >
> > >
<snip>
> > > Since we have additional function call overhead in all the
> > > applications for this scheme, I would like to understand
> > > the use of doing this way vs enq does the doorbell implicitly from
> > > driver/application PoV?
> > >
> >
> > In our benchmarks it's just faster. When we tested it, the overhead of the
> > function calls was noticably less than the cost of building up the
> > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > fully populated before the TX call anyway.]
> 
> OK. I agree with stack population.
> 
> My question was more on doing implicit doorbell update enq. Is doorbell write
> costly in other HW compare to a function call? In our HW, it is just write of
> the number of instructions written in a register.
> 
> Also, we need to again access the internal PMD memory structure to find
> where to write etc if it is a separate function.
> 

The cost varies depending on a number of factors - even writing to a single
HW register can be very slow if that register is mapped as device
(uncacheable) memory, since (AFAIK) it will act as a full fence and wait
for the write to go all the way to hardware. For more modern HW, the cost
can be lighter. However, any cost of HW writes is going to be the same
whether it's a separate function call or not.

However, the main thing about the doorbell update is that it's a
once-per-burst thing, rather than a once-per-job. Therefore, even if you
have to re-read the struct memory (which is likely still somewhere in your
cores' cache), any extra small cost of doing so is to be amortized over the
cost of a whole burst of copies.

> 
> >
> > >
<snip>
> > > > +
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > + *
> > > > + * Returns the number of operations that failed to complete.
> > > > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > > > + *
> > > > + * @param dev_id
> > > > + *   The identifier of the device.
> > > > + * @param vq_id
> > > > + *   The identifier of virt queue.
> > > (> + * @param nb_status
> > > > + *   Indicates the size  of status array.
> > > > + * @param[out] status
> > > > + *   The error code of operations that failed to complete.
> > > > + * @param[out] cookie
> > > > + *   The last failed completed operation's cookie.
> > > > + *
> > > > + * @return
> > > > + *   The number of operations that failed to complete.
> > > > + *
> > > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > > + *       corresponding device supports the operation.
> > > > + */
> > > > +__rte_experimental
> > > > +static inline uint16_t
> > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > > > +                          const uint16_t nb_status, uint32_t *status,
> > > > +                          dma_cookie_t *cookie)
> > >
> > > IMO, it is better to move cookie/rind_idx at 3.
> > > Why it would return any array of errors? since it called after
> > > rte_dmadev_completed() has
> > > has_error. Is it better to change
> > >
> > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> > > *cookie,  uint32_t *status)
> > >
> > > I also think, we may need to set status as bitmask and enumerate all
> > > the combination of error codes
> > > of all the driver and return string from driver existing rte_flow_error
> > >
> > > See
> > > struct rte_flow_error {
> > >         enum rte_flow_error_type type; /**< Cause field and error types. */
> > >         const void *cause; /**< Object responsible for the error. */
> > >         const char *message; /**< Human-readable error message. */
> > > };
> > >
> >
> > I think we need a multi-return value API here, as we may add operations in
> > future which have non-error status values to return. The obvious case is
> > DMA engines which support "compare" operations. In that case a successful
> > compare (as in there were no DMA or HW errors) can return "equal" or
> > "not-equal" as statuses. For general "copy" operations, the faster
> > completion op can be used to just return successful values (and only call
> > this status version on error), while apps using those compare ops or a
> > mixture of copy and compare ops, would always use the slower one that
> > returns status values for each and every op..
> >
> > The ioat APIs used 32-bit integer values for this status array so as to
> > allow e.g. 16-bits for error code and 16-bits for future status values. For
> > most operations there should be a fairly small set of things that can go
> > wrong, i.e. bad source address, bad destination address or invalid length.
> > Within that we may have a couple of specifics for why an address is bad,
> > but even so I don't think we need to start having multiple bit
> > combinations.
> 
> OK. What is the purpose of errors status? Is it for application printing it or
> Does the application need to take any action based on specific error requests?

It's largely for information purposes, but in the case of SVA/SVM errors
could occur due to the memory not being pinned, i.e. a page fault, in some
cases. If that happens, then it's up to the app to either touch the memory and
retry the copy, or to do a SW memcpy as a fallback.

In other error cases, I think it's good to tell the application if it's
passing around bad data, or data that is beyond the scope of hardware, e.g.
a copy that is beyond what can be done in a single transaction for a HW
instance. Given that there are always things that can go wrong, I think we
need some error reporting mechanism.

> If the former is scope, then we need to define the standard enum value
> for the error right?
> ie. uint32_t *status needs to change to enum rte_dma_error or so.
> 
Sure. Perhaps an error/status structure is also an option, where we
explicitly call out error info from status info.

> 
> 
<snip to end>

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-04  9:30 ` Jerin Jacob
  2021-07-05 10:52   ` Bruce Richardson
@ 2021-07-06  3:01   ` fengchengwen
  2021-07-06 10:01     ` Bruce Richardson
  1 sibling, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-06  3:01 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Thomas Monjalon, Ferruh Yigit, Richardson, Bruce, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

Many thanks, mostly OK, and a few comment inline

On 2021/7/4 17:30, Jerin Jacob wrote:
> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>>
>> This patch introduces 'dmadevice' which is a generic type of DMA
>> device.
...
>> +#include <rte_compat.h>
> 
> Sort in alphabetical order.
> 
>> +
>> +/**
>> + * dma_cookie_t - an opaque DMA cookie
> 
> Since we are defining the behaviour is not opaque any more.
> I think, it is better to call ring_idx or so.
> 


This type is designed to have two meanings, return <0 on failure and return >=0 on success.

How about the following definition:
    typedef int dma_ring_index_t;

if >= 0, its value range is [0, 65535] = uint16_t, so the driver implementation will be simple.
if < 0, then it means enqueue failure

The driver could hold a uint16_t ring_index; if the enqueue fails it just returns failure, else it returns
the current ring_index and updates it by: ring_index++;

>> +
>> +/**
>> + * A structure used to retrieve the contextual information of
>> + * an DMA device
>> + */
>> +struct rte_dmadev_info {
>> +       /**
>> +        * Fields filled by framewok
> 
> typo.
> 
>> +        */
>> +       struct rte_device *device; /**< Generic Device information */
>> +       const char *driver_name; /**< Device driver name */
>> +       int socket_id; /**< Socket ID where memory is allocated */
>> +
>> +       /**
>> +        * Specification fields filled by driver
>> +        */
>> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
>> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
>> +       uint16_t max_vqs_per_hw_queue;
>> +       /**< Maximum number of virt queues to allocate per HW queue */
>> +       uint16_t max_desc;
>> +       /**< Maximum allowed number of virt queue descriptors */
>> +       uint16_t min_desc;
>> +       /**< Minimum allowed number of virt queue descriptors */
> 
> Please add max_nb_segs. i.e maximum number of segments supported.

Do you mean something like "burst_size"?

> 
>> +
>> +       /**
>> +        * Status fields filled by driver
>> +        */
>> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
>> +       uint16_t nb_vqs; /**< Number of virt queues configured */
>> +};
>> + i
>> +
>> +/**
>> + * dma_address_type
>> + */
>> +enum dma_address_type {
>> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
>> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
>> +};
>> +
>> +/**
>> + * A structure used to configure a DMA device.
>> + */
>> +struct rte_dmadev_conf {
>> +       enum dma_address_type addr_type; /**< Address type to used */
> 
> I think, there are 3 kinds of limitations/capabilities.
> 
> When the system is configured as IOVA as VA
> 1) Device supports any VA address like memory from rte_malloc(),
> rte_memzone(), malloc, stack memory
> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> memory backed by hugepage and added to DMA map.
> 
> When the system is configured as IOVA as PA
> 1) Devices support only PA addresses .
> 
> IMO, Above needs to be  advertised as capability and application needs
> to align with that
> and I dont think application requests the driver to work in any of the modes.
> 

OK, Let's put together our ideas on address type:

There are three mode, we may define as:
	IOVA_as_VA-ALL     ---for device which may need support SVA feature
                           ---may also be a CPU memcpy 'device'
	IOVA_as_VA         ---for device which need support IOMMU
	IOVA_as_PA

There are many combination of the modes which device supports: eg. some device
may only support IOVA_as_PA, some may only support IOVA_as_VA, and some support
IOVA_as_PA and IOVA_as_VA. The specific runtime type is determined by the vfio
and driver capability (e.g. RTE_PCI_DRV_NEED_IOVA_AS_VA).

So we already define two capabilities for this:
	#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
					---this cover IOVA_as_VA and IOVA_as_PA
	#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
					---this cover IOVA_as_VA-ALL
for a device which don't support SVA:
	only declare RTE_DMA_DEV_CAPA_IOVA
for a device which support SVA:
	declare RTE_DMA_DEV_CAPA_IOVA
	declare RTE_DMA_DEV_CAPA_VA (only when IOMMU enabled and 'SVA flag' was set)
for a CPU memcpy device:
	only declare RTE_DMA_DEV_CAPA_VA

As application:
- if RTE_DMA_DEV_CAPA_VA support, then it could pass any va address to the DMA,
- else if RTE_DMA_DEV_CAPA_IOVA support, then it should pass iova address to the DMA
- else the DMA device should not exist.

> 
>> +__rte_experimental
>> +static inline dma_cookie_t
>> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
>> +                  const struct dma_scatterlist *sg,
>> +                  uint32_t sg_len, uint64_t flags)
> 
> I would like to change this as:
> rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> rte_dma_sg *src, uint32_t nb_src,
> const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> 

There are already too many arguments, and the above use case could be split into 30 sg-items.

>> +__rte_experimental
>> +static inline int
>> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
>> +{
>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
>> +       return (*dev->fence)(dev, vq_id);
>> +}
> 
> Since HW submission is in a queue(FIFO) the ordering is always
> maintained. Right?
> Could you share more details and use case of fence() from
> driver/application PoV?
> 

For Kunpeng DMA, hardware supports parallel execution of requests in the same queue,

It applies to the following scenario: when communication with the remote end is involved, the driver
should ensure the 'doorbell' is issued after the data was fully written.

> 
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * Trigger hardware to begin performing enqueued operations
>> + *
>> + * This API is used to write the "doorbell" to the hardware to trigger it
>> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
>> + *
>> + * @param dev_id
>> + *   The identifier of the device.
>> + * @param vq_id
>> + *   The identifier of virt queue.
>> + *
>> + * @return
>> + *   - =0: Successful trigger hardware.
>> + *   - <0: Failure to trigger hardware.
>> + *
>> + * NOTE: The caller must ensure that the input parameter is valid and the
>> + *       corresponding device supports the operation.
>> + */
>> +__rte_experimental
>> +static inline int
>> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
>> +{
>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
>> +       return (*dev->perform)(dev, vq_id);
>> +}
> 
> Since we have additional function call overhead in all the
> applications for this scheme, I would like to understand
> the use of doing this way vs enq does the doorbell implicitly from
> driver/application PoV?
> 

Because we split the burst operation into multiple substeps: for each enq we
don't issue 'doorbell', and at last call perform() to issue 'doorbell'.

On ARM platforms, mb ops must be called when issuing the 'doorbell'; if mb ops are called
on every enq, it may lead to significant performance degradation.

>> +__rte_experimental
>> +static inline uint16_t
>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
>> +                          const uint16_t nb_status, uint32_t *status,
>> +                          dma_cookie_t *cookie)
> 
> IMO, it is better to move cookie/rind_idx at 3.
> Why it would return any array of errors? since it called after
> rte_dmadev_completed() has
> has_error. Is it better to change
> 
> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> *cookie,  uint32_t *status)
> 
> I also think, we may need to set status as bitmask and enumerate all
> the combination of error codes
> of all the driver and return string from driver existing rte_flow_error
> 

A bitmask is limited to at most 32 bits (or 64 if we extend it), and rte_flow_error is also
heavy.

Considering that errors are a small number of scenarios, so it's OK to
pass status array, and status have 32bit it could denotes a very large number
of errcode.

>> +
>> +struct rte_dmadev_stats {
>> +       uint64_t enqueue_fail_count;
>> +       /**< Conut of all operations which failed enqueued */
>> +       uint64_t enqueued_count;
>> +       /**< Count of all operations which successful enqueued */
>> +       uint64_t completed_fail_count;
>> +       /**< Count of all operations which failed to complete */
>> +       uint64_t completed_count;
>> +       /**< Count of all operations which successful complete */
>> +};
> 
> We need to have capability API to tell which items are
> updated/supported by the driver.
> 

There are few fields, and I don't think it's necessary to add a capability API;
drivers that don't support stats can simply not implement the callback.
For those that do support it, these fields are the minimum set.

> 
>> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
>> new file mode 100644
>> index 0000000..a3afea2
>> --- /dev/null
>> +++ b/lib/dmadev/rte_dmadev_core.h
>> @@ -0,0 +1,98 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright 2021 HiSilicon Limited.
>> + */
>> +
>> +#ifndef _RTE_DMADEV_CORE_H_
>> +#define _RTE_DMADEV_CORE_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * RTE DMA Device internal header.
>> + *
>> + * This header contains internal data types. But they are still part of the
>> + * public API because they are used by inline public functions.
>> + */
>> +
>> +struct rte_dmadev;
>> +
>> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
>> +                                     void *src, void *dst,
>> +                                     uint32_t length, uint64_t flags);
>> +/**< @internal Function used to enqueue a copy operation. */
> 
> To avoid namespace conflict(as it is public API) use rte_

These are internal function used by driver, not application.
and the eth/regexdev_core also defined without rte_

So I think it should remain as it is.

> 
> 
>> +
>> +/**
>> + * The data structure associated with each DMA device.
>> + */
>> +struct rte_dmadev {
>> +       /**< Enqueue a copy operation onto the DMA device. */
>> +       dmadev_copy_t copy;
>> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
>> +       dmadev_copy_sg_t copy_sg;
>> +       /**< Enqueue a fill operation onto the DMA device. */
>> +       dmadev_fill_t fill;
>> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
>> +       dmadev_fill_sg_t fill_sg;
>> +       /**< Add a fence to force ordering between operations. */
>> +       dmadev_fence_t fence;
>> +       /**< Trigger hardware to begin performing enqueued operations. */
>> +       dmadev_perform_t perform;
>> +       /**< Returns the number of operations that successful completed. */
>> +       dmadev_completed_t completed;
>> +       /**< Returns the number of operations that failed to complete. */
>> +       dmadev_completed_fails_t completed_fails;
> 
> We need to limit fastpath items in 1 CL

yes, currently there are 8 callback, which just fill one cache line.




^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-04 14:57 ` Andrew Rybchenko
@ 2021-07-06  3:56   ` fengchengwen
  2021-07-06 10:02     ` Bruce Richardson
  0 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-06  3:56 UTC (permalink / raw)
  To: Andrew Rybchenko, thomas, ferruh.yigit, bruce.richardson, jerinj,
	jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

Many thanks, mostly OK, a few comment inline

On 2021/7/4 22:57, Andrew Rybchenko wrote:
> On 7/2/21 4:18 PM, Chengwen Feng wrote:
>> This patch introduces 'dmadevice' which is a generic type of DMA
>> device.

[snip]

>> +#ifndef _RTE_DMADEV_CORE_H_
>> +#define _RTE_DMADEV_CORE_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * RTE DMA Device internal header.
>> + *
>> + * This header contains internal data types. But they are still part of the
>> + * public API because they are used by inline public functions.
> 
> Do we really want it? Anyway rte_dmadev must not be here.
> Some sub-structure could be, but not entire rte_dmadev.
> 

struct rte_dmadev should expose to public for device probe and etc.
and because the public dataplane function use static inline to embellish,
should put the rte_dmadevices to public file too.

PS: it widely used in eth/regexdev...

>> +
>> +extern struct rte_dmadev rte_dmadevices[];
>> +
>> +#endif /* _RTE_DMADEV_CORE_H_ */
>> diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
> 
> Let's remove rte_ prefix from DPDK internal headers.

as above explained, it's public header file.

>> +
>> +#define RTE_DMADEV_LOG(level, fmt, args...) \
> 
> Do we need RTE_ prefix for internal API?
> 
>> +	rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
>> +		__func__, ##args)
>> +
>> +/* Macros to check for valid device */
>> +#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
>> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
>> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
>> +		return retval; \
>> +	} \
>> +} while (0)
>> +
>> +#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
>> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
>> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
>> +		return; \
>> +	} \
>> +} while (0)
>> +
>> +#define RTE_DMADEV_DETACHED  0
>> +#define RTE_DMADEV_ATTACHED  1
> 
> Do we really need RTE_ prefix for interlal defines?

with RTE_ prefix will reduce namespace conflicts.

it's same as it lib/eth or regexdev...

>> +typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
>> +				     const uint32_t ids[], uint32_t nb_ids);
>> +/**< @internal Function used to reset extended stats. */
> 
> Do we really need both stats and xstats from the very
> beginning? I think it is better to start from just
> generic stats and add xstats when it is really required.

OK, but I think we should add one dump ops, which could be useful to
find the problem.

> .
> 


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-04 15:21 ` Matan Azrad
@ 2021-07-06  6:25   ` fengchengwen
  2021-07-06  6:50     ` Matan Azrad
  0 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-06  6:25 UTC (permalink / raw)
  To: Matan Azrad, NBU-Contact-Thomas Monjalon, ferruh.yigit,
	bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

On 2021/7/4 23:21, Matan Azrad wrote:
> 
> 
> From: Chengwen Feng
>> This patch introduces 'dmadevice' which is a generic type of DMA
>> device.
>>
>> The APIs of dmadev library exposes some generic operations which can
>> enable configuration and I/O with the DMA devices.
>>
> Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
> 

em, I just looked at the code.

The RTE_COMP_ALGO_NULL is a small feature of the compression device.
and currently only mlx5 and isal support it.

Also the compressdev dataplane API relatively complicated to do just
DMA copy.

So I think we need a separate driver framework for the DMA device.

thanks

[snip]

> 
> 
> .
> 


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-06  6:25   ` fengchengwen
@ 2021-07-06  6:50     ` Matan Azrad
  2021-07-06  9:08       ` fengchengwen
  0 siblings, 1 reply; 339+ messages in thread
From: Matan Azrad @ 2021-07-06  6:50 UTC (permalink / raw)
  To: fengchengwen, NBU-Contact-Thomas Monjalon, ferruh.yigit,
	bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

Hi

From: fengchengwen
> On 2021/7/4 23:21, Matan Azrad wrote:
> >
> >
> > From: Chengwen Feng
> >> This patch introduces 'dmadevice' which is a generic type of DMA
> >> device.
> >>
> >> The APIs of dmadev library exposes some generic operations which can
> >> enable configuration and I/O with the DMA devices.
> >>
> > Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
> >
> 
> em, I just looked at the code.
> 
> The RTE_COMP_ALGO_NULL is a small feature of the compression device.
> and currently only mlx5 and isal support it.

Yes, but what does that mean?
If more drivers support DMA operations they can add the support there, no?


> Also the compressdev dataplane API relatively complicated to do just DMA
> copy.

You snipped more comments I wrote below 😊
Maybe it is related....

> So I think we need a separate driver framework for the DMA device.

Need to consider deprecation in compressdev if so....

> thanks
> 
> [snip]
> 
> >
> >
> > .
> >


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-05 10:52   ` Bruce Richardson
  2021-07-05 11:12     ` Morten Brørup
  2021-07-05 15:55     ` Jerin Jacob
@ 2021-07-06  8:20     ` fengchengwen
  2021-07-06  9:27       ` Bruce Richardson
  2 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-06  8:20 UTC (permalink / raw)
  To: Bruce Richardson, Jerin Jacob
  Cc: Thomas Monjalon, Ferruh Yigit, Jerin Jacob, dpdk-dev,
	Morten Brørup, Nipun Gupta, Hemant Agrawal, Maxime Coquelin,
	Honnappa Nagarahalli, David Marchand, Satananda Burla,
	Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On 2021/7/5 18:52, Bruce Richardson wrote:
> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
>> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:

[snip]

>>> + *
>>> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
>>> + * code.
>>> + * When using cookies, comply with the following rules:
>>> + * a) Cookies for each virtual queue are independent.
>>> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
>>> + *    the INT_MAX, it wraps back to zero.
> 
> I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> value, it means that we cannot use implicit wrap-around inside the CPU and
> have to check for the INT_MAX value. Better to:
> 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> uint16_t internally and wrap-around automatically, or:
> 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> drivers the flexibility at what value to wrap around.

+1 for option 1
BTW: option 2 seems a little complicated for both the driver and the application.

>> When the system is configured as IOVA as VA
>> 1) Device supports any VA address like memory from rte_malloc(),
>> rte_memzone(), malloc, stack memory
>> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
>> memory backed by hugepage and added to DMA map.
>>
>> When the system is configured as IOVA as PA
>> 1) Devices support only PA addresses .
>>
>> IMO, Above needs to be  advertised as capability and application needs
>> to align with that
>> and I dont think application requests the driver to work in any of the modes.
>>
>>
> 
> I don't think we need this level of detail for addressing capabilities.
> Unless I'm missing something, the hardware should behave exactly as other
> hardware does taking in iova's.  If the user wants to check whether virtual
> addresses to pinned memory can be used directly, the user can call
> "rte_eal_iova_mode". We can't have a situation where some hardware uses one
> type of addresses and another hardware the other.
> 
> Therefore, the only additional addressing capability we should need to
> report is that the hardware can use SVM/SVA and use virtual addresses not
> in hugepage memory.
> 

I discussed the addressing capability in a previous thread.
Indeed, we can reduce it to just one capability.

>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change without prior notice.
>>> + *
>>> + * Enqueue a fill operation onto the DMA virt queue
>>> + *
>>> + * This queues up a fill operation to be performed by hardware, but does not
>>> + * trigger hardware to begin that operation.
>>> + *
>>> + * @param dev_id
>>> + *   The identifier of the device.
>>> + * @param vq_id
>>> + *   The identifier of virt queue.
>>> + * @param pattern
>>> + *   The pattern to populate the destination buffer with.
>>> + * @param dst
>>> + *   The address of the destination buffer.
>>> + * @param length
>>> + *   The length of the destination buffer.
>>> + * @param flags
>>> + *   An opaque flags for this operation.
>>
>> PLEASE REMOVE opaque stuff from fastpath it will be a pain for
>> application writers as
>> they need to write multiple combinations of fastpath. flags are OK, if
>> we have a valid
>> generic flag now to control the transfer behavior.
>>
> 
> +1. Flags need to be explicitly listed. If we don't have any flags for now,
> we can specify that the value must be given as zero and it's for future
> use.
> 

+1, I will delete the flags parameters.

Currently the fence is implemented as a separate op. If we need more flags later,
we would have to create a new op for each of them, which is not a good way to extend the API.

So I think we need change fence ops to extra_flags ops:
	rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
to
	rte_dmadev_extra_flags(uint16_t dev_id, uint16_t vq_id, uint64_t flags);

So we could add fence by: rte_dmadev_extra_flags(dev_id, vq_id, RTE_DMA_FLAG_FENCE);
	
>>> +/**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change without prior notice.
>>> + *
>>> + * Returns the number of operations that failed to complete.
>>> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
>>> + *
>>> + * @param dev_id
>>> + *   The identifier of the device.
>>> + * @param vq_id
>>> + *   The identifier of virt queue.
>> (> + * @param nb_status
>>> + *   Indicates the size  of status array.
>>> + * @param[out] status
>>> + *   The error code of operations that failed to complete.
>>> + * @param[out] cookie
>>> + *   The last failed completed operation's cookie.
>>> + *
>>> + * @return
>>> + *   The number of operations that failed to complete.
>>> + *
>>> + * NOTE: The caller must ensure that the input parameter is valid and the
>>> + *       corresponding device supports the operation.
>>> + */
>>> +__rte_experimental
>>> +static inline uint16_t
>>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
>>> +                          const uint16_t nb_status, uint32_t *status,
>>> +                          dma_cookie_t *cookie)
>>
>> IMO, it is better to move cookie/rind_idx at 3.
>> Why it would return any array of errors? since it called after
>> rte_dmadev_completed() has
>> has_error. Is it better to change
>>
>> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
>> *cookie,  uint32_t *status)
>>
>> I also think, we may need to set status as bitmask and enumerate all
>> the combination of error codes
>> of all the driver and return string from driver existing rte_flow_error
>>
>> See
>> struct rte_flow_error {
>>         enum rte_flow_error_type type; /**< Cause field and error types. */
>>         const void *cause; /**< Object responsible for the error. */
>>         const char *message; /**< Human-readable error message. */
>> };
>>
> 
> I think we need a multi-return value API here, as we may add operations in
> future which have non-error status values to return. The obvious case is
> DMA engines which support "compare" operations. In that case a successful

Just curious, what is the application scenario for the 'compare' operation?

> compare (as in there were no DMA or HW errors) can return "equal" or
> "not-equal" as statuses. For general "copy" operations, the faster
> completion op can be used to just return successful values (and only call
> this status version on error), while apps using those compare ops or a
> mixture of copy and compare ops, would always use the slower one that
> returns status values for each and every op..

In the current design, rte_dmadev_completed_fails applies only to failure
scenarios. Do you mean that for 'compare' operations the status is always
non-zero, whether or not the two buffers match?

> 
> The ioat APIs used 32-bit integer values for this status array so as to
> allow e.g. 16-bits for error code and 16-bits for future status values. For
> most operations there should be a fairly small set of things that can go
> wrong, i.e. bad source address, bad destination address or invalid length.
> Within that we may have a couple of specifics for why an address is bad,
> but even so I don't think we need to start having multiple bit
> combinations.
> 
>>> +{
>>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
>>> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
>>> +}
>>> +
>>> +struct rte_dmadev_stats {
>>> +       uint64_t enqueue_fail_count;
>>> +       /**< Conut of all operations which failed enqueued */
>>> +       uint64_t enqueued_count;
>>> +       /**< Count of all operations which successful enqueued */
>>> +       uint64_t completed_fail_count;
>>> +       /**< Count of all operations which failed to complete */
>>> +       uint64_t completed_count;
>>> +       /**< Count of all operations which successful complete */
>>> +};
>>
>> We need to have capability API to tell which items are
>> updated/supported by the driver.
>>
> 
> I also would remove the enqueue fail counts, since they are better counted
> by the app. If a driver reports 20,000 failures we have no way of knowing
> if that is 20,000 unique operations which failed to enqueue or a single
> operation which failed to enqueue 20,000 times but succeeded on attempt
> 20,001.
> 

This case does exist: the application may just emit a DEBUG trace rather than record the failure.
So I would recommend keeping the counter, so that we can at least tell whether it happened after a long run.

> 


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-06  6:50     ` Matan Azrad
@ 2021-07-06  9:08       ` fengchengwen
  2021-07-06  9:17         ` Matan Azrad
  0 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-06  9:08 UTC (permalink / raw)
  To: Matan Azrad, NBU-Contact-Thomas Monjalon, ferruh.yigit,
	bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

On 2021/7/6 14:50, Matan Azrad wrote:
> Hi
> 
> From: fengchengwen
>> On 2021/7/4 23:21, Matan Azrad wrote:
>>>
>>>
>>> From: Chengwen Feng
>>>> This patch introduces 'dmadevice' which is a generic type of DMA
>>>> device.
>>>>
>>>> The APIs of dmadev library exposes some generic operations which can
>>>> enable configuration and I/O with the DMA devices.
>>>>
>>> Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
>>>
>>
>> em, I just looked at the code.
>>
>> The RTE_COMP_ALGO_NULL is a small feature of the compression device.
>> and currently only mlx5 and isal support it.
> 
> Yes, but what that is mean?
> If more drivers support DMA operations they can add the support there, no?
> 

Do you mean extending compressdev directly?
I think it would be hard to extend, and doing so may break the compressdev concept.

> 
>> Also the compressdev dataplane API relatively complicated to do just DMA
>> copy.
> 
> You snipped more comments I wrote below 😊
> Maybe it is related....

Sorry, I just skipped.

'Did you consider also mbuf API usage for memory descriptor?'
---One DMA use case is vhost-net, where either the src or the dst could be an mbuf, but
the peer is not an mbuf, so here we use raw fields.

> 
>> So I think we need a separate driver framework for the DMA device.
> 
> Need to consider deprecation in compressdev if so....
> 
>> thanks
>>
>> [snip]
>>
>>>
>>>
>>> .
>>>
> 


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-06  9:08       ` fengchengwen
@ 2021-07-06  9:17         ` Matan Azrad
  0 siblings, 0 replies; 339+ messages in thread
From: Matan Azrad @ 2021-07-06  9:17 UTC (permalink / raw)
  To: fengchengwen, NBU-Contact-Thomas Monjalon, ferruh.yigit,
	bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma



From: fengchengwen
> On 2021/7/6 14:50, Matan Azrad wrote:
> > Hi
> >
> > From: fengchengwen
> >> On 2021/7/4 23:21, Matan Azrad wrote:
> >>>
> >>>
> >>> From: Chengwen Feng
> >>>> This patch introduces 'dmadevice' which is a generic type of DMA
> >>>> device.
> >>>>
> >>>> The APIs of dmadev library exposes some generic operations which
> >>>> can enable configuration and I/O with the DMA devices.
> >>>>
> >>> Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
> >>>
> >>
> >> em, I just looked at the code.
> >>
> >> The RTE_COMP_ALGO_NULL is a small feature of the compression device.
> >> and currently only mlx5 and isal support it.
> >
> > Yes, but what that is mean?
> > If more drivers support DMA operations they can add the support there,
> no?
> >
> 
> You mean to expand directly on compressdev ?
> I think it hard to expand, and may break the compressdev concept.

Maybe. What do you need to extend?
Your extension may also be relevant to compression; in the end, both are mem-to-mem offloads.
 

> >
> >> Also the compressdev dataplane API relatively complicated to do just
> >> DMA copy.
> >
> > You snipped more comments I wrote below 😊
> > Maybe it is related....
> 
> Sorry, I just skipped.
> 
> 'Did you consider also mbuf API usage for memory descriptor?'
> ---One scenario of the DMA is vhost-net, which src or dst could be mbuf, but
> the peer were not mbuf. so here we use raw fields.

Did you consider using external/attached mbufs for this case?

Did you also consider raw API in cryptodev library?
 
> >
> >> So I think we need a separate driver framework for the DMA device.
> >
> > Need to consider deprecation in compressdev if so....
> >
> >> thanks
> >>
> >> [snip]
> >>
> >>>
> >>>
> >>> .
> >>>
> >


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-06  8:20     ` fengchengwen
@ 2021-07-06  9:27       ` Bruce Richardson
  0 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06  9:27 UTC (permalink / raw)
  To: fengchengwen
  Cc: Jerin Jacob, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Tue, Jul 06, 2021 at 04:20:38PM +0800, fengchengwen wrote:
> On 2021/7/5 18:52, Bruce Richardson wrote:
> > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> >> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> 
> [snip]
> 
> >>> + *
> >>> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> >>> + * code.
> >>> + * When using cookies, comply with the following rules:
> >>> + * a) Cookies for each virtual queue are independent.
> >>> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> >>> + *    the INT_MAX, it wraps back to zero.
> > 
> > I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> > value, it means that we cannot use implicit wrap-around inside the CPU and
> > have to check for the INT_MAX value. Better to:
> > 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> > uint16_t internally and wrap-around automatically, or:
> > 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> > drivers the flexibility at what value to wrap around.
> 
> +1 for option 1
> BTW: option 2 seem a little complicated for driver and application.
> 

I would tend to agree. I just included it in case there was a case where
you explicitly wanted more than UINT16_MAX values in your driver.

> >> When the system is configured as IOVA as VA
> >> 1) Device supports any VA address like memory from rte_malloc(),
> >> rte_memzone(), malloc, stack memory
> >> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> >> memory backed by hugepage and added to DMA map.
> >>
> >> When the system is configured as IOVA as PA
> >> 1) Devices support only PA addresses .
> >>
> >> IMO, Above needs to be  advertised as capability and application needs
> >> to align with that
> >> and I dont think application requests the driver to work in any of the modes.
> >>
> >>
> > 
> > I don't think we need this level of detail for addressing capabilities.
> > Unless I'm missing something, the hardware should behave exactly as other
> > hardware does taking in iova's.  If the user wants to check whether virtual
> > addresses to pinned memory can be used directly, the user can call
> > "rte_eal_iova_mode". We can't have a situation where some hardware uses one
> > type of addresses and another hardware the other.
> > 
> > Therefore, the only additional addressing capability we should need to
> > report is that the hardware can use SVM/SVA and use virtual addresses not
> > in hugepage memory.
> > 
> 
> I discuss the addressing capability in previous thread.
> Indeed, we can reduce it to just one capability.
> 
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change without prior notice.
> >>> + *
> >>> + * Enqueue a fill operation onto the DMA virt queue
> >>> + *
> >>> + * This queues up a fill operation to be performed by hardware, but does not
> >>> + * trigger hardware to begin that operation.
> >>> + *
> >>> + * @param dev_id
> >>> + *   The identifier of the device.
> >>> + * @param vq_id
> >>> + *   The identifier of virt queue.
> >>> + * @param pattern
> >>> + *   The pattern to populate the destination buffer with.
> >>> + * @param dst
> >>> + *   The address of the destination buffer.
> >>> + * @param length
> >>> + *   The length of the destination buffer.
> >>> + * @param flags
> >>> + *   An opaque flags for this operation.
> >>
> >> PLEASE REMOVE opaque stuff from fastpath it will be a pain for
> >> application writers as
> >> they need to write multiple combinations of fastpath. flags are OK, if
> >> we have a valid
> >> generic flag now to control the transfer behavior.
> >>
> > 
> > +1. Flags need to be explicitly listed. If we don't have any flags for now,
> > we can specify that the value must be given as zero and it's for future
> > use.
> > 
> 
> +1, I will delete the flags parameters.
> 
> Currently we have fence which was implemented by ops, if later need more flags,
> maybe we need create one new ops, this is not the way to expand.
> 
> So I think we need change fence ops to extra_flags ops:
> 	rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> to
> 	rte_dmadev_extra_flags(uint16_t dev_id, uint16_t vq_id, uint64_t flags);
> 
> So we could add fence by: rte_dmadev_extra_flags(dev_id, vq_id, RTE_DMA_FLAG_FENCE);
> 

I don't think this is the way to go. I think we will need the flags
parameter per op in the future, so we should keep it, even if it is always
zero for now. It gives us future expandability options.

> >>> +/**
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change without prior notice.
> >>> + *
> >>> + * Returns the number of operations that failed to complete.
> >>> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> >>> + *
> >>> + * @param dev_id
> >>> + *   The identifier of the device.
> >>> + * @param vq_id
> >>> + *   The identifier of virt queue.
> >> (> + * @param nb_status
> >>> + *   Indicates the size  of status array.
> >>> + * @param[out] status
> >>> + *   The error code of operations that failed to complete.
> >>> + * @param[out] cookie
> >>> + *   The last failed completed operation's cookie.
> >>> + *
> >>> + * @return
> >>> + *   The number of operations that failed to complete.
> >>> + *
> >>> + * NOTE: The caller must ensure that the input parameter is valid and the
> >>> + *       corresponding device supports the operation.
> >>> + */
> >>> +__rte_experimental
> >>> +static inline uint16_t
> >>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> >>> +                          const uint16_t nb_status, uint32_t *status,
> >>> +                          dma_cookie_t *cookie)
> >>
> >> IMO, it is better to move cookie/rind_idx at 3.
> >> Why it would return any array of errors? since it called after
> >> rte_dmadev_completed() has
> >> has_error. Is it better to change
> >>
> >> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> >> *cookie,  uint32_t *status)
> >>
> >> I also think, we may need to set status as bitmask and enumerate all
> >> the combination of error codes
> >> of all the driver and return string from driver existing rte_flow_error
> >>
> >> See
> >> struct rte_flow_error {
> >>         enum rte_flow_error_type type; /**< Cause field and error types. */
> >>         const void *cause; /**< Object responsible for the error. */
> >>         const char *message; /**< Human-readable error message. */
> >> };
> >>
> > 
> > I think we need a multi-return value API here, as we may add operations in
> > future which have non-error status values to return. The obvious case is
> > DMA engines which support "compare" operations. In that case a successful
> 
> Just curious, what the 'compare' operations's application scenario ?
> 

We are not looking to use this capability just now - but it's a capability
in our hardware that offers some interesting possibilities, so I'd like to
ensure it's possible to integrate in future. To do so, we just need to
ensure that the function which returns the "error" status - or status
generally - can be used to return bursts of statuses, even if it's slower
compared to the regular completion return which just assumes all succeed.

> > compare (as in there were no DMA or HW errors) can return "equal" or
> > "not-equal" as statuses. For general "copy" operations, the faster
> > completion op can be used to just return successful values (and only call
> > this status version on error), while apps using those compare ops or a
> > mixture of copy and compare ops, would always use the slower one that
> > returns status values for each and every op..
> 
> In the current design, rte_dmadev_completed_fails applies only to failure
> scenarios. Do you mean in 'compare' operations, the status always non-zero
> whether or not the two are consistent ?
> 

Yes and no. There are two separate "status" values to be returned for such
operations - the actual HW status i.e. all parameters valid, and the actual
memcmp result of equal/non-equal. In our completion records these are
called "status" and "result" respectively. "Result" is only valid if
"status" is successful, and is not relevant for copy or fill or similar
ops. Therefore, to support this, we just need some bits in "status"
reserved for that result case, and the completed_status op to return an
array of status values, not just a single one.

> > 
> > The ioat APIs used 32-bit integer values for this status array so as to
> > allow e.g. 16-bits for error code and 16-bits for future status values. For
> > most operations there should be a fairly small set of things that can go
> > wrong, i.e. bad source address, bad destination address or invalid length.
> > Within that we may have a couple of specifics for why an address is bad,
> > but even so I don't think we need to start having multiple bit
> > combinations.
> > 
> >>> +{
> >>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> >>> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> >>> +}
> >>> +
> >>> +struct rte_dmadev_stats {
> >>> +       uint64_t enqueue_fail_count;
> >>> +       /**< Conut of all operations which failed enqueued */
> >>> +       uint64_t enqueued_count;
> >>> +       /**< Count of all operations which successful enqueued */
> >>> +       uint64_t completed_fail_count;
> >>> +       /**< Count of all operations which failed to complete */
> >>> +       uint64_t completed_count;
> >>> +       /**< Count of all operations which successful complete */
> >>> +};
> >>
> >> We need to have capability API to tell which items are
> >> updated/supported by the driver.
> >>
> > 
> > I also would remove the enqueue fail counts, since they are better counted
> > by the app. If a driver reports 20,000 failures we have no way of knowing
> > if that is 20,000 unique operations which failed to enqueue or a single
> > operation which failed to enqueue 20,000 times but succeeded on attempt
> > 20,001.
> > 
> 
> This does exist, The application may just show a DEBUG trace other than recording.
> So I would recommend keeping at least know if it happens after a long run.
> 
I disagree here - the enqueue failure should only be tracked by the app,
because:
1. only app knows whether a particular enqueue failure is retry or not and
   how it should be counted
2. these failures cannot be counted by hardware and must be counted by
   software, so adding additional operations to our enqueue path. In the
   retry case, that could be a lot of load-update-stores that will have to
   be done in the driver, while if tracked in the app, the count would
   just be a register increment.

Operation failures can be tracked in driver stats, though, as that is
related to hardware operation.

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-06  3:01   ` fengchengwen
@ 2021-07-06 10:01     ` Bruce Richardson
  0 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 10:01 UTC (permalink / raw)
  To: fengchengwen
  Cc: Jerin Jacob, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Tue, Jul 06, 2021 at 11:01:17AM +0800, fengchengwen wrote:
> Many thanks, mostly OK, and a few comment inline
> 
> On 2021/7/4 17:30, Jerin Jacob wrote:
> > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> >>
> >> This patch introduces 'dmadevice' which is a generic type of DMA
> >> device.
> ...
> >> +#include <rte_compat.h>
> > 
> > Sort in alphabetical order.
> > 
> >> +
> >> +/**
> >> + * dma_cookie_t - an opaque DMA cookie
> > 
> > Since we are defining the behaviour is not opaque any more.
> > I think, it is better to call ring_idx or so.
> > 
> 
> 
> This type is designed to have two meanings, return <0 on failure and return >=0 on success.
> 
> How about follwing definition:
>     typedef int dma_ring_index_t;
> 
> if >= 0, it's value range is [0, 65535] = uint16_t, so driver implementation will simply.
> if <0, then men enqueue failure
> 
> For driver, it could hold uint16_t ring_index, if enquer fail just return fail, else return
> the current ring_index, and update it by: ring_index++;
> 

Well, yes and no on the "two meanings". For the enqueue function, yes the
return value can have two meanings, but I don't consider them one type. On
the completion call, however, this can only be positive values <
UINT16_MAX, so having two meanings is actually confusing. Better to have

* enqueue return regular int, with doxygen comment 
	"@return 
	  Negative on error, otherwise job index between 0 and UINT16_MAX"
* for completions, take a uint16_t* parameter for the last completed index
  since no negative values are needed.

Beyond this, we generally don't use typedefs in DPDK for basic types (with
a few exceptions e.g. rte_iova_t), and save their use only for function
pointers.

> >> +
> >> +/**
> >> + * A structure used to retrieve the contextual information of
> >> + * an DMA device
> >> + */
> >> +struct rte_dmadev_info {
> >> +       /**
> >> +        * Fields filled by framewok
> > 
> > typo.
> > 
> >> +        */
> >> +       struct rte_device *device; /**< Generic Device information */
> >> +       const char *driver_name; /**< Device driver name */
> >> +       int socket_id; /**< Socket ID where memory is allocated */
> >> +
> >> +       /**
> >> +        * Specification fields filled by driver
> >> +        */
> >> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> >> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> >> +       uint16_t max_vqs_per_hw_queue;
> >> +       /**< Maximum number of virt queues to allocate per HW queue */
> >> +       uint16_t max_desc;
> >> +       /**< Maximum allowed number of virt queue descriptors */
> >> +       uint16_t min_desc;
> >> +       /**< Minimum allowed number of virt queue descriptors */
> > 
> > Please add max_nb_segs. i.e maximum number of segments supported.
> 
> Do you means something like "burst_size" ?
> 
> > 
> >> +
> >> +       /**
> >> +        * Status fields filled by driver
> >> +        */
> >> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> >> +       uint16_t nb_vqs; /**< Number of virt queues configured */
> >> +};
> >> + i
> >> +
> >> +/**
> >> + * dma_address_type
> >> + */
> >> +enum dma_address_type {
> >> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> >> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> >> +};
> >> +
> >> +/**
> >> + * A structure used to configure a DMA device.
> >> + */
> >> +struct rte_dmadev_conf {
> >> +       enum dma_address_type addr_type; /**< Address type to used */
> > 
> > I think, there are 3 kinds of limitations/capabilities.
> > 
> > When the system is configured as IOVA as VA
> > 1) Device supports any VA address like memory from rte_malloc(),
> > rte_memzone(), malloc, stack memory
> > 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> > memory backed by hugepage and added to DMA map.
> > 
> > When the system is configured as IOVA as PA
> > 1) Devices support only PA addresses .
> > 
> > IMO, Above needs to be  advertised as capability and application needs
> > to align with that
> > and I dont think application requests the driver to work in any of the modes.
> > 
> 
> OK, Let's put together our ideas on address type:
> 
> There are three mode, we may define as:
> 	IOVA_as_VA-ALL     ---for device which may need support SVA feature
>                            ---may also be a CPU memcpy 'device'
> 	IOVA_as_VA         ---for device which need support IOMMU
> 	IOVA_as_PA
> 
> There are many combination of the modes which device supports: eg. some device
> may only support IOVA_as_PA, some may only support IOVA_as_VA, and some support
> IOVA_as_PA and IOVA_as_VA. The specific runtime type is determined by the vfio
> and drive capability(e.g RTE_PCI_DRV_NEED_IOVA_AS_VA).
> 
> So we already define two capabilities for this:
> 	#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
> 					---this cover IOVA_as_VA and IOVA_as_PA
> 	#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
> 					---this cover IOVA_as_VA-ALL
> for a device which don't support SVA:
> 	only declare RTE_DMA_DEV_CAPA_IOVA
> for a device which support SVA:
> 	delcare RTE_DAMA_DEV_CAPA_IOVA
> 	delcare RTE_DMA_DEV_CAPA_VA (only when IOMMU enabled and 'SVA flag' was set)
> for a CPU memcpy device:
> 	only declare RTE_DMA_DEV_CAPA_VA
> 
> As application:
> - if RTE_DMA_DEV_CAPA_VA support, then it could pass any va address to the DMA,
> - else if RTE_DMA_DEV_CAPA_IOVA support, then it should pass iova address to the DMA
> - else the DMA device should not exist.
> 

I still don't think we need all of this. DPDK already has support through
the existing bus infrastructure for determining if DPDK needs to use
physical or virtual addresses, so we should not be duplicating that as
devices *cannot* use a different addressing mode to DPDK itself.
Given that, the only flag we need is one to indicate SVA support.

> > 
<snip>
> > 
> > I also think, we may need to set status as bitmask and enumerate all
> > the combination of error codes
> > of all the driver and return string from driver existing rte_flow_error
> > 
> 
> bitmask has limit for most 32 (or we can extend 64), and also the rte_flow_error is
> heavy.
> 
> Considering that errors are a small number of scenarios, so it's OK to
> pass status array, and status have 32bit it could denotes a very large number
> of errcode.
> 

+1 to this.

> >> +
> >> +struct rte_dmadev_stats {
> >> +       uint64_t enqueue_fail_count;
> >> +       /**< Conut of all operations which failed enqueued */
> >> +       uint64_t enqueued_count;
> >> +       /**< Count of all operations which successful enqueued */
> >> +       uint64_t completed_fail_count;
> >> +       /**< Count of all operations which failed to complete */
> >> +       uint64_t completed_count;
> >> +       /**< Count of all operations which successful complete */
> >> +};
> > 
> > We need to have capability API to tell which items are
> > updated/supported by the driver.
> > 
> 
> There are fewer fields, and I don't think it's necessary to add capability API,
> for those who don't support, it could don't implement the callback.
> For those support, these fields are minimum et.
> 
> > 
> >> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> >> new file mode 100644
> >> index 0000000..a3afea2
> >> --- /dev/null
> >> +++ b/lib/dmadev/rte_dmadev_core.h
> >> @@ -0,0 +1,98 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright 2021 HiSilicon Limited.
> >> + */
> >> +
> >> +#ifndef _RTE_DMADEV_CORE_H_
> >> +#define _RTE_DMADEV_CORE_H_
> >> +
> >> +/**
> >> + * @file
> >> + *
> >> + * RTE DMA Device internal header.
> >> + *
> >> + * This header contains internal data types. But they are still part of the
> >> + * public API because they are used by inline public functions.
> >> + */
> >> +
> >> +struct rte_dmadev;
> >> +
> >> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> >> +                                     void *src, void *dst,
> >> +                                     uint32_t length, uint64_t flags);
> >> +/**< @internal Function used to enqueue a copy operation. */
> > 
> > To avoid namespace conflict(as it is public API) use rte_
> 
> These are internal function used by driver, not application.
> and the eth/regexdev_core also defined without rte_
> 
> So I think it should remain as it is.
> 

Even if only used by a driver, APIs are exported from the .so built for
the library, which means that they become public for apps using the lib.
Even for header-only symbols for drivers, it's good practice to put the
prefix since they are for use outside the compilation unit.

> > 
> > 
> >> +
> >> +/**
> >> + * The data structure associated with each DMA device.
> >> + */
> >> +struct rte_dmadev {
> >> +       /**< Enqueue a copy operation onto the DMA device. */
> >> +       dmadev_copy_t copy;
> >> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> >> +       dmadev_copy_sg_t copy_sg;
> >> +       /**< Enqueue a fill operation onto the DMA device. */
> >> +       dmadev_fill_t fill;
> >> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> >> +       dmadev_fill_sg_t fill_sg;
> >> +       /**< Add a fence to force ordering between operations. */
> >> +       dmadev_fence_t fence;
> >> +       /**< Trigger hardware to begin performing enqueued operations. */
> >> +       dmadev_perform_t perform;
> >> +       /**< Returns the number of operations that successful completed. */
> >> +       dmadev_completed_t completed;
> >> +       /**< Returns the number of operations that failed to complete. */
> >> +       dmadev_completed_fails_t completed_fails;
> > 
> > We need to limit fastpath items in 1 CL
> 
> yes, currently there are 8 callback, which just fill one cache line.
> 

Before we get overly concerned about this, I think we should benchmark it
to see how much our "one cacheline" is giving us compared to having them in
ops. For example, the "perform" doorbell function and the completed
function are each only called once per burst, so it would be interesting to see
how much difference it really makes for those.

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-06  3:56   ` fengchengwen
@ 2021-07-06 10:02     ` Bruce Richardson
  0 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 10:02 UTC (permalink / raw)
  To: fengchengwen
  Cc: Andrew Rybchenko, thomas, ferruh.yigit, jerinj, jerinjacobk, dev,
	mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

On Tue, Jul 06, 2021 at 11:56:03AM +0800, fengchengwen wrote:
> Many thanks, mostly OK, a few comment inline
> 
> On 2021/7/4 22:57, Andrew Rybchenko wrote:
> > On 7/2/21 4:18 PM, Chengwen Feng wrote:
> >> This patch introduces 'dmadevice' which is a generic type of DMA
> >> device.
<snip>
> > Do we really need both stats and xstats from the very
> > beginning? I think it is better to start from just
> > generic stats and add xstats when it is really required.
> 
> OK, but I think we should add one dump ops, which could be useful to
> find the problem.
> 
+1 to both suggestions - dropping xstats (for now) and adding dump fn.

^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
                   ` (3 preceding siblings ...)
  2021-07-04 15:21 ` Matan Azrad
@ 2021-07-06 20:28 ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports Bruce Richardson
                     ` (9 more replies)
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
                   ` (24 subsequent siblings)
  29 siblings, 10 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

This patchset contains a series of changes to dmadev based on work being done to
port over our drivers to test this new infrastructure. Some of these are bug
fixes to enable compilation e.g. missing exports or meson.build files, while
others are suggested changes to enhance the API. All these patches are to be
applied on top of [1] as they are mostly suggested changes to that RFC i.e.
patches to the patch!

The final patch includes some basic sanity tests for copy operations that we
have ported over from the ioat self-tests to use the dmadev APIs. The basic
dataplane part of those tests is probably ok for now, but the initialization of
queues in that test code may need some enhancement. Feedback welcome.

A tree with all these patches applied can be got at [2] if anyone wants to use
that as a basis for working on drivers, or for other discussion.

[1] http://patches.dpdk.org/project/dpdk/patch/1625231891-2963-1-git-send-email-fengchengwen@huawei.com/
[2] https://github.com/bruce-richardson/dpdk/tree/dmadev-rfcs

Bruce Richardson (9):
  dmadev: add missing exports
  dmadev: change virtual addresses to IOVA
  dmadev: add dump function
  dmadev: remove xstats functions
  dmadev: drop cookie typedef
  dmadev: allow NULL parameters to completed ops call
  dmadev: stats structure updates
  drivers: add dma driver category
  app/test: add basic dmadev unit test

 app/test/meson.build         |   2 +
 app/test/test_dmadev.c       | 320 +++++++++++++++++++++++++++++++++++
 drivers/dma/meson.build      |  11 ++
 drivers/meson.build          |   1 +
 lib/dmadev/rte_dmadev.c      |  66 ++------
 lib/dmadev/rte_dmadev.h      | 204 +++++++---------------
 lib/dmadev/rte_dmadev_core.h |  16 +-
 lib/dmadev/rte_dmadev_pmd.h  |  24 +--
 lib/dmadev/version.map       |   7 +-
 9 files changed, 425 insertions(+), 226 deletions(-)
 create mode 100644 app/test/test_dmadev.c
 create mode 100644 drivers/dma/meson.build

--
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-07  8:26     ` David Marchand
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 2/9] dmadev: change virtual addresses to IOVA Bruce Richardson
                     ` (8 subsequent siblings)
  9 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Export the rte_dmadevices array and the allocate and release functions
which are needed by PMDs.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/meson.build     | 1 +
 lib/dmadev/rte_dmadev.c | 2 ++
 lib/dmadev/version.map  | 3 +++
 3 files changed, 6 insertions(+)

diff --git a/drivers/meson.build b/drivers/meson.build
index bc6f4f567..f09a9172c 100644
--- a/drivers/meson.build
+++ b/drivers/meson.build
@@ -9,6 +9,7 @@ subdirs = [
         'common/mlx5',    # depends on bus.
         'common/qat',     # depends on bus.
         'common/sfc_efx', # depends on bus.
+        'dma',            # depends on bus.
         'mempool',        # depends on common and bus.
         'net',            # depends on common, bus, mempool
         'raw',            # depends on common, bus and net.
diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
index a94e83984..855f4d272 100644
--- a/lib/dmadev/rte_dmadev.c
+++ b/lib/dmadev/rte_dmadev.c
@@ -372,6 +372,7 @@ rte_dmadev_find_free_device_index(void)
 	return RTE_DMADEV_MAX_DEVS;
 }
 
+__rte_experimental
 struct rte_dmadev *
 rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id)
 {
@@ -414,6 +415,7 @@ rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id)
 	return dev;
 }
 
+__rte_experimental
 int
 rte_dmadev_pmd_release(struct rte_dmadev *dev)
 {
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
index 383b3ca5f..a0a121f3a 100644
--- a/lib/dmadev/version.map
+++ b/lib/dmadev/version.map
@@ -1,6 +1,8 @@
 EXPERIMENTAL {
 	global:
 
+	rte_dmadevices;
+	rte_dmadev_pmd_allocate;
 	rte_dmadev_count;
 	rte_dmadev_get_dev_id;
 	rte_dmadev_socket_id;
@@ -19,6 +21,7 @@ EXPERIMENTAL {
 	rte_dmadev_fill_sg;
 	rte_dmadev_fence;
 	rte_dmadev_perform;
+	rte_dmadev_pmd_release;
 	rte_dmadev_completed;
 	rte_dmadev_completed_fails;
 	rte_dmadev_stats_get;
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 2/9] dmadev: change virtual addresses to IOVA
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 3/9] dmadev: add dump function Bruce Richardson
                     ` (7 subsequent siblings)
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

For 32-bit builds, IOVAs are still 64-bit, so to ensure we can still
use PA mode on 32-bit we need to convert all enqueue "void *" parameters
to rte_iova_t.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/dmadev/rte_dmadev.h      | 8 ++++----
 lib/dmadev/rte_dmadev_core.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
index f74fc6adb..1659ceaf2 100644
--- a/lib/dmadev/rte_dmadev.h
+++ b/lib/dmadev/rte_dmadev.h
@@ -133,8 +133,8 @@ typedef int32_t dma_cookie_t;
  * dma_scatterlist - can hold scatter DMA operation request
  */
 struct dma_scatterlist {
-	void *src;
-	void *dst;
+	rte_iova_t src;
+	rte_iova_t dst;
 	uint32_t length;
 };
 
@@ -505,7 +505,7 @@ rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
  */
 __rte_experimental
 static inline dma_cookie_t
-rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,
+rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, rte_iova_t src, rte_iova_t dst,
 		uint32_t length, uint64_t flags)
 {
 	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
@@ -579,7 +579,7 @@ rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
 __rte_experimental
 static inline dma_cookie_t
 rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
-		void *dst, uint32_t length, uint64_t flags)
+		rte_iova_t dst, uint32_t length, uint64_t flags)
 {
 	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
 	return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
index a3afea251..80b56ed83 100644
--- a/lib/dmadev/rte_dmadev_core.h
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -17,7 +17,7 @@
 struct rte_dmadev;
 
 typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
-				      void *src, void *dst,
+				      rte_iova_t src, rte_iova_t dst,
 				      uint32_t length, uint64_t flags);
 /**< @internal Function used to enqueue a copy operation. */
 
@@ -27,7 +27,7 @@ typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
 /**< @internal Function used to enqueue a scatter list copy operation. */
 
 typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
-				      uint64_t pattern, void *dst,
+				      uint64_t pattern, rte_iova_t dst,
 				      uint32_t length, uint64_t flags);
 /**< @internal Function used to enqueue a fill operation. */
 
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 3/9] dmadev: add dump function
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 2/9] dmadev: change virtual addresses to IOVA Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 4/9] dmadev: remove xstats functions Bruce Richardson
                     ` (6 subsequent siblings)
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

A dump() function to print the state of a device to a file (e.g. stderr
or stdout) is very useful for debugging drivers.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/dmadev/rte_dmadev.c     | 17 +++++++++++++++++
 lib/dmadev/rte_dmadev.h     | 19 +++++++++++++++++++
 lib/dmadev/rte_dmadev_pmd.h |  5 +++++
 lib/dmadev/version.map      |  1 +
 4 files changed, 42 insertions(+)

diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
index 855f4d272..ffd7c5b97 100644
--- a/lib/dmadev/rte_dmadev.c
+++ b/lib/dmadev/rte_dmadev.c
@@ -345,6 +345,23 @@ rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids)
 	return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
 }
 
+int
+rte_dmadev_dump(uint16_t dev_id, FILE *f)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	fprintf(f, "DMA Dev %u, '%s' [%s]\n", dev->dev_id, dev->name,
+			dev->started ? "started" : "stopped");
+	fprintf(f, "  Driver: %s\n", dev->driver_name);
+	fprintf(f, "  Socket Id: %d\n", dev->socket_id);
+
+	if (dev->dev_ops->dump != NULL)
+		return (*dev->dev_ops->dump)(dev, f);
+	return 0;
+}
+
 int
 rte_dmadev_selftest(uint16_t dev_id)
 {
diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
index 1659ceaf2..d64df17bd 100644
--- a/lib/dmadev/rte_dmadev.h
+++ b/lib/dmadev/rte_dmadev.h
@@ -357,6 +357,25 @@ __rte_experimental
 int
 rte_dmadev_close(uint16_t dev_id);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Dump DMA device info.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @param f
+ *   The file to write the output to.
+ *
+ * @return
+ *   0 on success. Non-zero otherwise.
+ */
+__rte_experimental
+int
+rte_dmadev_dump(uint16_t dev_id, FILE *f);
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice.
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
index ef03cf7cd..428ddc943 100644
--- a/lib/dmadev/rte_dmadev_pmd.h
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -99,6 +99,9 @@ typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
 typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
 /**< @internal Function used to reset a configured device. */
 
+typedef int (*dmadev_dump_t)(struct rte_dmadev *dev, FILE *f);
+/**< @internal Function used to dump out the state of a device for debugging. */
+
 typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
 				    const struct rte_dmadev_queue_conf *conf);
 /**< @internal Function used to allocate and set up a virt queue. */
@@ -147,6 +150,8 @@ struct rte_dmadev_ops {
 	dmadev_close_t dev_close;
 	/**< Reset device. */
 	dmadev_reset_t dev_reset;
+	/**< Dump device info for debugging */
+	dmadev_dump_t dump;
 
 	/**< Allocate and set up a virt queue. */
 	dmadev_queue_setup_t queue_setup;
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
index a0a121f3a..ed051d54f 100644
--- a/lib/dmadev/version.map
+++ b/lib/dmadev/version.map
@@ -4,6 +4,7 @@ EXPERIMENTAL {
 	rte_dmadevices;
 	rte_dmadev_pmd_allocate;
 	rte_dmadev_count;
+	rte_dmadev_dump;
 	rte_dmadev_get_dev_id;
 	rte_dmadev_socket_id;
 	rte_dmadev_info_get;
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 4/9] dmadev: remove xstats functions
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (2 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 3/9] dmadev: add dump function Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 5/9] dmadev: drop cookie typedef Bruce Richardson
                     ` (5 subsequent siblings)
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Remove the xstats function calls, as they are not yet needed for this
class.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/dmadev/rte_dmadev.c     | 63 --------------------------
 lib/dmadev/rte_dmadev.h     | 89 -------------------------------------
 lib/dmadev/rte_dmadev_pmd.h | 19 --------
 lib/dmadev/version.map      |  3 --
 4 files changed, 174 deletions(-)

diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
index ffd7c5b97..fed168675 100644
--- a/lib/dmadev/rte_dmadev.c
+++ b/lib/dmadev/rte_dmadev.c
@@ -282,69 +282,6 @@ rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
 	return (*dev->dev_ops->stats_reset)(dev, vq_id);
 }
 
-static int
-xstats_get_count(uint16_t dev_id)
-{
-	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
-
-	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
-
-	return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
-}
-
-int
-rte_dmadev_xstats_names_get(uint16_t dev_id,
-			    struct rte_dmadev_xstats_name *xstats_names,
-			    uint32_t size)
-{
-	struct rte_dmadev *dev;
-	int cnt_expected_entries;
-
-	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
-
-	cnt_expected_entries = xstats_get_count(dev_id);
-
-	if (xstats_names == NULL || cnt_expected_entries < 0 ||
-	    (int)size < cnt_expected_entries || size == 0)
-		return cnt_expected_entries;
-
-	dev = &rte_dmadevices[dev_id];
-
-	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
-	return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
-}
-
-int
-rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
-		      uint64_t values[], uint32_t n)
-{
-	struct rte_dmadev *dev;
-
-	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
-	RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
-	RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
-
-	dev = &rte_dmadevices[dev_id];
-
-	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -ENOTSUP);
-
-	return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
-}
-
-int
-rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids)
-{
-	struct rte_dmadev *dev;
-
-	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
-
-	dev = &rte_dmadevices[dev_id];
-
-	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -ENOTSUP);
-
-	return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
-}
-
 int
 rte_dmadev_dump(uint16_t dev_id, FILE *f)
 {
diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
index d64df17bd..2bfc0b619 100644
--- a/lib/dmadev/rte_dmadev.h
+++ b/lib/dmadev/rte_dmadev.h
@@ -824,95 +824,6 @@ __rte_experimental
 int
 rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
 
-/** Maximum name length for extended statistics counters */
-#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
-
-/**
- * A name-key lookup element for extended statistics.
- *
- * This structure is used to map between names and ID numbers
- * for extended ethdev statistics.
- */
-struct rte_dmadev_xstats_name {
-	char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
-};
-
-/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice.
- *
- * Retrieve names of extended statistics of a DMA device.
- *
- * @param dev_id
- *   The identifier of the device.
- * @param[out] xstats_names
- *   Block of memory to insert names into. Must be at least size in capacity.
- *   If set to NULL, function returns required capacity.
- * @param size
- *   Capacity of xstats_names (number of names).
- * @return
- *   - positive value lower or equal to size: success. The return value
- *     is the number of entries filled in the stats table.
- *   - positive value higher than size: error, the given statistics table
- *     is too small. The return value corresponds to the size that should
- *     be given to succeed. The entries in the table are not valid and
- *     shall not be used by the caller.
- *   - negative value on error.
- */
-__rte_experimental
-int
-rte_dmadev_xstats_names_get(uint16_t dev_id,
-			    struct rte_dmadev_xstats_name *xstats_names,
-			    uint32_t size);
-
-/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice.
- *
- * Retrieve extended statistics of a DMA device.
- *
- * @param dev_id
- *   The identifier of the device.
- * @param ids
- *   The id numbers of the stats to get. The ids can be got from the stat
- *   position in the stat list from rte_dmadev_get_xstats_names().
- * @param[out] values
- *   The values for each stats request by ID.
- * @param n
- *   The number of stats requested.
- *
- * @return
- *   - positive value: number of stat entries filled into the values array.
- *   - negative value on error.
- */
-__rte_experimental
-int
-rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
-		      uint64_t values[], uint32_t n);
-
-/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice.
- *
- * Reset the values of the xstats of the selected component in the device.
- *
- * @param dev_id
- *   The identifier of the device.
- * @param ids
- *   Selects specific statistics to be reset. When NULL, all statistics
- *   will be reset. If non-NULL, must point to array of at least
- *   *nb_ids* size.
- * @param nb_ids
- *   The number of ids available from the *ids* array. Ignored when ids is NULL.
- *
- * @return
- *   - zero: successfully reset the statistics to zero.
- *   - negative value on error.
- */
-__rte_experimental
-int
-rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids);
-
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice.
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
index 428ddc943..d0ec43af6 100644
--- a/lib/dmadev/rte_dmadev_pmd.h
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -120,19 +120,6 @@ typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
 typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
 /**< @internal Function used to reset basic statistics. */
 
-typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev *dev,
-		struct rte_dmadev_xstats_name *xstats_names,
-		uint32_t size);
-/**< @internal Function used to get names of extended stats. */
-
-typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
-		const uint32_t ids[], uint64_t values[], uint32_t n);
-/**< @internal Function used to retrieve extended stats. */
-
-typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
-				     const uint32_t ids[], uint32_t nb_ids);
-/**< @internal Function used to reset extended stats. */
-
 typedef int (*dmadev_selftest_t)(uint16_t dev_id);
 /**< @internal Function used to start dmadev selftest. */
 
@@ -164,12 +151,6 @@ struct rte_dmadev_ops {
 	dmadev_stats_get_t stats_get;
 	/**< Reset basic statistics. */
 	dmadev_stats_reset_t stats_reset;
-	/**< Get names of extended stats. */
-	dmadev_xstats_get_names_t xstats_get_names;
-	/**< Get extended statistics. */
-	dmadev_xstats_get_t xstats_get;
-	/**< Reset extended statistics values. */
-	dmadev_xstats_reset_t xstats_reset;
 
 	/**< Device selftest function */
 	dmadev_selftest_t dev_selftest;
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
index ed051d54f..a4d6b539a 100644
--- a/lib/dmadev/version.map
+++ b/lib/dmadev/version.map
@@ -27,9 +27,6 @@ EXPERIMENTAL {
 	rte_dmadev_completed_fails;
 	rte_dmadev_stats_get;
 	rte_dmadev_stats_reset;
-	rte_dmadev_xstats_names_get;
-	rte_dmadev_xstats_get;
-	rte_dmadev_xstats_reset;
 	rte_dmadev_selftest;
 
 	local: *;
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 5/9] dmadev: drop cookie typedef
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (3 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 4/9] dmadev: remove xstats functions Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 6/9] dmadev: allow NULL parameters to completed ops call Bruce Richardson
                     ` (4 subsequent siblings)
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Rather than having a special type for the index values used in dmadev,
just use regular int types, with appropriate return value notifications.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/dmadev/rte_dmadev.h      | 59 ++++++++++++------------------------
 lib/dmadev/rte_dmadev_core.h | 12 ++++----
 2 files changed, 26 insertions(+), 45 deletions(-)

diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
index 2bfc0b619..8cfe14dd2 100644
--- a/lib/dmadev/rte_dmadev.h
+++ b/lib/dmadev/rte_dmadev.h
@@ -106,29 +106,6 @@ extern "C" {
 #include <rte_errno.h>
 #include <rte_compat.h>
 
-/**
- * dma_cookie_t - an opaque DMA cookie
- *
- * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
- * code.
- * When using cookies, comply with the following rules:
- * a) Cookies for each virtual queue are independent.
- * b) For a virt queue, the cookie are monotonically incremented, when it reach
- *    the INT_MAX, it wraps back to zero.
- * c) The initial cookie of a virt queue is zero, after the device is stopped or
- *    reset, the virt queue's cookie needs to be reset to zero.
- * Example:
- *    step-1: start one dmadev
- *    step-2: enqueue a copy operation, the cookie return is 0
- *    step-3: enqueue a copy operation again, the cookie return is 1
- *    ...
- *    step-101: stop the dmadev
- *    step-102: start the dmadev
- *    step-103: enqueue a copy operation, the cookie return is 0
- *    ...
- */
-typedef int32_t dma_cookie_t;
-
 /**
  * dma_scatterlist - can hold scatter DMA operation request
  */
@@ -517,13 +494,14 @@ rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
  *   An opaque flags for this operation.
  *
  * @return
- *   dma_cookie_t: please refer to the corresponding definition.
+ *   <0 on error,
+ *   on success, index of enqueued copy job, monotonically increasing between 0..UINT16_MAX
  *
  * NOTE: The caller must ensure that the input parameter is valid and the
  *       corresponding device supports the operation.
  */
 __rte_experimental
-static inline dma_cookie_t
+static inline int
 rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, rte_iova_t src, rte_iova_t dst,
 		uint32_t length, uint64_t flags)
 {
@@ -552,13 +530,14 @@ rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, rte_iova_t src, rte_iova_t dst,
  *   An opaque flags for this operation.
  *
  * @return
- *   dma_cookie_t: please refer to the corresponding definition.
+ *   <0 on error,
+ *   on success, index of enqueued copy job, monotonically increasing between 0..UINT16_MAX
  *
  * NOTE: The caller must ensure that the input parameter is valid and the
  *       corresponding device supports the operation.
  */
 __rte_experimental
-static inline dma_cookie_t
+static inline int
 rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
 		   const struct dma_scatterlist *sg,
 		   uint32_t sg_len, uint64_t flags)
@@ -590,13 +569,14 @@ rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
  *   An opaque flags for this operation.
  *
  * @return
- *   dma_cookie_t: please refer to the corresponding definition.
+ *   <0 on error,
+ *   on success, index of enqueued copy job, monotonically increasing between 0..UINT16_MAX
  *
  * NOTE: The caller must ensure that the input parameter is valid and the
  *       corresponding device supports the operation.
  */
 __rte_experimental
-static inline dma_cookie_t
+static inline int
 rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
 		rte_iova_t dst, uint32_t length, uint64_t flags)
 {
@@ -627,13 +607,14 @@ rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
  *   An opaque flags for this operation.
  *
  * @return
- *   dma_cookie_t: please refer to the corresponding definition.
+ *   <0 on error,
+ *   on success, index of enqueued copy job, monotonically increasing between 0..UINT16_MAX
  *
  * NOTE: The caller must ensure that the input parameter is valid and the
  *       corresponding device supports the operation.
  */
 __rte_experimental
-static inline dma_cookie_t
+static inline int
 rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
 		   const struct dma_scatterlist *sg, uint32_t sg_len,
 		   uint64_t flags)
@@ -716,8 +697,8 @@ rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
  *   The identifier of virt queue.
  * @param nb_cpls
  *   The maximum number of completed operations that can be processed.
- * @param[out] cookie
- *   The last completed operation's cookie.
+ * @param[out] last_idx
+ *   The last completed operation's index, as returned when entry was enqueued
  * @param[out] has_error
  *   Indicates if there are transfer error.
  *
@@ -730,11 +711,11 @@ rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
 __rte_experimental
 static inline uint16_t
 rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
-		     dma_cookie_t *cookie, bool *has_error)
+		     uint16_t *last_idx, bool *has_error)
 {
 	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
 	has_error = false;
-	return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
+	return (*dev->completed)(dev, vq_id, nb_cpls, last_idx, has_error);
 }
 
 /**
@@ -752,8 +733,8 @@ rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
  *   Indicates the size of status array.
  * @param[out] status
  *   The error code of operations that failed to complete.
- * @param[out] cookie
- *   The last failed completed operation's cookie.
+ * @param[out] last_idx
+ *   The last failed completed operation's index.
  *
  * @return
  *   The number of operations that failed to complete.
@@ -765,10 +746,10 @@ __rte_experimental
 static inline uint16_t
 rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
 			   const uint16_t nb_status, uint32_t *status,
-			   dma_cookie_t *cookie)
+			   uint16_t *last_idx)
 {
 	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
-	return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
+	return (*dev->completed_fails)(dev, vq_id, nb_status, status, last_idx);
 }
 
 struct rte_dmadev_stats {
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
index 80b56ed83..7fbefe8f9 100644
--- a/lib/dmadev/rte_dmadev_core.h
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -16,22 +16,22 @@
 
 struct rte_dmadev;
 
-typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
+typedef int (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
 				      rte_iova_t src, rte_iova_t dst,
 				      uint32_t length, uint64_t flags);
 /**< @internal Function used to enqueue a copy operation. */
 
-typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
+typedef int (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
 					 const struct dma_scatterlist *sg,
 					 uint32_t sg_len, uint64_t flags);
 /**< @internal Function used to enqueue a scatter list copy operation. */
 
-typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
+typedef int (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
 				      uint64_t pattern, rte_iova_t dst,
 				      uint32_t length, uint64_t flags);
 /**< @internal Function used to enqueue a fill operation. */
 
-typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
+typedef int (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
 			uint64_t pattern, const struct dma_scatterlist *sg,
 			uint32_t sg_len, uint64_t flags);
 /**< @internal Function used to enqueue a scatter list fill operation. */
@@ -44,12 +44,12 @@ typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t vq_id);
 
 typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vq_id,
 				       const uint16_t nb_cpls,
-				       dma_cookie_t *cookie, bool *has_error);
+				       uint16_t *last_idx, bool *has_error);
 /**< @internal Function used to return number of successful completed operations */
 
 typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
 			uint16_t vq_id, const uint16_t nb_status,
-			uint32_t *status, dma_cookie_t *cookie);
+			uint32_t *status, uint16_t *last_idx);
 /**< @internal Function used to return number of failed completed operations */
 
 #define RTE_DMADEV_NAME_MAX_LEN	64 /**< Max length of name of DMA PMD */
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 6/9] dmadev: allow NULL parameters to completed ops call
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (4 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 5/9] dmadev: drop cookie typedef Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 7/9] dmadev: stats structure updates Bruce Richardson
                     ` (3 subsequent siblings)
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Allow the user to skip passing the "out" parameters to the
rte_dmadev_completed() API call, by using local replacements in the
inline function. This simplifies drivers, and compilers should be able
to remove the branches at compile time in many cases.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/dmadev/rte_dmadev.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
index 8cfe14dd2..eb78f3805 100644
--- a/lib/dmadev/rte_dmadev.h
+++ b/lib/dmadev/rte_dmadev.h
@@ -698,9 +698,11 @@ rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
  * @param nb_cpls
  *   The maximum number of completed operations that can be processed.
  * @param[out] last_idx
- *   The last completed operation's index, as returned when entry was enqueued
+ *   The last completed operation's index, as returned when entry was enqueued.
+ *   If not required, NULL can be passed in.
  * @param[out] has_error
  *   Indicates if there are transfer error.
+ *   If not required, may be passed as NULL.
  *
  * @return
  *   The number of operations that successful completed.
@@ -714,7 +716,20 @@ rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
 		     uint16_t *last_idx, bool *has_error)
 {
 	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
-	has_error = false;
+	bool err = false;
+	uint16_t idx;
+
+	/* ensure the pointer values are non-null to simplify drivers.
+	 * In most cases these should be compile time evaluated, since this is an inline function.
+	 * - If NULL is explicitly passed as parameter, then compiler knows the value is NULL
+	 * - If address of local variable is passed as parameter, then compiler can
+	 *   know it's non-NULL.
+	 */
+	if (has_error == NULL)
+		has_error = &err;
+	if (last_idx == NULL)
+		last_idx = &idx;
+
 	return (*dev->completed)(dev, vq_id, nb_cpls, last_idx, has_error);
 }
 
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 7/9] dmadev: stats structure updates
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (5 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 6/9] dmadev: allow NULL parameters to completed ops call Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 8/9] drivers: add dma driver category Bruce Richardson
                     ` (2 subsequent siblings)
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Drop the failed enqueue count since that is best tracked by the
application so that retries of the same job can be counted as desired by
the app developer. Since the "doorbell" function is separate from the
actual functions to enqueue descriptors, track a separate stat for jobs
which were submitted to hardware, in case the "enqueued" count includes
jobs which were not yet "doorbelled".

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/dmadev/rte_dmadev.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
index eb78f3805..bdb531a53 100644
--- a/lib/dmadev/rte_dmadev.h
+++ b/lib/dmadev/rte_dmadev.h
@@ -768,14 +768,10 @@ rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
 }
 
 struct rte_dmadev_stats {
-	uint64_t enqueue_fail_count;
-	/**< Conut of all operations which failed enqueued */
-	uint64_t enqueued_count;
-	/**< Count of all operations which successful enqueued */
-	uint64_t completed_fail_count;
-	/**< Count of all operations which failed to complete */
-	uint64_t completed_count;
-	/**< Count of all operations which successful complete */
+	uint64_t enqueued_count;       /**< Count of operations which were successful enqueued */
+	uint64_t submitted_count;      /**< Count of operations which were submitted to hardware */
+	uint64_t completed_fail_count; /**< Count of operations which failed to complete */
+	uint64_t completed_count;      /**< Count of operations which successful complete */
 };
 
 /**
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 8/9] drivers: add dma driver category
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (6 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 7/9] dmadev: stats structure updates Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 9/9] app/test: add basic dmadev unit test Bruce Richardson
  2021-07-07  3:16   ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates fengchengwen
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/dma/meson.build | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 drivers/dma/meson.build

diff --git a/drivers/dma/meson.build b/drivers/dma/meson.build
new file mode 100644
index 000000000..986b28be5
--- /dev/null
+++ b/drivers/dma/meson.build
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 Intel Corporation
+
+if is_windows
+    subdir_done()
+endif
+
+drivers = [
+]
+
+std_deps = ['dmadev']
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [RFC UPDATE PATCH 9/9] app/test: add basic dmadev unit test
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (7 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 8/9] drivers: add dma driver category Bruce Richardson
@ 2021-07-06 20:28   ` Bruce Richardson
  2021-07-07  3:16   ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates fengchengwen
  9 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-06 20:28 UTC (permalink / raw)
  To: dev
  Cc: Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup,
	Bruce Richardson

Add in some basic dmadev unit tests for testing drivers and the library
itself.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 app/test/meson.build   |   2 +
 app/test/test_dmadev.c | 320 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 322 insertions(+)
 create mode 100644 app/test/test_dmadev.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 0a5f42557..223ca210a 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -36,6 +36,7 @@ test_sources = files(
         'test_debug.c',
         'test_distributor.c',
         'test_distributor_perf.c',
+        'test_dmadev.c',
         'test_eal_flags.c',
         'test_eal_fs.c',
         'test_efd.c',
@@ -155,6 +156,7 @@ test_deps = [
         'cmdline',
         'cryptodev',
         'distributor',
+        'dmadev',
         'efd',
         'ethdev',
         'eventdev',
diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
new file mode 100644
index 000000000..df301b385
--- /dev/null
+++ b/app/test/test_dmadev.c
@@ -0,0 +1,320 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 Intel Corporation
+ */
+
+#include <unistd.h>
+
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include "test.h"
+
+#define COPY_LEN 1024
+
+static struct rte_mempool *pool;
+static uint16_t id_count = 0;
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+static int
+do_multi_copies(int dev_id, int split_batches, int split_completions)
+{
+	struct rte_mbuf *srcs[32], *dsts[32];
+	unsigned int i, j;
+	bool dma_err = false;
+
+	for (i = 0; i < RTE_DIM(srcs); i++) {
+		char *src_data;
+
+		if (split_batches && i == RTE_DIM(srcs) / 2)
+			rte_dmadev_perform(dev_id, 0);
+
+		srcs[i] = rte_pktmbuf_alloc(pool);
+		dsts[i] = rte_pktmbuf_alloc(pool);
+		src_data = rte_pktmbuf_mtod(srcs[i], char *);
+		if (srcs[i] == NULL || dsts[i] == NULL) {
+			PRINT_ERR("Error allocating buffers\n");
+			return -1;
+		}
+
+		for (j = 0; j < COPY_LEN; j++)
+			src_data[j] = rand() & 0xFF;
+
+		if (rte_dmadev_copy(dev_id, 0, srcs[i]->buf_iova + srcs[i]->data_off,
+				dsts[i]->buf_iova + dsts[i]->data_off, COPY_LEN, 0) != id_count++) {
+			PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", i);
+			return -1;
+		}
+	}
+	rte_dmadev_perform(dev_id, 0);
+	usleep(100);
+
+	if (split_completions) {
+		/* gather completions in two halves */
+		uint16_t half_len = RTE_DIM(srcs) / 2;
+		int ret = rte_dmadev_completed(dev_id, 0, half_len, NULL, &dma_err);
+		if (ret != half_len || dma_err) {
+			PRINT_ERR("Error with rte_dmadev_completed - first half. ret = %d, expected ret = %u, dma_err = %d\n",
+					ret, half_len, dma_err);
+			rte_dmadev_dump(dev_id, stdout);
+			return -1;
+		}
+		ret = rte_dmadev_completed(dev_id, 0, half_len, NULL, &dma_err);
+		if (ret != half_len || dma_err) {
+			PRINT_ERR("Error with rte_dmadev_completed - second half. ret = %d, expected ret = %u, dma_err = %d\n",
+					ret, half_len, dma_err);
+			rte_dmadev_dump(dev_id, stdout);
+			return -1;
+		}
+	} else {
+		/* gather all completions in one go */
+		if ((j = rte_dmadev_completed(dev_id, 0, RTE_DIM(srcs), NULL,
+				&dma_err)) != RTE_DIM(srcs) || dma_err) {
+			PRINT_ERR("Error with rte_dmadev_completed, %u [expected: %zu], dma_err = %d\n",
+					j, RTE_DIM(srcs), dma_err);
+			rte_dmadev_dump(dev_id, stdout);
+			return -1;
+		}
+	}
+
+	/* check for empty */
+	if (rte_dmadev_completed(dev_id, 0, RTE_DIM(srcs), NULL, &dma_err) != 0 || dma_err) {
+		PRINT_ERR("Error with rte_dmadev_completed - ops unexpectedly returned\n");
+		rte_dmadev_dump(dev_id, stdout);
+		return -1;
+	}
+
+	for (i = 0; i < RTE_DIM(srcs); i++) {
+		char *src_data, *dst_data;
+
+		src_data = rte_pktmbuf_mtod(srcs[i], char *);
+		dst_data = rte_pktmbuf_mtod(dsts[i], char *);
+		for (j = 0; j < COPY_LEN; j++)
+			if (src_data[j] != dst_data[j]) {
+				PRINT_ERR("Error with copy of packet %u, byte %u\n", i, j);
+				return -1;
+			}
+		rte_pktmbuf_free(srcs[i]);
+		rte_pktmbuf_free(dsts[i]);
+	}
+	return 0;
+}
+
+static int
+test_enqueue_copies(int dev_id)
+{
+	unsigned int i;
+	uint16_t id;
+
+	/* test doing a single copy */
+	do {
+		struct rte_mbuf *src, *dst;
+		char *src_data, *dst_data;
+
+		src = rte_pktmbuf_alloc(pool);
+		dst = rte_pktmbuf_alloc(pool);
+		src_data = rte_pktmbuf_mtod(src, char *);
+		dst_data = rte_pktmbuf_mtod(dst, char *);
+
+		for (i = 0; i < COPY_LEN; i++)
+			src_data[i] = rand() & 0xFF;
+
+		id = rte_dmadev_copy(dev_id, 0, src->buf_iova + src->data_off,
+				dst->buf_iova + dst->data_off, COPY_LEN, 0);
+		if (id != id_count) {
+			PRINT_ERR("Error with rte_dmadev_copy, got %u, expected %u\n",
+					id, id_count);
+			return -1;
+		}
+		if (rte_dmadev_perform(dev_id, 0) < 0) {
+			PRINT_ERR("Error with rte_dmadev_perform\n");
+			return -1;
+		}
+		/* give time for copy to finish, then check it was done */
+		usleep(10);
+
+		for (i = 0; i < COPY_LEN; i++) {
+			if (dst_data[i] != src_data[i]) {
+				PRINT_ERR("Data mismatch at char %u [Got %02x not %02x]\n", i,
+						dst_data[i], src_data[i]);
+				rte_dmadev_dump(dev_id, stderr);
+				return -1;
+			}
+		}
+
+		/* now check completion works */
+		if (rte_dmadev_completed(dev_id, 0, 1, &id, NULL) != 1) {
+			PRINT_ERR("Error with rte_dmadev_completed\n");
+			return -1;
+		}
+		if (id != id_count) {
+			PRINT_ERR("Error:incorrect job id received, %u [expected %u]\n", id, id_count);
+			return -1;
+		}
+
+		rte_pktmbuf_free(src);
+		rte_pktmbuf_free(dst);
+
+		/* now check completion works */
+		if (rte_dmadev_completed(dev_id, 0, 1, NULL, NULL) != 0) {
+			PRINT_ERR("Error with rte_dmadev_completed in empty check\n");
+			return -1;
+		}
+		id_count++;
+
+	} while (0);
+
+	/* test doing a multiple single copies */
+	do {
+		const uint16_t max_ops = 4;
+		struct rte_mbuf *src, *dst;
+		char *src_data, *dst_data;
+
+		src = rte_pktmbuf_alloc(pool);
+		dst = rte_pktmbuf_alloc(pool);
+		src_data = rte_pktmbuf_mtod(src, char *);
+		dst_data = rte_pktmbuf_mtod(dst, char *);
+
+		for (i = 0; i < COPY_LEN; i++)
+			src_data[i] = rand() & 0xFF;
+
+		/* perform the same copy <max_ops> times */
+		for (i = 0; i < max_ops; i++) {
+			if (rte_dmadev_copy(dev_id, 0,
+					src->buf_iova + src->data_off,
+					dst->buf_iova + dst->data_off,
+					COPY_LEN, 0) != id_count++) {
+				PRINT_ERR("Error with rte_dmadev_copy\n");
+				return -1;
+			}
+			rte_dmadev_perform(dev_id, 0);
+		}
+		usleep(10);
+
+		if ((i = rte_dmadev_completed(dev_id, 0, max_ops * 2, &id, NULL)) != max_ops) {
+			PRINT_ERR("Error with rte_dmadev_completed, got %u not %u\n", i, max_ops);
+			return -1;
+		}
+		if (id != id_count - 1) {
+			PRINT_ERR("Error, incorrect job id returned: got %u not %u\n", id, id_count - 1);
+			return -1;
+		}
+		for (i = 0; i < COPY_LEN; i++) {
+			if (dst_data[i] != src_data[i]) {
+				PRINT_ERR("Data mismatch at char %u\n", i);
+				return -1;
+			}
+		}
+		rte_pktmbuf_free(src);
+		rte_pktmbuf_free(dst);
+	} while (0);
+
+	/* test doing multiple copies */
+	return do_multi_copies(dev_id, 0, 0) /* enqueue and complete one batch at a time */
+			|| do_multi_copies(dev_id, 1, 0) /* enqueue 2 batches and then complete both */
+			|| do_multi_copies(dev_id, 0, 1); /* enqueue 1 batch, then complete in two halves */
+}
+
+static int
+test_dmadev_instance(uint16_t dev_id)
+{
+#define TEST_RINGSIZE 512
+	struct rte_dmadev_info info;
+	struct rte_dmadev_conf conf = { .nb_hw_queues = 1};
+	struct rte_dmadev_queue_conf qconf = { .nb_desc = TEST_RINGSIZE };
+	int i;
+
+	rte_dmadev_info_get(dev_id, &info);
+	if (info.max_hw_queues < 1) {
+		PRINT_ERR("Error, no queues reported on device id %u\n", dev_id);
+		return -1;
+	}
+	if (rte_dmadev_configure(dev_id, &conf) != 0) {
+		PRINT_ERR("Error with rte_rawdev_configure()\n");
+		return -1;
+	}
+	if (rte_dmadev_queue_setup(dev_id, &qconf) != 0) {
+		PRINT_ERR("Error with queue configuration\n");
+		return -1;
+	}
+	rte_dmadev_info_get(dev_id, &info);
+	if (info.nb_hw_queues != 1) {
+		PRINT_ERR("Error, no configured queues reported on device id %u\n", dev_id);
+		return -1;
+	}
+
+	if (rte_dmadev_start(dev_id) != 0) {
+		PRINT_ERR("Error with rte_rawdev_start()\n");
+		return -1;
+	}
+	id_count = 0;
+
+	/* create a mempool for running tests */
+	pool = rte_pktmbuf_pool_create("TEST_DMADEV_POOL",
+			TEST_RINGSIZE * 2, /* n == num elements */
+			32,  /* cache size */
+			0,   /* priv size */
+			2048, /* data room size */
+			info.socket_id);
+	if (pool == NULL) {
+		PRINT_ERR("Error with mempool creation\n");
+		return -1;
+	}
+
+	/* run the test cases */
+	printf("DMA Dev: %u, Running Copy Tests\n", dev_id);
+	for (i = 0; i < 768; i++) {
+		struct rte_dmadev_stats stats;
+
+		if (test_enqueue_copies(dev_id) != 0) {
+			printf("Error with iteration %d\n", i);
+			rte_dmadev_dump(dev_id, stdout);
+			goto err;
+		}
+
+		rte_dmadev_stats_get(dev_id, 0, &stats);
+		printf("Ops enqueued: %"PRIu64"\t", stats.enqueued_count);
+		printf("Ops completed: %"PRIu64"\r", stats.completed_count);
+	}
+	printf("\n");
+
+	rte_mempool_free(pool);
+	rte_dmadev_stop(dev_id);
+
+	return 0;
+
+err:
+	rte_mempool_free(pool);
+	rte_dmadev_stop(dev_id);
+	return -1;
+}
+
+static int
+test_dmadevs(void)
+{
+	int i;
+
+	if (rte_dmadev_count() == 0)
+		return TEST_SKIPPED;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
+		if (rte_dmadevices[i].attached && test_dmadev_instance(i) < 0)
+			return -1;
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(dmadev_autotest, test_dmadevs);
-- 
2.30.2


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
                     ` (8 preceding siblings ...)
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 9/9] app/test: add basic dmadev unit test Bruce Richardson
@ 2021-07-07  3:16   ` fengchengwen
  2021-07-07  8:11     ` Bruce Richardson
                       ` (2 more replies)
  9 siblings, 3 replies; 339+ messages in thread
From: fengchengwen @ 2021-07-07  3:16 UTC (permalink / raw)
  To: Bruce Richardson, dev; +Cc: Jerin Jacob, Jerin Jacob, Morten Brørup

LGTM, thanks

And I'm preparing dmadev V2, which will include:
a) Fix code review comments (e.g. multiple-process support, doxygen, comments, typo)
b) Flatten device abstraction to two layer: dmadev <> vchan
c) Public API use dev_id and vchan_id to locate one vchan
d) Using the flags parameter instead of the fence API
e) Rename rte_dmadev_perform to rte_dmadev_submit so it corresponds to the stats variable.

PS: Some code (lib/dmadev) will be rebased on top of this patchset


On 2021/7/7 4:28, Bruce Richardson wrote:
> This patchset contains a series of changes to dmadev based on work being done to
> port over our drivers to test this new infrastructure. Some of these are bug
> fixes to enable compilation e.g. missing exports or meson.build files, while
> others are suggested changes to enhance the API. All these patches are to be
> applied on top of [1] as they are mostly suggested changes to that RFC i.e.
> patches to the patch!
> 
> The final patch includes some basic sanity tests for copy operations that we
> have ported over from the ioat self-tests to use the dmadev APIs. The basic
> dataplane part of those tests is probably ok for now, but the initialization of
> queues in that test code may need some enhancement. Feedback welcome.
> 
> A tree with all these patches applied can be got at [2] if anyone wants to use
> that as a basis for working on drivers, or for other discussion.
> 
> [1] http://patches.dpdk.org/project/dpdk/patch/1625231891-2963-1-git-send-email-fengchengwen@huawei.com/
> [2] https://github.com/bruce-richardson/dpdk/tree/dmadev-rfcs
> 
> Bruce Richardson (9):
>   dmadev: add missing exports
>   dmadev: change virtual addresses to IOVA
>   dmadev: add dump function
>   dmadev: remove xstats functions
>   dmadev: drop cookie typedef
>   dmadev: allow NULL parameters to completed ops call
>   dmadev: stats structure updates
>   drivers: add dma driver category
>   app/test: add basic dmadev unit test
> 
>  app/test/meson.build         |   2 +
>  app/test/test_dmadev.c       | 320 +++++++++++++++++++++++++++++++++++
>  drivers/dma/meson.build      |  11 ++
>  drivers/meson.build          |   1 +
>  lib/dmadev/rte_dmadev.c      |  66 ++------
>  lib/dmadev/rte_dmadev.h      | 204 +++++++---------------
>  lib/dmadev/rte_dmadev_core.h |  16 +-
>  lib/dmadev/rte_dmadev_pmd.h  |  24 +--
>  lib/dmadev/version.map       |   7 +-
>  9 files changed, 425 insertions(+), 226 deletions(-)
>  create mode 100644 app/test/test_dmadev.c
>  create mode 100644 drivers/dma/meson.build
> 
> --
> 2.30.2
> 
> 
> .
> 


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-05 17:16       ` Bruce Richardson
@ 2021-07-07  8:08         ` Jerin Jacob
  2021-07-07  8:35           ` Bruce Richardson
  0 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-07  8:08 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> >
> > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > >
> > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > device.
> <snip>
> > >
> > > +1 and the terminology with regards to queues and channels. With our ioat
> > > hardware, each HW queue was called a channel for instance.
> >
> > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > HW has more than
> > 1 queues it can be exposed as separate dmadev dev.
> >
>
> Fine for me.
>
> However, just to confirm that Morten's suggestion of using a
> (device-specific void *) channel pointer rather than dev_id + channel_id
> pair of parameters won't work for you? You can't store a pointer or dev
> index in the channel struct in the driver?

Yes, that will work. To confirm, the suggestion is to use a void *
object instead of channel_id.
That will avoid one more indirection (index -> pointer).


>
> >
> <snip>
> > > > > + *
> > > > > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > > > > + * code.
> > > > > + * When using cookies, comply with the following rules:
> > > > > + * a) Cookies for each virtual queue are independent.
> > > > > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > > > > + *    the INT_MAX, it wraps back to zero.
> > >
> > > I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> > > value, it means that we cannot use implicit wrap-around inside the CPU and
> > > have to check for the INT_MAX value. Better to:
> > > 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> > > uint16_t internally and wrap-around automatically, or:
> > > 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> > > drivers the flexibility at what value to wrap around.
> >
> > I think, (2) better than 1. I think, even better to wrap around the number of
> > descriptors configured in dev_configure()(We cake make this as the power of 2),
> >
>
> Interesting, I hadn't really considered that before. My only concern
> would be if an app wants to keep values in the app ring for a while after
> they have been returned from dmadev. I thought it easier to have the full
> 16-bit counter value returned to the user to give the most flexibility,
> given that going from that to any power-of-2 ring size smaller is a trivial
> operation.
>
> Overall, while my ideal situation is to always have a 0..UINT16_MAX return
> value from the function, I can live with your suggestion of wrapping at
> ring_size, since drivers will likely do that internally anyway.
> I think wrapping at INT32_MAX is too awkward and will be error prone since
> we can't rely on hardware automatically wrapping to zero, nor on the driver
> having pre-masked the value.

OK. +1 for UINT16_MAX

>
> > >
> > > > > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > > > > + *    reset, the virt queue's cookie needs to be reset to zero.
> <snip>
> > > >
> > > > Please add some good amount of reserved bits and have API to init this
> > > > structure for future ABI stability, say rte_dmadev_queue_config_init()
> > > > or so.
> > > >
> > >
> > > I don't think that is necessary. Since the config struct is used only as
> > > parameter to the config function, any changes to it can be managed by
> > > versioning that single function. Padding would only be necessary if we had
> > > an array of these config structs somewhere.
> >
> > OK.
> >
> > For some reason, the versioning API looks ugly to me in code instead of keeping
> > some rsvd fields look cool to me with init function.
> >
> > But I agree. function versioning works in this case. No need to find other API
> > if tt is not general DPDK API practice.
> >
>
> The one thing I would suggest instead of the padding is for the internal
> APIS, to pass the struct size through, since we can't version those - and
> for padding we can't know whether any replaced padding should be used or
> not. Specifically:
>
>         typedef int (*rte_dmadev_configure_t)(struct rte_dmadev *dev, struct
>                         rte_dmadev_conf *cfg, size_t cfg_size);
>
> but for the public function:
>
>         int
>         rte_dmadev_configure(struct rte_dmadev *dev, struct
>                         rte_dmadev_conf *cfg)
>         {
>                 ...
>                 ret = dev->ops.configure(dev, cfg, sizeof(*cfg));
>                 ...
>         }

Makes sense.

>
> Then if we change the structure and version the config API, the driver can
> tell from the size what struct version it is and act accordingly. Without
> that, each time the struct changed, we'd have to add a new function pointer
> to the device ops.
>
> > In other libraries, I have seen such _init or function that can use
> > for this as well as filling default value
> > in some cases implementation values is not zero).
> > So that application can avoid memset for param structure.
> > Added rte_event_queue_default_conf_get() in eventdev spec for this.
> >
>
> I think that would largely have the same issues, unless it returned a
> pointer to data inside the driver - and which therefore could not be
> modified. Alternatively it would mean that the memory would have been
> allocated in the driver and we would need to ensure proper cleanup
> functions were called to free memory afterwards. Supporting having the
> config parameter as a local variable I think makes things a lot easier.
>
> > No strong opinion on this.
> >
> >
> >
> > >
> > > >
> > > > > +
> > > > > +/**
> > > > > + * A structure used to retrieve information of a DMA virt queue.
> > > > > + */
> > > > > +struct rte_dmadev_queue_info {
> > > > > +       enum dma_transfer_direction direction;
> > > >
> > > > A queue may support all directions so I think it should be a bitfield.
> > > >
> > > > > +       /**< Associated transfer direction */
> > > > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > > > +       uint64_t dev_flags; /**< Device specific flags */
> > > > > +};
> > > > > +
> > > >
> > > > > +__rte_experimental
> > > > > +static inline dma_cookie_t
> > > > > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > > > > +                  const struct dma_scatterlist *sg,
> > > > > +                  uint32_t sg_len, uint64_t flags)
> > > >
> > > > I would like to change this as:
> > > > rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> > > > rte_dma_sg *src, uint32_t nb_src,
> > > > const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like

In the above syntax, @Chengchang Tang
rte_dma_sg needs to contain only ptr and size.

> > > > src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> > > >
>
> Out of interest, do you see much benefit (and in what way) from having the
> scatter-gather support? Unlike sending 5 buffers in one packet rather than
> 5 buffers in 5 packets to a NIC, copying an array of memory in one op vs
> multiple is functionally identical.

Knowing upfront, in one shot, that the transfer is expressed as such
segments allows better optimization
in drivers, e.g.:
1) In one DMA job request, HW can fill multiple segments, vs. multiple
DMA job requests with one segment each.
2) Single completion, i.e. less overhead on the system.
3) Less latency for the job requests.


>
> > > >
> > > >
> <snip>
> > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > for the pointer) and to avoid
> > function overhead, Can we use one bit of flags of op function to
> > enable the fence?
> >
>
> The original ioat implementation did exactly that. However, I then
> discovered that because a fence logically belongs between two operations,
> does the fence flag on an operation mean "don't do any jobs after this
> until this job has completed" or does it mean "don't start this job until
> all previous jobs have completed". [Or theoretically does it mean both :-)]
> Naturally, some hardware does it the former way (i.e. fence flag goes on
> last op before fence), while other hardware the latter way (i.e. fence flag
> goes on first op after the fence). Therefore, since fencing is about
> ordering *between* two (sets of) jobs, I decided that it should do exactly
> that and go between two jobs, so there is no ambiguity!
>
> However, I'm happy enough to switch to having a fence flag, but I think if
> we do that, it should be put in the "first job after fence" case, because
> it is always easier to modify a previously written job if we need to, than
> to save the flag for a future one.
>
> Alternatively, if we keep the fence as a separate function, I'm happy
> enough for it not to be on the same cacheline as the "hot" operations,
> since fencing will always introduce a small penalty anyway.

Ack.
You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE (if
there is any use case for this or it makes sense for your HW).


For us, fence is a NOP, as we have an implicit fence between each
HW job descriptor.


>
> > >
> > > >
> <snip>
> > > > Since we have additional function call overhead in all the
> > > > applications for this scheme, I would like to understand
> > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > driver/application PoV?
> > > >
> > >
> > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > function calls was noticably less than the cost of building up the
> > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > fully populated before the TX call anyway.]
> >
> > OK. I agree with stack population.
> >
> > My question was more on doing implicit doorbell update enq. Is doorbell write
> > costly in other HW compare to a function call? In our HW, it is just write of
> > the number of instructions written in a register.
> >
> > Also, we need to again access the internal PMD memory structure to find
> > where to write etc if it is a separate function.
> >
>
> The cost varies depending on a number of factors - even writing to a single
> HW register can be very slow if that register is mapped as device
> (uncacheable) memory, since (AFAIK) it will act as a full fence and wait

I don't know. At least in our case, writes are write-back, so the core does not need
to wait (if there is no read operation).

> for the write to go all the way to hardware. For more modern HW, the cost
> can be lighter. However, any cost of HW writes is going to be the same
> whether its a separate function call or not.
>
> However, the main thing about the doorbell update is that it's a
> once-per-burst thing, rather than a once-per-job. Therefore, even if you
> have to re-read the struct memory (which is likely still somewhere in your
> cores' cache), any extra small cost of doing so is to be amortized over the
> cost of a whole burst of copies.

The Linux kernel has the xmit_more flag in skb to address a similar thing,
i.e. the enq job flag can have one more bit field to say whether to ring the doorbell or not,
rather than having yet another function overhead. IMO, it is the best of both worlds.


>
> >
> > >
> > > >
> <snip>
> > > > > +
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > > + *
> > > > > + * Returns the number of operations that failed to complete.
> > > > > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > > > > + *
> > > > > + * @param dev_id
> > > > > + *   The identifier of the device.
> > > > > + * @param vq_id
> > > > > + *   The identifier of virt queue.
> > > > > > + * @param nb_status
> > > > > + *   Indicates the size  of status array.
> > > > > + * @param[out] status
> > > > > + *   The error code of operations that failed to complete.
> > > > > + * @param[out] cookie
> > > > > + *   The last failed completed operation's cookie.
> > > > > + *
> > > > > + * @return
> > > > > + *   The number of operations that failed to complete.
> > > > > + *
> > > > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > > > + *       corresponding device supports the operation.
> > > > > + */
> > > > > +__rte_experimental
> > > > > +static inline uint16_t
> > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > > > > +                          const uint16_t nb_status, uint32_t *status,
> > > > > +                          dma_cookie_t *cookie)
> > > >
> > > > IMO, it is better to move cookie/ring_idx at 3.
> > > > Why it would return any array of errors? since it called after
> > > > rte_dmadev_completed() has
> > > > has_error. Is it better to change
> > > >
> > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> > > > *cookie,  uint32_t *status)
> > > >
> > > > I also think, we may need to set status as bitmask and enumerate all
> > > > the combination of error codes
> > > > of all the driver and return string from driver existing rte_flow_error
> > > >
> > > > See
> > > > struct rte_flow_error {
> > > >         enum rte_flow_error_type type; /**< Cause field and error types. */
> > > >         const void *cause; /**< Object responsible for the error. */
> > > >         const char *message; /**< Human-readable error message. */
> > > > };
> > > >
> > >
> > > I think we need a multi-return value API here, as we may add operations in
> > > future which have non-error status values to return. The obvious case is
> > > DMA engines which support "compare" operations. In that case a successful
> > > compare (as in there were no DMA or HW errors) can return "equal" or
> > > "not-equal" as statuses. For general "copy" operations, the faster
> > > completion op can be used to just return successful values (and only call
> > > this status version on error), while apps using those compare ops or a
> > > mixture of copy and compare ops, would always use the slower one that
> > > returns status values for each and every op..
> > >
> > > The ioat APIs used 32-bit integer values for this status array so as to
> > > allow e.g. 16-bits for error code and 16-bits for future status values. For
> > > most operations there should be a fairly small set of things that can go
> > > wrong, i.e. bad source address, bad destination address or invalid length.
> > > Within that we may have a couple of specifics for why an address is bad,
> > > but even so I don't think we need to start having multiple bit
> > > combinations.
> >
> > OK. What is the purpose of errors status? Is it for application printing it or
> > Does the application need to take any action based on specific error requests?
>
> It's largely for information purposes, but in the case of SVA/SVM errors
> could occur due to the memory not being pinned, i.e. a page fault, in some
> cases. If that happens, then it's up the app to either touch the memory and
> retry the copy, or to do a SW memcpy as a fallback.
>
> In other error cases, I think it's good to tell the application if it's
> passing around bad data, or data that is beyond the scope of hardware, e.g.
> a copy that is beyond what can be done in a single transaction for a HW
> instance. Given that there are always things that can go wrong, I think we
> need some error reporting mechanism.
>
> > If the former is scope, then we need to define the standard enum value
> > for the error right?
> > ie. uint32_t *status needs to change to enum rte_dma_error or so.
> >
> Sure. Perhaps an error/status structure either is an option, where we
> explicitly call out error info from status info.

Agree. Better to have a structure with fields like:

1)  enum rte_dma_error_type
2)  memory to store an informative message on the finer aspects of the error,
like the address that caused the issue, etc. (which will be driver-specific
information).


>
> >
> >
> <snip to end>
>
> /Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates
  2021-07-07  3:16   ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates fengchengwen
@ 2021-07-07  8:11     ` Bruce Richardson
  2021-07-07  8:14     ` Bruce Richardson
  2021-07-07 10:42     ` Jerin Jacob
  2 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-07  8:11 UTC (permalink / raw)
  To: fengchengwen; +Cc: dev, Jerin Jacob, Jerin Jacob, Morten Brørup

On Wed, Jul 07, 2021 at 11:16:44AM +0800, fengchengwen wrote:
> LGTM, thanks
> 
> And I am preparing dmadev V2, which includes:
> a) Fix code review comments (e.g. multiple-process support, doxygen, comments, typo)
> b) Flatten device abstraction to two layer: dmadev <> vchan
> c) Public API use dev_id and vchan_id to locate one vchan
> d) Using the flags parameter instead of the fence API

Bit uncertain about this one still

> e) Rename rte_dmadev_perform to rte_dmadev_submit so it corresponds to the stats variable.
> 
> PS: Some code (lib/dmadev) will rebase this patchset
> 
This was not meant to be a separate patchset, but rather to try and keep us
all in sync on what was being looked at and tested. Please just pull in the
changes from this set (as many as you are happy with) into your V2 RFC to
simplify things. It's better to just have the one master RFC into which
changes are pulled.

Thanks,
/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates
  2021-07-07  3:16   ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates fengchengwen
  2021-07-07  8:11     ` Bruce Richardson
@ 2021-07-07  8:14     ` Bruce Richardson
  2021-07-07 10:42     ` Jerin Jacob
  2 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-07  8:14 UTC (permalink / raw)
  To: fengchengwen; +Cc: dev, Jerin Jacob, Jerin Jacob, Morten Brørup

On Wed, Jul 07, 2021 at 11:16:44AM +0800, fengchengwen wrote:
> LGTM, thanks
> 
> And I am preparing dmadev V2, which includes: a) Fix code review comments (e.g.
> multiple-process support, doxygen, comments, typo) b) Flatten device
> abstraction to two layer: dmadev <> vchan c) Public API use dev_id and
> vchan_id to locate one vchan d) Using the flags parameter instead of the
> fence API e) Rename rte_dmadev_perform to rte_dmadev_submit so it
> corresponds to the stats variable.
> 
> PS: Some code (lib/dmadev) will rebase this patchset
> 
> 
As well as posting the RFC v2 here, could you also perhaps post it to a
github repo, so that Jerin, myself and others can send pull-requests with
suggested changes?  For key areas of discussion, working through github
with patches sent via pull request you can take in directly will probably
be faster for getting us to a v3. The mail threads on list are already
getting very long and hard to follow.

Regards,
/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports
  2021-07-06 20:28   ` [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports Bruce Richardson
@ 2021-07-07  8:26     ` David Marchand
  2021-07-07  8:36       ` Bruce Richardson
  0 siblings, 1 reply; 339+ messages in thread
From: David Marchand @ 2021-07-07  8:26 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: dev, Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup

On Tue, Jul 6, 2021 at 10:29 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> Export the rte_dmadevices array and the allocate and release functions
> which are needed by PMDs.

rte_dmadevices[] might be an issue for inline accesses, but pmd
allocate/release should be internal (driver only).


-- 
David Marchand


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-07  8:08         ` Jerin Jacob
@ 2021-07-07  8:35           ` Bruce Richardson
  2021-07-07 10:34             ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-07  8:35 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
> On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> > >
> > > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > > <bruce.richardson@intel.com> wrote:
> > > >
> > > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > > >
> > > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > > device.
> > <snip>
> > > >
> > > > +1 and the terminology with regards to queues and channels. With our ioat
> > > > hardware, each HW queue was called a channel for instance.
> > >
> > > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > > HW has more than
> > > 1 queues it can be exposed as separate dmadev dev.
> > >
> >
> > Fine for me.
> >
> > However, just to confirm that Morten's suggestion of using a
> > (device-specific void *) channel pointer rather than dev_id + channel_id
> > pair of parameters won't work for you? You can't store a pointer or dev
> > index in the channel struct in the driver?
> 
> Yes. That will work. To confirm, the suggestion is to use, void *
> object instead of channel_id,
> That will avoid one more indirection.(index -> pointer)
> 

The proposal was to use it in place of "dev_id + channel_id", i.e.

copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)

Where the channel pointer implicitly indicates the device too. However, I
realise now that this would be something completely transparent to the
driver as it would all have to be implemented in the dmadev level, and
lead to lots of duplication of function pointers, etc. Therefore, let's
just go with original scheme. :-(

> 
> >
> > >

<snip>

> > > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > > for the pointer) and to avoid
> > > function overhead, Can we use one bit of flags of op function to
> > > enable the fence?
> > >
> >
> > The original ioat implementation did exactly that. However, I then
> > discovered that because a fence logically belongs between two operations,
> > does the fence flag on an operation mean "don't do any jobs after this
> > until this job has completed" or does it mean "don't start this job until
> > all previous jobs have completed". [Or theoretically does it mean both :-)]
> > Naturally, some hardware does it the former way (i.e. fence flag goes on
> > last op before fence), while other hardware the latter way (i.e. fence flag
> > goes on first op after the fence). Therefore, since fencing is about
> > ordering *between* two (sets of) jobs, I decided that it should do exactly
> > that and go between two jobs, so there is no ambiguity!
> >
> > However, I'm happy enough to switch to having a fence flag, but I think if
> > we do that, it should be put in the "first job after fence" case, because
> > it is always easier to modify a previously written job if we need to, than
> > to save the flag for a future one.
> >
> > Alternatively, if we keep the fence as a separate function, I'm happy
> > enough for it not to be on the same cacheline as the "hot" operations,
> > since fencing will always introduce a small penalty anyway.
> 
> Ack.
> You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
> there any use case for this or it makes sense for your HW)
> 
> 
> For us, Fence is NOP for us as we have an implicit fence between each
> HW job descriptor.
> 

I actually still think that having a separate fence function in the "ops"
section is the best approach here. It's unambiguous as to whether it's
fence-before or fence-after, and if we have it in the ops, it doesn't use a
"fast-path" slot.

However, if we *really* want to use a flag instead, I don't see the value
in having two flags, it will be really confusing.  Instead, if we do go
with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
that the fence occurs before the job in question.

> 
> >
> > > >
> > > > >
> > <snip>
> > > > > Since we have additional function call overhead in all the
> > > > > applications for this scheme, I would like to understand
> > > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > > driver/application PoV?
> > > > >
> > > >
> > > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > > function calls was noticably less than the cost of building up the
> > > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > > fully populated before the TX call anyway.]
> > >
> > > OK. I agree with stack population.
> > >
> > > My question was more on doing implicit doorbell update enq. Is doorbell write
> > > costly in other HW compare to a function call? In our HW, it is just write of
> > > the number of instructions written in a register.
> > >
> > > Also, we need to again access the internal PMD memory structure to find
> > > where to write etc if it is a separate function.
> > >
> >
> > The cost varies depending on a number of factors - even writing to a single
> > HW register can be very slow if that register is mapped as device
> > (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
> 
> I don't know, At least in our case, writes are write-back. so core does not need
> to wait.(If there is no read operation).
> 
> > for the write to go all the way to hardware. For more modern HW, the cost
> > can be lighter. However, any cost of HW writes is going to be the same
> > whether its a separate function call or not.
> >
> > However, the main thing about the doorbell update is that it's a
> > once-per-burst thing, rather than a once-per-job. Therefore, even if you
> > have to re-read the struct memory (which is likely still somewhere in your
> > cores' cache), any extra small cost of doing so is to be amortized over the
> > cost of a whole burst of copies.
> 
> Linux kernel has xmit_more flag in skb to address similar thing.
> i.e enq job flag can have one more bit field to say update ring bell or not?
> Rather having yet another function overhead.IMO, it is the best of both worlds.
> 

It's just more conditionals and branches all through the code. Inside the
user application, the user has to check whether to set the flag or not (or
special-case the last transaction outside the loop), and within the driver,
there has to be a branch whether or not to call the doorbell function. The
code on both sides is far simpler and more readable if the doorbell
function is exactly that - a separate function.

> 
> >
> > >
> > > >
> > > > >
> > <snip>
> > > > > > + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> > > > > > without prior notice.  + * + * Returns the number of operations
> > > > > > that failed to complete.  + * NOTE: This API was used when
> > > > > > rte_dmadev_completed has_error was set.  + * + * @param dev_id
> > > > > > + *   The identifier of the device.  + * @param vq_id + *   The
> > > > > > identifier of virt queue.
> > > > > > > + * @param nb_status
> > > > > > + *   Indicates the size  of status array.  + * @param[out]
> > > > > > status + *   The error code of operations that failed to
> > > > > > complete.  + * @param[out] cookie + *   The last failed
> > > > > > completed operation's cookie.  + * + * @return + *   The number
> > > > > > of operations that failed to complete.  + * + * NOTE: The
> > > > > > caller must ensure that the input parameter is valid and the +
> > > > > > *       corresponding device supports the operation.  + */
> > > > > > +__rte_experimental +static inline uint16_t
> > > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> > > > > > const uint16_t nb_status, uint32_t *status, +
> > > > > > dma_cookie_t *cookie)
> > > > >
> > > > > IMO, it is better to move cookie/rind_idx at 3.  Why it would
> > > > > return any array of errors? since it called after
> > > > > rte_dmadev_completed() has has_error. Is it better to change
> > > > >
> > > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> > > > > dma_cookie_t *cookie,  uint32_t *status)
> > > > >
> > > > > I also think, we may need to set status as bitmask and enumerate
> > > > > all the combination of error codes of all the driver and return
> > > > > string from driver existing rte_flow_error
> > > > >
> > > > > See struct rte_flow_error { enum rte_flow_error_type type; /**<
> > > > > Cause field and error types. */ const void *cause; /**< Object
> > > > > responsible for the error. */ const char *message; /**<
> > > > > Human-readable error message. */ };
> > > > >
> > > >
> > > > I think we need a multi-return value API here, as we may add
> > > > operations in future which have non-error status values to return.
> > > > The obvious case is DMA engines which support "compare" operations.
> > > > In that case a successful compare (as in there were no DMA or HW
> > > > errors) can return "equal" or "not-equal" as statuses. For general
> > > > "copy" operations, the faster completion op can be used to just
> > > > return successful values (and only call this status version on
> > > > error), while apps using those compare ops or a mixture of copy and
> > > > compare ops, would always use the slower one that returns status
> > > > values for each and every op..
> > > >
> > > > The ioat APIs used 32-bit integer values for this status array so
> > > > as to allow e.g. 16-bits for error code and 16-bits for future
> > > > status values. For most operations there should be a fairly small
> > > > set of things that can go wrong, i.e. bad source address, bad
> > > > destination address or invalid length.  Within that we may have a
> > > > couple of specifics for why an address is bad, but even so I don't
> > > > think we need to start having multiple bit combinations.
> > >
> > > OK. What is the purpose of errors status? Is it for application
> > > printing it or Does the application need to take any action based on
> > > specific error requests?
> >
> > It's largely for information purposes, but in the case of SVA/SVM
> > errors could occur due to the memory not being pinned, i.e. a page
> > fault, in some cases. If that happens, then it's up the app to either
> > touch the memory and retry the copy, or to do a SW memcpy as a
> > fallback.
> >
> > In other error cases, I think it's good to tell the application if it's
> > passing around bad data, or data that is beyond the scope of hardware,
> > e.g.  a copy that is beyond what can be done in a single transaction
> > for a HW instance. Given that there are always things that can go
> > wrong, I think we need some error reporting mechanism.
> >
> > > If the former is scope, then we need to define the standard enum
> > > value for the error right?  ie. uint32_t *status needs to change to
> > > enum rte_dma_error or so.
> > >
> > Sure. Perhaps an error/status structure either is an option, where we
> > explicitly call out error info from status info.
> 
> Agree. Better to have a structure with filed like,
> 
> 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> fine aspects of error.  LIke address caused issue etc.(Which will be
> driver-specific information).
> 
The only issue I have with that is that once we have driver specific
information it is of little use to the application, since it can't know
anything about it except maybe log it.  I'd much rather have a set of error
codes telling user that "source address is wrong", "dest address is wrong",
and a generic "an address is wrong" in case driver/HW cannot distinguish
source of error. Can we see how far we get with just error codes before we
start into passing string messages around and all the memory management
headaches that implies?

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports
  2021-07-07  8:26     ` David Marchand
@ 2021-07-07  8:36       ` Bruce Richardson
  2021-07-07  8:57         ` David Marchand
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-07  8:36 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup

On Wed, Jul 07, 2021 at 10:26:36AM +0200, David Marchand wrote:
> On Tue, Jul 6, 2021 at 10:29 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > Export the rte_dmadevices array and the allocate and release functions
> > which are needed by PMDs.
> 
> rte_dmadevices[] might be an issue for inline accesses, but pmd
> allocate/release should be internal (driver only).
> 
So if I understand correctly, they still need to be in the version.map
file, but with "internal" versioning rather than "experimental", right?

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 1/9] dmadev: add missing exports
  2021-07-07  8:36       ` Bruce Richardson
@ 2021-07-07  8:57         ` David Marchand
  0 siblings, 0 replies; 339+ messages in thread
From: David Marchand @ 2021-07-07  8:57 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: dev, Chengwen Feng, Jerin Jacob, Jerin Jacob, Morten Brørup

On Wed, Jul 7, 2021 at 10:37 AM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Wed, Jul 07, 2021 at 10:26:36AM +0200, David Marchand wrote:
> > On Tue, Jul 6, 2021 at 10:29 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > Export the rte_dmadevices array and the allocate and release functions
> > > which are needed by PMDs.
> >
> > rte_dmadevices[] might be an issue for inline accesses, but pmd
> > allocate/release should be internal (driver only).
> >
> So if I understand correctly, they still need to be in the version.map

drivers still need them exported, so yes they must be in version.map
with INTERNAL version.

> file, but with "interal" versioning rather than "experimental", right?

But I would also move them into a separate header for drivers, like
ethdev_driver.h.
And the __rte_internal tag will be needed at their declarations.

https://git.dpdk.org/dpdk/tree/lib/ethdev/ethdev_driver.h#n1005
https://git.dpdk.org/dpdk/tree/lib/ethdev/version.map#n257


-- 
David Marchand


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-07  8:35           ` Bruce Richardson
@ 2021-07-07 10:34             ` Jerin Jacob
  2021-07-07 11:01               ` Bruce Richardson
  0 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-07 10:34 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Wed, Jul 7, 2021 at 2:05 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
> > On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> > > >
> > > > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > > > <bruce.richardson@intel.com> wrote:
> > > > >
> > > > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > > > >
> > > > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > > > device.
> > > <snip>
> > > > >
> > > > > +1 and the terminology with regards to queues and channels. With our ioat
> > > > > hardware, each HW queue was called a channel for instance.
> > > >
> > > > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > > > HW has more than
> > > > 1 queues it can be exposed as separate dmadev dev.
> > > >
> > >
> > > Fine for me.
> > >
> > > However, just to confirm that Morten's suggestion of using a
> > > (device-specific void *) channel pointer rather than dev_id + channel_id
> > > pair of parameters won't work for you? You can't store a pointer or dev
> > > index in the channel struct in the driver?
> >
> > Yes. That will work. To confirm, the suggestion is to use, void *
> > object instead of channel_id,
> > That will avoid one more indirection.(index -> pointer)
> >
>
> The proposal was to use it in place of "dev_id + channel_id", i.e.
>
> copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)
>
> Where the channel pointer implicitly indicates the device too. However, I
> realise now that this would be something completely transparent to the
> driver as it would all have to be implemented in the dmadev level, and
> lead to lots of duplication of function pointers, etc. Therefore, let's
> just go with original scheme. :-(

Yes. Just go with the original scheme.

>
> >
> > >
> > > >
>
> <snip>
>
> > > > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > > > for the pointer) and to avoid
> > > > function overhead, Can we use one bit of flags of op function to
> > > > enable the fence?
> > > >
> > >
> > > The original ioat implementation did exactly that. However, I then
> > > discovered that because a fence logically belongs between two operations,
> > > does the fence flag on an operation mean "don't do any jobs after this
> > > until this job has completed" or does it mean "don't start this job until
> > > all previous jobs have completed". [Or theoretically does it mean both :-)]
> > > Naturally, some hardware does it the former way (i.e. fence flag goes on
> > > last op before fence), while other hardware the latter way (i.e. fence flag
> > > goes on first op after the fence). Therefore, since fencing is about
> > > ordering *between* two (sets of) jobs, I decided that it should do exactly
> > > that and go between two jobs, so there is no ambiguity!
> > >
> > > However, I'm happy enough to switch to having a fence flag, but I think if
> > > we do that, it should be put in the "first job after fence" case, because
> > > it is always easier to modify a previously written job if we need to, than
> > > to save the flag for a future one.
> > >
> > > Alternatively, if we keep the fence as a separate function, I'm happy
> > > enough for it not to be on the same cacheline as the "hot" operations,
> > > since fencing will always introduce a small penalty anyway.
> >
> > Ack.
> > You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
> > there any use case for this or it makes sense for your HW)
> >
> >
> > For us, Fence is NOP for us as we have an implicit fence between each
> > HW job descriptor.
> >
>
> I actually still think that having a separate fence function in the "ops"
> section is the best approach here. It's unabiguous as to whether it's
> fence-before or fence-after, and if we have it in the ops, it doesn't use a
> "fast-path" slot.
>
> However, if we *really* want to use a flag instead, I don't see the value
> in having two flags, it will be really confusing.  Instead, if we do go
> with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
> that the fence occurs before the job in question.

IMO, we need to use flags, and the name can be RTE_DMA_PRE_FENCE. This avoids
the overhead of a separate function in driver implementations where the fence
request can be a NOP, and it saves first cache line occupancy.

>
> >
> > >
> > > > >
> > > > > >
> > > <snip>
> > > > > > Since we have additional function call overhead in all the
> > > > > > applications for this scheme, I would like to understand
> > > > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > > > driver/application PoV?
> > > > > >
> > > > >
> > > > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > > > function calls was noticably less than the cost of building up the
> > > > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > > > fully populated before the TX call anyway.]
> > > >
> > > > OK. I agree with stack population.
> > > >
> > > > My question was more on doing implicit doorbell update enq. Is doorbell write
> > > > costly in other HW compare to a function call? In our HW, it is just write of
> > > > the number of instructions written in a register.
> > > >
> > > > Also, we need to again access the internal PMD memory structure to find
> > > > where to write etc if it is a separate function.
> > > >
> > >
> > > The cost varies depending on a number of factors - even writing to a single
> > > HW register can be very slow if that register is mapped as device
> > > (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
> >
> > I don't know, At least in our case, writes are write-back. so core does not need
> > to wait.(If there is no read operation).
> >
> > > for the write to go all the way to hardware. For more modern HW, the cost
> > > can be lighter. However, any cost of HW writes is going to be the same
> > > whether its a separate function call or not.
> > >
> > > However, the main thing about the doorbell update is that it's a
> > > once-per-burst thing, rather than a once-per-job. Therefore, even if you
> > > have to re-read the struct memory (which is likely still somewhere in your
> > > cores' cache), any extra small cost of doing so is to be amortized over the
> > > cost of a whole burst of copies.
> >
> > Linux kernel has xmit_more flag in skb to address similar thing.
> > i.e enq job flag can have one more bit field to say update ring bell or not?
> > Rather having yet another function overhead.IMO, it is the best of both worlds.
> >
>
> It's just more conditionals and branches all through the code. Inside the
> user application, the user has to check whether to set the flag or not (or
> special-case the last transaction outside the loop), and within the driver,
> there has to be a branch whether or not to call the doorbell function. The
> code on both sides is far simpler and more readable if the doorbell
> function is exactly that - a separate function.

I disagree. The reason is:

We will have two classes of applications

a) do a dma copy request as and when it has data (I think this is the
prime use case). For those,
I think it is considerable overhead to have two function invocations
per transfer, i.e.
rte_dma_copy() and rte_dma_perform().

b) do a dma copy when the data has reached a logical state, like copying
IP frames from Ethernet packets or so.
In that case, the application will have LOGIC to detect when to
perform it, so at the end of
that, the rte_dma_copy() flag can be updated to fire the doorbell.

IMO, we are comparing a branch (the flag is already in a register) vs
a set of instructions for
1) function pointer overhead
2) the need to use the channel context again in another function.

IMO, a single branch is the most optimal from a performance PoV.


>
> >
> > >
> > > >
> > > > >
> > > > > >
> > > <snip>
> > > > > > > + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> > > > > > > without prior notice.  + * + * Returns the number of operations
> > > > > > > that failed to complete.  + * NOTE: This API was used when
> > > > > > > rte_dmadev_completed has_error was set.  + * + * @param dev_id
> > > > > > > + *   The identifier of the device.  + * @param vq_id + *   The
> > > > > > > identifier of virt queue.
> > > > > > (> + * @param nb_status
> > > > > > > + *   Indicates the size  of status array.  + * @param[out]
> > > > > > > status + *   The error code of operations that failed to
> > > > > > > complete.  + * @param[out] cookie + *   The last failed
> > > > > > > completed operation's cookie.  + * + * @return + *   The number
> > > > > > > of operations that failed to complete.  + * + * NOTE: The
> > > > > > > caller must ensure that the input parameter is valid and the +
> > > > > > > *       corresponding device supports the operation.  + */
> > > > > > > +__rte_experimental +static inline uint16_t
> > > > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> > > > > > > const uint16_t nb_status, uint32_t *status, +
> > > > > > > dma_cookie_t *cookie)
> > > > > >
> > > > > > IMO, it is better to move cookie/rind_idx at 3.  Why it would
> > > > > > return any array of errors? since it called after
> > > > > > rte_dmadev_completed() has has_error. Is it better to change
> > > > > >
> > > > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> > > > > > dma_cookie_t *cookie,  uint32_t *status)
> > > > > >
> > > > > > I also think, we may need to set status as bitmask and enumerate
> > > > > > all the combination of error codes of all the driver and return
> > > > > > string from driver existing rte_flow_error
> > > > > >
> > > > > > See struct rte_flow_error { enum rte_flow_error_type type; /**<
> > > > > > Cause field and error types. */ const void *cause; /**< Object
> > > > > > responsible for the error. */ const char *message; /**<
> > > > > > Human-readable error message. */ };
> > > > > >
> > > > >
> > > > > I think we need a multi-return value API here, as we may add
> > > > > operations in future which have non-error status values to return.
> > > > > The obvious case is DMA engines which support "compare" operations.
> > > > > In that case a successful compare (as in there were no DMA or HW
> > > > > errors) can return "equal" or "not-equal" as statuses. For general
> > > > > "copy" operations, the faster completion op can be used to just
> > > > > return successful values (and only call this status version on
> > > > > error), while apps using those compare ops or a mixture of copy and
> > > > > compare ops, would always use the slower one that returns status
> > > > > values for each and every op..
> > > > >
> > > > > The ioat APIs used 32-bit integer values for this status array so
> > > > > as to allow e.g. 16-bits for error code and 16-bits for future
> > > > > status values. For most operations there should be a fairly small
> > > > > set of things that can go wrong, i.e. bad source address, bad
> > > > > destination address or invalid length.  Within that we may have a
> > > > > couple of specifics for why an address is bad, but even so I don't
> > > > > think we need to start having multiple bit combinations.
> > > >
> > > > OK. What is the purpose of errors status? Is it for application
> > > > printing it or Does the application need to take any action based on
> > > > specific error requests?
> > >
> > > It's largely for information purposes, but in the case of SVA/SVM
> > > errors could occur due to the memory not being pinned, i.e. a page
> > > fault, in some cases. If that happens, then it's up the app to either
> > > touch the memory and retry the copy, or to do a SW memcpy as a
> > > fallback.
> > >
> > > In other error cases, I think it's good to tell the application if it's
> > > passing around bad data, or data that is beyond the scope of hardware,
> > > e.g.  a copy that is beyond what can be done in a single transaction
> > > for a HW instance. Given that there are always things that can go
> > > wrong, I think we need some error reporting mechanism.
> > >
> > > > If the former is scope, then we need to define the standard enum
> > > > value for the error right?  ie. uint32_t *status needs to change to
> > > > enum rte_dma_error or so.
> > > >
> > > Sure. Perhaps an error/status structure either is an option, where we
> > > explicitly call out error info from status info.
> >
> > Agree. Better to have a structure with filed like,
> >
> > 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> > fine aspects of error.  LIke address caused issue etc.(Which will be
> > driver-specific information).
> >
> The only issue I have with that is that once we have driver specific
> information it is of little use to the application, since it can't know
> anything about it excepy maybe log it.  I'd much rather have a set of error
> codes telling user that "source address is wrong", "dest address is wrong",
> and a generic "an address is wrong" in case driver/HW cannot distinguish
> source of error. Can we see how far we get with just error codes before we
> start into passing string messages around and all the memory management
> headaches that implies.

Works for me. It should be "enum rte_dma_error_type" then, with a set of standard
error types, which is currently missing from the spec.

>
> /Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates
  2021-07-07  3:16   ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates fengchengwen
  2021-07-07  8:11     ` Bruce Richardson
  2021-07-07  8:14     ` Bruce Richardson
@ 2021-07-07 10:42     ` Jerin Jacob
  2 siblings, 0 replies; 339+ messages in thread
From: Jerin Jacob @ 2021-07-07 10:42 UTC (permalink / raw)
  To: fengchengwen; +Cc: Bruce Richardson, dpdk-dev, Jerin Jacob, Morten Brørup

On Wed, Jul 7, 2021 at 8:46 AM fengchengwen <fengchengwen@huawei.com> wrote:
>
> LGTM, thanks
>
> And I'am prepare dmadev V2, include:
> a) Fix code review comments (e.g. multiple-process support, doxygen, comments, typo)
> b) Flatten device abstraction to two layer: dmadev <> vchan

I think we should not have "virtual" in the API specification. It can be
virtual or real depending on the actual HW/SW/driver implementation.
I think just "chan" is enough.

> c) Public API use dev_id and vchan_id to locate one vchan
> d) Using the flags parameter instead of the fence API
> e) Rename rte_dmadev_perform to rte_dmadev_submit so it corresponds to the stats variable.
>
> PS: Some code (lib/dmadev) will rebase this patchset
>
>
> On 2021/7/7 4:28, Bruce Richardson wrote:
> > This patchset contains a series of changes to dmadev based on work being done to
> > port over our drivers to test this new infrastructure. Some of these are bug
> > fixes to enable compilation e.g. missing exports or meson.build files, while
> > others are suggested changes to enhance the API. All these patches are to be
> > applied on top of [1] as they are mostly suggested changes to that RFC i.e.
> > patches to the patch!
> >
> > The final patch includes some basic sanity tests for copy operations that we
> > have ported over from the ioat self-tests to use the dmadev APIs. The basic
> > dataplane part of those tests is probably ok for now, but the initialization of
> > queues in that test code may need some enhancement. Feedback welcome.
> >
> > A tree with all these patches applied can be got at [2] if anyone wants to use
> > that as a basis for working on drivers, or for other discussion.
> >
> > [1] http://patches.dpdk.org/project/dpdk/patch/1625231891-2963-1-git-send-email-fengchengwen@huawei.com/
> > [2] https://github.com/bruce-richardson/dpdk/tree/dmadev-rfcs
> >
> > Bruce Richardson (9):
> >   dmadev: add missing exports
> >   dmadev: change virtual addresses to IOVA
> >   dmadev: add dump function
> >   dmadev: remove xstats functions
> >   dmadev: drop cookie typedef
> >   dmadev: allow NULL parameters to completed ops call
> >   dmadev: stats structure updates
> >   drivers: add dma driver category
> >   app/test: add basic dmadev unit test
> >
> >  app/test/meson.build         |   2 +
> >  app/test/test_dmadev.c       | 320 +++++++++++++++++++++++++++++++++++
> >  drivers/dma/meson.build      |  11 ++
> >  drivers/meson.build          |   1 +
> >  lib/dmadev/rte_dmadev.c      |  66 ++------
> >  lib/dmadev/rte_dmadev.h      | 204 +++++++---------------
> >  lib/dmadev/rte_dmadev_core.h |  16 +-
> >  lib/dmadev/rte_dmadev_pmd.h  |  24 +--
> >  lib/dmadev/version.map       |   7 +-
> >  9 files changed, 425 insertions(+), 226 deletions(-)
> >  create mode 100644 app/test/test_dmadev.c
> >  create mode 100644 drivers/dma/meson.build
> >
> > --
> > 2.30.2
> >
> >
> > .
> >
>

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-07 10:34             ` Jerin Jacob
@ 2021-07-07 11:01               ` Bruce Richardson
  2021-07-08  3:11                 ` fengchengwen
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-07 11:01 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Wed, Jul 07, 2021 at 04:04:16PM +0530, Jerin Jacob wrote:
> On Wed, Jul 7, 2021 at 2:05 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
> > > On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
> > > <bruce.richardson@intel.com> wrote:
> > > >
> > > > On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> > > > >
> > > > > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > > > > <bruce.richardson@intel.com> wrote:
> > > > > >
> > > > > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > > > > >
> > > > > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > > > > device.
> > > > <snip>
> > > > > >
> > > > > > +1 and the terminology with regards to queues and channels. With our ioat
> > > > > > hardware, each HW queue was called a channel for instance.
> > > > >
> > > > > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > > > > HW has more than
> > > > > 1 queues it can be exposed as separate dmadev dev.
> > > > >
> > > >
> > > > Fine for me.
> > > >
> > > > However, just to confirm that Morten's suggestion of using a
> > > > (device-specific void *) channel pointer rather than dev_id + channel_id
> > > > pair of parameters won't work for you? You can't store a pointer or dev
> > > > index in the channel struct in the driver?
> > >
> > > Yes. That will work. To confirm, the suggestion is to use, void *
> > > object instead of channel_id,
> > > That will avoid one more indirection.(index -> pointer)
> > >
> >
> > The proposal was to use it in place of "dev_id + channel_id", i.e.
> >
> > copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)
> >
> > Where the channel pointer implicitly indicates the device too. However, I
> > realise now that this would be something completely transparent to the
> > driver as it would all have to be implemented in the dmadev level, and
> > lead to lots of duplication of function pointers, etc. Therefore, let's
> > just go with original scheme. :-(
> 
> Yes. Just go with the original scheme.
>
+1

> >
> > >
> > > >
> > > > >
> >
> > <snip>
> >
> > > > > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > > > > for the pointer) and to avoid
> > > > > function overhead, Can we use one bit of flags of op function to
> > > > > enable the fence?
> > > > >
> > > >
> > > > The original ioat implementation did exactly that. However, I then
> > > > discovered that because a fence logically belongs between two operations,
> > > > does the fence flag on an operation mean "don't do any jobs after this
> > > > until this job has completed" or does it mean "don't start this job until
> > > > all previous jobs have completed". [Or theoretically does it mean both :-)]
> > > > Naturally, some hardware does it the former way (i.e. fence flag goes on
> > > > last op before fence), while other hardware the latter way (i.e. fence flag
> > > > goes on first op after the fence). Therefore, since fencing is about
> > > > ordering *between* two (sets of) jobs, I decided that it should do exactly
> > > > that and go between two jobs, so there is no ambiguity!
> > > >
> > > > However, I'm happy enough to switch to having a fence flag, but I think if
> > > > we do that, it should be put in the "first job after fence" case, because
> > > > it is always easier to modify a previously written job if we need to, than
> > > > to save the flag for a future one.
> > > >
> > > > Alternatively, if we keep the fence as a separate function, I'm happy
> > > > enough for it not to be on the same cacheline as the "hot" operations,
> > > > since fencing will always introduce a small penalty anyway.
> > >
> > > Ack.
> > > You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
> > > there any use case for this or it makes sense for your HW)
> > >
> > >
> > > For us, Fence is NOP for us as we have an implicit fence between each
> > > HW job descriptor.
> > >
> >
> > I actually still think that having a separate fence function in the "ops"
> > section is the best approach here. It's unabiguous as to whether it's
> > fence-before or fence-after, and if we have it in the ops, it doesn't use a
> > "fast-path" slot.
> >
> > However, if we *really* want to use a flag instead, I don't see the value
> > in having two flags, it will be really confusing.  Instead, if we do go
> > with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
> > that the fence occurs before the job in question.
> 
> IMO, We need to use flags and the name can be RTE_DMA_PRE_FENCE
> due to overhead of the driver implementation where the fence request
> can be NOP and
> to save the first cache line occupancy.
> 
> >
> > >
> > > >
> > > > > >
> > > > > > >
> > > > <snip>
> > > > > > > Since we have additional function call overhead in all the
> > > > > > > applications for this scheme, I would like to understand
> > > > > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > > > > driver/application PoV?
> > > > > > >
> > > > > >
> > > > > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > > > > function calls was noticably less than the cost of building up the
> > > > > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > > > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > > > > fully populated before the TX call anyway.]
> > > > >
> > > > > OK. I agree with stack population.
> > > > >
> > > > > My question was more on doing implicit doorbell update enq. Is doorbell write
> > > > > costly in other HW compare to a function call? In our HW, it is just write of
> > > > > the number of instructions written in a register.
> > > > >
> > > > > Also, we need to again access the internal PMD memory structure to find
> > > > > where to write etc if it is a separate function.
> > > > >
> > > >
> > > > The cost varies depending on a number of factors - even writing to a single
> > > > HW register can be very slow if that register is mapped as device
> > > > (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
> > >
> > > I don't know, At least in our case, writes are write-back. so core does not need
> > > to wait.(If there is no read operation).
> > >
> > > > for the write to go all the way to hardware. For more modern HW, the cost
> > > > can be lighter. However, any cost of HW writes is going to be the same
> > > > whether its a separate function call or not.
> > > >
> > > > However, the main thing about the doorbell update is that it's a
> > > > once-per-burst thing, rather than a once-per-job. Therefore, even if you
> > > > have to re-read the struct memory (which is likely still somewhere in your
> > > > cores' cache), any extra small cost of doing so is to be amortized over the
> > > > cost of a whole burst of copies.
> > >
> > > Linux kernel has xmit_more flag in skb to address similar thing.
> > > i.e enq job flag can have one more bit field to say update ring bell or not?
> > > Rather having yet another function overhead.IMO, it is the best of both worlds.
> > >
> >
> > It's just more conditionals and branches all through the code. Inside the
> > user application, the user has to check whether to set the flag or not (or
> > special-case the last transaction outside the loop), and within the driver,
> > there has to be a branch whether or not to call the doorbell function. The
> > code on both sides is far simpler and more readable if the doorbell
> > function is exactly that - a separate function.
> 
> I disagree. The reason is:
> 
> We will have two classes of applications
> 
> a) do dma copy request as and when it has data(I think, this is the
> prime use case), for those,
> I think, it is considerable overhead to have two function invocation
> per transfer i.e
> rte_dma_copy() and rte_dma_perform()
> 
> b) do dma copy when the data is reached to a logical state,  like copy
> IP frame from Ethernet packets or so,
> In that case, the application will have  a LOGIC to detect when to
> perform it so on the end of
> that rte_dma_copy() flag can be updated to fire the doorbell.
> 
> IMO, We are comparing against a branch(flag is already in register) vs
> a set of instructions for
> 1) function pointer overhead
> 2) Need to use the channel context again back in another function.
> 
> IMO, a single branch is most optimal from performance PoV.
> 
Ok, let's try it and see how it goes.

> 
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > <snip>
> > > > > > > > + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> > > > > > > > without prior notice.  + * + * Returns the number of operations
> > > > > > > > that failed to complete.  + * NOTE: This API was used when
> > > > > > > > rte_dmadev_completed has_error was set.  + * + * @param dev_id
> > > > > > > > + *   The identifier of the device.  + * @param vq_id + *   The
> > > > > > > > identifier of virt queue.
> > > > > > > (> + * @param nb_status
> > > > > > > > + *   Indicates the size  of status array.  + * @param[out]
> > > > > > > > status + *   The error code of operations that failed to
> > > > > > > > complete.  + * @param[out] cookie + *   The last failed
> > > > > > > > completed operation's cookie.  + * + * @return + *   The number
> > > > > > > > of operations that failed to complete.  + * + * NOTE: The
> > > > > > > > caller must ensure that the input parameter is valid and the +
> > > > > > > > *       corresponding device supports the operation.  + */
> > > > > > > > +__rte_experimental +static inline uint16_t
> > > > > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> > > > > > > > const uint16_t nb_status, uint32_t *status, +
> > > > > > > > dma_cookie_t *cookie)
> > > > > > >
> > > > > > > IMO, it is better to move cookie/rind_idx at 3.  Why it would
> > > > > > > return any array of errors? since it called after
> > > > > > > rte_dmadev_completed() has has_error. Is it better to change
> > > > > > >
> > > > > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> > > > > > > dma_cookie_t *cookie,  uint32_t *status)
> > > > > > >
> > > > > > > I also think, we may need to set status as bitmask and enumerate
> > > > > > > all the combination of error codes of all the driver and return
> > > > > > > string from driver existing rte_flow_error
> > > > > > >
> > > > > > > See struct rte_flow_error { enum rte_flow_error_type type; /**<
> > > > > > > Cause field and error types. */ const void *cause; /**< Object
> > > > > > > responsible for the error. */ const char *message; /**<
> > > > > > > Human-readable error message. */ };
> > > > > > >
> > > > > >
> > > > > > I think we need a multi-return value API here, as we may add
> > > > > > operations in future which have non-error status values to return.
> > > > > > The obvious case is DMA engines which support "compare" operations.
> > > > > > In that case a successful compare (as in there were no DMA or HW
> > > > > > errors) can return "equal" or "not-equal" as statuses. For general
> > > > > > "copy" operations, the faster completion op can be used to just
> > > > > > return successful values (and only call this status version on
> > > > > > error), while apps using those compare ops or a mixture of copy and
> > > > > > compare ops, would always use the slower one that returns status
> > > > > > values for each and every op..
> > > > > >
> > > > > > The ioat APIs used 32-bit integer values for this status array so
> > > > > > as to allow e.g. 16-bits for error code and 16-bits for future
> > > > > > status values. For most operations there should be a fairly small
> > > > > > set of things that can go wrong, i.e. bad source address, bad
> > > > > > destination address or invalid length.  Within that we may have a
> > > > > > couple of specifics for why an address is bad, but even so I don't
> > > > > > think we need to start having multiple bit combinations.
> > > > >
> > > > > OK. What is the purpose of errors status? Is it for application
> > > > > printing it or Does the application need to take any action based on
> > > > > specific error requests?
> > > >
> > > > It's largely for information purposes, but in the case of SVA/SVM
> > > > errors could occur due to the memory not being pinned, i.e. a page
> > > > fault, in some cases. If that happens, then it's up the app to either
> > > > touch the memory and retry the copy, or to do a SW memcpy as a
> > > > fallback.
> > > >
> > > > In other error cases, I think it's good to tell the application if it's
> > > > passing around bad data, or data that is beyond the scope of hardware,
> > > > e.g.  a copy that is beyond what can be done in a single transaction
> > > > for a HW instance. Given that there are always things that can go
> > > > wrong, I think we need some error reporting mechanism.
> > > >
> > > > > If the former is scope, then we need to define the standard enum
> > > > > value for the error right?  ie. uint32_t *status needs to change to
> > > > > enum rte_dma_error or so.
> > > > >
> > > > Sure. Perhaps an error/status structure either is an option, where we
> > > > explicitly call out error info from status info.
> > >
> > > Agree. Better to have a structure with filed like,
> > >
> > > 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> > > fine aspects of error.  LIke address caused issue etc.(Which will be
> > > driver-specific information).
> > >
> > The only issue I have with that is that once we have driver specific
> > information it is of little use to the application, since it can't know
> > anything about it excepy maybe log it.  I'd much rather have a set of error
> > codes telling user that "source address is wrong", "dest address is wrong",
> > and a generic "an address is wrong" in case driver/HW cannot distinguish
> > source of error. Can we see how far we get with just error codes before we
> > start into passing string messages around and all the memory management
> > headaches that implies.
> 
> Works for me. It should be "enum rte_dma_error_type" then, which has a standard
> error type. Which is missing in the spec now.
> 
+1

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-07 11:01               ` Bruce Richardson
@ 2021-07-08  3:11                 ` fengchengwen
  2021-07-08 18:35                   ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-08  3:11 UTC (permalink / raw)
  To: Bruce Richardson, Jerin Jacob
  Cc: Thomas Monjalon, Ferruh Yigit, Jerin Jacob, dpdk-dev,
	Morten Brørup, Nipun Gupta, Hemant Agrawal, Maxime Coquelin,
	Honnappa Nagarahalli, David Marchand, Satananda Burla,
	Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On 2021/7/7 19:01, Bruce Richardson wrote:
> On Wed, Jul 07, 2021 at 04:04:16PM +0530, Jerin Jacob wrote:
>> On Wed, Jul 7, 2021 at 2:05 PM Bruce Richardson
>> <bruce.richardson@intel.com> wrote:
>>>
>>> On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
>>>> On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
>>>> <bruce.richardson@intel.com> wrote:
>>>>>
>>>>> On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
>>>>>>
>>>>>> On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
>>>>>> <bruce.richardson@intel.com> wrote:
>>>>>>>
>>>>>>> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
>>>>>>>> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>>>>>>>>>
>>>>>>>>> This patch introduces 'dmadevice' which is a generic type of DMA
>>>>>>>>> device.
>>>>> <snip>
>>>>>>>
>>>>>>> +1 and the terminology with regards to queues and channels. With our ioat
>>>>>>> hardware, each HW queue was called a channel for instance.
>>>>>>
>>>>>> Looks like <dmadev> <> <channel> can cover all the use cases, if the
>>>>>> HW has more than
>>>>>> 1 queues it can be exposed as separate dmadev dev.
>>>>>>
>>>>>
>>>>> Fine for me.
>>>>>
>>>>> However, just to confirm that Morten's suggestion of using a
>>>>> (device-specific void *) channel pointer rather than dev_id + channel_id
>>>>> pair of parameters won't work for you? You can't store a pointer or dev
>>>>> index in the channel struct in the driver?
>>>>
>>>> Yes. That will work. To confirm, the suggestion is to use, void *
>>>> object instead of channel_id,
>>>> That will avoid one more indirection.(index -> pointer)
>>>>
>>>
>>> The proposal was to use it in place of "dev_id + channel_id", i.e.
>>>
>>> copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)
>>>
>>> Where the channel pointer implicitly indicates the device too. However, I
>>> realise now that this would be something completely transparent to the
>>> driver as it would all have to be implemented in the dmadev level, and
>>> lead to lots of duplication of function pointers, etc. Therefore, let's
>>> just go with original scheme. :-(
>>
>> Yes. Just go with the original scheme.
>>
> +1
> 
>>>
>>>>
>>>>>
>>>>>>
>>>
>>> <snip>
>>>
>>>>>> Got it. In order to save space if first CL size for fastpath(Saving 8B
>>>>>> for the pointer) and to avoid
>>>>>> function overhead, Can we use one bit of flags of op function to
>>>>>> enable the fence?
>>>>>>
>>>>>
>>>>> The original ioat implementation did exactly that. However, I then
>>>>> discovered that because a fence logically belongs between two operations,
>>>>> does the fence flag on an operation mean "don't do any jobs after this
>>>>> until this job has completed" or does it mean "don't start this job until
>>>>> all previous jobs have completed". [Or theoretically does it mean both :-)]
>>>>> Naturally, some hardware does it the former way (i.e. fence flag goes on
>>>>> last op before fence), while other hardware the latter way (i.e. fence flag
>>>>> goes on first op after the fence). Therefore, since fencing is about
>>>>> ordering *between* two (sets of) jobs, I decided that it should do exactly
>>>>> that and go between two jobs, so there is no ambiguity!
>>>>>
>>>>> However, I'm happy enough to switch to having a fence flag, but I think if
>>>>> we do that, it should be put in the "first job after fence" case, because
>>>>> it is always easier to modify a previously written job if we need to, than
>>>>> to save the flag for a future one.
>>>>>
>>>>> Alternatively, if we keep the fence as a separate function, I'm happy
>>>>> enough for it not to be on the same cacheline as the "hot" operations,
>>>>> since fencing will always introduce a small penalty anyway.
>>>>
>>>> Ack.
>>>> You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
>>>> there any use case for this or it makes sense for your HW)
>>>>
>>>>
>>>> For us, Fence is NOP for us as we have an implicit fence between each
>>>> HW job descriptor.
>>>>
>>>
>>> I actually still think that having a separate fence function in the "ops"
>>> section is the best approach here. It's unabiguous as to whether it's
>>> fence-before or fence-after, and if we have it in the ops, it doesn't use a
>>> "fast-path" slot.
>>>
>>> However, if we *really* want to use a flag instead, I don't see the value
>>> in having two flags, it will be really confusing.  Instead, if we do go
>>> with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
>>> that the fence occurs before the job in question.
>>
>> IMO, We need to use flags and the name can be RTE_DMA_PRE_FENCE
>> due to overhead of the driver implementation where the fence request
>> can be NOP and
>> to save the first cache line occupancy.
>>
>>>
>>>>
>>>>>
>>>>>>>
>>>>>>>>
>>>>> <snip>
>>>>>>>> Since we have additional function call overhead in all the
>>>>>>>> applications for this scheme, I would like to understand
>>>>>>>> the use of doing this way vs enq does the doorbell implicitly from
>>>>>>>> driver/application PoV?
>>>>>>>>
>>>>>>>
>>>>>>> In our benchmarks it's just faster. When we tested it, the overhead of the
>>>>>>> function calls was noticably less than the cost of building up the
>>>>>>> parameter array(s) for passing the jobs in as a burst. [We don't see this
>>>>>>> cost with things like NIC I/O since DPDK tends to already have the mbuf
>>>>>>> fully populated before the TX call anyway.]
>>>>>>
>>>>>> OK. I agree with stack population.
>>>>>>
>>>>>> My question was more on doing implicit doorbell update enq. Is doorbell write
>>>>>> costly in other HW compare to a function call? In our HW, it is just write of
>>>>>> the number of instructions written in a register.
>>>>>>
>>>>>> Also, we need to again access the internal PMD memory structure to find
>>>>>> where to write etc if it is a separate function.
>>>>>>
>>>>>
>>>>> The cost varies depending on a number of factors - even writing to a single
>>>>> HW register can be very slow if that register is mapped as device
>>>>> (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
>>>>
>>>> I don't know, At least in our case, writes are write-back. so core does not need
>>>> to wait.(If there is no read operation).
>>>>
>>>>> for the write to go all the way to hardware. For more modern HW, the cost
>>>>> can be lighter. However, any cost of HW writes is going to be the same
>>>>> whether its a separate function call or not.
>>>>>
>>>>> However, the main thing about the doorbell update is that it's a
>>>>> once-per-burst thing, rather than a once-per-job. Therefore, even if you
>>>>> have to re-read the struct memory (which is likely still somewhere in your
>>>>> cores' cache), any extra small cost of doing so is to be amortized over the
>>>>> cost of a whole burst of copies.
>>>>
>>>> Linux kernel has xmit_more flag in skb to address similar thing.
>>>> i.e enq job flag can have one more bit field to say update ring bell or not?
>>>> Rather having yet another function overhead.IMO, it is the best of both worlds.
>>>>
>>>
>>> It's just more conditionals and branches all through the code. Inside the
>>> user application, the user has to check whether to set the flag or not (or
>>> special-case the last transaction outside the loop), and within the driver,
>>> there has to be a branch whether or not to call the doorbell function. The
>>> code on both sides is far simpler and more readable if the doorbell
>>> function is exactly that - a separate function.
>>
>> I disagree. The reason is:
>>
>> We will have two classes of applications
>>
>> a) do dma copy request as and when it has data(I think, this is the
>> prime use case), for those,
>> I think, it is considerable overhead to have two function invocation
>> per transfer i.e
>> rte_dma_copy() and rte_dma_perform()
>>
>> b) do dma copy when the data is reached to a logical state,  like copy
>> IP frame from Ethernet packets or so,
>> In that case, the application will have  a LOGIC to detect when to
>> perform it so on the end of
>> that rte_dma_copy() flag can be updated to fire the doorbell.
>>
>> IMO, We are comparing against a branch(flag is already in register) vs
>> a set of instructions for
>> 1) function pointer overhead
>> 2) Need to use the channel context again back in another function.
>>
>> IMO, a single branch is most optimal from performance PoV.
>>
> Ok, let's try it and see how it goes.

Test results show:
1) The Kunpeng platform (ARMv8) benefits very little from the doorbell in flags
2) The Xeon E5-2690 v2 (X86) benefits from the separate function
3) Both platforms benefit from the doorbell in flags if burst < 5

The doorbell in flags only gives a performance gain for small bursts (<5). Given the
extensive use of bursts in DPDK applications, and that users are accustomed to the
concept, I do not recommend using the 'doorbell' in flags.
Also, users may be confused by the doorbell operations.

Kunpeng platform test result:
    [root@SZ tmp]# ./a1 1
    burst = 1
    perform_after_multiple_enqueue: burst:1 cost:0s.554422us
    doorbell_for_every_enqueue: burst:1 cost:0s.450927us
    last_enqueue_issue_doorbell: burst:1 cost:0s.450479us
    [root@SZ tmp]#
    [root@SZ tmp]# ./a1 2
    burst = 2
    perform_after_multiple_enqueue: burst:2 cost:0s.900884us
    doorbell_for_every_enqueue: burst:2 cost:0s.866732us
    last_enqueue_issue_doorbell: burst:2 cost:0s.732469us
    [root@SZ tmp]# ./a1 5
    burst = 5
    perform_after_multiple_enqueue: burst:5 cost:1s.732410us
    doorbell_for_every_enqueue: burst:5 cost:2s.115479us
    last_enqueue_issue_doorbell: burst:5 cost:1s.759349us
    [root@SZ tmp]# ./a1 10
    burst = 10
    perform_after_multiple_enqueue: burst:10 cost:3s.490716us
    doorbell_for_every_enqueue: burst:10 cost:4s.194691us
    last_enqueue_issue_doorbell: burst:10 cost:3s.331825us
    [root@SZ tmp]# ./a1 30
    burst = 30
    perform_after_multiple_enqueue: burst:30 cost:9s.61761us
    doorbell_for_every_enqueue: burst:30 cost:12s.517082us
    last_enqueue_issue_doorbell: burst:30 cost:9s.614802us

X86 platform test result:
    fengchengwen@SZ:~/tmp$ ./a1 1
    burst = 1
    perform_after_multiple_enqueue: burst:1 cost:0s.406331us
    doorbell_for_every_enqueue: burst:1 cost:0s.331109us
    last_enqueue_issue_doorbell: burst:1 cost:0s.381782us
    fengchengwen@SZ:~/tmp$ ./a1 2
    burst = 2
    perform_after_multiple_enqueue: burst:2 cost:0s.569024us
    doorbell_for_every_enqueue: burst:2 cost:0s.643449us
    last_enqueue_issue_doorbell: burst:2 cost:0s.486639us
    fengchengwen@SZ:~/tmp$ ./a1 5
    burst = 5
    perform_after_multiple_enqueue: burst:5 cost:1s.166384us
    doorbell_for_every_enqueue: burst:5 cost:1s.602369us
    last_enqueue_issue_doorbell: burst:5 cost:1s.209392us
    fengchengwen@SZ:~/tmp$ ./a1 10
    burst = 10
    perform_after_multiple_enqueue: burst:10 cost:2s.229901us
    doorbell_for_every_enqueue: burst:10 cost:3s.754802us
    last_enqueue_issue_doorbell: burst:10 cost:2s.328705us
    fengchengwen@SZ:~/tmp$
    fengchengwen@SZ:~/tmp$ ./a1 30
    burst = 30
    perform_after_multiple_enqueue: burst:30 cost:6s.132817us
    doorbell_for_every_enqueue: burst:30 cost:9s.944619us
    last_enqueue_issue_doorbell: burst:30 cost:7s.73551us


test-code:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>
#include <sys/time.h>

struct dmadev;

/* Backing storage that stands in for the device register space. */
unsigned int dev_reg[10240];
/* Simulated ring register; points into dev_reg after init_global(). */
volatile unsigned int *ring;
/* Simulated doorbell register; points into dev_reg after init_global(). */
volatile unsigned int *doorbell;

/*
 * Aim the simulated ring and doorbell registers at slots 100 and 10000
 * of dev_reg respectively.
 */
void init_global(void)
{
        doorbell = dev_reg + 10000;
        ring = dev_reg + 100;
}

/*
 * Barrier issued before the doorbell write.
 *
 * On AArch64 this emits "dmb oshst". On every other architecture it falls
 * back to a compiler-only barrier so the benchmark also builds there (the
 * original required editing the source to switch between the two).
 * __asm__/__volatile__ are used instead of asm/volatile so the macro also
 * compiles under strict ISO modes such as -std=c11.
 */
#if defined(__aarch64__)
#define rte_wmb() __asm__ __volatile__("dmb oshst" : : : "memory")
#else
#define rte_wmb() __asm__ __volatile__("" : : : "memory")
#endif

/* Signature of a device's enqueue callback. */
typedef int (*enqueue_t)(struct dmadev *dev, int vchan, void *src, void *dst,
                         int len, int flags);
/* Signature of a device's perform callback. */
typedef void (*perform_t)(struct dmadev *dev, int vchan);

/* Minimal device object: the two callbacks followed by 512 reserved bytes. */
struct dmadev {
        enqueue_t enqueue;      /* called through dma_enqueue() */
        perform_t perform;      /* called through dma_perform() */
        char rsv[512];          /* reserved; not accessed by this program */
};

/*
 * Enqueue stub without doorbell: writes 1 to the simulated ring register.
 * None of the parameters are used.
 *
 * Returns 0. The original fell off the end of this non-void function while
 * dma_enqueue() propagates the result, which is undefined behavior.
 */
int hisi_dma_enqueue(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
{
        *ring = 1;
        return 0;
}

/*
 * Enqueue stub with optional doorbell: writes 1 to the simulated ring
 * register and, when flags == 1, issues the write barrier and writes 1 to
 * the simulated doorbell register. The other parameters are unused.
 *
 * Returns 0. The original fell off the end of this non-void function while
 * dma_enqueue() propagates the result, which is undefined behavior.
 */
int hisi_dma_enqueue_doorbell(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
{
        *ring = 1;
        if (flags == 1) {
                /* Barrier first, then the doorbell write. */
                rte_wmb();
                *doorbell = 1;
        }
        return 0;
}

/*
 * Standalone doorbell: issue the write barrier, then write 1 to the
 * simulated doorbell register. Neither parameter is used.
 */
void hisi_dma_perform(struct dmadev *dev, int vchan)
{
        rte_wmb();
        doorbell[0] = 1;
}

struct dmadev devlist[64];

void init_devlist(bool enq_doorbell)
{
        int i;
        for (i = 0; i < 64; i++) {
                devlist[i].enqueue = enq_doorbell ? hisi_dma_enqueue_doorbell : hisi_dma_enqueue;
                devlist[i].perform = hisi_dma_perform;
        }
}

/*
 * Dispatch an enqueue to device dev_id through its enqueue callback and
 * return whatever the callback returns. dev_id is not range-checked.
 */
static inline int dma_enqueue(int dev_id, int vchan, void *src, void *dst, int len, int flags)
{
        struct dmadev *target = devlist + dev_id;

        return target->enqueue(target, vchan, src, dst, len, flags);
}

/*
 * Dispatch a perform (doorbell) request to device dev_id through its
 * perform callback. dev_id is not range-checked.
 */
static inline void dma_perform(int dev_id, int vchan)
{
        struct dmadev *dev = &devlist[dev_id];

        /*
         * Plain call, not "return dev->perform(...)": ISO C does not allow a
         * return statement with an expression in a function returning void.
         */
        dev->perform(dev, vchan);
}

/* Number of outer iterations timed by each benchmark. */
#define MAX_LOOPS       90000000

/*
 * Benchmark: per outer iteration, call dma_enqueue() `burst` times with
 * flags 0 and then dma_perform() once; repeat MAX_LOOPS times and print the
 * elapsed wall-clock time.
 *
 * burst: number of enqueues per iteration; values <= 0 enqueue nothing.
 */
void test_for_perform_after_multiple_enqueue(int burst)
{
        struct timeval start, end, delta;
        unsigned int i;
        int j;  /* signed like burst, so a negative burst is not promoted to a huge unsigned bound */

        init_devlist(false);
        gettimeofday(&start, NULL);
        for (i = 0; i < MAX_LOOPS; i++) {
                for (j = 0; j < burst; j++)
                        (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
                dma_perform(10, 0);
        }
        gettimeofday(&end, NULL);
        timersub(&end, &start, &delta);
        /*
         * tv_sec/tv_usec are not unsigned int, so cast and use %ld. The
         * microseconds are zero-padded to six digits; without padding
         * 9.061761 s was printed as "9s.61761us".
         */
        printf("perform_after_multiple_enqueue: burst:%d cost:%lds.%06ldus \n",
               burst, (long)delta.tv_sec, (long)delta.tv_usec);
}

/*
 * Benchmark: per outer iteration, call dma_enqueue() `burst` times with
 * flags 1 (doorbell on every enqueue); repeat MAX_LOOPS times and print the
 * elapsed wall-clock time.
 *
 * burst: number of enqueues per iteration; values <= 0 enqueue nothing.
 */
void test_for_doorbell_for_every_enqueue(int burst)
{
        struct timeval start, end, delta;
        unsigned int i;
        int j;  /* signed like burst, so a negative burst is not promoted to a huge unsigned bound */

        init_devlist(true);
        gettimeofday(&start, NULL);
        for (i = 0; i < MAX_LOOPS; i++) {
                for (j = 0; j < burst; j++)
                        (void)dma_enqueue(10, 0, NULL, NULL, 0, 1);
        }
        gettimeofday(&end, NULL);
        timersub(&end, &start, &delta);
        /*
         * tv_sec/tv_usec are not unsigned int, so cast and use %ld. The
         * microseconds are zero-padded to six digits so the fraction reads
         * correctly.
         */
        printf("doorbell_for_every_enqueue: burst:%d cost:%lds.%06ldus \n",
               burst, (long)delta.tv_sec, (long)delta.tv_usec);
}

/*
 * Benchmark: per outer iteration, call dma_enqueue() `burst - 1` times with
 * flags 0 and then once more with flags 1 (doorbell on the last enqueue
 * only); repeat MAX_LOOPS times and print the elapsed wall-clock time.
 *
 * burst: total enqueues per iteration. The final flagged enqueue is always
 * issued, so values < 1 still perform one enqueue per iteration.
 */
void test_for_last_enqueue_issue_doorbell(int burst)
{
        struct timeval start, end, delta;
        unsigned int i;
        int j;  /* signed: with an unsigned index, burst - 1 wrapped to a huge bound when burst was 0 */

        init_devlist(true);
        gettimeofday(&start, NULL);
        for (i = 0; i < MAX_LOOPS; i++) {
                for (j = 0; j < burst - 1; j++)
                        (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
                dma_enqueue(10, 0, NULL, NULL, 0, 1);
        }
        gettimeofday(&end, NULL);
        timersub(&end, &start, &delta);
        /*
         * tv_sec/tv_usec are not unsigned int, so cast and use %ld. The
         * microseconds are zero-padded to six digits so the fraction reads
         * correctly.
         */
        printf("last_enqueue_issue_doorbell: burst:%d cost:%lds.%06ldus \n",
               burst, (long)delta.tv_sec, (long)delta.tv_usec);
}

/*
 * Entry point: parse the burst size from argv[1] and run the three
 * benchmarks with it.
 *
 * Returns 0 on success, 1 when the burst argument is missing or is not a
 * positive decimal integer that fits in an int.
 */
int main(int argc, char *argv[])
{
        char *end = NULL;
        long value;
        int burst;

        if (argc < 2) {
                printf("please input burst parameter!\n");
                return 1;
        }

        /* strtol reports trailing garbage via `end`; atol silently accepted it. */
        value = strtol(argv[1], &end, 10);
        burst = (int)value;
        /* The round-trip comparison rejects values that do not fit in an int. */
        if (end == argv[1] || *end != '\0' || value < 1 || (long)burst != value) {
                printf("burst must be a positive integer!\n");
                return 1;
        }

        init_global();
        printf("burst = %d \n", burst);
        test_for_perform_after_multiple_enqueue(burst);
        test_for_doorbell_for_every_enqueue(burst);
        test_for_last_enqueue_issue_doorbell(burst);
        return 0;
}

> 
>>
>>>
>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>> <snip>
>>>>>>>>> + +/** + * @warning + * @b EXPERIMENTAL: this API may change
>>>>>>>>> without prior notice.  + * + * Returns the number of operations
>>>>>>>>> that failed to complete.  + * NOTE: This API was used when
>>>>>>>>> rte_dmadev_completed has_error was set.  + * + * @param dev_id
>>>>>>>>> + *   The identifier of the device.  + * @param vq_id + *   The
>>>>>>>>> identifier of virt queue.
>>>>>>>> (> + * @param nb_status
>>>>>>>>> + *   Indicates the size  of status array.  + * @param[out]
>>>>>>>>> status + *   The error code of operations that failed to
>>>>>>>>> complete.  + * @param[out] cookie + *   The last failed
>>>>>>>>> completed operation's cookie.  + * + * @return + *   The number
>>>>>>>>> of operations that failed to complete.  + * + * NOTE: The
>>>>>>>>> caller must ensure that the input parameter is valid and the +
>>>>>>>>> *       corresponding device supports the operation.  + */
>>>>>>>>> +__rte_experimental +static inline uint16_t
>>>>>>>>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
>>>>>>>>> const uint16_t nb_status, uint32_t *status, +
>>>>>>>>> dma_cookie_t *cookie)
>>>>>>>>
>>>>>>>> IMO, it is better to move cookie/ring_idx at 3.  Why it would
>>>>>>>> return any array of errors? since it called after
>>>>>>>> rte_dmadev_completed() has has_error. Is it better to change
>>>>>>>>
>>>>>>>> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
>>>>>>>> dma_cookie_t *cookie,  uint32_t *status)
>>>>>>>>
>>>>>>>> I also think, we may need to set status as bitmask and enumerate
>>>>>>>> all the combination of error codes of all the driver and return
>>>>>>>> string from driver existing rte_flow_error
>>>>>>>>
>>>>>>>> See struct rte_flow_error { enum rte_flow_error_type type; /**<
>>>>>>>> Cause field and error types. */ const void *cause; /**< Object
>>>>>>>> responsible for the error. */ const char *message; /**<
>>>>>>>> Human-readable error message. */ };
>>>>>>>>
>>>>>>>
>>>>>>> I think we need a multi-return value API here, as we may add
>>>>>>> operations in future which have non-error status values to return.
>>>>>>> The obvious case is DMA engines which support "compare" operations.
>>>>>>> In that case a successful compare (as in there were no DMA or HW
>>>>>>> errors) can return "equal" or "not-equal" as statuses. For general
>>>>>>> "copy" operations, the faster completion op can be used to just
>>>>>>> return successful values (and only call this status version on
>>>>>>> error), while apps using those compare ops or a mixture of copy and
>>>>>>> compare ops, would always use the slower one that returns status
>>>>>>> values for each and every op..
>>>>>>>
>>>>>>> The ioat APIs used 32-bit integer values for this status array so
>>>>>>> as to allow e.g. 16-bits for error code and 16-bits for future
>>>>>>> status values. For most operations there should be a fairly small
>>>>>>> set of things that can go wrong, i.e. bad source address, bad
>>>>>>> destination address or invalid length.  Within that we may have a
>>>>>>> couple of specifics for why an address is bad, but even so I don't
>>>>>>> think we need to start having multiple bit combinations.
>>>>>>
>>>>>> OK. What is the purpose of errors status? Is it for application
>>>>>> printing it or Does the application need to take any action based on
>>>>>> specific error requests?
>>>>>
>>>>> It's largely for information purposes, but in the case of SVA/SVM
>>>>> errors could occur due to the memory not being pinned, i.e. a page
>>>>> fault, in some cases. If that happens, then it's up the app to either
>>>>> touch the memory and retry the copy, or to do a SW memcpy as a
>>>>> fallback.
>>>>>
>>>>> In other error cases, I think it's good to tell the application if it's
>>>>> passing around bad data, or data that is beyond the scope of hardware,
>>>>> e.g.  a copy that is beyond what can be done in a single transaction
>>>>> for a HW instance. Given that there are always things that can go
>>>>> wrong, I think we need some error reporting mechanism.
>>>>>
>>>>>> If the former is scope, then we need to define the standard enum
>>>>>> value for the error right?  ie. uint32_t *status needs to change to
>>>>>> enum rte_dma_error or so.
>>>>>>
>>>>> Sure. Perhaps an error/status structure either is an option, where we
>>>>> explicitly call out error info from status info.
>>>>
>>>> Agree. Better to have a structure with fields like,
>>>>
>>>> 1)  enum rte_dma_error_type 2)  memory to store, informative message on
>>>> fine aspects of error.  Like address caused issue etc.(Which will be
>>>> driver-specific information).
>>>>
>>> The only issue I have with that is that once we have driver specific
>>> information it is of little use to the application, since it can't know
>>> anything about it except maybe log it.  I'd much rather have a set of error
>>> codes telling user that "source address is wrong", "dest address is wrong",
>>> and a generic "an address is wrong" in case driver/HW cannot distinguish
>>> source of error. Can we see how far we get with just error codes before we
>>> start into passing string messages around and all the memory management
>>> headaches that implies.
>>
>> Works for me. It should be "enum rte_dma_error_type" then, which has a standard
>> error type. Which is missing in the spec now.
>>
> +1
> .
> 

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-08  3:11                 ` fengchengwen
@ 2021-07-08 18:35                   ` Jerin Jacob
  2021-07-09  9:14                     ` Bruce Richardson
  0 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-08 18:35 UTC (permalink / raw)
  To: fengchengwen
  Cc: Bruce Richardson, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Thu, Jul 8, 2021 at 8:41 AM fengchengwen <fengchengwen@huawei.com> wrote:
>

> >>>
> >>> It's just more conditionals and branches all through the code. Inside the
> >>> user application, the user has to check whether to set the flag or not (or
> >>> special-case the last transaction outside the loop), and within the driver,
> >>> there has to be a branch whether or not to call the doorbell function. The
> >>> code on both sides is far simpler and more readable if the doorbell
> >>> function is exactly that - a separate function.
> >>
> >> I disagree. The reason is:
> >>
> >> We will have two classes of applications
> >>
> >> a) do dma copy request as and when it has data(I think, this is the
> >> prime use case), for those,
> >> I think, it is considerable overhead to have two function invocation
> >> per transfer i.e
> >> rte_dma_copy() and rte_dma_perform()
> >>
> >> b) do dma copy when the data is reached to a logical state,  like copy
> >> IP frame from Ethernet packets or so,
> >> In that case, the application will have  a LOGIC to detect when to
> >> perform it so on the end of
> >> that rte_dma_copy() flag can be updated to fire the doorbell.
> >>
> >> IMO, We are comparing against a branch(flag is already in register) vs
> >> a set of instructions for
> >> 1) function pointer overhead
> >> 2) Need to use the channel context again back in another function.
> >>
> >> IMO, a single branch is most optimal from performance PoV.
> >>
> > Ok, let's try it and see how it goes.
>
> Test result show:
> 1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
> 2) For Xeon E5-2690 v2 (X86) could benefit with separate function
> 3) Both platform could benefit with doorbell in flags if burst < 5
>
> There is a performance gain in small bursts (<5). Given the extensive use of bursts
> in DPDK applications and users are accustomed to the concept, I do not recommend
> using the 'doorbell' in flags.

There is NO concept change between one option and the other. Only the
arguments are different.
Also, the _perform() scheme is not used anywhere in DPDK.

Regarding performance, I have added dummy instructions to simulate a real
workload[1]; now bursts also show some gain on both x86 and arm64[3].

I have converted your application into a DPDK test application[2] to use
CPU isolation etc.
So there is a gain with the flag scheme, and the code is checked in to GitHub[2].

[1]
/*
 * Busy-work stand-in: counts a volatile local from 0 up to 16 so the
 * compiler cannot remove the loop.
 */
static inline void
delay(void)
{
        volatile int k = 0;

        while (k < 16)
                k++;
}

/*
 * Enqueue stub without doorbell: runs the dummy delay(), writes 1 to the
 * simulated ring register and returns 0. None of the parameters are used.
 */
__rte_noinline
int
hisi_dma_enqueue(struct dmadev *dev, int vchan, void *src, void *dst,
                 int len, const int flags)
{
        delay();
        ring[0] = 1;
        return 0;
}

/*
 * Enqueue stub with optional doorbell: runs the dummy delay(), writes 1 to
 * the simulated ring register and, when flags == 1 (hinted as the unlikely
 * case), issues the write barrier and writes 1 to the simulated doorbell
 * register. Always returns 0; the other parameters are unused.
 */
__rte_noinline
int
hisi_dma_enqueue_doorbell(struct dmadev *dev, int vchan, void *src,
                          void *dst, int len, const int flags)
{
        delay();
        ring[0] = 1;

        if (unlikely(flags == 1)) {
                rte_wmb();
                doorbell[0] = 1;
        }

        return 0;
}


[2]

https://github.com/jerinjacobk/dpdk-dmatest/commit/4fc9bc3029543bbc4caaa5183d98bac93c34f588

Update results [3]

Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz

echo "dma_perf_autotest" | ./build/app/test/dpdk-test --no-huge -c 0xf00

core=24 Timer running at 2600.00MHz
   test_for_perform_after_multiple_enqueue: burst=1 cycles=46.000000
      test_for_last_enqueue_issue_doorbell: burst=1 cycles=45.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=2 cycles=90.000000
      test_for_last_enqueue_issue_doorbell: burst=2 cycles=89.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=3 cycles=134.000000
      test_for_last_enqueue_issue_doorbell: burst=3 cycles=133.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=4 cycles=177.000000
      test_for_last_enqueue_issue_doorbell: burst=4 cycles=176.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=5 cycles=221.000000
      test_for_last_enqueue_issue_doorbell: burst=5 cycles=221.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=6 cycles=265.000000
      test_for_last_enqueue_issue_doorbell: burst=6 cycles=265.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=7 cycles=333.000000
      test_for_last_enqueue_issue_doorbell: burst=7 cycles=309.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=8 cycles=375.000000
      test_for_last_enqueue_issue_doorbell: burst=8 cycles=373.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=9 cycles=418.000000
      test_for_last_enqueue_issue_doorbell: burst=9 cycles=414.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=10 cycles=462.000000
      test_for_last_enqueue_issue_doorbell: burst=10 cycles=458.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=11 cycles=507.000000
      test_for_last_enqueue_issue_doorbell: burst=11 cycles=501.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=12 cycles=552.000000
      test_for_last_enqueue_issue_doorbell: burst=12 cycles=546.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=13 cycles=593.000000
      test_for_last_enqueue_issue_doorbell: burst=13 cycles=590.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=14 cycles=638.000000
      test_for_last_enqueue_issue_doorbell: burst=14 cycles=634.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=15 cycles=681.000000
      test_for_last_enqueue_issue_doorbell: burst=15 cycles=678.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=16 cycles=725.000000
      test_for_last_enqueue_issue_doorbell: burst=16 cycles=722.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=17 cycles=770.000000
      test_for_last_enqueue_issue_doorbell: burst=17 cycles=767.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=18 cycles=815.000000
      test_for_last_enqueue_issue_doorbell: burst=18 cycles=812.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=19 cycles=857.000000
      test_for_last_enqueue_issue_doorbell: burst=19 cycles=854.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=20 cycles=902.000000
      test_for_last_enqueue_issue_doorbell: burst=20 cycles=899.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=21 cycles=945.000000
      test_for_last_enqueue_issue_doorbell: burst=21 cycles=943.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=22 cycles=990.000000
      test_for_last_enqueue_issue_doorbell: burst=22 cycles=988.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=23 cycles=1033.000000
      test_for_last_enqueue_issue_doorbell: burst=23 cycles=1031.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=24 cycles=1077.000000
      test_for_last_enqueue_issue_doorbell: burst=24 cycles=1075.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=25 cycles=1121.000000
      test_for_last_enqueue_issue_doorbell: burst=25 cycles=1119.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=26 cycles=1166.000000
      test_for_last_enqueue_issue_doorbell: burst=26 cycles=1163.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=27 cycles=1208.000000
      test_for_last_enqueue_issue_doorbell: burst=27 cycles=1208.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=28 cycles=1252.000000
      test_for_last_enqueue_issue_doorbell: burst=28 cycles=1252.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=29 cycles=1295.000000
      test_for_last_enqueue_issue_doorbell: burst=29 cycles=1295.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=30 cycles=1342.000000
      test_for_last_enqueue_issue_doorbell: burst=30 cycles=1340.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=31 cycles=1386.000000
      test_for_last_enqueue_issue_doorbell: burst=31 cycles=1384.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=32 cycles=1429.000000
      test_for_last_enqueue_issue_doorbell: burst=32 cycles=1428.000000
-------------------------------------------------------------------------------



octeontx2:

See https://doc.dpdk.org/guides/prog_guide/profile_app.html section
62.2.3. High-resolution cycle counter


meson --cross config/arm/arm64_octeontx2_linux_gcc
-Dc_args='-DRTE_ARM_EAL_RDTSC_USE_PMU' build

 echo "dma_perf_autotest" | ./build/app/test/dpdk-test --no-huge -c 0xff0000
RTE>>dma_perf_autotest^M
lcore=16 Timer running at 2400.00MHz
   test_for_perform_after_multiple_enqueue: burst=1 cycles=105.000000
      test_for_last_enqueue_issue_doorbell: burst=1 cycles=105.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=2 cycles=207.000000
      test_for_last_enqueue_issue_doorbell: burst=2 cycles=207.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=3 cycles=309.000000
      test_for_last_enqueue_issue_doorbell: burst=3 cycles=310.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=4 cycles=411.000000
      test_for_last_enqueue_issue_doorbell: burst=4 cycles=410.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=5 cycles=513.000000
      test_for_last_enqueue_issue_doorbell: burst=5 cycles=512.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=6 cycles=615.000000
      test_for_last_enqueue_issue_doorbell: burst=6 cycles=615.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=7 cycles=717.000000
      test_for_last_enqueue_issue_doorbell: burst=7 cycles=716.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=8 cycles=819.000000
      test_for_last_enqueue_issue_doorbell: burst=8 cycles=818.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=9 cycles=921.000000
      test_for_last_enqueue_issue_doorbell: burst=9 cycles=922.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=10 cycles=1023.000000
      test_for_last_enqueue_issue_doorbell: burst=10 cycles=1022.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=11 cycles=1126.000000
      test_for_last_enqueue_issue_doorbell: burst=11 cycles=1124.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=12 cycles=1227.000000
      test_for_last_enqueue_issue_doorbell: burst=12 cycles=1227.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=13 cycles=1329.000000
      test_for_last_enqueue_issue_doorbell: burst=13 cycles=1328.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=14 cycles=1431.000000
      test_for_last_enqueue_issue_doorbell: burst=14 cycles=1430.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=15 cycles=1534.000000
      test_for_last_enqueue_issue_doorbell: burst=15 cycles=1534.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=16 cycles=1638.000000
      test_for_last_enqueue_issue_doorbell: burst=16 cycles=1640.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=17 cycles=1746.000000
      test_for_last_enqueue_issue_doorbell: burst=17 cycles=1739.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=18 cycles=1847.000000
      test_for_last_enqueue_issue_doorbell: burst=18 cycles=1841.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=19 cycles=1950.000000
      test_for_last_enqueue_issue_doorbell: burst=19 cycles=1944.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=20 cycles=2051.000000
       test_for_last_enqueue_issue_doorbell: burst=20 cycles=2045.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=21 cycles=2154.000000
      test_for_last_enqueue_issue_doorbell: burst=21 cycles=2148.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=22 cycles=2257.000000
      test_for_last_enqueue_issue_doorbell: burst=22 cycles=2249.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=23 cycles=2358.000000
      test_for_last_enqueue_issue_doorbell: burst=23 cycles=2352.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=24 cycles=2459.000000
      test_for_last_enqueue_issue_doorbell: burst=24 cycles=2454.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=25 cycles=2562.000000
      test_for_last_enqueue_issue_doorbell: burst=25 cycles=2555.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=26 cycles=2665.000000
      test_for_last_enqueue_issue_doorbell: burst=26 cycles=2657.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=27 cycles=2766.000000
      test_for_last_enqueue_issue_doorbell: burst=27 cycles=2760.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=28 cycles=2867.000000
      test_for_last_enqueue_issue_doorbell: burst=28 cycles=2861.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=29 cycles=2970.000000
      test_for_last_enqueue_issue_doorbell: burst=29 cycles=2964.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=30 cycles=3073.000000
      test_for_last_enqueue_issue_doorbell: burst=30 cycles=3065.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=31 cycles=3174.000000
      test_for_last_enqueue_issue_doorbell: burst=31 cycles=3168.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=32 cycles=3275.000000
      test_for_last_enqueue_issue_doorbell: burst=32 cycles=3269.000000
-------------------------------------------------------------------------------
Test OK
RTE>




> And also user may confuse about the doorbell operations.
>
> Kunpeng platform test result:
>     [root@SZ tmp]# ./a1 1
>     burst = 1
>     perform_after_multiple_enqueue: burst:1 cost:0s.554422us
>     doorbell_for_every_enqueue: burst:1 cost:0s.450927us
>     last_enqueue_issue_doorbell: burst:1 cost:0s.450479us
>     [root@SZ tmp]#
>     [root@SZ tmp]# ./a1 2
>     burst = 2
>     perform_after_multiple_enqueue: burst:2 cost:0s.900884us
>     doorbell_for_every_enqueue: burst:2 cost:0s.866732us
>     last_enqueue_issue_doorbell: burst:2 cost:0s.732469us
>     [root@SZ tmp]# ./a1 5
>     burst = 5
>     perform_after_multiple_enqueue: burst:5 cost:1s.732410us
>     doorbell_for_every_enqueue: burst:5 cost:2s.115479us
>     last_enqueue_issue_doorbell: burst:5 cost:1s.759349us
>     [root@SZ tmp]# ./a1 10
>     burst = 10
>     perform_after_multiple_enqueue: burst:10 cost:3s.490716us
>     doorbell_for_every_enqueue: burst:10 cost:4s.194691us
>     last_enqueue_issue_doorbell: burst:10 cost:3s.331825us
>     [root@SZ tmp]# ./a1 30
>     burst = 30
>     perform_after_multiple_enqueue: burst:30 cost:9s.61761us
>     doorbell_for_every_enqueue: burst:30 cost:12s.517082us
>     last_enqueue_issue_doorbell: burst:30 cost:9s.614802us
>
> X86 platform test result:
>     fengchengwen@SZ:~/tmp$ ./a1 1
>     burst = 1
>     perform_after_multiple_enqueue: burst:1 cost:0s.406331us
>     doorbell_for_every_enqueue: burst:1 cost:0s.331109us
>     last_enqueue_issue_doorbell: burst:1 cost:0s.381782us
>     fengchengwen@SZ:~/tmp$ ./a1 2
>     burst = 2
>     perform_after_multiple_enqueue: burst:2 cost:0s.569024us
>     doorbell_for_every_enqueue: burst:2 cost:0s.643449us
>     last_enqueue_issue_doorbell: burst:2 cost:0s.486639us
>     fengchengwen@SZ:~/tmp$ ./a1 5
>     burst = 5
>     perform_after_multiple_enqueue: burst:5 cost:1s.166384us
>     doorbell_for_every_enqueue: burst:5 cost:1s.602369us
>     last_enqueue_issue_doorbell: burst:5 cost:1s.209392us
>     fengchengwen@SZ:~/tmp$ ./a1 10
>     burst = 10
>     perform_after_multiple_enqueue: burst:10 cost:2s.229901us
>     doorbell_for_every_enqueue: burst:10 cost:3s.754802us
>     last_enqueue_issue_doorbell: burst:10 cost:2s.328705us
>     fengchengwen@SZ:~/tmp$
>     fengchengwen@SZ:~/tmp$ ./a1 30
>     burst = 30
>     perform_after_multiple_enqueue: burst:30 cost:6s.132817us
>     doorbell_for_every_enqueue: burst:30 cost:9s.944619us
>     last_enqueue_issue_doorbell: burst:30 cost:7s.73551us
>
>
> test-code:
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <stdbool.h>
> #include <time.h>
> #include <sys/time.h>
>
> struct dmadev;
>
> unsigned int dev_reg[10240];
> volatile unsigned int *ring;
> volatile unsigned int *doorbell;
>
> void init_global(void)
> {
>         ring = &dev_reg[100];
>         doorbell = &dev_reg[10000];
> }
>
> #define rte_wmb() asm volatile("dmb oshst" : : : "memory")
> //#define rte_wmb() asm volatile ("" : : : "memory")
>
> typedef int (*enqueue_t)(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags);
> typedef void (*perform_t)(struct dmadev *dev, int vchan);
>
> struct dmadev {
>         enqueue_t enqueue;
>         perform_t perform;
>         char rsv[512];
> };
>
> int hisi_dma_enqueue(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
> {
>         *ring = 1;
> }
>
> int hisi_dma_enqueue_doorbell(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
> {
>         *ring = 1;
>         if (flags == 1) {
>                 rte_wmb();
>                 *doorbell = 1;
>         }
> }
>
> void hisi_dma_perform(struct dmadev *dev, int vchan)
> {
>         rte_wmb();
>         *doorbell = 1;
> }
>
> struct dmadev devlist[64];
>
> void init_devlist(bool enq_doorbell)
> {
>         int i;
>         for (i = 0; i < 64; i++) {
>                 devlist[i].enqueue = enq_doorbell ? hisi_dma_enqueue_doorbell : hisi_dma_enqueue;
>                 devlist[i].perform = hisi_dma_perform;
>         }
> }
>
> static inline int dma_enqueue(int dev_id, int vchan, void *src, void *dst, int len, int flags)
> {
>         struct dmadev *dev = &devlist[dev_id];
>         return dev->enqueue(dev, vchan, src, dst, len, flags);
> }
>
> static inline void dma_perform(int dev_id, int vchan)
> {
>         struct dmadev *dev = &devlist[dev_id];
>         return dev->perform(dev, vchan);
> }
>
> #define MAX_LOOPS       90000000
>
> void test_for_perform_after_multiple_enqueue(int burst)
> {
>         struct timeval start, end, delta;
>         unsigned int i, j;
>         init_devlist(false);
>         gettimeofday(&start, NULL);
>         for (i = 0; i < MAX_LOOPS; i++) {
>                 for (j = 0; j < burst; j++)
>                         (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
>                 dma_perform(10, 0);
>         }
>         gettimeofday(&end, NULL);
>         timersub(&end, &start, &delta);
>         printf("perform_after_multiple_enqueue: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
> }
>
> void test_for_doorbell_for_every_enqueue(int burst)
> {
>         struct timeval start, end, delta;
>         unsigned int i, j;
>         init_devlist(true);
>         gettimeofday(&start, NULL);
>         for (i = 0; i < MAX_LOOPS; i++) {
>                 for (j = 0; j < burst; j++)
>                         (void)dma_enqueue(10, 0, NULL, NULL, 0, 1);
>         }
>         gettimeofday(&end, NULL);
>         timersub(&end, &start, &delta);
>         printf("doorbell_for_every_enqueue: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
> }
>
> void test_for_last_enqueue_issue_doorbell(int burst)
> {
>         struct timeval start, end, delta;
>         unsigned int i, j;
>         init_devlist(true);
>         gettimeofday(&start, NULL);
>         for (i = 0; i < MAX_LOOPS; i++) {
>                 for (j = 0; j < burst - 1; j++)
>                         (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
>                 dma_enqueue(10, 0, NULL, NULL, 0, 1);
>         }
>         gettimeofday(&end, NULL);
>         timersub(&end, &start, &delta);
>         printf("last_enqueue_issue_doorbell: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
> }
>
> void main(int argc, char *argv[])
> {
>         if (argc < 2) {
>                 printf("please input burst parameter!\n");
>                 return;
>         }
>         init_global();
>         int burst = atol(argv[1]);
>         printf("burst = %d \n", burst);
>         test_for_perform_after_multiple_enqueue(burst);
>         test_for_doorbell_for_every_enqueue(burst);
>         test_for_last_enqueue_issue_doorbell(burst);
> }
>
> >
> >>
> >>>
> >>>>
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>>
> >>>>> <snip>
> >>>>>>>>> + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> >>>>>>>>> without prior notice.  + * + * Returns the number of operations
> >>>>>>>>> that failed to complete.  + * NOTE: This API was used when
> >>>>>>>>> rte_dmadev_completed has_error was set.  + * + * @param dev_id
> >>>>>>>>> + *   The identifier of the device.  + * @param vq_id + *   The
> >>>>>>>>> identifier of virt queue.
> >>>>>>>> (> + * @param nb_status
> >>>>>>>>> + *   Indicates the size  of status array.  + * @param[out]
> >>>>>>>>> status + *   The error code of operations that failed to
> >>>>>>>>> complete.  + * @param[out] cookie + *   The last failed
> >>>>>>>>> completed operation's cookie.  + * + * @return + *   The number
> >>>>>>>>> of operations that failed to complete.  + * + * NOTE: The
> >>>>>>>>> caller must ensure that the input parameter is valid and the +
> >>>>>>>>> *       corresponding device supports the operation.  + */
> >>>>>>>>> +__rte_experimental +static inline uint16_t
> >>>>>>>>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> >>>>>>>>> const uint16_t nb_status, uint32_t *status, +
> >>>>>>>>> dma_cookie_t *cookie)
> >>>>>>>>
> >>>>>>>> IMO, it is better to move cookie/ring_idx at 3.  Why it would
> >>>>>>>> return any array of errors? since it called after
> >>>>>>>> rte_dmadev_completed() has has_error. Is it better to change
> >>>>>>>>
> >>>>>>>> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> >>>>>>>> dma_cookie_t *cookie,  uint32_t *status)
> >>>>>>>>
> >>>>>>>> I also think, we may need to set status as bitmask and enumerate
> >>>>>>>> all the combination of error codes of all the driver and return
> >>>>>>>> string from driver existing rte_flow_error
> >>>>>>>>
> >>>>>>>> See struct rte_flow_error { enum rte_flow_error_type type; /**<
> >>>>>>>> Cause field and error types. */ const void *cause; /**< Object
> >>>>>>>> responsible for the error. */ const char *message; /**<
> >>>>>>>> Human-readable error message. */ };
> >>>>>>>>
> >>>>>>>
> >>>>>>> I think we need a multi-return value API here, as we may add
> >>>>>>> operations in future which have non-error status values to return.
> >>>>>>> The obvious case is DMA engines which support "compare" operations.
> >>>>>>> In that case a successful compare (as in there were no DMA or HW
> >>>>>>> errors) can return "equal" or "not-equal" as statuses. For general
> >>>>>>> "copy" operations, the faster completion op can be used to just
> >>>>>>> return successful values (and only call this status version on
> >>>>>>> error), while apps using those compare ops or a mixture of copy and
> >>>>>>> compare ops, would always use the slower one that returns status
> >>>>>>> values for each and every op..
> >>>>>>>
> >>>>>>> The ioat APIs used 32-bit integer values for this status array so
> >>>>>>> as to allow e.g. 16-bits for error code and 16-bits for future
> >>>>>>> status values. For most operations there should be a fairly small
> >>>>>>> set of things that can go wrong, i.e. bad source address, bad
> >>>>>>> destination address or invalid length.  Within that we may have a
> >>>>>>> couple of specifics for why an address is bad, but even so I don't
> >>>>>>> think we need to start having multiple bit combinations.
> >>>>>>
> >>>>>> OK. What is the purpose of errors status? Is it for application
> >>>>>> printing it or Does the application need to take any action based on
> >>>>>> specific error requests?
> >>>>>
> >>>>> It's largely for information purposes, but in the case of SVA/SVM
> >>>>> errors could occur due to the memory not being pinned, i.e. a page
> >>>>> fault, in some cases. If that happens, then it's up the app to either
> >>>>> touch the memory and retry the copy, or to do a SW memcpy as a
> >>>>> fallback.
> >>>>>
> >>>>> In other error cases, I think it's good to tell the application if it's
> >>>>> passing around bad data, or data that is beyond the scope of hardware,
> >>>>> e.g.  a copy that is beyond what can be done in a single transaction
> >>>>> for a HW instance. Given that there are always things that can go
> >>>>> wrong, I think we need some error reporting mechanism.
> >>>>>
> >>>>>> If the former is scope, then we need to define the standard enum
> >>>>>> value for the error right?  ie. uint32_t *status needs to change to
> >>>>>> enum rte_dma_error or so.
> >>>>>>
> >>>>> Sure. Perhaps an error/status structure either is an option, where we
> >>>>> explicitly call out error info from status info.
> >>>>
> >>>> Agree. Better to have a structure with fields like,
> >>>>
> >>>> 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> >>>> fine aspects of error.  Like address caused issue etc. (Which will be
> >>>> driver-specific information).
> >>>>
> >>> The only issue I have with that is that once we have driver specific
> >>> information it is of little use to the application, since it can't know
> >>> anything about it except maybe log it.  I'd much rather have a set of error
> >>> codes telling user that "source address is wrong", "dest address is wrong",
> >>> and a generic "an address is wrong" in case driver/HW cannot distinguish
> >>> source of error. Can we see how far we get with just error codes before we
> >>> start into passing string messages around and all the memory management
> >>> headaches that implies.
> >>
> >> Works for me. It should be "enum rte_dma_error_type" then, which has a standard
> >> error type. Which is missing in the spec now.
> >>
> > +1
> > .
> >

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-08 18:35                   ` Jerin Jacob
@ 2021-07-09  9:14                     ` Bruce Richardson
  2021-07-11  7:14                       ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-09  9:14 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: fengchengwen, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Fri, Jul 09, 2021 at 12:05:40AM +0530, Jerin Jacob wrote:
> On Thu, Jul 8, 2021 at 8:41 AM fengchengwen <fengchengwen@huawei.com> wrote:
> >
> 
> > >>>
> > >>> It's just more conditionals and branches all through the code. Inside the
> > >>> user application, the user has to check whether to set the flag or not (or
> > >>> special-case the last transaction outside the loop), and within the driver,
> > >>> there has to be a branch whether or not to call the doorbell function. The
> > >>> code on both sides is far simpler and more readable if the doorbell
> > >>> function is exactly that - a separate function.
> > >>
> > >> I disagree. The reason is:
> > >>
> > >> We will have two classes of applications
> > >>
> > >> a) do dma copy request as and when it has data(I think, this is the
> > >> prime use case), for those,
> > >> I think, it is considerable overhead to have two function invocation
> > >> per transfer i.e
> > >> rte_dma_copy() and rte_dma_perform()
> > >>
> > >> b) do dma copy when the data is reached to a logical state,  like copy
> > >> IP frame from Ethernet packets or so,
> > >> In that case, the application will have  a LOGIC to detect when to
> > >> perform it so on the end of
> > >> that rte_dma_copy() flag can be updated to fire the doorbell.
> > >>
> > >> IMO, We are comparing against a branch(flag is already in register) vs
> > >> a set of instructions for
> > >> 1) function pointer overhead
> > >> 2) Need to use the channel context again back in another function.
> > >>
> > >> IMO, a single branch is most optimal from performance PoV.
> > >>
> > > Ok, let's try it and see how it goes.
> >
> > Test result show:
> > 1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
> > 2) For Xeon E5-2690 v2 (X86) could benefit with separate function
> > 3) Both platform could benefit with doorbell in flags if burst < 5
> >
> > There is a performance gain in small bursts (<5). Given the extensive use of bursts
> > in DPDK applications and users are accustomed to the concept, I do not recommend
> > using the 'doorbell' in flags.
> 
> There is NO concept change between one option vs other option. Just
> argument different.
> Also, _perform() scheme not used anywhere in DPDK. I
> 
> Regarding performance, I have added dummy instructions to simulate the real work
> load[1], now burst also has some gain in both x86 and arm64[3]
> 
> I have modified your application[2] to dpdk test application to use
> cpu isolation etc.
> So this is gain in flag scheme and code is checked in to Github[2]
> 
<snip>

The benchmark numbers all seem very close between the two schemes. On my
team we pretty much have test ioat & idxd drivers ported internally to the
last dmadev draft library, and have sample apps handling traffic using
those. I'll therefore attempt to get these numbers with real traffic on
real drivers to just double check that it's the same as these
microbenchmarks.

Assuming that perf is the same, how to resolve this? Some thoughts:
* As I understand it, the main objection to the separate doorbell function
  is the use of 8-bytes in fastpath slot. Therefore I will also attempt to
  benchmark having the doorbell function not on the same cacheline and check
  perf impact, if any.
* If we don't have an impact to perf by having the doorbell function inside
  the regular "ops" rather than on fastpath cacheline, there is no reason
  we can't implement both schemes. The user can then choose themselves
  whether to doorbell using a flag on last item, or to doorbell explicitly
  using function call.

Of the two schemes, and assuming they are equal, I do have a preference for
the separate function one, primarily from a code readability point of view.
Other than that, I have no strong opinions.

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-09  9:14                     ` Bruce Richardson
@ 2021-07-11  7:14                       ` Jerin Jacob
  2021-07-12  7:01                         ` Morten Brørup
  0 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-11  7:14 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: fengchengwen, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Fri, Jul 9, 2021 at 2:44 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Fri, Jul 09, 2021 at 12:05:40AM +0530, Jerin Jacob wrote:
> > On Thu, Jul 8, 2021 at 8:41 AM fengchengwen <fengchengwen@huawei.com> wrote:
> > >
> >
> > > >>>
> > > >>> It's just more conditionals and branches all through the code. Inside the
> > > >>> user application, the user has to check whether to set the flag or not (or
> > > >>> special-case the last transaction outside the loop), and within the driver,
> > > >>> there has to be a branch whether or not to call the doorbell function. The
> > > >>> code on both sides is far simpler and more readable if the doorbell
> > > >>> function is exactly that - a separate function.
> > > >>
> > > >> I disagree. The reason is:
> > > >>
> > > >> We will have two classes of applications
> > > >>
> > > >> a) do dma copy request as and when it has data(I think, this is the
> > > >> prime use case), for those,
> > > >> I think, it is considerable overhead to have two function invocation
> > > >> per transfer i.e
> > > >> rte_dma_copy() and rte_dma_perform()
> > > >>
> > > >> b) do dma copy when the data is reached to a logical state,  like copy
> > > >> IP frame from Ethernet packets or so,
> > > >> In that case, the application will have  a LOGIC to detect when to
> > > >> perform it so on the end of
> > > >> that rte_dma_copy() flag can be updated to fire the doorbell.
> > > >>
> > > >> IMO, We are comparing against a branch(flag is already in register) vs
> > > >> a set of instructions for
> > > >> 1) function pointer overhead
> > > >> 2) Need to use the channel context again back in another function.
> > > >>
> > > >> IMO, a single branch is most optimal from performance PoV.
> > > >>
> > > > Ok, let's try it and see how it goes.
> > >
> > > Test result show:
> > > 1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
> > > 2) For Xeon E5-2690 v2 (X86) could benefit with separate function
> > > 3) Both platform could benefit with doorbell in flags if burst < 5
> > >
> > > There is a performance gain in small bursts (<5). Given the extensive use of bursts
> > > in DPDK applications and users are accustomed to the concept, I do not recommend
> > > using the 'doorbell' in flags.
> >
> > There is NO concept change between one option vs other option. Just
> > argument different.
> > Also, _perform() scheme not used anywhere in DPDK. I
> >
> > Regarding performance, I have added dummy instructions to simulate the real work
> > load[1], now burst also has some gain in both x86 and arm64[3]
> >
> > I have modified your application[2] to dpdk test application to use
> > cpu isolation etc.
> > So this is gain in flag scheme and code is checked in to Github[2]
> >
> <snip>
>
> The benchmark numbers all seem very close between the two schemes. On my
> team we pretty much have test ioat & idxd drivers ported internally to the
> last dmadev draft library, and have sample apps handling traffic using
> those. I'll therefore attempt to get these numbers with real traffic on
> real drivers to just double check that it's the same as these
> microbenchmarks.

Thanks.

>
> Assuming that perf is the same, how to resolve this? Some thoughts:
> * As I understand it, the main objection to the separate doorbell function
>   is the use of 8-bytes in fastpath slot. Therefore I will also attempt to
>   benchmark having the doorbell function not on the same cacheline and check
>   perf impact, if any.

Probably we can remove rte_dmadev_fill_sg() variant and keep sg only for copy
to save 8B.

> * If we don't have an impact to perf by having the doorbell function inside
>   the regular "ops" rather than on fastpath cacheline, there is no reason
>   we can't implement both schemes. The user can then choose themselves
>   whether to doorbell using a flag on last item, or to doorbell explicitly
>   using function call.

Yes. I think, we can keep both.

>
> Of the two schemes, and assuming they are equal, I do have a preference for
> the separate function one, primarily from a code readability point of view.
> Other than that, I have no strong opinions.
>
> /Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
                   ` (4 preceding siblings ...)
  2021-07-06 20:28 ` [dpdk-dev] [RFC UPDATE PATCH 0/9] dmadev rfc suggested updates Bruce Richardson
@ 2021-07-11  9:25 ` Chengwen Feng
  2021-07-11  9:42   ` fengchengwen
                     ` (6 more replies)
  2021-07-13 12:27 ` [dpdk-dev] [PATCH v3] " Chengwen Feng
                   ` (23 subsequent siblings)
  29 siblings, 7 replies; 339+ messages in thread
From: Chengwen Feng @ 2021-07-11  9:25 UTC (permalink / raw)
  To: thomas, ferruh.yigit, bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

This patch introduces 'dmadevice' which is a generic type of DMA
device.

The APIs of dmadev library expose some generic operations which can
enable configuration and I/O with the DMA devices.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 MAINTAINERS                  |    4 +
 config/rte_config.h          |    3 +
 lib/dmadev/meson.build       |    6 +
 lib/dmadev/rte_dmadev.c      |  560 +++++++++++++++++++++++
 lib/dmadev/rte_dmadev.h      | 1030 ++++++++++++++++++++++++++++++++++++++++++
 lib/dmadev/rte_dmadev_core.h |  159 +++++++
 lib/dmadev/rte_dmadev_pmd.h  |   72 +++
 lib/dmadev/version.map       |   40 ++
 lib/meson.build              |    1 +
 9 files changed, 1875 insertions(+)
 create mode 100644 lib/dmadev/meson.build
 create mode 100644 lib/dmadev/rte_dmadev.c
 create mode 100644 lib/dmadev/rte_dmadev.h
 create mode 100644 lib/dmadev/rte_dmadev_core.h
 create mode 100644 lib/dmadev/rte_dmadev_pmd.h
 create mode 100644 lib/dmadev/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 4347555..0595239 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
 F: app/test/test_rawdev.c
 F: doc/guides/prog_guide/rawdev.rst
 
+DMA device API - EXPERIMENTAL
+M: Chengwen Feng <fengchengwen@huawei.com>
+F: lib/dmadev/
+
 
 Memory Pool Drivers
 -------------------
diff --git a/config/rte_config.h b/config/rte_config.h
index 590903c..331a431 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -81,6 +81,9 @@
 /* rawdev defines */
 #define RTE_RAWDEV_MAX_DEVS 64
 
+/* dmadev defines */
+#define RTE_DMADEV_MAX_DEVS 64
+
 /* ip_fragmentation defines */
 #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
 #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
new file mode 100644
index 0000000..c918dae
--- /dev/null
+++ b/lib/dmadev/meson.build
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited.
+
+sources = files('rte_dmadev.c')
+headers = files('rte_dmadev.h', 'rte_dmadev_pmd.h')
+indirect_headers += files('rte_dmadev_core.h')
diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
new file mode 100644
index 0000000..8a29abb
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.c
@@ -0,0 +1,560 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_debug.h>
+#include <rte_dev.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+
+#include "rte_dmadev.h"
+#include "rte_dmadev_pmd.h"
+
+RTE_LOG_REGISTER(rte_dmadev_logtype, lib.dmadev, INFO);
+
+struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
+
+static const char *MZ_RTE_DMADEV_DATA = "rte_dmadev_data";
+/* Shared memory between primary and secondary processes. */
+static struct {
+	struct rte_dmadev_data data[RTE_DMADEV_MAX_DEVS];
+} *dmadev_shared_data;
+
+static int
+dmadev_check_name(const char *name)
+{
+	size_t name_len;
+
+	if (name == NULL) {
+		RTE_DMADEV_LOG(ERR, "Name can't be NULL\n");
+		return -EINVAL;
+	}
+
+	name_len = strnlen(name, RTE_DMADEV_NAME_MAX_LEN);
+	if (name_len == 0) {
+		RTE_DMADEV_LOG(ERR, "Zero length DMA device name\n");
+		return -EINVAL;
+	}
+	if (name_len >= RTE_DMADEV_NAME_MAX_LEN) {
+		RTE_DMADEV_LOG(ERR, "DMA device name is too long\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static uint16_t
+dmadev_find_free_dev(void)
+{
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (dmadev_shared_data->data[i].dev_name[0] == '\0') {
+			RTE_ASSERT(rte_dmadevices[i].attached == 0);
+			return i;
+		}
+	}
+
+	return RTE_DMADEV_MAX_DEVS;
+}
+
+static struct rte_dmadev*
+dmadev_allocated(const char *name)
+{
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if ((rte_dmadevices[i].attached == 1) &&
+		    (!strcmp(name, rte_dmadevices[i].data->dev_name)))
+			return &rte_dmadevices[i];
+	}
+
+	return NULL;
+}
+
+static int
+dmadev_shared_data_prepare(void)
+{
+	const struct rte_memzone *mz;
+
+	if (dmadev_shared_data == NULL) {
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			/* Allocate port data and ownership shared memory. */
+			mz = rte_memzone_reserve(MZ_RTE_DMADEV_DATA,
+					 sizeof(*dmadev_shared_data),
+					 rte_socket_id(), 0);
+		} else {
+			mz = rte_memzone_lookup(MZ_RTE_DMADEV_DATA);
+		}
+		if (mz == NULL)
+			return -ENOMEM;
+
+		dmadev_shared_data = mz->addr;
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			memset(dmadev_shared_data->data, 0,
+			       sizeof(dmadev_shared_data->data));
+	}
+
+	return 0;
+}
+
+static struct rte_dmadev *
+dmadev_allocate(const char *name)
+{
+	struct rte_dmadev *dev;
+	uint16_t dev_id;
+
+	dev = dmadev_allocated(name);
+	if (dev != NULL) {
+		RTE_DMADEV_LOG(ERR, "DMA device already allocated\n");
+		return NULL;
+	}
+
+	dev_id = dmadev_find_free_dev();
+	if (dev_id == RTE_DMADEV_MAX_DEVS) {
+		RTE_DMADEV_LOG(ERR, "Reached maximum number of DMA devices\n");
+		return NULL;
+	}
+
+	if (dmadev_shared_data_prepare() != 0) {
+		RTE_DMADEV_LOG(ERR, "Cannot allocate DMA shared data\n");
+		return NULL;
+	}
+
+	dev = &rte_dmadevices[dev_id];
+	dev->data = &dmadev_shared_data->data[dev_id];
+	dev->data->dev_id = dev_id;
+	strlcpy(dev->data->dev_name, name, sizeof(dev->data->dev_name));
+
+	return dev;
+}
+
+static struct rte_dmadev *
+dmadev_attach_secondary(const char *name)
+{
+	struct rte_dmadev *dev;
+	uint16_t i;
+
+	if (dmadev_shared_data_prepare() != 0) {
+		RTE_DMADEV_LOG(ERR, "Cannot allocate DMA shared data\n");
+		return NULL;
+	}
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (!strcmp(dmadev_shared_data->data[i].dev_name, name))
+			break;
+	}
+	if (i == RTE_DMADEV_MAX_DEVS) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %s is not driven by the primary process\n",
+			name);
+		return NULL;
+	}
+
+	dev = &rte_dmadevices[i];
+	dev->data = &dmadev_shared_data->data[i];
+	RTE_ASSERT(dev->data->dev_id == i);
+
+	return dev;
+}
+
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name)
+{
+	struct rte_dmadev *dev;
+
+	if (dmadev_check_name(name) != 0)
+		return NULL;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		dev = dmadev_allocate(name);
+	else
+		dev = dmadev_attach_secondary(name);
+
+	if (dev == NULL)
+		return NULL;
+	dev->attached = 1;
+
+	return dev;
+}
+
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (dev->attached == 0)
+		return 0;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		rte_free(dev->data->dev_private);
+		memset(dev->data, 0, sizeof(struct rte_dmadev_data));
+	}
+
+	memset(dev, 0, sizeof(struct rte_dmadev));
+	dev->attached = 0;
+
+	return 0;
+}
+
+struct rte_dmadev *
+rte_dmadev_get_device_by_name(const char *name)
+{
+	if (dmadev_check_name(name) != 0)
+		return NULL;
+	return dmadev_allocated(name);
+}
+
+bool
+rte_dmadev_is_valid_dev(uint16_t dev_id)
+{
+	if (dev_id >= RTE_DMADEV_MAX_DEVS ||
+	    rte_dmadevices[dev_id].attached == 0)
+		return false;
+	return true;
+}
+
+uint16_t
+rte_dmadev_count(void)
+{
+	uint16_t count = 0;
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (rte_dmadevices[i].attached == 1)
+			count++;
+	}
+
+	return count;
+}
+
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
+{
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
+	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
+	ret = (*dev->dev_ops->dev_info_get)(dev, dev_info);
+	if (ret != 0)
+		return ret;
+
+	dev_info->device = dev->device;
+
+	return 0;
+}
+
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
+{
+	struct rte_dmadev_info info;
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	ret = rte_dmadev_info_get(dev_id, &info);
+	if (ret != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
+		return -EINVAL;
+	}
+	if (dev_conf->max_vchans > info.max_vchans) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u configure too many vchans\n", dev_id);
+		return -EINVAL;
+	}
+	if (dev_conf->enable_mt_vchan &&
+	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MT_VCHAN)) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support MT-safe vchan\n", dev_id);
+		return -EINVAL;
+	}
+	if (dev_conf->enable_mt_multi_vchan &&
+	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN)) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support MT-safe multiple vchan\n",
+			dev_id);
+		return -EINVAL;
+	}
+
+	if (dev->data->dev_started != 0) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u must be stopped to allow configuration\n",
+			dev_id);
+		return -EBUSY;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
+	ret = (*dev->dev_ops->dev_configure)(dev, dev_conf);
+	if (ret == 0)
+		memcpy(&dev->data->dev_conf, dev_conf, sizeof(*dev_conf));
+
+	return ret;
+}
+
+int
+rte_dmadev_start(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev->data->dev_started != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u already started\n", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_start == NULL)
+		goto mark_started;
+
+	ret = (*dev->dev_ops->dev_start)(dev);
+	if (ret != 0)
+		return ret;
+
+mark_started:
+	dev->data->dev_started = 1;
+	return 0;
+}
+
+int
+rte_dmadev_stop(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev->data->dev_started == 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u already stopped\n", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_stop == NULL)
+		goto mark_stopped;
+
+	ret = (*dev->dev_ops->dev_stop)(dev);
+	if (ret != 0)
+		return ret;
+
+mark_stopped:
+	dev->data->dev_started = 0;
+	return 0;
+}
+
+int
+rte_dmadev_close(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	/* Device must be stopped before it can be closed */
+	if (dev->data->dev_started == 1) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u must be stopped before closing\n", dev_id);
+		return -EBUSY;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
+	return (*dev->dev_ops->dev_close)(dev);
+}
+
+int
+rte_dmadev_reset(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
+	/* Reset is not dependent on state of the device */
+	return (*dev->dev_ops->dev_reset)(dev);
+}
+
+int
+rte_dmadev_vchan_setup(uint16_t dev_id,
+		       const struct rte_dmadev_vchan_conf *conf)
+{
+	struct rte_dmadev_info info;
+	struct rte_dmadev *dev;
+	uint8_t dir;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
+
+	if (rte_dmadev_info_get(dev_id, &info) != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
+		return -EINVAL;
+	}
+	dir = conf->direction;
+	/* At least one direction is required; unknown bits are rejected. */
+	if (dir == 0 || (dir & ~RTE_DMA_TRANSFER_DIR_ALL) != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u direction invalid!\n", dev_id);
+		return -EINVAL;
+	}
+	/* Every requested direction needs the matching device capability. */
+	if ((dir & RTE_DMA_MEM_TO_MEM) != 0 &&
+	    (info.dev_capa & RTE_DMA_DEV_CAPA_MEM_TO_MEM) == 0) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support mem2mem transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if ((dir & RTE_DMA_MEM_TO_DEV) != 0 &&
+	    (info.dev_capa & RTE_DMA_DEV_CAPA_MEM_TO_DEV) == 0) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support mem2dev transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if ((dir & RTE_DMA_DEV_TO_MEM) != 0 &&
+	    (info.dev_capa & RTE_DMA_DEV_CAPA_DEV_TO_MEM) == 0) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support dev2mem transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if ((dir & RTE_DMA_DEV_TO_DEV) != 0 &&
+	    (info.dev_capa & RTE_DMA_DEV_CAPA_DEV_TO_DEV) == 0) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support dev2dev transfer\n", dev_id);
+		return -EINVAL;
+	}
+	/* The descriptor count must lie within the limits the device reports. */
+	if (conf->nb_desc < info.min_desc || conf->nb_desc > info.max_desc) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u number of descriptors invalid\n", dev_id);
+		return -EINVAL;
+	}
+	dev = &rte_dmadevices[dev_id];
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_setup, -ENOTSUP);
+	return (*dev->dev_ops->vchan_setup)(dev, conf);
+}
+
+int
+rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+	/* Only ids below the configured max_vchans can name a channel. */
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u vchan %u out of range\n", dev_id, vchan);
+		return -EINVAL;
+	}
+	/* Release is delegated to the driver; -ENOTSUP if it has no hook. */
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_release, -ENOTSUP);
+	return (*dev->dev_ops->vchan_release)(dev, vchan);
+}
+
+int
+rte_dmadev_stats_get(uint16_t dev_id, int vchan, struct rte_dmadev_stats *stats)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+	/* vchan is signed: -1 selects all channels, else a configured id. */
+	if (vchan < -1 || vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u vchan %d out of range\n", dev_id, vchan);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
+	return (*dev->dev_ops->stats_get)(dev, vchan, stats);
+}
+
+int
+rte_dmadev_stats_reset(uint16_t dev_id, int vchan)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+	/* vchan is signed: -1 selects all channels, else a configured id. */
+	if (vchan < -1 || vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u vchan %d out of range\n", dev_id, vchan);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
+	return (*dev->dev_ops->stats_reset)(dev, vchan);
+}
+
+/* Dump generic device information, then any driver-specific details. */
+int
+rte_dmadev_dump(uint16_t dev_id, FILE *f)
+{
+	struct rte_dmadev_info info;
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(f, -EINVAL);
+
+	if (rte_dmadev_info_get(dev_id, &info) != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
+		return -EINVAL;
+	}
+
+	/* Generic part: identity, state, capabilities and MT configuration. */
+	dev = &rte_dmadevices[dev_id];
+	fprintf(f, "DMA Dev %u, '%s' [%s]\n",
+		dev->data->dev_id,
+		dev->data->dev_name,
+		dev->data->dev_started ? "started" : "stopped");
+	fprintf(f, "  dev_capa: 0x%" PRIx64 "\n", info.dev_capa);
+	fprintf(f, "  max_vchans_supported: %u\n", info.max_vchans);
+	fprintf(f, "  max_vchans_configured: %u\n", info.nb_vchans);
+	fprintf(f, "  MT-safe-configured: vchans: %u multi-vchans: %u\n",
+		dev->data->dev_conf.enable_mt_vchan,
+		dev->data->dev_conf.enable_mt_multi_vchan);
+
+	/* The driver-specific part of the dump is optional. */
+	if (dev->dev_ops->dev_dump == NULL)
+		return 0;
+
+	return (*dev->dev_ops->dev_dump)(dev, f);
+}
+
+int
+rte_dmadev_selftest(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+	/* Unlike the other hooks, selftest is passed the device id, not dev. */
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
+	return (*dev->dev_ops->dev_selftest)(dev_id);
+}
diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
new file mode 100644
index 0000000..8779512
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.h
@@ -0,0 +1,1030 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ * Copyright(c) 2021 Intel Corporation.
+ * Copyright(c) 2021 Marvell International Ltd.
+ */
+
+#ifndef _RTE_DMADEV_H_
+#define _RTE_DMADEV_H_
+
+/**
+ * @file rte_dmadev.h
+ *
+ * RTE DMA (Direct Memory Access) device APIs.
+ *
+ * The DMA framework is built on the following model:
+ *
+ *     ---------------   ---------------       ---------------
+ *     | virtual DMA |   | virtual DMA |       | virtual DMA |
+ *     | channel     |   | channel     |       | channel     |
+ *     ---------------   ---------------       ---------------
+ *            |                |                      |
+ *            ------------------                      |
+ *                     |                              |
+ *               ------------                    ------------
+ *               |  dmadev  |                    |  dmadev  |
+ *               ------------                    ------------
+ *                     |                              |
+ *            ------------------               ------------------
+ *            | HW-DMA-channel |               | HW-DMA-channel |
+ *            ------------------               ------------------
+ *                     |                              |
+ *                     --------------------------------
+ *                                     |
+ *                           ---------------------
+ *                           | HW-DMA-Controller |
+ *                           ---------------------
+ *
+ * The DMA controller could have multiple HW-DMA-channels (aka. HW-DMA-queues),
+ * each HW-DMA-channel should be represented by a dmadev.
+ *
+ * The dmadev could create multiple virtual DMA channel, each virtual DMA
+ * channel represents a different transfer context. The DMA operation request
+ * must be submitted to the virtual DMA channel.
+ * E.G. Application could create virtual DMA channel 0 for mem-to-mem transfer
+ *      scenario, and create virtual DMA channel 1 for mem-to-dev transfer
+ *      scenario.
+ *
+ * The dmadev are dynamically allocated by rte_dmadev_pmd_allocate() during the
+ * PCI/SoC device probing phase performed at EAL initialization time. And could
+ * be released by rte_dmadev_pmd_release() during the PCI/SoC device removing
+ * phase.
+ *
+ * We use 'uint16_t dev_id' as the device identifier of a dmadev, and
+ * 'uint16_t vchan' as the virtual DMA channel identifier in one dmadev.
+ *
+ * The functions exported by the dmadev API to setup a device designated by its
+ * device identifier must be invoked in the following order:
+ *     - rte_dmadev_configure()
+ *     - rte_dmadev_vchan_setup()
+ *     - rte_dmadev_start()
+ *
+ * Then, the application can invoke dataplane APIs to process jobs.
+ *
+ * If the application wants to change the configuration (i.e. call
+ * rte_dmadev_configure()), it must call rte_dmadev_stop() first to stop the
+ * device and then do the reconfiguration before calling rte_dmadev_start()
+ * again. The dataplane APIs should not be invoked when the device is stopped.
+ *
+ * Finally, an application can close a dmadev by invoking the
+ * rte_dmadev_close() function.
+ *
+ * The dataplane APIs include two parts:
+ *   a) The first part is the submission of operation requests:
+ *        - rte_dmadev_copy()
+ *        - rte_dmadev_copy_sg() - scatter-gather form of copy
+ *        - rte_dmadev_fill()
+ *        - rte_dmadev_fill_sg() - scatter-gather form of fill
+ *        - rte_dmadev_submit() - issue doorbell to hardware
+ *      These APIs could work with different virtual DMA channels which have
+ *      different contexts.
+ *      The first four APIs are used to submit the operation request to the
+ *      virtual DMA channel, if the submission is successful, a uint16_t
+ *      ring_idx is returned, otherwise a negative number is returned.
+ *   b) The second part is to obtain the result of requests:
+ *        - rte_dmadev_completed()
+ *            - return the number of operation requests completed successfully.
+ *        - rte_dmadev_completed_fails()
+ *            - return the number of operation requests failed to complete.
+ *
+ * About the ring_idx which rte_dmadev_copy/copy_sg/fill/fill_sg() returned,
+ * the rules are as follows:
+ *   a) ring_idx for each virtual DMA channel are independent.
+ *   b) For a virtual DMA channel, the ring_idx is monotonically incremented,
+ *      when it reach UINT16_MAX, it wraps back to zero.
+ *   c) The initial ring_idx of a virtual DMA channel is zero, after the device
+ *      is stopped or reset, the ring_idx needs to be reset to zero.
+ *   Example:
+ *      step-1: start one dmadev
+ *      step-2: enqueue a copy operation, the ring_idx return is 0
+ *      step-3: enqueue a copy operation again, the ring_idx return is 1
+ *      ...
+ *      step-101: stop the dmadev
+ *      step-102: start the dmadev
+ *      step-103: enqueue a copy operation, the ring_idx return is 0
+ *      ...
+ *      step-x+0: enqueue a fill operation, the ring_idx return is 65535
+ *      step-x+1: enqueue a copy operation, the ring_idx return is 0
+ *      ...
+ *
+ * By default, all the non-dataplane functions of the dmadev API exported by a
+ * PMD are lock-free functions which assume to not be invoked in parallel on
+ * different logical cores to work on the same target object.
+ *
+ * The dataplane functions of the dmadev API exported by a PMD can be MT-safe
+ * only when supported by the driver, generally, the driver will report two
+ * capabilities:
+ *   a) Whether to support MT-safe for the submit/completion API of the same
+ *      virtual DMA channel.
+ *      E.G. one thread do submit operation, another thread do completion
+ *           operation.
+ *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VCHAN.
+ *      If driver don't support it, it's up to the application to guarantee
+ *      MT-safe.
+ *   b) Whether to support MT-safe for different virtual DMA channels.
+ *      E.G. one thread do operation on virtual DMA channel 0, another thread
+ *           do operation on virtual DMA channel 1.
+ *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
+ *      If driver don't support it, it's up to the application to guarantee
+ *      MT-safe.
+ *
+ */
+
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_DMADEV_NAME_MAX_LEN	RTE_DEV_NAME_MAX_LEN
+
+extern int rte_dmadev_logtype;
+
+#define RTE_DMADEV_LOG(level, ...) \
+	rte_log(RTE_LOG_ ## level, rte_dmadev_logtype, "" __VA_ARGS__)
+
+/* Macros to check for a valid DMA device id */
+#define RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, retval) do { \
+	if (!rte_dmadev_is_valid_dev(dev_id)) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
+		return retval; \
+	} \
+} while (0)
+
+#define RTE_DMADEV_VALID_DEV_ID_OR_RET(dev_id) do { \
+	if (!rte_dmadev_is_valid_dev(dev_id)) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
+		return; \
+	} \
+} while (0)
+
+/**
+ * @internal
+ * Validate if the DMA device index is a valid attached DMA device.
+ *
+ * @param dev_id
+ *   DMA device index.
+ *
+ * @return
+ *   - If the device index is valid (true) or not (false).
+ */
+__rte_internal
+bool
+rte_dmadev_is_valid_dev(uint16_t dev_id);
+
+/**
+ * rte_dma_sg - one element of a scatter-gather DMA operation request
+ */
+struct rte_dma_sg {
+	rte_iova_t src; /**< Source address of this element */
+	rte_iova_t dst; /**< Destination address of this element */
+	uint32_t length; /**< Length of this element (presumably in bytes) */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the total number of DMA devices that have been successfully
+ * initialised.
+ *
+ * @return
+ *   The total number of usable DMA devices.
+ */
+__rte_experimental
+uint16_t
+rte_dmadev_count(void);
+
+/**
+ * The capabilities of a DMA device
+ */
+#define RTE_DMA_DEV_CAPA_MEM_TO_MEM	(1ull << 0)
+/**< DMA device support mem-to-mem transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_MEM_TO_DEV	(1ull << 1)
+/**< DMA device support slave mode & mem-to-dev transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_DEV_TO_MEM	(1ull << 2)
+/**< DMA device support slave mode & dev-to-mem transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_DEV_TO_DEV	(1ull << 3)
+/**< DMA device support slave mode & dev-to-dev transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_OPS_COPY	(1ull << 4)
+/**< DMA device support copy ops.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_OPS_FILL	(1ull << 5)
+/**< DMA device support fill ops.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_OPS_SG		(1ull << 6)
+/**< DMA device support scatter-list ops.
+ * If device support ops_copy and ops_sg, it means supporting copy_sg ops.
+ * If device support ops_fill and ops_sg, it means supporting fill_sg ops.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_FENCE		(1ull << 7)
+/**< DMA device support fence.
+ * If the device supports fence, the application can set a fence flag when
+ * enqueuing an operation with rte_dmadev_copy/copy_sg/fill/fill_sg.
+ * If an operation has the fence flag, it means the operation must be processed
+ * only after all previous operations are completed.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_SVA		(1ull << 8)
+/**< DMA device support SVA which could use VA as DMA address.
+ * If device support SVA then application could pass any VA address like memory
+ * from rte_malloc(), rte_memzone(), malloc, stack memory.
+ * If device don't support SVA, then application should pass IOVA address which
+ * from rte_malloc(), rte_memzone().
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_MT_VCHAN	(1ull << 9)
+/**< DMA device support MT-safe of a virtual DMA channel.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN	(1ull << 10)
+/**< DMA device support MT-safe of different virtual DMA channels.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+
+/**
+ * A structure used to retrieve the contextual information of
+ * a DMA device.
+ */
+struct rte_dmadev_info {
+	struct rte_device *device; /**< Generic Device information */
+	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_*) */
+	/** Maximum number of virtual DMA channels supported */
+	uint16_t max_vchans;
+	/** Maximum allowed number of virtual DMA channel descriptors */
+	uint16_t max_desc;
+	/** Minimum allowed number of virtual DMA channel descriptors */
+	uint16_t min_desc;
+	uint16_t nb_vchans; /**< Number of virtual DMA channels configured */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve the contextual information of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param[out] dev_info
+ *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
+ *   contextual information of the device.
+ *
+ * @return
+ *   - =0: Success, driver updates the contextual information of the DMA device
+ *   - <0: Error code returned by the driver info get function.
+ *
+ */
+__rte_experimental
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
+
+/**
+ * A structure used to configure a DMA device.
+ */
+struct rte_dmadev_conf {
+	/** Maximum number of virtual DMA channel to use.
+	 * This value cannot be greater than the field 'max_vchans' of struct
+	 * rte_dmadev_info which get from rte_dmadev_info_get().
+	 */
+	uint16_t max_vchans;
+	/** Enable bit for MT-safe of a virtual DMA channel.
+	 * This bit can be enabled only when the device supports
+	 * RTE_DMA_DEV_CAPA_MT_VCHAN.
+	 * @see RTE_DMA_DEV_CAPA_MT_VCHAN
+	 */
+	uint8_t enable_mt_vchan : 1;
+	/** Enable bit for MT-safe of different virtual DMA channels.
+	 * This bit can be enabled only when the device supports
+	 * RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
+	 * @see RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN
+	 */
+	uint8_t enable_mt_multi_vchan : 1;
+	uint64_t reserved[2]; /**< Reserved for future fields */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Configure a DMA device.
+ *
+ * This function must be invoked first before any other function in the
+ * API. This function can also be re-invoked when a device is in the
+ * stopped state.
+ *
+ * @param dev_id
+ *   The identifier of the device to configure.
+ * @param dev_conf
+ *   The DMA device configuration structure encapsulated into rte_dmadev_conf
+ *   object.
+ *
+ * @return
+ *   - =0: Success, device configured.
+ *   - <0: Error code returned by the driver configuration function.
+ */
+__rte_experimental
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Start a DMA device.
+ *
+ * The device start step is the last one and consists of setting the DMA
+ * to start accepting jobs.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device started.
+ *   - <0: Error code returned by the driver start function.
+ */
+__rte_experimental
+int
+rte_dmadev_start(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Stop a DMA device.
+ *
+ * The device can be restarted with a call to rte_dmadev_start()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device stopped.
+ *   - <0: Error code returned by the driver stop function.
+ */
+__rte_experimental
+int
+rte_dmadev_stop(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Close a DMA device.
+ *
+ * The device cannot be restarted after this call.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *  - =0: Successfully close device
+ *  - <0: Failure to close device
+ */
+__rte_experimental
+int
+rte_dmadev_close(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset a DMA device.
+ *
+ * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
+ * sense similar to hard or soft reset.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Successfully reset device.
+ *   - <0: Failure to reset device.
+ *   - (-ENOTSUP): If the device doesn't support this function.
+ */
+__rte_experimental
+int
+rte_dmadev_reset(uint16_t dev_id);
+
+/**
+ * DMA transfer direction defines.
+ */
+#define RTE_DMA_MEM_TO_MEM	(1ull << 0)
+/**< DMA transfer direction - from memory to memory.
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_MEM_TO_DEV	(1ull << 1)
+/**< DMA transfer direction - slave mode & from memory to device.
+ * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
+ * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
+ * request from ARM memory to x86 host memory.
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_DEV_TO_MEM	(1ull << 2)
+/**< DMA transfer direction - slave mode & from device to memory.
+ * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
+ * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
+ * request from x86 host memory to ARM memory.
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_DEV_TO_DEV	(1ull << 3)
+/**< DMA transfer direction - slave mode & from device to device.
+ * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
+ * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
+ * request from x86 host memory to another x86 host memory.
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_TRANSFER_DIR_ALL	(RTE_DMA_MEM_TO_MEM | \
+					 RTE_DMA_MEM_TO_DEV | \
+					 RTE_DMA_DEV_TO_MEM | \
+					 RTE_DMA_DEV_TO_DEV)
+
+/**
+ * enum rte_dma_slave_port_type - slave mode type defines
+ */
+enum rte_dma_slave_port_type {
+	/** The slave port is PCIE. */
+	RTE_DMA_SLAVE_PORT_PCIE = 1,
+};
+
+/**
+ * A structure used to describe slave port parameters.
+ */
+struct rte_dma_slave_port_parameters {
+	enum rte_dma_slave_port_type port_type; /**< Type of the slave port */
+	union {
+		/** For PCIE port */
+		struct {
+			/** The physical function number which to use */
+			uint64_t pf_number : 6;
+			/** Virtual function enable bit */
+			uint64_t vf_enable : 1;
+			/** The virtual function number which to use */
+			uint64_t vf_number : 8;
+			uint64_t pasid : 20; /**< PCIe PASID (TODO confirm) */
+			/** The attributes field in TLP packet */
+			uint64_t tlp_attr : 3;
+		};
+	};
+};
+
+/**
+ * A structure used to configure a virtual DMA channel.
+ */
+struct rte_dmadev_vchan_conf {
+	uint8_t direction; /**< Bitmask of RTE_DMA_x_TO_y transfer directions */
+	/** Number of descriptors for the virtual DMA channel */
+	uint16_t nb_desc;
+	/** 1) Used to describe the dev parameter in the mem-to-dev/dev-to-mem
+	 * transfer scenario.
+	 * 2) Used to describe the src dev parameter in the dev-to-dev
+	 * transfer scenario.
+	 */
+	struct rte_dma_slave_port_parameters port;
+	/** Used to describe the dst dev parameters in the dev-to-dev
+	 * transfer scenario.
+	 */
+	struct rte_dma_slave_port_parameters peer_port;
+	uint64_t reserved[2]; /**< Reserved for future fields */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Allocate and set up a virtual DMA channel.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param conf
+ *   The virtual DMA channel configuration structure encapsulated into
+ *   rte_dmadev_vchan_conf object.
+ *
+ * @return
+ *   - >=0: Allocate success, it is the virtual DMA channel id. This value must
+ *          be less than the field 'max_vchans' of struct rte_dmadev_conf
+ *          which is configured by rte_dmadev_configure().
+ *   - <0: Error code returned by the driver virtual channel setup function.
+ */
+__rte_experimental
+int
+rte_dmadev_vchan_setup(uint16_t dev_id,
+		       const struct rte_dmadev_vchan_conf *conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a virtual DMA channel.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel which return by vchan setup.
+ *
+ * @return
+ *   - =0: Successfully release the virtual DMA channel.
+ *   - <0: Error code returned by the driver virtual channel release function.
+ */
+__rte_experimental
+int
+rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);
+
+/**
+ * rte_dmadev_stats - running statistics.
+ */
+struct rte_dmadev_stats {
+	/** Count of operations which were successfully enqueued */
+	uint64_t enqueued_count;
+	/** Count of operations which were submitted to hardware */
+	uint64_t submitted_count;
+	/** Count of operations which failed to complete */
+	uint64_t completed_fail_count;
+	/** Count of operations which successfully complete */
+	uint64_t completed_count;
+	uint64_t reserved[4]; /**< Reserved for future fields */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve basic statistics of a or all virtual DMA channel(s).
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel, -1 means all channels.
+ * @param[out] stats
+ *   The basic statistics structure encapsulated into rte_dmadev_stats
+ *   object.
+ *
+ * @return
+ *   - =0: Successfully retrieve stats.
+ *   - <0: Failure to retrieve stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_get(uint16_t dev_id, int vchan,
+		     struct rte_dmadev_stats *stats);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset basic statistics of a or all virtual DMA channel(s).
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel, -1 means all channels.
+ *
+ * @return
+ *   - =0: Successfully reset stats.
+ *   - <0: Failure to reset stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_reset(uint16_t dev_id, int vchan);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Dump DMA device info.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param f
+ *   The file to write the output to.
+ *
+ * @return
+ *   0 on success. Non-zero otherwise.
+ */
+__rte_experimental
+int
+rte_dmadev_dump(uint16_t dev_id, FILE *f);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger the dmadev self test.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - 0: Selftest successful.
+ *   - -ENOTSUP if the device doesn't support selftest
+ *   - other values < 0 on failure.
+ */
+__rte_experimental
+int
+rte_dmadev_selftest(uint16_t dev_id);
+
+#include "rte_dmadev_core.h"
+
+/**
+ *  DMA flags to augment operation preparation.
+ *  Used as the 'flags' parameter of rte_dmadev_copy/copy_sg/fill/fill_sg.
+ */
+#define RTE_DMA_FLAG_FENCE	(1ull << 0)
+/**< DMA fence flag
+ * It means the operation with this flag must be processed only after all
+ * previous operations are completed.
+ *
+ * @see rte_dmadev_copy()
+ * @see rte_dmadev_copy_sg()
+ * @see rte_dmadev_fill()
+ * @see rte_dmadev_fill_sg()
+ */
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a copy operation onto the virtual DMA channel.
+ *
+ * This queues up a copy operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param src
+ *   The address of the source buffer.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the data to be copied.
+ * @param flags
+ *   Flags for this operation, e.g. RTE_DMA_FLAG_FENCE.
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued copy job.
+ *   - <0: Error code returned by the driver copy function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_copy(uint16_t dev_id, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks exist in debug builds only */
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->copy, -ENOTSUP);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return -EINVAL;
+	}
+#endif /* RTE_DMADEV_DEBUG */
+	return (*dev->copy)(dev, vchan, src, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list copy operation onto the virtual DMA channel.
+ *
+ * This queues up a scatter list copy operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param sg
+ *   The pointer to the scatterlist.
+ * @param sg_len
+ *   The number of scatterlist elements.
+ * @param flags
+ *   Flags for this operation, e.g. RTE_DMA_FLAG_FENCE.
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued copy job.
+ *   - <0: Error code returned by the driver copy function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vchan, const struct rte_dma_sg *sg,
+		   uint32_t sg_len, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks exist in debug builds only */
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(sg, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->copy_sg, -ENOTSUP);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return -EINVAL;
+	}
+#endif /* RTE_DMADEV_DEBUG */
+	return (*dev->copy_sg)(dev, vchan, sg, sg_len, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a fill operation onto the virtual DMA channel.
+ *
+ * This queues up a fill operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the destination buffer.
+ * @param flags
+ *   Flags for this operation, e.g. RTE_DMA_FLAG_FENCE.
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued fill job.
+ *   - <0: Error code returned by the driver fill function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
+		rte_iova_t dst, uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks exist in debug builds only */
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return -EINVAL;
+	}
+#endif /* RTE_DMADEV_DEBUG */
+	return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list fill operation onto the virtual DMA channel.
+ *
+ * This queues up a scatter list fill operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param sg
+ *   The pointer to the scatterlist.
+ * @param sg_len
+ *   The number of scatterlist elements.
+ * @param flags
+ *   Flags for this operation, e.g. RTE_DMA_FLAG_FENCE.
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued fill job.
+ *   - <0: Error code returned by the driver fill_sg function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
+		   const struct rte_dma_sg *sg, uint32_t sg_len,
+		   uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks exist in debug builds only */
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(sg, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->fill_sg, -ENOTSUP);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return -EINVAL;
+	}
+#endif /* RTE_DMADEV_DEBUG */
+	return (*dev->fill_sg)(dev, vchan, pattern, sg, sg_len, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger hardware to begin performing enqueued operations.
+ *
+ * This API writes the "doorbell" to the hardware so that it begins the
+ * operations previously enqueued by rte_dmadev_copy/copy_sg/fill/fill_sg().
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ *
+ * @return
+ *   - =0: Successfully trigger hardware.
+ *   - <0: Failure to trigger hardware.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_submit(uint16_t dev_id, uint16_t vchan)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks exist in debug builds only */
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->submit, -ENOTSUP);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return -EINVAL;
+	}
+#endif /* RTE_DMADEV_DEBUG */
+	return (*dev->submit)(dev, vchan);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that have been successfully completed.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param nb_cpls
+ *   The maximum number of completed operations that can be processed.
+ * @param[out] last_idx
+ *   The last completed operation's index.
+ *   If not required, NULL can be passed in.
+ * @param[out] has_error
+ *   Indicates whether a transfer error occurred.
+ *   If not required, NULL can be passed in.
+ *
+ * @return
+ *   Number of successfully completed operations; 0 if a debug check fails.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed(uint16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
+		     uint16_t *last_idx, bool *has_error)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	uint16_t idx;
+	bool err;
+
+#ifdef RTE_DMADEV_DEBUG
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, 0);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->completed, 0);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return 0; /* uint16_t return type cannot carry -EINVAL */
+	}
+	if (nb_cpls == 0) {
+		RTE_DMADEV_LOG(ERR, "Invalid nb_cpls\n");
+		return 0;
+	}
+#endif
+
+	/* Ensure the pointer values are non-null to simplify drivers.
+	 * In most cases these should be compile time evaluated, since this is
+	 * an inline function.
+	 * - If NULL is explicitly passed as parameter, then compiler knows the
+	 *   value is NULL
+	 * - If address of local variable is passed as parameter, then compiler
+	 *   can know it's non-NULL.
+	 */
+	if (last_idx == NULL)
+		last_idx = &idx;
+	if (has_error == NULL)
+		has_error = &err;
+
+	*has_error = false;
+	return (*dev->completed)(dev, vchan, nb_cpls, last_idx, has_error);
+}
+
+/**
+ * DMA transfer status code defines
+ */
+enum rte_dma_status_code {
+	/** The operation completed successfully */
+	RTE_DMA_STATUS_SUCCESSFUL = 0,
+	/** The operation failed to complete due to active drop
+	 * This is mainly used when processing dev_stop, allow outstanding
+	 * requests to be completed as much as possible.
+	 */
+	RTE_DMA_STATUS_ACTIVE_DROP,
+	/** The operation failed to complete due to invalid source address */
+	RTE_DMA_STATUS_INVALID_SRC_ADDR,
+	/** The operation failed to complete due to invalid destination address */
+	RTE_DMA_STATUS_INVALID_DST_ADDR,
+	/** The operation failed to complete due to invalid length */
+	RTE_DMA_STATUS_INVALID_LENGTH,
+	/** The operation failed to complete due to invalid opcode
+	 * The DMA descriptor could have multiple formats, which are
+	 * distinguished by the opcode field.
+	 */
+	RTE_DMA_STATUS_INVALID_OPCODE,
+	/** The operation failed to complete due to bus error */
+	RTE_DMA_STATUS_BUS_ERROR,
+	/** The operation failed to complete due to data poison */
+	RTE_DMA_STATUS_DATA_POISION,
+	/** The operation failed to complete due to descriptor read error */
+	RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR,
+	/** The operation failed to complete due to device link error
+	 * Used to indicate a link error in the mem-to-dev/dev-to-mem/
+	 * dev-to-dev transfer scenario.
+	 */
+	RTE_DMA_STATUS_DEV_LINK_ERROR,
+	/** Driver specific status code offset
+	 * Start status code for the driver to define its own error code.
+	 */
+	RTE_DMA_STATUS_DRV_SPECIFIC_OFFSET = 0x10000,
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that failed to complete.
+ * NOTE: This API is used when rte_dmadev_completed has_error was set.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param nb_status
+ *   Indicates the size of status array.
+ * @param[out] status
+ *   The error codes of operations that failed to complete.
+ *   Some standard error codes are described in 'enum rte_dma_status_code'
+ *   @see rte_dma_status_code
+ * @param[out] last_idx
+ *   The last failed completed operation's index.
+ *
+ * @return
+ *   Number of operations that failed to complete; 0 if a debug check fails.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
+			   const uint16_t nb_status, uint32_t *status,
+			   uint16_t *last_idx)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, 0);
+	RTE_FUNC_PTR_OR_ERR_RET(status, 0);
+	RTE_FUNC_PTR_OR_ERR_RET(last_idx, 0);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->completed_fails, 0);
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
+		return 0; /* uint16_t return type cannot carry -EINVAL */
+	}
+	if (nb_status == 0) {
+		RTE_DMADEV_LOG(ERR, "Invalid nb_status\n");
+		return 0;
+	}
+#endif
+	return (*dev->completed_fails)(dev, vchan, nb_status, status, last_idx);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_H_ */
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
new file mode 100644
index 0000000..410faf0
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#ifndef _RTE_DMADEV_CORE_H_
+#define _RTE_DMADEV_CORE_H_
+
+/**
+ * @file
+ *
+ * RTE DMA Device internal header.
+ *
+ * This header contains internal data types, that are used by the DMA devices
+ * in order to expose their ops to the class.
+ *
+ * Applications should not use these API directly.
+ *
+ */
+
+struct rte_dmadev;
+
+/** @internal Used to get device information of a device. */
+typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
+				 struct rte_dmadev_info *dev_info);
+
+/** @internal Used to configure a device. */
+typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
+				  const struct rte_dmadev_conf *dev_conf);
+
+/** @internal Used to start a configured device. */
+typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
+
+/** @internal Used to stop a configured device. */
+typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
+
+/** @internal Used to close a configured device. */
+typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
+
+/** @internal Used to reset a configured device. */
+typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
+
+/** @internal Used to allocate and set up a virtual DMA channel. */
+typedef int (*dmadev_vchan_setup_t)(struct rte_dmadev *dev,
+				    const struct rte_dmadev_vchan_conf *conf);
+
+/** @internal Used to release a virtual DMA channel. */
+typedef int (*dmadev_vchan_release_t)(struct rte_dmadev *dev, uint16_t vchan);
+
+/** @internal Used to retrieve basic statistics. */
+typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vchan,
+				  struct rte_dmadev_stats *stats);
+
+/** @internal Used to reset basic statistics. */
+typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vchan);
+
+/** @internal Used to dump internal information. */
+typedef int (*dmadev_dump_t)(struct rte_dmadev *dev, FILE *f);
+
+/** @internal Used to start dmadev selftest. */
+typedef int (*dmadev_selftest_t)(uint16_t dev_id);
+
+/** @internal Used to enqueue a copy operation. */
+typedef int (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vchan,
+			     rte_iova_t src, rte_iova_t dst,
+			     uint32_t length, uint64_t flags);
+
+/** @internal Used to enqueue a scatter list copy operation. */
+typedef int (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
+				const struct rte_dma_sg *sg,
+				uint32_t sg_len, uint64_t flags);
+
+/** @internal Used to enqueue a fill operation. */
+typedef int (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vchan,
+			     uint64_t pattern, rte_iova_t dst,
+			     uint32_t length, uint64_t flags);
+
+/** @internal Used to enqueue a scatter list fill operation. */
+typedef int (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
+			uint64_t pattern, const struct rte_dma_sg *sg,
+			uint32_t sg_len, uint64_t flags);
+
+/** @internal Used to trigger hardware to begin working. */
+typedef int (*dmadev_submit_t)(struct rte_dmadev *dev, uint16_t vchan);
+
+/** @internal Used to return number of successful completed operations. */
+typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vchan,
+				       const uint16_t nb_cpls,
+				       uint16_t *last_idx, bool *has_error);
+
+/** @internal Used to return number of failed completed operations. */
+typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
+			uint16_t vchan, const uint16_t nb_status,
+			uint32_t *status, uint16_t *last_idx);
+
+/**
+ * DMA device operations function pointer table
+ */
+struct rte_dmadev_ops {
+	dmadev_info_get_t dev_info_get; /**< Get device information. */
+	dmadev_configure_t dev_configure; /**< Configure a device. */
+	dmadev_start_t dev_start; /**< Start a configured device. */
+	dmadev_stop_t dev_stop; /**< Stop a configured device. */
+	dmadev_close_t dev_close; /**< Close a configured device. */
+	dmadev_reset_t dev_reset; /**< Reset a configured device. */
+	dmadev_vchan_setup_t vchan_setup; /**< Set up a virtual channel. */
+	dmadev_vchan_release_t vchan_release; /**< Release a virtual channel. */
+	dmadev_stats_get_t stats_get; /**< Retrieve basic statistics. */
+	dmadev_stats_reset_t stats_reset; /**< Reset basic statistics. */
+	dmadev_dump_t dev_dump; /**< Dump internal information. */
+	dmadev_selftest_t dev_selftest; /**< Start dmadev selftest. */
+};
+
+/**
+ * @internal
+ * The data part, with no function pointers, associated with each DMA device.
+ *
+ * This structure is safe to place in shared memory to be common among different
+ * processes in a multi-process configuration.
+ */
+struct rte_dmadev_data {
+	uint16_t dev_id; /**< Device [external] identifier. */
+	char dev_name[RTE_DMADEV_NAME_MAX_LEN]; /**< Unique identifier name */
+	void *dev_private; /**< PMD-specific private data. */
+	struct rte_dmadev_conf dev_conf; /**< DMA device configuration. */
+	uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
+	uint64_t reserved[4]; /**< Reserved for future fields */
+} __rte_cache_aligned;
+
+/**
+ * @internal
+ * The generic data structure associated with each DMA device.
+ *
+ * The dataplane function pointers are located at the beginning of the
+ * structure; the 'data' member points to where all the data elements for
+ * the particular device are stored in shared memory. This split scheme
+ * allows the function pointers and driver data to be per-process, while the
+ * actual configuration data for the device is shared.
+ */
+struct rte_dmadev {
+	dmadev_copy_t copy; /**< Enqueue a copy operation. */
+	dmadev_copy_sg_t copy_sg; /**< Enqueue a scatter list copy. */
+	dmadev_fill_t fill; /**< Enqueue a fill operation. */
+	dmadev_fill_sg_t fill_sg; /**< Enqueue a scatter list fill. */
+	dmadev_submit_t submit; /**< Trigger hardware to begin working. */
+	dmadev_completed_t completed; /**< Count successful completions. */
+	dmadev_completed_fails_t completed_fails; /**< Count failed ones. */
+	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD. */
+	/** Flag indicating the device is attached: ATTACHED(1)/DETACHED(0). */
+	uint8_t attached : 1;
+	/** Device info which supplied during device initialization. */
+	struct rte_device *device;
+	struct rte_dmadev_data *data; /**< Pointer to device data. */
+	uint64_t reserved[4]; /**< Reserved for future fields */
+} __rte_cache_aligned;
+
+extern struct rte_dmadev rte_dmadevices[];
+
+#endif /* _RTE_DMADEV_CORE_H_ */
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
new file mode 100644
index 0000000..45141f9
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_PMD_H_
+#define _RTE_DMADEV_PMD_H_
+
+/**
+ * @file
+ *
+ * RTE DMA Device PMD APIs
+ *
+ * Driver facing APIs for a DMA device. These are not to be called directly by
+ * any application.
+ */
+
+#include "rte_dmadev.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @internal
+ * Allocates a new dmadev slot for a DMA device and returns the pointer
+ * to that slot for the driver to use.
+ *
+ * @param name
+ *   DMA device name.
+ *
+ * @return
+ *   A pointer to the DMA device slot in case of success,
+ *   NULL otherwise.
+ */
+__rte_internal
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name);
+
+/**
+ * @internal
+ * Release the specified dmadev.
+ *
+ * @param dev
+ *   Device to be released.
+ *
+ * @return
+ *   - 0 on success, negative on error
+ */
+__rte_internal
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev);
+
+/**
+ * @internal
+ * Return the DMA device based on the device name.
+ *
+ * @param name
+ *   DMA device name.
+ *
+ * @return
+ *   A pointer to the DMA device slot in case of success,
+ *   NULL otherwise.
+ */
+__rte_internal
+struct rte_dmadev *
+rte_dmadev_get_device_by_name(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_PMD_H_ */
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
new file mode 100644
index 0000000..0f099e7
--- /dev/null
+++ b/lib/dmadev/version.map
@@ -0,0 +1,40 @@
+EXPERIMENTAL {
+	global:
+
+	rte_dmadev_count;
+	rte_dmadev_info_get;
+	rte_dmadev_configure;
+	rte_dmadev_start;
+	rte_dmadev_stop;
+	rte_dmadev_close;
+	rte_dmadev_reset;
+	rte_dmadev_vchan_setup;
+	rte_dmadev_vchan_release;
+	rte_dmadev_stats_get;
+	rte_dmadev_stats_reset;
+	rte_dmadev_dump;
+	rte_dmadev_selftest;
+	rte_dmadev_copy;
+	rte_dmadev_copy_sg;
+	rte_dmadev_fill;
+	rte_dmadev_fill_sg;
+	rte_dmadev_submit;
+	rte_dmadev_completed;
+	rte_dmadev_completed_fails;
+
+	local: *;
+};
+
+INTERNAL {
+	global:
+
+	rte_dmadevices;
+	rte_dmadev_pmd_allocate;
+	rte_dmadev_pmd_release;
+	rte_dmadev_get_device_by_name;
+
+	# Called from the debug checks of the inline dataplane functions in
+	# the public header, so it must be exported rather than made local.
+	rte_dmadev_is_valid_dev;
+};
+
diff --git a/lib/meson.build b/lib/meson.build
index 1673ca4..68d239f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -60,6 +60,7 @@ libraries = [
         'bpf',
         'graph',
         'node',
+        'dmadev',
 ]
 
 if is_windows
-- 
2.8.1


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
@ 2021-07-11  9:42   ` fengchengwen
  2021-07-11 13:34     ` Jerin Jacob
  2021-07-11 14:25   ` Jerin Jacob
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-11  9:42 UTC (permalink / raw)
  To: thomas, ferruh.yigit, bruce.richardson, jerinj, jerinjacobk
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

Note:
1) This patch keeps the dmadev <> vchan layering; I think a vchan can be
   conceptually well separated from a hw-channel.
2) I could not understand struct dpi_dma_queue_ctx_s, so in this patch I
   defined rte_dma_slave_port_parameters with reference to the Kunpeng DMA
   implementation.
3) This patch doesn't include the doxygen-related files because I failed to
   generate the docs in my environment. Could that be upstreamed as a
   separate patch, or must it be solved first?

Feedback welcome, thanks

On 2021/7/11 17:25, Chengwen Feng wrote:
> This patch introduce 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:42   ` fengchengwen
@ 2021-07-11 13:34     ` Jerin Jacob
  2021-07-12  7:40       ` Morten Brørup
  0 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-11 13:34 UTC (permalink / raw)
  To: fengchengwen
  Cc: Thomas Monjalon, Ferruh Yigit, Richardson, Bruce, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Sun, Jul 11, 2021 at 3:12 PM fengchengwen <fengchengwen@huawei.com> wrote:
>
> Note:
> 1) This patch hold dmadev <> vchan layer, I think vchan can be very
>    conceptually separated from hw-channel.

I would like to keep it as channel instead of virtual channel as it is
implementation-specific.
No strong opinion on this? @Richardson, Bruce  @Morten Brørup  thoughts

> 2) I could not under struct dpi_dma_queue_ctx_s, so this patch I define
>    the rte_dma_slave_port_parameters refer to Kunpeng DMA implemention.
> 3) This patch hasn't include doxy related file because failed to generate
>    a doc in my environment, could this upstream as a new patch or must
>    solved ?

No, IMO. The final version that is to be merged should be split into patches like:

1) Header file with doxygen comments
2) Multiple patches for implementation as needed
3) Programmer  guide doc

Other items, Typically we will have per new device class.

1) Skeleton driver (can use memcpy in this case)
2) app/test-dmadev kind of application.(Can be used to measure
performance and functionality)

>
> Feedback welcome, thanks
>
> On 2021/7/11 17:25, Chengwen Feng wrote:
> > This patch introduce 'dmadevice' which is a generic type of DMA
> > device.
> >
> > The APIs of dmadev library exposes some generic operations which can
> > enable configuration and I/O with the DMA devices.
> >
> > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> > ---
>

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
  2021-07-11  9:42   ` fengchengwen
@ 2021-07-11 14:25   ` Jerin Jacob
  2021-07-12  7:15   ` Morten Brørup
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 339+ messages in thread
From: Jerin Jacob @ 2021-07-11 14:25 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: Thomas Monjalon, Ferruh Yigit, Richardson, Bruce, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Sun, Jul 11, 2021 at 2:59 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>
> This patch introduce 'dmadevice' which is a generic type of DMA
> device.
>
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
>
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..8779512
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,1030 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + * Copyright(c) 2021 Intel Corporation.
> + * Copyright(c) 2021 Marvell International Ltd.
> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The DMA framework is built on the following model:
> + *
> + *     ---------------   ---------------       ---------------
> + *     | virtual DMA |   | virtual DMA |       | virtual DMA |
> + *     | channel     |   | channel     |       | channel     |
> + *     ---------------   ---------------       ---------------
> + *            |                |                      |
> + *            ------------------                      |
> + *                     |                              |
> + *               ------------                    ------------
> + *               |  dmadev  |                    |  dmadev  |
> + *               ------------                    ------------
> + *                     |                              |
> + *            ------------------               ------------------
> + *            | HW-DMA-channel |               | HW-DMA-channel |
> + *            ------------------               ------------------
> + *                     |                              |
> + *                     --------------------------------
> + *                                     |
> + *                           ---------------------
> + *                           | HW-DMA-Controller |
> + *                           ---------------------
> + *
> + * The DMA controller could have multilpe HW-DMA-channels (aka. HW-DMA-queues),

Typo - multiple

> + * each HW-DMA-channel should be represented by a dmadev.
> + *
> + * The dmadev could create multiple virtual DMA channel, each virtual DMA
> + * channel represents a different transfer context. The DMA operation request
> + * must be submitted to the virtual DMA channel.
> + * E.G. Application could create virtual DMA channel 0 for mem-to-mem transfer
> + *      scenario, and create virtual DMA channel 1 for mem-to-dev transfer
> + *      scenario.
> + *
> + * The dmadev are dynamically allocated by rte_dmadev_pmd_allocate() during the
> + * PCI/SoC device probing phase performed at EAL initialization time. And could
> + * be released by rte_dmadev_pmd_release() during the PCI/SoC device removing
> + * phase.
> + *
> + * We use 'uint16_t dev_id' as the device identifier of a dmadev, and

Please remove "We use" and reword accordingly.

> + * 'uint16_t vchan' as the virtual DMA channel identifier in one dmadev.
> + *
> + * The functions exported by the dmadev API to setup a device designated by its
> + * device identifier must be invoked in the following order:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_vchan_setup()
> + *     - rte_dmadev_start()
> + *
> + * Then, the application can invoke dataplane APIs to process jobs.
> + *
> + * If the application wants to change the configuration (i.e. call
> + * rte_dmadev_configure()), it must call rte_dmadev_stop() first to stop the
> + * device and then do the reconfiguration before calling rte_dmadev_start()
> + * again. The dataplane APIs should not be invoked when the device is stopped.
> + *
> + * Finally, an application can close a dmadev by invoking the
> + * rte_dmadev_close() function.
> + *
> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virtual DMA channels which have
> + *      different contexts.
> + *      The first four APIs are used to submit the operation request to the
> + *      virtual DMA channel, if the submission is successful, a uint16_t
> + *      ring_idx is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * About the ring_idx which rte_dmadev_copy/copy_sg/fill/fill_sg() returned,
> + * the rules are as follows:
> + *   a) ring_idx for each virtual DMA channel are independent.
> + *   b) For a virtual DMA channel, the ring_idx is monotonically incremented,
> + *      when it reach UINT16_MAX, it wraps back to zero.
> + *   c) The initial ring_idx of a virtual DMA channel is zero, after the device
> + *      is stopped or reset, the ring_idx needs to be reset to zero.
> + *   Example:
> + *      step-1: start one dmadev
> + *      step-2: enqueue a copy operation, the ring_idx return is 0
> + *      step-3: enqueue a copy operation again, the ring_idx return is 1
> + *      ...
> + *      step-101: stop the dmadev
> + *      step-102: start the dmadev
> + *      step-103: enqueue a copy operation, the cookie return is 0
> + *      ...
> + *      step-x+0: enqueue a fill operation, the ring_idx return is 65535
> + *      step-x+1: enqueue a copy operation, the ring_idx return is 0
> + *      ...
> + *
> + * By default, all the non-dataplane functions of the dmadev API exported by a
> + * PMD are lock-free functions which assume to not be invoked in parallel on
> + * different logical cores to work on the same target object.
> + *
> + * The dataplane functions of the dmadev API exported by a PMD can be MT-safe
> + * only when supported by the driver, generally, the driver will reports two
> + * capabilities:
> + *   a) Whether to support MT-safe for the submit/completion API of the same
> + *      virtual DMA channel.
> + *      E.G. one thread do submit operation, another thread do completion
> + *           operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VCHAN.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) Whether to support MT-safe for different virtual DMA channels.
> + *      E.G. one thread do operation on virtual DMA channel 0, another thread
> + *           do operation on virtual DMA channel 1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.

The above cases make it difficult to write applications. Also,
application-level locking using a spinlock etc. may not be the best
optimization (if a driver can do it in a better way).
IMO, as discussed with @Morten Brørup, it is better if the application
can specify, while configuring the channel, whether it "needs" MT safety,
so that the application stays portable. Unlike network or crypto devices,
since we are creating virtual channels, such a scheme is useful.


> + *
> + */
> +
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_errno.h>
> +#include <rte_memory.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#define RTE_DMADEV_NAME_MAX_LEN        RTE_DEV_NAME_MAX_LEN
> +
> +extern int rte_dmadev_logtype;
> +

Missing Doxygen comment

> +#define RTE_DMADEV_LOG(level, ...) \
> +       rte_log(RTE_LOG_ ## level, rte_dmadev_logtype, "" __VA_ARGS__)
> +

Make it as internal.

> +/* Macros to check for valid port */
> +#define RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, retval) do { \
> +       if (!rte_dmadev_is_valid_dev(dev_id)) { \
> +               RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
> +               return retval; \
> +       } \
> +} while (0)
> +

Make it as internal.

> +#define RTE_DMADEV_VALID_DEV_ID_OR_RET(dev_id) do { \
> +       if (!rte_dmadev_is_valid_dev(dev_id)) { \
> +               RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
> +               return; \
> +       } \
> +} while (0)
> +
> +/**
> + * @internal

We can make it as a public API.

> + * Validate if the DMA device index is a valid attached DMA device.
> + *
> + * @param dev_id
> + *   DMA device index.
> + *
> + * @return
> + *   - If the device index is valid (true) or not (false).
> + */
> +__rte_internal
> +bool
> +rte_dmadev_is_valid_dev(uint16_t dev_id);
> +
> +/**
> + * rte_dma_sg - can hold scatter DMA operation request
> + */
> +struct rte_dma_sg {
> +       rte_iova_t src;
> +       rte_iova_t dst;

IMO, it should be only rte_iova_t addr. See comment at  _sg API

> +       uint32_t length;
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the total number of DMA devices that have been successfully
> + * initialised.
> + *
> + * @return
> + *   The total number of usable DMA devices.
> + */
> +__rte_experimental
> +uint16_t
> +rte_dmadev_count(void);
> +
> +/**
> + * The capabilities of a DMA device
> + */
> +#define RTE_DMA_DEV_CAPA_MEM_TO_MEM    (1ull << 0)
> +/**< DMA device support mem-to-mem transfer.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_MEM_TO_DEV    (1ull << 1)
> +/**< DMA device support slave mode & mem-to-dev transfer.

Do we need to say slave mode? Just mem to dev is fine. Right?

> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_DEV_TO_MEM    (1ull << 2)
> +/**< DMA device support slave mode & dev-to-mem transfer.

See above.

> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_DEV_TO_DEV    (1ull << 3)
> +/**< DMA device support slave mode & dev-to-dev transfer.

See above.

> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_OPS_COPY      (1ull << 4)
> +/**< DMA device support copy ops.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_OPS_FILL      (1ull << 5)
> +/**< DMA device support fill ops.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_OPS_SG                (1ull << 6)
> +/**< DMA device support scatter-list ops.
> + * If device support ops_copy and ops_sg, it means supporting copy_sg ops.
> + * If device support ops_fill and ops_sg, it means supporting fill_sg ops.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_FENCE         (1ull << 7)
> +/**< DMA device support fence.
> + * If device support fence, then application could set a fence flags when
> + * enqueue operation by rte_dma_copy/copy_sg/fill/fill_sg.
> + * If a operation has a fence flags, it means the operation must be processed
> + * only after all previous operations are completed.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_SVA           (1ull << 8)
> +/**< DMA device support SVA which could use VA as DMA address.
> + * If device support SVA then application could pass any VA address like memory
> + * from rte_malloc(), rte_memzone(), malloc, stack memory.
> + * If device don't support SVA, then application should pass IOVA address which
> + * from rte_malloc(), rte_memzone().
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_MT_VCHAN      (1ull << 9)
> +/**< DMA device support MT-safe of a virtual DMA channel.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN        (1ull << 10)
> +/**< DMA device support MT-safe of different virtual DMA channels.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +       struct rte_device *device; /**< Generic Device information */
> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +       /** Maximum number of virtual DMA channels supported */
> +       uint16_t max_vchans;

Doxygen comment should come after the symbol. Not above.

> +       /** Maximum allowed number of virtual DMA channel descriptors */
> +       uint16_t max_desc;
> +       /** Minimum allowed number of virtual DMA channel descriptors */
> +       uint16_t min_desc;
> +       uint16_t nb_vchans; /**< Number of virtual DMA channel configured */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve the contextual information of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param[out] dev_info
> + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> + *   contextual information of the device.
> + *
> + * @return
> + *   - =0: Success, driver updates the contextual information of the DMA device
> + *   - <0: Error code returned by the driver info get function.
> + *
> + */
> +__rte_experimental
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +       /** Maximum number of virtual DMA channel to use.
> +        * This value cannot be greater than the field 'max_vchans' of struct
> +        * rte_dmadev_info which get from rte_dmadev_info_get().
> +        */
> +       uint16_t max_vchans;
> +       /** Enable bit for MT-safe of a virtual DMA channel.
> +        * This bit can be enabled only when the device supports
> +        * RTE_DMA_DEV_CAPA_MT_VCHAN.
> +        * @see RTE_DMA_DEV_CAPA_MT_VCHAN
> +        */
> +       uint8_t enable_mt_vchan : 1;
> +       /** Enable bit for MT-safe of different virtual DMA channels.
> +        * This bit can be enabled only when the device supports
> +        * RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
> +        * @see RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN

I think, we can support this even if the flag is not supported to limit the
application fastpath options.

> +        */
> +       uint8_t enable_mt_multi_vchan : 1;
> +       uint64_t reserved[2]; /**< Reserved for future fields */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Configure a DMA device.
> + *
> + * This function must be invoked first before any other function in the
> + * API. This function can also be re-invoked when a device is in the
> + * stopped state.
> + *
> + * @param dev_id
> + *   The identifier of the device to configure.
> + * @param dev_conf
> + *   The DMA device configuration structure encapsulated into rte_dmadev_conf
> + *   object.
> + *
> + * @return
> + *   - =0: Success, device configured.
> + *   - <0: Error code returned by the driver configuration function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Start a DMA device.
> + *
> + * The device start step is the last one and consists of setting the DMA
> + * to start accepting jobs.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device started.
> + *   - <0: Error code returned by the driver start function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_start(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Stop a DMA device.
> + *
> + * The device can be restarted with a call to rte_dmadev_start()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device stopped.
> + *   - <0: Error code returned by the driver stop function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stop(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Close a DMA device.
> + *
> + * The device cannot be restarted after this call.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *  - =0: Successfully close device
> + *  - <0: Failure to close device
> + */
> +__rte_experimental
> +int
> +rte_dmadev_close(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset a DMA device.
> + *
> + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
> + * sense similar to hard or soft reset.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Successfully reset device.
> + *   - <0: Failure to reset device.
> + *   - (-ENOTSUP): If the device doesn't support this function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_reset(uint16_t dev_id);

Is this required now?

> +
> +/**
> + * DMA transfer direction defines.
> + */
> +#define RTE_DMA_MEM_TO_MEM     (1ull << 0)

RTE_DMA_DIRECTION_...

> +/**< DMA transfer direction - from memory to memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_MEM_TO_DEV     (1ull << 1)
> +/**< DMA transfer direction - slave mode & from memory to device.
> + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> + * request from ARM memory to x86 host memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_DEV_TO_MEM     (1ull << 2)
> +/**< DMA transfer direction - slave mode & from device to memory.
> + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> + * request from x86 host memory to ARM memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_DEV_TO_DEV     (1ull << 3)
> +/**< DMA transfer direction - slave mode & from device to device.
> + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> + * request from x86 host memory to another x86 host memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_TRANSFER_DIR_ALL       (RTE_DMA_MEM_TO_MEM | \
> +                                        RTE_DMA_MEM_TO_DEV | \
> +                                        RTE_DMA_DEV_TO_MEM | \
> +                                        RTE_DMA_DEV_TO_DEV)
> +
> +/**
> + * enum rte_dma_slave_port_type - slave mode type defines
> + */
> +enum rte_dma_slave_port_type {

I think, rte_dmadev_dev_type

> +       /** The slave port is PCIE. */
> +       RTE_DMA_SLAVE_PORT_PCIE = 1,
> +};
> +
> +/**
> + * A structure used to descript slave port parameters.*
> + */
> +struct rte_dma_slave_port_parameters {

May be rte_dmadev_dev_conf?

> +       enum rte_dma_slave_port_type port_type;
> +       union {
> +               /** For PCIE port */
> +               struct {
> +                       /** The physical function number which to use */
> +                       uint64_t pf_number : 6;
> +                       /** Virtual function enable bit */
> +                       uint64_t vf_enable : 1;
> +                       /** The virtual function number which to use */
> +                       uint64_t vf_number : 8;
> +                       uint64_t pasid : 20;
> +                       /** The attributes filed in TLP packet */
> +                       uint64_t tlp_attr : 3;
> +               };
> +       };
> +};
> +
> +/**
> + * A structure used to configure a virtual DMA channel.
> + */
> +struct rte_dmadev_vchan_conf {
> +       uint8_t direction; /**< Set of supported transfer directions */
We could add @see RTE_DMA_DIRECTION_*.
Also, say how the application can know the valid flags, i.e. point to the info structure.


> +       /** Number of descriptor for the virtual DMA channel */
> +       uint16_t nb_desc;
> +       /** 1) Used to describes the dev parameter in the mem-to-dev/dev-to-mem
> +        * transfer scenario.
> +        * 2) Used to describes the src dev parameter in the dev-to-dev
> +        * transfer scenario.
> +        */
> +       struct rte_dma_slave_port_parameters port;
> +       /** Used to describes the dst dev parameters in the dev-to-dev
> +        * transfer scenario.
> +        */
> +       struct rte_dma_slave_port_parameters peer_port;
> +       uint64_t reserved[2]; /**< Reserved for future fields */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Allocate and set up a virtual DMA channel.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param conf
> + *   The virtual DMA channel configuration structure encapsulated into
> + *   rte_dmadev_vchan_conf object.
> + *
> + * @return
> + *   - >=0: Allocate success, it is the virtual DMA channel id. This value must
> + *          be less than the field 'max_vchans' of struct rte_dmadev_conf
> +           which configured by rte_dmadev_configure().
> + *   - <0: Error code returned by the driver virtual channel setup function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_vchan_setup(uint16_t dev_id,
> +                      const struct rte_dmadev_vchan_conf *conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a virtual DMA channel.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel which return by vchan setup.
> + *
> + * @return
> + *   - =0: Successfully release the virtual DMA channel.
> + *   - <0: Error code returned by the driver virtual channel release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);

We are not making release a public API in other device classes. See the ethdev spec.


> +
> +/**
> + * rte_dmadev_stats - running statistics.
> + */
> +struct rte_dmadev_stats {
> +       /** Count of operations which were successfully enqueued */
> +       uint64_t enqueued_count;
> +       /** Count of operations which were submitted to hardware */
> +       uint64_t submitted_count;
> +       /** Count of operations which failed to complete */
> +       uint64_t completed_fail_count;
> +       /** Count of operations which successfully complete */
> +       uint64_t completed_count;
> +       uint64_t reserved[4]; /**< Reserved for future fields */
> +};

Please add the capability for each counter in info structure as one
device may support all
the counters.

> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve basic statistics of a or all virtual DMA channel(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel, -1 means all channels.
> + * @param[out] stats
> + *   The basic statistics structure encapsulated into rte_dmadev_stats
> + *   object.
> + *
> + * @return
> + *   - =0: Successfully retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vchan,
> +                    struct rte_dmadev_stats *stats);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset basic statistics of a or all virtual DMA channel(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel, -1 means all channels.
> + *
> + * @return
> + *   - =0: Successfully reset stats.
> + *   - <0: Failure to reset stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vchan);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Dump DMA device info.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param f
> + *   The file to write the output to.
> + *
> + * @return
> + *   0 on success. Non-zero otherwise.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_dump(uint16_t dev_id, FILE *f);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger the dmadev self test.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - 0: Selftest successful.
> + *   - -ENOTSUP if the device doesn't support selftest
> + *   - other values < 0 on failure.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_selftest(uint16_t dev_id);
> +
> +#include "rte_dmadev_core.h"
> +
> +/**
> + *  DMA flags to augment operation preparation.
> + *  Used as the 'flags' parameter of rte_dmadev_copy/copy_sg/fill/fill_sg.
> + */
> +#define RTE_DMA_FLAG_FENCE     (1ull << 0)

RTE_DMA_OP_FLAG_FENCE

Please also add RTE_DMA_OP_FLAG_SUBMIT, as we discussed in another thread.
We can support submit based on the flag and the _submit() version based on
application preference.

> +/**< DMA fence flag
> + * It means the operation with this flag must be processed only after all
> + * previous operations are completed.
> + *
> + * @see rte_dmadev_copy()
> + * @see rte_dmadev_copy_sg()
> + * @see rte_dmadev_fill()
> + * @see rte_dmadev_fill_sg()
> + */
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a copy operation onto the virtual DMA channel.
> + *
> + * This queues up a copy operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param src
> + *   The address of the source buffer.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the data to be copied.
> + * @param flags
> + *   An flags for this operation.

See RTE_DMA_OP_FLAG_*

> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_copy(uint16_t dev_id, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
> +               uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->copy, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->copy)(dev, vchan, src, dst, length, flags);
> +}
> +
> +/**

> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list copy operation onto the virtual DMA channel.
> + *
> + * This queues up a scatter list copy operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vchan, const struct rte_dma_sg *sg,
> +                  uint32_t sg_len, uint64_t flags)


As requested earlier, I prefer to have

rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vchan, const struct
rte_dma_sg *src, uint32_t nb_src, const struct rte_dma_sg *dst,
uint32_t nb_dst, uint64_t flags)


> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(sg, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->copy_sg, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->copy_sg)(dev, vchan, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the virtual DMA channel.
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> +               rte_iova_t dst, uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);

Instead of every driver setting a NOP function, if the CAPA is not set,
the common code can set a NOP function for this that returns a <0 value.

> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list fill operation onto the virtual DMA channel.
> + *
> + * This queues up a scatter list fill operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> +                  const struct rte_dma_sg *sg, uint32_t sg_len,
> +                  uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(sg, -ENOTSUP);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->fill_sg)(dev, vchan, pattern, sg, sg_len, flags);

In order to save 8B in rte_dmadev, this API can be removed, as it looks like none
of the drivers support fill in sg mode.

> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations.
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + *
> + * @return
> + *   - =0: Successfully trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_submit(uint16_t dev_id, uint16_t vchan)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->submit, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->submit)(dev, vchan);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successfully completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] last_idx
> + *   The last completed operation's index.
> + *   If not required, NULL can be passed in.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *   If not required, NULL can be passed in.
> + *
> + * @return
> + *   The number of operations that successfully completed.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
> +                    uint16_t *last_idx, bool *has_error)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       uint16_t idx;
> +       bool err;
> +
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->completed, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +       if (nb_cpls == 0) {
> +               RTE_DMADEV_LOG(ERR, "Invalid nb_cpls\n");
> +               return -EINVAL;
> +       }
> +#endif
> +
> +       /* Ensure the pointer values are non-null to simplify drivers.
> +        * In most cases these should be compile time evaluated, since this is
> +        * an inline function.
> +        * - If NULL is explicitly passed as parameter, then compiler knows the
> +        *   value is NULL
> +        * - If address of local variable is passed as parameter, then compiler
> +        *   can know it's non-NULL.
> +        */
> +       if (last_idx == NULL)
> +               last_idx = &idx;
> +       if (has_error == NULL)
> +               has_error = &err;
> +
> +       *has_error = false;
> +       return (*dev->completed)(dev, vchan, nb_cpls, last_idx, has_error);
> +}
> +
> +/**
> + * DMA transfer status code defines
> + */
> +enum rte_dma_status_code {
> +       /** The operation completed successfully */
> +       RTE_DMA_STATUS_SUCCESSFUL = 0,
> +       /** The operation failed to complete due active drop
> +        * This is mainly used when processing dev_stop, allow outstanding
> +        * requests to be completed as much as possible.
> +        */
> +       RTE_DMA_STATUS_ACTIVE_DROP,
> +       /** The operation failed to complete due invalid source address */
> +       RTE_DMA_STATUS_INVALID_SRC_ADDR,
> +       /** The operation failed to complete due invalid destination address */
> +       RTE_DMA_STATUS_INVALID_DST_ADDR,
> +       /** The operation failed to complete due invalid length */
> +       RTE_DMA_STATUS_INVALID_LENGTH,
> +       /** The operation failed to complete due invalid opcode
> +        * The DMA descriptor could have multiple format, which are
> +        * distinguished by the opcode field.
> +        */
> +       RTE_DMA_STATUS_INVALID_OPCODE,
> +       /** The operation failed to complete due bus err */
> +       RTE_DMA_STATUS_BUS_ERROR,
> +       /** The operation failed to complete due data poison */
> +       RTE_DMA_STATUS_DATA_POISION,
> +       /** The operation failed to complete due descriptor read error */
> +       RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR,
> +       /** The operation failed to complete due device link error
> +        * Used to indicates that the link error in the mem-to-dev/dev-to-mem/
> +        * dev-to-dev transfer scenario.
> +        */
> +       RTE_DMA_STATUS_DEV_LINK_ERROR,
> +       /** Driver specific status code offset
> +        * Start status code for the driver to define its own error code.


RTE_DMA_STATUS_UNKNOWN for the ones which are not added in public API spec.


> +        */
> +       RTE_DMA_STATUS_DRV_SPECIFIC_OFFSET = 0x10000,
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param nb_status
> + *   Indicates the size of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + *   Some standard error code are described in 'enum rte_dma_status_code'
> + *   @see rte_dma_status_code
> + * @param[out] last_idx
> + *   The last failed completed operation's index.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
> +                          const uint16_t nb_status, uint32_t *status,

uint32_t -> enum rte_dma_status_code


> +                          uint16_t *last_idx)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(status, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(last_idx, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->completed_fails, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +       if (nb_status == 0) {
> +               RTE_DMADEV_LOG(ERR, "Invalid nb_status\n");
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->completed_fails)(dev, vchan, nb_status, status, last_idx);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_H_ */
> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..410faf0
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,159 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + * Copyright(c) 2021 Intel Corporation.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types, that are used by the DMA devices
> + * in order to expose their ops to the class.
> + *
> + * Applications should not use these API directly.
> + *
> + */
> +
> +struct rte_dmadev;
> +
> +/** @internal Used to get device information of a device. */
> +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +                                struct rte_dmadev_info *dev_info);

Please change to rte_dmadev_info_get_t to avoid conflict due to namespace issue
as this header is exported.

> +
> +/** @internal Used to configure a device. */
> +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> +                                 const struct rte_dmadev_conf *dev_conf);
> +
> +/** @internal Used to start a configured device. */
> +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to stop a configured device. */
> +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to close a configured device. */
> +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to reset a configured device. */
> +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to allocate and set up a virtual DMA channel. */
> +typedef int (*dmadev_vchan_setup_t)(struct rte_dmadev *dev,
> +                                   const struct rte_dmadev_vchan_conf *conf);
> +
> +/** @internal Used to release a virtual DMA channel. */
> +typedef int (*dmadev_vchan_release_t)(struct rte_dmadev *dev, uint16_t vchan);
> +
> +/** @internal Used to retrieve basic statistics. */
> +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vchan,
> +                                 struct rte_dmadev_stats *stats);
> +
> +/** @internal Used to reset basic statistics. */
> +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vchan);
> +
> +/** @internal Used to dump internal information. */
> +typedef int (*dmadev_dump_t)(struct rte_dmadev *dev, FILE *f);
> +
> +/** @internal Used to start dmadev selftest. */
> +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> +
> +/** @internal Used to enqueue a copy operation. */
> +typedef int (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vchan,
> +                            rte_iova_t src, rte_iova_t dst,
> +                            uint32_t length, uint64_t flags);
> +
> +/** @internal Used to enqueue a scatter list copy operation. */
> +typedef int (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
> +                               const struct rte_dma_sg *sg,
> +                               uint32_t sg_len, uint64_t flags);
> +
> +/** @internal Used to enqueue a fill operation. */
> +typedef int (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vchan,
> +                            uint64_t pattern, rte_iova_t dst,
> +                            uint32_t length, uint64_t flags);
> +
> +/** @internal Used to enqueue a scatter list fill operation. */
> +typedef int (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
> +                       uint64_t pattern, const struct rte_dma_sg *sg,
> +                       uint32_t sg_len, uint64_t flags);
> +
> +/** @internal Used to trigger hardware to begin working. */
> +typedef int (*dmadev_submit_t)(struct rte_dmadev *dev, uint16_t vchan);
> +
> +/** @internal Used to return number of successful completed operations. */
> +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vchan,
> +                                      const uint16_t nb_cpls,
> +                                      uint16_t *last_idx, bool *has_error);
> +
> +/** @internal Used to return number of failed completed operations. */
> +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> +                       uint16_t vchan, const uint16_t nb_status,
> +                       uint32_t *status, uint16_t *last_idx);
> +
> +/**
> + * DMA device operations function pointer table
> + */
> +struct rte_dmadev_ops {
> +       dmadev_info_get_t dev_info_get;
> +       dmadev_configure_t dev_configure;
> +       dmadev_start_t dev_start;
> +       dmadev_stop_t dev_stop;
> +       dmadev_close_t dev_close;
> +       dmadev_reset_t dev_reset;
> +       dmadev_vchan_setup_t vchan_setup;
> +       dmadev_vchan_release_t vchan_release;
> +       dmadev_stats_get_t stats_get;
> +       dmadev_stats_reset_t stats_reset;
> +       dmadev_dump_t dev_dump;
> +       dmadev_selftest_t dev_selftest;
> +};
> +
> +/**
> + * @internal
> + * The data part, with no function pointers, associated with each DMA device.
> + *
> + * This structure is safe to place in shared memory to be common among different
> + * processes in a multi-process configuration.
> + */
> +struct rte_dmadev_data {
> +       uint16_t dev_id; /**< Device [external] identifier. */
> +       char dev_name[RTE_DMADEV_NAME_MAX_LEN]; /**< Unique identifier name */
> +       void *dev_private; /**< PMD-specific private data. */
> +       struct rte_dmadev_conf dev_conf; /**< DMA device configuration. */
> +       uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
> +       uint64_t reserved[4]; /**< Reserved for future fields */
> +} __rte_cache_aligned;
> +
> +/**
> + * @internal
> + * The generic data structure associated with each DMA device.
> + *
> + * The dataplane APIs are located at the beginning of the structure, along
> + * with the pointer to where all the data elements for the particular device
> + * are stored in shared memory. This split scheme allows the function pointer
> + * and driver data to be per-process, while the actual configuration data for
> + * the device is shared.
> + */
> +struct rte_dmadev {
> +       dmadev_copy_t copy;
> +       dmadev_copy_sg_t copy_sg;
> +       dmadev_fill_t fill;
> +       dmadev_fill_sg_t fill_sg;
> +       dmadev_submit_t submit;
> +       dmadev_completed_t completed;

We could add a reserved field here for any future fastpath additions.

> +       dmadev_completed_fails_t completed_fails;


> +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD. */
> +       /** Flag indicating the device is attached: ATTACHED(1)/DETACHED(0). */
> +       uint8_t attached : 1;
> +       /** Device info which supplied during device initialization. */
> +       struct rte_device *device;
> +       struct rte_dmadev_data *data; /**< Pointer to device data. */
> +       uint64_t reserved[4]; /**< Reserved for future fields */
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
> +#endif /* _RTE_DMADEV_CORE_H_ */
> diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
> new file mode 100644
> index 0000000..45141f9
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_pmd.h
> @@ -0,0 +1,72 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_PMD_H_
> +#define _RTE_DMADEV_PMD_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device PMD APIs
> + *
> + * Driver facing APIs for a DMA device. These are not to be called directly by
> + * any application.
> + */
> +
> +#include "rte_dmadev.h"
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * @internal
> + * Allocates a new dmadev slot for an DMA device and returns the pointer
> + * to that slot for the driver to use.
> + *
> + * @param name
> + *   DMA device name.
> + *
> + * @return
> + *   A pointer to the DMA device slot case of success,
> + *   NULL otherwise.
> + */
> +__rte_internal
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name);
> +
> +/**
> + * @internal
> + * Release the specified dmadev.
> + *
> + * @param dev
> + *   Device to be released.
> + *
> + * @return
> + *   - 0 on success, negative on error
> + */
> +__rte_internal
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> +
> +/**
> + * @internal
> + * Return the DMA device based on the device name.
> + *
> + * @param name
> + *   DMA device name.
> + *
> + * @return
> + *   A pointer to the DMA device slot case of success,
> + *   NULL otherwise.
> + */
> +__rte_internal
> +struct rte_dmadev *
> +rte_dmadev_get_device_by_name(const char *name);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_PMD_H_ */
> diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
> new file mode 100644
> index 0000000..0f099e7
> --- /dev/null
> +++ b/lib/dmadev/version.map
> @@ -0,0 +1,40 @@
> +EXPERIMENTAL {
> +       global:
> +
> +       rte_dmadev_count;
> +       rte_dmadev_info_get;
> +       rte_dmadev_configure;
> +       rte_dmadev_start;
> +       rte_dmadev_stop;
> +       rte_dmadev_close;
> +       rte_dmadev_reset;
> +       rte_dmadev_vchan_setup;
> +       rte_dmadev_vchan_release;
> +       rte_dmadev_stats_get;
> +       rte_dmadev_stats_reset;
> +       rte_dmadev_dump;
> +       rte_dmadev_selftest;
> +       rte_dmadev_copy;
> +       rte_dmadev_copy_sg;
> +       rte_dmadev_fill;
> +       rte_dmadev_fill_sg;
> +       rte_dmadev_submit;
> +       rte_dmadev_completed;
> +       rte_dmadev_completed_fails;
> +
> +       local: *;
> +};
> +
> +INTERNAL {
> +        global:
> +
> +       rte_dmadevices;
> +       rte_dmadev_pmd_allocate;
> +       rte_dmadev_pmd_release;
> +       rte_dmadev_get_device_by_name;
> +
> +       local:
> +
> +       rte_dmadev_is_valid_dev;
> +};
> +
> diff --git a/lib/meson.build b/lib/meson.build
> index 1673ca4..68d239f 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -60,6 +60,7 @@ libraries = [
>          'bpf',
>          'graph',
>          'node',
> +        'dmadev',
>  ]
>
>  if is_windows
> --
> 2.8.1
>

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-11  7:14                       ` Jerin Jacob
@ 2021-07-12  7:01                         ` Morten Brørup
  2021-07-12  7:59                           ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Morten Brørup @ 2021-07-12  7:01 UTC (permalink / raw)
  To: Jerin Jacob, Bruce Richardson
  Cc: fengchengwen, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Nipun Gupta, Hemant Agrawal, Maxime Coquelin,
	Honnappa Nagarahalli, David Marchand, Satananda Burla,
	Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jerin Jacob

> Probably we can remove rte_dmadev_fill_sg() variant and keep sg only
> for copy to save 8B.

Perhaps the scatter/gather functions can be on a separate cache line, following the cache line with the simple functions?
Of course, this is only beneficial if the SG functions are not used with simple functions.
This means that we reserve space for 8 simple functions and 8 SG functions.

And if one or two functions are used with both simple and SG functions, it/they can be present in both cache lines. (This is somewhat dirty, but would be a typical implementation for a DPDK flow data structure.)

-Morten

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
  2021-07-11  9:42   ` fengchengwen
  2021-07-11 14:25   ` Jerin Jacob
@ 2021-07-12  7:15   ` Morten Brørup
  2021-07-12  9:59   ` Jerin Jacob
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 339+ messages in thread
From: Morten Brørup @ 2021-07-12  7:15 UTC (permalink / raw)
  To: Chengwen Feng, thomas, ferruh.yigit, bruce.richardson, jerinj,
	jerinjacobk
  Cc: dev, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev, liangma

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Chengwen Feng
> 
> This patch introduce 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

[snip]

> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..8779512
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,1030 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + * Copyright(c) 2021 Intel Corporation.
> + * Copyright(c) 2021 Marvell International Ltd.

If the group of DMA device hardware vendors don't oppose, I would appreciate it if my contribution to the definition of the DMA device API is recognized in this file:

+ * Copyright(c) 2021 SmartShare Systems.

> + */
> +

Also remember the other contributors in the other files, where appropriate.

-Morten

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11 13:34     ` Jerin Jacob
@ 2021-07-12  7:40       ` Morten Brørup
  0 siblings, 0 replies; 339+ messages in thread
From: Morten Brørup @ 2021-07-12  7:40 UTC (permalink / raw)
  To: Jerin Jacob, fengchengwen
  Cc: Thomas Monjalon, Ferruh Yigit, Richardson, Bruce, Jerin Jacob,
	dpdk-dev, Nipun Gupta, Hemant Agrawal, Maxime Coquelin,
	Honnappa Nagarahalli, David Marchand, Satananda Burla,
	Prasun Kapoor, Ananyev, Konstantin, liangma

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jerin Jacob
> 
> On Sun, Jul 11, 2021 at 3:12 PM fengchengwen <fengchengwen@huawei.com>
> wrote:
> >
> > Note:
> > 1) This patch hold dmadev <> vchan layer, I think vchan can be very
> >    conceptually separated from hw-channel.
> 
> I would like to keep it as channel instead of virtual channel as it is
> implementation-specific.
> No strong opinion on this? @Richardson, Bruce  @Morten Brørup  thoughts

Consider using "context" or "ctx" instead. It might help avoid it being mistaken for a DMA hardware feature.

No strong opinion, though. Whatever fits nicely in the documentation is a good choice.

A small anecdote: We once had to name an important parameter in our product, and it came down to the choice of two perfectly good names. We chose the shortest name, only because it fit better into the GUI.

> 
> > 2) I could not under struct dpi_dma_queue_ctx_s, so this patch I
> define
> >    the rte_dma_slave_port_parameters refer to Kunpeng DMA
> implemention.

One more thing:

The DPDK project is aiming to use inclusive code language. (Ref: https://www.dpdk.org/blog/2020/07/22/dpdk-governing-board-update-july-2020/)

Please replace the word "slave" with something politically correct.

-Morten


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH] dmadev: introduce DMA device library
  2021-07-12  7:01                         ` Morten Brørup
@ 2021-07-12  7:59                           ` Jerin Jacob
  0 siblings, 0 replies; 339+ messages in thread
From: Jerin Jacob @ 2021-07-12  7:59 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Bruce Richardson, fengchengwen, Thomas Monjalon, Ferruh Yigit,
	Jerin Jacob, dpdk-dev, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma,
	Radha Mohan Chintakuntla

On Mon, Jul 12, 2021 at 12:31 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jerin Jacob
>
> > Probably we can remove rte_dmadev_fill_sg() variant and keep sg only
> > for copy to save 8B.
>
> Perhaps the scatter/gather functions can be on a separate cache line, following the cache line with the simple functions?
> Of course, this is only beneficial if the SG functions are not used with simple functions.
> This means that we reserve space for 8 simple functions and 8 SG functions.

Currently, there are only two SG and normal ops functions. IMO, we
should keep all fastpath functions in the same CL irrespective of
whether they are simple or not.
My suggestion was more like: if there is no HW support for
rte_dmadev_fill_sg(), we can remove it from the spec.

>
> And if one or two functions are used with both simple and SG functions, it/they can be present in both cache lines. (This is somewhat dirty, but would be a typical implementation for a DPDK flow data structure.)
>
> -Morten

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
                     ` (2 preceding siblings ...)
  2021-07-12  7:15   ` Morten Brørup
@ 2021-07-12  9:59   ` Jerin Jacob
  2021-07-12 13:32     ` Bruce Richardson
  2021-07-12 12:05   ` Bruce Richardson
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-12  9:59 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: Thomas Monjalon, Ferruh Yigit, Richardson, Bruce, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Sun, Jul 11, 2021 at 2:59 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>
> This patch introduce 'dmadevice' which is a generic type of DMA
> device.
>
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
>
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the virtual DMA channel.
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.

fill job

> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> +               rte_iova_t dst, uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +#endif
> +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> +}
> +

> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successfully completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] last_idx
> + *   The last completed operation's index.
> + *   If not required, NULL can be passed in.

This means the driver will be tracking the last index.

Does that mean the application needs to call this API periodically to
consume the completion slots?
I.e., up to 64K (UINT16_MAX) outstanding jobs are possible. If the
application fails to call this API before more than 64K jobs are
outstanding, then subsequent enqueues will fail.

If so, we need to document this.

One of the concerns with keeping UINT16_MAX as the limit is that the
completion memory will never be in cache.
On the other hand, if we make this size programmable, it may introduce
complexity in the application.

Thoughts?


> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *   If not required, NULL can be passed in.
> + *
> + * @return
> + *   The number of operations that successfully completed.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
> +                    uint16_t *last_idx, bool *has_error)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       uint16_t idx;
> +       bool err;
> +
> +#ifdef RTE_DMADEV_DEBUG
> +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->completed, -ENOTSUP);
> +       if (vchan >= dev->data->dev_conf.max_vchans) {
> +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +               return -EINVAL;
> +       }
> +       if (nb_cpls == 0) {
> +               RTE_DMADEV_LOG(ERR, "Invalid nb_cpls\n");
> +               return -EINVAL;
> +       }
> +#endif
> +
> +       /* Ensure the pointer values are non-null to simplify drivers.
> +        * In most cases these should be compile time evaluated, since this is
> +        * an inline function.
> +        * - If NULL is explicitly passed as parameter, then compiler knows the
> +        *   value is NULL
> +        * - If address of local variable is passed as parameter, then compiler
> +        *   can know it's non-NULL.
> +        */
> +       if (last_idx == NULL)
> +               last_idx = &idx;
> +       if (has_error == NULL)
> +               has_error = &err;
> +
> +       *has_error = false;
> +       return (*dev->completed)(dev, vchan, nb_cpls, last_idx, has_error);
> +}

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
                     ` (3 preceding siblings ...)
  2021-07-12  9:59   ` Jerin Jacob
@ 2021-07-12 12:05   ` Bruce Richardson
  2021-07-12 15:50   ` Bruce Richardson
  2021-07-13 14:19   ` Ananyev, Konstantin
  6 siblings, 0 replies; 339+ messages in thread
From: Bruce Richardson @ 2021-07-12 12:05 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: thomas, ferruh.yigit, jerinj, jerinjacobk, dev, mb, nipun.gupta,
	hemant.agrawal, maxime.coquelin, honnappa.nagarahalli,
	david.marchand, sburla, pkapoor, konstantin.ananyev, liangma

On Sun, Jul 11, 2021 at 05:25:56PM +0800, Chengwen Feng wrote:
> This patch introduce 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

Thanks for this V2.
Some initial (mostly minor) comments on the meson.build and dmadev .c file
below. I'll review the headers in a separate email.

/Bruce

> ---
>  MAINTAINERS                  |    4 +
>  config/rte_config.h          |    3 +
>  lib/dmadev/meson.build       |    6 +
>  lib/dmadev/rte_dmadev.c      |  560 +++++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 1030 ++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h |  159 +++++++
>  lib/dmadev/rte_dmadev_pmd.h  |   72 +++
>  lib/dmadev/version.map       |   40 ++
>  lib/meson.build              |    1 +
>  9 files changed, 1875 insertions(+)
>  create mode 100644 lib/dmadev/meson.build
>  create mode 100644 lib/dmadev/rte_dmadev.c
>  create mode 100644 lib/dmadev/rte_dmadev.h
>  create mode 100644 lib/dmadev/rte_dmadev_core.h
>  create mode 100644 lib/dmadev/rte_dmadev_pmd.h
>  create mode 100644 lib/dmadev/version.map
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..0595239 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
>  
> +DMA device API - EXPERIMENTAL
> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
>  
>  Memory Pool Drivers
>  -------------------
> diff --git a/config/rte_config.h b/config/rte_config.h
> index 590903c..331a431 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -81,6 +81,9 @@
>  /* rawdev defines */
>  #define RTE_RAWDEV_MAX_DEVS 64
>  
> +/* dmadev defines */
> +#define RTE_DMADEV_MAX_DEVS 64
> +
>  /* ip_fragmentation defines */
>  #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
>  #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
> diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
> new file mode 100644
> index 0000000..c918dae
> --- /dev/null
> +++ b/lib/dmadev/meson.build
> @@ -0,0 +1,6 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2021 HiSilicon Limited.
> +
> +sources = files('rte_dmadev.c')
> +headers = files('rte_dmadev.h', 'rte_dmadev_pmd.h')

If rte_dmadev_pmd.h is only for PMD use, then it should be in
"driver_sdk_headers".

> +indirect_headers += files('rte_dmadev_core.h')
> diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
> new file mode 100644
> index 0000000..8a29abb
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,560 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + * Copyright(c) 2021 Intel Corporation.
> + */
> +
> +#include <ctype.h>
> +#include <inttypes.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_eal.h>
> +#include <rte_errno.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_string_fns.h>
> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +RTE_LOG_REGISTER(rte_dmadev_logtype, lib.dmadev, INFO);
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
> +
> +static const char *MZ_RTE_DMADEV_DATA = "rte_dmadev_data";
> +/* Shared memory between primary and secondary processes. */
> +static struct {
> +	struct rte_dmadev_data data[RTE_DMADEV_MAX_DEVS];
> +} *dmadev_shared_data;
> +
> +static int
> +dmadev_check_name(const char *name)
> +{
> +	size_t name_len;
> +
> +	if (name == NULL) {
> +		RTE_DMADEV_LOG(ERR, "Name can't be NULL\n");
> +		return -EINVAL;
> +	}
> +
> +	name_len = strnlen(name, RTE_DMADEV_NAME_MAX_LEN);
> +	if (name_len == 0) {
> +		RTE_DMADEV_LOG(ERR, "Zero length DMA device name\n");
> +		return -EINVAL;
> +	}
> +	if (name_len >= RTE_DMADEV_NAME_MAX_LEN) {
> +		RTE_DMADEV_LOG(ERR, "DMA device name is too long\n");
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static uint16_t
> +dmadev_find_free_dev(void)
> +{
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (dmadev_shared_data->data[i].dev_name[0] == '\0') {
> +			RTE_ASSERT(rte_dmadevices[i].attached == 0);
> +			return i;
> +		}
> +	}
> +
> +	return RTE_DMADEV_MAX_DEVS;
> +}
> +
> +static struct rte_dmadev*
> +dmadev_allocated(const char *name)

The name implies a boolean lookup for whether a particular dmadev has been
allocated or not. Since this returns a pointer, I think a name like
"dmadev_find" or "dmadev_get" would be more appropriate.

> +{
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if ((rte_dmadevices[i].attached == 1) &&
> +		    (!strcmp(name, rte_dmadevices[i].data->dev_name)))
> +			return &rte_dmadevices[i];
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +dmadev_shared_data_prepare(void)
> +{
> +	const struct rte_memzone *mz;
> +
> +	if (dmadev_shared_data == NULL) {
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +			/* Allocate port data and ownership shared memory. */
> +			mz = rte_memzone_reserve(MZ_RTE_DMADEV_DATA,
> +					 sizeof(*dmadev_shared_data),
> +					 rte_socket_id(), 0);
> +		} else {
> +			mz = rte_memzone_lookup(MZ_RTE_DMADEV_DATA);
> +		}
> +		if (mz == NULL)
> +			return -ENOMEM;
> +
> +		dmadev_shared_data = mz->addr;
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +			memset(dmadev_shared_data->data, 0,
> +			       sizeof(dmadev_shared_data->data));
> +	}
> +
> +	return 0;
> +}
> +
> +static struct rte_dmadev *
> +dmadev_allocate(const char *name)
> +{
> +	struct rte_dmadev *dev;
> +	uint16_t dev_id;
> +
> +	dev = dmadev_allocated(name);
> +	if (dev != NULL) {
> +		RTE_DMADEV_LOG(ERR, "DMA device already allocated\n");
> +		return NULL;
> +	}
> +
> +	dev_id = dmadev_find_free_dev();
> +	if (dev_id == RTE_DMADEV_MAX_DEVS) {
> +		RTE_DMADEV_LOG(ERR, "Reached maximum number of DMA devices\n");
> +		return NULL;
> +	}
> +
> +	if (dmadev_shared_data_prepare() != 0) {
> +		RTE_DMADEV_LOG(ERR, "Cannot allocate DMA shared data\n");
> +		return NULL;
> +	}
> +
> +	dev = &rte_dmadevices[dev_id];
> +	dev->data = &dmadev_shared_data->data[dev_id];
> +	dev->data->dev_id = dev_id;
> +	strlcpy(dev->data->dev_name, name, sizeof(dev->data->dev_name));
> +
> +	return dev;
> +}
> +
> +static struct rte_dmadev *
> +dmadev_attach_secondary(const char *name)
> +{
> +	struct rte_dmadev *dev;
> +	uint16_t i;
> +
> +	if (dmadev_shared_data_prepare() != 0) {
> +		RTE_DMADEV_LOG(ERR, "Cannot allocate DMA shared data\n");
> +		return NULL;
> +	}
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (!strcmp(dmadev_shared_data->data[i].dev_name, name))
> +			break;
> +	}
> +	if (i == RTE_DMADEV_MAX_DEVS) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %s is not driven by the primary process\n",
> +			name);
> +		return NULL;
> +	}
> +
> +	dev = &rte_dmadevices[i];
> +	dev->data = &dmadev_shared_data->data[i];
> +	RTE_ASSERT(dev->data->dev_id == i);
> +
> +	return dev;
> +}
> +
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name)
> +{
> +	struct rte_dmadev *dev;
> +
> +	if (dmadev_check_name(name) != 0)
> +		return NULL;
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		dev = dmadev_allocate(name);
> +	else
> +		dev = dmadev_attach_secondary(name);
> +
> +	if (dev == NULL)
> +		return NULL;
> +	dev->attached = 1;
> +
> +	return dev;
> +}
> +
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev)
> +{
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	if (dev->attached == 0)
> +		return 0;
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		rte_free(dev->data->dev_private);
> +		memset(dev->data, 0, sizeof(struct rte_dmadev_data));
> +	}
> +
> +	memset(dev, 0, sizeof(struct rte_dmadev));
> +	dev->attached = 0;
> +
> +	return 0;
> +}
> +
> +struct rte_dmadev *
> +rte_dmadev_get_device_by_name(const char *name)
> +{
> +	if (dmadev_check_name(name) != 0)
> +		return NULL;
> +	return dmadev_allocated(name);
> +}
> +
> +bool
> +rte_dmadev_is_valid_dev(uint16_t dev_id)
> +{
> +	if (dev_id >= RTE_DMADEV_MAX_DEVS ||
> +	    rte_dmadevices[dev_id].attached == 0)
> +		return false;
> +	return true;
> +}
> +
> +uint16_t
> +rte_dmadev_count(void)
> +{
> +	uint16_t count = 0;
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (rte_dmadevices[i].attached == 1)
> +			count++;
> +	}
> +
> +	return count;
> +}
> +
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
> +{
> +	struct rte_dmadev *dev;
> +	int ret;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
> +	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
> +	ret = (*dev->dev_ops->dev_info_get)(dev, dev_info);
> +	if (ret != 0)
> +		return ret;
> +
> +	dev_info->device = dev->device;
> +
> +	return 0;
> +}

Should the info_get function (and the related info structure) not include
the parameters passed into the configure function? That way, the user
can query a previously set up configuration. This should be done at the
dmadev level, rather than driver level, since I see the parameters are
already being saved in configure below.

Also, for ABI purposes, I would strongly suggest passing "sizeof(dev_info)"
to the driver in the "dev_info_get" call. When dev_info changes, we can
version rte_dmadev_info_get, but can't version the functions that it calls
in turn. When we add a new field to the struct, the driver functions that
choose to use that new field can check the size of the struct passed to
determine if it's safe to write that new field or not. [So long as field is
added at the end, driver functions not updated for the new field, need no
changes]

> +
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
> +{
> +	struct rte_dmadev_info info;
> +	struct rte_dmadev *dev;
> +	int ret;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	ret = rte_dmadev_info_get(dev_id, &info);
> +	if (ret != 0) {
> +		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (dev_conf->max_vchans > info.max_vchans) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u configure too many vchans\n", dev_id);

We allow up to 100 characters per line for DPDK code, so these don't need
to be wrapped so aggressively.

> +		return -EINVAL;
> +	}
> +	if (dev_conf->enable_mt_vchan &&
> +	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MT_VCHAN)) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u don't support MT-safe vchan\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (dev_conf->enable_mt_multi_vchan &&
> +	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN)) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u don't support MT-safe multiple vchan\n",
> +			dev_id);
> +		return -EINVAL;
> +	}
> +
> +	if (dev->data->dev_started != 0) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u must be stopped to allow configuration\n",
> +			dev_id);
> +		return -EBUSY;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
> +	ret = (*dev->dev_ops->dev_configure)(dev, dev_conf);
> +	if (ret == 0)
> +		memcpy(&dev->data->dev_conf, dev_conf, sizeof(*dev_conf));
> +
> +	return ret;
> +}
> +
> +int
> +rte_dmadev_start(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +	int ret;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (dev->data->dev_started != 0) {
> +		RTE_DMADEV_LOG(ERR, "Device %u already started\n", dev_id);

Maybe make this a warning rather than error.

> +		return 0;
> +	}
> +
> +	if (dev->dev_ops->dev_start == NULL)
> +		goto mark_started;
> +
> +	ret = (*dev->dev_ops->dev_start)(dev);
> +	if (ret != 0)
> +		return ret;
> +
> +mark_started:
> +	dev->data->dev_started = 1;
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_stop(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +	int ret;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (dev->data->dev_started == 0) {
> +		RTE_DMADEV_LOG(ERR, "Device %u already stopped\n", dev_id);

As above, suggest just warning rather than error.

> +		return 0;
> +	}
> +
> +	if (dev->dev_ops->dev_stop == NULL)
> +		goto mark_stopped;
> +
> +	ret = (*dev->dev_ops->dev_stop)(dev);
> +	if (ret != 0)
> +		return ret;
> +
> +mark_stopped:
> +	dev->data->dev_started = 0;
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_close(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	/* Device must be stopped before it can be closed */
> +	if (dev->data->dev_started == 1) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u must be stopped before closing\n", dev_id);
> +		return -EBUSY;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
> +	return (*dev->dev_ops->dev_close)(dev);
> +}
> +
> +int
> +rte_dmadev_reset(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
> +	/* Reset is not dependent on state of the device */
> +	return (*dev->dev_ops->dev_reset)(dev);
> +}

I would tend to agree with the query as to whether this is needed or not.
Can we perhaps remove for now, and add it back later if it does prove to be
needed. The less code to review and work with for the first version, the
better IMHO. :-)

> +
> +int
> +rte_dmadev_vchan_setup(uint16_t dev_id,
> +		       const struct rte_dmadev_vchan_conf *conf)
> +{
> +	struct rte_dmadev_info info;
> +	struct rte_dmadev *dev;
> +	int ret;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);

This is confusing, because you are actually doing a parameter check using a
macro named for checking a function. Better to explicitly just check conf
for null.

> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	ret = rte_dmadev_info_get(dev_id, &info);
> +	if (ret != 0) {
> +		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (conf->direction == 0 ||
> +	    conf->direction & ~RTE_DMA_TRANSFER_DIR_ALL) {
> +		RTE_DMADEV_LOG(ERR, "Device %u direction invalid!\n", dev_id);
> +		return -EINVAL;
> +	}

I wonder should we allow direction == 0, to be the same as all bits set,
or to be all supported bits set?

> +	if (conf->direction & RTE_DMA_MEM_TO_MEM &&
> +	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MEM_TO_MEM)) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u don't support mem2mem transfer\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (conf->direction & RTE_DMA_MEM_TO_DEV &&
> +	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MEM_TO_DEV)) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u don't support mem2dev transfer\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (conf->direction & RTE_DMA_DEV_TO_MEM &&
> +	    !(info.dev_capa & RTE_DMA_DEV_CAPA_DEV_TO_MEM)) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u don't support dev2mem transfer\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (conf->direction & RTE_DMA_DEV_TO_DEV &&
> +	    !(info.dev_capa & RTE_DMA_DEV_CAPA_DEV_TO_DEV)) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u don't support dev2dev transfer\n", dev_id);
> +		return -EINVAL;
> +	}
> +	if (conf->nb_desc < info.min_desc || conf->nb_desc > info.max_desc) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u number of descriptors invalid\n", dev_id);
> +		return -EINVAL;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_setup, -ENOTSUP);
> +	return (*dev->dev_ops->vchan_setup)(dev, conf);
> +}
> +
> +int
> +rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u vchan %u out of range\n", dev_id, vchan);
> +		return -EINVAL;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_release, -ENOTSUP);
> +	return (*dev->dev_ops->vchan_release)(dev, vchan);
> +}
> +
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vchan, struct rte_dmadev_stats *stats)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (vchan < -1 || vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u vchan %u out of range\n", dev_id, vchan);
> +		return -EINVAL;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
> +	return (*dev->dev_ops->stats_get)(dev, vchan, stats);
> +}
> +
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vchan)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (vchan < -1 || vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR,
> +			"Device %u vchan %u out of range\n", dev_id, vchan);
> +		return -EINVAL;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
> +	return (*dev->dev_ops->stats_reset)(dev, vchan);
> +}
> +
> +int
> +rte_dmadev_dump(uint16_t dev_id, FILE *f)
> +{
> +	struct rte_dmadev_info info;
> +	struct rte_dmadev *dev;
> +	int ret;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(f, -EINVAL);
> +
> +	ret = rte_dmadev_info_get(dev_id, &info);
> +	if (ret != 0) {
> +		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	fprintf(f, "DMA Dev %u, '%s' [%s]\n",
> +		dev->data->dev_id,
> +		dev->data->dev_name,
> +		dev->data->dev_started ? "started" : "stopped");
> +	fprintf(f, "  dev_capa: 0x%" PRIx64 "\n", info.dev_capa);
> +	fprintf(f, "  max_vchans_supported: %u\n", info.max_vchans);
> +	fprintf(f, "  max_vchans_configured: %u\n", info.nb_vchans);
> +	fprintf(f, "  MT-safe-configured: vchans: %u multi-vchans: %u\n",
> +		dev->data->dev_conf.enable_mt_vchan,
> +		dev->data->dev_conf.enable_mt_multi_vchan);
> +
> +	if (dev->dev_ops->dev_dump != NULL)
> +		return (*dev->dev_ops->dev_dump)(dev, f);
> +
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_selftest(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
> +	return (*dev->dev_ops->dev_selftest)(dev_id);
> +}

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-12  9:59   ` Jerin Jacob
@ 2021-07-12 13:32     ` Bruce Richardson
  2021-07-12 16:34       ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-12 13:32 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Mon, Jul 12, 2021 at 03:29:27PM +0530, Jerin Jacob wrote:
> On Sun, Jul 11, 2021 at 2:59 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> >
> > This patch introduce 'dmadevice' which is a generic type of DMA
> > device.
> >
> > The APIs of dmadev library exposes some generic operations which can
> > enable configuration and I/O with the DMA devices.
> >
> > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> > ---
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a fill operation onto the virtual DMA channel.
> > + *
> > + * This queues up a fill operation to be performed by hardware, but does not
> > + * trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param pattern
> > + *   The pattern to populate the destination buffer with.
> > + * @param dst
> > + *   The address of the destination buffer.
> > + * @param length
> > + *   The length of the destination buffer.
> > + * @param flags
> > + *   An flags for this operation.
> > + *
> > + * @return
> > + *   - 0..UINT16_MAX: index of enqueued copy job.
> 
> fill job
> 
> > + *   - <0: Error code returned by the driver copy function.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> > +               rte_iova_t dst, uint32_t length, uint64_t flags)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> > +       if (vchan >= dev->data->dev_conf.max_vchans) {
> > +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +               return -EINVAL;
> > +       }
> > +#endif
> > +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> > +}
> > +
> 
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that have been successfully completed.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param nb_cpls
> > + *   The maximum number of completed operations that can be processed.
> > + * @param[out] last_idx
> > + *   The last completed operation's index.
> > + *   If not required, NULL can be passed in.
> 
> This means the driver will be tracking the last index.
> 

Driver will be doing this anyway, no, since it needs to ensure we don't
wrap around?

> Does that mean the application needs to call this API periodically to
> consume the completion slots?
> I.e. up to 64K (UINT16_MAX) outstanding jobs are possible. If the
> application fails to call this before more than 64K jobs are
> outstanding, then the subsequent enqueue will fail.

Well, given that there will be a regular enqueue ring which will probably
be <= 64k in size, the completion call will need to be called frequently
anyway. I don't think we need to document this restriction as it's fairly
understood that you can't go beyond the size of the ring without cleanup.

> 
> If so, we need to document this.
> 
> One of the concerns of keeping UINT16_MAX as the limit is the
> completion memory will always not in cache.
> On the other hand, if we make this size programmable. it may introduce
> complexity in the application.
> 
> Thoughts?

The reason for using powers-of-2 sizes, e.g. 0 .. UINT16_MAX, is that the
ring can be any other power-of-2 size and we can index it just by masking.
In the sample app for dmadev, I expect the ring size used to be set the
same as the dmadev enqueue ring size, for simplicity.

In fact, I was thinking that in later versions we may be able to include
some macros to help make this whole process easier, of converting indexes
to arbitrary data structures. [The reason for using macros is so that the
actual rings we are indexing can be of user-defined type, rather than just
a ring of pointers].

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
                     ` (4 preceding siblings ...)
  2021-07-12 12:05   ` Bruce Richardson
@ 2021-07-12 15:50   ` Bruce Richardson
  2021-07-13  9:07     ` Jerin Jacob
  2021-07-13 14:19   ` Ananyev, Konstantin
  6 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-12 15:50 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: thomas, ferruh.yigit, jerinj, jerinjacobk, dev, mb, nipun.gupta,
	hemant.agrawal, maxime.coquelin, honnappa.nagarahalli,
	david.marchand, sburla, pkapoor, konstantin.ananyev, liangma

On Sun, Jul 11, 2021 at 05:25:56PM +0800, Chengwen Feng wrote:
> This patch introduce 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

Hi again,

some further review comments inline.

/Bruce

> ---
>  MAINTAINERS                  |    4 +
>  config/rte_config.h          |    3 +
>  lib/dmadev/meson.build       |    6 +
>  lib/dmadev/rte_dmadev.c      |  560 +++++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 1030 ++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h |  159 +++++++
>  lib/dmadev/rte_dmadev_pmd.h  |   72 +++
>  lib/dmadev/version.map       |   40 ++
>  lib/meson.build              |    1 +

<snip>

> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..8779512
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,1030 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + * Copyright(c) 2021 Intel Corporation.
> + * Copyright(c) 2021 Marvell International Ltd.
> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The DMA framework is built on the following model:
> + *
> + *     ---------------   ---------------       ---------------
> + *     | virtual DMA |   | virtual DMA |       | virtual DMA |
> + *     | channel     |   | channel     |       | channel     |
> + *     ---------------   ---------------       ---------------
> + *            |                |                      |
> + *            ------------------                      |
> + *                     |                              |
> + *               ------------                    ------------
> + *               |  dmadev  |                    |  dmadev  |
> + *               ------------                    ------------
> + *                     |                              |
> + *            ------------------               ------------------
> + *            | HW-DMA-channel |               | HW-DMA-channel |
> + *            ------------------               ------------------
> + *                     |                              |
> + *                     --------------------------------
> + *                                     |
> + *                           ---------------------
> + *                           | HW-DMA-Controller |
> + *                           ---------------------
> + *
> + * The DMA controller could have multilpe HW-DMA-channels (aka. HW-DMA-queues),
> + * each HW-DMA-channel should be represented by a dmadev.
> + *
> + * The dmadev could create multiple virtual DMA channel, each virtual DMA
> + * channel represents a different transfer context. The DMA operation request
> + * must be submitted to the virtual DMA channel.
> + * E.G. Application could create virtual DMA channel 0 for mem-to-mem transfer
> + *      scenario, and create virtual DMA channel 1 for mem-to-dev transfer
> + *      scenario.
> + *
> + * The dmadev are dynamically allocated by rte_dmadev_pmd_allocate() during the
> + * PCI/SoC device probing phase performed at EAL initialization time. And could
> + * be released by rte_dmadev_pmd_release() during the PCI/SoC device removing
> + * phase.
> + *
> + * We use 'uint16_t dev_id' as the device identifier of a dmadev, and
> + * 'uint16_t vchan' as the virtual DMA channel identifier in one dmadev.
> + *
> + * The functions exported by the dmadev API to setup a device designated by its
> + * device identifier must be invoked in the following order:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_vchan_setup()
> + *     - rte_dmadev_start()
> + *
> + * Then, the application can invoke dataplane APIs to process jobs.
> + *
> + * If the application wants to change the configuration (i.e. call
> + * rte_dmadev_configure()), it must call rte_dmadev_stop() first to stop the
> + * device and then do the reconfiguration before calling rte_dmadev_start()
> + * again. The dataplane APIs should not be invoked when the device is stopped.
> + *
> + * Finally, an application can close a dmadev by invoking the
> + * rte_dmadev_close() function.
> + *
> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virtual DMA channels which have
> + *      different contexts.
> + *      The first four APIs are used to submit the operation request to the
> + *      virtual DMA channel, if the submission is successful, a uint16_t
> + *      ring_idx is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.

Please rename this to "completed_status" to allow the return of information
other than just errors. As I suggested before, I think this should also be
usable as a slower version of "completed" even in the case where there are
no errors, in that it returns status information for each and every job
rather than just returning as soon as it hits a failure.

> + *
> + * About the ring_idx which rte_dmadev_copy/copy_sg/fill/fill_sg() returned,
> + * the rules are as follows:
> + *   a) ring_idx for each virtual DMA channel are independent.
> + *   b) For a virtual DMA channel, the ring_idx is monotonically incremented,
> + *      when it reach UINT16_MAX, it wraps back to zero.

Based on other feedback, I suggest we put in the detail here that: "This
index can be used by applications to track per-job metadata in an
application-defined circular ring, where the ring is a power-of-2 size, and
the indexes are masked appropriately."

> + *   c) The initial ring_idx of a virtual DMA channel is zero, after the device
> + *      is stopped or reset, the ring_idx needs to be reset to zero.
> + *   Example:
> + *      step-1: start one dmadev
> + *      step-2: enqueue a copy operation, the ring_idx return is 0
> + *      step-3: enqueue a copy operation again, the ring_idx return is 1
> + *      ...
> + *      step-101: stop the dmadev
> + *      step-102: start the dmadev
> + *      step-103: enqueue a copy operation, the cookie return is 0
> + *      ...
> + *      step-x+0: enqueue a fill operation, the ring_idx return is 65535
> + *      step-x+1: enqueue a copy operation, the ring_idx return is 0
> + *      ...
> + *
> + * By default, all the non-dataplane functions of the dmadev API exported by a
> + * PMD are lock-free functions which assume to not be invoked in parallel on
> + * different logical cores to work on the same target object.
> + *
> + * The dataplane functions of the dmadev API exported by a PMD can be MT-safe
> + * only when supported by the driver, generally, the driver will reports two
> + * capabilities:
> + *   a) Whether to support MT-safe for the submit/completion API of the same
> + *      virtual DMA channel.
> + *      E.G. one thread do submit operation, another thread do completion
> + *           operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VCHAN.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) Whether to support MT-safe for different virtual DMA channels.
> + *      E.G. one thread do operation on virtual DMA channel 0, another thread
> + *           do operation on virtual DMA channel 1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *
> + */

Just to check - do we have hardware that currently supports these
capabilities? For Intel HW, we will only support one virtual channel per
device without any MT-safety guarantees, so won't be setting either of
these flags. If any of these flags are unused in all planned drivers, we
should drop them from the spec until they prove necessary. Ideally,
everything in the dmadev definition should be testable, and features unused
by anyone obviously will be untested.

> +
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_errno.h>
> +#include <rte_memory.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#define RTE_DMADEV_NAME_MAX_LEN	RTE_DEV_NAME_MAX_LEN
> +
> +extern int rte_dmadev_logtype;
> +
> +#define RTE_DMADEV_LOG(level, ...) \
> +	rte_log(RTE_LOG_ ## level, rte_dmadev_logtype, "" __VA_ARGS__)
> +
> +/* Macros to check for valid port */
> +#define RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, retval) do { \
> +	if (!rte_dmadev_is_valid_dev(dev_id)) { \
> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
> +		return retval; \
> +	} \
> +} while (0)
> +
> +#define RTE_DMADEV_VALID_DEV_ID_OR_RET(dev_id) do { \
> +	if (!rte_dmadev_is_valid_dev(dev_id)) { \
> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
> +		return; \
> +	} \
> +} while (0)
> +

Can we avoid using these in the inline functions in this file, and move
them to the _pmd.h which is for internal PMD use only? It would mean we
don't get logging from the key dataplane functions, but I would hope the
return values would provide enough info.

Alternatively, can we keep the logtype definition and first macro and move
the other two to the _pmd.h file?

> +/**
> + * @internal
> + * Validate if the DMA device index is a valid attached DMA device.
> + *
> + * @param dev_id
> + *   DMA device index.
> + *
> + * @return
> + *   - If the device index is valid (true) or not (false).
> + */
> +__rte_internal
> +bool
> +rte_dmadev_is_valid_dev(uint16_t dev_id);
> +
> +/**
> + * rte_dma_sg - can hold scatter DMA operation request
> + */
> +struct rte_dma_sg {
> +	rte_iova_t src;
> +	rte_iova_t dst;
> +	uint32_t length;
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the total number of DMA devices that have been successfully
> + * initialised.
> + *
> + * @return
> + *   The total number of usable DMA devices.
> + */
> +__rte_experimental
> +uint16_t
> +rte_dmadev_count(void);
> +
> +/**
> + * The capabilities of a DMA device
> + */
> +#define RTE_DMA_DEV_CAPA_MEM_TO_MEM	(1ull << 0)
> +/**< DMA device support mem-to-mem transfer.

Do we need this? Can we assume that any device appearing as a dmadev can
do mem-to-mem copies, and drop the capability for mem-to-mem and the
capability for copying?

> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_MEM_TO_DEV	(1ull << 1)
> +/**< DMA device support slave mode & mem-to-dev transfer.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_DEV_TO_MEM	(1ull << 2)
> +/**< DMA device support slave mode & dev-to-mem transfer.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_DEV_TO_DEV	(1ull << 3)
> +/**< DMA device support slave mode & dev-to-dev transfer.
> + *

Just to confirm, are there devices currently planned for dmadev that
support only a subset of these flags? Thinking particularly of the
dev-2-mem and mem-2-dev ones here - do any of the devices we are
considering not support using device memory?
[Again, just want to ensure we aren't adding too much stuff that we don't
need yet]

> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_OPS_COPY	(1ull << 4)
> +/**< DMA device support copy ops.
> + *

Suggest dropping this and making copy support the minimum requirement for
any dmadev.

> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_OPS_FILL	(1ull << 5)
> +/**< DMA device support fill ops.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_OPS_SG		(1ull << 6)
> +/**< DMA device support scatter-list ops.
> + * If device support ops_copy and ops_sg, it means supporting copy_sg ops.
> + * If device support ops_fill and ops_sg, it means supporting fill_sg ops.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_FENCE		(1ull << 7)
> +/**< DMA device support fence.
> + * If device support fence, then application could set a fence flags when
> + * enqueue operation by rte_dma_copy/copy_sg/fill/fill_sg.
> + * If a operation has a fence flags, it means the operation must be processed
> + * only after all previous operations are completed.
> + *

Is this needed? As I understand it, the Marvell driver doesn't require
fences so providing one is a no-op. Therefore, this flag is probably
unnecessary.

> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_SVA		(1ull << 8)
> +/**< DMA device support SVA which could use VA as DMA address.
> + * If device support SVA then application could pass any VA address like memory
> + * from rte_malloc(), rte_memzone(), malloc, stack memory.
> + * If device don't support SVA, then application should pass IOVA address which
> + * from rte_malloc(), rte_memzone().
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_MT_VCHAN	(1ull << 9)
> +/**< DMA device support MT-safe of a virtual DMA channel.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */
> +#define RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN	(1ull << 10)
> +/**< DMA device support MT-safe of different virtual DMA channels.
> + *
> + * @see struct rte_dmadev_info::dev_capa
> + */

As with comments above - let's check that these will actually be used
before we add them.

> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +	struct rte_device *device; /**< Generic Device information */
> +	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +	/** Maximum number of virtual DMA channels supported */
> +	uint16_t max_vchans;
> +	/** Maximum allowed number of virtual DMA channel descriptors */
> +	uint16_t max_desc;
> +	/** Minimum allowed number of virtual DMA channel descriptors */
> +	uint16_t min_desc;
> +	uint16_t nb_vchans; /**< Number of virtual DMA channel configured */
> +};

Let's add rte_dmadev_conf struct into this to return the configuration
settings.

> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve the contextual information of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param[out] dev_info
> + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> + *   contextual information of the device.
> + *
> + * @return
> + *   - =0: Success, driver updates the contextual information of the DMA device
> + *   - <0: Error code returned by the driver info get function.
> + *
> + */
> +__rte_experimental
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
> +

Should have "const" on second param.

> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +	/** Maximum number of virtual DMA channel to use.
> +	 * This value cannot be greater than the field 'max_vchans' of struct
> +	 * rte_dmadev_info which get from rte_dmadev_info_get().
> +	 */
> +	uint16_t max_vchans;
> +	/** Enable bit for MT-safe of a virtual DMA channel.
> +	 * This bit can be enabled only when the device supports
> +	 * RTE_DMA_DEV_CAPA_MT_VCHAN.
> +	 * @see RTE_DMA_DEV_CAPA_MT_VCHAN
> +	 */
> +	uint8_t enable_mt_vchan : 1;
> +	/** Enable bit for MT-safe of different virtual DMA channels.
> +	 * This bit can be enabled only when the device supports
> +	 * RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
> +	 * @see RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN
> +	 */
> +	uint8_t enable_mt_multi_vchan : 1;
> +	uint64_t reserved[2]; /**< Reserved for future fields */
> +};

Drop the reserved fields. ABI versioning is a better way to deal with
adding new fields.

> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Configure a DMA device.
> + *
> + * This function must be invoked first before any other function in the
> + * API. This function can also be re-invoked when a device is in the
> + * stopped state.
> + *
> + * @param dev_id
> + *   The identifier of the device to configure.
> + * @param dev_conf
> + *   The DMA device configuration structure encapsulated into rte_dmadev_conf
> + *   object.
> + *
> + * @return
> + *   - =0: Success, device configured.
> + *   - <0: Error code returned by the driver configuration function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Start a DMA device.
> + *
> + * The device start step is the last one and consists of setting the DMA
> + * to start accepting jobs.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device started.
> + *   - <0: Error code returned by the driver start function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_start(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Stop a DMA device.
> + *
> + * The device can be restarted with a call to rte_dmadev_start()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device stopped.
> + *   - <0: Error code returned by the driver stop function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stop(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Close a DMA device.
> + *
> + * The device cannot be restarted after this call.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *  - =0: Successfully close device
> + *  - <0: Failure to close device
> + */
> +__rte_experimental
> +int
> +rte_dmadev_close(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset a DMA device.
> + *
> + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
> + * sense similar to hard or soft reset.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Successfully reset device.
> + *   - <0: Failure to reset device.
> + *   - (-ENOTSUP): If the device doesn't support this function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_reset(uint16_t dev_id);
> +
> +/**
> + * DMA transfer direction defines.
> + */
> +#define RTE_DMA_MEM_TO_MEM	(1ull << 0)
> +/**< DMA transfer direction - from memory to memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_MEM_TO_DEV	(1ull << 1)
> +/**< DMA transfer direction - slave mode & from memory to device.
> + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> + * request from ARM memory to x86 host memory.

For clarity, it would be good to specify in the scenario described which
memory is the "mem" and which is the "dev" (I assume SoC memory is "mem"
and x86 host memory is "dev"??)

> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_DEV_TO_MEM	(1ull << 2)
> +/**< DMA transfer direction - slave mode & from device to memory.
> + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> + * request from x86 host memory to ARM memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_DEV_TO_DEV	(1ull << 3)
> +/**< DMA transfer direction - slave mode & from device to device.
> + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> + * request from x86 host memory to another x86 host memory.
> + *
> + * @see struct rte_dmadev_vchan_conf::direction
> + */
> +#define RTE_DMA_TRANSFER_DIR_ALL	(RTE_DMA_MEM_TO_MEM | \
> +					 RTE_DMA_MEM_TO_DEV | \
> +					 RTE_DMA_DEV_TO_MEM | \
> +					 RTE_DMA_DEV_TO_DEV)
> +
> +/**
> + * enum rte_dma_slave_port_type - slave mode type defines
> + */
> +enum rte_dma_slave_port_type {
> +	/** The slave port is PCIE. */
> +	RTE_DMA_SLAVE_PORT_PCIE = 1,
> +};
> +

As previously mentioned, this needs to be updated to use other terms.
For some suggested alternatives see:
https://doc.dpdk.org/guides-21.05/contributing/coding_style.html#naming

> +/**
> + * A structure used to descript slave port parameters.
> + */
> +struct rte_dma_slave_port_parameters {
> +	enum rte_dma_slave_port_type port_type;
> +	union {
> +		/** For PCIE port */
> +		struct {
> +			/** The physical function number which to use */
> +			uint64_t pf_number : 6;
> +			/** Virtual function enable bit */
> +			uint64_t vf_enable : 1;
> +			/** The virtual function number which to use */
> +			uint64_t vf_number : 8;
> +			uint64_t pasid : 20;
> +			/** The attributes filed in TLP packet */
> +			uint64_t tlp_attr : 3;
> +		};
> +	};
> +};
> +
> +/**
> + * A structure used to configure a virtual DMA channel.
> + */
> +struct rte_dmadev_vchan_conf {
> +	uint8_t direction; /**< Set of supported transfer directions */
> +	/** Number of descriptor for the virtual DMA channel */
> +	uint16_t nb_desc;
> +	/** 1) Used to describes the dev parameter in the mem-to-dev/dev-to-mem
> +	 * transfer scenario.
> +	 * 2) Used to describes the src dev parameter in the dev-to-dev
> +	 * transfer scenario.
> +	 */
> +	struct rte_dma_slave_port_parameters port;
> +	/** Used to describes the dst dev parameters in the dev-to-dev
> +	 * transfer scenario.
> +	 */
> +	struct rte_dma_slave_port_parameters peer_port;
> +	uint64_t reserved[2]; /**< Reserved for future fields */
> +};

Let's drop the reserved fields and use ABI versioning if necessary in
future.

> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Allocate and set up a virtual DMA channel.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param conf
> + *   The virtual DMA channel configuration structure encapsulated into
> + *   rte_dmadev_vchan_conf object.
> + *
> + * @return
> + *   - >=0: Allocate success, it is the virtual DMA channel id. This value must
> + *          be less than the field 'max_vchans' of struct rte_dmadev_conf
> +	    which configured by rte_dmadev_configure().

nit: whitespace error here.

> + *   - <0: Error code returned by the driver virtual channel setup function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_vchan_setup(uint16_t dev_id,
> +		       const struct rte_dmadev_vchan_conf *conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a virtual DMA channel.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel which return by vchan setup.
> + *
> + * @return
> + *   - =0: Successfully release the virtual DMA channel.
> + *   - <0: Error code returned by the driver virtual channel release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);
> +
> +/**
> + * rte_dmadev_stats - running statistics.
> + */
> +struct rte_dmadev_stats {
> +	/** Count of operations which were successfully enqueued */
> +	uint64_t enqueued_count;
> +	/** Count of operations which were submitted to hardware */
> +	uint64_t submitted_count;
> +	/** Count of operations which failed to complete */
> +	uint64_t completed_fail_count;
> +	/** Count of operations which successfully complete */
> +	uint64_t completed_count;
> +	uint64_t reserved[4]; /**< Reserved for future fields */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve basic statistics of a or all virtual DMA channel(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel, -1 means all channels.
> + * @param[out] stats
> + *   The basic statistics structure encapsulated into rte_dmadev_stats
> + *   object.
> + *
> + * @return
> + *   - =0: Successfully retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vchan,

vchan as uint16_t rather than int, I think. This would apply to all
dataplane functions. There is no need for a signed vchan value.

> +		     struct rte_dmadev_stats *stats);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset basic statistics of a or all virtual DMA channel(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel, -1 means all channels.
> + *
> + * @return
> + *   - =0: Successfully reset stats.
> + *   - <0: Failure to reset stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vchan);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Dump DMA device info.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param f
> + *   The file to write the output to.
> + *
> + * @return
> + *   0 on success. Non-zero otherwise.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_dump(uint16_t dev_id, FILE *f);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger the dmadev self test.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - 0: Selftest successful.
> + *   - -ENOTSUP if the device doesn't support selftest
> + *   - other values < 0 on failure.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_selftest(uint16_t dev_id);

I don't think this needs to be in the public API, since it should only be
for the autotest app to use. Maybe move the prototype to the _pmd.h (since
we don't have a separate internal header), and then the autotest app can
pick it up from there.

> +
> +#include "rte_dmadev_core.h"
> +
> +/**
> + *  DMA flags to augment operation preparation.
> + *  Used as the 'flags' parameter of rte_dmadev_copy/copy_sg/fill/fill_sg.
> + */
> +#define RTE_DMA_FLAG_FENCE	(1ull << 0)
> +/**< DMA fence flag
> + * It means the operation with this flag must be processed only after all
> + * previous operations are completed.
> + *
> + * @see rte_dmadev_copy()
> + * @see rte_dmadev_copy_sg()
> + * @see rte_dmadev_fill()
> + * @see rte_dmadev_fill_sg()
> + */

As a general comment, I think all these multi-line comments should go
before the item they describe. Comments after should only be used in the
case where the comment fits on the rest of the line after a value.

We also should define the SUBMIT flag as suggested by Jerin, to allow apps
to automatically submit jobs after enqueue.

> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a copy operation onto the virtual DMA channel.
> + *
> + * This queues up a copy operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param src
> + *   The address of the source buffer.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the data to be copied.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_copy(uint16_t dev_id, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
> +		uint32_t length, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->copy, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +#endif
> +	return (*dev->copy)(dev, vchan, src, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list copy operation onto the virtual DMA channel.
> + *
> + * This queues up a scatter list copy operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vchan, const struct rte_dma_sg *sg,
> +		   uint32_t sg_len, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(sg, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->copy_sg, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +#endif
> +	return (*dev->copy_sg)(dev, vchan, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the virtual DMA channel.
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> +		rte_iova_t dst, uint32_t length, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +#endif
> +	return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list fill operation onto the virtual DMA channel.
> + *
> + * This queues up a scatter list fill operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An flags for this operation.
> + *
> + * @return
> + *   - 0..UINT16_MAX: index of enqueued copy job.
> + *   - <0: Error code returned by the driver copy function.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> +		   const struct rte_dma_sg *sg, uint32_t sg_len,
> +		   uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(sg, -ENOTSUP);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +#endif
> +	return (*dev->fill_sg)(dev, vchan, pattern, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations.
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + *
> + * @return
> + *   - =0: Successfully trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_submit(uint16_t dev_id, uint16_t vchan)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->submit, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +#endif
> +	return (*dev->submit)(dev, vchan);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successfully completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] last_idx
> + *   The last completed operation's index.
> + *   If not required, NULL can be passed in.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *   If not required, NULL can be passed in.
> + *
> + * @return
> + *   The number of operations that successfully completed.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
> +		     uint16_t *last_idx, bool *has_error)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	uint16_t idx;
> +	bool err;
> +
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->completed, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +	if (nb_cpls == 0) {
> +		RTE_DMADEV_LOG(ERR, "Invalid nb_cpls\n");
> +		return -EINVAL;
> +	}
> +#endif
> +
> +	/* Ensure the pointer values are non-null to simplify drivers.
> +	 * In most cases these should be compile time evaluated, since this is
> +	 * an inline function.
> +	 * - If NULL is explicitly passed as parameter, then compiler knows the
> +	 *   value is NULL
> +	 * - If address of local variable is passed as parameter, then compiler
> +	 *   can know it's non-NULL.
> +	 */
> +	if (last_idx == NULL)
> +		last_idx = &idx;
> +	if (has_error == NULL)
> +		has_error = &err;
> +
> +	*has_error = false;
> +	return (*dev->completed)(dev, vchan, nb_cpls, last_idx, has_error);
> +}
> +
> +/**
> + * DMA transfer status code defines
> + */
> +enum rte_dma_status_code {
> +	/** The operation completed successfully */
> +	RTE_DMA_STATUS_SUCCESSFUL = 0,
> +	/** The operation failed to complete due active drop
> +	 * This is mainly used when processing dev_stop, allow outstanding
> +	 * requests to be completed as much as possible.
> +	 */
> +	RTE_DMA_STATUS_ACTIVE_DROP,
> +	/** The operation failed to complete due invalid source address */
> +	RTE_DMA_STATUS_INVALID_SRC_ADDR,
> +	/** The operation failed to complete due invalid destination address */
> +	RTE_DMA_STATUS_INVALID_DST_ADDR,
> +	/** The operation failed to complete due invalid length */
> +	RTE_DMA_STATUS_INVALID_LENGTH,
> +	/** The operation failed to complete due invalid opcode
> +	 * The DMA descriptor could have multiple format, which are
> +	 * distinguished by the opcode field.
> +	 */
> +	RTE_DMA_STATUS_INVALID_OPCODE,
> +	/** The operation failed to complete due bus err */
> +	RTE_DMA_STATUS_BUS_ERROR,
> +	/** The operation failed to complete due data poison */
> +	RTE_DMA_STATUS_DATA_POISION,
> +	/** The operation failed to complete due descriptor read error */
> +	RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR,
> +	/** The operation failed to complete due device link error
> +	 * Used to indicates that the link error in the mem-to-dev/dev-to-mem/
> +	 * dev-to-dev transfer scenario.
> +	 */
> +	RTE_DMA_STATUS_DEV_LINK_ERROR,
> +	/** Driver specific status code offset
> +	 * Start status code for the driver to define its own error code.
> +	 */
> +	RTE_DMA_STATUS_DRV_SPECIFIC_OFFSET = 0x10000,
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vchan
> + *   The identifier of virtual DMA channel.
> + * @param nb_status
> + *   Indicates the size of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + *   Some standard error code are described in 'enum rte_dma_status_code'
> + *   @see rte_dma_status_code
> + * @param[out] last_idx
> + *   The last failed completed operation's index.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
> +			   const uint16_t nb_status, uint32_t *status,
> +			   uint16_t *last_idx)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +#ifdef RTE_DMADEV_DEBUG
> +	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(status, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(last_idx, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->completed_fails, -ENOTSUP);
> +	if (vchan >= dev->data->dev_conf.max_vchans) {
> +		RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> +		return -EINVAL;
> +	}
> +	if (nb_status == 0) {
> +		RTE_DMADEV_LOG(ERR, "Invalid nb_status\n");
> +		return -EINVAL;
> +	}
> +#endif
> +	return (*dev->completed_fails)(dev, vchan, nb_status, status, last_idx);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_H_ */
> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..410faf0
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,159 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + * Copyright(c) 2021 Intel Corporation.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types, that are used by the DMA devices
> + * in order to expose their ops to the class.
> + *
> + * Applications should not use these API directly.
> + *
> + */
> +
> +struct rte_dmadev;
> +
> +/** @internal Used to get device information of a device. */
> +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +				 struct rte_dmadev_info *dev_info);

First parameter can be "const"

> +/** @internal Used to configure a device. */
> +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> +				  const struct rte_dmadev_conf *dev_conf);
> +
> +/** @internal Used to start a configured device. */
> +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to stop a configured device. */
> +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to close a configured device. */
> +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to reset a configured device. */
> +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> +
> +/** @internal Used to allocate and set up a virtual DMA channel. */
> +typedef int (*dmadev_vchan_setup_t)(struct rte_dmadev *dev,
> +				    const struct rte_dmadev_vchan_conf *conf);
> +
> +/** @internal Used to release a virtual DMA channel. */
> +typedef int (*dmadev_vchan_release_t)(struct rte_dmadev *dev, uint16_t vchan);
> +
> +/** @internal Used to retrieve basic statistics. */
> +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vchan,
> +				  struct rte_dmadev_stats *stats);

First parameter can be "const"

> +
> +/** @internal Used to reset basic statistics. */
> +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vchan);
> +
> +/** @internal Used to dump internal information. */
> +typedef int (*dmadev_dump_t)(struct rte_dmadev *dev, FILE *f);
> +

First param "const"

> +/** @internal Used to start dmadev selftest. */
> +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> +

This looks like an outlier, taking a dev_id. It should take a dmadev parameter.
Most drivers should not need to implement this anyway, as the main unit
tests should be in "test_dmadev.c" in the autotest app.

> +/** @internal Used to enqueue a copy operation. */
> +typedef int (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vchan,
> +			     rte_iova_t src, rte_iova_t dst,
> +			     uint32_t length, uint64_t flags);
> +
> +/** @internal Used to enqueue a scatter list copy operation. */
> +typedef int (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
> +				const struct rte_dma_sg *sg,
> +				uint32_t sg_len, uint64_t flags);
> +
> +/** @internal Used to enqueue a fill operation. */
> +typedef int (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vchan,
> +			     uint64_t pattern, rte_iova_t dst,
> +			     uint32_t length, uint64_t flags);
> +
> +/** @internal Used to enqueue a scatter list fill operation. */
> +typedef int (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
> +			uint64_t pattern, const struct rte_dma_sg *sg,
> +			uint32_t sg_len, uint64_t flags);
> +
> +/** @internal Used to trigger hardware to begin working. */
> +typedef int (*dmadev_submit_t)(struct rte_dmadev *dev, uint16_t vchan);
> +
> +/** @internal Used to return number of successful completed operations. */
> +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vchan,
> +				       const uint16_t nb_cpls,
> +				       uint16_t *last_idx, bool *has_error);
> +
> +/** @internal Used to return number of failed completed operations. */
> +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> +			uint16_t vchan, const uint16_t nb_status,
> +			uint32_t *status, uint16_t *last_idx);
> +
> +/**
> + * DMA device operations function pointer table
> + */
> +struct rte_dmadev_ops {
> +	dmadev_info_get_t dev_info_get;
> +	dmadev_configure_t dev_configure;
> +	dmadev_start_t dev_start;
> +	dmadev_stop_t dev_stop;
> +	dmadev_close_t dev_close;
> +	dmadev_reset_t dev_reset;
> +	dmadev_vchan_setup_t vchan_setup;
> +	dmadev_vchan_release_t vchan_release;
> +	dmadev_stats_get_t stats_get;
> +	dmadev_stats_reset_t stats_reset;
> +	dmadev_dump_t dev_dump;
> +	dmadev_selftest_t dev_selftest;
> +};
> +
> +/**
> + * @internal
> + * The data part, with no function pointers, associated with each DMA device.
> + *
> + * This structure is safe to place in shared memory to be common among different
> + * processes in a multi-process configuration.
> + */
> +struct rte_dmadev_data {
> +	uint16_t dev_id; /**< Device [external] identifier. */
> +	char dev_name[RTE_DMADEV_NAME_MAX_LEN]; /**< Unique identifier name */
> +	void *dev_private; /**< PMD-specific private data. */
> +	struct rte_dmadev_conf dev_conf; /**< DMA device configuration. */
> +	uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
> +	uint64_t reserved[4]; /**< Reserved for future fields */
> +} __rte_cache_aligned;
> +

While I generally don't like having reserved space, this is one place where
it makes sense, so +1 for it here.

> +/**
> + * @internal
> + * The generic data structure associated with each DMA device.
> + *
> + * The dataplane APIs are located at the beginning of the structure, along
> + * with the pointer to where all the data elements for the particular device
> + * are stored in shared memory. This split scheme allows the function pointer
> + * and driver data to be per-process, while the actual configuration data for
> + * the device is shared.
> + */
> +struct rte_dmadev {
> +	dmadev_copy_t copy;
> +	dmadev_copy_sg_t copy_sg;
> +	dmadev_fill_t fill;
> +	dmadev_fill_sg_t fill_sg;
> +	dmadev_submit_t submit;
> +	dmadev_completed_t completed;
> +	dmadev_completed_fails_t completed_fails;
> +	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD. */
> +	/** Flag indicating the device is attached: ATTACHED(1)/DETACHED(0). */
> +	uint8_t attached : 1;

Since it's in the midst of a series of pointers, this 1-bit flag is
actually using 8 bytes of space. Is it needed? Can we use dev_ops != NULL
or data != NULL instead to indicate that this is a valid entry?

> +	/** Device info which supplied during device initialization. */
> +	struct rte_device *device;
> +	struct rte_dmadev_data *data; /**< Pointer to device data. */

If we are to try and minimise cacheline access, we should put this data
pointer - or even better a copy of the data->dev_private pointer - at the
top of the structure, on the same cacheline as the datapath operations. For
the dataplane, I can't see any elements of data, except the private pointer,
being accessed, so we would probably get the most benefit from having a copy
put there on init of the dmadev struct.

> +	uint64_t reserved[4]; /**< Reserved for future fields */
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
> +#endif /* _RTE_DMADEV_CORE_H_ */
> diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
> new file mode 100644
> index 0000000..45141f9
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_pmd.h
> @@ -0,0 +1,72 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_PMD_H_
> +#define _RTE_DMADEV_PMD_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device PMD APIs
> + *
> + * Driver facing APIs for a DMA device. These are not to be called directly by
> + * any application.
> + */
> +
> +#include "rte_dmadev.h"
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * @internal
> + * Allocates a new dmadev slot for an DMA device and returns the pointer
> + * to that slot for the driver to use.
> + *
> + * @param name
> + *   DMA device name.
> + *
> + * @return
> + *   A pointer to the DMA device slot case of success,
> + *   NULL otherwise.
> + */
> +__rte_internal
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name);
> +
> +/**
> + * @internal
> + * Release the specified dmadev.
> + *
> + * @param dev
> + *   Device to be released.
> + *
> + * @return
> + *   - 0 on success, negative on error
> + */
> +__rte_internal
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> +
> +/**
> + * @internal
> + * Return the DMA device based on the device name.
> + *
> + * @param name
> + *   DMA device name.
> + *
> + * @return
> + *   A pointer to the DMA device slot case of success,
> + *   NULL otherwise.
> + */
> +__rte_internal
> +struct rte_dmadev *
> +rte_dmadev_get_device_by_name(const char *name);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_PMD_H_ */
> diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
> new file mode 100644
> index 0000000..0f099e7
> --- /dev/null
> +++ b/lib/dmadev/version.map
> @@ -0,0 +1,40 @@
> +EXPERIMENTAL {
> +	global:
> +
> +	rte_dmadev_count;
> +	rte_dmadev_info_get;
> +	rte_dmadev_configure;
> +	rte_dmadev_start;
> +	rte_dmadev_stop;
> +	rte_dmadev_close;
> +	rte_dmadev_reset;
> +	rte_dmadev_vchan_setup;
> +	rte_dmadev_vchan_release;
> +	rte_dmadev_stats_get;
> +	rte_dmadev_stats_reset;
> +	rte_dmadev_dump;
> +	rte_dmadev_selftest;
> +	rte_dmadev_copy;
> +	rte_dmadev_copy_sg;
> +	rte_dmadev_fill;
> +	rte_dmadev_fill_sg;
> +	rte_dmadev_submit;
> +	rte_dmadev_completed;
> +	rte_dmadev_completed_fails;
> +
> +	local: *;
> +};

The elements in the version.map file blocks should be sorted alphabetically.

> +
> +INTERNAL {
> +        global:
> +
> +	rte_dmadevices;
> +	rte_dmadev_pmd_allocate;
> +	rte_dmadev_pmd_release;
> +	rte_dmadev_get_device_by_name;
> +
> +	local:
> +
> +	rte_dmadev_is_valid_dev;
> +};
> +
> diff --git a/lib/meson.build b/lib/meson.build
> index 1673ca4..68d239f 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -60,6 +60,7 @@ libraries = [
>          'bpf',
>          'graph',
>          'node',
> +        'dmadev',
>  ]
>  
>  if is_windows
> -- 
> 2.8.1
> 

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-12 13:32     ` Bruce Richardson
@ 2021-07-12 16:34       ` Jerin Jacob
  2021-07-12 17:00         ` Bruce Richardson
  0 siblings, 1 reply; 339+ messages in thread
From: Jerin Jacob @ 2021-07-12 16:34 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Mon, Jul 12, 2021 at 7:02 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Mon, Jul 12, 2021 at 03:29:27PM +0530, Jerin Jacob wrote:
> > On Sun, Jul 11, 2021 at 2:59 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > >
> > > This patch introduce 'dmadevice' which is a generic type of DMA
> > > device.
> > >
> > > The APIs of dmadev library exposes some generic operations which can
> > > enable configuration and I/O with the DMA devices.
> > >
> > > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> > > ---
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Enqueue a fill operation onto the virtual DMA channel.
> > > + *
> > > + * This queues up a fill operation to be performed by hardware, but does not
> > > + * trigger hardware to begin that operation.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vchan
> > > + *   The identifier of virtual DMA channel.
> > > + * @param pattern
> > > + *   The pattern to populate the destination buffer with.
> > > + * @param dst
> > > + *   The address of the destination buffer.
> > > + * @param length
> > > + *   The length of the destination buffer.
> > > + * @param flags
> > > + *   An flags for this operation.
> > > + *
> > > + * @return
> > > + *   - 0..UINT16_MAX: index of enqueued copy job.
> >
> > fill job
> >
> > > + *   - <0: Error code returned by the driver copy function.
> > > + */
> > > +__rte_experimental
> > > +static inline int
> > > +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> > > +               rte_iova_t dst, uint32_t length, uint64_t flags)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +#ifdef RTE_DMADEV_DEBUG
> > > +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > > +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> > > +       if (vchan >= dev->data->dev_conf.max_vchans) {
> > > +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > > +               return -EINVAL;
> > > +       }
> > > +#endif
> > > +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> > > +}
> > > +
> >
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Returns the number of operations that have been successfully completed.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vchan
> > > + *   The identifier of virtual DMA channel.
> > > + * @param nb_cpls
> > > + *   The maximum number of completed operations that can be processed.
> > > + * @param[out] last_idx
> > > + *   The last completed operation's index.
> > > + *   If not required, NULL can be passed in.
> >
> > This means the driver will be tracking the last index.
> >
>
> Driver will be doing this anyway, no, since it needs to ensure we don't

Yes.

> wrap around?


>
> > Is that mean, the application needs to call this API periodically to
> > consume the completion slot.
> > I.e up to 64K (UINT16_MAX)  outstanding jobs are possible. If the
> > application fails to call this
> > >64K outstand job then the subsequence enqueue will fail.
>
> Well, given that there will be a regular enqueue ring which will probably
> be <= 64k in size, the completion call will need to be called frequently
> anyway. I don't think we need to document this restriction as it's fairly
> understood that you can't go beyond the size of the ring without cleanup.


See below.

>
> >
> > If so, we need to document this.
> >
> > One of the concerns of keeping UINT16_MAX as the limit is the
> > completion memory will always not in cache.
> > On the other hand, if we make this size programmable. it may introduce
> > complexity in the application.
> >
> > Thoughts?
>
> The reason for using powers-of-2 sizes, e.g. 0 .. UINT16_MAX, is that the
> ring can be any other power-of-2 size and we can index it just by masking.
> In the sample app for dmadev, I expect the ring size used to be set the
> same as the dmadev enqueue ring size, for simplicity.

No question on not using power of 2. Aligned on that.

At least in our HW, the size of the ring is rte_dmadev_vchan_conf::nb_desc.
But completion happens in a _different_ memory space. Currently, we are
allocating UINT16_MAX entries to hold that. That's where the cache-miss
concern about completions came from.

In your case, does completion happen in the same ring memory (it looks like
one bit in the job descriptor represents whether the job completed or not)?
And when the application calls rte_dmadev_completed(), you are converting
the UINT16_MAX-based index to rte_dmadev_vchan_conf::nb_desc. Right?


>
> In fact, I was thinking that in later versions we may be able to include
> some macros to help make this whole process easier, of converting indexes
> to arbitrary data structures. [The reason for using macros is so that the
> actual rings we are indexing can be of user-defined type, rather than just
> a ring of pointers].
>
> /Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-12 16:34       ` Jerin Jacob
@ 2021-07-12 17:00         ` Bruce Richardson
  2021-07-13  8:59           ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-12 17:00 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Mon, Jul 12, 2021 at 10:04:07PM +0530, Jerin Jacob wrote:
> On Mon, Jul 12, 2021 at 7:02 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 03:29:27PM +0530, Jerin Jacob wrote:
> > > On Sun, Jul 11, 2021 at 2:59 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > >
> > > > This patch introduce 'dmadevice' which is a generic type of DMA
> > > > device.
> > > >
> > > > The APIs of dmadev library exposes some generic operations which can
> > > > enable configuration and I/O with the DMA devices.
> > > >
> > > > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> > > > ---
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > + *
> > > > + * Enqueue a fill operation onto the virtual DMA channel.
> > > > + *
> > > > + * This queues up a fill operation to be performed by hardware, but does not
> > > > + * trigger hardware to begin that operation.
> > > > + *
> > > > + * @param dev_id
> > > > + *   The identifier of the device.
> > > > + * @param vchan
> > > > + *   The identifier of virtual DMA channel.
> > > > + * @param pattern
> > > > + *   The pattern to populate the destination buffer with.
> > > > + * @param dst
> > > > + *   The address of the destination buffer.
> > > > + * @param length
> > > > + *   The length of the destination buffer.
> > > > + * @param flags
> > > > + *   An flags for this operation.
> > > > + *
> > > > + * @return
> > > > + *   - 0..UINT16_MAX: index of enqueued copy job.
> > >
> > > fill job
> > >
> > > > + *   - <0: Error code returned by the driver copy function.
> > > > + */
> > > > +__rte_experimental
> > > > +static inline int
> > > > +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> > > > +               rte_iova_t dst, uint32_t length, uint64_t flags)
> > > > +{
> > > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > > +#ifdef RTE_DMADEV_DEBUG
> > > > +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > > > +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> > > > +       if (vchan >= dev->data->dev_conf.max_vchans) {
> > > > +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > > > +               return -EINVAL;
> > > > +       }
> > > > +#endif
> > > > +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> > > > +}
> > > > +
> > >
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > + *
> > > > + * Returns the number of operations that have been successfully completed.
> > > > + *
> > > > + * @param dev_id
> > > > + *   The identifier of the device.
> > > > + * @param vchan
> > > > + *   The identifier of virtual DMA channel.
> > > > + * @param nb_cpls
> > > > + *   The maximum number of completed operations that can be processed.
> > > > + * @param[out] last_idx
> > > > + *   The last completed operation's index.
> > > > + *   If not required, NULL can be passed in.
> > >
> > > This means the driver will be tracking the last index.
> > >
> >
> > Driver will be doing this anyway, no, since it needs to ensure we don't
> 
> Yes.
> 
> > wrap around?
> 
> 
> >
> > > Is that mean, the application needs to call this API periodically to
> > > consume the completion slot.
> > > I.e up to 64K (UINT16_MAX)  outstanding jobs are possible. If the
> > > application fails to call this
> > > >64K outstand job then the subsequence enqueue will fail.
> >
> > Well, given that there will be a regular enqueue ring which will probably
> > be <= 64k in size, the completion call will need to be called frequently
> > anyway. I don't think we need to document this restriction as it's fairly
> > understood that you can't go beyond the size of the ring without cleanup.
> 
> 
> See below.
> 
> >
> > >
> > > If so, we need to document this.
> > >
> > > One of the concerns of keeping UINT16_MAX as the limit is the
> > > completion memory will always not in cache.
> > > On the other hand, if we make this size programmable. it may introduce
> > > complexity in the application.
> > >
> > > Thoughts?
> >
> > The reason for using powers-of-2 sizes, e.g. 0 .. UINT16_MAX, is that the
> > ring can be any other power-of-2 size and we can index it just by masking.
> > In the sample app for dmadev, I expect the ring size used to be set the
> > same as the dmadev enqueue ring size, for simplicity.
> 
> No question on not using power of 2. Aligned on that.
> 
> At least in our HW, the size of the ring is rte_dmadev_vchan_conf::nb_desc.
> But completion happens in _different_ memory space. Currently, we are allocating
> UINT16_MAX entries to hold that. That's where cache miss aspects of
> completion aspects
> came.

Depending on HW, our completions can be written back to a separate memory
area - a completion ring, if you will - but I've generally found it works
as well to reuse the enqueue ring for that purpose. However, with a
separate memory area for completions, why do you need to allocate 64K
entries for the completions? Would nb_desc entries not be enough? Is that
to allow the user to have more than nb_desc jobs outstanding before calling
"get_completions" API?

> 
> In your case, Is completion happens in the same ring memory(looks like
> one bit in job desc represents the job completed or not) ?
> And when application calls rte_dmadev_completed(), You  are converting
> UINT16_MAX based index to
> rte_dmadev_vchan_conf::nb_desc. Right?

Yes, we are masking to do that. Actually, for simplicity and perf we should
only allow power-of-2 ring sizes. Having to use modulus instead of masking
could be a problem. [Alternatively, I suppose we can allow drivers to round
up the ring sizes to the next power of 2, but I prefer just documenting it
as a limitation].

/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-12 17:00         ` Bruce Richardson
@ 2021-07-13  8:59           ` Jerin Jacob
  0 siblings, 0 replies; 339+ messages in thread
From: Jerin Jacob @ 2021-07-13  8:59 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Mon, Jul 12, 2021 at 10:30 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Mon, Jul 12, 2021 at 10:04:07PM +0530, Jerin Jacob wrote:
> > On Mon, Jul 12, 2021 at 7:02 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > On Mon, Jul 12, 2021 at 03:29:27PM +0530, Jerin Jacob wrote:
> > > > On Sun, Jul 11, 2021 at 2:59 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > >
> > > > > This patch introduce 'dmadevice' which is a generic type of DMA
> > > > > device.
> > > > >
> > > > > The APIs of dmadev library exposes some generic operations which can
> > > > > enable configuration and I/O with the DMA devices.
> > > > >
> > > > > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> > > > > ---
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > > + *
> > > > > + * Enqueue a fill operation onto the virtual DMA channel.
> > > > > + *
> > > > > + * This queues up a fill operation to be performed by hardware, but does not
> > > > > + * trigger hardware to begin that operation.
> > > > > + *
> > > > > + * @param dev_id
> > > > > + *   The identifier of the device.
> > > > > + * @param vchan
> > > > > + *   The identifier of virtual DMA channel.
> > > > > + * @param pattern
> > > > > + *   The pattern to populate the destination buffer with.
> > > > > + * @param dst
> > > > > + *   The address of the destination buffer.
> > > > > + * @param length
> > > > > + *   The length of the destination buffer.
> > > > > + * @param flags
> > > > > + *   An flags for this operation.
> > > > > + *
> > > > > + * @return
> > > > > + *   - 0..UINT16_MAX: index of enqueued copy job.
> > > >
> > > > fill job
> > > >
> > > > > + *   - <0: Error code returned by the driver copy function.
> > > > > + */
> > > > > +__rte_experimental
> > > > > +static inline int
> > > > > +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> > > > > +               rte_iova_t dst, uint32_t length, uint64_t flags)
> > > > > +{
> > > > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > > > +#ifdef RTE_DMADEV_DEBUG
> > > > > +       RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > > > > +       RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> > > > > +       if (vchan >= dev->data->dev_conf.max_vchans) {
> > > > > +               RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > > > > +               return -EINVAL;
> > > > > +       }
> > > > > +#endif
> > > > > +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> > > > > +}
> > > > > +
> > > >
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > > + *
> > > > > + * Returns the number of operations that have been successfully completed.
> > > > > + *
> > > > > + * @param dev_id
> > > > > + *   The identifier of the device.
> > > > > + * @param vchan
> > > > > + *   The identifier of virtual DMA channel.
> > > > > + * @param nb_cpls
> > > > > + *   The maximum number of completed operations that can be processed.
> > > > > + * @param[out] last_idx
> > > > > + *   The last completed operation's index.
> > > > > + *   If not required, NULL can be passed in.
> > > >
> > > > This means the driver will be tracking the last index.
> > > >
> > >
> > > Driver will be doing this anyway, no, since it needs to ensure we don't
> >
> > Yes.
> >
> > > wrap around?
> >
> >
> > >
> > > > Is that mean, the application needs to call this API periodically to
> > > > consume the completion slot.
> > > > I.e up to 64K (UINT16_MAX)  outstanding jobs are possible. If the
> > > > application fails to call this
> > > > >64K outstand job then the subsequence enqueue will fail.
> > >
> > > Well, given that there will be a regular enqueue ring which will probably
> > > be <= 64k in size, the completion call will need to be called frequently
> > > anyway. I don't think we need to document this restriction as it's fairly
> > > understood that you can't go beyond the size of the ring without cleanup.
> >
> >
> > See below.
> >
> > >
> > > >
> > > > If so, we need to document this.
> > > >
> > > > One of the concerns of keeping UINT16_MAX as the limit is the
> > > > completion memory will always not in cache.
> > > > On the other hand, if we make this size programmable. it may introduce
> > > > complexity in the application.
> > > >
> > > > Thoughts?
> > >
> > > The reason for using powers-of-2 sizes, e.g. 0 .. UINT16_MAX, is that the
> > > ring can be any other power-of-2 size and we can index it just by masking.
> > > In the sample app for dmadev, I expect the ring size used to be set the
> > > same as the dmadev enqueue ring size, for simplicity.
> >
> > No question on not using power of 2. Aligned on that.
> >
> > At least in our HW, the size of the ring is rte_dmadev_vchan_conf::nb_desc.
> > But completion happens in _different_ memory space. Currently, we are allocating
> > UINT16_MAX entries to hold that. That's where cache miss aspects of
> > completion aspects
> > came.
>
> Depending on HW, our completions can be written back to a separate memory
> area - a completion ring, if you will - but I've generally found it works
> as well to reuse the enqueue ring for that purpose. However, with a
> separate memory area for completions, why do you need to allocate 64K
> entries for the completions? Would nb_desc entries not be enough? Is that
> to allow the user to have more than nb_desc jobs outstanding before calling
> "get_completions" API?

Yes. That's what I thought. That's where my question about the maximum number
of outstanding completions came from. I thought it could be up to 64K. Agree
to keep it implementation-specific; there is no need to highlight this in the
documentation.


>
> >
> > In your case, Is completion happens in the same ring memory(looks like
> > one bit in job desc represents the job completed or not) ?
> > And when application calls rte_dmadev_completed(), You  are converting
> > UINT16_MAX based index to
> > rte_dmadev_vchan_conf::nb_desc. Right?
>
> Yes, we are masking to do that. Actually, for simplicity and perf we should
> only allow power-of-2 ring sizes. Having to use modulus instead of masking
> could be a problem. [Alternatively, I suppose we can allow drivers to round
> up the ring sizes to the next power of 2, but I prefer just documenting it
> as a limitation].

OK.

>
> /Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-12 15:50   ` Bruce Richardson
@ 2021-07-13  9:07     ` Jerin Jacob
  0 siblings, 0 replies; 339+ messages in thread
From: Jerin Jacob @ 2021-07-13  9:07 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Chengwen Feng, Thomas Monjalon, Ferruh Yigit, Jerin Jacob,
	dpdk-dev, Morten Brørup, Nipun Gupta, Hemant Agrawal,
	Maxime Coquelin, Honnappa Nagarahalli, David Marchand,
	Satananda Burla, Prasun Kapoor, Ananyev, Konstantin, liangma

On Mon, Jul 12, 2021 at 9:21 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Sun, Jul 11, 2021 at 05:25:56PM +0800, Chengwen Feng wrote:
> > This patch introduce 'dmadevice' which is a generic type of DMA
> > device.
> >
> > The APIs of dmadev library exposes some generic operations which can
> > enable configuration and I/O with the DMA devices.
> >
> > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
>
> Hi again,
>
> some further review comments inline.
>
> /Bruce
>
> > ---
> >  MAINTAINERS                  |    4 +
> >  config/rte_config.h          |    3 +
> >  lib/dmadev/meson.build       |    6 +
> >  lib/dmadev/rte_dmadev.c      |  560 +++++++++++++++++++++++
> >  lib/dmadev/rte_dmadev.h      | 1030 ++++++++++++++++++++++++++++++++++++++++++
> >  lib/dmadev/rte_dmadev_core.h |  159 +++++++
> >  lib/dmadev/rte_dmadev_pmd.h  |   72 +++
> >  lib/dmadev/version.map       |   40 ++
> >  lib/meson.build              |    1 +
>
> <snip>
>
> > diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> > new file mode 100644
> > index 0000000..8779512
> > --- /dev/null
> > +++ b/lib/dmadev/rte_dmadev.h
> > @@ -0,0 +1,1030 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2021 HiSilicon Limited.
> > + * Copyright(c) 2021 Intel Corporation.
> > + * Copyright(c) 2021 Marvell International Ltd.
> > + */
> > +
> > +#ifndef _RTE_DMADEV_H_
> > +#define _RTE_DMADEV_H_
> > +
> > +/**
> > + * @file rte_dmadev.h
> > + *
> > + * RTE DMA (Direct Memory Access) device APIs.
> > + *
> > + * The DMA framework is built on the following model:
> > + *
> > + *     ---------------   ---------------       ---------------
> > + *     | virtual DMA |   | virtual DMA |       | virtual DMA |
> > + *     | channel     |   | channel     |       | channel     |
> > + *     ---------------   ---------------       ---------------
> > + *            |                |                      |
> > + *            ------------------                      |
> > + *                     |                              |
> > + *               ------------                    ------------
> > + *               |  dmadev  |                    |  dmadev  |
> > + *               ------------                    ------------
> > + *                     |                              |
> > + *            ------------------               ------------------
> > + *            | HW-DMA-channel |               | HW-DMA-channel |
> > + *            ------------------               ------------------
> > + *                     |                              |
> > + *                     --------------------------------
> > + *                                     |
> > + *                           ---------------------
> > + *                           | HW-DMA-Controller |
> > + *                           ---------------------
> > + *
> > + * The DMA controller could have multilpe HW-DMA-channels (aka. HW-DMA-queues),
> > + * each HW-DMA-channel should be represented by a dmadev.
> > + *
> > + * The dmadev could create multiple virtual DMA channel, each virtual DMA
> > + * channel represents a different transfer context. The DMA operation request
> > + * must be submitted to the virtual DMA channel.
> > + * E.G. Application could create virtual DMA channel 0 for mem-to-mem transfer
> > + *      scenario, and create virtual DMA channel 1 for mem-to-dev transfer
> > + *      scenario.
> > + *
> > + * The dmadev are dynamically allocated by rte_dmadev_pmd_allocate() during the
> > + * PCI/SoC device probing phase performed at EAL initialization time. And could
> > + * be released by rte_dmadev_pmd_release() during the PCI/SoC device removing
> > + * phase.
> > + *
> > + * We use 'uint16_t dev_id' as the device identifier of a dmadev, and
> > + * 'uint16_t vchan' as the virtual DMA channel identifier in one dmadev.
> > + *
> > + * The functions exported by the dmadev API to setup a device designated by its
> > + * device identifier must be invoked in the following order:
> > + *     - rte_dmadev_configure()
> > + *     - rte_dmadev_vchan_setup()
> > + *     - rte_dmadev_start()
> > + *
> > + * Then, the application can invoke dataplane APIs to process jobs.
> > + *
> > + * If the application wants to change the configuration (i.e. call
> > + * rte_dmadev_configure()), it must call rte_dmadev_stop() first to stop the
> > + * device and then do the reconfiguration before calling rte_dmadev_start()
> > + * again. The dataplane APIs should not be invoked when the device is stopped.
> > + *
> > + * Finally, an application can close a dmadev by invoking the
> > + * rte_dmadev_close() function.
> > + *
> > + * The dataplane APIs include two parts:
> > + *   a) The first part is the submission of operation requests:
> > + *        - rte_dmadev_copy()
> > + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> > + *        - rte_dmadev_fill()
> > + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> > + *        - rte_dmadev_perform() - issue doorbell to hardware
> > + *      These APIs could work with different virtual DMA channels which have
> > + *      different contexts.
> > + *      The first four APIs are used to submit the operation request to the
> > + *      virtual DMA channel, if the submission is successful, a uint16_t
> > + *      ring_idx is returned, otherwise a negative number is returned.
> > + *   b) The second part is to obtain the result of requests:
> > + *        - rte_dmadev_completed()
> > + *            - return the number of operation requests completed successfully.
> > + *        - rte_dmadev_completed_fails()
> > + *            - return the number of operation requests failed to complete.
>
> Please rename this to "completed_status" to allow the return of information
> other than just errors. As I suggested before, I think this should also be
> usable as a slower version of "completed" even in the case where there are
> no errors, in that it returns status information for each and every job
> rather than just returning as soon as it hits a failure.
>
> > + *
> > + * About the ring_idx which rte_dmadev_copy/copy_sg/fill/fill_sg() returned,
> > + * the rules are as follows:
> > + *   a) ring_idx for each virtual DMA channel are independent.
> > + *   b) For a virtual DMA channel, the ring_idx is monotonically incremented,
> > + *      when it reach UINT16_MAX, it wraps back to zero.
>
> Based on other feedback, I suggest we put in the detail here that: "This
> index can be used by applications to track per-job metadata in an
> application-defined circular ring, where the ring is a power-of-2 size, and
> the indexes are masked appropriately."
>
> > + *   c) The initial ring_idx of a virtual DMA channel is zero, after the device
> > + *      is stopped or reset, the ring_idx needs to be reset to zero.
> > + *   Example:
> > + *      step-1: start one dmadev
> > + *      step-2: enqueue a copy operation, the ring_idx return is 0
> > + *      step-3: enqueue a copy operation again, the ring_idx return is 1
> > + *      ...
> > + *      step-101: stop the dmadev
> > + *      step-102: start the dmadev
> > + *      step-103: enqueue a copy operation, the cookie return is 0
> > + *      ...
> > + *      step-x+0: enqueue a fill operation, the ring_idx return is 65535
> > + *      step-x+1: enqueue a copy operation, the ring_idx return is 0
> > + *      ...
> > + *
> > + * By default, all the non-dataplane functions of the dmadev API exported by a
> > + * PMD are lock-free functions which assume to not be invoked in parallel on
> > + * different logical cores to work on the same target object.
> > + *
> > + * The dataplane functions of the dmadev API exported by a PMD can be MT-safe
> > + * only when supported by the driver, generally, the driver will reports two
> > + * capabilities:
> > + *   a) Whether to support MT-safe for the submit/completion API of the same
> > + *      virtual DMA channel.
> > + *      E.G. one thread do submit operation, another thread do completion
> > + *           operation.
> > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VCHAN.
> > + *      If driver don't support it, it's up to the application to guarantee
> > + *      MT-safe.
> > + *   b) Whether to support MT-safe for different virtual DMA channels.
> > + *      E.G. one thread do operation on virtual DMA channel 0, another thread
> > + *           do operation on virtual DMA channel 1.
> > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
> > + *      If driver don't support it, it's up to the application to guarantee
> > + *      MT-safe.
> > + *
> > + */
>
> Just to check - do we have hardware that currently supports these
> capabilities? For Intel HW, we will only support one virtual channel per
> device without any MT-safety guarantees, so won't be setting either of
> these flags. If any of these flags are unused in all planned drivers, we
> should drop them from the spec until they prove necessary. Idealy,
> everything in the dmadev definition should be testable, and features unused
> by anyone obviously will be untested.
>
> > +
> > +#include <rte_common.h>
> > +#include <rte_compat.h>
> > +#include <rte_errno.h>
> > +#include <rte_memory.h>
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#define RTE_DMADEV_NAME_MAX_LEN      RTE_DEV_NAME_MAX_LEN
> > +
> > +extern int rte_dmadev_logtype;
> > +
> > +#define RTE_DMADEV_LOG(level, ...) \
> > +     rte_log(RTE_LOG_ ## level, rte_dmadev_logtype, "" __VA_ARGS__)
> > +
> > +/* Macros to check for valid port */
> > +#define RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, retval) do { \
> > +     if (!rte_dmadev_is_valid_dev(dev_id)) { \
> > +             RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
> > +             return retval; \
> > +     } \
> > +} while (0)
> > +
> > +#define RTE_DMADEV_VALID_DEV_ID_OR_RET(dev_id) do { \
> > +     if (!rte_dmadev_is_valid_dev(dev_id)) { \
> > +             RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
> > +             return; \
> > +     } \
> > +} while (0)
> > +
>
> Can we avoid using these in the inline functions in this file, and move
> them to the _pmd.h which is for internal PMD use only? It would mean we
> don't get logging from the key dataplane functions, but I would hope the
> return values would provide enough info.
>
> Alternatively, can we keep the logtype definition and first macro and move
> the other two to the _pmd.h file.
>
> > +/**
> > + * @internal
> > + * Validate if the DMA device index is a valid attached DMA device.
> > + *
> > + * @param dev_id
> > + *   DMA device index.
> > + *
> > + * @return
> > + *   - If the device index is valid (true) or not (false).
> > + */
> > +__rte_internal
> > +bool
> > +rte_dmadev_is_valid_dev(uint16_t dev_id);
> > +
> > +/**
> > + * rte_dma_sg - can hold scatter DMA operation request
> > + */
> > +struct rte_dma_sg {
> > +     rte_iova_t src;
> > +     rte_iova_t dst;
> > +     uint32_t length;
> > +};
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Get the total number of DMA devices that have been successfully
> > + * initialised.
> > + *
> > + * @return
> > + *   The total number of usable DMA devices.
> > + */
> > +__rte_experimental
> > +uint16_t
> > +rte_dmadev_count(void);
> > +
> > +/**
> > + * The capabilities of a DMA device
> > + */
> > +#define RTE_DMA_DEV_CAPA_MEM_TO_MEM  (1ull << 0)
> > +/**< DMA device support mem-to-mem transfer.
>
> Do we need this? Can we assume that any device appearing as a dmadev can
> do mem-to-mem copies, and drop the capability for mem-to-mem and the
> capability for copying?
>
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_MEM_TO_DEV  (1ull << 1)
> > +/**< DMA device support slave mode & mem-to-dev transfer.
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_DEV_TO_MEM  (1ull << 2)
> > +/**< DMA device support slave mode & dev-to-mem transfer.
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_DEV_TO_DEV  (1ull << 3)
> > +/**< DMA device support slave mode & dev-to-dev transfer.
> > + *
>
> Just to confirm, are there devices currently planned for dmadev that

We are planning to use this support, as our existing raw driver has this.

> supports only a subset of these flags? Thinking particularly of the
> dev-2-mem and mem-2-dev ones here - do any of the devices we are
> considering not support using device memory?
> [Again, just want to ensure we aren't adding too much stuff that we don't
> need yet]



>
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_OPS_COPY    (1ull << 4)
> > +/**< DMA device support copy ops.
> > + *
>
> Suggest dropping this and making it min for dmadev.
>
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_OPS_FILL    (1ull << 5)
> > +/**< DMA device support fill ops.
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_OPS_SG              (1ull << 6)
> > +/**< DMA device support scatter-list ops.
> > + * If device support ops_copy and ops_sg, it means supporting copy_sg ops.
> > + * If device support ops_fill and ops_sg, it means supporting fill_sg ops.
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_FENCE               (1ull << 7)
> > +/**< DMA device support fence.
> > + * If device support fence, then application could set a fence flags when
> > + * enqueue operation by rte_dma_copy/copy_sg/fill/fill_sg.
> > + * If a operation has a fence flags, it means the operation must be processed
> > + * only after all previous operations are completed.
> > + *
>
> Is this needed? As I understand it, the Marvell driver doesn't require
> fences so providing one is a no-op. Therefore, this flag is probably
> unnecessary.

+1

>
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_SVA         (1ull << 8)
> > +/**< DMA device support SVA which could use VA as DMA address.
> > + * If device support SVA then application could pass any VA address like memory
> > + * from rte_malloc(), rte_memzone(), malloc, stack memory.
> > + * If device don't support SVA, then application should pass IOVA address which
> > + * from rte_malloc(), rte_memzone().
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_MT_VCHAN    (1ull << 9)
> > +/**< DMA device support MT-safe of a virtual DMA channel.
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
> > +#define RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN      (1ull << 10)
> > +/**< DMA device support MT-safe of different virtual DMA channels.
> > + *
> > + * @see struct rte_dmadev_info::dev_capa
> > + */
>
> As with comments above - let's check that these will actually be used
> before we add them.
>
> > +
> > +/**
> > + * A structure used to retrieve the contextual information of
> > + * an DMA device
> > + */
> > +struct rte_dmadev_info {
> > +     struct rte_device *device; /**< Generic Device information */
> > +     uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> > +     /** Maximum number of virtual DMA channels supported */
> > +     uint16_t max_vchans;
> > +     /** Maximum allowed number of virtual DMA channel descriptors */
> > +     uint16_t max_desc;
> > +     /** Minimum allowed number of virtual DMA channel descriptors */
> > +     uint16_t min_desc;
> > +     uint16_t nb_vchans; /**< Number of virtual DMA channel configured */
> > +};
>
> Let's add rte_dmadev_conf struct into this to return the configuration
> settings.
>
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Retrieve the contextual information of a DMA device.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param[out] dev_info
> > + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> > + *   contextual information of the device.
> > + *
> > + * @return
> > + *   - =0: Success, driver updates the contextual information of the DMA device
> > + *   - <0: Error code returned by the driver info get function.
> > + *
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
> > +
>
> Should have "const" on second param.
>
> > +/**
> > + * A structure used to configure a DMA device.
> > + */
> > +struct rte_dmadev_conf {
> > +     /** Maximum number of virtual DMA channel to use.
> > +      * This value cannot be greater than the field 'max_vchans' of struct
> > +      * rte_dmadev_info which get from rte_dmadev_info_get().
> > +      */
> > +     uint16_t max_vchans;
> > +     /** Enable bit for MT-safe of a virtual DMA channel.
> > +      * This bit can be enabled only when the device supports
> > +      * RTE_DMA_DEV_CAPA_MT_VCHAN.
> > +      * @see RTE_DMA_DEV_CAPA_MT_VCHAN
> > +      */
> > +     uint8_t enable_mt_vchan : 1;
> > +     /** Enable bit for MT-safe of different virtual DMA channels.
> > +      * This bit can be enabled only when the device supports
> > +      * RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN.
> > +      * @see RTE_DMA_DEV_CAPA_MT_MULTI_VCHAN
> > +      */
> > +     uint8_t enable_mt_multi_vchan : 1;
> > +     uint64_t reserved[2]; /**< Reserved for future fields */
> > +};
>
> Drop the reserved fields. ABI versioning is a better way to deal with
> adding new fields.

+1

>
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Configure a DMA device.
> > + *
> > + * This function must be invoked first before any other function in the
> > + * API. This function can also be re-invoked when a device is in the
> > + * stopped state.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device to configure.
> > + * @param dev_conf
> > + *   The DMA device configuration structure encapsulated into rte_dmadev_conf
> > + *   object.
> > + *
> > + * @return
> > + *   - =0: Success, device configured.
> > + *   - <0: Error code returned by the driver configuration function.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Start a DMA device.
> > + *
> > + * The device start step is the last one and consists of setting the DMA
> > + * to start accepting jobs.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + *
> > + * @return
> > + *   - =0: Success, device started.
> > + *   - <0: Error code returned by the driver start function.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_start(uint16_t dev_id);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Stop a DMA device.
> > + *
> > + * The device can be restarted with a call to rte_dmadev_start()
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + *
> > + * @return
> > + *   - =0: Success, device stopped.
> > + *   - <0: Error code returned by the driver stop function.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_stop(uint16_t dev_id);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Close a DMA device.
> > + *
> > + * The device cannot be restarted after this call.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + *
> > + * @return
> > + *  - =0: Successfully close device
> > + *  - <0: Failure to close device
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_close(uint16_t dev_id);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Reset a DMA device.
> > + *
> > + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
> > + * sense similar to hard or soft reset.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + *
> > + * @return
> > + *   - =0: Successfully reset device.
> > + *   - <0: Failure to reset device.
> > + *   - (-ENOTSUP): If the device doesn't support this function.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_reset(uint16_t dev_id);
> > +
> > +/**
> > + * DMA transfer direction defines.
> > + */
> > +#define RTE_DMA_MEM_TO_MEM   (1ull << 0)
> > +/**< DMA transfer direction - from memory to memory.
> > + *
> > + * @see struct rte_dmadev_vchan_conf::direction
> > + */
> > +#define RTE_DMA_MEM_TO_DEV   (1ull << 1)
> > +/**< DMA transfer direction - slave mode & from memory to device.
> > + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> > + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> > + * request from ARM memory to x86 host memory.
>
> For clarity, it would be good to specify in the scenario described which
> memory is the "mem" and which is the "dev" (I assume SoC memory is "mem"
> and x86 host memory is "dev"??)
>
> > + *
> > + * @see struct rte_dmadev_vchan_conf::direction
> > + */
> > +#define RTE_DMA_DEV_TO_MEM   (1ull << 2)
> > +/**< DMA transfer direction - slave mode & from device to memory.
> > + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> > + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> > + * request from x86 host memory to ARM memory.
> > + *
> > + * @see struct rte_dmadev_vchan_conf::direction
> > + */
> > +#define RTE_DMA_DEV_TO_DEV   (1ull << 3)
> > +/**< DMA transfer direction - slave mode & from device to device.
> > + * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs. In
> > + * this case, the ARM SoCs works in slave mode, it could initiate a DMA move
> > + * request from x86 host memory to another x86 host memory.
> > + *
> > + * @see struct rte_dmadev_vchan_conf::direction
> > + */
> > +#define RTE_DMA_TRANSFER_DIR_ALL     (RTE_DMA_MEM_TO_MEM | \
> > +                                      RTE_DMA_MEM_TO_DEV | \
> > +                                      RTE_DMA_DEV_TO_MEM | \
> > +                                      RTE_DMA_DEV_TO_DEV)
> > +
> > +/**
> > + * enum rte_dma_slave_port_type - slave mode type defines
> > + */
> > +enum rte_dma_slave_port_type {
> > +     /** The slave port is PCIE. */
> > +     RTE_DMA_SLAVE_PORT_PCIE = 1,
> > +};
> > +
>
> As previously mentioned, this needs to be updated to use other terms.
> For some suggested alternatives see:
> https://doc.dpdk.org/guides-21.05/contributing/coding_style.html#naming
>
> > +/**
> > + * A structure used to describe slave port parameters.
> > + */
> > +struct rte_dma_slave_port_parameters {
> > +     enum rte_dma_slave_port_type port_type;
> > +     union {
> > +             /** For PCIE port */
> > +             struct {
> > +                     /** The physical function number which to use */
> > +                     uint64_t pf_number : 6;
> > +                     /** Virtual function enable bit */
> > +                     uint64_t vf_enable : 1;
> > +                     /** The virtual function number which to use */
> > +                     uint64_t vf_number : 8;
> > +                     uint64_t pasid : 20;
> > +                     /** The attributes field in the TLP packet */
> > +                     uint64_t tlp_attr : 3;
> > +             };
> > +     };
> > +};
> > +
> > +/**
> > + * A structure used to configure a virtual DMA channel.
> > + */
> > +struct rte_dmadev_vchan_conf {
> > +     uint8_t direction; /**< Set of supported transfer directions */
> > +     /** Number of descriptor for the virtual DMA channel */
> > +     uint16_t nb_desc;
> > +     /** 1) Used to describes the dev parameter in the mem-to-dev/dev-to-mem
> > +      * transfer scenario.
> > +      * 2) Used to describes the src dev parameter in the dev-to-dev
> > +      * transfer scenario.
> > +      */
> > +     struct rte_dma_slave_port_parameters port;
> > +     /** Used to describes the dst dev parameters in the dev-to-dev
> > +      * transfer scenario.
> > +      */
> > +     struct rte_dma_slave_port_parameters peer_port;
> > +     uint64_t reserved[2]; /**< Reserved for future fields */
> > +};
>
> Let's drop the reserved fields and use ABI versioning if necessary in
> future.
>
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Allocate and set up a virtual DMA channel.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param conf
> > + *   The virtual DMA channel configuration structure encapsulated into
> > + *   rte_dmadev_vchan_conf object.
> > + *
> > + * @return
> > + *   - >=0: Allocate success, it is the virtual DMA channel id. This value must
> > + *          be less than the field 'max_vchans' of struct rte_dmadev_conf
> > +         which configured by rte_dmadev_configure().
>
> nit: whitespace error here.
>
> > + *   - <0: Error code returned by the driver virtual channel setup function.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_vchan_setup(uint16_t dev_id,
> > +                    const struct rte_dmadev_vchan_conf *conf);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Release a virtual DMA channel.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel which return by vchan setup.
> > + *
> > + * @return
> > + *   - =0: Successfully release the virtual DMA channel.
> > + *   - <0: Error code returned by the driver virtual channel release function.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);
> > +
> > +/**
> > + * rte_dmadev_stats - running statistics.
> > + */
> > +struct rte_dmadev_stats {
> > +     /** Count of operations which were successfully enqueued */
> > +     uint64_t enqueued_count;
> > +     /** Count of operations which were submitted to hardware */
> > +     uint64_t submitted_count;
> > +     /** Count of operations which failed to complete */
> > +     uint64_t completed_fail_count;
> > +     /** Count of operations which successfully complete */
> > +     uint64_t completed_count;
> > +     uint64_t reserved[4]; /**< Reserved for future fields */
> > +};
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Retrieve basic statistics of a or all virtual DMA channel(s).
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel, -1 means all channels.
> > + * @param[out] stats
> > + *   The basic statistics structure encapsulated into rte_dmadev_stats
> > + *   object.
> > + *
> > + * @return
> > + *   - =0: Successfully retrieve stats.
> > + *   - <0: Failure to retrieve stats.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_stats_get(uint16_t dev_id, int vchan,
>
> vchan as uint16_t rather than int, I think. This would apply to all
> dataplane functions. There is no need for a signed vchan value.
>
> > +                  struct rte_dmadev_stats *stats);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Reset basic statistics of a or all virtual DMA channel(s).
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel, -1 means all channels.
> > + *
> > + * @return
> > + *   - =0: Successfully reset stats.
> > + *   - <0: Failure to reset stats.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_stats_reset(uint16_t dev_id, int vchan);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Dump DMA device info.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param f
> > + *   The file to write the output to.
> > + *
> > + * @return
> > + *   0 on success. Non-zero otherwise.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_dump(uint16_t dev_id, FILE *f);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Trigger the dmadev self test.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + *
> > + * @return
> > + *   - 0: Selftest successful.
> > + *   - -ENOTSUP if the device doesn't support selftest
> > + *   - other values < 0 on failure.
> > + */
> > +__rte_experimental
> > +int
> > +rte_dmadev_selftest(uint16_t dev_id);
>
> I don't think this needs to be in the public API, since it should only be
> for the autotest app to use. Maybe move the prototype to the _pmd.h (since
> we don't have a separate internal header), and then the autotest app can
> pick it up from there.
>
> > +
> > +#include "rte_dmadev_core.h"
> > +
> > +/**
> > + *  DMA flags to augment operation preparation.
> > + *  Used as the 'flags' parameter of rte_dmadev_copy/copy_sg/fill/fill_sg.
> > + */
> > +#define RTE_DMA_FLAG_FENCE   (1ull << 0)
> > +/**< DMA fence flag
> > + * It means the operation with this flag must be processed only after all
> > + * previous operations are completed.
> > + *
> > + * @see rte_dmadev_copy()
> > + * @see rte_dmadev_copy_sg()
> > + * @see rte_dmadev_fill()
> > + * @see rte_dmadev_fill_sg()
> > + */
>
> As a general comment, I think all these multi-line comments should go
> before the item they describe. Comments after should only be used in the
> case where the comment fits on the rest of the line after a value.
>
> We also should define the SUBMIT flag as suggested by Jerin, to allow apps
> to automatically submit jobs after enqueue.
>
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a copy operation onto the virtual DMA channel.
> > + *
> > + * This queues up a copy operation to be performed by hardware, but does not
> > + * trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param src
> > + *   The address of the source buffer.
> > + * @param dst
> > + *   The address of the destination buffer.
> > + * @param length
> > + *   The length of the data to be copied.
> > + * @param flags
> > + *   An flags for this operation.
> > + *
> > + * @return
> > + *   - 0..UINT16_MAX: index of enqueued copy job.
> > + *   - <0: Error code returned by the driver copy function.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_copy(uint16_t dev_id, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
> > +             uint32_t length, uint64_t flags)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->copy, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +     return (*dev->copy)(dev, vchan, src, dst, length, flags);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a scatter list copy operation onto the virtual DMA channel.
> > + *
> > + * This queues up a scatter list copy operation to be performed by hardware,
> > + * but does not trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param sg
> > + *   The pointer of scatterlist.
> > + * @param sg_len
> > + *   The number of scatterlist elements.
> > + * @param flags
> > + *   An flags for this operation.
> > + *
> > + * @return
> > + *   - 0..UINT16_MAX: index of enqueued copy job.
> > + *   - <0: Error code returned by the driver copy function.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vchan, const struct rte_dma_sg *sg,
> > +                uint32_t sg_len, uint64_t flags)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(sg, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->copy_sg, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +     return (*dev->copy_sg)(dev, vchan, sg, sg_len, flags);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a fill operation onto the virtual DMA channel.
> > + *
> > + * This queues up a fill operation to be performed by hardware, but does not
> > + * trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param pattern
> > + *   The pattern to populate the destination buffer with.
> > + * @param dst
> > + *   The address of the destination buffer.
> > + * @param length
> > + *   The length of the destination buffer.
> > + * @param flags
> > + *   An flags for this operation.
> > + *
> > + * @return
> > + *   - 0..UINT16_MAX: index of enqueued copy job.
> > + *   - <0: Error code returned by the driver copy function.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> > +             rte_iova_t dst, uint32_t length, uint64_t flags)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +     return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a scatter list fill operation onto the virtual DMA channel.
> > + *
> > + * This queues up a scatter list fill operation to be performed by hardware,
> > + * but does not trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param pattern
> > + *   The pattern to populate the destination buffer with.
> > + * @param sg
> > + *   The pointer of scatterlist.
> > + * @param sg_len
> > + *   The number of scatterlist elements.
> > + * @param flags
> > + *   An flags for this operation.
> > + *
> > + * @return
> > + *   - 0..UINT16_MAX: index of enqueued copy job.
> > + *   - <0: Error code returned by the driver copy function.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
> > +                const struct rte_dma_sg *sg, uint32_t sg_len,
> > +                uint64_t flags)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(sg, -ENOTSUP);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +     return (*dev->fill_sg)(dev, vchan, pattern, sg, sg_len, flags);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Trigger hardware to begin performing enqueued operations.
> > + *
> > + * This API is used to write the "doorbell" to the hardware to trigger it
> > + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + *
> > + * @return
> > + *   - =0: Successfully trigger hardware.
> > + *   - <0: Failure to trigger hardware.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_submit(uint16_t dev_id, uint16_t vchan)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->submit, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +     return (*dev->submit)(dev, vchan);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that have been successfully completed.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param nb_cpls
> > + *   The maximum number of completed operations that can be processed.
> > + * @param[out] last_idx
> > + *   The last completed operation's index.
> > + *   If not required, NULL can be passed in.
> > + * @param[out] has_error
> > + *   Indicates if there are transfer error.
> > + *   If not required, NULL can be passed in.
> > + *
> > + * @return
> > + *   The number of operations that successfully completed.
> > + */
> > +__rte_experimental
> > +static inline uint16_t
> > +rte_dmadev_completed(uint16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
> > +                  uint16_t *last_idx, bool *has_error)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +     uint16_t idx;
> > +     bool err;
> > +
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->completed, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +     if (nb_cpls == 0) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid nb_cpls\n");
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +
> > +     /* Ensure the pointer values are non-null to simplify drivers.
> > +      * In most cases these should be compile time evaluated, since this is
> > +      * an inline function.
> > +      * - If NULL is explicitly passed as parameter, then compiler knows the
> > +      *   value is NULL
> > +      * - If address of local variable is passed as parameter, then compiler
> > +      *   can know it's non-NULL.
> > +      */
> > +     if (last_idx == NULL)
> > +             last_idx = &idx;
> > +     if (has_error == NULL)
> > +             has_error = &err;
> > +
> > +     *has_error = false;
> > +     return (*dev->completed)(dev, vchan, nb_cpls, last_idx, has_error);
> > +}
> > +
> > +/**
> > + * DMA transfer status code defines
> > + */
> > +enum rte_dma_status_code {
> > +     /** The operation completed successfully */
> > +     RTE_DMA_STATUS_SUCCESSFUL = 0,
> > +     /** The operation failed to complete due active drop
> > +      * This is mainly used when processing dev_stop, allow outstanding
> > +      * requests to be completed as much as possible.
> > +      */
> > +     RTE_DMA_STATUS_ACTIVE_DROP,
> > +     /** The operation failed to complete due invalid source address */
> > +     RTE_DMA_STATUS_INVALID_SRC_ADDR,
> > +     /** The operation failed to complete due invalid destination address */
> > +     RTE_DMA_STATUS_INVALID_DST_ADDR,
> > +     /** The operation failed to complete due invalid length */
> > +     RTE_DMA_STATUS_INVALID_LENGTH,
> > +     /** The operation failed to complete due invalid opcode
> > +      * The DMA descriptor could have multiple format, which are
> > +      * distinguished by the opcode field.
> > +      */
> > +     RTE_DMA_STATUS_INVALID_OPCODE,
> > +     /** The operation failed to complete due bus err */
> > +     RTE_DMA_STATUS_BUS_ERROR,
> > +     /** The operation failed to complete due data poison */
> > +     RTE_DMA_STATUS_DATA_POISION,
> > +     /** The operation failed to complete due descriptor read error */
> > +     RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR,
> > +     /** The operation failed to complete due device link error
> > +      * Used to indicates that the link error in the mem-to-dev/dev-to-mem/
> > +      * dev-to-dev transfer scenario.
> > +      */
> > +     RTE_DMA_STATUS_DEV_LINK_ERROR,
> > +     /** Driver specific status code offset
> > +      * Start status code for the driver to define its own error code.
> > +      */
> > +     RTE_DMA_STATUS_DRV_SPECIFIC_OFFSET = 0x10000,
> > +};
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that failed to complete.
> > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vchan
> > + *   The identifier of virtual DMA channel.
> > + * @param nb_status
> > + *   Indicates the size of status array.
> > + * @param[out] status
> > + *   The error code of operations that failed to complete.
> > + *   Some standard error code are described in 'enum rte_dma_status_code'
> > + *   @see rte_dma_status_code
> > + * @param[out] last_idx
> > + *   The last failed completed operation's index.
> > + *
> > + * @return
> > + *   The number of operations that failed to complete.
> > + */
> > +__rte_experimental
> > +static inline uint16_t
> > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
> > +                        const uint16_t nb_status, uint32_t *status,
> > +                        uint16_t *last_idx)
> > +{
> > +     struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +#ifdef RTE_DMADEV_DEBUG
> > +     RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(status, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(last_idx, -EINVAL);
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->completed_fails, -ENOTSUP);
> > +     if (vchan >= dev->data->dev_conf.max_vchans) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid vchan %d\n", vchan);
> > +             return -EINVAL;
> > +     }
> > +     if (nb_status == 0) {
> > +             RTE_DMADEV_LOG(ERR, "Invalid nb_status\n");
> > +             return -EINVAL;
> > +     }
> > +#endif
> > +     return (*dev->completed_fails)(dev, vchan, nb_status, status, last_idx);
> > +}
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_DMADEV_H_ */
> > diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> > new file mode 100644
> > index 0000000..410faf0
> > --- /dev/null
> > +++ b/lib/dmadev/rte_dmadev_core.h
> > @@ -0,0 +1,159 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2021 HiSilicon Limited.
> > + * Copyright(c) 2021 Intel Corporation.
> > + */
> > +
> > +#ifndef _RTE_DMADEV_CORE_H_
> > +#define _RTE_DMADEV_CORE_H_
> > +
> > +/**
> > + * @file
> > + *
> > + * RTE DMA Device internal header.
> > + *
> > + * This header contains internal data types, that are used by the DMA devices
> > + * in order to expose their ops to the class.
> > + *
> > + * Applications should not use these API directly.
> > + *
> > + */
> > +
> > +struct rte_dmadev;
> > +
> > +/** @internal Used to get device information of a device. */
> > +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> > +                              struct rte_dmadev_info *dev_info);
>
> First parameter can be "const"
>
> > +/** @internal Used to configure a device. */
> > +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> > +                               const struct rte_dmadev_conf *dev_conf);
> > +
> > +/** @internal Used to start a configured device. */
> > +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> > +
> > +/** @internal Used to stop a configured device. */
> > +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> > +
> > +/** @internal Used to close a configured device. */
> > +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> > +
> > +/** @internal Used to reset a configured device. */
> > +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> > +
> > +/** @internal Used to allocate and set up a virtual DMA channel. */
> > +typedef int (*dmadev_vchan_setup_t)(struct rte_dmadev *dev,
> > +                                 const struct rte_dmadev_vchan_conf *conf);
> > +
> > +/** @internal Used to release a virtual DMA channel. */
> > +typedef int (*dmadev_vchan_release_t)(struct rte_dmadev *dev, uint16_t vchan);
> > +
> > +/** @internal Used to retrieve basic statistics. */
> > +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vchan,
> > +                               struct rte_dmadev_stats *stats);
>
> First parameter can be "const"
>
> > +
> > +/** @internal Used to reset basic statistics. */
> > +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vchan);
> > +
> > +/** @internal Used to dump internal information. */
> > +typedef int (*dmadev_dump_t)(struct rte_dmadev *dev, FILE *f);
> > +
>
> First param "const"
>
> > +/** @internal Used to start dmadev selftest. */
> > +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> > +
>
> This looks like an outlier taking a dev_id. It should take a dmadev parameter.
> Most drivers should not need to implement this anyway, as the main unit
> tests should be in "test_dmadev.c" in the autotest app.
>
> > +/** @internal Used to enqueue a copy operation. */
> > +typedef int (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vchan,
> > +                          rte_iova_t src, rte_iova_t dst,
> > +                          uint32_t length, uint64_t flags);
> > +
> > +/** @internal Used to enqueue a scatter list copy operation. */
> > +typedef int (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
> > +                             const struct rte_dma_sg *sg,
> > +                             uint32_t sg_len, uint64_t flags);
> > +
> > +/** @internal Used to enqueue a fill operation. */
> > +typedef int (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vchan,
> > +                          uint64_t pattern, rte_iova_t dst,
> > +                          uint32_t length, uint64_t flags);
> > +
> > +/** @internal Used to enqueue a scatter list fill operation. */
> > +typedef int (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
> > +                     uint64_t pattern, const struct rte_dma_sg *sg,
> > +                     uint32_t sg_len, uint64_t flags);
> > +
> > +/** @internal Used to trigger hardware to begin working. */
> > +typedef int (*dmadev_submit_t)(struct rte_dmadev *dev, uint16_t vchan);
> > +
> > +/** @internal Used to return number of successful completed operations. */
> > +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vchan,
> > +                                    const uint16_t nb_cpls,
> > +                                    uint16_t *last_idx, bool *has_error);
> > +
> > +/** @internal Used to return number of failed completed operations. */
> > +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> > +                     uint16_t vchan, const uint16_t nb_status,
> > +                     uint32_t *status, uint16_t *last_idx);
> > +
> > +/**
> > + * DMA device operations function pointer table
> > + */
> > +struct rte_dmadev_ops {
> > +     dmadev_info_get_t dev_info_get;
> > +     dmadev_configure_t dev_configure;
> > +     dmadev_start_t dev_start;
> > +     dmadev_stop_t dev_stop;
> > +     dmadev_close_t dev_close;
> > +     dmadev_reset_t dev_reset;
> > +     dmadev_vchan_setup_t vchan_setup;
> > +     dmadev_vchan_release_t vchan_release;
> > +     dmadev_stats_get_t stats_get;
> > +     dmadev_stats_reset_t stats_reset;
> > +     dmadev_dump_t dev_dump;
> > +     dmadev_selftest_t dev_selftest;
> > +};
> > +
> > +/**
> > + * @internal
> > + * The data part, with no function pointers, associated with each DMA device.
> > + *
> > + * This structure is safe to place in shared memory to be common among different
> > + * processes in a multi-process configuration.
> > + */
> > +struct rte_dmadev_data {
> > +     uint16_t dev_id; /**< Device [external] identifier. */
> > +     char dev_name[RTE_DMADEV_NAME_MAX_LEN]; /**< Unique identifier name */
> > +     void *dev_private; /**< PMD-specific private data. */
> > +     struct rte_dmadev_conf dev_conf; /**< DMA device configuration. */
> > +     uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
> > +     uint64_t reserved[4]; /**< Reserved for future fields */
> > +} __rte_cache_aligned;
> > +
>
> While I generally don't like having reserved space, this is one place where
> it makes sense, so +1 for it here.
>
> > +/**
> > + * @internal
> > + * The generic data structure associated with each DMA device.
> > + *
> > + * The dataplane APIs are located at the beginning of the structure, along
> > + * with the pointer to where all the data elements for the particular device
> > + * are stored in shared memory. This split scheme allows the function pointer
> > + * and driver data to be per-process, while the actual configuration data for
> > + * the device is shared.
> > + */
> > +struct rte_dmadev {
> > +     dmadev_copy_t copy;
> > +     dmadev_copy_sg_t copy_sg;
> > +     dmadev_fill_t fill;
> > +     dmadev_fill_sg_t fill_sg;
> > +     dmadev_submit_t submit;
> > +     dmadev_completed_t completed;
> > +     dmadev_completed_fails_t completed_fails;
> > +     const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD. */
> > +     /** Flag indicating the device is attached: ATTACHED(1)/DETACHED(0). */
> > +     uint8_t attached : 1;
>
> Since it's in the midst of a series of pointers, this 1-bit flag is
> actually using 8 bytes of space. Is it needed? Can we use dev_ops == NULL
> or data == NULL instead to indicate this is a valid entry?
>
> > +     /** Device info which supplied during device initialization. */
> > +     struct rte_device *device;
> > +     struct rte_dmadev_data *data; /**< Pointer to device data. */
>
> If we are to try and minimise cacheline access, we should put this data
> pointer - or even better a copy of data->private pointer - at the top of
> the structure on the same cacheline as datapath operations. For dataplane,
> I can't see any elements of data, except the private pointer being
> accessed, so we would probably get most benefit for having a copy put there
> on init of the dmadev struct.
>
> > +     uint64_t reserved[4]; /**< Reserved for future fields */
> > +} __rte_cache_aligned;
> > +
> > +extern struct rte_dmadev rte_dmadevices[];
> > +
> > +#endif /* _RTE_DMADEV_CORE_H_ */
> > diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
> > new file mode 100644
> > index 0000000..45141f9
> > --- /dev/null
> > +++ b/lib/dmadev/rte_dmadev_pmd.h
> > @@ -0,0 +1,72 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2021 HiSilicon Limited.
> > + */
> > +
> > +#ifndef _RTE_DMADEV_PMD_H_
> > +#define _RTE_DMADEV_PMD_H_
> > +
> > +/**
> > + * @file
> > + *
> > + * RTE DMA Device PMD APIs
> > + *
> > + * Driver facing APIs for a DMA device. These are not to be called directly by
> > + * any application.
> > + */
> > +
> > +#include "rte_dmadev.h"
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +/**
> > + * @internal
> > + * Allocates a new dmadev slot for an DMA device and returns the pointer
> > + * to that slot for the driver to use.
> > + *
> > + * @param name
> > + *   DMA device name.
> > + *
> > + * @return
> > + *   A pointer to the DMA device slot case of success,
> > + *   NULL otherwise.
> > + */
> > +__rte_internal
> > +struct rte_dmadev *
> > +rte_dmadev_pmd_allocate(const char *name);
> > +
> > +/**
> > + * @internal
> > + * Release the specified dmadev.
> > + *
> > + * @param dev
> > + *   Device to be released.
> > + *
> > + * @return
> > + *   - 0 on success, negative on error
> > + */
> > +__rte_internal
> > +int
> > +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> > +
> > +/**
> > + * @internal
> > + * Return the DMA device based on the device name.
> > + *
> > + * @param name
> > + *   DMA device name.
> > + *
> > + * @return
> > + *   A pointer to the DMA device slot case of success,
> > + *   NULL otherwise.
> > + */
> > +__rte_internal
> > +struct rte_dmadev *
> > +rte_dmadev_get_device_by_name(const char *name);
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_DMADEV_PMD_H_ */
> > diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
> > new file mode 100644
> > index 0000000..0f099e7
> > --- /dev/null
> > +++ b/lib/dmadev/version.map
> > @@ -0,0 +1,40 @@
> > +EXPERIMENTAL {
> > +     global:
> > +
> > +     rte_dmadev_count;
> > +     rte_dmadev_info_get;
> > +     rte_dmadev_configure;
> > +     rte_dmadev_start;
> > +     rte_dmadev_stop;
> > +     rte_dmadev_close;
> > +     rte_dmadev_reset;
> > +     rte_dmadev_vchan_setup;
> > +     rte_dmadev_vchan_release;
> > +     rte_dmadev_stats_get;
> > +     rte_dmadev_stats_reset;
> > +     rte_dmadev_dump;
> > +     rte_dmadev_selftest;
> > +     rte_dmadev_copy;
> > +     rte_dmadev_copy_sg;
> > +     rte_dmadev_fill;
> > +     rte_dmadev_fill_sg;
> > +     rte_dmadev_submit;
> > +     rte_dmadev_completed;
> > +     rte_dmadev_completed_fails;
> > +
> > +     local: *;
> > +};
>
> The elements in the version.map file blocks should be sorted alphabetically.
>
> > +
> > +INTERNAL {
> > +        global:
> > +
> > +     rte_dmadevices;
> > +     rte_dmadev_pmd_allocate;
> > +     rte_dmadev_pmd_release;
> > +     rte_dmadev_get_device_by_name;
> > +
> > +     local:
> > +
> > +     rte_dmadev_is_valid_dev;
> > +};
> > +
> > diff --git a/lib/meson.build b/lib/meson.build
> > index 1673ca4..68d239f 100644
> > --- a/lib/meson.build
> > +++ b/lib/meson.build
> > @@ -60,6 +60,7 @@ libraries = [
> >          'bpf',
> >          'graph',
> >          'node',
> > +        'dmadev',
> >  ]
> >
> >  if is_windows
> > --
> > 2.8.1
> >

^ permalink raw reply	[flat|nested] 339+ messages in thread

* [dpdk-dev] [PATCH v3] dmadev: introduce DMA device library
  2021-07-02 13:18 [dpdk-dev] [PATCH] dmadev: introduce DMA device library Chengwen Feng
                   ` (5 preceding siblings ...)
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
@ 2021-07-13 12:27 ` Chengwen Feng
  2021-07-13 13:06   ` fengchengwen
                     ` (4 more replies)
  2021-07-15 15:41 ` [dpdk-dev] [PATCH v4] " Chengwen Feng
                   ` (22 subsequent siblings)
  29 siblings, 5 replies; 339+ messages in thread
From: Chengwen Feng @ 2021-07-13 12:27 UTC (permalink / raw)
  To: thomas, ferruh.yigit, bruce.richardson, jerinj, jerinjacobk,
	andrew.rybchenko
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev

This patch introduces 'dmadevice' which is a generic type of DMA
device.

The APIs of the dmadev library expose some generic operations which can
enable configuration and I/O with the DMA devices.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
v3:
* rm reset and fill_sg ops.
* rm MT-safe capabilities.
* add submit flag.
* redefine rte_dma_sg to implement asymmetric copy.
* delete some reserved field for future use.
* rearrangement rte_dmadev/rte_dmadev_data struct.
* refresh rte_dmadev.h copyright.
* update vchan setup parameter.
* modified some inappropriate descriptions.
* arrange version.map alphabetically.
* other minor modifications from review comment.
---
 MAINTAINERS                  |   4 +
 config/rte_config.h          |   3 +
 lib/dmadev/meson.build       |   7 +
 lib/dmadev/rte_dmadev.c      | 561 +++++++++++++++++++++++++
 lib/dmadev/rte_dmadev.h      | 968 +++++++++++++++++++++++++++++++++++++++++++
 lib/dmadev/rte_dmadev_core.h | 161 +++++++
 lib/dmadev/rte_dmadev_pmd.h  |  72 ++++
 lib/dmadev/version.map       |  37 ++
 lib/meson.build              |   1 +
 9 files changed, 1814 insertions(+)
 create mode 100644 lib/dmadev/meson.build
 create mode 100644 lib/dmadev/rte_dmadev.c
 create mode 100644 lib/dmadev/rte_dmadev.h
 create mode 100644 lib/dmadev/rte_dmadev_core.h
 create mode 100644 lib/dmadev/rte_dmadev_pmd.h
 create mode 100644 lib/dmadev/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index af2a91d..e01a07f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -495,6 +495,10 @@ F: drivers/raw/skeleton/
 F: app/test/test_rawdev.c
 F: doc/guides/prog_guide/rawdev.rst
 
+DMA device API - EXPERIMENTAL
+M: Chengwen Feng <fengchengwen@huawei.com>
+F: lib/dmadev/
+
 
 Memory Pool Drivers
 -------------------
diff --git a/config/rte_config.h b/config/rte_config.h
index 590903c..331a431 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -81,6 +81,9 @@
 /* rawdev defines */
 #define RTE_RAWDEV_MAX_DEVS 64
 
+/* dmadev defines */
+#define RTE_DMADEV_MAX_DEVS 64
+
 /* ip_fragmentation defines */
 #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
 #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
new file mode 100644
index 0000000..d2fc85e
--- /dev/null
+++ b/lib/dmadev/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited.
+
+sources = files('rte_dmadev.c')
+headers = files('rte_dmadev.h')
+indirect_headers += files('rte_dmadev_core.h')
+driver_sdk_headers += files('rte_dmadev_pmd.h')
diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
new file mode 100644
index 0000000..1bca463
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.c
@@ -0,0 +1,561 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_debug.h>
+#include <rte_dev.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+
+#include "rte_dmadev.h"
+#include "rte_dmadev_pmd.h"
+
+struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
+
+static const char *MZ_RTE_DMADEV_DATA = "rte_dmadev_data";
+/* Shared memory between primary and secondary processes. */
+static struct {
+	struct rte_dmadev_data data[RTE_DMADEV_MAX_DEVS];
+} *dmadev_shared_data;
+
+RTE_LOG_REGISTER(rte_dmadev_logtype, lib.dmadev, INFO);
+#define RTE_DMADEV_LOG(level, ...) \
+	rte_log(RTE_LOG_ ## level, rte_dmadev_logtype, "" __VA_ARGS__)
+
+/* Macros to check for valid device id */
+#define RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, retval) do { \
+	if (!rte_dmadev_is_valid_dev(dev_id)) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
+		return retval; \
+	} \
+} while (0)
+
+#define RTE_DMADEV_VALID_DEV_ID_OR_RET(dev_id) do { \
+	if (!rte_dmadev_is_valid_dev(dev_id)) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%u\n", dev_id); \
+		return; \
+	} \
+} while (0)
+
+/* Macro to check for invalid pointers */
+#define RTE_DMADEV_PTR_OR_ERR_RET(ptr, retval) do { \
+	if ((ptr) == NULL) \
+		return retval; \
+} while (0)
+
+static int
+dmadev_check_name(const char *name)
+{
+	size_t name_len;
+
+	if (name == NULL) {
+		RTE_DMADEV_LOG(ERR, "Name can't be NULL\n");
+		return -EINVAL;
+	}
+
+	name_len = strnlen(name, RTE_DMADEV_NAME_MAX_LEN);
+	if (name_len == 0) {
+		RTE_DMADEV_LOG(ERR, "Zero length DMA device name\n");
+		return -EINVAL;
+	}
+	if (name_len >= RTE_DMADEV_NAME_MAX_LEN) {
+		RTE_DMADEV_LOG(ERR, "DMA device name is too long\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static uint16_t
+dmadev_find_free_dev(void)
+{
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (dmadev_shared_data->data[i].dev_name[0] == '\0') {
+			RTE_ASSERT(rte_dmadevices[i].state ==
+				   RTE_DMADEV_UNUSED);
+			return i;
+		}
+	}
+
+	return RTE_DMADEV_MAX_DEVS;
+}
+
+static struct rte_dmadev*
+dmadev_find(const char *name)
+{
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if ((rte_dmadevices[i].state == RTE_DMADEV_ATTACHED) &&
+		    (!strcmp(name, rte_dmadevices[i].data->dev_name)))
+			return &rte_dmadevices[i];
+	}
+
+	return NULL;
+}
+
+static int
+dmadev_shared_data_prepare(void)
+{
+	const struct rte_memzone *mz;
+
+	if (dmadev_shared_data == NULL) {
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			/* Allocate port data and ownership shared memory. */
+			mz = rte_memzone_reserve(MZ_RTE_DMADEV_DATA,
+					 sizeof(*dmadev_shared_data),
+					 rte_socket_id(), 0);
+		} else {
+			mz = rte_memzone_lookup(MZ_RTE_DMADEV_DATA);
+		}
+		if (mz == NULL)
+			return -ENOMEM;
+
+		dmadev_shared_data = mz->addr;
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			memset(dmadev_shared_data->data, 0,
+			       sizeof(dmadev_shared_data->data));
+	}
+
+	return 0;
+}
+
+static struct rte_dmadev *
+dmadev_allocate(const char *name)
+{
+	struct rte_dmadev *dev;
+	uint16_t dev_id;
+
+	dev = dmadev_find(name);
+	if (dev != NULL) {
+		RTE_DMADEV_LOG(ERR, "DMA device already allocated\n");
+		return NULL;
+	}
+
+	dev_id = dmadev_find_free_dev();
+	if (dev_id == RTE_DMADEV_MAX_DEVS) {
+		RTE_DMADEV_LOG(ERR, "Reached maximum number of DMA devices\n");
+		return NULL;
+	}
+
+	if (dmadev_shared_data_prepare() != 0) {
+		RTE_DMADEV_LOG(ERR, "Cannot allocate DMA shared data\n");
+		return NULL;
+	}
+
+	dev = &rte_dmadevices[dev_id];
+	dev->data = &dmadev_shared_data->data[dev_id];
+	dev->data->dev_id = dev_id;
+	strlcpy(dev->data->dev_name, name, sizeof(dev->data->dev_name));
+
+	return dev;
+}
+
+static struct rte_dmadev *
+dmadev_attach_secondary(const char *name)
+{
+	struct rte_dmadev *dev;
+	uint16_t i;
+
+	if (dmadev_shared_data_prepare() != 0) {
+		RTE_DMADEV_LOG(ERR, "Cannot allocate DMA shared data\n");
+		return NULL;
+	}
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (!strcmp(dmadev_shared_data->data[i].dev_name, name))
+			break;
+	}
+	if (i == RTE_DMADEV_MAX_DEVS) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %s is not driven by the primary process\n",
+			name);
+		return NULL;
+	}
+
+	dev = &rte_dmadevices[i];
+	dev->data = &dmadev_shared_data->data[i];
+	RTE_ASSERT(dev->data->dev_id == i);
+
+	return dev;
+}
+
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name)
+{
+	struct rte_dmadev *dev;
+
+	if (dmadev_check_name(name) != 0)
+		return NULL;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		dev = dmadev_allocate(name);
+	else
+		dev = dmadev_attach_secondary(name);
+
+	if (dev == NULL)
+		return NULL;
+	dev->state = RTE_DMADEV_ATTACHED;
+
+	return dev;
+}
+
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (dev->state == RTE_DMADEV_UNUSED)
+		return 0;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		rte_free(dev->data->dev_private);
+		memset(dev->data, 0, sizeof(struct rte_dmadev_data));
+	}
+
+	memset(dev, 0, sizeof(struct rte_dmadev));
+	dev->state = RTE_DMADEV_UNUSED;
+
+	return 0;
+}
+
+struct rte_dmadev *
+rte_dmadev_get_device_by_name(const char *name)
+{
+	if (dmadev_check_name(name) != 0)
+		return NULL;
+	return dmadev_find(name);
+}
+
+bool
+rte_dmadev_is_valid_dev(uint16_t dev_id)
+{
+	if (dev_id >= RTE_DMADEV_MAX_DEVS ||
+	    rte_dmadevices[dev_id].state != RTE_DMADEV_ATTACHED)
+		return false;
+	return true;
+}
+
+uint16_t
+rte_dmadev_count(void)
+{
+	uint16_t count = 0;
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (rte_dmadevices[i].state == RTE_DMADEV_ATTACHED)
+			count++;
+	}
+
+	return count;
+}
+
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
+{
+	const struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_DMADEV_PTR_OR_ERR_RET(dev_info, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
+	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
+	ret = (*dev->dev_ops->dev_info_get)(dev, dev_info,
+					    sizeof(struct rte_dmadev_info));
+	if (ret != 0)
+		return ret;
+
+	dev_info->device = dev->device;
+	dev_info->nb_vchans = dev->data->dev_conf.max_vchans;
+
+	return 0;
+}
+
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
+{
+	struct rte_dmadev_info info;
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_DMADEV_PTR_OR_ERR_RET(dev_conf, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	ret = rte_dmadev_info_get(dev_id, &info);
+	if (ret != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
+		return -EINVAL;
+	}
+	if (dev_conf->max_vchans > info.max_vchans) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u configure too many vchans\n", dev_id);
+		return -EINVAL;
+	}
+
+	if (dev->data->dev_started != 0) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u must be stopped to allow configuration\n",
+			dev_id);
+		return -EBUSY;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
+	ret = (*dev->dev_ops->dev_configure)(dev, dev_conf);
+	if (ret == 0)
+		memcpy(&dev->data->dev_conf, dev_conf, sizeof(*dev_conf));
+
+	return ret;
+}
+
+int
+rte_dmadev_start(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev->data->dev_started != 0) {
+		RTE_DMADEV_LOG(WARNING, "Device %u already started\n", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_start == NULL)
+		goto mark_started;
+
+	ret = (*dev->dev_ops->dev_start)(dev);
+	if (ret != 0)
+		return ret;
+
+mark_started:
+	dev->data->dev_started = 1;
+	return 0;
+}
+
+int
+rte_dmadev_stop(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev->data->dev_started == 0) {
+		RTE_DMADEV_LOG(WARNING, "Device %u already stopped\n", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_stop == NULL)
+		goto mark_stopped;
+
+	ret = (*dev->dev_ops->dev_stop)(dev);
+	if (ret != 0)
+		return ret;
+
+mark_stopped:
+	dev->data->dev_started = 0;
+	return 0;
+}
+
+int
+rte_dmadev_close(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	/* Device must be stopped before it can be closed */
+	if (dev->data->dev_started == 1) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u must be stopped before closing\n", dev_id);
+		return -EBUSY;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
+	return (*dev->dev_ops->dev_close)(dev);
+}
+
+int
+rte_dmadev_vchan_setup(uint16_t dev_id,
+		       const struct rte_dmadev_vchan_conf *conf)
+{
+	struct rte_dmadev_info info;
+	struct rte_dmadev *dev;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_DMADEV_PTR_OR_ERR_RET(conf, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	ret = rte_dmadev_info_get(dev_id, &info);
+	if (ret != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
+		return -EINVAL;
+	}
+	if (conf->direction == 0 ||
+	    conf->direction & ~RTE_DMA_TRANSFER_DIR_ALL) {
+		RTE_DMADEV_LOG(ERR, "Device %u direction invalid!\n", dev_id);
+		return -EINVAL;
+	}
+	if (conf->direction & RTE_DMA_MEM_TO_MEM &&
+	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MEM_TO_MEM)) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support mem2mem transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if (conf->direction & RTE_DMA_MEM_TO_DEV &&
+	    !(info.dev_capa & RTE_DMA_DEV_CAPA_MEM_TO_DEV)) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support mem2dev transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if (conf->direction & RTE_DMA_DEV_TO_MEM &&
+	    !(info.dev_capa & RTE_DMA_DEV_CAPA_DEV_TO_MEM)) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support dev2mem transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if (conf->direction & RTE_DMA_DEV_TO_DEV &&
+	    !(info.dev_capa & RTE_DMA_DEV_CAPA_DEV_TO_DEV)) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u don't support dev2dev transfer\n", dev_id);
+		return -EINVAL;
+	}
+	if (conf->nb_desc < info.min_desc || conf->nb_desc > info.max_desc) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u number of descriptors invalid\n", dev_id);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_setup, -ENOTSUP);
+	return (*dev->dev_ops->vchan_setup)(dev, conf);
+}
+
+int
+rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	if (vchan >= dev->data->dev_conf.max_vchans) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u vchan %u out of range\n", dev_id, vchan);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_release, -ENOTSUP);
+	return (*dev->dev_ops->vchan_release)(dev, vchan);
+}
+
+int
+rte_dmadev_stats_get(uint16_t dev_id, uint16_t vchan,
+		     struct rte_dmadev_stats *stats)
+{
+	const struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_DMADEV_PTR_OR_ERR_RET(stats, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	if (vchan >= dev->data->dev_conf.max_vchans &&
+	    vchan != RTE_DMADEV_ALL_VCHAN) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u vchan %u out of range\n", dev_id, vchan);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
+	return (*dev->dev_ops->stats_get)(dev, vchan, stats,
+					  sizeof(struct rte_dmadev_stats));
+}
+
+int
+rte_dmadev_stats_reset(uint16_t dev_id, uint16_t vchan)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	if (vchan >= dev->data->dev_conf.max_vchans &&
+	    vchan != RTE_DMADEV_ALL_VCHAN) {
+		RTE_DMADEV_LOG(ERR,
+			"Device %u vchan %u out of range\n", dev_id, vchan);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
+	return (*dev->dev_ops->stats_reset)(dev, vchan);
+}
+
+int
+rte_dmadev_dump(uint16_t dev_id, FILE *f)
+{
+	const struct rte_dmadev *dev;
+	struct rte_dmadev_info info;
+	int ret;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_DMADEV_PTR_OR_ERR_RET(f, -EINVAL);
+
+	ret = rte_dmadev_info_get(dev_id, &info);
+	if (ret != 0) {
+		RTE_DMADEV_LOG(ERR, "Device %u get device info fail\n", dev_id);
+		return -EINVAL;
+	}
+
+	dev = &rte_dmadevices[dev_id];
+
+	fprintf(f, "DMA Dev %u, '%s' [%s]\n",
+		dev->data->dev_id,
+		dev->data->dev_name,
+		dev->data->dev_started ? "started" : "stopped");
+	fprintf(f, "  dev_capa: 0x%" PRIx64 "\n", info.dev_capa);
+	fprintf(f, "  max_vchans_supported: %u\n", info.max_vchans);
+	fprintf(f, "  max_vchans_configured: %u\n", info.nb_vchans);
+
+	if (dev->dev_ops->dev_dump != NULL)
+		return (*dev->dev_ops->dev_dump)(dev, f);
+
+	return 0;
+}
+
+int
+rte_dmadev_selftest(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
+	return (*dev->dev_ops->dev_selftest)(dev_id);
+}
diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
new file mode 100644
index 0000000..f6cc4e5
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.h
@@ -0,0 +1,968 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ * Copyright(c) 2021 Intel Corporation.
+ * Copyright(c) 2021 Marvell International Ltd.
+ * Copyright(c) 2021 SmartShare Systems.
+ */
+
+#ifndef _RTE_DMADEV_H_
+#define _RTE_DMADEV_H_
+
+/**
+ * @file rte_dmadev.h
+ *
+ * RTE DMA (Direct Memory Access) device APIs.
+ *
+ * The DMA framework is built on the following model:
+ *
+ *     ---------------   ---------------       ---------------
+ *     | virtual DMA |   | virtual DMA |       | virtual DMA |
+ *     | channel     |   | channel     |       | channel     |
+ *     ---------------   ---------------       ---------------
+ *            |                |                      |
+ *            ------------------                      |
+ *                     |                              |
+ *               ------------                    ------------
+ *               |  dmadev  |                    |  dmadev  |
+ *               ------------                    ------------
+ *                     |                              |
+ *            ------------------               ------------------
+ *            | HW-DMA-channel |               | HW-DMA-channel |
+ *            ------------------               ------------------
+ *                     |                              |
+ *                     --------------------------------
+ *                                     |
+ *                           ---------------------
+ *                           | HW-DMA-Controller |
+ *                           ---------------------
+ *
+ * The DMA controller could have multiple HW-DMA-channels (aka. HW-DMA-queues),
+ * each HW-DMA-channel should be represented by a dmadev.
+ *
+ * The dmadev could create multiple virtual DMA channels, each virtual DMA
+ * channel represents a different transfer context. The DMA operation request
+ * must be submitted to the virtual DMA channel.
+ * E.G. Application could create virtual DMA channel 0 for mem-to-mem transfer
+ *      scenario, and create virtual DMA channel 1 for mem-to-dev transfer
+ *      scenario.
+ *
+ * The dmadev are dynamically allocated by rte_dmadev_pmd_allocate() during the
+ * PCI/SoC device probing phase performed at EAL initialization time. And could
+ * be released by rte_dmadev_pmd_release() during the PCI/SoC device removing
+ * phase.
+ *
+ * This framework uses 'uint16_t dev_id' as the device identifier of a dmadev,
+ * and 'uint16_t vchan' as the virtual DMA channel identifier in one dmadev.
+ *
+ * The functions exported by the dmadev API to setup a device designated by its
+ * device identifier must be invoked in the following order:
+ *     - rte_dmadev_configure()
+ *     - rte_dmadev_vchan_setup()
+ *     - rte_dmadev_start()
+ *
+ * Then, the application can invoke dataplane APIs to process jobs.
+ *
+ * If the application wants to change the configuration (i.e. call
+ * rte_dmadev_configure()), it must call rte_dmadev_stop() first to stop the
+ * device and then do the reconfiguration before calling rte_dmadev_start()
+ * again. The dataplane APIs should not be invoked when the device is stopped.
+ *
+ * Finally, an application can close a dmadev by invoking the
+ * rte_dmadev_close() function.
+ *
+ * The dataplane APIs include two parts:
+ *   a) The first part is the submission of operation requests:
+ *        - rte_dmadev_copy()
+ *        - rte_dmadev_copy_sg() - scatter-gather form of copy
+ *        - rte_dmadev_fill()
+ *        - rte_dmadev_fill_sg() - scatter-gather form of fill
+ *        - rte_dmadev_submit() - issue doorbell to hardware
+ *      These APIs could work with different virtual DMA channels which have
+ *      different contexts.
+ *      The first four APIs are used to submit the operation request to the
+ *      virtual DMA channel, if the submission is successful, a uint16_t
+ *      ring_idx is returned, otherwise a negative number is returned.
+ *   b) The second part is to obtain the result of requests:
+ *        - rte_dmadev_completed()
+ *            - return the number of operation requests completed successfully.
+ *        - rte_dmadev_completed_fails()
+ *            - return the number of operation requests failed to complete.
+ *
+ * About the ring_idx which rte_dmadev_copy/copy_sg/fill/fill_sg() returned,
+ * the rules are as follows:
+ *   a) ring_idx for each virtual DMA channel are independent.
+ *   b) For a virtual DMA channel, the ring_idx is monotonically incremented,
+ *      when it reaches UINT16_MAX, it wraps back to zero.
+ *   c) This ring_idx can be used by applications to track per-operation
+ *      metadata in an application-defined circular ring.
+ *   d) The initial ring_idx of a virtual DMA channel is zero, after the device
+ *      is stopped, the ring_idx needs to be reset to zero.
+ *   Example:
+ *      step-1: start one dmadev
+ *      step-2: enqueue a copy operation, the ring_idx return is 0
+ *      step-3: enqueue a copy operation again, the ring_idx return is 1
+ *      ...
+ *      step-101: stop the dmadev
+ *      step-102: start the dmadev
+ *      step-103: enqueue a copy operation, the ring_idx return is 0
+ *      ...
+ *      step-x+0: enqueue a fill operation, the ring_idx return is 65535
+ *      step-x+1: enqueue a copy operation, the ring_idx return is 0
+ *      ...
+ *
+ * By default, all the functions of the dmadev API exported by a PMD are
+ * lock-free functions which are assumed not to be invoked in parallel on
+ * different logical cores to work on the same target object.
+ *
+ */
+
+#include <rte_common.h>
+#include <rte_compat.h>
+#ifdef RTE_DMADEV_DEBUG
+#include <rte_dev.h>
+#endif
+#include <rte_errno.h>
+#include <rte_memory.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_DMADEV_NAME_MAX_LEN	RTE_DEV_NAME_MAX_LEN
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param dev_id
+ *   DMA device index.
+ *
+ * @return
+ *   - If the device index is valid (true) or not (false).
+ */
+__rte_experimental
+bool
+rte_dmadev_is_valid_dev(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the total number of DMA devices that have been successfully
+ * initialised.
+ *
+ * @return
+ *   The total number of usable DMA devices.
+ */
+__rte_experimental
+uint16_t
+rte_dmadev_count(void);
+
+/**
+ * The capabilities of a DMA device
+ */
+#define RTE_DMA_DEV_CAPA_MEM_TO_MEM	(1ull << 0)
+/**< DMA device support memory-to-memory transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_MEM_TO_DEV	(1ull << 1)
+/**< DMA device support memory-to-device transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_DEV_TO_MEM	(1ull << 2)
+/**< DMA device support device-to-memory transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_DEV_TO_DEV	(1ull << 3)
+/**< DMA device support device-to-device transfer.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_OPS_COPY	(1ull << 4)
+/**< DMA device support copy ops.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_OPS_FILL	(1ull << 5)
+/**< DMA device support fill ops.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_OPS_SG		(1ull << 6)
+/**< DMA device support scatter-list ops.
+ * If device support ops_copy and ops_sg, it means supporting copy_sg ops.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_FENCE		(1ull << 7)
+/**< DMA device supports fence.
+ * If the device supports fence, the application can set a fence flag when
+ * enqueuing an operation by rte_dmadev_copy/copy_sg/fill/fill_sg.
+ * If an operation has a fence flag, it means the operation must be processed
+ * only after all previous operations are completed.
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+#define RTE_DMA_DEV_CAPA_SVA		(1ull << 8)
+/**< DMA device supports SVA, which allows using a VA as the DMA address.
+ * If the device supports SVA, the application can pass any VA, e.g. memory
+ * from rte_malloc(), rte_memzone(), malloc, stack memory.
+ * If the device doesn't support SVA, the application should pass an IOVA
+ * which comes from rte_malloc(), rte_memzone().
+ *
+ * @see struct rte_dmadev_info::dev_capa
+ */
+
+/**
+ * A structure used to retrieve the contextual information of
+ * a DMA device
+ */
+struct rte_dmadev_info {
+	struct rte_device *device; /**< Generic Device information */
+	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_*) */
+	/** Maximum number of virtual DMA channels supported */
+	uint16_t max_vchans;
+	/** Maximum allowed number of virtual DMA channel descriptors */
+	uint16_t max_desc;
+	/** Minimum allowed number of virtual DMA channel descriptors */
+	uint16_t min_desc;
+	uint16_t nb_vchans; /**< Number of virtual DMA channel configured */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve the contextual information of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param[out] dev_info
+ *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
+ *   contextual information of the device.
+ *
+ * @return
+ *   - =0: Success, driver updates the contextual information of the DMA device
+ *   - <0: Error code returned by the driver info get function.
+ *
+ */
+__rte_experimental
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
+
+/**
+ * A structure used to configure a DMA device.
+ */
+struct rte_dmadev_conf {
+	/** Maximum number of virtual DMA channels to use.
+	 * This value cannot be greater than the field 'max_vchans' of struct
+	 * rte_dmadev_info, which is obtained from rte_dmadev_info_get().
+	 */
+	uint16_t max_vchans;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Configure a DMA device.
+ *
+ * This function must be invoked first before any other function in the
+ * API. This function can also be re-invoked when a device is in the
+ * stopped state.
+ *
+ * @param dev_id
+ *   The identifier of the device to configure.
+ * @param dev_conf
+ *   The DMA device configuration structure encapsulated into rte_dmadev_conf
+ *   object.
+ *
+ * @return
+ *   - =0: Success, device configured.
+ *   - <0: Error code returned by the driver configuration function.
+ */
+__rte_experimental
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Start a DMA device.
+ *
+ * The device start step is the last one and consists of setting the DMA
+ * to start accepting jobs.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device started.
+ *   - <0: Error code returned by the driver start function.
+ */
+__rte_experimental
+int
+rte_dmadev_start(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Stop a DMA device.
+ *
+ * The device can be restarted with a call to rte_dmadev_start()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device stopped.
+ *   - <0: Error code returned by the driver stop function.
+ */
+__rte_experimental
+int
+rte_dmadev_stop(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Close a DMA device.
+ *
+ * The device cannot be restarted after this call.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *  - =0: Successfully close device
+ *  - <0: Failure to close device
+ */
+__rte_experimental
+int
+rte_dmadev_close(uint16_t dev_id);
+
+/**
+ * DMA transfer direction defines.
+ */
+#define RTE_DMA_MEM_TO_MEM	(1ull << 0)
+/**< DMA transfer direction - from memory to memory.
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_MEM_TO_DEV	(1ull << 1)
+/**< DMA transfer direction - from memory to device.
+ * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs
+ * through the PCIE interface. In this case, the ARM SoCs works in EP(endpoint)
+ * mode, it could initiate a DMA move request from memory (which is ARM memory)
+ * to device (which is x86 host memory).
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_DEV_TO_MEM	(1ull << 2)
+/**< DMA transfer direction - from device to memory.
+ * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs
+ * through the PCIE interface. In this case, the ARM SoCs works in EP(endpoint)
+ * mode, it could initiate a DMA move request from device (which is x86 host
+ * memory) to memory (which is ARM memory).
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_DEV_TO_DEV	(1ull << 3)
+/**< DMA transfer direction - from device to device.
+ * In a typical scenario, ARM SoCs are installed on x86 servers as iNICs
+ * through the PCIE interface. In this case, the ARM SoCs works in EP(endpoint)
+ * mode, it could initiate a DMA move request from device (which is x86 host
+ * memory) to device (which is another x86 host memory).
+ *
+ * @see struct rte_dmadev_vchan_conf::direction
+ */
+#define RTE_DMA_TRANSFER_DIR_ALL	(RTE_DMA_MEM_TO_MEM | \
+					 RTE_DMA_MEM_TO_DEV | \
+					 RTE_DMA_DEV_TO_MEM | \
+					 RTE_DMA_DEV_TO_DEV)
+
+/**
+ * enum rte_dmadev_port_type - DMA port type defines
+ * @see struct rte_dmadev_port_parameters::port_type
+ */
+enum rte_dmadev_port_type {
+	/** The device port type is PCIE. */
+	RTE_DMADEV_PORT_OF_PCIE = 1,
+};
+
+/**
+ * A structure used to describe DMA port parameters.
+ */
+struct rte_dmadev_port_parameters {
+	enum rte_dmadev_port_type port_type;
+	union {
+		/** For PCIE port
+		 *
+		 * The following model shows a SoC's PCIE module connected to
+		 * multiple PCIE hosts and multiple endpoints. The PCIE module
+		 * has an integrated DMA controller.
+		 * If the DMA wants to access the memory of host A, it can be
+		 * initiated by PF1 in core0, or by VF0 of PF0 in core0.
+		 *
+		 * System Bus
+		 *    |     ----------PCIE module----------
+		 *    |     Bus
+		 *    |     Interface
+		 *    |     -----        ------------------
+		 *    |     |   |        | PCIE Core0     |
+		 *    |     |   |        |                |        -----------
+		 *    |     |   |        |   PF-0 -- VF-0 |        | Host A  |
+		 *    |     |   |--------|        |- VF-1 |--------| Root    |
+		 *    |     |   |        |   PF-1         |        | Complex |
+		 *    |     |   |        |   PF-2         |        -----------
+		 *    |     |   |        ------------------
+		 *    |     |   |
+		 *    |     |   |        ------------------
+		 *    |     |   |        | PCIE Core1     |
+		 *    |     |   |        |                |        -----------
+		 *    |     |   |        |   PF-0 -- VF-0 |        | Host B  |
+		 *    |-----|   |--------|   PF-1 -- VF-0 |--------| Root    |
+		 *    |     |   |        |        |- VF-1 |        | Complex |
+		 *    |     |   |        |   PF-2         |        -----------
+		 *    |     |   |        ------------------
+		 *    |     |   |
+		 *    |     |   |        ------------------
+		 *    |     |DMA|        |                |        ------
+		 *    |     |   |        |                |--------| EP |
+		 *    |     |   |--------| PCIE Core2     |        ------
+		 *    |     |   |        |                |        ------
+		 *    |     |   |        |                |--------| EP |
+		 *    |     |   |        |                |        ------
+		 *    |     -----        ------------------
+		 *
+		 * The following structure is used to describe the above access
+		 * port.
+		 */
+		struct {
+			uint64_t coreid : 3; /**< PCIE core id used */
+			uint64_t pfid : 6; /**< PF id used */
+			uint64_t vfen : 1; /**< VF enable bit */
+			uint64_t vfid : 8; /**< VF id used */
+			/** The pasid field in TLP packet */
+			uint64_t pasid : 20;
+			/** The attributes field in TLP packet */
+			uint64_t attr : 3;
+			/** The processing hint field in TLP packet */
+			uint64_t ph : 2;
+			/** The steering tag field in TLP packet */
+			uint64_t st : 16;
+		} pcie;
+	};
+	uint64_t reserved[2]; /**< Reserved for future fields */
+};
+
+/**
+ * A structure used to configure a virtual DMA channel.
+ */
+struct rte_dmadev_vchan_conf {
+	uint8_t direction;
+	/**< Set of supported transfer directions
+	 * @see RTE_DMA_MEM_TO_MEM
+	 * @see RTE_DMA_MEM_TO_DEV
+	 * @see RTE_DMA_DEV_TO_MEM
+	 * @see RTE_DMA_DEV_TO_DEV
+	 */
+	/** Number of descriptors for the virtual DMA channel */
+	uint16_t nb_desc;
+	/** 1) Used to describe the port parameter in the device-to-memory
+	 * transfer scenario.
+	 * 2) Used to describe the source port parameter in the
+	 * device-to-device transfer scenario.
+	 * @see struct rte_dmadev_port_parameters
+	 */
+	struct rte_dmadev_port_parameters src_port;
+	/** 1) Used to describe the port parameter in the memory-to-device
+	 * transfer scenario.
+	 * 2) Used to describe the destination port parameter in the
+	 * device-to-device transfer scenario.
+	 * @see struct rte_dmadev_port_parameters
+	 */
+	struct rte_dmadev_port_parameters dst_port;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Allocate and set up a virtual DMA channel.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param conf
+ *   The virtual DMA channel configuration structure encapsulated into
+ *   rte_dmadev_vchan_conf object.
+ *
+ * @return
+ *   - >=0: Allocate success, it is the virtual DMA channel id. This value must
+ *          be less than the field 'max_vchans' of struct rte_dmadev_conf
+ *          which configured by rte_dmadev_configure().
+ *   - <0: Error code returned by the driver virtual channel setup function.
+ */
+__rte_experimental
+int
+rte_dmadev_vchan_setup(uint16_t dev_id,
+		       const struct rte_dmadev_vchan_conf *conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a virtual DMA channel.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel which return by vchan setup.
+ *
+ * @return
+ *   - =0: Successfully release the virtual DMA channel.
+ *   - <0: Error code returned by the driver virtual channel release function.
+ */
+__rte_experimental
+int
+rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);
+
+/**
+ * rte_dmadev_stats - running statistics.
+ */
+struct rte_dmadev_stats {
+	/** Count of operations which were successfully enqueued */
+	uint64_t enqueued_count;
+	/** Count of operations which were submitted to hardware */
+	uint64_t submitted_count;
+	/** Count of operations which failed to complete */
+	uint64_t completed_fail_count;
+	/** Count of operations which successfully complete */
+	uint64_t completed_count;
+};
+
+#define RTE_DMADEV_ALL_VCHAN	0xFFFFu
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve basic statistics of one or all virtual DMA channel(s).
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ *   If equal RTE_DMADEV_ALL_VCHAN means all channels.
+ * @param[out] stats
+ *   The basic statistics structure encapsulated into rte_dmadev_stats
+ *   object.
+ *
+ * @return
+ *   - =0: Successfully retrieve stats.
+ *   - <0: Failure to retrieve stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_get(uint16_t dev_id, uint16_t vchan,
+		     struct rte_dmadev_stats *stats);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset basic statistics of one or all virtual DMA channel(s).
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ *   If equal RTE_DMADEV_ALL_VCHAN means all channels.
+ *
+ * @return
+ *   - =0: Successfully reset stats.
+ *   - <0: Failure to reset stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_reset(uint16_t dev_id, uint16_t vchan);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Dump DMA device info.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param f
+ *   The file to write the output to.
+ *
+ * @return
+ *   0 on success. Non-zero otherwise.
+ */
+__rte_experimental
+int
+rte_dmadev_dump(uint16_t dev_id, FILE *f);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger the dmadev self test.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - 0: Selftest successful.
+ *   - -ENOTSUP if the device doesn't support selftest
+ *   - other values < 0 on failure.
+ */
+__rte_experimental
+int
+rte_dmadev_selftest(uint16_t dev_id);
+
+/**
+ * rte_dma_sge - holds one entry of a scatter DMA operation request
+ */
+struct rte_dma_sge {
+	rte_iova_t addr; /**< Address of this entry */
+	uint32_t length; /**< Length of this entry */
+};
+
+/**
+ * rte_dma_sg - holds a scatter DMA operation request
+ */
+struct rte_dma_sg {
+	struct rte_dma_sge *src; /**< Source entries, counted by nb_src */
+	struct rte_dma_sge *dst; /**< Destination entries, counted by nb_dst */
+	uint16_t nb_src; /**< The number of src entries */
+	uint16_t nb_dst; /**< The number of dst entries */
+};
+
+#include "rte_dmadev_core.h"
+
+/**
+ *  DMA flags to augment operation preparation.
+ *  Used as the 'flags' parameter of rte_dmadev_copy/fill.
+ */
+#define RTE_DMA_OP_FLAG_FENCE	(1ull << 0)
+/**< DMA fence flag
+ * It means the operation with this flag must be processed only after all
+ * previous operations are completed.
+ *
+ * @see rte_dmadev_copy()
+ * @see rte_dmadev_copy_sg()
+ * @see rte_dmadev_fill()
+ */
+#define RTE_DMA_OP_FLAG_SUBMIT	(1ull << 1)
+/**< DMA submit flag
+ * It means the operation with this flag must issue doorbell to hardware after
+ * enqueued jobs.
+ */
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a copy operation onto the virtual DMA channel.
+ *
+ * This queues up a copy operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param src
+ *   The address of the source buffer.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the data to be copied.
+ * @param flags
+ *   Flags for this operation.
+ *   @see RTE_DMA_OP_FLAG_*
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued copy job.
+ *   - <0: Error code returned by the driver copy function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_copy(uint16_t dev_id, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks are only built when this is set */
+	if (!rte_dmadev_is_valid_dev(dev_id) ||
+	    vchan >= dev->data->dev_conf.max_vchans)
+		return -EINVAL;
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->copy, -ENOTSUP);
+#endif
+	return (*dev->copy)(dev, vchan, src, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list copy operation onto the virtual DMA channel.
+ *
+ * This queues up a scatter list copy operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param sg
+ *   The pointer to the scatterlist.
+ * @param flags
+ *   Flags for this operation.
+ *   @see RTE_DMA_OP_FLAG_*
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued copy scatterlist job.
+ *   - <0: Error code returned by the driver copy scatterlist function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vchan, const struct rte_dma_sg *sg,
+		   uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks are only built when this is set */
+	if (!rte_dmadev_is_valid_dev(dev_id) ||
+	    vchan >= dev->data->dev_conf.max_vchans ||
+	    sg == NULL)
+		return -EINVAL;
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->copy_sg, -ENOTSUP);
+#endif
+	return (*dev->copy_sg)(dev, vchan, sg, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a fill operation onto the virtual DMA channel.
+ *
+ * This queues up a fill operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the destination buffer.
+ * @param flags
+ *   Flags for this operation.
+ *   @see RTE_DMA_OP_FLAG_*
+ *
+ * @return
+ *   - 0..UINT16_MAX: index of enqueued fill job.
+ *   - <0: Error code returned by the driver fill function.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_fill(uint16_t dev_id, uint16_t vchan, uint64_t pattern,
+		rte_iova_t dst, uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks are only built when this is set */
+	if (!rte_dmadev_is_valid_dev(dev_id) ||
+	    vchan >= dev->data->dev_conf.max_vchans)
+		return -EINVAL;
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->fill, -ENOTSUP);
+#endif
+	return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger hardware to begin performing enqueued operations.
+ *
+ * This API is used to write the "doorbell" to the hardware to trigger it
+ * to begin the operations previously enqueued by rte_dmadev_copy/copy_sg/fill()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ *
+ * @return
+ *   - =0: Successfully trigger hardware.
+ *   - <0: Failure to trigger hardware.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_submit(uint16_t dev_id, uint16_t vchan)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG /* argument checks are only built when this is set */
+	if (!rte_dmadev_is_valid_dev(dev_id) ||
+	    vchan >= dev->data->dev_conf.max_vchans)
+		return -EINVAL;
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->submit, -ENOTSUP);
+#endif
+	return (*dev->submit)(dev, vchan);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that have been successfully completed.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param nb_cpls
+ *   The maximum number of completed operations that can be processed.
+ * @param[out] last_idx
+ *   The last completed operation's index.
+ *   If not required, NULL can be passed in.
+ * @param[out] has_error
+ *   Indicates whether a transfer error occurred.
+ *   If not required, NULL can be passed in.
+ *
+ * @return
+ *   Number of successfully completed operations (0 if a debug check fails).
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed(uint16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
+		     uint16_t *last_idx, bool *has_error)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	uint16_t idx;
+	bool err;
+
+#ifdef RTE_DMADEV_DEBUG
+	if (!rte_dmadev_is_valid_dev(dev_id) ||
+	    vchan >= dev->data->dev_conf.max_vchans ||
+	    nb_cpls == 0)
+		return 0; /* unsigned count: a negative errno would wrap */
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->completed, 0);
+#endif
+
+	/* Ensure the pointer values are non-null to simplify drivers.
+	 * In most cases these should be compile time evaluated, since this is
+	 * an inline function.
+	 * - If NULL is explicitly passed as parameter, then compiler knows the
+	 *   value is NULL
+	 * - If address of local variable is passed as parameter, then compiler
+	 *   can know it's non-NULL.
+	 */
+	if (last_idx == NULL)
+		last_idx = &idx;
+	if (has_error == NULL)
+		has_error = &err;
+
+	*has_error = false;
+	return (*dev->completed)(dev, vchan, nb_cpls, last_idx, has_error);
+}
+
+/**
+ * DMA transfer status code defines
+ */
+enum rte_dma_status_code {
+	/** The operation completed successfully */
+	RTE_DMA_STATUS_SUCCESSFUL = 0,
+	/** The operation failed to complete due to active drop
+	 * This is mainly used when processing dev_stop, allow outstanding
+	 * requests to be completed as much as possible.
+	 */
+	RTE_DMA_STATUS_ACTIVE_DROP,
+	/** The operation failed to complete due to invalid source address */
+	RTE_DMA_STATUS_INVALID_SRC_ADDR,
+	/** The operation failed to complete due to invalid destination address */
+	RTE_DMA_STATUS_INVALID_DST_ADDR,
+	/** The operation failed to complete due to invalid length */
+	RTE_DMA_STATUS_INVALID_LENGTH,
+	/** The operation failed to complete due to invalid opcode
+	 * The DMA descriptor could have multiple formats, which are
+	 * distinguished by the opcode field.
+	 */
+	RTE_DMA_STATUS_INVALID_OPCODE,
+	/** The operation failed to complete due to bus error */
+	RTE_DMA_STATUS_BUS_ERROR,
+	/** The operation failed to complete due to data poison */
+	RTE_DMA_STATUS_DATA_POISION,
+	/** The operation failed to complete due to descriptor read error */
+	RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR,
+	/** The operation failed to complete due to device link error
+	 * Used to indicate a link error in the mem-to-dev/dev-to-mem/
+	 * dev-to-dev transfer scenario.
+	 */
+	RTE_DMA_STATUS_DEV_LINK_ERROR,
+	/** The operation failed to complete due to unknown reason */
+	RTE_DMA_STATUS_UNKNOWN,
+	/** Driver specific status code offset
+	 * Start status code for the driver to define its own error code.
+	 */
+	RTE_DMA_STATUS_DRV_SPECIFIC_OFFSET = 0x10000,
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that failed to complete.
+ * NOTE: Use this API when rte_dmadev_completed() has set has_error.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vchan
+ *   The identifier of virtual DMA channel.
+ * @param nb_status
+ *   Indicates the size of status array.
+ * @param[out] status
+ *   The error code of operations that failed to complete.
+ *   Some standard error codes are described in 'enum rte_dma_status_code'
+ *   @see rte_dma_status_code
+ * @param[out] last_idx
+ *   The last failed completed operation's index.
+ *
+ * @return
+ *   Number of operations that failed to complete (0 if a debug check fails).
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
+			   const uint16_t nb_status, uint32_t *status,
+			   uint16_t *last_idx)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+#ifdef RTE_DMADEV_DEBUG
+	if (!rte_dmadev_is_valid_dev(dev_id) ||
+	    vchan >= dev->data->dev_conf.max_vchans ||
+	    nb_status == 0 ||
+	    status == NULL ||
+	    last_idx == NULL)
+		return 0; /* unsigned count: a negative errno would wrap */
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->completed_fails, 0);
+#endif
+	return (*dev->completed_fails)(dev, vchan, nb_status, status, last_idx);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_H_ */
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
new file mode 100644
index 0000000..b0b6494
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#ifndef _RTE_DMADEV_CORE_H_
+#define _RTE_DMADEV_CORE_H_
+
+/**
+ * @file
+ *
+ * RTE DMA Device internal header.
+ *
+ * This header contains internal data types, that are used by the DMA devices
+ * in order to expose their ops to the class.
+ *
+ * Applications should not use these API directly.
+ *
+ */
+
+struct rte_dmadev;
+
+/** @internal Used to get device information of a device. */
+typedef int (*dmadev_info_get_t)(const struct rte_dmadev *dev,
+				 struct rte_dmadev_info *dev_info,
+				 uint32_t info_sz);
+
+/** @internal Used to configure a device. */
+typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
+				  const struct rte_dmadev_conf *dev_conf);
+
+/** @internal Used to start a configured device. */
+typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
+
+/** @internal Used to stop a configured device. */
+typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
+
+/** @internal Used to close a configured device. */
+typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
+
+/** @internal Used to allocate and set up a virtual DMA channel. */
+typedef int (*dmadev_vchan_setup_t)(struct rte_dmadev *dev,
+				    const struct rte_dmadev_vchan_conf *conf);
+
+/** @internal Used to release a virtual DMA channel. */
+typedef int (*dmadev_vchan_release_t)(struct rte_dmadev *dev, uint16_t vchan);
+
+/** @internal Used to retrieve basic statistics. */
+typedef int (*dmadev_stats_get_t)(const struct rte_dmadev *dev, uint16_t vchan,
+				  struct rte_dmadev_stats *stats,
+				  uint32_t stats_sz);
+
+/** @internal Used to reset basic statistics. */
+typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, uint16_t vchan);
+
+/** @internal Used to dump internal information. */
+typedef int (*dmadev_dump_t)(const struct rte_dmadev *dev, FILE *f);
+
+/** @internal Used to start dmadev selftest. */
+typedef int (*dmadev_selftest_t)(uint16_t dev_id);
+
+/** @internal Used to enqueue a copy operation. */
+typedef int (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vchan,
+			     rte_iova_t src, rte_iova_t dst,
+			     uint32_t length, uint64_t flags);
+
+/** @internal Used to enqueue a scatter list copy operation. */
+typedef int (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vchan,
+				const struct rte_dma_sg *sg, uint64_t flags);
+
+/** @internal Used to enqueue a fill operation. */
+typedef int (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vchan,
+			     uint64_t pattern, rte_iova_t dst,
+			     uint32_t length, uint64_t flags);
+
+/** @internal Used to trigger hardware to begin working. */
+typedef int (*dmadev_submit_t)(struct rte_dmadev *dev, uint16_t vchan);
+
+/** @internal Used to return number of successfully completed operations. */
+typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vchan,
+				       const uint16_t nb_cpls,
+				       uint16_t *last_idx, bool *has_error);
+
+/** @internal Used to return number of operations that failed to complete. */
+typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
+			uint16_t vchan, const uint16_t nb_status,
+			uint32_t *status, uint16_t *last_idx);
+
+/**
+ * Possible states of a DMA device.
+ */
+enum rte_dmadev_state {
+	/** Device is unused before being probed. */
+	RTE_DMADEV_UNUSED = 0,
+	/** Device is attached when allocated in probing. */
+	RTE_DMADEV_ATTACHED,
+};
+
+/**
+ * DMA device operations function pointer table
+ */
+struct rte_dmadev_ops {
+	dmadev_info_get_t dev_info_get; /**< Get device information. */
+	dmadev_configure_t dev_configure; /**< Configure a device. */
+	dmadev_start_t dev_start; /**< Start a configured device. */
+	dmadev_stop_t dev_stop; /**< Stop a configured device. */
+	dmadev_close_t dev_close; /**< Close a configured device. */
+	dmadev_vchan_setup_t vchan_setup; /**< Set up a virtual DMA channel. */
+	dmadev_vchan_release_t vchan_release; /**< Release a virtual channel. */
+	dmadev_stats_get_t stats_get; /**< Retrieve basic statistics. */
+	dmadev_stats_reset_t stats_reset; /**< Reset basic statistics. */
+	dmadev_dump_t dev_dump; /**< Dump internal information. */
+	dmadev_selftest_t dev_selftest; /**< Start dmadev selftest. */
+};
+
+/**
+ * @internal
+ * The data part, with no function pointers, associated with each DMA device.
+ *
+ * This structure is safe to place in shared memory to be common among different
+ * processes in a multi-process configuration.
+ */
+struct rte_dmadev_data {
+	void *dev_private; /**< PMD-specific private data. */
+	uint16_t dev_id; /**< Device [external] identifier. */
+	char dev_name[RTE_DMADEV_NAME_MAX_LEN]; /**< Unique identifier name */
+	struct rte_dmadev_conf dev_conf; /**< DMA device configuration. */
+	uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
+	uint64_t reserved[2]; /**< Reserved for future fields */
+} __rte_cache_aligned;
+
+/**
+ * @internal
+ * The generic data structure associated with each DMA device.
+ *
+ * The dataplane APIs are located at the beginning of the structure, along
+ * with the pointer to where all the data elements for the particular device
+ * are stored in shared memory. This split scheme allows the function pointer
+ * and driver data to be per-process, while the actual configuration data for
+ * the device is shared.
+ */
+struct rte_dmadev {
+	dmadev_copy_t copy; /**< Enqueue a copy operation. */
+	dmadev_copy_sg_t copy_sg; /**< Enqueue a scatter list copy operation. */
+	dmadev_fill_t fill; /**< Enqueue a fill operation. */
+	dmadev_submit_t submit; /**< Trigger hardware to begin working. */
+	dmadev_completed_t completed; /**< Number of successful completions. */
+	dmadev_completed_fails_t completed_fails; /**< Number of failed ops. */
+	void *reserved_ptr; /**< Reserved for future IO function */
+	struct rte_dmadev_data *data; /**< Pointer to device data. */
+
+	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD. */
+	/** Device info which is supplied during device initialization. */
+	struct rte_device *device;
+	enum rte_dmadev_state state; /**< Flag indicating the device state */
+	uint64_t reserved[2]; /**< Reserved for future fields */
+} __rte_cache_aligned;
+
+extern struct rte_dmadev rte_dmadevices[];
+
+#endif /* _RTE_DMADEV_CORE_H_ */
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
new file mode 100644
index 0000000..45141f9
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_PMD_H_
+#define _RTE_DMADEV_PMD_H_
+
+/**
+ * @file
+ *
+ * RTE DMA Device PMD APIs
+ *
+ * Driver facing APIs for a DMA device. These are not to be called directly by
+ * any application.
+ */
+
+#include "rte_dmadev.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @internal
+ * Allocates a new dmadev slot for an DMA device and returns the pointer
+ * to that slot for the driver to use.
+ *
+ * @param name
+ *   DMA device name.
+ *
+ * @return
+ *   A pointer to the DMA device slot in case of success,
+ *   NULL otherwise.
+ */
+__rte_internal
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name);
+
+/**
+ * @internal
+ * Release the specified dmadev.
+ *
+ * @param dev
+ *   Device to be released.
+ *
+ * @return
+ *   - 0 on success, negative on error
+ */
+__rte_internal
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev);
+
+/**
+ * @internal
+ * Return the DMA device based on the device name.
+ *
+ * @param name
+ *   DMA device name.
+ *
+ * @return
+ *   A pointer to the DMA device slot in case of success,
+ *   NULL otherwise.
+ */
+__rte_internal
+struct rte_dmadev *
+rte_dmadev_get_device_by_name(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_PMD_H_ */
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
new file mode 100644
index 0000000..2af78e4
--- /dev/null
+++ b/lib/dmadev/version.map
@@ -0,0 +1,37 @@
+EXPERIMENTAL {
+	global:
+
+	rte_dmadev_close;
+	rte_dmadev_completed;
+	rte_dmadev_completed_fails;
+	rte_dmadev_configure;
+	rte_dmadev_copy;
+	rte_dmadev_copy_sg;
+	rte_dmadev_count;
+	rte_dmadev_dump;
+	rte_dmadev_fill;
+	rte_dmadev_info_get;
+	rte_dmadev_is_valid_dev;
+	rte_dmadev_selftest;
+	rte_dmadev_start;
+	rte_dmadev_stats_get;
+	rte_dmadev_stats_reset;
+	rte_dmadev_stop;
+	rte_dmadev_submit;
+	rte_dmadev_vchan_release;
+	rte_dmadev_vchan_setup;
+
+	local: *;
+};
+
+INTERNAL {
+        global:
+
+	rte_dmadevices;
+	rte_dmadev_get_device_by_name;
+	rte_dmadev_pmd_allocate;
+	rte_dmadev_pmd_release;
+
+	local: *;
+};
+
diff --git a/lib/meson.build b/lib/meson.build
index 1673ca4..68d239f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -60,6 +60,7 @@ libraries = [
         'bpf',
         'graph',
         'node',
+        'dmadev',
 ]
 
 if is_windows
-- 
2.8.1


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v3] dmadev: introduce DMA device library
  2021-07-13 12:27 ` [dpdk-dev] [PATCH v3] " Chengwen Feng
@ 2021-07-13 13:06   ` fengchengwen
  2021-07-13 13:37     ` Bruce Richardson
  2021-07-13 16:02   ` Bruce Richardson
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 339+ messages in thread
From: fengchengwen @ 2021-07-13 13:06 UTC (permalink / raw)
  To: thomas, ferruh.yigit, bruce.richardson, jerinj, jerinjacobk,
	andrew.rybchenko
  Cc: dev, mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev

Thank you for your valuable comments, and I think we've taken a big step forward.

@andrew Could you provide the copyright line so that I can add it to relevant file.

@bruce, jerin  Some review comments that were not adopted are answered here:

1.
COMMENT: We allow up to 100 characters per line for DPDK code, so these don't need
to be wrapped so aggressively.

REPLY: Our CI still has an 80-character limit, and from my review most frameworks still comply with it.

2.
COMMENT: > +#define RTE_DMA_MEM_TO_MEM     (1ull << 0)
RTE_DMA_DIRECTION_...

REPLY: adding 'DIRECTION' may make the macro too long; I prefer to keep it simple.

3.
COMMENT: > +rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);
We are not making release a public API in other device classes. See ethdev spec.
bbdev/eventdev/rawdev

REPLY: ethdev's queues are hardware queues, whereas these are software-defined channels,
so I think release is OK. BTW: bbdev/eventdev also have release ops.

4.
COMMENT:> +       uint64_t reserved[4]; /**< Reserved for future fields */
> +};
Please add the capability for each counter in info structure as one
device may support all
the counters.

REPLY: This is a statistics function. If it is not supported, then the driver does not need
to implement the stats ops function. It could also set the unimplemented counters to zero.

5.
COMMENT: > +#endif
> +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
Instead of every driver set the NOP function, In the common code, If
the CAPA is not set,
common code can set NOP function for this with <0 return value.

REPLY: I don't think it's a good idea to do this check in the I/O path; it's the application's duty to ensure
it doesn't call an API which the driver does not support (which it can learn from the capabilities).

6.
COMMENT: > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
> +                          const uint16_t nb_status, uint32_t *status,
uint32_t -> enum rte_dma_status_code

REPLY: I'm still evaluating this. It takes a long time for the driver to perform error code
conversion in this API. Do we need to provide a separate error code conversion function?

7.
COMMENT: > +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +                                struct rte_dmadev_info *dev_info);
Please change to rte_dmadev_info_get_t to avoid conflict due to namespace issue
as this header is exported.

REPLY: I prefer not to add the 'rte_' prefix; it makes the definition too long.

8.
COMMENT: > + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
Please rename this to "completed_status" to allow the return of information
other than just errors. As I suggested before, I think this should also be
usable as a slower version of "completed" even in the case where there are
no errors, in that it returns status information for each and every job
rather than just returning as soon as it hits a failure.

REPLY: well, I think it may be confusing (the current OK/FAIL API is easy to understand),
and we can build the slow-path function on top of the two APIs.

9.
COMMENT: > +#define RTE_DMA_DEV_CAPA_MEM_TO_MEM	(1ull << 0)
> +/**< DMA device support mem-to-mem transfer.
Do we need this? Can we assume that any device appearing as a dmadev can
do mem-to-mem copies, and drop the capability for mem-to-mem and the
capability for copying?
also for RTE_DMA_DEV_CAPA_OPS_COPY

REPLY: yes, I insist on adding this for the sake of conceptual integrity.
For the ioat driver, it just needs to declare the capability.

10.
COMMENT: > +	uint16_t nb_vchans; /**< Number of virtual DMA channel configured */
> +};
Let's add rte_dmadev_conf struct into this to return the configuration
settings.

REPLY: If we add rte_dmadev_conf to it, it may break the ABI when fields are added to rte_dmadev_conf.


[snip]

On 2021/7/13 20:27, Chengwen Feng wrote:
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
> v3:
> * rm reset and fill_sg ops.
> * rm MT-safe capabilities.
> * add submit flag.
> * redefine rte_dma_sg to implement asymmetric copy.
> * delete some reserved field for future use.
> * rearrange rte_dmadev/rte_dmadev_data struct.
> * refresh rte_dmadev.h copyright.
> * update vchan setup parameter.
> * modified some inappropriate descriptions.
> * arrange version.map alphabetically.
> * other minor modifications from review comment.
> ---
>  MAINTAINERS                  |   4 +
>  config/rte_config.h          |   3 +
>  lib/dmadev/meson.build       |   7 +
>  lib/dmadev/rte_dmadev.c      | 561 +++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 968 +++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h | 161 +++++++
>  lib/dmadev/rte_dmadev_pmd.h  |  72 ++++
>  lib/dmadev/version.map       |  37 ++
>  lib/meson.build              |   1 +


^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v3] dmadev: introduce DMA device library
  2021-07-13 13:06   ` fengchengwen
@ 2021-07-13 13:37     ` Bruce Richardson
  2021-07-15  6:44       ` Jerin Jacob
  0 siblings, 1 reply; 339+ messages in thread
From: Bruce Richardson @ 2021-07-13 13:37 UTC (permalink / raw)
  To: fengchengwen
  Cc: thomas, ferruh.yigit, jerinj, jerinjacobk, andrew.rybchenko, dev,
	mb, nipun.gupta, hemant.agrawal, maxime.coquelin,
	honnappa.nagarahalli, david.marchand, sburla, pkapoor,
	konstantin.ananyev

On Tue, Jul 13, 2021 at 09:06:39PM +0800, fengchengwen wrote:
> Thank you for your valuable comments, and I think we've taken a big step forward.
> 
> @andrew Could you provide the copyright line so that I can add it to relevant file.
> 
> @bruce, jerin  Some unmodified review comments are returned here:

Thanks. Some further comments inline below. Most points you make I'm ok
with, but I do disagree on a number of others.

/Bruce

> 
> 1.
> COMMENT: We allow up to 100 characters per line for DPDK code, so these don't need
> to be wrapped so aggressively.
> 
> REPLY: Our CI still has 80 characters limit, and I review most framework still comply.
> 
Ok.

> 2.
> COMMENT: > +#define RTE_DMA_MEM_TO_MEM     (1ull << 0)
> RTE_DMA_DIRECTION_...
> 
> REPLY: add the 'DIRECTION' may the macro too long, I prefer keep it simple.
> 
DIRECTION could be shortened to DIR, but I think this is probably ok as is
too.

> 3.
> COMMENT: > +rte_dmadev_vchan_release(uint16_t dev_id, uint16_t vchan);
> We are not making release as public API in other device class. See ethdev spec.
> bbdev/eventdev/rawdev
> 
> REPLY: because ethdev's queue is hard-queue, and here is the software defined channels,
> I think release is OK, BTW: bbdev/eventdev also have release ops.
> 
Ok

> 4.  COMMENT:> +       uint64_t reserved[4]; /**< Reserved for future
> fields */
> > +};
> Please add the capability for each counter in info structure as one
> device may support all the counters.
> 
> REPLY: This is a statistics function. If this function is not supported,
> then do not need to implement the stats ops function. Also could to set
> the unimplemented ones to zero.
> 
+1
The stats functions should be a minimum set that is supported by all
drivers. Each of these stats can be easily tracked by software if HW
support for it is not available, so I agree that we should not have each
stat as a capability.

> 5.
> COMMENT: > +#endif
> > +       return (*dev->fill)(dev, vchan, pattern, dst, length, flags);
> Instead of every driver set the NOP function, In the common code, If
> the CAPA is not set,
> common code can set NOP function for this with <0 return value.
> 
> REPLY: I don't think it's a good idea to judge in IO path, it's application duty to ensure
> don't call API which driver not supported (which could get from capabilities).
> 
For datapath functions, +1.

> 6.
> COMMENT: > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vchan,
> > +                          const uint16_t nb_status, uint32_t *status,
> uint32_t -> enum rte_dma_status_code
> 
> REPLY:I'm still evaluating this. It takes a long time for the driver to perform error code
> conversion in this API. Do we need to provide an error code conversion function alone ?
> 
It's not that difficult a conversion to do, and so long as we have the
regular "completed" function which doesn't do all the error manipulation we
should be fine. Performance in the case of errors is not expected to be as
good, since errors should be very rare.

> 7.
> COMMENT: > +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> > +                                struct rte_dmadev_info *dev_info);
> Please change to rte_dmadev_info_get_t to avoid conflict due to namespace issue
> as this header is exported.
> 
> REPLY: I prefer not add 'rte_' prefix, it make the define too long.
> 
I disagree on this, they need the rte_ prefix, despite the fact it makes
them longer. If length is a concern, these can be changed from "dmadev_" to
"rte_dma_", which is only one character longer.
In fact, I believe Morten already suggested we use "rte_dma" rather than
"rte_dmadev" as a function prefix across the library.

> 8.
> COMMENT: > + *        - rte_dmadev_completed_fails()
> > + *            - return the number of operation requests failed to complete.
> Please rename this to "completed_status" to allow the return of information
> other than just errors. As I suggested before, I think this should also be
> usable as a slower version of "completed" even in the case where there are
> no errors, in that it returns status information for each and every job
> rather than just returning as soon as it hits a failure.
> 
> REPLY: well, I think it maybe confuse (current OK/FAIL API is easy to understand.),
> and we can build the slow path function on the two API.
> 
I still disagree on this too. We have a "completed" op where we get
informed of what has completed and minimal error indication, and a
"completed_status" operation which provides status information for each
operation completed, at the cost of speed.

> 9.
> COMMENT: > +#define RTE_DMA_DEV_CAPA_MEM_TO_MEM	(1ull << 0)
> > +/**< DMA device support mem-to-mem transfer.
> Do we need this? Can we assume that any device appearing as a dmadev can
> do mem-to-mem copies, and drop the capability for mem-to-mem and the
> capability for copying?
> also for RTE_DMA_DEV_CAPA_OPS_COPY
> 
> REPLY: yes, I insist on adding this for the sake of conceptual integrity.
> For ioat driver just make a statement.
> 

Ok. It seems a wasted bit to me, but I don't see us running out of them
soon.

> 10.
> COMMENT: > +	uint16_t nb_vchans; /**< Number of virtual DMA channel configured */
> > +};
> Let's add rte_dmadev_conf struct into this to return the configuration
> settings.
> 
> REPLY: If we add rte_dmadev_conf in, it may break ABI when rte_dmadev_conf add fields.
> 
Yes, that is true, but I fail to see why that is a major problem. It just
means that if the conf structure changes we have two functions to version
instead of one. The information is still useful.

If you don't want the actual conf structure explicitly put into the info
struct, we can instead put the fields in directly. I really think that the
info_get function should provide back to the user the details of how
the device was configured previously.

regards,
/Bruce

^ permalink raw reply	[flat|nested] 339+ messages in thread

* Re: [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library
  2021-07-11  9:25 ` [dpdk-dev] [PATCH v2] dmadev: introduce DMA device library Chengwen Feng
                     ` (5 preceding siblings ...)
  2021-07-12 15:50   ` Bruce Richardson
@ 2021-07-13 14:19   ` Ananyev, Konstantin
  2021-07-13 14:28     ` Bruce Richardson
  6 siblings, 1 reply; 339+ messages in thread
From: Ananyev, Konstantin @ 2021-07-13 14:19 UTC (permalink /