All of lore.kernel.org
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD
@ 2019-09-06 15:44 Fiona Trahe
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
                   ` (6 more replies)
  0 siblings, 7 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 15:44 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

Remove the limitation whereby enqueue and dequeue must be
done in same thread.
The inflight calculation is reworked to be thread-safe for 2
threads - note this is not general multi-thread support, i.e
all enqueues to a qp must still be done in one thread and
all dequeues must be done in one thread, but enqueues and
dequeues may be in separate threads.
As the tail-coalescing feature is not
threadsafe it is removed first.

Fiona Trahe (3):
  common/qat: remove tail write coalescing feature
  common/qat: move max inflights param into qp
  common/qat: add dual thread support

 doc/guides/compressdevs/qat_comp.rst |    5 ++-
 doc/guides/cryptodevs/qat.rst        |   11 ++++-
 drivers/common/qat/qat_qp.c          |   77 +++++++++++++++++-----------------
 drivers/common/qat/qat_qp.h          |   11 +----
 4 files changed, 54 insertions(+), 50 deletions(-)


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH 1/3] common/qat: remove tail write coalescing feature
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
@ 2019-09-06 15:44 ` Fiona Trahe
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 2/3] common/qat: move max inflights param into qp Fiona Trahe
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 15:44 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

The feature Coalescing Tail Writes on Enqueue is removed
as it is not thread-safe and a dual-thread feature will be added shortly.

Signed-off-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_qp.c |   16 +++-------------
 drivers/common/qat/qat_qp.h |    6 ------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/common/qat/qat_qp.c b/drivers/common/qat/qat_qp.c
index 03f11f8..01ddce0 100644
--- a/drivers/common/qat/qat_qp.c
+++ b/drivers/common/qat/qat_qp.c
@@ -538,7 +538,6 @@ static inline uint32_t adf_modulo(uint32_t data, uint32_t modulo_mask)
 txq_write_tail(struct qat_qp *qp, struct qat_queue *q) {
 	WRITE_CSR_RING_TAIL(qp->mmap_bar_addr, q->hw_bundle_number,
 			q->hw_queue_number, q->tail);
-	q->nb_pending_requests = 0;
 	q->csr_tail = q->tail;
 }
 
@@ -622,25 +621,20 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 kick_tail:
 	queue->tail = tail;
 	tmp_qp->stats.enqueued_count += nb_ops_sent;
-	queue->nb_pending_requests += nb_ops_sent;
-	if (tmp_qp->inflights16 < QAT_CSR_TAIL_FORCE_WRITE_THRESH ||
-		    queue->nb_pending_requests > QAT_CSR_TAIL_WRITE_THRESH) {
-		txq_write_tail(tmp_qp, queue);
-	}
+	txq_write_tail(tmp_qp, queue);
 	return nb_ops_sent;
 }
 
 uint16_t
 qat_dequeue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 {
-	struct qat_queue *rx_queue, *tx_queue;
+	struct qat_queue *rx_queue;
 	struct qat_qp *tmp_qp = (struct qat_qp *)qp;
 	uint32_t head;
 	uint32_t resp_counter = 0;
 	uint8_t *resp_msg;
 
 	rx_queue = &(tmp_qp->rx_q);
-	tx_queue = &(tmp_qp->tx_q);
 	head = rx_queue->head;
 	resp_msg = (uint8_t *)rx_queue->base_addr + rx_queue->head;
 
@@ -677,11 +671,7 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 						QAT_CSR_HEAD_WRITE_THRESH)
 			rxq_free_desc(tmp_qp, rx_queue);
 	}
-	/* also check if tail needs to be advanced */
-	if (tmp_qp->inflights16 <= QAT_CSR_TAIL_FORCE_WRITE_THRESH &&
-		tx_queue->tail != tx_queue->csr_tail) {
-		txq_write_tail(tmp_qp, tx_queue);
-	}
+
 	return resp_counter;
 }
 
diff --git a/drivers/common/qat/qat_qp.h b/drivers/common/qat/qat_qp.h
index 980c2ba..9212ca4 100644
--- a/drivers/common/qat/qat_qp.h
+++ b/drivers/common/qat/qat_qp.h
@@ -11,10 +11,6 @@
 
 #define QAT_CSR_HEAD_WRITE_THRESH 32U
 /* number of requests to accumulate before writing head CSR */
-#define QAT_CSR_TAIL_WRITE_THRESH 32U
-/* number of requests to accumulate before writing tail CSR */
-#define QAT_CSR_TAIL_FORCE_WRITE_THRESH 256U
-/* number of inflights below which no tail write coalescing should occur */
 
 typedef int (*build_request_t)(void *op,
 		uint8_t *req, void *op_cookie,
@@ -64,8 +60,6 @@ struct qat_queue {
 	uint32_t	csr_tail;		/* last written tail value */
 	uint16_t	nb_processed_responses;
 	/* number of responses processed since last CSR head write */
-	uint16_t	nb_pending_requests;
-	/* number of requests pending since last CSR tail write */
 };
 
 struct qat_qp {
-- 
1.7.0.7


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH 2/3] common/qat: move max inflights param into qp
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
@ 2019-09-06 15:44 ` Fiona Trahe
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 3/3] common/qat: add dual thread support Fiona Trahe
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 15:44 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

The max_inflights parameter is moved from qat_queue to qat_qp as it's
a more appropriate location.

Signed-off-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_qp.c |   23 ++++++++++++-----------
 drivers/common/qat/qat_qp.h |    2 +-
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/common/qat/qat_qp.c b/drivers/common/qat/qat_qp.c
index 01ddce0..8e4c74a 100644
--- a/drivers/common/qat/qat_qp.c
+++ b/drivers/common/qat/qat_qp.c
@@ -239,6 +239,15 @@ int qat_qp_setup(struct qat_pci_device *qat_dev,
 		goto create_err;
 	}
 
+	qp->max_inflights = ADF_MAX_INFLIGHTS(qp->tx_q.queue_size,
+				ADF_BYTES_TO_MSG_SIZE(qp->tx_q.msg_size));
+
+	if (qp->max_inflights < 2) {
+		QAT_LOG(ERR, "Invalid num inflights");
+		qat_queue_delete(&(qp->tx_q));
+		goto create_err;
+	}
+
 	if (qat_queue_create(qat_dev, &(qp->rx_q), qat_qp_conf,
 					ADF_RING_DIR_RX) != 0) {
 		QAT_LOG(ERR, "Rx queue create failed "
@@ -416,15 +425,7 @@ static void qat_queue_delete(struct qat_queue *queue)
 		goto queue_create_err;
 	}
 
-	queue->max_inflights = ADF_MAX_INFLIGHTS(queue->queue_size,
-					ADF_BYTES_TO_MSG_SIZE(desc_size));
 	queue->modulo_mask = (1 << ADF_RING_SIZE_MODULO(queue->queue_size)) - 1;
-
-	if (queue->max_inflights < 2) {
-		QAT_LOG(ERR, "Invalid num inflights");
-		ret = -EINVAL;
-		goto queue_create_err;
-	}
 	queue->head = 0;
 	queue->tail = 0;
 	queue->msg_size = desc_size;
@@ -443,11 +444,11 @@ static void qat_queue_delete(struct qat_queue *queue)
 			queue->hw_queue_number, queue_base);
 
 	QAT_LOG(DEBUG, "RING: Name:%s, size in CSR: %u, in bytes %u,"
-		" nb msgs %u, msg_size %u, max_inflights %u modulo mask %u",
+		" nb msgs %u, msg_size %u, modulo mask %u",
 			queue->memz_name,
 			queue->queue_size, queue_size_bytes,
 			qp_conf->nb_descriptors, desc_size,
-			queue->max_inflights, queue->modulo_mask);
+			queue->modulo_mask);
 
 	return 0;
 
@@ -590,7 +591,7 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 
 	/* Find how many can actually fit on the ring */
 	tmp_qp->inflights16 += nb_ops;
-	overflow = tmp_qp->inflights16 - queue->max_inflights;
+	overflow = tmp_qp->inflights16 - tmp_qp->max_inflights;
 	if (overflow > 0) {
 		tmp_qp->inflights16 -= overflow;
 		nb_ops_possible = nb_ops - overflow;
diff --git a/drivers/common/qat/qat_qp.h b/drivers/common/qat/qat_qp.h
index 9212ca4..5066f06 100644
--- a/drivers/common/qat/qat_qp.h
+++ b/drivers/common/qat/qat_qp.h
@@ -51,7 +51,6 @@ struct qat_queue {
 	uint32_t	tail;			/* Shadow copy of the tail */
 	uint32_t	modulo_mask;
 	uint32_t	msg_size;
-	uint16_t	max_inflights;
 	uint32_t	queue_size;
 	uint8_t		hw_bundle_number;
 	uint8_t		hw_queue_number;
@@ -76,6 +75,7 @@ struct qat_qp {
 	enum qat_service_type service_type;
 	struct qat_pci_device *qat_dev;
 	/**< qat device this qp is on */
+	uint16_t max_inflights;
 } __rte_cache_aligned;
 
 extern const struct qat_qp_hw_data qat_gen1_qps[][ADF_MAX_QPS_ON_ANY_SERVICE];
-- 
1.7.0.7


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH 3/3] common/qat: add dual thread support
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 2/3] common/qat: move max inflights param into qp Fiona Trahe
@ 2019-09-06 15:44 ` Fiona Trahe
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD Fiona Trahe
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 15:44 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

Remove the limitation whereby enqueue and dequeue must be
done in same thread.
The inflight calculation is reworked to be thread-safe for 2
threads - note this is not general multi-thread support, i.e
all enqueues to a qp must still be done in one thread and
all dequeues must be done in one thread, but enqueues and
dequeues may be in separate threads.
Documentation updated.

Signed-off-by: Fiona Trahe <fiona.trahe@intel.com>
---
 doc/guides/compressdevs/qat_comp.rst |    5 +++-
 doc/guides/cryptodevs/qat.rst        |   11 +++++++-
 drivers/common/qat/qat_qp.c          |   40 ++++++++++++++++++++-------------
 drivers/common/qat/qat_qp.h          |    3 +-
 4 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/doc/guides/compressdevs/qat_comp.rst b/doc/guides/compressdevs/qat_comp.rst
index 6f583a4..d06b08a 100644
--- a/doc/guides/compressdevs/qat_comp.rst
+++ b/doc/guides/compressdevs/qat_comp.rst
@@ -33,7 +33,10 @@ Limitations
 -----------
 
 * Compressdev level 0, no compression, is not supported.
-* Queue pairs are not thread-safe (that is, within a single queue pair, RX and TX from different lcores is not supported).
+* Queue-pairs are thread-safe on Intel CPUs but Queues are not (that is, within a single
+  queue-pair all enqueues to the TX queue must be done from one thread and all dequeues
+  from the RX queue must be done from one thread, but enqueues and dequeues may be done
+  in different threads.)
 * No BSD support as BSD QAT kernel driver not available.
 * When using Deflate dynamic huffman encoding for compression, the input size (op.src.length)
   must be < CONFIG_RTE_PMD_QAT_COMP_IM_BUFFER_SIZE from the config file,
diff --git a/doc/guides/cryptodevs/qat.rst b/doc/guides/cryptodevs/qat.rst
index e905f6d..7c81b9b 100644
--- a/doc/guides/cryptodevs/qat.rst
+++ b/doc/guides/cryptodevs/qat.rst
@@ -81,7 +81,11 @@ Limitations
 * No BSD support as BSD QAT kernel driver not available.
 * ZUC EEA3/EIA3 is not supported by dh895xcc devices
 * Maximum additional authenticated data (AAD) for GCM is 240 bytes long and must be passed to the device in a buffer rounded up to the nearest block-size multiple (x16) and padded with zeros.
-* Queue pairs are not thread-safe (that is, within a single queue pair, RX and TX from different lcores is not supported).
+* Queue-pairs are thread-safe on Intel CPUs but Queues are not (that is, within a single
+  queue-pair all enqueues to the TX queue must be done from one thread and all dequeues
+  from the RX queue must be done from one thread, but enqueues and dequeues may be done
+  in different threads.)
+
 
 Extra notes on KASUMI F9
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -122,7 +126,10 @@ Limitations
 ~~~~~~~~~~~
 
 * Big integers longer than 4096 bits are not supported.
-* Queue pairs are not thread-safe (that is, within a single queue pair, RX and TX from different lcores is not supported).
+* Queue-pairs are thread-safe on Intel CPUs but Queues are not (that is, within a single
+  queue-pair all enqueues to the TX queue must be done from one thread and all dequeues
+  from the RX queue must be done from one thread, but enqueues and dequeues may be done
+  in different threads.)
 
 .. _building_qat:
 
diff --git a/drivers/common/qat/qat_qp.c b/drivers/common/qat/qat_qp.c
index 8e4c74a..30cdc61 100644
--- a/drivers/common/qat/qat_qp.c
+++ b/drivers/common/qat/qat_qp.c
@@ -230,7 +230,7 @@ int qat_qp_setup(struct qat_pci_device *qat_dev,
 	}
 
 	qp->mmap_bar_addr = pci_dev->mem_resource[0].addr;
-	qp->inflights16 = 0;
+	qp->enqueued = qp->dequeued = 0;
 
 	if (qat_queue_create(qat_dev, &(qp->tx_q), qat_qp_conf,
 					ADF_RING_DIR_TX) != 0) {
@@ -321,7 +321,7 @@ int qat_qp_release(struct qat_qp **qp_addr)
 				qp->qat_dev->qat_dev_id);
 
 	/* Don't free memory if there are still responses to be processed */
-	if (qp->inflights16 == 0) {
+	if ((qp->enqueued - qp->dequeued) == 0) {
 		qat_queue_delete(&(qp->tx_q));
 		qat_queue_delete(&(qp->rx_q));
 	} else {
@@ -579,7 +579,6 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 	uint16_t nb_ops_possible = nb_ops;
 	register uint8_t *base_addr;
 	register uint32_t tail;
-	int overflow;
 
 	if (unlikely(nb_ops == 0))
 		return 0;
@@ -590,13 +589,25 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 	tail = queue->tail;
 
 	/* Find how many can actually fit on the ring */
-	tmp_qp->inflights16 += nb_ops;
-	overflow = tmp_qp->inflights16 - tmp_qp->max_inflights;
-	if (overflow > 0) {
-		tmp_qp->inflights16 -= overflow;
-		nb_ops_possible = nb_ops - overflow;
-		if (nb_ops_possible == 0)
-			return 0;
+	{
+		/* dequeued can only be written by one thread, but it may not
+		 * be this thread. As it's 4-byte aligned it will be read
+		 * atomically here by any Intel CPU.
+		 * enqueued can wrap before dequeued, but cannot
+		 * lap it as var size of enq/deq (uint32_t) > var size of
+		 * max_inflights (uint16_t). In reality inflights is never
+		 * even as big as max uint16_t, as it's <= ADF_MAX_DESC.
+		 * On wrapping, the calculation still returns the correct
+		 * positive value as all three vars are unsigned.
+		 */
+		uint32_t inflights =
+			tmp_qp->enqueued - tmp_qp->dequeued;
+
+		if ((inflights + nb_ops) > tmp_qp->max_inflights) {
+			nb_ops_possible = tmp_qp->max_inflights - inflights;
+			if (nb_ops_possible == 0)
+				return 0;
+		}
 	}
 
 	while (nb_ops_sent != nb_ops_possible) {
@@ -605,11 +616,7 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 				tmp_qp->qat_dev_gen);
 		if (ret != 0) {
 			tmp_qp->stats.enqueue_err_count++;
-			/*
-			 * This message cannot be enqueued,
-			 * decrease number of ops that wasn't sent
-			 */
-			tmp_qp->inflights16 -= nb_ops_possible - nb_ops_sent;
+			/* This message cannot be enqueued */
 			if (nb_ops_sent == 0)
 				return 0;
 			goto kick_tail;
@@ -621,6 +628,7 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 	}
 kick_tail:
 	queue->tail = tail;
+	tmp_qp->enqueued += nb_ops_sent;
 	tmp_qp->stats.enqueued_count += nb_ops_sent;
 	txq_write_tail(tmp_qp, queue);
 	return nb_ops_sent;
@@ -664,9 +672,9 @@ void rxq_free_desc(struct qat_qp *qp, struct qat_queue *q)
 	}
 	if (resp_counter > 0) {
 		rx_queue->head = head;
+		tmp_qp->dequeued += resp_counter;
 		tmp_qp->stats.dequeued_count += resp_counter;
 		rx_queue->nb_processed_responses += resp_counter;
-		tmp_qp->inflights16 -= resp_counter;
 
 		if (rx_queue->nb_processed_responses >
 						QAT_CSR_HEAD_WRITE_THRESH)
diff --git a/drivers/common/qat/qat_qp.h b/drivers/common/qat/qat_qp.h
index 5066f06..8b9ab79 100644
--- a/drivers/common/qat/qat_qp.h
+++ b/drivers/common/qat/qat_qp.h
@@ -63,7 +63,6 @@ struct qat_queue {
 
 struct qat_qp {
 	void			*mmap_bar_addr;
-	uint16_t		inflights16;
 	struct qat_queue	tx_q;
 	struct qat_queue	rx_q;
 	struct qat_common_stats stats;
@@ -75,6 +74,8 @@ struct qat_qp {
 	enum qat_service_type service_type;
 	struct qat_pci_device *qat_dev;
 	/**< qat device this qp is on */
+	uint32_t enqueued;
+	uint32_t dequeued __rte_aligned(4);
 	uint16_t max_inflights;
 } __rte_cache_aligned;
 
-- 
1.7.0.7


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
                   ` (2 preceding siblings ...)
  2019-09-06 15:44 ` [dpdk-dev] [PATCH 3/3] common/qat: add dual thread support Fiona Trahe
@ 2019-09-06 16:12 ` Fiona Trahe
  2019-10-04 10:48   ` Akhil Goyal
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 16:12 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

Remove the limitation whereby enqueue and dequeue must be
done in same thread.
The inflight calculation is reworked to be thread-safe for 2
threads - note this is not general multi-thread support, i.e
all enqueues to a qp must still be done in one thread and
all dequeues must be done in one thread, but enqueues and
dequeues may be in separate threads.
As the tail-coalescing feature is not
threadsafe it is removed first.

v2 changes:
 - clarified wording in docs

Fiona Trahe (3):
  common/qat: remove tail write coalescing feature
  common/qat: move max inflights param into qp
  common/qat: add dual thread support

 doc/guides/compressdevs/qat_comp.rst |  5 ++-
 doc/guides/cryptodevs/qat.rst        | 11 +++++-
 drivers/common/qat/qat_qp.c          | 77 ++++++++++++++++++------------------
 drivers/common/qat/qat_qp.h          | 11 ++----
 4 files changed, 54 insertions(+), 50 deletions(-)

-- 
2.13.6


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] common/qat: remove tail write coalescing feature
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
                   ` (3 preceding siblings ...)
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD Fiona Trahe
@ 2019-09-06 16:12 ` Fiona Trahe
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 2/3] common/qat: move max inflights param into qp Fiona Trahe
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 3/3] common/qat: add dual thread support Fiona Trahe
  6 siblings, 0 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 16:12 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

The feature Coalescing Tail Writes on Enqueue is removed
as it is not thread-safe and a dual-thread feature will be added shortly.

Signed-off-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_qp.c | 16 +++-------------
 drivers/common/qat/qat_qp.h |  6 ------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/common/qat/qat_qp.c b/drivers/common/qat/qat_qp.c
index 03f11f869..01ddce008 100644
--- a/drivers/common/qat/qat_qp.c
+++ b/drivers/common/qat/qat_qp.c
@@ -538,7 +538,6 @@ static inline void
 txq_write_tail(struct qat_qp *qp, struct qat_queue *q) {
 	WRITE_CSR_RING_TAIL(qp->mmap_bar_addr, q->hw_bundle_number,
 			q->hw_queue_number, q->tail);
-	q->nb_pending_requests = 0;
 	q->csr_tail = q->tail;
 }
 
@@ -622,25 +621,20 @@ qat_enqueue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 kick_tail:
 	queue->tail = tail;
 	tmp_qp->stats.enqueued_count += nb_ops_sent;
-	queue->nb_pending_requests += nb_ops_sent;
-	if (tmp_qp->inflights16 < QAT_CSR_TAIL_FORCE_WRITE_THRESH ||
-		    queue->nb_pending_requests > QAT_CSR_TAIL_WRITE_THRESH) {
-		txq_write_tail(tmp_qp, queue);
-	}
+	txq_write_tail(tmp_qp, queue);
 	return nb_ops_sent;
 }
 
 uint16_t
 qat_dequeue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 {
-	struct qat_queue *rx_queue, *tx_queue;
+	struct qat_queue *rx_queue;
 	struct qat_qp *tmp_qp = (struct qat_qp *)qp;
 	uint32_t head;
 	uint32_t resp_counter = 0;
 	uint8_t *resp_msg;
 
 	rx_queue = &(tmp_qp->rx_q);
-	tx_queue = &(tmp_qp->tx_q);
 	head = rx_queue->head;
 	resp_msg = (uint8_t *)rx_queue->base_addr + rx_queue->head;
 
@@ -677,11 +671,7 @@ qat_dequeue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 						QAT_CSR_HEAD_WRITE_THRESH)
 			rxq_free_desc(tmp_qp, rx_queue);
 	}
-	/* also check if tail needs to be advanced */
-	if (tmp_qp->inflights16 <= QAT_CSR_TAIL_FORCE_WRITE_THRESH &&
-		tx_queue->tail != tx_queue->csr_tail) {
-		txq_write_tail(tmp_qp, tx_queue);
-	}
+
 	return resp_counter;
 }
 
diff --git a/drivers/common/qat/qat_qp.h b/drivers/common/qat/qat_qp.h
index 980c2ba32..9212ca457 100644
--- a/drivers/common/qat/qat_qp.h
+++ b/drivers/common/qat/qat_qp.h
@@ -11,10 +11,6 @@ struct qat_pci_device;
 
 #define QAT_CSR_HEAD_WRITE_THRESH 32U
 /* number of requests to accumulate before writing head CSR */
-#define QAT_CSR_TAIL_WRITE_THRESH 32U
-/* number of requests to accumulate before writing tail CSR */
-#define QAT_CSR_TAIL_FORCE_WRITE_THRESH 256U
-/* number of inflights below which no tail write coalescing should occur */
 
 typedef int (*build_request_t)(void *op,
 		uint8_t *req, void *op_cookie,
@@ -64,8 +60,6 @@ struct qat_queue {
 	uint32_t	csr_tail;		/* last written tail value */
 	uint16_t	nb_processed_responses;
 	/* number of responses processed since last CSR head write */
-	uint16_t	nb_pending_requests;
-	/* number of requests pending since last CSR tail write */
 };
 
 struct qat_qp {
-- 
2.13.6


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] common/qat: move max inflights param into qp
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
                   ` (4 preceding siblings ...)
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
@ 2019-09-06 16:12 ` Fiona Trahe
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 3/3] common/qat: add dual thread support Fiona Trahe
  6 siblings, 0 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 16:12 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

The max_inflights parameter is moved from qat_queue to qat_qp as it's
a more appropriate location.

Signed-off-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_qp.c | 23 ++++++++++++-----------
 drivers/common/qat/qat_qp.h |  2 +-
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/common/qat/qat_qp.c b/drivers/common/qat/qat_qp.c
index 01ddce008..8e4c74a02 100644
--- a/drivers/common/qat/qat_qp.c
+++ b/drivers/common/qat/qat_qp.c
@@ -239,6 +239,15 @@ int qat_qp_setup(struct qat_pci_device *qat_dev,
 		goto create_err;
 	}
 
+	qp->max_inflights = ADF_MAX_INFLIGHTS(qp->tx_q.queue_size,
+				ADF_BYTES_TO_MSG_SIZE(qp->tx_q.msg_size));
+
+	if (qp->max_inflights < 2) {
+		QAT_LOG(ERR, "Invalid num inflights");
+		qat_queue_delete(&(qp->tx_q));
+		goto create_err;
+	}
+
 	if (qat_queue_create(qat_dev, &(qp->rx_q), qat_qp_conf,
 					ADF_RING_DIR_RX) != 0) {
 		QAT_LOG(ERR, "Rx queue create failed "
@@ -416,15 +425,7 @@ qat_queue_create(struct qat_pci_device *qat_dev, struct qat_queue *queue,
 		goto queue_create_err;
 	}
 
-	queue->max_inflights = ADF_MAX_INFLIGHTS(queue->queue_size,
-					ADF_BYTES_TO_MSG_SIZE(desc_size));
 	queue->modulo_mask = (1 << ADF_RING_SIZE_MODULO(queue->queue_size)) - 1;
-
-	if (queue->max_inflights < 2) {
-		QAT_LOG(ERR, "Invalid num inflights");
-		ret = -EINVAL;
-		goto queue_create_err;
-	}
 	queue->head = 0;
 	queue->tail = 0;
 	queue->msg_size = desc_size;
@@ -443,11 +444,11 @@ qat_queue_create(struct qat_pci_device *qat_dev, struct qat_queue *queue,
 			queue->hw_queue_number, queue_base);
 
 	QAT_LOG(DEBUG, "RING: Name:%s, size in CSR: %u, in bytes %u,"
-		" nb msgs %u, msg_size %u, max_inflights %u modulo mask %u",
+		" nb msgs %u, msg_size %u, modulo mask %u",
 			queue->memz_name,
 			queue->queue_size, queue_size_bytes,
 			qp_conf->nb_descriptors, desc_size,
-			queue->max_inflights, queue->modulo_mask);
+			queue->modulo_mask);
 
 	return 0;
 
@@ -590,7 +591,7 @@ qat_enqueue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 
 	/* Find how many can actually fit on the ring */
 	tmp_qp->inflights16 += nb_ops;
-	overflow = tmp_qp->inflights16 - queue->max_inflights;
+	overflow = tmp_qp->inflights16 - tmp_qp->max_inflights;
 	if (overflow > 0) {
 		tmp_qp->inflights16 -= overflow;
 		nb_ops_possible = nb_ops - overflow;
diff --git a/drivers/common/qat/qat_qp.h b/drivers/common/qat/qat_qp.h
index 9212ca457..5066f06f0 100644
--- a/drivers/common/qat/qat_qp.h
+++ b/drivers/common/qat/qat_qp.h
@@ -51,7 +51,6 @@ struct qat_queue {
 	uint32_t	tail;			/* Shadow copy of the tail */
 	uint32_t	modulo_mask;
 	uint32_t	msg_size;
-	uint16_t	max_inflights;
 	uint32_t	queue_size;
 	uint8_t		hw_bundle_number;
 	uint8_t		hw_queue_number;
@@ -76,6 +75,7 @@ struct qat_qp {
 	enum qat_service_type service_type;
 	struct qat_pci_device *qat_dev;
 	/**< qat device this qp is on */
+	uint16_t max_inflights;
 } __rte_cache_aligned;
 
 extern const struct qat_qp_hw_data qat_gen1_qps[][ADF_MAX_QPS_ON_ANY_SERVICE];
-- 
2.13.6


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] common/qat: add dual thread support
  2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
                   ` (5 preceding siblings ...)
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 2/3] common/qat: move max inflights param into qp Fiona Trahe
@ 2019-09-06 16:12 ` Fiona Trahe
  6 siblings, 0 replies; 10+ messages in thread
From: Fiona Trahe @ 2019-09-06 16:12 UTC (permalink / raw)
  To: dev; +Cc: akhil.goyal, arkadiuszx.kusztal, fiona.trahe

Remove the limitation whereby enqueue and dequeue must be
done in same thread.
The inflight calculation is reworked to be thread-safe for 2
threads - note this is not general multi-thread support, i.e
all enqueues to a qp must still be done in one thread and
all dequeues must be done in one thread, but enqueues and
dequeues may be in separate threads.
Documentation updated.

Signed-off-by: Fiona Trahe <fiona.trahe@intel.com>
---
 doc/guides/compressdevs/qat_comp.rst |  5 ++++-
 doc/guides/cryptodevs/qat.rst        | 11 ++++++++--
 drivers/common/qat/qat_qp.c          | 40 +++++++++++++++++++++---------------
 drivers/common/qat/qat_qp.h          |  3 ++-
 4 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/doc/guides/compressdevs/qat_comp.rst b/doc/guides/compressdevs/qat_comp.rst
index 6f583a460..669e288e3 100644
--- a/doc/guides/compressdevs/qat_comp.rst
+++ b/doc/guides/compressdevs/qat_comp.rst
@@ -33,7 +33,10 @@ Limitations
 -----------
 
 * Compressdev level 0, no compression, is not supported.
-* Queue pairs are not thread-safe (that is, within a single queue pair, RX and TX from different lcores is not supported).
+* QAT queue-pairs are thread-safe when process runs on an Intel CPU but QAT queues are not
+  (that is, within a single queue-pair all enqueues to the TX queue must be done from one
+  thread and all dequeues from the RX queue must be done from one thread, but enqueues
+  and dequeues may be done in different threads.)
 * No BSD support as BSD QAT kernel driver not available.
 * When using Deflate dynamic huffman encoding for compression, the input size (op.src.length)
   must be < CONFIG_RTE_PMD_QAT_COMP_IM_BUFFER_SIZE from the config file,
diff --git a/doc/guides/cryptodevs/qat.rst b/doc/guides/cryptodevs/qat.rst
index e905f6d00..750a03e6c 100644
--- a/doc/guides/cryptodevs/qat.rst
+++ b/doc/guides/cryptodevs/qat.rst
@@ -81,7 +81,11 @@ Limitations
 * No BSD support as BSD QAT kernel driver not available.
 * ZUC EEA3/EIA3 is not supported by dh895xcc devices
 * Maximum additional authenticated data (AAD) for GCM is 240 bytes long and must be passed to the device in a buffer rounded up to the nearest block-size multiple (x16) and padded with zeros.
-* Queue pairs are not thread-safe (that is, within a single queue pair, RX and TX from different lcores is not supported).
+* QAT queue-pairs are thread-safe when process runs on an Intel CPU but QAT queues are not
+  (that is, within a single queue-pair all enqueues to the TX queue must be done from one
+  thread and all dequeues from the RX queue must be done from one thread, but enqueues
+  and dequeues may be done in different threads.)
+
 
 Extra notes on KASUMI F9
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -122,7 +126,10 @@ Limitations
 ~~~~~~~~~~~
 
 * Big integers longer than 4096 bits are not supported.
-* Queue pairs are not thread-safe (that is, within a single queue pair, RX and TX from different lcores is not supported).
+* QAT queue-pairs are thread-safe when process runs on an Intel CPU but QAT queues are not
+  (that is, within a single queue-pair all enqueues to the TX queue must be done from one
+  thread and all dequeues from the RX queue must be done from one thread, but enqueues
+  and dequeues may be done in different threads.)
 
 .. _building_qat:
 
diff --git a/drivers/common/qat/qat_qp.c b/drivers/common/qat/qat_qp.c
index 8e4c74a02..30cdc618d 100644
--- a/drivers/common/qat/qat_qp.c
+++ b/drivers/common/qat/qat_qp.c
@@ -230,7 +230,7 @@ int qat_qp_setup(struct qat_pci_device *qat_dev,
 	}
 
 	qp->mmap_bar_addr = pci_dev->mem_resource[0].addr;
-	qp->inflights16 = 0;
+	qp->enqueued = qp->dequeued = 0;
 
 	if (qat_queue_create(qat_dev, &(qp->tx_q), qat_qp_conf,
 					ADF_RING_DIR_TX) != 0) {
@@ -321,7 +321,7 @@ int qat_qp_release(struct qat_qp **qp_addr)
 				qp->qat_dev->qat_dev_id);
 
 	/* Don't free memory if there are still responses to be processed */
-	if (qp->inflights16 == 0) {
+	if ((qp->enqueued - qp->dequeued) == 0) {
 		qat_queue_delete(&(qp->tx_q));
 		qat_queue_delete(&(qp->rx_q));
 	} else {
@@ -579,7 +579,6 @@ qat_enqueue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 	uint16_t nb_ops_possible = nb_ops;
 	register uint8_t *base_addr;
 	register uint32_t tail;
-	int overflow;
 
 	if (unlikely(nb_ops == 0))
 		return 0;
@@ -590,13 +589,25 @@ qat_enqueue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 	tail = queue->tail;
 
 	/* Find how many can actually fit on the ring */
-	tmp_qp->inflights16 += nb_ops;
-	overflow = tmp_qp->inflights16 - tmp_qp->max_inflights;
-	if (overflow > 0) {
-		tmp_qp->inflights16 -= overflow;
-		nb_ops_possible = nb_ops - overflow;
-		if (nb_ops_possible == 0)
-			return 0;
+	{
+		/* dequeued can only be written by one thread, but it may not
+		 * be this thread. As it's 4-byte aligned it will be read
+		 * atomically here by any Intel CPU.
+		 * enqueued can wrap before dequeued, but cannot
+		 * lap it as var size of enq/deq (uint32_t) > var size of
+		 * max_inflights (uint16_t). In reality inflights is never
+		 * even as big as max uint16_t, as it's <= ADF_MAX_DESC.
+		 * On wrapping, the calculation still returns the correct
+		 * positive value as all three vars are unsigned.
+		 */
+		uint32_t inflights =
+			tmp_qp->enqueued - tmp_qp->dequeued;
+
+		if ((inflights + nb_ops) > tmp_qp->max_inflights) {
+			nb_ops_possible = tmp_qp->max_inflights - inflights;
+			if (nb_ops_possible == 0)
+				return 0;
+		}
 	}
 
 	while (nb_ops_sent != nb_ops_possible) {
@@ -605,11 +616,7 @@ qat_enqueue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 				tmp_qp->qat_dev_gen);
 		if (ret != 0) {
 			tmp_qp->stats.enqueue_err_count++;
-			/*
-			 * This message cannot be enqueued,
-			 * decrease number of ops that wasn't sent
-			 */
-			tmp_qp->inflights16 -= nb_ops_possible - nb_ops_sent;
+			/* This message cannot be enqueued */
 			if (nb_ops_sent == 0)
 				return 0;
 			goto kick_tail;
@@ -621,6 +628,7 @@ qat_enqueue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 	}
 kick_tail:
 	queue->tail = tail;
+	tmp_qp->enqueued += nb_ops_sent;
 	tmp_qp->stats.enqueued_count += nb_ops_sent;
 	txq_write_tail(tmp_qp, queue);
 	return nb_ops_sent;
@@ -664,9 +672,9 @@ qat_dequeue_op_burst(void *qp, void **ops, uint16_t nb_ops)
 	}
 	if (resp_counter > 0) {
 		rx_queue->head = head;
+		tmp_qp->dequeued += resp_counter;
 		tmp_qp->stats.dequeued_count += resp_counter;
 		rx_queue->nb_processed_responses += resp_counter;
-		tmp_qp->inflights16 -= resp_counter;
 
 		if (rx_queue->nb_processed_responses >
 						QAT_CSR_HEAD_WRITE_THRESH)
diff --git a/drivers/common/qat/qat_qp.h b/drivers/common/qat/qat_qp.h
index 5066f06f0..8b9ab79ff 100644
--- a/drivers/common/qat/qat_qp.h
+++ b/drivers/common/qat/qat_qp.h
@@ -63,7 +63,6 @@ struct qat_queue {
 
 struct qat_qp {
 	void			*mmap_bar_addr;
-	uint16_t		inflights16;
 	struct qat_queue	tx_q;
 	struct qat_queue	rx_q;
 	struct qat_common_stats stats;
@@ -75,6 +74,8 @@ struct qat_qp {
 	enum qat_service_type service_type;
 	struct qat_pci_device *qat_dev;
 	/**< qat device this qp is on */
+	uint32_t enqueued;
+	uint32_t dequeued __rte_aligned(4);
 	uint16_t max_inflights;
 } __rte_cache_aligned;
 
-- 
2.13.6


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD
  2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD Fiona Trahe
@ 2019-10-04 10:48   ` Akhil Goyal
  2019-10-08 10:53     ` Trahe, Fiona
  0 siblings, 1 reply; 10+ messages in thread
From: Akhil Goyal @ 2019-10-04 10:48 UTC (permalink / raw)
  To: Fiona Trahe, dev; +Cc: arkadiuszx.kusztal

Hi Fiona,

This patchset need a rebase. Could you please send v3.

Thanks,
Akhil

> -----Original Message-----
> From: Fiona Trahe <fiona.trahe@intel.com>
> Sent: Friday, September 6, 2019 9:42 PM
> To: dev@dpdk.org
> Cc: Akhil Goyal <akhil.goyal@nxp.com>; arkadiuszx.kusztal@intel.com;
> fiona.trahe@intel.com
> Subject: [PATCH v2 0/3] Add dual threading in QAT PMD
> 
> Remove the limitation whereby enqueue and dequeue must be
> done in same thread.
> The inflight calculation is reworked to be thread-safe for 2
> threads - note this is not general multi-thread support, i.e
> all enqueues to a qp must still be done in one thread and
> all dequeues must be done in one thread, but enqueues and
> dequeues may be in separate threads.
> As the tail-coalescing feature is not
> threadsafe it is removed first.
> 
> v2 changes:
>  - clarified wording in docs
> 
> Fiona Trahe (3):
>   common/qat: remove tail write coalescing feature
>   common/qat: move max inflights param into qp
>   common/qat: add dual thread support
> 
>  doc/guides/compressdevs/qat_comp.rst |  5 ++-
>  doc/guides/cryptodevs/qat.rst        | 11 +++++-
>  drivers/common/qat/qat_qp.c          | 77 ++++++++++++++++++------------------
>  drivers/common/qat/qat_qp.h          | 11 ++----
>  4 files changed, 54 insertions(+), 50 deletions(-)
> 
> --
> 2.13.6


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD
  2019-10-04 10:48   ` Akhil Goyal
@ 2019-10-08 10:53     ` Trahe, Fiona
  0 siblings, 0 replies; 10+ messages in thread
From: Trahe, Fiona @ 2019-10-08 10:53 UTC (permalink / raw)
  To: Akhil Goyal, dev; +Cc: Kusztal, ArkadiuszX, Trahe, Fiona

Hi Akhil,
We're going to defer this patchset this to 20.02. 
We want to add another patch to it to mitigate some performance impacts it causes.#
I'm not sure of the process for this - do I just mark it in patchwork as deferred?
Or is there a tag I can put in an email, like Deferred-by?
Fiona

> -----Original Message-----
> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
> Sent: Friday, October 4, 2019 11:48 AM
> To: Trahe, Fiona <fiona.trahe@intel.com>; dev@dpdk.org
> Cc: Kusztal, ArkadiuszX <arkadiuszx.kusztal@intel.com>
> Subject: RE: [PATCH v2 0/3] Add dual threading in QAT PMD
> 
> Hi Fiona,
> 
> This patchset need a rebase. Could you please send v3.
> 
> Thanks,
> Akhil
> 
> > -----Original Message-----
> > From: Fiona Trahe <fiona.trahe@intel.com>
> > Sent: Friday, September 6, 2019 9:42 PM
> > To: dev@dpdk.org
> > Cc: Akhil Goyal <akhil.goyal@nxp.com>; arkadiuszx.kusztal@intel.com;
> > fiona.trahe@intel.com
> > Subject: [PATCH v2 0/3] Add dual threading in QAT PMD
> >
> > Remove the limitation whereby enqueue and dequeue must be
> > done in same thread.
> > The inflight calculation is reworked to be thread-safe for 2
> > threads - note this is not general multi-thread support, i.e
> > all enqueues to a qp must still be done in one thread and
> > all dequeues must be done in one thread, but enqueues and
> > dequeues may be in separate threads.
> > As the tail-coalescing feature is not
> > threadsafe it is removed first.
> >
> > v2 changes:
> >  - clarified wording in docs
> >
> > Fiona Trahe (3):
> >   common/qat: remove tail write coalescing feature
> >   common/qat: move max inflights param into qp
> >   common/qat: add dual thread support
> >
> >  doc/guides/compressdevs/qat_comp.rst |  5 ++-
> >  doc/guides/cryptodevs/qat.rst        | 11 +++++-
> >  drivers/common/qat/qat_qp.c          | 77 ++++++++++++++++++------------------
> >  drivers/common/qat/qat_qp.h          | 11 ++----
> >  4 files changed, 54 insertions(+), 50 deletions(-)
> >
> > --
> > 2.13.6


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-10-08 10:53 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-06 15:44 [dpdk-dev] [PATCH 0/3] Add dual threading in QAT PMD Fiona Trahe
2019-09-06 15:44 ` [dpdk-dev] [PATCH 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
2019-09-06 15:44 ` [dpdk-dev] [PATCH 2/3] common/qat: move max inflights param into qp Fiona Trahe
2019-09-06 15:44 ` [dpdk-dev] [PATCH 3/3] common/qat: add dual thread support Fiona Trahe
2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 0/3] Add dual threading in QAT PMD Fiona Trahe
2019-10-04 10:48   ` Akhil Goyal
2019-10-08 10:53     ` Trahe, Fiona
2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 1/3] common/qat: remove tail write coalescing feature Fiona Trahe
2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 2/3] common/qat: move max inflights param into qp Fiona Trahe
2019-09-06 16:12 ` [dpdk-dev] [PATCH v2 3/3] common/qat: add dual thread support Fiona Trahe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.