ntb.lists.linux.dev archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/3] ntb_perf: extend with burst latency measurement
@ 2022-04-22 11:37 Alexander Fomichev
  2022-04-22 11:37 ` [PATCH v2 2/3] ntb_perf: extend with poll " Alexander Fomichev
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Alexander Fomichev @ 2022-04-22 11:37 UTC (permalink / raw)
  To: ntb
  Cc: linux, Jon Mason, Dave Jiang, Allen Hubbe, Guo Zhengkui,
	Alexander Fomichev

From: Alexander Fomichev <a.fomichev@yadro.com>

The ntb_perf test provides a tool for NTB hardware performance
evaluation. For software impact elimination the test uses a simple method
(let's call it 'burst' mode), when the local system sends to the remote
system a data set and counts time interval until hardware completion
report, without the remote side confirming, nor data integrity check.
The measured metric is a 'burst' throughput bandwidth of NTB connection.

The proposed change extends ntb_perf with 1 of 3 new metrics. The resulting
test is fully backward compatible.

1. Burst latency. It's a delay between start to send 1 byte to the
remote system and hardware readiness to send another byte. The
measurement performed within bandwidth test procedure. The DMA Engine is
off. Data integrity is not checked. This mode can be disabled by
'perf_latency=N' module parameter.

Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
---
 drivers/ntb/test/ntb_perf.c | 135 +++++++++++++++++++++++++++++++++++-
 1 file changed, 132 insertions(+), 3 deletions(-)

diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index 65e1e5cf1b29..79faa4a7fe94 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -106,6 +106,9 @@ MODULE_DESCRIPTION("PCIe NTB Performance Measurement Tool");
 
 #define PERF_BUF_LEN 1024
 
+#define LAT_MIN_TRIES	20
+#define RESCHEDULE_RATIO	10000
+
 static unsigned long max_mw_size;
 module_param(max_mw_size, ulong, 0644);
 MODULE_PARM_DESC(max_mw_size, "Upper limit of memory window size");
@@ -122,6 +125,14 @@ static bool use_dma; /* default to 0 */
 module_param(use_dma, bool, 0644);
 MODULE_PARM_DESC(use_dma, "Use DMA engine to measure performance");
 
+static bool perf_latency = true;
+module_param(perf_latency, bool, 0644);
+MODULE_PARM_DESC(perf_latency, "Also measure latency");
+
+static unsigned long lat_time_ms = 1000; /* default 1s */
+module_param(lat_time_ms, ulong, 0644);
+MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
+
 /*==============================================================================
  *                         Perf driver data definition
  *==============================================================================
@@ -178,6 +189,8 @@ struct perf_thread {
 	void *src;
 	u64 copied;
 	ktime_t duration;
+	ktime_t latency;
+	u64 tries;
 	int status;
 	struct work_struct work;
 };
@@ -783,7 +796,7 @@ static void perf_dma_copy_callback(void *data)
 }
 
 static int perf_copy_chunk(struct perf_thread *pthr,
-			   void __iomem *dst, void *src, size_t len)
+			   void __iomem *dst, void *src, size_t len, bool use_dma_)
 {
 	struct dma_async_tx_descriptor *tx;
 	struct dmaengine_unmap_data *unmap;
@@ -794,7 +807,7 @@ static int perf_copy_chunk(struct perf_thread *pthr,
 	void __iomem *dst_vaddr;
 	dma_addr_t dst_dma_addr;
 
-	if (!use_dma) {
+	if (!use_dma_) {
 		memcpy_toio(dst, src, len);
 		goto ret_check_tsync;
 	}
@@ -940,7 +953,7 @@ static int perf_run_test(struct perf_thread *pthr)
 
 	/* Copied field is cleared on test launch stage */
 	while (pthr->copied < total_size) {
-		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size);
+		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size, use_dma);
 		if (ret) {
 			dev_err(&perf->ntb->dev, "%d: Got error %d on test\n",
 				pthr->tidx, ret);
@@ -1018,6 +1031,66 @@ static void perf_clear_test(struct perf_thread *pthr)
 	kfree(pthr->src);
 }
 
+static int perf_run_latency(struct perf_thread *pthr)
+{
+	struct perf_peer *peer = pthr->perf->test_peer;
+	struct ntb_dev *ntb = pthr->perf->ntb;
+	void *flt_src;
+	void __iomem *flt_dst, *bnd_dst;
+	int ret;
+	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
+
+	pthr->tries = 0;
+	pthr->latency = ktime_get();
+	flt_src = pthr->src;
+	flt_dst = peer->outbuf;
+	bnd_dst = peer->outbuf + peer->outbuf_size;
+
+	while (ktime_get_real_fast_ns() < stop_at) {
+		ret = perf_copy_chunk(pthr, flt_dst, flt_src, 1, false);
+		if (ret) {
+			dev_err(&ntb->dev, "%d: Latency testing error %d\n",
+				pthr->tidx, ret);
+			pthr->latency = ktime_set(0, 0);
+			return ret;
+		}
+
+		pthr->tries++;
+		flt_dst++;
+		flt_src++;
+
+		if (flt_dst >= bnd_dst || flt_dst < peer->outbuf) {
+			flt_dst = peer->outbuf;
+			flt_src = pthr->src;
+		}
+
+		/* Avoid processor soft lock-ups */
+		if (!(pthr->tries % RESCHEDULE_RATIO))
+			schedule();
+	}
+
+	/* Stop timer */
+	pthr->latency = ktime_sub(ktime_get(), pthr->latency);
+
+	if (pthr->tries < LAT_MIN_TRIES) {
+		dev_err(&ntb->dev, "%d: Too few steps (%d) to measure Latency, "
+			"recommended > %d. Increase value of 'lat_time_ms' "
+			"parameter\n", pthr->tidx, pthr->tries, LAT_MIN_TRIES);
+		pthr->latency = ktime_set(0, 0);
+		return -EINVAL;
+	}
+
+	dev_dbg(&ntb->dev, "%d: made %llu tries, lasted %llu usecs\n",
+		pthr->tidx, pthr->tries, ktime_to_us(pthr->latency));
+
+	pthr->latency = ns_to_ktime(ktime_divns(pthr->latency, pthr->tries));
+
+	dev_dbg(&ntb->dev, "%d: latency %llu us (%llu ns)\n", pthr->tidx,
+		ktime_to_us(pthr->latency), ktime_to_ns(pthr->latency));
+
+	return 0;
+}
+
 static void perf_thread_work(struct work_struct *work)
 {
 	struct perf_thread *pthr = to_thread_work(work);
@@ -1043,6 +1116,11 @@ static void perf_thread_work(struct work_struct *work)
 	}
 
 	pthr->status = perf_sync_test(pthr);
+	if (pthr->status)
+		goto err_clear_test;
+
+	if (perf_latency)
+		pthr->status = perf_run_latency(pthr);
 
 err_clear_test:
 	perf_clear_test(pthr);
@@ -1142,6 +1220,18 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
 			"%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
 			tidx, pthr->copied, ktime_to_us(pthr->duration),
 			div64_u64(pthr->copied, ktime_to_us(pthr->duration)));
+
+		if (perf_latency && ktime_compare(pthr->latency, ktime_set(0, 0))) {
+			if (ktime_to_us(pthr->latency) < 10) {
+				(*pos) += scnprintf(buf + *pos, size - *pos,
+						"%d: latency %llu ns\n",
+						tidx, ktime_to_ns(pthr->latency));
+			} else {
+				(*pos) += scnprintf(buf + *pos, size - *pos,
+						"%d: latency %llu us\n",
+						tidx, ktime_to_us(pthr->latency));
+			}
+		}
 	}
 
 	clear_bit_unlock(0, &perf->busy_flag);
@@ -1344,12 +1434,48 @@ static ssize_t perf_dbgfs_write_tcnt(struct file *filep,
 	return size;
 }
 
+static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
+				    size_t size, loff_t *offp)
+{
+	struct perf_ctx *perf = filep->private_data;
+	char *buf;
+	ssize_t pos, ret;
+	size_t buf_size = min_t(size_t, size, PERF_BUF_LEN);
+	int tidx;
+
+	buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	pos = scnprintf(buf, buf_size, "    Peer %d latency try count:\n",
+			perf->test_peer->pidx);
+
+	for (tidx = 0; tidx < perf->tcnt; tidx++) {
+		struct perf_thread *pthr = &perf->threads[tidx];
+
+		pos += scnprintf(buf + pos, buf_size - pos,
+			"%d: made %llu tries\n",
+			tidx, pthr->tries);
+	}
+
+	ret = simple_read_from_buffer(ubuf, size, offp, buf, pos);
+
+	kfree(buf);
+
+	return ret;
+}
+
 static const struct file_operations perf_dbgfs_tcnt = {
 	.open = simple_open,
 	.read = perf_dbgfs_read_tcnt,
 	.write = perf_dbgfs_write_tcnt
 };
 
+static const struct file_operations perf_dbgfs_lattrs = {
+	.open = simple_open,
+	.read = perf_dbgfs_read_lattrs
+};
+
 static void perf_setup_dbgfs(struct perf_ctx *perf)
 {
 	struct pci_dev *pdev = perf->ntb->pdev;
@@ -1375,6 +1501,9 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
 	debugfs_create_u8("total_order", 0500, perf->dbgfs_dir, &total_order);
 
 	debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma);
+
+	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
+			    &perf_dbgfs_lattrs);
 }
 
 static void perf_clear_dbgfs(struct perf_ctx *perf)
-- 
2.30.2


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 2/3] ntb_perf: extend with poll latency measurement
  2022-04-22 11:37 [PATCH v2 1/3] ntb_perf: extend with burst latency measurement Alexander Fomichev
@ 2022-04-22 11:37 ` Alexander Fomichev
  2022-04-25 16:49   ` Dave Jiang
  2022-04-22 11:37 ` [PATCH v2 3/3] ntb_perf: extend with doorbell " Alexander Fomichev
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Alexander Fomichev @ 2022-04-22 11:37 UTC (permalink / raw)
  To: ntb
  Cc: linux, Jon Mason, Dave Jiang, Allen Hubbe, Guo Zhengkui,
	Alexander Fomichev

From: Alexander Fomichev <a.fomichev@yadro.com>

The proposed change extends ntb_perf with 2nd of 3 new metrics. The resulting
test is fully backward compatible.

2. Poll latency. It's a delay between start to send 1 byte to the
remote system and receiving the confirmation. The remote system needs to
be run in server mode beforehand. Then the server polls the input buffer
and on receiving data immediately sends the confirmation back.

Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
---
 drivers/ntb/test/ntb_perf.c | 365 +++++++++++++++++++++++++++++++++++-
 1 file changed, 364 insertions(+), 1 deletion(-)

diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index 79faa4a7fe94..d4664410e543 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -85,7 +85,7 @@
 #include <linux/ntb.h>
 
 #define DRIVER_NAME		"ntb_perf"
-#define DRIVER_VERSION		"2.0"
+#define DRIVER_VERSION		"2.1"
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRIVER_VERSION);
@@ -133,6 +133,10 @@ static unsigned long lat_time_ms = 1000; /* default 1s */
 module_param(lat_time_ms, ulong, 0644);
 MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
 
+static unsigned long lat_timeout_us = 500;
+module_param(lat_timeout_us, ulong, 0644);
+MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply");
+
 /*==============================================================================
  *                         Perf driver data definition
  *==============================================================================
@@ -149,6 +153,11 @@ enum perf_cmd {
 	PERF_STS_LNKUP = 6, /* link up state flag */
 };
 
+enum run_mode {
+	RUN_PL_CLIENT,
+	RUN_PL_SERVER,
+};
+
 struct perf_ctx;
 
 struct perf_peer {
@@ -197,6 +206,21 @@ struct perf_thread {
 #define to_thread_work(__work) \
 	container_of(__work, struct perf_thread, work)
 
+struct perf_poll_lat_data {
+	struct perf_ctx *perf;
+	void *src;
+	ktime_t latency;
+	u64 tries;
+	int status;
+	atomic_t running;
+	struct work_struct clt_work;
+	struct work_struct srv_work;
+};
+#define to_pldata_clt_work(__work) \
+	container_of(__work, struct perf_poll_lat_data, clt_work)
+#define to_pldata_srv_work(__work) \
+	container_of(__work, struct perf_poll_lat_data, srv_work)
+
 struct perf_ctx {
 	struct ntb_dev *ntb;
 
@@ -204,6 +228,7 @@ struct perf_ctx {
 	int gidx;
 	int pcnt;
 	struct perf_peer *peers;
+	struct perf_poll_lat_data pldata;
 
 	/* Performance measuring work-threads interface */
 	unsigned long busy_flag;
@@ -252,6 +277,8 @@ static struct dentry *perf_dbgfs_topdir;
 
 static struct workqueue_struct *perf_wq __read_mostly;
 
+static const u8 stop_word = 0xFF;
+
 /*==============================================================================
  *                  NTB cross-link commands execution service
  *==============================================================================
@@ -1126,6 +1153,184 @@ static void perf_thread_work(struct work_struct *work)
 	perf_clear_test(pthr);
 }
 
+static int perf_init_pl(struct perf_poll_lat_data *pldata)
+{
+	struct perf_ctx *perf = pldata->perf;
+	struct perf_peer *peer = perf->test_peer;
+	u8 *bp;
+
+	pldata->src = kmalloc_node(peer->outbuf_size, GFP_KERNEL,
+				 dev_to_node(&perf->ntb->dev));
+	if (!pldata->src)
+		return -ENOMEM;
+
+	/*
+	 * Prepare random data to send, guaranteed exclusion of 0x00 (unreceived)
+	 * and 0xFF (stop_word)
+	 */
+	get_random_bytes(pldata->src, peer->outbuf_size);
+	for (bp = pldata->src; bp < (u8 *) pldata->src + peer->outbuf_size; bp++)
+		while (*bp == 0 || *bp == stop_word)
+			*bp = (u8)get_random_int();
+
+	memset(peer->inbuf, 0, peer->inbuf_size);
+
+	return 0;
+}
+
+static int perf_poll_peer_reply(volatile u8 *cur)
+{
+	u64 wait_till = ktime_get_real_fast_ns() + lat_timeout_us * NSEC_PER_USEC;
+
+	while (ktime_get_real_fast_ns() < wait_till) {
+		if (*cur == stop_word) {
+			*cur = 0;
+			return 1;
+		}
+		if (*cur != 0) {
+			*cur = 0;
+			return 0;
+		}
+	}
+	return -EINTR;
+}
+
+static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
+{
+	struct perf_peer *peer = pldata->perf->test_peer;
+	struct ntb_dev *ntb = pldata->perf->ntb;
+	void *src = pldata->src;
+	int ret;
+	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
+
+	dev_dbg(&ntb->dev, "poll_lat: client started.\n");
+
+	pldata->tries = 0;
+	pldata->latency = ktime_get();
+
+	while (ktime_get_real_fast_ns() < stop_at) {
+		memcpy_toio(peer->outbuf, src, 1);
+
+		/* Avoid processor soft lock-ups */
+		schedule();
+
+		ret = perf_poll_peer_reply(peer->inbuf);
+		if (ret < 0) {
+			dev_err(&ntb->dev, "Timeout waiting for peer reply on poll latency\n");
+			pldata->latency = ktime_set(0, 0);
+			return -EINTR;
+		} else if (ret == 1) {
+			dev_warn(&ntb->dev, "Server terminated on poll latency, stopping\n");
+			break;
+		} else if (!atomic_read(&pldata->running)) {
+			dev_err(&ntb->dev, "Poll latency client terminated\n");
+			return -EINTR;
+		}
+
+		pldata->tries++;
+		src++;
+
+		if (src >= pldata->src + peer->outbuf_size)
+			src = pldata->src;
+	}
+
+	/* Stop timer */
+	pldata->latency = ktime_sub(ktime_get(), pldata->latency);
+	/* Send stop to peer */
+	memcpy_toio(peer->outbuf, &stop_word, 1);
+
+	if (pldata->tries < LAT_MIN_TRIES) {
+		dev_err(&ntb->dev, "Too few steps (%d) to measure Latency, "
+			"recommended > %d. Increase value of 'lat_time_ms' "
+			"parameter\n", pldata->tries, LAT_MIN_TRIES);
+		pldata->latency = ktime_set(0, 0);
+		return -EINVAL;
+	}
+
+	dev_dbg(&ntb->dev, "poll_lat: made %llu tries, lasted %llu usecs\n",
+		pldata->tries, ktime_to_us(pldata->latency));
+
+	pldata->latency = ns_to_ktime(ktime_divns(pldata->latency, pldata->tries));
+
+	dev_dbg(&ntb->dev, "poll_lat: latency %llu us (%llu ns)\n",
+		ktime_to_us(pldata->latency), ktime_to_ns(pldata->latency));
+
+	return 0;
+}
+
+static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
+{
+	struct perf_peer *peer = pldata->perf->test_peer;
+	struct ntb_dev *ntb = pldata->perf->ntb;
+	void *src = pldata->src;
+	int ret = 0;
+
+	dev_dbg(&ntb->dev, "poll_lat: server started.\n");
+
+	pldata->tries = 0;
+
+	while (ret != 1 && atomic_read(&pldata->running)) {
+		ret = perf_poll_peer_reply(peer->inbuf);
+		if (!ret) {
+			/* Pong to client */
+			memcpy_toio(peer->outbuf, src++, 1);
+			if (src >= pldata->src + peer->outbuf_size)
+				src = pldata->src;
+
+			pldata->tries++;
+		}
+
+		/* Avoid processor soft lock-ups */
+		schedule();
+	}
+
+	if (pldata->tries < LAT_MIN_TRIES)
+		dev_warn(&ntb->dev, "Poll latency test terminated too early. "
+				"Increase client's test time\n");
+
+	dev_dbg(&ntb->dev, "poll_lat: server stopped, had responded %llu times\n",
+		pldata->tries);
+
+	return atomic_read(&pldata->running) ? -ENODATA : -EINTR;
+}
+
+static void perf_clear_pl(struct perf_poll_lat_data *pldata)
+{
+	struct perf_ctx *perf = pldata->perf;
+	struct perf_peer *peer = perf->test_peer;
+
+	memset(peer->inbuf, stop_word, 1);
+	atomic_set(&pldata->running, 0);
+	wake_up(&perf->twait);
+	kfree(pldata->src);
+}
+
+static void perf_poll_lat_client_work(struct work_struct *work)
+{
+	struct perf_poll_lat_data *pldata = to_pldata_clt_work(work);
+
+	pldata->status = perf_init_pl(pldata);
+	if (pldata->status)
+		return;
+
+	pldata->status = perf_run_pl_client(pldata);
+
+	perf_clear_pl(pldata);
+}
+
+static void perf_poll_lat_server_work(struct work_struct *work)
+{
+	struct perf_poll_lat_data *pldata = to_pldata_srv_work(work);
+
+	pldata->status = perf_init_pl(pldata);
+	if (pldata->status)
+		return;
+
+	pldata->status = perf_run_pl_server(pldata);
+
+	perf_clear_pl(pldata);
+}
+
 static int perf_set_tcnt(struct perf_ctx *perf, u8 tcnt)
 {
 	if (tcnt == 0 || tcnt > MAX_THREADS_CNT)
@@ -1146,7 +1351,10 @@ static void perf_terminate_test(struct perf_ctx *perf)
 	int tidx;
 
 	atomic_set(&perf->tsync, -1);
+	atomic_set(&perf->pldata.running, 0);
 	wake_up(&perf->twait);
+	cancel_work_sync(&perf->pldata.srv_work);
+	cancel_work_sync(&perf->pldata.clt_work);
 
 	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
 		wake_up(&perf->threads[tidx].dma_wait);
@@ -1192,6 +1400,46 @@ static int perf_submit_test(struct perf_peer *peer)
 	return ret;
 }
 
+static int perf_submit_poll_lat(struct perf_peer *peer, enum run_mode mode)
+{
+	struct perf_ctx *perf = peer->perf;
+	int ret;
+
+	ret = wait_for_completion_interruptible(&peer->init_comp);
+	if (ret < 0)
+		return ret;
+
+	if (test_and_set_bit_lock(0, &perf->busy_flag))
+		return -EBUSY;
+
+	perf->test_peer = peer;
+	atomic_set(&perf->pldata.running, 1);
+	perf->pldata.status = -ENODATA;
+	perf->pldata.tries = 0;
+	perf->pldata.latency = ktime_set(0, 0);
+
+	switch (mode) {
+	case RUN_PL_SERVER:
+		(void)queue_work(perf_wq, &perf->pldata.srv_work);
+		break;
+	case RUN_PL_CLIENT:
+	default:
+		(void)queue_work(perf_wq, &perf->pldata.clt_work);
+		break;
+	}
+
+	ret = wait_event_interruptible(perf->twait,
+				       !atomic_read(&perf->pldata.running));
+	if (ret == -ERESTARTSYS) {
+		perf_terminate_test(perf);
+		ret = -EINTR;
+	}
+
+	clear_bit_unlock(0, &perf->busy_flag);
+
+	return ret;
+}
+
 static int perf_read_stats(struct perf_ctx *perf, char *buf,
 			   size_t size, ssize_t *pos)
 {
@@ -1234,6 +1482,24 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
 		}
 	}
 
+	if (perf->pldata.status != -ENODATA) {
+		(*pos) += scnprintf(buf + *pos, size - *pos, "\n");
+		if (perf->pldata.status) {
+			(*pos) += scnprintf(buf + *pos, size - *pos,
+				"poll latency: error status %d\n", perf->pldata.status);
+		} else {
+			if (ktime_to_us(perf->pldata.latency) < 10) {
+				(*pos) += scnprintf(buf + *pos, size - *pos,
+						"poll latency %llu ns\n",
+						ktime_to_ns(perf->pldata.latency));
+			} else {
+				(*pos) += scnprintf(buf + *pos, size - *pos,
+						"poll latency %llu us\n",
+						ktime_to_us(perf->pldata.latency));
+			}
+		}
+	}
+
 	clear_bit_unlock(0, &perf->busy_flag);
 
 	return 0;
@@ -1247,6 +1513,10 @@ static void perf_init_threads(struct perf_ctx *perf)
 	perf->tcnt = DEF_THREADS_CNT;
 	perf->test_peer = &perf->peers[0];
 	init_waitqueue_head(&perf->twait);
+	perf->pldata.perf = perf;
+	INIT_WORK(&perf->pldata.srv_work, perf_poll_lat_server_work);
+	INIT_WORK(&perf->pldata.clt_work, perf_poll_lat_client_work);
+	perf->pldata.status = -ENODATA;
 
 	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
 		pthr = &perf->threads[tidx];
@@ -1403,6 +1673,64 @@ static const struct file_operations perf_dbgfs_run = {
 	.write = perf_dbgfs_write_run
 };
 
+static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ubuf,
+				    size_t size, loff_t *offp, enum run_mode mode)
+{
+	struct perf_ctx *perf = filep->private_data;
+	struct ntb_dev *ntb = perf->ntb;
+	struct perf_peer *peer;
+	int pidx, ret;
+
+	ret = kstrtoint_from_user(ubuf, size, 0, &pidx);
+	if (ret)
+		return ret;
+
+	if (pidx < 0 && mode == RUN_PL_SERVER) {
+		dev_dbg(&ntb->dev, "poll_lat: kill server\n");
+		if (test_bit(0, &perf->busy_flag)) {
+			peer = perf->test_peer;
+			/* Send stop to client */
+			memcpy_toio(peer->outbuf, &stop_word, 1);
+		}
+		perf_terminate_test(perf);
+		clear_bit_unlock(0, &perf->busy_flag);
+		return size;
+	}
+
+	if (pidx < 0 || pidx >= perf->pcnt)
+		return -EINVAL;
+
+	peer = &perf->peers[pidx];
+
+	ret = perf_submit_poll_lat(peer, mode);
+
+	return ret ? ret : size;
+}
+
+static ssize_t perf_dbgfs_write_run_client(struct file *filep, const char __user *ubuf,
+				    size_t size, loff_t *offp)
+{
+	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_CLIENT);
+}
+
+static const struct file_operations perf_dbgfs_run_client = {
+	.open = simple_open,
+	.read = perf_dbgfs_read_run,
+	.write = perf_dbgfs_write_run_client
+};
+
+static ssize_t perf_dbgfs_write_run_server(struct file *filep, const char __user *ubuf,
+				    size_t size, loff_t *offp)
+{
+	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_SERVER);
+}
+
+static const struct file_operations perf_dbgfs_run_server = {
+	.open = simple_open,
+	.read = perf_dbgfs_read_run,
+	.write = perf_dbgfs_write_run_server
+};
+
 static ssize_t perf_dbgfs_read_tcnt(struct file *filep, char __user *ubuf,
 				    size_t size, loff_t *offp)
 {
@@ -1465,6 +1793,24 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
 	return ret;
 }
 
+static ssize_t perf_dbgfs_read_inbuf(struct file *filep, char __user *ubuf,
+				    size_t size, loff_t *offp)
+{
+	struct perf_ctx *perf = filep->private_data;
+	char buf[32];
+	ssize_t pos;
+	u64 *value;
+
+	if (!perf->test_peer || !perf->test_peer->inbuf) {
+		pos = scnprintf(buf, sizeof(buf), "NULL\n");
+	} else {
+		value = perf->test_peer->inbuf;
+		pos = scnprintf(buf, sizeof(buf), "0x%llx\n", *value);
+	}
+
+	return simple_read_from_buffer(ubuf, size, offp, buf, pos);
+}
+
 static const struct file_operations perf_dbgfs_tcnt = {
 	.open = simple_open,
 	.read = perf_dbgfs_read_tcnt,
@@ -1476,6 +1822,11 @@ static const struct file_operations perf_dbgfs_lattrs = {
 	.read = perf_dbgfs_read_lattrs
 };
 
+static const struct file_operations perf_dbgfs_inbuf = {
+	.open = simple_open,
+	.read = perf_dbgfs_read_inbuf,
+};
+
 static void perf_setup_dbgfs(struct perf_ctx *perf)
 {
 	struct pci_dev *pdev = perf->ntb->pdev;
@@ -1492,6 +1843,12 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
 	debugfs_create_file("run", 0600, perf->dbgfs_dir, perf,
 			    &perf_dbgfs_run);
 
+	debugfs_create_file("run_client", 0600, perf->dbgfs_dir, perf,
+			    &perf_dbgfs_run_client);
+
+	debugfs_create_file("run_server", 0600, perf->dbgfs_dir, perf,
+			    &perf_dbgfs_run_server);
+
 	debugfs_create_file("threads_count", 0600, perf->dbgfs_dir, perf,
 			    &perf_dbgfs_tcnt);
 
@@ -1504,6 +1861,12 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
 
 	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
 			    &perf_dbgfs_lattrs);
+
+	debugfs_create_u64("poll_latency_tries", 0400, perf->dbgfs_dir,
+				&perf->pldata.tries);
+
+	debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf,
+			    &perf_dbgfs_inbuf);
 }
 
 static void perf_clear_dbgfs(struct perf_ctx *perf)
-- 
2.30.2


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 3/3] ntb_perf: extend with doorbell latency measurement
  2022-04-22 11:37 [PATCH v2 1/3] ntb_perf: extend with burst latency measurement Alexander Fomichev
  2022-04-22 11:37 ` [PATCH v2 2/3] ntb_perf: extend with poll " Alexander Fomichev
@ 2022-04-22 11:37 ` Alexander Fomichev
  2022-04-25 16:55   ` Dave Jiang
  2022-04-25 16:38 ` [PATCH v2 1/3] ntb_perf: extend with burst " Dave Jiang
  2022-04-28 11:27 ` Serge Semin
  3 siblings, 1 reply; 10+ messages in thread
From: Alexander Fomichev @ 2022-04-22 11:37 UTC (permalink / raw)
  To: ntb
  Cc: linux, Jon Mason, Dave Jiang, Allen Hubbe, Guo Zhengkui,
	Alexander Fomichev

From: Alexander Fomichev <a.fomichev@yadro.com>

The proposed change extends ntb_perf with 3d of 3 new metrics. The resulting
test is fully backward compatible.

3. Doorbell latency. It's a delay between start to ring an NTB doorbell
and receiving the confirmation. The remote system needs to be run in server
mode beforehand. Then the server waits for a doorbell event and immediately
rings self doorbell to confirm.

Thanks-to: Guo Zhengkui <guozhengkui@vivo.com>
Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
---
 drivers/ntb/test/ntb_perf.c | 541 ++++++++++++++++++++++++++++--------
 1 file changed, 420 insertions(+), 121 deletions(-)

diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index d4664410e543..26f0e801ea93 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -6,6 +6,7 @@
  *
  *   Copyright(c) 2015 Intel Corporation. All rights reserved.
  *   Copyright(c) 2017 T-Platforms. All Rights Reserved.
+ *   Copyright(c) 2022 YADRO. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -15,6 +16,7 @@
  *
  *   Copyright(c) 2015 Intel Corporation. All rights reserved.
  *   Copyright(c) 2017 T-Platforms. All Rights Reserved.
+ *   Copyright(c) 2022 YADRO. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -65,6 +67,22 @@
  *
  * root@self# echo 0 > $DBG_DIR/run
  * root@self# cat $DBG_DIR/run
+ *-----------------------------------------------------------------------------
+ * Eg: start latency test with peer (index 0) poll-waiting and get the metrics
+ *
+ * Server side:
+ * root@self# echo 0 > $DBG_DIR/poll_latency/run_server
+ * Client side:
+ * root@self# echo 0 > $DBG_DIR/poll_latency/run_client
+ * root@self# cat $DBG_DIR/poll_latency/run_client
+ *-----------------------------------------------------------------------------
+ * Eg: start doorbell latency test with peer (index 0) and get the metrics
+ *
+ * Server side:
+ * root@self# echo 0 > $DBG_DIR/db_latency/run_server
+ * Client side:
+ * root@self# echo 0 > $DBG_DIR/db_latency/run_client
+ * root@self# cat $DBG_DIR/db_latency/run_client
  */
 
 #include <linux/init.h>
@@ -76,6 +94,7 @@
 #include <linux/dmaengine.h>
 #include <linux/pci.h>
 #include <linux/ktime.h>
+#include <linux/jiffies.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/sizes.h>
@@ -85,7 +104,7 @@
 #include <linux/ntb.h>
 
 #define DRIVER_NAME		"ntb_perf"
-#define DRIVER_VERSION		"2.1"
+#define DRIVER_VERSION		"2.2"
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRIVER_VERSION);
@@ -137,6 +156,10 @@ static unsigned long lat_timeout_us = 500;
 module_param(lat_timeout_us, ulong, 0644);
 MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply");
 
+static unsigned long peer_timeout_s = 60;
+module_param(peer_timeout_s, ulong, 0644);
+MODULE_PARM_DESC(peer_timeout_s, "Timeout (in s) to wait for peer link");
+
 /*==============================================================================
  *                         Perf driver data definition
  *==============================================================================
@@ -156,9 +179,18 @@ enum perf_cmd {
 enum run_mode {
 	RUN_PL_CLIENT,
 	RUN_PL_SERVER,
+	RUN_DBL_CLIENT,
+	RUN_DBL_SERVER,
 };
 
 struct perf_ctx;
+struct perf_ext_lat_data;
+
+struct perf_ext_lat_ops {
+	int (*init)(struct perf_ext_lat_data *data);
+	int (*run)(struct perf_ext_lat_data *data);
+	void (*clear)(struct perf_ext_lat_data *data);
+};
 
 struct perf_peer {
 	struct perf_ctx	*perf;
@@ -206,20 +238,21 @@ struct perf_thread {
 #define to_thread_work(__work) \
 	container_of(__work, struct perf_thread, work)
 
-struct perf_poll_lat_data {
+struct perf_ext_lat_data {
 	struct perf_ctx *perf;
-	void *src;
 	ktime_t latency;
 	u64 tries;
 	int status;
-	atomic_t running;
-	struct work_struct clt_work;
-	struct work_struct srv_work;
+	struct perf_ext_lat_ops ops;
+	struct work_struct work;
+
+	union {
+		void *src;
+		int db;
+	};
 };
-#define to_pldata_clt_work(__work) \
-	container_of(__work, struct perf_poll_lat_data, clt_work)
-#define to_pldata_srv_work(__work) \
-	container_of(__work, struct perf_poll_lat_data, srv_work)
+#define to_ext_lat_data(__work) \
+	container_of(__work, struct perf_ext_lat_data, work)
 
 struct perf_ctx {
 	struct ntb_dev *ntb;
@@ -228,7 +261,12 @@ struct perf_ctx {
 	int gidx;
 	int pcnt;
 	struct perf_peer *peers;
-	struct perf_poll_lat_data pldata;
+
+	/* Ext latency tests interface */
+	enum run_mode mode;
+	struct perf_ext_lat_data pldata;
+	struct perf_ext_lat_data dbldata;
+	atomic_t running;
 
 	/* Performance measuring work-threads interface */
 	unsigned long busy_flag;
@@ -541,6 +579,15 @@ static void perf_link_event(void *ctx)
 	}
 }
 
+static inline void perf_dbl_pong(struct perf_ctx *perf)
+{
+	struct perf_ext_lat_data *data = &perf->dbldata;
+
+	ntb_db_clear(perf->ntb, BIT_ULL(data->db));
+	data->tries++;
+	ntb_peer_db_set(perf->ntb, BIT_ULL(data->db));
+}
+
 static void perf_db_event(void *ctx, int vec)
 {
 	struct perf_ctx *perf = ctx;
@@ -549,7 +596,11 @@ static void perf_db_event(void *ctx, int vec)
 		ntb_db_vector_mask(perf->ntb, vec), ntb_db_read(perf->ntb));
 
 	/* Just receive all available commands */
-	(void)perf_cmd_recv(perf);
+	if (perf->dbldata.db >= 0 &&
+				BIT_ULL(perf->dbldata.db) & ntb_db_read(perf->ntb))
+		perf_dbl_pong(perf);
+	else
+		(void)perf_cmd_recv(perf);
 }
 
 static void perf_msg_event(void *ctx)
@@ -704,6 +755,8 @@ static int perf_init_service(struct perf_ctx *perf)
 		return -EINVAL;
 	}
 
+	perf->dbldata.db = -1;
+
 	if (ntb_msg_count(perf->ntb) >= PERF_MSG_CNT) {
 		perf->cmd_send = perf_msg_cmd_send;
 		perf->cmd_recv = perf_msg_cmd_recv;
@@ -1153,14 +1206,14 @@ static void perf_thread_work(struct work_struct *work)
 	perf_clear_test(pthr);
 }
 
-static int perf_init_pl(struct perf_poll_lat_data *pldata)
+static int perf_init_pl(struct perf_ext_lat_data *pldata)
 {
 	struct perf_ctx *perf = pldata->perf;
 	struct perf_peer *peer = perf->test_peer;
 	u8 *bp;
 
 	pldata->src = kmalloc_node(peer->outbuf_size, GFP_KERNEL,
-				 dev_to_node(&perf->ntb->dev));
+				dev_to_node(&perf->ntb->dev));
 	if (!pldata->src)
 		return -ENOMEM;
 
@@ -1195,10 +1248,11 @@ static int perf_poll_peer_reply(volatile u8 *cur)
 	return -EINTR;
 }
 
-static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
+static int perf_run_pl_client(struct perf_ext_lat_data *pldata)
 {
-	struct perf_peer *peer = pldata->perf->test_peer;
-	struct ntb_dev *ntb = pldata->perf->ntb;
+	struct perf_ctx *perf = pldata->perf;
+	struct perf_peer *peer = perf->test_peer;
+	struct ntb_dev *ntb = perf->ntb;
 	void *src = pldata->src;
 	int ret;
 	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
@@ -1222,7 +1276,7 @@ static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
 		} else if (ret == 1) {
 			dev_warn(&ntb->dev, "Server terminated on poll latency, stopping\n");
 			break;
-		} else if (!atomic_read(&pldata->running)) {
+		} else if (!atomic_read(&perf->running)) {
 			dev_err(&ntb->dev, "Poll latency client terminated\n");
 			return -EINTR;
 		}
@@ -1258,10 +1312,11 @@ static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
 	return 0;
 }
 
-static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
+static int perf_run_pl_server(struct perf_ext_lat_data *pldata)
 {
-	struct perf_peer *peer = pldata->perf->test_peer;
-	struct ntb_dev *ntb = pldata->perf->ntb;
+	struct perf_ctx *perf = pldata->perf;
+	struct perf_peer *peer = perf->test_peer;
+	struct ntb_dev *ntb = perf->ntb;
 	void *src = pldata->src;
 	int ret = 0;
 
@@ -1269,7 +1324,7 @@ static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
 
 	pldata->tries = 0;
 
-	while (ret != 1 && atomic_read(&pldata->running)) {
+	while (ret != 1 && atomic_read(&perf->running)) {
 		ret = perf_poll_peer_reply(peer->inbuf);
 		if (!ret) {
 			/* Pong to client */
@@ -1291,44 +1346,130 @@ static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
 	dev_dbg(&ntb->dev, "poll_lat: server stopped, had responded %llu times\n",
 		pldata->tries);
 
-	return atomic_read(&pldata->running) ? -ENODATA : -EINTR;
+	return atomic_read(&perf->running) ? -ENODATA : -EINTR;
 }
 
-static void perf_clear_pl(struct perf_poll_lat_data *pldata)
+static void perf_clear_pl(struct perf_ext_lat_data *pldata)
 {
 	struct perf_ctx *perf = pldata->perf;
 	struct perf_peer *peer = perf->test_peer;
 
 	memset(peer->inbuf, stop_word, 1);
-	atomic_set(&pldata->running, 0);
+	atomic_set(&perf->running, 0);
 	wake_up(&perf->twait);
 	kfree(pldata->src);
 }
 
-static void perf_poll_lat_client_work(struct work_struct *work)
+static struct perf_ext_lat_ops perf_pl_client_ops = {
+	.init = perf_init_pl,
+	.run = perf_run_pl_client,
+	.clear = perf_clear_pl
+};
+
+static struct perf_ext_lat_ops perf_pl_server_ops = {
+	.init = perf_init_pl,
+	.run = perf_run_pl_server,
+	.clear = perf_clear_pl
+};
+
+static int perf_init_dbl(struct perf_ext_lat_data *data)
 {
-	struct perf_poll_lat_data *pldata = to_pldata_clt_work(work);
+	struct perf_ctx *perf = data->perf;
 
-	pldata->status = perf_init_pl(pldata);
-	if (pldata->status)
-		return;
+	data->db = get_bitmask_order(ntb_db_valid_mask(perf->ntb)) - 1;
+	dev_dbg(&perf->ntb->dev, "DB bit for latency test: %d\n", data->db);
+
+	if (data->db <= perf->gidx) {
+		dev_err(&perf->ntb->dev, "No spare DoorBell found\n");
+		data->db = -1;
+		return -ENOSPC;
+	}
+
+	return ntb_db_clear_mask(perf->ntb, BIT_ULL(data->db));
+}
+
+static int perf_run_dbl_client(struct perf_ext_lat_data *data)
+{
+	struct perf_ctx *perf = data->perf;
+	struct ntb_dev *ntb = perf->ntb;
+	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
+
+	dev_dbg(&ntb->dev, "db_lat: client started.\n");
+
+	data->tries = 0;
+	data->latency = ktime_get();
+
+	if (ntb_peer_db_set(perf->ntb, BIT_ULL(data->db)))
+		return -EIO;
+
+	while (ktime_get_real_fast_ns() < stop_at) {
+		/* Avoid processor soft lock-ups */
+		schedule();
+
+		if (!atomic_read(&perf->running)) {
+			dev_err(&ntb->dev, "DoorBell latency client terminated\n");
+			return -EINTR;
+		}
+	}
 
-	pldata->status = perf_run_pl_client(pldata);
+	/* Stop timer */
+	data->latency = ktime_sub(ktime_get(), data->latency);
+
+	if (data->tries < LAT_MIN_TRIES) {
+		dev_err(&ntb->dev, "Too few steps (%d) to measure Latency, "
+			"recommended > %d. Increase value of 'lat_time_ms' "
+			"parameter\n", data->tries, LAT_MIN_TRIES);
+		data->latency = ktime_set(0, 0);
+		return -EINVAL;
+	}
+
+	dev_dbg(&ntb->dev, "db_lat: made %llu tries, lasted %llu usecs\n",
+		data->tries, ktime_to_us(data->latency));
+
+	data->latency = ns_to_ktime(ktime_divns(data->latency, data->tries));
+
+	dev_dbg(&ntb->dev, "db_lat: latency %llu us (%llu ns)\n",
+		ktime_to_us(data->latency), ktime_to_ns(data->latency));
+
+	return 0;
+}
+
+static void perf_clear_dbl(struct perf_ext_lat_data *data)
+{
+	struct perf_ctx *perf = data->perf;
 
-	perf_clear_pl(pldata);
+	data->db = -1;
+	ntb_db_set_mask(perf->ntb, BIT_ULL(data->db));
+	atomic_set(&perf->running, 0);
+	wake_up(&perf->twait);
 }
 
-static void perf_poll_lat_server_work(struct work_struct *work)
+static struct perf_ext_lat_ops perf_dbl_client_ops = {
+	.init = perf_init_dbl,
+	.run = perf_run_dbl_client,
+	.clear = perf_clear_dbl
+};
+
+static void perf_ext_lat_work(struct work_struct *work)
 {
-	struct perf_poll_lat_data *pldata = to_pldata_srv_work(work);
+	struct perf_ext_lat_data *data = to_ext_lat_data(work);
 
-	pldata->status = perf_init_pl(pldata);
-	if (pldata->status)
+	if (!data->ops.init || !data->ops.run || !data->ops.clear) {
+		struct perf_ctx *perf = data->perf;
+
+		data->status = -EFAULT;
+		atomic_set(&perf->running, 0);
+		wake_up(&perf->twait);
 		return;
+	}
 
-	pldata->status = perf_run_pl_server(pldata);
+	data->status = data->ops.init(data);
+	if (data->status)
+		return;
 
-	perf_clear_pl(pldata);
+	data->status = data->ops.run(data);
+
+	data->ops.clear(data);
 }
 
 static int perf_set_tcnt(struct perf_ctx *perf, u8 tcnt)
@@ -1351,10 +1492,10 @@ static void perf_terminate_test(struct perf_ctx *perf)
 	int tidx;
 
 	atomic_set(&perf->tsync, -1);
-	atomic_set(&perf->pldata.running, 0);
+	atomic_set(&perf->running, 0);
 	wake_up(&perf->twait);
-	cancel_work_sync(&perf->pldata.srv_work);
-	cancel_work_sync(&perf->pldata.clt_work);
+	cancel_work_sync(&perf->pldata.work);
+	cancel_work_sync(&perf->dbldata.work);
 
 	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
 		wake_up(&perf->threads[tidx].dma_wait);
@@ -1368,9 +1509,10 @@ static int perf_submit_test(struct perf_peer *peer)
 	struct perf_thread *pthr;
 	int tidx, ret;
 
-	ret = wait_for_completion_interruptible(&peer->init_comp);
-	if (ret < 0)
-		return ret;
+	ret = wait_for_completion_interruptible_timeout(&peer->init_comp,
+			msecs_to_jiffies(peer_timeout_s * 1000));
+	if (ret <= 0)
+		return ret ? ret : -ETIMEDOUT;
 
 	if (test_and_set_bit_lock(0, &perf->busy_flag))
 		return -EBUSY;
@@ -1400,41 +1542,58 @@ static int perf_submit_test(struct perf_peer *peer)
 	return ret;
 }
 
-static int perf_submit_poll_lat(struct perf_peer *peer, enum run_mode mode)
+static int perf_submit_ext_lat(struct perf_peer *peer)
 {
 	struct perf_ctx *perf = peer->perf;
 	int ret;
 
-	ret = wait_for_completion_interruptible(&peer->init_comp);
-	if (ret < 0)
-		return ret;
+	ret = wait_for_completion_interruptible_timeout(&peer->init_comp,
+			msecs_to_jiffies(peer_timeout_s * 1000));
+	if (ret <= 0)
+		return ret ? ret : -ETIMEDOUT;
 
 	if (test_and_set_bit_lock(0, &perf->busy_flag))
 		return -EBUSY;
 
 	perf->test_peer = peer;
-	atomic_set(&perf->pldata.running, 1);
+	atomic_set(&perf->running, 1);
 	perf->pldata.status = -ENODATA;
 	perf->pldata.tries = 0;
 	perf->pldata.latency = ktime_set(0, 0);
+	perf->dbldata.status = -ENODATA;
+	perf->dbldata.tries = 0;
+	perf->dbldata.latency = ktime_set(0, 0);
 
-	switch (mode) {
+	switch (perf->mode) {
 	case RUN_PL_SERVER:
-		(void)queue_work(perf_wq, &perf->pldata.srv_work);
+		perf->pldata.ops = perf_pl_server_ops;
+		(void)queue_work(perf_wq, &perf->pldata.work);
 		break;
 	case RUN_PL_CLIENT:
-	default:
-		(void)queue_work(perf_wq, &perf->pldata.clt_work);
+		perf->pldata.ops = perf_pl_client_ops;
+		(void)queue_work(perf_wq, &perf->pldata.work);
+		break;
+	case RUN_DBL_SERVER:
+		ret = perf_init_dbl(&perf->dbldata);
+		dev_dbg(&perf->ntb->dev, "db_lat: server started.\n");
+		goto submit_exit;
+	case RUN_DBL_CLIENT:
+		perf->dbldata.ops = perf_dbl_client_ops;
+		(void)queue_work(perf_wq, &perf->dbldata.work);
 		break;
+	default:
+		ret = -EINVAL;
+		goto submit_exit;
 	}
 
 	ret = wait_event_interruptible(perf->twait,
-				       !atomic_read(&perf->pldata.running));
+				       !atomic_read(&perf->running));
 	if (ret == -ERESTARTSYS) {
 		perf_terminate_test(perf);
 		ret = -EINTR;
 	}
 
+submit_exit:
 	clear_bit_unlock(0, &perf->busy_flag);
 
 	return ret;
@@ -1482,30 +1641,12 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
 		}
 	}
 
-	if (perf->pldata.status != -ENODATA) {
-		(*pos) += scnprintf(buf + *pos, size - *pos, "\n");
-		if (perf->pldata.status) {
-			(*pos) += scnprintf(buf + *pos, size - *pos,
-				"poll latency: error status %d\n", perf->pldata.status);
-		} else {
-			if (ktime_to_us(perf->pldata.latency) < 10) {
-				(*pos) += scnprintf(buf + *pos, size - *pos,
-						"poll latency %llu ns\n",
-						ktime_to_ns(perf->pldata.latency));
-			} else {
-				(*pos) += scnprintf(buf + *pos, size - *pos,
-						"poll latency %llu us\n",
-						ktime_to_us(perf->pldata.latency));
-			}
-		}
-	}
-
 	clear_bit_unlock(0, &perf->busy_flag);
 
 	return 0;
 }
 
-static void perf_init_threads(struct perf_ctx *perf)
+static void perf_init_workers(struct perf_ctx *perf)
 {
 	struct perf_thread *pthr;
 	int tidx;
@@ -1513,11 +1654,15 @@ static void perf_init_threads(struct perf_ctx *perf)
 	perf->tcnt = DEF_THREADS_CNT;
 	perf->test_peer = &perf->peers[0];
 	init_waitqueue_head(&perf->twait);
+
 	perf->pldata.perf = perf;
-	INIT_WORK(&perf->pldata.srv_work, perf_poll_lat_server_work);
-	INIT_WORK(&perf->pldata.clt_work, perf_poll_lat_client_work);
+	INIT_WORK(&perf->pldata.work, perf_ext_lat_work);
 	perf->pldata.status = -ENODATA;
 
+	perf->dbldata.perf = perf;
+	INIT_WORK(&perf->dbldata.work, perf_ext_lat_work);
+	perf->dbldata.status = -ENODATA;
+
 	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
 		pthr = &perf->threads[tidx];
 
@@ -1529,7 +1674,7 @@ static void perf_init_threads(struct perf_ctx *perf)
 	}
 }
 
-static void perf_clear_threads(struct perf_ctx *perf)
+static void perf_clear_workers(struct perf_ctx *perf)
 {
 	perf_terminate_test(perf);
 }
@@ -1673,8 +1818,55 @@ static const struct file_operations perf_dbgfs_run = {
 	.write = perf_dbgfs_write_run
 };
 
-static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ubuf,
-				    size_t size, loff_t *offp, enum run_mode mode)
+static ssize_t perf_dbgfs_read_run_pl(struct file *filep, char __user *ubuf,
+				   size_t fsize, loff_t *offp)
+{
+	struct perf_ctx *perf = filep->private_data;
+	ssize_t ret;
+	ssize_t pos = 0;
+	ssize_t size = PERF_BUF_LEN;
+	char *buf;
+
+	if (test_and_set_bit_lock(0, &perf->busy_flag))
+		return -EBUSY;
+
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	pos += scnprintf(buf + pos, size - pos,
+		"    Peer %d test statistics:\n", perf->test_peer->pidx);
+
+	if (perf->pldata.status != -ENODATA) {
+		if (perf->pldata.status) {
+			pos += scnprintf(buf + pos, size - pos,
+				"poll latency: error status %d\n", perf->pldata.status);
+		} else {
+			if (ktime_to_us(perf->pldata.latency) < 10) {
+				pos += scnprintf(buf + pos, size - pos,
+						"poll latency %llu ns\n",
+						ktime_to_ns(perf->pldata.latency));
+			} else {
+				pos += scnprintf(buf + pos, size - pos,
+						"poll latency %llu us\n",
+						ktime_to_us(perf->pldata.latency));
+			}
+		}
+	} else {
+		pos += scnprintf(buf + pos, size - pos, "Test did not run\n");
+	}
+
+	ret = simple_read_from_buffer(ubuf, fsize, offp, buf, pos);
+
+	kfree(buf);
+
+	clear_bit_unlock(0, &perf->busy_flag);
+
+	return ret;
+}
+
+static ssize_t perf_dbgfs_write_run_ext(struct file *filep, const char __user *ubuf,
+					size_t size, loff_t *offp, enum run_mode mode)
 {
 	struct perf_ctx *perf = filep->private_data;
 	struct ntb_dev *ntb = perf->ntb;
@@ -1685,50 +1877,132 @@ static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ub
 	if (ret)
 		return ret;
 
-	if (pidx < 0 && mode == RUN_PL_SERVER) {
-		dev_dbg(&ntb->dev, "poll_lat: kill server\n");
-		if (test_bit(0, &perf->busy_flag)) {
-			peer = perf->test_peer;
-			/* Send stop to client */
-			memcpy_toio(peer->outbuf, &stop_word, 1);
+	if (pidx < 0) {
+		switch (mode) {
+		case RUN_PL_SERVER:
+			dev_dbg(&ntb->dev, "poll_lat: kill server\n");
+			if (test_bit(0, &perf->busy_flag)) {
+				peer = perf->test_peer;
+				/* Send stop to client */
+				memcpy_toio(peer->outbuf, &stop_word, 1);
+			}
+			perf_terminate_test(perf);
+			clear_bit_unlock(0, &perf->busy_flag);
+			return size;
+		case RUN_DBL_SERVER:
+			dev_dbg(&ntb->dev, "db_lat: kill server\n");
+			perf_clear_dbl(&perf->dbldata);
+			clear_bit_unlock(0, &perf->busy_flag);
+			return size;
+		default:
+			return -EINVAL;
 		}
-		perf_terminate_test(perf);
-		clear_bit_unlock(0, &perf->busy_flag);
-		return size;
 	}
 
-	if (pidx < 0 || pidx >= perf->pcnt)
+	if (pidx >= perf->pcnt)
 		return -EINVAL;
 
 	peer = &perf->peers[pidx];
+	perf->mode = mode;
 
-	ret = perf_submit_poll_lat(peer, mode);
+	ret = perf_submit_ext_lat(peer);
 
 	return ret ? ret : size;
 }
 
-static ssize_t perf_dbgfs_write_run_client(struct file *filep, const char __user *ubuf,
-				    size_t size, loff_t *offp)
+static ssize_t perf_dbgfs_write_run_pl_client(struct file *filep,
+					const char __user *ubuf, size_t size, loff_t *offp)
 {
-	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_CLIENT);
+	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_PL_CLIENT);
 }
 
-static const struct file_operations perf_dbgfs_run_client = {
+static const struct file_operations perf_dbgfs_run_pl_client = {
 	.open = simple_open,
-	.read = perf_dbgfs_read_run,
-	.write = perf_dbgfs_write_run_client
+	.read = perf_dbgfs_read_run_pl,
+	.write = perf_dbgfs_write_run_pl_client
 };
 
-static ssize_t perf_dbgfs_write_run_server(struct file *filep, const char __user *ubuf,
-				    size_t size, loff_t *offp)
+static ssize_t perf_dbgfs_write_run_pl_server(struct file *filep,
+					const char __user *ubuf, size_t size, loff_t *offp)
 {
-	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_SERVER);
+	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_PL_SERVER);
 }
 
-static const struct file_operations perf_dbgfs_run_server = {
+static const struct file_operations perf_dbgfs_run_pl_server = {
 	.open = simple_open,
-	.read = perf_dbgfs_read_run,
-	.write = perf_dbgfs_write_run_server
+	.read = perf_dbgfs_read_run_pl,
+	.write = perf_dbgfs_write_run_pl_server
+};
+
+static ssize_t perf_dbgfs_read_run_dbl(struct file *filep, char __user *ubuf,
+				   size_t fsize, loff_t *offp)
+{
+	struct perf_ctx *perf = filep->private_data;
+	ssize_t ret;
+	ssize_t pos = 0;
+	ssize_t size = PERF_BUF_LEN;
+	char *buf;
+
+	if (test_and_set_bit_lock(0, &perf->busy_flag))
+		return -EBUSY;
+
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	pos += scnprintf(buf + pos, size - pos,
+		"    Peer %d test statistics:\n", perf->test_peer->pidx);
+
+	if (perf->dbldata.status != -ENODATA) {
+		if (perf->dbldata.status) {
+			pos += scnprintf(buf + pos, size - pos,
+				"doorbell latency: error status %d\n", perf->dbldata.status);
+		} else {
+			if (ktime_to_us(perf->dbldata.latency) < 10) {
+				pos += scnprintf(buf + pos, size - pos,
+						"doorbell latency %llu ns\n",
+						ktime_to_ns(perf->dbldata.latency));
+			} else {
+				pos += scnprintf(buf + pos, size - pos,
+						"doorbell latency %llu us\n",
+						ktime_to_us(perf->dbldata.latency));
+			}
+		}
+	} else {
+		pos += scnprintf(buf + pos, size - pos, "Test did not run\n");
+	}
+
+	ret = simple_read_from_buffer(ubuf, fsize, offp, buf, pos);
+
+	kfree(buf);
+
+	clear_bit_unlock(0, &perf->busy_flag);
+
+	return ret;
+}
+
+static ssize_t perf_dbgfs_write_run_dbl_client(struct file *filep,
+					const char __user *ubuf, size_t size, loff_t *offp)
+{
+	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_DBL_CLIENT);
+}
+
+static const struct file_operations perf_dbgfs_run_dbl_client = {
+	.open = simple_open,
+	.read = perf_dbgfs_read_run_dbl,
+	.write = perf_dbgfs_write_run_dbl_client
+};
+
+static ssize_t perf_dbgfs_write_run_dbl_server(struct file *filep,
+					const char __user *ubuf, size_t size, loff_t *offp)
+{
+	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_DBL_SERVER);
+}
+
+static const struct file_operations perf_dbgfs_run_dbl_server = {
+	.open = simple_open,
+	.read = perf_dbgfs_read_run_dbl,
+	.write = perf_dbgfs_write_run_dbl_server
 };
 
 static ssize_t perf_dbgfs_read_tcnt(struct file *filep, char __user *ubuf,
@@ -1782,8 +2056,7 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
 		struct perf_thread *pthr = &perf->threads[tidx];
 
 		pos += scnprintf(buf + pos, buf_size - pos,
-			"%d: made %llu tries\n",
-			tidx, pthr->tries);
+				"%d: made %llu tries\n", tidx, pthr->tries);
 	}
 
 	ret = simple_read_from_buffer(ubuf, size, offp, buf, pos);
@@ -1794,7 +2067,7 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
 }
 
 static ssize_t perf_dbgfs_read_inbuf(struct file *filep, char __user *ubuf,
-				    size_t size, loff_t *offp)
+					size_t size, loff_t *offp)
 {
 	struct perf_ctx *perf = filep->private_data;
 	char buf[32];
@@ -1830,6 +2103,9 @@ static const struct file_operations perf_dbgfs_inbuf = {
 static void perf_setup_dbgfs(struct perf_ctx *perf)
 {
 	struct pci_dev *pdev = perf->ntb->pdev;
+	struct dentry *burst_lat_dir;
+	struct dentry *poll_lat_dir;
+	struct dentry *db_lat_dir;
 
 	perf->dbgfs_dir = debugfs_create_dir(pci_name(pdev), perf_dbgfs_topdir);
 	if (!perf->dbgfs_dir) {
@@ -1840,17 +2116,10 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
 	debugfs_create_file("info", 0600, perf->dbgfs_dir, perf,
 			    &perf_dbgfs_info);
 
-	debugfs_create_file("run", 0600, perf->dbgfs_dir, perf,
-			    &perf_dbgfs_run);
+	debugfs_create_symlink("run", perf->dbgfs_dir, "burst_latency/run");
 
-	debugfs_create_file("run_client", 0600, perf->dbgfs_dir, perf,
-			    &perf_dbgfs_run_client);
-
-	debugfs_create_file("run_server", 0600, perf->dbgfs_dir, perf,
-			    &perf_dbgfs_run_server);
-
-	debugfs_create_file("threads_count", 0600, perf->dbgfs_dir, perf,
-			    &perf_dbgfs_tcnt);
+	debugfs_create_symlink("threads_count", perf->dbgfs_dir,
+				"burst_latency/threads_count");
 
 	/* They are made read-only for test exec safety and integrity */
 	debugfs_create_u8("chunk_order", 0500, perf->dbgfs_dir, &chunk_order);
@@ -1859,14 +2128,44 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
 
 	debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma);
 
-	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
+	debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf,
+			    &perf_dbgfs_inbuf);
+
+	/* burst_latency subdir */
+
+	burst_lat_dir = debugfs_create_dir("burst_latency", perf->dbgfs_dir);
+
+	debugfs_create_file("run", 0600, burst_lat_dir, perf, &perf_dbgfs_run);
+
+	debugfs_create_file("threads_count", 0600, burst_lat_dir, perf,
+			    &perf_dbgfs_tcnt);
+
+	debugfs_create_file("tries", 0400, burst_lat_dir, perf,
 			    &perf_dbgfs_lattrs);
 
-	debugfs_create_u64("poll_latency_tries", 0400, perf->dbgfs_dir,
-				&perf->pldata.tries);
+	/* poll_latency subdir */
 
-	debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf,
-			    &perf_dbgfs_inbuf);
+	poll_lat_dir = debugfs_create_dir("poll_latency", perf->dbgfs_dir);
+
+	debugfs_create_file("run_client", 0600, poll_lat_dir, perf,
+			    &perf_dbgfs_run_pl_client);
+
+	debugfs_create_file("run_server", 0600, poll_lat_dir, perf,
+			    &perf_dbgfs_run_pl_server);
+
+	debugfs_create_u64("tries", 0400, poll_lat_dir, &perf->pldata.tries);
+
+	/* db_latency subdir */
+
+	db_lat_dir = debugfs_create_dir("db_latency", perf->dbgfs_dir);
+
+	debugfs_create_file("run_client", 0600, db_lat_dir, perf,
+			    &perf_dbgfs_run_dbl_client);
+
+	debugfs_create_file("run_server", 0600, db_lat_dir, perf,
+			    &perf_dbgfs_run_dbl_server);
+
+	debugfs_create_u64("tries", 0400, db_lat_dir, &perf->dbldata.tries);
 }
 
 static void perf_clear_dbgfs(struct perf_ctx *perf)
@@ -1986,7 +2285,7 @@ static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
 	if (ret)
 		return ret;
 
-	perf_init_threads(perf);
+	perf_init_workers(perf);
 
 	ret = perf_init_service(perf);
 	if (ret)
@@ -2009,7 +2308,7 @@ static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
 
 	perf_disable_service(perf);
 
-	perf_clear_threads(perf);
+	perf_clear_workers(perf);
 }
 
 static struct ntb_client perf_client = {
-- 
2.30.2


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/3] ntb_perf: extend with burst latency measurement
  2022-04-22 11:37 [PATCH v2 1/3] ntb_perf: extend with burst latency measurement Alexander Fomichev
  2022-04-22 11:37 ` [PATCH v2 2/3] ntb_perf: extend with poll " Alexander Fomichev
  2022-04-22 11:37 ` [PATCH v2 3/3] ntb_perf: extend with doorbell " Alexander Fomichev
@ 2022-04-25 16:38 ` Dave Jiang
  2022-04-28 11:27 ` Serge Semin
  3 siblings, 0 replies; 10+ messages in thread
From: Dave Jiang @ 2022-04-25 16:38 UTC (permalink / raw)
  To: Alexander Fomichev, ntb
  Cc: linux, Jon Mason, Allen Hubbe, Guo Zhengkui, Alexander Fomichev


On 4/22/2022 4:37 AM, Alexander Fomichev wrote:
> From: Alexander Fomichev <a.fomichev@yadro.com>
>
> The ntb_perf test provides a tool for NTB hardware performance
> evaluation. For software impact elimination the test uses a simple method
> (let's call it 'burst' mode), when the local system sends to the remote
> system a data set and counts time interval until hardware completion
> report, without the remote side confirming, nor data integrity check.
> The measured metric is a 'burst' throughput bandwidth of NTB connection.
>
> The proposed change extends ntb_perf with 1 of 3 new metrics. The resulting
> test is fully backward compatible.
>
> 1. Burst latency. It's a delay between start to send 1 byte to the
> remote system and hardware readiness to send another byte. The
> measurement performed within bandwidth test procedure. The DMA Engine is
> off. Data integrity is not checked. This mode can be disabled by
> 'perf_latency=N' module parameter.
>
> Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
> ---
>   drivers/ntb/test/ntb_perf.c | 135 +++++++++++++++++++++++++++++++++++-
>   1 file changed, 132 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
> index 65e1e5cf1b29..79faa4a7fe94 100644
> --- a/drivers/ntb/test/ntb_perf.c
> +++ b/drivers/ntb/test/ntb_perf.c
> @@ -106,6 +106,9 @@ MODULE_DESCRIPTION("PCIe NTB Performance Measurement Tool");
>   
>   #define PERF_BUF_LEN 1024
>   
> +#define LAT_MIN_TRIES	20
> +#define RESCHEDULE_RATIO	10000
> +
>   static unsigned long max_mw_size;
>   module_param(max_mw_size, ulong, 0644);
>   MODULE_PARM_DESC(max_mw_size, "Upper limit of memory window size");
> @@ -122,6 +125,14 @@ static bool use_dma; /* default to 0 */
>   module_param(use_dma, bool, 0644);
>   MODULE_PARM_DESC(use_dma, "Use DMA engine to measure performance");
>   
> +static bool perf_latency = true;
> +module_param(perf_latency, bool, 0644);
> +MODULE_PARM_DESC(perf_latency, "Also measure latency");

"Measures burst latency"


> +
> +static unsigned long lat_time_ms = 1000; /* default 1s */
> +module_param(lat_time_ms, ulong, 0644);
> +MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
> +
>   /*==============================================================================
>    *                         Perf driver data definition
>    *==============================================================================
> @@ -178,6 +189,8 @@ struct perf_thread {
>   	void *src;
>   	u64 copied;
>   	ktime_t duration;
> +	ktime_t latency;
> +	u64 tries;
>   	int status;
>   	struct work_struct work;
>   };
> @@ -783,7 +796,7 @@ static void perf_dma_copy_callback(void *data)
>   }
>   
>   static int perf_copy_chunk(struct perf_thread *pthr,
> -			   void __iomem *dst, void *src, size_t len)
> +			   void __iomem *dst, void *src, size_t len, bool use_dma_)
>   {
>   	struct dma_async_tx_descriptor *tx;
>   	struct dmaengine_unmap_data *unmap;
> @@ -794,7 +807,7 @@ static int perf_copy_chunk(struct perf_thread *pthr,
>   	void __iomem *dst_vaddr;
>   	dma_addr_t dst_dma_addr;
>   
> -	if (!use_dma) {
> +	if (!use_dma_) {
>   		memcpy_toio(dst, src, len);
>   		goto ret_check_tsync;
>   	}
> @@ -940,7 +953,7 @@ static int perf_run_test(struct perf_thread *pthr)
>   
>   	/* Copied field is cleared on test launch stage */
>   	while (pthr->copied < total_size) {
> -		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size);
> +		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size, use_dma);
>   		if (ret) {
>   			dev_err(&perf->ntb->dev, "%d: Got error %d on test\n",
>   				pthr->tidx, ret);
> @@ -1018,6 +1031,66 @@ static void perf_clear_test(struct perf_thread *pthr)
>   	kfree(pthr->src);
>   }
>   
> +static int perf_run_latency(struct perf_thread *pthr)
> +{
> +	struct perf_peer *peer = pthr->perf->test_peer;
> +	struct ntb_dev *ntb = pthr->perf->ntb;
> +	void *flt_src;
> +	void __iomem *flt_dst, *bnd_dst;
> +	int ret;
> +	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
> +
> +	pthr->tries = 0;
> +	pthr->latency = ktime_get();
> +	flt_src = pthr->src;
> +	flt_dst = peer->outbuf;
> +	bnd_dst = peer->outbuf + peer->outbuf_size;
> +
> +	while (ktime_get_real_fast_ns() < stop_at) {
> +		ret = perf_copy_chunk(pthr, flt_dst, flt_src, 1, false);
> +		if (ret) {
> +			dev_err(&ntb->dev, "%d: Latency testing error %d\n",
> +				pthr->tidx, ret);
> +			pthr->latency = ktime_set(0, 0);
> +			return ret;
> +		}
> +
> +		pthr->tries++;
> +		flt_dst++;
> +		flt_src++;
> +
> +		if (flt_dst >= bnd_dst || flt_dst < peer->outbuf) {
> +			flt_dst = peer->outbuf;
> +			flt_src = pthr->src;
> +		}
> +
> +		/* Avoid processor soft lock-ups */
> +		if (!(pthr->tries % RESCHEDULE_RATIO))
> +			schedule();
> +	}
> +
> +	/* Stop timer */
> +	pthr->latency = ktime_sub(ktime_get(), pthr->latency);
> +
> +	if (pthr->tries < LAT_MIN_TRIES) {
> +		dev_err(&ntb->dev, "%d: Too few steps (%d) to measure Latency, "
> +			"recommended > %d. Increase value of 'lat_time_ms' "
> +			"parameter\n", pthr->tidx, pthr->tries, LAT_MIN_TRIES);
> +		pthr->latency = ktime_set(0, 0);
> +		return -EINVAL;
> +	}
> +
> +	dev_dbg(&ntb->dev, "%d: made %llu tries, lasted %llu usecs\n",
> +		pthr->tidx, pthr->tries, ktime_to_us(pthr->latency));
> +
> +	pthr->latency = ns_to_ktime(ktime_divns(pthr->latency, pthr->tries));
> +
> +	dev_dbg(&ntb->dev, "%d: latency %llu us (%llu ns)\n", pthr->tidx,
> +		ktime_to_us(pthr->latency), ktime_to_ns(pthr->latency));
> +
> +	return 0;
> +}
> +
>   static void perf_thread_work(struct work_struct *work)
>   {
>   	struct perf_thread *pthr = to_thread_work(work);
> @@ -1043,6 +1116,11 @@ static void perf_thread_work(struct work_struct *work)
>   	}
>   
>   	pthr->status = perf_sync_test(pthr);
> +	if (pthr->status)
> +		goto err_clear_test;
> +
> +	if (perf_latency)
> +		pthr->status = perf_run_latency(pthr);
>   
>   err_clear_test:
>   	perf_clear_test(pthr);
> @@ -1142,6 +1220,18 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
>   			"%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
>   			tidx, pthr->copied, ktime_to_us(pthr->duration),
>   			div64_u64(pthr->copied, ktime_to_us(pthr->duration)));
> +
> +		if (perf_latency && ktime_compare(pthr->latency, ktime_set(0, 0))) {
> +			if (ktime_to_us(pthr->latency) < 10) {
> +				(*pos) += scnprintf(buf + *pos, size - *pos,
> +						"%d: latency %llu ns\n",
> +						tidx, ktime_to_ns(pthr->latency));
> +			} else {
> +				(*pos) += scnprintf(buf + *pos, size - *pos,
> +						"%d: latency %llu us\n",
> +						tidx, ktime_to_us(pthr->latency));
> +			}
> +		}
>   	}
>   
>   	clear_bit_unlock(0, &perf->busy_flag);
> @@ -1344,12 +1434,48 @@ static ssize_t perf_dbgfs_write_tcnt(struct file *filep,
>   	return size;
>   }
>   
> +static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
> +				    size_t size, loff_t *offp)
> +{
> +	struct perf_ctx *perf = filep->private_data;
> +	char *buf;
> +	ssize_t pos, ret;
> +	size_t buf_size = min_t(size_t, size, PERF_BUF_LEN);
> +	int tidx;
> +
> +	buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	pos = scnprintf(buf, buf_size, "    Peer %d latency try count:\n",
> +			perf->test_peer->pidx);
> +
> +	for (tidx = 0; tidx < perf->tcnt; tidx++) {
> +		struct perf_thread *pthr = &perf->threads[tidx];
> +
> +		pos += scnprintf(buf + pos, buf_size - pos,
> +			"%d: made %llu tries\n",
> +			tidx, pthr->tries);
> +	}
> +
> +	ret = simple_read_from_buffer(ubuf, size, offp, buf, pos);
> +
> +	kfree(buf);
> +
> +	return ret;
> +}
> +
>   static const struct file_operations perf_dbgfs_tcnt = {
>   	.open = simple_open,
>   	.read = perf_dbgfs_read_tcnt,
>   	.write = perf_dbgfs_write_tcnt
>   };
>   
> +static const struct file_operations perf_dbgfs_lattrs = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_lattrs
> +};
> +
>   static void perf_setup_dbgfs(struct perf_ctx *perf)
>   {
>   	struct pci_dev *pdev = perf->ntb->pdev;
> @@ -1375,6 +1501,9 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
>   	debugfs_create_u8("total_order", 0500, perf->dbgfs_dir, &total_order);
>   
>   	debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma);
> +
> +	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
> +			    &perf_dbgfs_lattrs);
>   }
>   
>   static void perf_clear_dbgfs(struct perf_ctx *perf)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 2/3] ntb_perf: extend with poll latency measurement
  2022-04-22 11:37 ` [PATCH v2 2/3] ntb_perf: extend with poll " Alexander Fomichev
@ 2022-04-25 16:49   ` Dave Jiang
  2022-04-29  8:53     ` Alexander Fomichev
  0 siblings, 1 reply; 10+ messages in thread
From: Dave Jiang @ 2022-04-25 16:49 UTC (permalink / raw)
  To: Alexander Fomichev, ntb
  Cc: linux, Jon Mason, Allen Hubbe, Guo Zhengkui, Alexander Fomichev


On 4/22/2022 4:37 AM, Alexander Fomichev wrote:
> From: Alexander Fomichev <a.fomichev@yadro.com>
>
> The proposed change extends ntb_perf with 2nd of 3 new metrics. The resulting
> test is fully backward compatible.
>
> 2. Poll latency. It's a delay between start to send 1 byte to the
> remote system and receiving the confirmation. The remote system needs to
> be run in server mode beforehand. Then the server polls the input buffer
> and on receiving data immediately sends the confirmation back.

This is a general comment. Please make the commit logs as they are 
individual standalone patches and not refer to each other.


>
> Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
> ---
>   drivers/ntb/test/ntb_perf.c | 365 +++++++++++++++++++++++++++++++++++-
>   1 file changed, 364 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
> index 79faa4a7fe94..d4664410e543 100644
> --- a/drivers/ntb/test/ntb_perf.c
> +++ b/drivers/ntb/test/ntb_perf.c
> @@ -85,7 +85,7 @@
>   #include <linux/ntb.h>
>   
>   #define DRIVER_NAME		"ntb_perf"
> -#define DRIVER_VERSION		"2.0"
> +#define DRIVER_VERSION		"2.1"

Oddly random version change in the middle patch.


>   
>   MODULE_LICENSE("Dual BSD/GPL");
>   MODULE_VERSION(DRIVER_VERSION);
> @@ -133,6 +133,10 @@ static unsigned long lat_time_ms = 1000; /* default 1s */
>   module_param(lat_time_ms, ulong, 0644);
>   MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
>   
> +static unsigned long lat_timeout_us = 500;
> +module_param(lat_timeout_us, ulong, 0644);
> +MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply");

I would refrain from using the word "timeout" for variable and 
description as that's typically used for when a set time expires. Maybe 
"Roundtrip Latency" would be the better description?


> +
>   /*==============================================================================
>    *                         Perf driver data definition
>    *==============================================================================
> @@ -149,6 +153,11 @@ enum perf_cmd {
>   	PERF_STS_LNKUP = 6, /* link up state flag */
>   };
>   
> +enum run_mode {
> +	RUN_PL_CLIENT,
> +	RUN_PL_SERVER,
> +};
> +
>   struct perf_ctx;
>   
>   struct perf_peer {
> @@ -197,6 +206,21 @@ struct perf_thread {
>   #define to_thread_work(__work) \
>   	container_of(__work, struct perf_thread, work)
>   
> +struct perf_poll_lat_data {
> +	struct perf_ctx *perf;
> +	void *src;
> +	ktime_t latency;
> +	u64 tries;
> +	int status;
> +	atomic_t running;
> +	struct work_struct clt_work;
> +	struct work_struct srv_work;
> +};
> +#define to_pldata_clt_work(__work) \
> +	container_of(__work, struct perf_poll_lat_data, clt_work)
> +#define to_pldata_srv_work(__work) \
> +	container_of(__work, struct perf_poll_lat_data, srv_work)
> +
>   struct perf_ctx {
>   	struct ntb_dev *ntb;
>   
> @@ -204,6 +228,7 @@ struct perf_ctx {
>   	int gidx;
>   	int pcnt;
>   	struct perf_peer *peers;
> +	struct perf_poll_lat_data pldata;
>   
>   	/* Performance measuring work-threads interface */
>   	unsigned long busy_flag;
> @@ -252,6 +277,8 @@ static struct dentry *perf_dbgfs_topdir;
>   
>   static struct workqueue_struct *perf_wq __read_mostly;
>   
> +static const u8 stop_word = 0xFF;
> +
>   /*==============================================================================
>    *                  NTB cross-link commands execution service
>    *==============================================================================
> @@ -1126,6 +1153,184 @@ static void perf_thread_work(struct work_struct *work)
>   	perf_clear_test(pthr);
>   }
>   
> +static int perf_init_pl(struct perf_poll_lat_data *pldata)
> +{
> +	struct perf_ctx *perf = pldata->perf;
> +	struct perf_peer *peer = perf->test_peer;
> +	u8 *bp;
> +
> +	pldata->src = kmalloc_node(peer->outbuf_size, GFP_KERNEL,
> +				 dev_to_node(&perf->ntb->dev));
> +	if (!pldata->src)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Prepare random data to send, guaranteed exclusion of 0x00 (unreceived)
> +	 * and 0xFF (stop_word)
> +	 */
> +	get_random_bytes(pldata->src, peer->outbuf_size);
> +	for (bp = pldata->src; bp < (u8 *) pldata->src + peer->outbuf_size; bp++)
> +		while (*bp == 0 || *bp == stop_word)
> +			*bp = (u8)get_random_int();
> +
> +	memset(peer->inbuf, 0, peer->inbuf_size);
> +
> +	return 0;
> +}
> +
> +static int perf_poll_peer_reply(volatile u8 *cur)
> +{
> +	u64 wait_till = ktime_get_real_fast_ns() + lat_timeout_us * NSEC_PER_USEC;
> +
> +	while (ktime_get_real_fast_ns() < wait_till) {
> +		if (*cur == stop_word) {
> +			*cur = 0;
> +			return 1;
> +		}
> +		if (*cur != 0) {
> +			*cur = 0;
> +			return 0;
> +		}
> +	}
> +	return -EINTR;
> +}
> +
> +static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
> +{
> +	struct perf_peer *peer = pldata->perf->test_peer;
> +	struct ntb_dev *ntb = pldata->perf->ntb;
> +	void *src = pldata->src;
> +	int ret;
> +	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
> +
> +	dev_dbg(&ntb->dev, "poll_lat: client started.\n");
> +
> +	pldata->tries = 0;
> +	pldata->latency = ktime_get();
> +
> +	while (ktime_get_real_fast_ns() < stop_at) {
> +		memcpy_toio(peer->outbuf, src, 1);
> +
> +		/* Avoid processor soft lock-ups */
> +		schedule();
> +
> +		ret = perf_poll_peer_reply(peer->inbuf);
> +		if (ret < 0) {
> +			dev_err(&ntb->dev, "Timeout waiting for peer reply on poll latency\n");
> +			pldata->latency = ktime_set(0, 0);
> +			return -EINTR;
> +		} else if (ret == 1) {
> +			dev_warn(&ntb->dev, "Server terminated on poll latency, stopping\n");
> +			break;
> +		} else if (!atomic_read(&pldata->running)) {
> +			dev_err(&ntb->dev, "Poll latency client terminated\n");
> +			return -EINTR;
> +		}
> +
> +		pldata->tries++;
> +		src++;
> +
> +		if (src >= pldata->src + peer->outbuf_size)
> +			src = pldata->src;
> +	}
> +
> +	/* Stop timer */
> +	pldata->latency = ktime_sub(ktime_get(), pldata->latency);
> +	/* Send stop to peer */
> +	memcpy_toio(peer->outbuf, &stop_word, 1);
> +
> +	if (pldata->tries < LAT_MIN_TRIES) {
> +		dev_err(&ntb->dev, "Too few steps (%d) to measure Latency, "
> +			"recommended > %d. Increase value of 'lat_time_ms' "
> +			"parameter\n", pldata->tries, LAT_MIN_TRIES);
> +		pldata->latency = ktime_set(0, 0);
> +		return -EINVAL;
> +	}
> +
> +	dev_dbg(&ntb->dev, "poll_lat: made %llu tries, lasted %llu usecs\n",
> +		pldata->tries, ktime_to_us(pldata->latency));
> +
> +	pldata->latency = ns_to_ktime(ktime_divns(pldata->latency, pldata->tries));
> +
> +	dev_dbg(&ntb->dev, "poll_lat: latency %llu us (%llu ns)\n",
> +		ktime_to_us(pldata->latency), ktime_to_ns(pldata->latency));
> +
> +	return 0;
> +}
> +
> +static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
> +{
> +	struct perf_peer *peer = pldata->perf->test_peer;
> +	struct ntb_dev *ntb = pldata->perf->ntb;
> +	void *src = pldata->src;
> +	int ret = 0;
> +
> +	dev_dbg(&ntb->dev, "poll_lat: server started.\n");
> +
> +	pldata->tries = 0;
> +
> +	while (ret != 1 && atomic_read(&pldata->running)) {
> +		ret = perf_poll_peer_reply(peer->inbuf);
> +		if (!ret) {
> +			/* Pong to client */
> +			memcpy_toio(peer->outbuf, src++, 1);
> +			if (src >= pldata->src + peer->outbuf_size)
> +				src = pldata->src;
> +
> +			pldata->tries++;
> +		}
> +
> +		/* Avoid processor soft lock-ups */
> +		schedule();
> +	}
> +
> +	if (pldata->tries < LAT_MIN_TRIES)
> +		dev_warn(&ntb->dev, "Poll latency test terminated too early. "
> +				"Increase client's test time\n");
> +
> +	dev_dbg(&ntb->dev, "poll_lat: server stopped, had responded %llu times\n",
> +		pldata->tries);
> +
> +	return atomic_read(&pldata->running) ? -ENODATA : -EINTR;
> +}
> +
> +static void perf_clear_pl(struct perf_poll_lat_data *pldata)
> +{
> +	struct perf_ctx *perf = pldata->perf;
> +	struct perf_peer *peer = perf->test_peer;
> +
> +	memset(peer->inbuf, stop_word, 1);
> +	atomic_set(&pldata->running, 0);
> +	wake_up(&perf->twait);
> +	kfree(pldata->src);
> +}
> +
> +static void perf_poll_lat_client_work(struct work_struct *work)
> +{
> +	struct perf_poll_lat_data *pldata = to_pldata_clt_work(work);
> +
> +	pldata->status = perf_init_pl(pldata);
> +	if (pldata->status)
> +		return;
> +
> +	pldata->status = perf_run_pl_client(pldata);
> +
> +	perf_clear_pl(pldata);
> +}
> +
> +static void perf_poll_lat_server_work(struct work_struct *work)
> +{
> +	struct perf_poll_lat_data *pldata = to_pldata_srv_work(work);
> +
> +	pldata->status = perf_init_pl(pldata);
> +	if (pldata->status)
> +		return;
> +
> +	pldata->status = perf_run_pl_server(pldata);
> +
> +	perf_clear_pl(pldata);
> +}
> +
>   static int perf_set_tcnt(struct perf_ctx *perf, u8 tcnt)
>   {
>   	if (tcnt == 0 || tcnt > MAX_THREADS_CNT)
> @@ -1146,7 +1351,10 @@ static void perf_terminate_test(struct perf_ctx *perf)
>   	int tidx;
>   
>   	atomic_set(&perf->tsync, -1);
> +	atomic_set(&perf->pldata.running, 0);
>   	wake_up(&perf->twait);
> +	cancel_work_sync(&perf->pldata.srv_work);
> +	cancel_work_sync(&perf->pldata.clt_work);
>   
>   	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
>   		wake_up(&perf->threads[tidx].dma_wait);
> @@ -1192,6 +1400,46 @@ static int perf_submit_test(struct perf_peer *peer)
>   	return ret;
>   }
>   
> +static int perf_submit_poll_lat(struct perf_peer *peer, enum run_mode mode)
> +{
> +	struct perf_ctx *perf = peer->perf;
> +	int ret;
> +
> +	ret = wait_for_completion_interruptible(&peer->init_comp);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (test_and_set_bit_lock(0, &perf->busy_flag))
> +		return -EBUSY;
> +
> +	perf->test_peer = peer;
> +	atomic_set(&perf->pldata.running, 1);
> +	perf->pldata.status = -ENODATA;
> +	perf->pldata.tries = 0;
> +	perf->pldata.latency = ktime_set(0, 0);
> +
> +	switch (mode) {
> +	case RUN_PL_SERVER:
> +		(void)queue_work(perf_wq, &perf->pldata.srv_work);
> +		break;
> +	case RUN_PL_CLIENT:
> +	default:
> +		(void)queue_work(perf_wq, &perf->pldata.clt_work);
> +		break;
> +	}
> +
> +	ret = wait_event_interruptible(perf->twait,
> +				       !atomic_read(&perf->pldata.running));
> +	if (ret == -ERESTARTSYS) {
> +		perf_terminate_test(perf);
> +		ret = -EINTR;
> +	}
> +
> +	clear_bit_unlock(0, &perf->busy_flag);
> +
> +	return ret;
> +}
> +
>   static int perf_read_stats(struct perf_ctx *perf, char *buf,
>   			   size_t size, ssize_t *pos)
>   {
> @@ -1234,6 +1482,24 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
>   		}
>   	}
>   
> +	if (perf->pldata.status != -ENODATA) {
> +		(*pos) += scnprintf(buf + *pos, size - *pos, "\n");
> +		if (perf->pldata.status) {
> +			(*pos) += scnprintf(buf + *pos, size - *pos,
> +				"poll latency: error status %d\n", perf->pldata.status);
> +		} else {
> +			if (ktime_to_us(perf->pldata.latency) < 10) {
> +				(*pos) += scnprintf(buf + *pos, size - *pos,
> +						"poll latency %llu ns\n",
> +						ktime_to_ns(perf->pldata.latency));
> +			} else {
> +				(*pos) += scnprintf(buf + *pos, size - *pos,
> +						"poll latency %llu us\n",
> +						ktime_to_us(perf->pldata.latency));
> +			}
> +		}
> +	}
> +
>   	clear_bit_unlock(0, &perf->busy_flag);
>   
>   	return 0;
> @@ -1247,6 +1513,10 @@ static void perf_init_threads(struct perf_ctx *perf)
>   	perf->tcnt = DEF_THREADS_CNT;
>   	perf->test_peer = &perf->peers[0];
>   	init_waitqueue_head(&perf->twait);
> +	perf->pldata.perf = perf;
> +	INIT_WORK(&perf->pldata.srv_work, perf_poll_lat_server_work);
> +	INIT_WORK(&perf->pldata.clt_work, perf_poll_lat_client_work);
> +	perf->pldata.status = -ENODATA;
>   
>   	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
>   		pthr = &perf->threads[tidx];
> @@ -1403,6 +1673,64 @@ static const struct file_operations perf_dbgfs_run = {
>   	.write = perf_dbgfs_write_run
>   };
>   
> +static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ubuf,
> +				    size_t size, loff_t *offp, enum run_mode mode)
> +{
> +	struct perf_ctx *perf = filep->private_data;
> +	struct ntb_dev *ntb = perf->ntb;
> +	struct perf_peer *peer;
> +	int pidx, ret;
> +
> +	ret = kstrtoint_from_user(ubuf, size, 0, &pidx);
> +	if (ret)
> +		return ret;
> +
> +	if (pidx < 0 && mode == RUN_PL_SERVER) {
> +		dev_dbg(&ntb->dev, "poll_lat: kill server\n");
> +		if (test_bit(0, &perf->busy_flag)) {
> +			peer = perf->test_peer;
> +			/* Send stop to client */
> +			memcpy_toio(peer->outbuf, &stop_word, 1);
> +		}
> +		perf_terminate_test(perf);
> +		clear_bit_unlock(0, &perf->busy_flag);
> +		return size;
> +	}
> +
> +	if (pidx < 0 || pidx >= perf->pcnt)
> +		return -EINVAL;
> +
> +	peer = &perf->peers[pidx];
> +
> +	ret = perf_submit_poll_lat(peer, mode);
> +
> +	return ret ? ret : size;
> +}
> +
> +static ssize_t perf_dbgfs_write_run_client(struct file *filep, const char __user *ubuf,
> +				    size_t size, loff_t *offp)
> +{
> +	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_CLIENT);
> +}
> +
> +static const struct file_operations perf_dbgfs_run_client = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_run,
> +	.write = perf_dbgfs_write_run_client
> +};
> +
> +static ssize_t perf_dbgfs_write_run_server(struct file *filep, const char __user *ubuf,
> +				    size_t size, loff_t *offp)
> +{
> +	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_SERVER);
> +}
> +
> +static const struct file_operations perf_dbgfs_run_server = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_run,
> +	.write = perf_dbgfs_write_run_server
> +};
> +
>   static ssize_t perf_dbgfs_read_tcnt(struct file *filep, char __user *ubuf,
>   				    size_t size, loff_t *offp)
>   {
> @@ -1465,6 +1793,24 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
>   	return ret;
>   }
>   
> +static ssize_t perf_dbgfs_read_inbuf(struct file *filep, char __user *ubuf,
> +				    size_t size, loff_t *offp)
> +{
> +	struct perf_ctx *perf = filep->private_data;
> +	char buf[32];
> +	ssize_t pos;
> +	u64 *value;
> +
> +	if (!perf->test_peer || !perf->test_peer->inbuf) {
> +		pos = scnprintf(buf, sizeof(buf), "NULL\n");
> +	} else {
> +		value = perf->test_peer->inbuf;
> +		pos = scnprintf(buf, sizeof(buf), "0x%llx\n", *value);
> +	}
> +
> +	return simple_read_from_buffer(ubuf, size, offp, buf, pos);
> +}
> +
>   static const struct file_operations perf_dbgfs_tcnt = {
>   	.open = simple_open,
>   	.read = perf_dbgfs_read_tcnt,
> @@ -1476,6 +1822,11 @@ static const struct file_operations perf_dbgfs_lattrs = {
>   	.read = perf_dbgfs_read_lattrs
>   };
>   
> +static const struct file_operations perf_dbgfs_inbuf = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_inbuf,
> +};
> +
>   static void perf_setup_dbgfs(struct perf_ctx *perf)
>   {
>   	struct pci_dev *pdev = perf->ntb->pdev;
> @@ -1492,6 +1843,12 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
>   	debugfs_create_file("run", 0600, perf->dbgfs_dir, perf,
>   			    &perf_dbgfs_run);
>   
> +	debugfs_create_file("run_client", 0600, perf->dbgfs_dir, perf,
> +			    &perf_dbgfs_run_client);
> +
> +	debugfs_create_file("run_server", 0600, perf->dbgfs_dir, perf,
> +			    &perf_dbgfs_run_server);
> +
>   	debugfs_create_file("threads_count", 0600, perf->dbgfs_dir, perf,
>   			    &perf_dbgfs_tcnt);
>   
> @@ -1504,6 +1861,12 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
>   
>   	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
>   			    &perf_dbgfs_lattrs);
> +
> +	debugfs_create_u64("poll_latency_tries", 0400, perf->dbgfs_dir,
> +				&perf->pldata.tries);
> +
> +	debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf,
> +			    &perf_dbgfs_inbuf);
>   }
>   
>   static void perf_clear_dbgfs(struct perf_ctx *perf)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 3/3] ntb_perf: extend with doorbell latency measurement
  2022-04-22 11:37 ` [PATCH v2 3/3] ntb_perf: extend with doorbell " Alexander Fomichev
@ 2022-04-25 16:55   ` Dave Jiang
  0 siblings, 0 replies; 10+ messages in thread
From: Dave Jiang @ 2022-04-25 16:55 UTC (permalink / raw)
  To: Alexander Fomichev, ntb
  Cc: linux, Jon Mason, Allen Hubbe, Guo Zhengkui, Alexander Fomichev


On 4/22/2022 4:37 AM, Alexander Fomichev wrote:
> From: Alexander Fomichev <a.fomichev@yadro.com>
>
> The proposed change extends ntb_perf with 3d of 3 new metrics. The resulting
> test is fully backward compatible.
>
> 3. Doorbell latency. It's a delay between start to ring an NTB doorbell
> and receiving the confirmation. The remote system needs to be run in server
> mode beforehand. Then the server waits for a doorbell event and immediately
> rings self doorbell to confirm.
>
> Thanks-to: Guo Zhengkui <guozhengkui@vivo.com>
> Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
> ---
>   drivers/ntb/test/ntb_perf.c | 541 ++++++++++++++++++++++++++++--------
>   1 file changed, 420 insertions(+), 121 deletions(-)
>
> diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
> index d4664410e543..26f0e801ea93 100644
> --- a/drivers/ntb/test/ntb_perf.c
> +++ b/drivers/ntb/test/ntb_perf.c
> @@ -6,6 +6,7 @@
>    *
>    *   Copyright(c) 2015 Intel Corporation. All rights reserved.
>    *   Copyright(c) 2017 T-Platforms. All Rights Reserved.
> + *   Copyright(c) 2022 YADRO. All Rights Reserved.

Add this in patch 1/3?


>    *
>    *   This program is free software; you can redistribute it and/or modify
>    *   it under the terms of version 2 of the GNU General Public License as
> @@ -15,6 +16,7 @@
>    *
>    *   Copyright(c) 2015 Intel Corporation. All rights reserved.
>    *   Copyright(c) 2017 T-Platforms. All Rights Reserved.
> + *   Copyright(c) 2022 YADRO. All Rights Reserved.
>    *
>    *   Redistribution and use in source and binary forms, with or without
>    *   modification, are permitted provided that the following conditions
> @@ -65,6 +67,22 @@
>    *
>    * root@self# echo 0 > $DBG_DIR/run
>    * root@self# cat $DBG_DIR/run
> + *-----------------------------------------------------------------------------
> + * Eg: start latency test with peer (index 0) poll-waiting and get the metrics
> + *
> + * Server side:
> + * root@self# echo 0 > $DBG_DIR/poll_latency/run_server
> + * Client side:
> + * root@self# echo 0 > $DBG_DIR/poll_latency/run_client
> + * root@self# cat $DBG_DIR/poll_latency/run_client
> + *-----------------------------------------------------------------------------
> + * Eg: start doorbell latency test with peer (index 0) and get the metrics
> + *
> + * Server side:
> + * root@self# echo 0 > $DBG_DIR/db_latency/run_server
> + * Client side:
> + * root@self# echo 0 > $DBG_DIR/db_latency/run_client
> + * root@self# cat $DBG_DIR/db_latency/run_client
Unrelated to the patch. Hopefully you can update the wiki documentation 
when the patches are accepted? :)
>    */
>   
>   #include <linux/init.h>
> @@ -76,6 +94,7 @@
>   #include <linux/dmaengine.h>
>   #include <linux/pci.h>
>   #include <linux/ktime.h>
> +#include <linux/jiffies.h>
>   #include <linux/slab.h>
>   #include <linux/delay.h>
>   #include <linux/sizes.h>
> @@ -85,7 +104,7 @@
>   #include <linux/ntb.h>
>   
>   #define DRIVER_NAME		"ntb_perf"
> -#define DRIVER_VERSION		"2.1"
> +#define DRIVER_VERSION		"2.2"
Ah you meant to bump the minor version per feature addition. This should 
be 2.3 now right?
>   
>   MODULE_LICENSE("Dual BSD/GPL");
>   MODULE_VERSION(DRIVER_VERSION);
> @@ -137,6 +156,10 @@ static unsigned long lat_timeout_us = 500;
>   module_param(lat_timeout_us, ulong, 0644);
>   MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply");
>   
> +static unsigned long peer_timeout_s = 60;
> +module_param(peer_timeout_s, ulong, 0644);
> +MODULE_PARM_DESC(peer_timeout_s, "Timeout (in s) to wait for peer link");

I think the description was doorbell latency measurement no? So maybe 
make this doorbell_latency and "Latency measurement with doorbell and 
response from peer host".


> +
>   /*==============================================================================
>    *                         Perf driver data definition
>    *==============================================================================
> @@ -156,9 +179,18 @@ enum perf_cmd {
>   enum run_mode {
>   	RUN_PL_CLIENT,
>   	RUN_PL_SERVER,
> +	RUN_DBL_CLIENT,
> +	RUN_DBL_SERVER,
>   };
>   
>   struct perf_ctx;
> +struct perf_ext_lat_data;
> +
> +struct perf_ext_lat_ops {
> +	int (*init)(struct perf_ext_lat_data *data);
> +	int (*run)(struct perf_ext_lat_data *data);
> +	void (*clear)(struct perf_ext_lat_data *data);
> +};
>   
>   struct perf_peer {
>   	struct perf_ctx	*perf;
> @@ -206,20 +238,21 @@ struct perf_thread {
>   #define to_thread_work(__work) \
>   	container_of(__work, struct perf_thread, work)
>   
> -struct perf_poll_lat_data {
> +struct perf_ext_lat_data {
>   	struct perf_ctx *perf;
> -	void *src;
>   	ktime_t latency;
>   	u64 tries;
>   	int status;
> -	atomic_t running;
> -	struct work_struct clt_work;
> -	struct work_struct srv_work;
> +	struct perf_ext_lat_ops ops;
> +	struct work_struct work;
> +
> +	union {
> +		void *src;
> +		int db;
> +	};
>   };
> -#define to_pldata_clt_work(__work) \
> -	container_of(__work, struct perf_poll_lat_data, clt_work)
> -#define to_pldata_srv_work(__work) \
> -	container_of(__work, struct perf_poll_lat_data, srv_work)
> +#define to_ext_lat_data(__work) \
> +	container_of(__work, struct perf_ext_lat_data, work)
>   
>   struct perf_ctx {
>   	struct ntb_dev *ntb;
> @@ -228,7 +261,12 @@ struct perf_ctx {
>   	int gidx;
>   	int pcnt;
>   	struct perf_peer *peers;
> -	struct perf_poll_lat_data pldata;
> +
> +	/* Ext latency tests interface */
> +	enum run_mode mode;
> +	struct perf_ext_lat_data pldata;
> +	struct perf_ext_lat_data dbldata;
> +	atomic_t running;
>   
>   	/* Performance measuring work-threads interface */
>   	unsigned long busy_flag;
> @@ -541,6 +579,15 @@ static void perf_link_event(void *ctx)
>   	}
>   }
>   
> +static inline void perf_dbl_pong(struct perf_ctx *perf)
> +{
> +	struct perf_ext_lat_data *data = &perf->dbldata;
> +
> +	ntb_db_clear(perf->ntb, BIT_ULL(data->db));
> +	data->tries++;
> +	ntb_peer_db_set(perf->ntb, BIT_ULL(data->db));
> +}
> +
>   static void perf_db_event(void *ctx, int vec)
>   {
>   	struct perf_ctx *perf = ctx;
> @@ -549,7 +596,11 @@ static void perf_db_event(void *ctx, int vec)
>   		ntb_db_vector_mask(perf->ntb, vec), ntb_db_read(perf->ntb));
>   
>   	/* Just receive all available commands */
> -	(void)perf_cmd_recv(perf);
> +	if (perf->dbldata.db >= 0 &&
> +				BIT_ULL(perf->dbldata.db) & ntb_db_read(perf->ntb))
> +		perf_dbl_pong(perf);
> +	else
> +		(void)perf_cmd_recv(perf);
>   }
>   
>   static void perf_msg_event(void *ctx)
> @@ -704,6 +755,8 @@ static int perf_init_service(struct perf_ctx *perf)
>   		return -EINVAL;
>   	}
>   
> +	perf->dbldata.db = -1;
> +
>   	if (ntb_msg_count(perf->ntb) >= PERF_MSG_CNT) {
>   		perf->cmd_send = perf_msg_cmd_send;
>   		perf->cmd_recv = perf_msg_cmd_recv;
> @@ -1153,14 +1206,14 @@ static void perf_thread_work(struct work_struct *work)
>   	perf_clear_test(pthr);
>   }
>   
> -static int perf_init_pl(struct perf_poll_lat_data *pldata)
> +static int perf_init_pl(struct perf_ext_lat_data *pldata)
>   {
>   	struct perf_ctx *perf = pldata->perf;
>   	struct perf_peer *peer = perf->test_peer;
>   	u8 *bp;
>   
>   	pldata->src = kmalloc_node(peer->outbuf_size, GFP_KERNEL,
> -				 dev_to_node(&perf->ntb->dev));
> +				dev_to_node(&perf->ntb->dev));
>   	if (!pldata->src)
>   		return -ENOMEM;
>   
> @@ -1195,10 +1248,11 @@ static int perf_poll_peer_reply(volatile u8 *cur)
>   	return -EINTR;
>   }
>   
> -static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
> +static int perf_run_pl_client(struct perf_ext_lat_data *pldata)
>   {
> -	struct perf_peer *peer = pldata->perf->test_peer;
> -	struct ntb_dev *ntb = pldata->perf->ntb;
> +	struct perf_ctx *perf = pldata->perf;
> +	struct perf_peer *peer = perf->test_peer;
> +	struct ntb_dev *ntb = perf->ntb;
>   	void *src = pldata->src;
>   	int ret;
>   	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
> @@ -1222,7 +1276,7 @@ static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
>   		} else if (ret == 1) {
>   			dev_warn(&ntb->dev, "Server terminated on poll latency, stopping\n");
>   			break;
> -		} else if (!atomic_read(&pldata->running)) {
> +		} else if (!atomic_read(&perf->running)) {
>   			dev_err(&ntb->dev, "Poll latency client terminated\n");
>   			return -EINTR;
>   		}
> @@ -1258,10 +1312,11 @@ static int perf_run_pl_client(struct perf_poll_lat_data *pldata)
>   	return 0;
>   }
>   
> -static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
> +static int perf_run_pl_server(struct perf_ext_lat_data *pldata)
>   {
> -	struct perf_peer *peer = pldata->perf->test_peer;
> -	struct ntb_dev *ntb = pldata->perf->ntb;
> +	struct perf_ctx *perf = pldata->perf;
> +	struct perf_peer *peer = perf->test_peer;
> +	struct ntb_dev *ntb = perf->ntb;
>   	void *src = pldata->src;
>   	int ret = 0;
>   
> @@ -1269,7 +1324,7 @@ static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
>   
>   	pldata->tries = 0;
>   
> -	while (ret != 1 && atomic_read(&pldata->running)) {
> +	while (ret != 1 && atomic_read(&perf->running)) {
>   		ret = perf_poll_peer_reply(peer->inbuf);
>   		if (!ret) {
>   			/* Pong to client */
> @@ -1291,44 +1346,130 @@ static int perf_run_pl_server(struct perf_poll_lat_data *pldata)
>   	dev_dbg(&ntb->dev, "poll_lat: server stopped, had responded %llu times\n",
>   		pldata->tries);
>   
> -	return atomic_read(&pldata->running) ? -ENODATA : -EINTR;
> +	return atomic_read(&perf->running) ? -ENODATA : -EINTR;
>   }
>   
> -static void perf_clear_pl(struct perf_poll_lat_data *pldata)
> +static void perf_clear_pl(struct perf_ext_lat_data *pldata)
>   {
>   	struct perf_ctx *perf = pldata->perf;
>   	struct perf_peer *peer = perf->test_peer;
>   
>   	memset(peer->inbuf, stop_word, 1);
> -	atomic_set(&pldata->running, 0);
> +	atomic_set(&perf->running, 0);
>   	wake_up(&perf->twait);
>   	kfree(pldata->src);
>   }
>   
> -static void perf_poll_lat_client_work(struct work_struct *work)
> +static struct perf_ext_lat_ops perf_pl_client_ops = {
> +	.init = perf_init_pl,
> +	.run = perf_run_pl_client,
> +	.clear = perf_clear_pl
> +};
> +
> +static struct perf_ext_lat_ops perf_pl_server_ops = {
> +	.init = perf_init_pl,
> +	.run = perf_run_pl_server,
> +	.clear = perf_clear_pl
> +};
> +
> +static int perf_init_dbl(struct perf_ext_lat_data *data)
>   {
> -	struct perf_poll_lat_data *pldata = to_pldata_clt_work(work);
> +	struct perf_ctx *perf = data->perf;
>   
> -	pldata->status = perf_init_pl(pldata);
> -	if (pldata->status)
> -		return;
> +	data->db = get_bitmask_order(ntb_db_valid_mask(perf->ntb)) - 1;
> +	dev_dbg(&perf->ntb->dev, "DB bit for latency test: %d\n", data->db);
> +
> +	if (data->db <= perf->gidx) {
> +		dev_err(&perf->ntb->dev, "No spare DoorBell found\n");
> +		data->db = -1;
> +		return -ENOSPC;
> +	}
> +
> +	return ntb_db_clear_mask(perf->ntb, BIT_ULL(data->db));
> +}
> +
> +static int perf_run_dbl_client(struct perf_ext_lat_data *data)
> +{
> +	struct perf_ctx *perf = data->perf;
> +	struct ntb_dev *ntb = perf->ntb;
> +	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;
> +
> +	dev_dbg(&ntb->dev, "db_lat: client started.\n");
> +
> +	data->tries = 0;
> +	data->latency = ktime_get();
> +
> +	if (ntb_peer_db_set(perf->ntb, BIT_ULL(data->db)))
> +		return -EIO;
> +
> +	while (ktime_get_real_fast_ns() < stop_at) {
> +		/* Avoid processor soft lock-ups */
> +		schedule();
> +
> +		if (!atomic_read(&perf->running)) {
> +			dev_err(&ntb->dev, "DoorBell latency client terminated\n");
> +			return -EINTR;
> +		}
> +	}
>   
> -	pldata->status = perf_run_pl_client(pldata);
> +	/* Stop timer */
> +	data->latency = ktime_sub(ktime_get(), data->latency);
> +
> +	if (data->tries < LAT_MIN_TRIES) {
> +		dev_err(&ntb->dev, "Too few steps (%d) to measure Latency, "
> +			"recommended > %d. Increase value of 'lat_time_ms' "
> +			"parameter\n", data->tries, LAT_MIN_TRIES);
> +		data->latency = ktime_set(0, 0);
> +		return -EINVAL;
> +	}
> +
> +	dev_dbg(&ntb->dev, "db_lat: made %llu tries, lasted %llu usecs\n",
> +		data->tries, ktime_to_us(data->latency));
> +
> +	data->latency = ns_to_ktime(ktime_divns(data->latency, data->tries));
> +
> +	dev_dbg(&ntb->dev, "db_lat: latency %llu us (%llu ns)\n",
> +		ktime_to_us(data->latency), ktime_to_ns(data->latency));
> +
> +	return 0;
> +}
> +
> +static void perf_clear_dbl(struct perf_ext_lat_data *data)
> +{
> +	struct perf_ctx *perf = data->perf;
>   
> -	perf_clear_pl(pldata);
> +	data->db = -1;
> +	ntb_db_set_mask(perf->ntb, BIT_ULL(data->db));
> +	atomic_set(&perf->running, 0);
> +	wake_up(&perf->twait);
>   }
>   
> -static void perf_poll_lat_server_work(struct work_struct *work)
> +static struct perf_ext_lat_ops perf_dbl_client_ops = {
> +	.init = perf_init_dbl,
> +	.run = perf_run_dbl_client,
> +	.clear = perf_clear_dbl
> +};
> +
> +static void perf_ext_lat_work(struct work_struct *work)
>   {
> -	struct perf_poll_lat_data *pldata = to_pldata_srv_work(work);
> +	struct perf_ext_lat_data *data = to_ext_lat_data(work);
>   
> -	pldata->status = perf_init_pl(pldata);
> -	if (pldata->status)
> +	if (!data->ops.init || !data->ops.run || !data->ops.clear) {
> +		struct perf_ctx *perf = data->perf;
> +
> +		data->status = -EFAULT;
> +		atomic_set(&perf->running, 0);
> +		wake_up(&perf->twait);
>   		return;
> +	}
>   
> -	pldata->status = perf_run_pl_server(pldata);
> +	data->status = data->ops.init(data);
> +	if (data->status)
> +		return;
>   
> -	perf_clear_pl(pldata);
> +	data->status = data->ops.run(data);
> +
> +	data->ops.clear(data);
>   }
>   
>   static int perf_set_tcnt(struct perf_ctx *perf, u8 tcnt)
> @@ -1351,10 +1492,10 @@ static void perf_terminate_test(struct perf_ctx *perf)
>   	int tidx;
>   
>   	atomic_set(&perf->tsync, -1);
> -	atomic_set(&perf->pldata.running, 0);
> +	atomic_set(&perf->running, 0);
>   	wake_up(&perf->twait);
> -	cancel_work_sync(&perf->pldata.srv_work);
> -	cancel_work_sync(&perf->pldata.clt_work);
> +	cancel_work_sync(&perf->pldata.work);
> +	cancel_work_sync(&perf->dbldata.work);
>   
>   	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
>   		wake_up(&perf->threads[tidx].dma_wait);
> @@ -1368,9 +1509,10 @@ static int perf_submit_test(struct perf_peer *peer)
>   	struct perf_thread *pthr;
>   	int tidx, ret;
>   
> -	ret = wait_for_completion_interruptible(&peer->init_comp);
> -	if (ret < 0)
> -		return ret;
> +	ret = wait_for_completion_interruptible_timeout(&peer->init_comp,
> +			msecs_to_jiffies(peer_timeout_s * 1000));
> +	if (ret <= 0)
> +		return ret ? ret : -ETIMEDOUT;
>   
>   	if (test_and_set_bit_lock(0, &perf->busy_flag))
>   		return -EBUSY;
> @@ -1400,41 +1542,58 @@ static int perf_submit_test(struct perf_peer *peer)
>   	return ret;
>   }
>   
> -static int perf_submit_poll_lat(struct perf_peer *peer, enum run_mode mode)
> +static int perf_submit_ext_lat(struct perf_peer *peer)
>   {
>   	struct perf_ctx *perf = peer->perf;
>   	int ret;
>   
> -	ret = wait_for_completion_interruptible(&peer->init_comp);
> -	if (ret < 0)
> -		return ret;
> +	ret = wait_for_completion_interruptible_timeout(&peer->init_comp,
> +			msecs_to_jiffies(peer_timeout_s * 1000));
> +	if (ret <= 0)
> +		return ret ? ret : -ETIMEDOUT;
>   
>   	if (test_and_set_bit_lock(0, &perf->busy_flag))
>   		return -EBUSY;
>   
>   	perf->test_peer = peer;
> -	atomic_set(&perf->pldata.running, 1);
> +	atomic_set(&perf->running, 1);
>   	perf->pldata.status = -ENODATA;
>   	perf->pldata.tries = 0;
>   	perf->pldata.latency = ktime_set(0, 0);
> +	perf->dbldata.status = -ENODATA;
> +	perf->dbldata.tries = 0;
> +	perf->dbldata.latency = ktime_set(0, 0);
>   
> -	switch (mode) {
> +	switch (perf->mode) {
>   	case RUN_PL_SERVER:
> -		(void)queue_work(perf_wq, &perf->pldata.srv_work);
> +		perf->pldata.ops = perf_pl_server_ops;
> +		(void)queue_work(perf_wq, &perf->pldata.work);
>   		break;
>   	case RUN_PL_CLIENT:
> -	default:
> -		(void)queue_work(perf_wq, &perf->pldata.clt_work);
> +		perf->pldata.ops = perf_pl_client_ops;
> +		(void)queue_work(perf_wq, &perf->pldata.work);
> +		break;
> +	case RUN_DBL_SERVER:
> +		ret = perf_init_dbl(&perf->dbldata);
> +		dev_dbg(&perf->ntb->dev, "db_lat: server started.\n");
> +		goto submit_exit;
> +	case RUN_DBL_CLIENT:
> +		perf->dbldata.ops = perf_dbl_client_ops;
> +		(void)queue_work(perf_wq, &perf->dbldata.work);
>   		break;
> +	default:
> +		ret = -EINVAL;
> +		goto submit_exit;
>   	}
>   
>   	ret = wait_event_interruptible(perf->twait,
> -				       !atomic_read(&perf->pldata.running));
> +				       !atomic_read(&perf->running));
>   	if (ret == -ERESTARTSYS) {
>   		perf_terminate_test(perf);
>   		ret = -EINTR;
>   	}
>   
> +submit_exit:
>   	clear_bit_unlock(0, &perf->busy_flag);
>   
>   	return ret;
> @@ -1482,30 +1641,12 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
>   		}
>   	}
>   
> -	if (perf->pldata.status != -ENODATA) {
> -		(*pos) += scnprintf(buf + *pos, size - *pos, "\n");
> -		if (perf->pldata.status) {
> -			(*pos) += scnprintf(buf + *pos, size - *pos,
> -				"poll latency: error status %d\n", perf->pldata.status);
> -		} else {
> -			if (ktime_to_us(perf->pldata.latency) < 10) {
> -				(*pos) += scnprintf(buf + *pos, size - *pos,
> -						"poll latency %llu ns\n",
> -						ktime_to_ns(perf->pldata.latency));
> -			} else {
> -				(*pos) += scnprintf(buf + *pos, size - *pos,
> -						"poll latency %llu us\n",
> -						ktime_to_us(perf->pldata.latency));
> -			}
> -		}
> -	}
> -
>   	clear_bit_unlock(0, &perf->busy_flag);
>   
>   	return 0;
>   }
>   
> -static void perf_init_threads(struct perf_ctx *perf)
> +static void perf_init_workers(struct perf_ctx *perf)
>   {
>   	struct perf_thread *pthr;
>   	int tidx;
> @@ -1513,11 +1654,15 @@ static void perf_init_threads(struct perf_ctx *perf)
>   	perf->tcnt = DEF_THREADS_CNT;
>   	perf->test_peer = &perf->peers[0];
>   	init_waitqueue_head(&perf->twait);
> +
>   	perf->pldata.perf = perf;
> -	INIT_WORK(&perf->pldata.srv_work, perf_poll_lat_server_work);
> -	INIT_WORK(&perf->pldata.clt_work, perf_poll_lat_client_work);
> +	INIT_WORK(&perf->pldata.work, perf_ext_lat_work);
>   	perf->pldata.status = -ENODATA;
>   
> +	perf->dbldata.perf = perf;
> +	INIT_WORK(&perf->dbldata.work, perf_ext_lat_work);
> +	perf->dbldata.status = -ENODATA;
> +
>   	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
>   		pthr = &perf->threads[tidx];
>   
> @@ -1529,7 +1674,7 @@ static void perf_init_threads(struct perf_ctx *perf)
>   	}
>   }
>   
> -static void perf_clear_threads(struct perf_ctx *perf)
> +static void perf_clear_workers(struct perf_ctx *perf)
>   {
>   	perf_terminate_test(perf);
>   }
> @@ -1673,8 +1818,55 @@ static const struct file_operations perf_dbgfs_run = {
>   	.write = perf_dbgfs_write_run
>   };
>   
> -static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ubuf,
> -				    size_t size, loff_t *offp, enum run_mode mode)
> +static ssize_t perf_dbgfs_read_run_pl(struct file *filep, char __user *ubuf,
> +				   size_t fsize, loff_t *offp)
> +{
> +	struct perf_ctx *perf = filep->private_data;
> +	ssize_t ret;
> +	ssize_t pos = 0;
> +	ssize_t size = PERF_BUF_LEN;
> +	char *buf;
> +
> +	if (test_and_set_bit_lock(0, &perf->busy_flag))
> +		return -EBUSY;
> +
> +	buf = kmalloc(size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	pos += scnprintf(buf + pos, size - pos,
> +		"    Peer %d test statistics:\n", perf->test_peer->pidx);
> +
> +	if (perf->pldata.status != -ENODATA) {
> +		if (perf->pldata.status) {
> +			pos += scnprintf(buf + pos, size - pos,
> +				"poll latency: error status %d\n", perf->pldata.status);
> +		} else {
> +			if (ktime_to_us(perf->pldata.latency) < 10) {
> +				pos += scnprintf(buf + pos, size - pos,
> +						"poll latency %llu ns\n",
> +						ktime_to_ns(perf->pldata.latency));
> +			} else {
> +				pos += scnprintf(buf + pos, size - pos,
> +						"poll latency %llu us\n",
> +						ktime_to_us(perf->pldata.latency));
> +			}
> +		}
> +	} else {
> +		pos += scnprintf(buf + pos, size - pos, "Test did not run\n");
> +	}
> +
> +	ret = simple_read_from_buffer(ubuf, fsize, offp, buf, pos);
> +
> +	kfree(buf);
> +
> +	clear_bit_unlock(0, &perf->busy_flag);
> +
> +	return ret;
> +}
> +
> +static ssize_t perf_dbgfs_write_run_ext(struct file *filep, const char __user *ubuf,
> +					size_t size, loff_t *offp, enum run_mode mode)
>   {
>   	struct perf_ctx *perf = filep->private_data;
>   	struct ntb_dev *ntb = perf->ntb;
> @@ -1685,50 +1877,132 @@ static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ub
>   	if (ret)
>   		return ret;
>   
> -	if (pidx < 0 && mode == RUN_PL_SERVER) {
> -		dev_dbg(&ntb->dev, "poll_lat: kill server\n");
> -		if (test_bit(0, &perf->busy_flag)) {
> -			peer = perf->test_peer;
> -			/* Send stop to client */
> -			memcpy_toio(peer->outbuf, &stop_word, 1);
> +	if (pidx < 0) {
> +		switch (mode) {
> +		case RUN_PL_SERVER:
> +			dev_dbg(&ntb->dev, "poll_lat: kill server\n");
> +			if (test_bit(0, &perf->busy_flag)) {
> +				peer = perf->test_peer;
> +				/* Send stop to client */
> +				memcpy_toio(peer->outbuf, &stop_word, 1);
> +			}
> +			perf_terminate_test(perf);
> +			clear_bit_unlock(0, &perf->busy_flag);
> +			return size;
> +		case RUN_DBL_SERVER:
> +			dev_dbg(&ntb->dev, "db_lat: kill server\n");
> +			perf_clear_dbl(&perf->dbldata);
> +			clear_bit_unlock(0, &perf->busy_flag);
> +			return size;
> +		default:
> +			return -EINVAL;
>   		}
> -		perf_terminate_test(perf);
> -		clear_bit_unlock(0, &perf->busy_flag);
> -		return size;
>   	}
>   
> -	if (pidx < 0 || pidx >= perf->pcnt)
> +	if (pidx >= perf->pcnt)
>   		return -EINVAL;
>   
>   	peer = &perf->peers[pidx];
> +	perf->mode = mode;
>   
> -	ret = perf_submit_poll_lat(peer, mode);
> +	ret = perf_submit_ext_lat(peer);
>   
>   	return ret ? ret : size;
>   }
>   
> -static ssize_t perf_dbgfs_write_run_client(struct file *filep, const char __user *ubuf,
> -				    size_t size, loff_t *offp)
> +static ssize_t perf_dbgfs_write_run_pl_client(struct file *filep,
> +					const char __user *ubuf, size_t size, loff_t *offp)
>   {
> -	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_CLIENT);
> +	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_PL_CLIENT);
>   }
>   
> -static const struct file_operations perf_dbgfs_run_client = {
> +static const struct file_operations perf_dbgfs_run_pl_client = {
>   	.open = simple_open,
> -	.read = perf_dbgfs_read_run,
> -	.write = perf_dbgfs_write_run_client
> +	.read = perf_dbgfs_read_run_pl,
> +	.write = perf_dbgfs_write_run_pl_client
>   };
>   
> -static ssize_t perf_dbgfs_write_run_server(struct file *filep, const char __user *ubuf,
> -				    size_t size, loff_t *offp)
> +static ssize_t perf_dbgfs_write_run_pl_server(struct file *filep,
> +					const char __user *ubuf, size_t size, loff_t *offp)
>   {
> -	return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_SERVER);
> +	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_PL_SERVER);
>   }
>   
> -static const struct file_operations perf_dbgfs_run_server = {
> +static const struct file_operations perf_dbgfs_run_pl_server = {
>   	.open = simple_open,
> -	.read = perf_dbgfs_read_run,
> -	.write = perf_dbgfs_write_run_server
> +	.read = perf_dbgfs_read_run_pl,
> +	.write = perf_dbgfs_write_run_pl_server
> +};
> +
> +static ssize_t perf_dbgfs_read_run_dbl(struct file *filep, char __user *ubuf,
> +				   size_t fsize, loff_t *offp)
> +{
> +	struct perf_ctx *perf = filep->private_data;
> +	ssize_t ret;
> +	ssize_t pos = 0;
> +	ssize_t size = PERF_BUF_LEN;
> +	char *buf;
> +
> +	if (test_and_set_bit_lock(0, &perf->busy_flag))
> +		return -EBUSY;
> +
> +	buf = kmalloc(size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	pos += scnprintf(buf + pos, size - pos,
> +		"    Peer %d test statistics:\n", perf->test_peer->pidx);
> +
> +	if (perf->dbldata.status != -ENODATA) {
> +		if (perf->dbldata.status) {
> +			pos += scnprintf(buf + pos, size - pos,
> +				"doorbell latency: error status %d\n", perf->dbldata.status);
> +		} else {
> +			if (ktime_to_us(perf->dbldata.latency) < 10) {
> +				pos += scnprintf(buf + pos, size - pos,
> +						"doorbell latency %llu ns\n",
> +						ktime_to_ns(perf->dbldata.latency));
> +			} else {
> +				pos += scnprintf(buf + pos, size - pos,
> +						"doorbell latency %llu us\n",
> +						ktime_to_us(perf->dbldata.latency));
> +			}
> +		}
> +	} else {
> +		pos += scnprintf(buf + pos, size - pos, "Test did not run\n");
> +	}
> +
> +	ret = simple_read_from_buffer(ubuf, fsize, offp, buf, pos);
> +
> +	kfree(buf);
> +
> +	clear_bit_unlock(0, &perf->busy_flag);
> +
> +	return ret;
> +}
> +
> +static ssize_t perf_dbgfs_write_run_dbl_client(struct file *filep,
> +					const char __user *ubuf, size_t size, loff_t *offp)
> +{
> +	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_DBL_CLIENT);
> +}
> +
> +static const struct file_operations perf_dbgfs_run_dbl_client = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_run_dbl,
> +	.write = perf_dbgfs_write_run_dbl_client
> +};
> +
> +static ssize_t perf_dbgfs_write_run_dbl_server(struct file *filep,
> +					const char __user *ubuf, size_t size, loff_t *offp)
> +{
> +	return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_DBL_SERVER);
> +}
> +
> +static const struct file_operations perf_dbgfs_run_dbl_server = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_run_dbl,
> +	.write = perf_dbgfs_write_run_dbl_server
>   };
>   
>   static ssize_t perf_dbgfs_read_tcnt(struct file *filep, char __user *ubuf,
> @@ -1782,8 +2056,7 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
>   		struct perf_thread *pthr = &perf->threads[tidx];
>   
>   		pos += scnprintf(buf + pos, buf_size - pos,
> -			"%d: made %llu tries\n",
> -			tidx, pthr->tries);
> +				"%d: made %llu tries\n", tidx, pthr->tries);
>   	}
>   
>   	ret = simple_read_from_buffer(ubuf, size, offp, buf, pos);
> @@ -1794,7 +2067,7 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
>   }
>   
>   static ssize_t perf_dbgfs_read_inbuf(struct file *filep, char __user *ubuf,
> -				    size_t size, loff_t *offp)
> +					size_t size, loff_t *offp)
>   {
>   	struct perf_ctx *perf = filep->private_data;
>   	char buf[32];
> @@ -1830,6 +2103,9 @@ static const struct file_operations perf_dbgfs_inbuf = {
>   static void perf_setup_dbgfs(struct perf_ctx *perf)
>   {
>   	struct pci_dev *pdev = perf->ntb->pdev;
> +	struct dentry *burst_lat_dir;
> +	struct dentry *poll_lat_dir;
> +	struct dentry *db_lat_dir;
>   
>   	perf->dbgfs_dir = debugfs_create_dir(pci_name(pdev), perf_dbgfs_topdir);
>   	if (!perf->dbgfs_dir) {
> @@ -1840,17 +2116,10 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
>   	debugfs_create_file("info", 0600, perf->dbgfs_dir, perf,
>   			    &perf_dbgfs_info);
>   
> -	debugfs_create_file("run", 0600, perf->dbgfs_dir, perf,
> -			    &perf_dbgfs_run);
> +	debugfs_create_symlink("run", perf->dbgfs_dir, "burst_latency/run");
>   
> -	debugfs_create_file("run_client", 0600, perf->dbgfs_dir, perf,
> -			    &perf_dbgfs_run_client);
> -
> -	debugfs_create_file("run_server", 0600, perf->dbgfs_dir, perf,
> -			    &perf_dbgfs_run_server);
> -
> -	debugfs_create_file("threads_count", 0600, perf->dbgfs_dir, perf,
> -			    &perf_dbgfs_tcnt);
> +	debugfs_create_symlink("threads_count", perf->dbgfs_dir,
> +				"burst_latency/threads_count");
>   
>   	/* They are made read-only for test exec safety and integrity */
>   	debugfs_create_u8("chunk_order", 0500, perf->dbgfs_dir, &chunk_order);
> @@ -1859,14 +2128,44 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
>   
>   	debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma);
>   
> -	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
> +	debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf,
> +			    &perf_dbgfs_inbuf);
> +
> +	/* burst_latency subdir */
> +
> +	burst_lat_dir = debugfs_create_dir("burst_latency", perf->dbgfs_dir);
> +
> +	debugfs_create_file("run", 0600, burst_lat_dir, perf, &perf_dbgfs_run);
> +
> +	debugfs_create_file("threads_count", 0600, burst_lat_dir, perf,
> +			    &perf_dbgfs_tcnt);
> +
> +	debugfs_create_file("tries", 0400, burst_lat_dir, perf,
>   			    &perf_dbgfs_lattrs);
>   
> -	debugfs_create_u64("poll_latency_tries", 0400, perf->dbgfs_dir,
> -				&perf->pldata.tries);
> +	/* poll_latency subdir */
>   
> -	debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf,
> -			    &perf_dbgfs_inbuf);
> +	poll_lat_dir = debugfs_create_dir("poll_latency", perf->dbgfs_dir);
> +
> +	debugfs_create_file("run_client", 0600, poll_lat_dir, perf,
> +			    &perf_dbgfs_run_pl_client);
> +
> +	debugfs_create_file("run_server", 0600, poll_lat_dir, perf,
> +			    &perf_dbgfs_run_pl_server);
> +
> +	debugfs_create_u64("tries", 0400, poll_lat_dir, &perf->pldata.tries);
> +
> +	/* db_latency subdir */
> +
> +	db_lat_dir = debugfs_create_dir("db_latency", perf->dbgfs_dir);
> +
> +	debugfs_create_file("run_client", 0600, db_lat_dir, perf,
> +			    &perf_dbgfs_run_dbl_client);
> +
> +	debugfs_create_file("run_server", 0600, db_lat_dir, perf,
> +			    &perf_dbgfs_run_dbl_server);
> +
> +	debugfs_create_u64("tries", 0400, db_lat_dir, &perf->dbldata.tries);
>   }
>   
>   static void perf_clear_dbgfs(struct perf_ctx *perf)
> @@ -1986,7 +2285,7 @@ static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
>   	if (ret)
>   		return ret;
>   
> -	perf_init_threads(perf);
> +	perf_init_workers(perf);
>   
>   	ret = perf_init_service(perf);
>   	if (ret)
> @@ -2009,7 +2308,7 @@ static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
>   
>   	perf_disable_service(perf);
>   
> -	perf_clear_threads(perf);
> +	perf_clear_workers(perf);
>   }
>   
>   static struct ntb_client perf_client = {

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/3] ntb_perf: extend with burst latency measurement
  2022-04-22 11:37 [PATCH v2 1/3] ntb_perf: extend with burst latency measurement Alexander Fomichev
                   ` (2 preceding siblings ...)
  2022-04-25 16:38 ` [PATCH v2 1/3] ntb_perf: extend with burst " Dave Jiang
@ 2022-04-28 11:27 ` Serge Semin
  2022-04-29 15:00   ` Alexander Fomichev
  3 siblings, 1 reply; 10+ messages in thread
From: Serge Semin @ 2022-04-28 11:27 UTC (permalink / raw)
  To: Alexander Fomichev
  Cc: ntb, linux, Jon Mason, Dave Jiang, Allen Hubbe, Guo Zhengkui,
	Alexander Fomichev

Hello Alexander

On Fri, Apr 22, 2022 at 02:37:01PM +0300, Alexander Fomichev wrote:
> From: Alexander Fomichev <a.fomichev@yadro.com>
> 
> The ntb_perf test provides a tool for NTB hardware performance
> evaluation. For software impact elimination the test uses a simple method
> (let's call it 'burst' mode), when the local system sends to the remote
> system a data set and counts time interval until hardware completion
> report, without the remote side confirming, nor data integrity check.
> The measured metric is a 'burst' throughput bandwidth of NTB connection.
> 
> The proposed change extends ntb_perf with 1 of 3 new metrics. The resulting
> test is fully backward compatible.
> 
> 1. Burst latency. It's a delay between start to send 1 byte to the
> remote system and hardware readiness to send another byte. The
> measurement performed within bandwidth test procedure. The DMA Engine is
> off. Data integrity is not checked. This mode can be disabled by
> 'perf_latency=N' module parameter.
> 
> Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
> ---
>  drivers/ntb/test/ntb_perf.c | 135 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 132 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
> index 65e1e5cf1b29..79faa4a7fe94 100644
> --- a/drivers/ntb/test/ntb_perf.c
> +++ b/drivers/ntb/test/ntb_perf.c
> @@ -106,6 +106,9 @@ MODULE_DESCRIPTION("PCIe NTB Performance Measurement Tool");
>  
>  #define PERF_BUF_LEN 1024
>  
> +#define LAT_MIN_TRIES	20
> +#define RESCHEDULE_RATIO	10000
> +
>  static unsigned long max_mw_size;
>  module_param(max_mw_size, ulong, 0644);
>  MODULE_PARM_DESC(max_mw_size, "Upper limit of memory window size");
> @@ -122,6 +125,14 @@ static bool use_dma; /* default to 0 */
>  module_param(use_dma, bool, 0644);
>  MODULE_PARM_DESC(use_dma, "Use DMA engine to measure performance");
>  
> +static bool perf_latency = true;
> +module_param(perf_latency, bool, 0644);
> +MODULE_PARM_DESC(perf_latency, "Also measure latency");
> +
> +static unsigned long lat_time_ms = 1000; /* default 1s */
> +module_param(lat_time_ms, ulong, 0644);
> +MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
> +
>  /*==============================================================================
>   *                         Perf driver data definition
>   *==============================================================================
> @@ -178,6 +189,8 @@ struct perf_thread {
>  	void *src;
>  	u64 copied;
>  	ktime_t duration;
> +	ktime_t latency;
> +	u64 tries;
>  	int status;
>  	struct work_struct work;
>  };
> @@ -783,7 +796,7 @@ static void perf_dma_copy_callback(void *data)
>  }
>  
>  static int perf_copy_chunk(struct perf_thread *pthr,
> -			   void __iomem *dst, void *src, size_t len)
> +			   void __iomem *dst, void *src, size_t len, bool use_dma_)
>  {
>  	struct dma_async_tx_descriptor *tx;
>  	struct dmaengine_unmap_data *unmap;
> @@ -794,7 +807,7 @@ static int perf_copy_chunk(struct perf_thread *pthr,
>  	void __iomem *dst_vaddr;
>  	dma_addr_t dst_dma_addr;
>  

> -	if (!use_dma) {
> +	if (!use_dma_) {

The postfixed underscore is less distinctive from the postfix-less
name than the prefixed one. Anyway seeing the dma-less code is just a
small subset of the generic perf copy method I'd suggest to just split
the method up into two: perf_dma_copy_chunk() and
perf_mem_copy_chunk(). It will be more readable to call one of them
depending on the code context or on the global 'use_dma' flag state.

>  		memcpy_toio(dst, src, len);
>  		goto ret_check_tsync;
>  	}
> @@ -940,7 +953,7 @@ static int perf_run_test(struct perf_thread *pthr)
>  
>  	/* Copied field is cleared on test launch stage */
>  	while (pthr->copied < total_size) {
> -		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size);
> +		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size, use_dma);
>  		if (ret) {
>  			dev_err(&perf->ntb->dev, "%d: Got error %d on test\n",
>  				pthr->tidx, ret);
> @@ -1018,6 +1031,66 @@ static void perf_clear_test(struct perf_thread *pthr)
>  	kfree(pthr->src);
>  }
>  
> +static int perf_run_latency(struct perf_thread *pthr)
> +{

> +	struct perf_peer *peer = pthr->perf->test_peer;
> +	struct ntb_dev *ntb = pthr->perf->ntb;
> +	void *flt_src;
> +	void __iomem *flt_dst, *bnd_dst;
> +	int ret;
> +	u64 stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC;

Please preserve the reverse xmas tree order of the local variables
definition for the consistency with the main part of the rest of the driver
methods.

Also I'd suggest to move the 'stop_at' variable initialization out of
the definition block to a some-line before the while loop. Thus the
time measurements will look more localized.

> +

> +	pthr->tries = 0;

I don't see you functionally using the 'tries' field anywhere but in
this method. What about defining it locally? (See my next comments for
more about the 'tries' field.)

> +	pthr->latency = ktime_get();
> +	flt_src = pthr->src;
> +	flt_dst = peer->outbuf;
> +	bnd_dst = peer->outbuf + peer->outbuf_size;
> +
> +	while (ktime_get_real_fast_ns() < stop_at) {
> +		ret = perf_copy_chunk(pthr, flt_dst, flt_src, 1, false);
> +		if (ret) {
> +			dev_err(&ntb->dev, "%d: Latency testing error %d\n",
> +				pthr->tidx, ret);
> +			pthr->latency = ktime_set(0, 0);
> +			return ret;
> +		}
> +
> +		pthr->tries++;
> +		flt_dst++;
> +		flt_src++;
> +
> +		if (flt_dst >= bnd_dst || flt_dst < peer->outbuf) {
> +			flt_dst = peer->outbuf;
> +			flt_src = pthr->src;
> +		}
> +
> +		/* Avoid processor soft lock-ups */
> +		if (!(pthr->tries % RESCHEDULE_RATIO))
> +			schedule();
> +	}
> +
> +	/* Stop timer */
> +	pthr->latency = ktime_sub(ktime_get(), pthr->latency);
> +


> +	if (pthr->tries < LAT_MIN_TRIES) {

Hm, why 20 is enough? If you think that at least 20 re-tries must be
performed, then why would you need the lat_time_ms parameter at all?
I'd suggest to either define a lat_tries_cnt parameter instead or drop
that parameter and use a fixed number of retries (20, 30, whatever you
think would be enough to get a more-or-less precise mean latency).

(See my last comment in this context.)

> +		dev_err(&ntb->dev, "%d: Too few steps (%d) to measure Latency, "
> +			"recommended > %d. Increase value of 'lat_time_ms' "
> +			"parameter\n", pthr->tidx, pthr->tries, LAT_MIN_TRIES);

Please no for the string breakage (see '2. Breaking long lines and
strings' in Documentation/process/coding-style.rst).

> +		pthr->latency = ktime_set(0, 0);
> +		return -EINVAL;
> +	}
> +
> +	dev_dbg(&ntb->dev, "%d: made %llu tries, lasted %llu usecs\n",
> +		pthr->tidx, pthr->tries, ktime_to_us(pthr->latency));
> +
> +	pthr->latency = ns_to_ktime(ktime_divns(pthr->latency, pthr->tries));
> +
> +	dev_dbg(&ntb->dev, "%d: latency %llu us (%llu ns)\n", pthr->tidx,
> +		ktime_to_us(pthr->latency), ktime_to_ns(pthr->latency));
> +
> +	return 0;
> +}
> +
>  static void perf_thread_work(struct work_struct *work)
>  {
>  	struct perf_thread *pthr = to_thread_work(work);
> @@ -1043,6 +1116,11 @@ static void perf_thread_work(struct work_struct *work)
>  	}
>  
>  	pthr->status = perf_sync_test(pthr);
> +	if (pthr->status)
> +		goto err_clear_test;
> +
> +	if (perf_latency)
> +		pthr->status = perf_run_latency(pthr);
>  
>  err_clear_test:
>  	perf_clear_test(pthr);
> @@ -1142,6 +1220,18 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf,
>  			"%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
>  			tidx, pthr->copied, ktime_to_us(pthr->duration),
>  			div64_u64(pthr->copied, ktime_to_us(pthr->duration)));
> +
> +		if (perf_latency && ktime_compare(pthr->latency, ktime_set(0, 0))) {
> +			if (ktime_to_us(pthr->latency) < 10) {
> +				(*pos) += scnprintf(buf + *pos, size - *pos,
> +						"%d: latency %llu ns\n",
> +						tidx, ktime_to_ns(pthr->latency));
> +			} else {
> +				(*pos) += scnprintf(buf + *pos, size - *pos,
> +						"%d: latency %llu us\n",
> +						tidx, ktime_to_us(pthr->latency));
> +			}
> +		}
>  	}
>  
>  	clear_bit_unlock(0, &perf->busy_flag);
> @@ -1344,12 +1434,48 @@ static ssize_t perf_dbgfs_write_tcnt(struct file *filep,
>  	return size;
>  }
>  
> +static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf,
> +				    size_t size, loff_t *offp)
> +{
> +	struct perf_ctx *perf = filep->private_data;
> +	char *buf;
> +	ssize_t pos, ret;
> +	size_t buf_size = min_t(size_t, size, PERF_BUF_LEN);
> +	int tidx;
> +
> +	buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	pos = scnprintf(buf, buf_size, "    Peer %d latency try count:\n",
> +			perf->test_peer->pidx);
> +
> +	for (tidx = 0; tidx < perf->tcnt; tidx++) {
> +		struct perf_thread *pthr = &perf->threads[tidx];
> +

> +		pos += scnprintf(buf + pos, buf_size - pos,
> +			"%d: made %llu tries\n",
> +			tidx, pthr->tries);

AFAIU from the patch log the main goal of this patch is to add the IO
latency measurement functionality. Why would you need to print the
number of reties then? I don't find it much informative especially
seeing you are already printing the available mean latency. I'd rather
drop the 'latency_tries' entry.

-Sergey

> +	}
> +
> +	ret = simple_read_from_buffer(ubuf, size, offp, buf, pos);
> +
> +	kfree(buf);
> +
> +	return ret;
> +}
> +
>  static const struct file_operations perf_dbgfs_tcnt = {
>  	.open = simple_open,
>  	.read = perf_dbgfs_read_tcnt,
>  	.write = perf_dbgfs_write_tcnt
>  };
>  
> +static const struct file_operations perf_dbgfs_lattrs = {
> +	.open = simple_open,
> +	.read = perf_dbgfs_read_lattrs
> +};
> +
>  static void perf_setup_dbgfs(struct perf_ctx *perf)
>  {
>  	struct pci_dev *pdev = perf->ntb->pdev;
> @@ -1375,6 +1501,9 @@ static void perf_setup_dbgfs(struct perf_ctx *perf)
>  	debugfs_create_u8("total_order", 0500, perf->dbgfs_dir, &total_order);
>  
>  	debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma);
> +
> +	debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf,
> +			    &perf_dbgfs_lattrs);
>  }
>  
>  static void perf_clear_dbgfs(struct perf_ctx *perf)
> -- 
> 2.30.2
> 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 2/3] ntb_perf: extend with poll latency measurement
  2022-04-25 16:49   ` Dave Jiang
@ 2022-04-29  8:53     ` Alexander Fomichev
  2022-04-29 17:15       ` Dave Jiang
  0 siblings, 1 reply; 10+ messages in thread
From: Alexander Fomichev @ 2022-04-29  8:53 UTC (permalink / raw)
  To: Dave Jiang
  Cc: ntb, linux, Jon Mason, Allen Hubbe, Guo Zhengkui, Alexander Fomichev

Hi Dave,

On Mon, Apr 25, 2022 at 09:49:43AM -0700, Dave Jiang wrote:
> 
> On 4/22/2022 4:37 AM, Alexander Fomichev wrote:
> > From: Alexander Fomichev <a.fomichev@yadro.com>
> > 
> > The proposed change extends ntb_perf with 2nd of 3 new metrics. The resulting
> > test is fully backward compatible.
> > 
> > 2. Poll latency. It's a delay between start to send 1 byte to the
> > remote system and receiving the confirmation. The remote system needs to
> > be run in server mode beforehand. Then the server polls the input buffer
> > and on receiving data immediately sends the confirmation back.
> 
> This is a general comment. Please make the commit logs as they are
> individual standalone patches and not refer to each other.

OK.

> 
> > 
> > Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
> > ---
> >   drivers/ntb/test/ntb_perf.c | 365 +++++++++++++++++++++++++++++++++++-
> >   1 file changed, 364 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
> > index 79faa4a7fe94..d4664410e543 100644
> > --- a/drivers/ntb/test/ntb_perf.c
> > +++ b/drivers/ntb/test/ntb_perf.c
> > @@ -85,7 +85,7 @@
> >   #include <linux/ntb.h>
> >   #define DRIVER_NAME		"ntb_perf"
> > -#define DRIVER_VERSION		"2.0"
> > +#define DRIVER_VERSION		"2.1"
> 
> Oddly random version change in the middle patch.
> 
I will re-number every feature as minor version change, accumulating your
comment to the 3rd patch.

> 
> >   MODULE_LICENSE("Dual BSD/GPL");
> >   MODULE_VERSION(DRIVER_VERSION);
> > @@ -133,6 +133,10 @@ static unsigned long lat_time_ms = 1000; /* default 1s */
> >   module_param(lat_time_ms, ulong, 0644);
> >   MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
> > +static unsigned long lat_timeout_us = 500;
> > +module_param(lat_timeout_us, ulong, 0644);
> > +MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply");
> 
> I would refrain from using the word "timeout" for variable and description
> as that's typically used for when a set time expires. Maybe "Roundtrip
> Latency" would be the better description?
> 
In this case it's a real timemout. The client polls the buffer waiting
for the server reply. If that amount of time expired and no reply word
encountered, then an error reported.


-- 
Regards,
  Alexander

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/3] ntb_perf: extend with burst latency measurement
  2022-04-28 11:27 ` Serge Semin
@ 2022-04-29 15:00   ` Alexander Fomichev
  0 siblings, 0 replies; 10+ messages in thread
From: Alexander Fomichev @ 2022-04-29 15:00 UTC (permalink / raw)
  To: Serge Semin
  Cc: ntb, linux, Jon Mason, Dave Jiang, Allen Hubbe, Guo Zhengkui,
	Alexander Fomichev

Hi Sergey,

On Thu, Apr 28, 2022 at 02:27:58PM +0300, Serge Semin wrote:
> 
> > -	if (!use_dma) {
> > +	if (!use_dma_) {
> 
> The postfixed underscore is less distinctive from the postfix-less
> name than the prefixed one. Anyway seeing the dma-less code is just a
> small subset of the generic perf copy method I'd suggest to just split
> the method up into two: perf_dma_copy_chunk() and
> perf_mem_copy_chunk(). It will be more readable to call one of them
> depending on the code context or on the global 'use_dma' flag state.
> 
Then why should we have 2 similar functions with only couple of lines
difference? I think re-using code is always preferable unless it makes
the code unclear.
Prefixed underscore is OK. Maybe it's better to rename the local
variable at all.

> 
> > +	pthr->tries = 0;
> 
> I don't see you functionally using the 'tries' field anywhere but in
> this method. What about defining it locally? (See my next comments for
> more about the 'tries' field.)
> 
> 
> > +	if (pthr->tries < LAT_MIN_TRIES) {
> 
> Hm, why 20 is enough? If you think that at least 20 re-tries must be
> performed, then why would you need the lat_time_ms parameter at all?
> I'd suggest to either define a lat_tries_cnt parameter instead or drop
> that parameter and use a fixed number of retries (20, 30, whatever you
> think would be enough to get a more-or-less precise mean latency).
> 
> 
> > +		pos += scnprintf(buf + pos, buf_size - pos,
> > +			"%d: made %llu tries\n",
> > +			tidx, pthr->tries);
> 
> AFAIU from the patch log the main goal of this patch is to add the IO
> latency measurement functionality. Why would you need to print the
> number of reties then? I don't find it much informative especially
> seeing you are already printing the available mean latency. I'd rather
> drop the 'latency_tries' entry.
> 
These all were made on purpose. The main usages for the NTB performance
measurement are to evaluate new hardware and to tune its settings. In
the real world 1 iteration can last from tens ns to hundreds ms. It's a
huge difference. And while in my opinion the best measurement results
are gained through 1000 iterations, it may turn the test system
irresponsible for several minutes in the worst case. And contrary, too
low value being hardcoded would produce quite inaccurate results in the
best case.
Therefore it's better to use a parameter, which is predictable for a
user. So LAT_MIN_TRIES macro is defined to protect the test accuracy from
too low 'lat_time_ms' value. And also sysfs entry for 'tries' variable helps
to evaluate accuracy.

-- 
Regards,
  Alexander

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 2/3] ntb_perf: extend with poll latency measurement
  2022-04-29  8:53     ` Alexander Fomichev
@ 2022-04-29 17:15       ` Dave Jiang
  0 siblings, 0 replies; 10+ messages in thread
From: Dave Jiang @ 2022-04-29 17:15 UTC (permalink / raw)
  To: Alexander Fomichev
  Cc: ntb, linux, Jon Mason, Allen Hubbe, Guo Zhengkui, Alexander Fomichev


On 4/29/2022 1:53 AM, Alexander Fomichev wrote:
> Hi Dave,
>
> On Mon, Apr 25, 2022 at 09:49:43AM -0700, Dave Jiang wrote:
>> On 4/22/2022 4:37 AM, Alexander Fomichev wrote:
>>> From: Alexander Fomichev <a.fomichev@yadro.com>
>>>
>>> The proposed change extends ntb_perf with 2nd of 3 new metrics. The resulting
>>> test is fully backward compatible.
>>>
>>> 2. Poll latency. It's a delay between start to send 1 byte to the
>>> remote system and receiving the confirmation. The remote system needs to
>>> be run in server mode beforehand. Then the server polls the input buffer
>>> and on receiving data immediately sends the confirmation back.
>> This is a general comment. Please make the commit logs as they are
>> individual standalone patches and not refer to each other.
> OK.
>
>>> Signed-off-by: Alexander Fomichev <a.fomichev@yadro.com>
>>> ---
>>>    drivers/ntb/test/ntb_perf.c | 365 +++++++++++++++++++++++++++++++++++-
>>>    1 file changed, 364 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
>>> index 79faa4a7fe94..d4664410e543 100644
>>> --- a/drivers/ntb/test/ntb_perf.c
>>> +++ b/drivers/ntb/test/ntb_perf.c
>>> @@ -85,7 +85,7 @@
>>>    #include <linux/ntb.h>
>>>    #define DRIVER_NAME		"ntb_perf"
>>> -#define DRIVER_VERSION		"2.0"
>>> +#define DRIVER_VERSION		"2.1"
>> Oddly random version change in the middle patch.
>>
> I will re-number every feature as minor version change, accumulating your
> comment to the 3rd patch.
>
>>>    MODULE_LICENSE("Dual BSD/GPL");
>>>    MODULE_VERSION(DRIVER_VERSION);
>>> @@ -133,6 +133,10 @@ static unsigned long lat_time_ms = 1000; /* default 1s */
>>>    module_param(lat_time_ms, ulong, 0644);
>>>    MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency");
>>> +static unsigned long lat_timeout_us = 500;
>>> +module_param(lat_timeout_us, ulong, 0644);
>>> +MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply");
>> I would refrain from using the word "timeout" for variable and description
>> as that's typically used for when a set time expires. Maybe "Roundtrip
>> Latency" would be the better description?
>>
> In this case it's a real timemout. The client polls the buffer waiting
> for the server reply. If that amount of time expired and no reply word
> encountered, then an error reported.
Ok
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-04-29 17:15 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-22 11:37 [PATCH v2 1/3] ntb_perf: extend with burst latency measurement Alexander Fomichev
2022-04-22 11:37 ` [PATCH v2 2/3] ntb_perf: extend with poll " Alexander Fomichev
2022-04-25 16:49   ` Dave Jiang
2022-04-29  8:53     ` Alexander Fomichev
2022-04-29 17:15       ` Dave Jiang
2022-04-22 11:37 ` [PATCH v2 3/3] ntb_perf: extend with doorbell " Alexander Fomichev
2022-04-25 16:55   ` Dave Jiang
2022-04-25 16:38 ` [PATCH v2 1/3] ntb_perf: extend with burst " Dave Jiang
2022-04-28 11:27 ` Serge Semin
2022-04-29 15:00   ` Alexander Fomichev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).