* [PATCH] ntb: ntb perf tool
@ 2016-01-05 23:37 Dave Jiang
  2016-01-06 15:05 ` Jon Mason
  0 siblings, 1 reply; 5+ messages in thread
From: Dave Jiang @ 2016-01-05 23:37 UTC (permalink / raw)
  To: allen.hubbe, jdmason; +Cc: linux-ntb

Provide raw performance data via a tool that directly accesses data
from the NTB without any software overhead. This allows measurement of
the hardware performance limit. In revision one we only support
single-direction CPU and DMA writes. Bi-directional writes will be
added eventually.

Measuring NTB performance with a DMA engine does not capture the raw
performance of the DMA engine over NTB, due to software overhead, but
it should show the peak performance attainable through the Linux DMA
driver.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/ntb/test/Kconfig    |    8 
 drivers/ntb/test/Makefile   |    1 
 drivers/ntb/test/ntb_perf.c |  755 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 764 insertions(+)
 create mode 100644 drivers/ntb/test/ntb_perf.c

diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
index 01852f9..aad598a 100644
--- a/drivers/ntb/test/Kconfig
+++ b/drivers/ntb/test/Kconfig
@@ -17,3 +17,11 @@ config NTB_TOOL
 	 functioning at a basic level.
 
 	 If unsure, say N.
+
+config NTB_PERF
+	tristate "NTB RAW Perf Measuring Tool"
+	help
+	 This is a tool to measure raw NTB performance by transferring data
+	 to and from the window without additional software interaction.
+
+	 If unsure, say N.
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
index 0ea32a3..9e77e0b 100644
--- a/drivers/ntb/test/Makefile
+++ b/drivers/ntb/test/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
 obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
+obj-$(CONFIG_NTB_PERF) += ntb_perf.o
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
new file mode 100644
index 0000000..fe0f13c
--- /dev/null
+++ b/drivers/ntb/test/ntb_perf.c
@@ -0,0 +1,755 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *   PCIe NTB Perf Linux driver
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+
+#include <linux/ntb.h>
+
+#define DRIVER_NAME		"ntb_perf"
+#define DRIVER_DESCRIPTION	"PCIe NTB Performance Measurement Tool"
+
+#define DRIVER_LICENSE		"Dual BSD/GPL"
+#define DRIVER_VERSION		"1.0"
+#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"
+
+#define PERF_LINK_DOWN_TIMEOUT	10
+#define PERF_VERSION		0xffff0001
+#define MAX_THREADS		32
+#define MAX_TEST_SIZE		1024*1024	/* 1M */
+#define MAX_SRCS		32
+#define DMA_OUT_RESOURCE_TO	50
+#define DMA_RETRIES		20
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct dentry *perf_debugfs_dir;
+
+static unsigned int seg_order = 19; /* 512K */
+module_param(seg_order, uint, 0644);
+MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment for testing");
+
+static unsigned int run_order = 32; /* 4G */
+module_param(run_order, uint, 0644);
+MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");
+
+static bool use_dma = false;
+module_param(use_dma, bool, 0644);
+MODULE_PARM_DESC(use_dma, "Use the DMA engine to measure performance");
+
+struct perf_mw {
+	phys_addr_t	phys_addr;
+	resource_size_t	phys_size;
+	resource_size_t	xlat_align;
+	resource_size_t	xlat_align_size;
+	void __iomem	*vbase;
+	size_t		xlat_size;
+	size_t		buf_size;
+	void		*virt_addr;
+	dma_addr_t	dma_addr;
+};
+
+struct perf_ctx;
+
+struct pthr_ctx {
+	struct task_struct	*thread;
+	struct perf_ctx		*perf;
+	atomic_t		dma_sync;
+	struct dma_chan		*dma_chan;
+	int			dma_prep_err;
+	int			src_idx;
+	void			*srcs[MAX_SRCS];
+};
+
+struct perf_ctx {
+	struct ntb_dev		*ntb;
+	spinlock_t		db_lock;
+	struct perf_mw		mw;
+	bool			link_is_up;
+	struct work_struct	link_cleanup;
+	struct delayed_work	link_work;
+	struct dentry		*debugfs_node_dir;
+	struct dentry		*debugfs_run;
+	struct dentry		*debugfs_threads;
+	u8			perf_threads;
+	bool			run;
+	struct pthr_ctx		pthr_ctx[MAX_THREADS];
+	atomic_t		tsync;
+};
+
+enum {
+	VERSION = 0,
+	MW_SZ_HIGH,
+	MW_SZ_LOW,
+	SPAD_MSG,
+	SPAD_ACK,
+	MAX_SPAD
+};
+
+static void perf_link_event(void *ctx)
+{
+	struct perf_ctx *perf = ctx;
+
+	if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
+		schedule_delayed_work(&perf->link_work, 0);
+	else
+		schedule_work(&perf->link_cleanup);
+}
+
+static void perf_db_event(void *ctx, int vec)
+{
+	struct perf_ctx *perf = ctx;
+	u64 db_bits, db_mask;
+
+	db_mask = ntb_db_vector_mask(perf->ntb, vec);
+	db_bits = ntb_db_read(perf->ntb);
+
+	dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
+		vec, db_mask, db_bits);
+}
+
+static const struct ntb_ctx_ops perf_ops = {
+	.link_event = perf_link_event,
+	.db_event = perf_db_event,
+};
+
+static void perf_copy_callback(void *data)
+{
+	struct pthr_ctx *pctx = data;
+
+	atomic_dec(&pctx->dma_sync);
+}
+
+static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
+			 char *src, size_t size)
+{
+	struct perf_ctx *perf = pctx->perf;
+	struct dma_async_tx_descriptor *txd;
+	struct dma_chan *chan = pctx->dma_chan;
+	struct dma_device *device;
+	struct dmaengine_unmap_data *unmap;
+	dma_cookie_t cookie;
+	size_t src_off, dst_off;
+	struct perf_mw *mw = &perf->mw;
+	u64 vbase, dst_vaddr;
+	dma_addr_t dst_phys;
+	int retries = 0;
+
+	if (!use_dma) {
+		memcpy_toio(dst, src, size);
+		return size;
+	}
+
+	if (!chan) {
+		dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
+		return -EINVAL;
+	}
+
+	device = chan->device;
+	src_off = (size_t)src & ~PAGE_MASK;
+	dst_off = (size_t)dst & ~PAGE_MASK;
+
+	if (!is_dma_copy_aligned(device, src_off, dst_off, size))
+		return -ENODEV;
+
+	vbase = (u64)(u64 *)mw->vbase;
+	dst_vaddr = (u64)(u64 *)dst;
+	dst_phys = mw->phys_addr + (dst_vaddr - vbase);
+
+	unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
+	if (!unmap)
+		return -ENOMEM;
+
+	unmap->len = size;
+	unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
+				      src_off, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(device->dev, unmap->addr[0]))
+		goto err_get_unmap;
+
+	unmap->to_cnt = 1;
+
+	do {
+		txd = device->device_prep_dma_memcpy(chan, dst_phys,
+						     unmap->addr[0],
+						     size, DMA_PREP_INTERRUPT);
+		if (!txd) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(DMA_OUT_RESOURCE_TO);
+		}
+	} while (!txd && (++retries < DMA_RETRIES));
+
+	if (!txd) {
+		pctx->dma_prep_err++;
+		goto err_get_unmap;
+	}
+
+	txd->callback = perf_copy_callback;
+	txd->callback_param = pctx;
+	dma_set_unmap(txd, unmap);
+
+	cookie = dmaengine_submit(txd);
+	if (dma_submit_error(cookie))
+		goto err_set_unmap;
+
+	atomic_inc(&pctx->dma_sync);
+	dma_async_issue_pending(chan);
+
+	return size;
+
+err_set_unmap:
+	dmaengine_unmap_put(unmap);
+err_get_unmap:
+	dmaengine_unmap_put(unmap);
+	return 0;
+}
+
+static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
+			  u64 buf_size, u64 win_size, u64 total)
+{
+	int chunks, total_chunks, i;
+	int copied_chunks = 0;
+	u64 copied = 0, result;
+	char *tmp = dst;
+	u64 perf, diff_us;
+	ktime_t kstart, kstop, kdiff;
+
+	chunks = win_size / buf_size;
+	total_chunks = total / buf_size;
+	kstart = ktime_get();
+
+	for (i = 0; i < total_chunks; i++) {
+		result = perf_copy(pctx, tmp, src, buf_size);
+		copied += result;
+		copied_chunks++;
+		if (copied_chunks == chunks) {
+			tmp = dst;
+			copied_chunks = 0;
+		} else
+			tmp += buf_size;
+
+		/* probably should schedule every 4GB? */
+		if (((copied % (1ULL << 32)) == 0) && !use_dma) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+		}
+	}
+
+	if (use_dma)
+		pr_info("%s: All DMA descriptors submitted\n", current->comm);
+
+	while (atomic_read(&pctx->dma_sync) != 0)
+		msleep(1);
+
+	kstop = ktime_get();
+	kdiff = ktime_sub(kstop, kstart);
+	diff_us = ktime_to_us(kdiff);
+
+	pr_info("%s: copied %Lu bytes\n", current->comm, copied);
+
+	pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);
+
+	perf = copied / diff_us;
+
+	pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);
+
+	return 0;
+}
+
+static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
+{
+	return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
+}
+
+static int ntb_perf_thread(void *data)
+{
+	struct pthr_ctx *pctx = data;
+	struct perf_ctx *perf = pctx->perf;
+	struct pci_dev *pdev = perf->ntb->pdev;
+	struct perf_mw *mw = &perf->mw;
+	char *dst;
+	u64 win_size, buf_size, total;
+	void *src;
+	int rc, node, i;
+	struct dma_chan *dma_chan = NULL;
+
+	pr_info("kthread %s starting...\n", current->comm);
+
+	node = dev_to_node(&pdev->dev);
+
+	if (use_dma && !pctx->dma_chan) {
+		dma_cap_mask_t dma_mask;
+
+		dma_cap_zero(dma_mask);
+		dma_cap_set(DMA_MEMCPY, dma_mask);
+		dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
+					       (void *)(unsigned long)node);
+		if (!dma_chan) {
+			pr_warn("%s: cannot acquire DMA channel, quitting\n",
+				current->comm);
+			return -ENODEV;
+		}
+		pctx->dma_chan = dma_chan;
+	}
+
+	for (i = 0; i < MAX_SRCS; i++) {
+		pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
+		if (!pctx->srcs[i]) {
+			rc = -ENOMEM;
+			goto err;
+		}
+	}
+
+	win_size = mw->phys_size;
+	buf_size = 1ULL << seg_order;
+	total = 1ULL << run_order;
+
+	if (buf_size > MAX_TEST_SIZE)
+		buf_size = MAX_TEST_SIZE;
+
+	dst = (char *)mw->vbase;
+
+	atomic_inc(&perf->tsync);
+	while (atomic_read(&perf->tsync) != perf->perf_threads)
+		schedule();
+
+	src = pctx->srcs[pctx->src_idx];
+	pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);
+
+	rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
+
+	atomic_dec(&perf->tsync);
+
+	if (rc < 0) {
+		pr_err("%s: failed\n", current->comm);
+		rc = -ENXIO;
+		goto err;
+	}
+
+	for (i = 0; i < MAX_SRCS; i++) {
+		if (pctx->srcs[i]) {
+			kfree(pctx->srcs[i]);
+			pctx->srcs[i] = NULL;
+		}
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < MAX_SRCS; i++) {
+		if (pctx->srcs[i]) {
+			kfree(pctx->srcs[i]);
+			pctx->srcs[i] = NULL;
+		}
+	}
+
+	if (dma_chan) {
+		dma_release_channel(dma_chan);
+		pctx->dma_chan = NULL;
+	}
+
+	return rc;
+}
+
+static void perf_free_mw(struct perf_ctx *perf)
+{
+	struct perf_mw *mw = &perf->mw;
+	struct pci_dev *pdev = perf->ntb->pdev;
+
+	if (!mw->virt_addr)
+		return;
+
+	ntb_mw_clear_trans(perf->ntb, 0);
+	dma_free_coherent(&pdev->dev, mw->buf_size,
+			  mw->virt_addr, mw->dma_addr);
+	mw->xlat_size = 0;
+	mw->buf_size = 0;
+	mw->virt_addr = NULL;
+}
+
+static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
+{
+	struct perf_mw *mw = &perf->mw;
+	size_t xlat_size, buf_size;
+
+	if (!size)
+		return -EINVAL;
+
+	xlat_size = round_up(size, mw->xlat_align_size);
+	buf_size = round_up(size, mw->xlat_align);
+
+	if (mw->xlat_size == xlat_size)
+		return 0;
+
+	if (mw->buf_size)
+		perf_free_mw(perf);
+
+	mw->xlat_size = xlat_size;
+	mw->buf_size = buf_size;
+
+	mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
+					   &mw->dma_addr, GFP_KERNEL);
+	if (!mw->virt_addr) {
+		mw->xlat_size = 0;
+		mw->buf_size = 0;
+	}
+
+	return 0;
+}
+
+static void perf_link_work(struct work_struct *work)
+{
+	struct perf_ctx *perf =
+		container_of(work, struct perf_ctx, link_work.work);
+	struct ntb_dev *ndev = perf->ntb;
+	struct pci_dev *pdev = ndev->pdev;
+	u32 val;
+	u64 size;
+	int rc;
+
+	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+	size = perf->mw.phys_size;
+	ntb_peer_spad_write(ndev, MW_SZ_HIGH, (u32)(size >> 32));
+	ntb_peer_spad_write(ndev, MW_SZ_LOW, (u32)size);
+	ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
+
+	/* now read what peer wrote */
+	val = ntb_spad_read(ndev, VERSION);
+	dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
+	if (val != PERF_VERSION)
+		goto out;
+
+	val = ntb_spad_read(ndev, MW_SZ_HIGH);
+	size = (u64)val << 32;
+
+	val = ntb_spad_read(ndev, MW_SZ_LOW);
+	size |= val;
+
+	dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);
+
+	rc = perf_set_mw(perf, size);
+	if (rc)
+		goto out1;
+
+	perf->link_is_up = true;
+
+	return;
+
+out1:
+	perf_free_mw(perf);
+
+out:
+	if (ntb_link_is_up(ndev, NULL, NULL) == 1)
+		schedule_delayed_work(&perf->link_work,
+				      msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
+}
+
+static void perf_link_cleanup(struct work_struct *work)
+{
+	struct perf_ctx *perf = container_of(work,
+					     struct perf_ctx,
+					     link_cleanup);
+	int i;
+
+	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+	if (!perf->link_is_up)
+		cancel_delayed_work_sync(&perf->link_work);
+
+	for (i = 0; i < MAX_SPAD; i++)
+		ntb_spad_write(perf->ntb, i, 0);
+}
+
+static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
+{
+	struct perf_mw *mw;
+	int rc;
+
+	mw = &perf->mw;
+
+	rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
+			      &mw->xlat_align, &mw->xlat_align_size);
+	if (rc)
+		return rc;
+
+	perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
+	if (!mw->vbase)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
+				size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	char *buf;
+	ssize_t ret, out_offset;
+
+	if (!perf)
+		return 0;
+
+	buf = kmalloc(64, GFP_KERNEL);
+	out_offset = snprintf(buf, 64, "%d\n", perf->run);
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+	kfree(buf);
+
+	return ret;
+}
+
+static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
+				 size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	int node, i;
+
+	if (perf->link_is_up == false)
+		return 0;
+
+	if (perf->perf_threads == 0)
+		return 0;
+
+	if (atomic_read(&perf->tsync) == 0)
+		perf->run = false;
+
+	if (perf->run == true) {
+		/* lets stop the threads */
+		perf->run = false;
+		for (i = 0; i < MAX_THREADS; i++) {
+			if (perf->pthr_ctx[i].thread) {
+				kthread_stop(perf->pthr_ctx[i].thread);
+				perf->pthr_ctx[i].thread = NULL;
+			} else
+				break;
+		}
+	} else {
+		perf->run = true;
+
+		if (perf->perf_threads > MAX_THREADS) {
+			perf->perf_threads = MAX_THREADS;
+			pr_info("Reset total threads to: %u\n", MAX_THREADS);
+		}
+
+		/* no greater than 1M */
+		if (seg_order > 20) {
+			seg_order = 20;
+			pr_info("Fix seg_order to %u\n", seg_order);
+		}
+
+		if (run_order < seg_order) {
+			run_order = seg_order;
+			pr_info("Fix run_order to %u\n", run_order);
+		}
+
+		node = dev_to_node(&perf->ntb->dev);
+		/* launch kernel thread */
+		for (i = 0; i < perf->perf_threads; i++) {
+			struct pthr_ctx *pctx;
+
+			pctx = &perf->pthr_ctx[i];
+			atomic_set(&pctx->dma_sync, 0);
+			pctx->perf = perf;
+			pctx->thread =
+				kthread_create_on_node(ntb_perf_thread,
+						       (void *)pctx,
+						       node, "ntb_perf %d", i);
+			if (pctx->thread)
+				wake_up_process(pctx->thread);
+			else {
+				perf->run = false;
+				for (i = 0; i < MAX_THREADS; i++) {
+					if (pctx->thread) {
+						kthread_stop(pctx->thread);
+						pctx->thread = NULL;
+					} else
+						break;
+				}
+			}
+
+			if (perf->run == false)
+				return -ENXIO;
+		}
+
+	}
+
+	return count;
+}
+
+static const struct file_operations ntb_perf_debugfs_run = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = debugfs_run_read,
+	.write = debugfs_run_write,
+};
+
+static int perf_debugfs_setup(struct perf_ctx *perf)
+{
+	struct pci_dev *pdev = perf->ntb->pdev;
+
+	if (!perf_debugfs_dir)
+		return -ENODEV;
+
+	perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
+						    perf_debugfs_dir);
+	if (!perf->debugfs_node_dir)
+		return -ENODEV;
+
+	perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
+						perf->debugfs_node_dir, perf,
+						&ntb_perf_debugfs_run);
+	if (!perf->debugfs_run)
+		return -ENODEV;
+
+	perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
+						  perf->debugfs_node_dir,
+						  &perf->perf_threads);
+	if (!perf->debugfs_threads)
+		return -ENODEV;
+
+	return 0;
+}
+
+static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
+{
+	struct pci_dev *pdev = ntb->pdev;
+	struct perf_ctx *perf;
+	int node;
+	int rc = 0;
+
+	node = dev_to_node(&pdev->dev);
+
+	perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
+	if (!perf) {
+		rc = -ENOMEM;
+		goto err_perf;
+	}
+
+	perf->ntb = ntb;
+	perf->perf_threads = 1;
+	atomic_set(&perf->tsync, 0);
+	perf->run = false;
+	spin_lock_init(&perf->db_lock);
+	perf_setup_mw(ntb, perf);
+	INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
+	INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
+
+	rc = ntb_set_ctx(ntb, perf, &perf_ops);
+	if (rc)
+		goto err_ctx;
+
+	perf->link_is_up = false;
+	ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+	ntb_link_event(ntb);
+
+	if (debugfs_initialized() && !perf_debugfs_dir) {
+		perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+		if (!perf_debugfs_dir)
+			goto err_ctx;
+
+		rc = perf_debugfs_setup(perf);
+		if (rc)
+			goto err_ctx;
+	}
+
+	return 0;
+
+err_ctx:
+	cancel_delayed_work_sync(&perf->link_work);
+	cancel_work_sync(&perf->link_cleanup);
+	kfree(perf);
+err_perf:
+	return rc;
+}
+
+static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
+{
+	struct perf_ctx *perf = ntb->ctx;
+	int i;
+
+	dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
+
+	cancel_delayed_work_sync(&perf->link_work);
+	cancel_work_sync(&perf->link_cleanup);
+
+	ntb_clear_ctx(ntb);
+	ntb_link_disable(ntb);
+
+	if (perf_debugfs_dir) {
+		debugfs_remove_recursive(perf_debugfs_dir);
+		perf_debugfs_dir = NULL;
+	}
+
+	if (use_dma) {
+		for (i = 0; i < MAX_THREADS; i++) {
+			struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+			if (pctx->dma_chan)
+				dma_release_channel(pctx->dma_chan);
+		}
+	}
+
+	kfree(perf);
+}
+
+static struct ntb_client perf_client = {
+	.ops = {
+		.probe = perf_probe,
+		.remove = perf_remove,
+	},
+};
+module_ntb_client(perf_client);



* Re: [PATCH] ntb: ntb perf tool
  2016-01-05 23:37 [PATCH] ntb: ntb perf tool Dave Jiang
@ 2016-01-06 15:05 ` Jon Mason
  2016-01-06 16:37   ` Allen Hubbe
  2016-01-06 16:57   ` Jiang, Dave
  0 siblings, 2 replies; 5+ messages in thread
From: Jon Mason @ 2016-01-06 15:05 UTC (permalink / raw)
  To: Dave Jiang; +Cc: Hubbe, Allen, linux-ntb

On Tue, Jan 5, 2016 at 6:37 PM, Dave Jiang <dave.jiang@intel.com> wrote:
> Provide raw performance data via a tool that directly accesses data
> from the NTB without any software overhead. This allows measurement of
> the hardware performance limit. In revision one we only support
> single-direction CPU and DMA writes. Bi-directional writes will be
> added eventually.
>
> Measuring NTB performance with a DMA engine does not capture the raw
> performance of the DMA engine over NTB, due to software overhead, but
> it should show the peak performance attainable through the Linux DMA
> driver.
>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/ntb/test/Kconfig    |    8
>  drivers/ntb/test/Makefile   |    1
>  drivers/ntb/test/ntb_perf.c |  755 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 764 insertions(+)
>  create mode 100644 drivers/ntb/test/ntb_perf.c
>
> diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
> index 01852f9..aad598a 100644
> --- a/drivers/ntb/test/Kconfig
> +++ b/drivers/ntb/test/Kconfig
> @@ -17,3 +17,11 @@ config NTB_TOOL
>          functioning at a basic level.
>
>          If unsure, say N.
> +
> +config NTB_PERF
> +       tristate "NTB RAW Perf Measuring Tool"
> +       help
> +        This is a tool to measure raw NTB performance by transferring data
> +        to and from the window without additional software interaction.
> +
> +        If unsure, say N.
> diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
> index 0ea32a3..9e77e0b 100644
> --- a/drivers/ntb/test/Makefile
> +++ b/drivers/ntb/test/Makefile
> @@ -1,2 +1,3 @@
>  obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
>  obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
> +obj-$(CONFIG_NTB_PERF) += ntb_perf.o
> diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
> new file mode 100644
> index 0000000..fe0f13c
> --- /dev/null
> +++ b/drivers/ntb/test/ntb_perf.c
> @@ -0,0 +1,755 @@
> +/*
> + * This file is provided under a dual BSD/GPLv2 license.  When using or
> + *   redistributing this file, you may do so under either license.
> + *
> + *   GPL LICENSE SUMMARY
> + *
> + *   Copyright(c) 2015 Intel Corporation. All rights reserved.
> + *
> + *   This program is free software; you can redistribute it and/or modify
> + *   it under the terms of version 2 of the GNU General Public License as
> + *   published by the Free Software Foundation.
> + *
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2015 Intel Corporation. All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + *
> + *   PCIe NTB Perf Linux driver
> + */
> +
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/kthread.h>
> +#include <linux/time.h>
> +#include <linux/timer.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/pci.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/debugfs.h>
> +#include <linux/dmaengine.h>
> +#include <linux/delay.h>
> +

nit, remove the new line above

> +#include <linux/ntb.h>
> +
> +#define DRIVER_NAME            "ntb_perf"
> +#define DRIVER_DESCRIPTION     "PCIe NTB Performance Measurement Tool"
> +
> +#define DRIVER_LICENSE         "Dual BSD/GPL"
> +#define DRIVER_VERSION         "1.0"
> +#define DRIVER_AUTHOR          "Dave Jiang <dave.jiang@intel.com>"
> +
> +#define PERF_LINK_DOWN_TIMEOUT 10
> +#define PERF_VERSION           0xffff0001
> +#define MAX_THREADS            32
> +#define MAX_TEST_SIZE          1024*1024       /* 1M */

using SZ_1M would be better than defining it here.
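
For example (untested; SZ_1M comes from <linux/sizes.h>):

	#include <linux/sizes.h>

	#define MAX_TEST_SIZE		SZ_1M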

> +#define MAX_SRCS               32

Is this related to max threads above?  If so, make it more obvious and
tie them together.

> +#define DMA_OUT_RESOURCE_TO    50
> +#define DMA_RETRIES            20
> +
> +MODULE_LICENSE(DRIVER_LICENSE);
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
> +
> +static struct dentry *perf_debugfs_dir;
> +
> +static unsigned int seg_order = 19; /* 512K */
> +module_param(seg_order, uint, 0644);
> +MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment for testing");
> +
> +static unsigned int run_order = 32; /* 4G */
> +module_param(run_order, uint, 0644);
> +MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");
> +
> +static bool use_dma = false;
> +module_param(use_dma, bool, 0644);
> +MODULE_PARM_DESC(use_dma, "Use the DMA engine to measure performance");
> +
> +struct perf_mw {
> +       phys_addr_t     phys_addr;
> +       resource_size_t phys_size;
> +       resource_size_t xlat_align;
> +       resource_size_t xlat_align_size;
> +       void __iomem    *vbase;
> +       size_t          xlat_size;
> +       size_t          buf_size;
> +       void            *virt_addr;
> +       dma_addr_t      dma_addr;
> +};
> +
> +struct perf_ctx;
> +
> +struct pthr_ctx {
> +       struct task_struct      *thread;
> +       struct perf_ctx         *perf;
> +       atomic_t                dma_sync;
> +       struct dma_chan         *dma_chan;
> +       int                     dma_prep_err;
> +       int                     src_idx;
> +       void                    *srcs[MAX_SRCS];
> +};
> +
> +struct perf_ctx {
> +       struct ntb_dev          *ntb;
> +       spinlock_t              db_lock;
> +       struct perf_mw          mw;
> +       bool                    link_is_up;
> +       struct work_struct      link_cleanup;
> +       struct delayed_work     link_work;
> +       struct dentry           *debugfs_node_dir;
> +       struct dentry           *debugfs_run;
> +       struct dentry           *debugfs_threads;
> +       u8                      perf_threads;
> +       bool                    run;
> +       struct pthr_ctx         pthr_ctx[MAX_THREADS];
> +       atomic_t                tsync;
> +};
> +
> +enum {
> +       VERSION = 0,
> +       MW_SZ_HIGH,
> +       MW_SZ_LOW,
> +       SPAD_MSG,
> +       SPAD_ACK,
> +       MAX_SPAD
> +};
> +
> +static void perf_link_event(void *ctx)
> +{
> +       struct perf_ctx *perf = ctx;
> +
> +       if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)

The comparison to one seems unnecessary

> +               schedule_delayed_work(&perf->link_work, 0);
> +       else
> +               schedule_work(&perf->link_cleanup);
> +}
> +
> +static void perf_db_event(void *ctx, int vec)
> +{
> +       struct perf_ctx *perf = ctx;
> +       u64 db_bits, db_mask;
> +
> +       db_mask = ntb_db_vector_mask(perf->ntb, vec);
> +       db_bits = ntb_db_read(perf->ntb);
> +
> +       dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
> +               vec, db_mask, db_bits);
> +}
> +
> +static const struct ntb_ctx_ops perf_ops = {
> +       .link_event = perf_link_event,
> +       .db_event = perf_db_event,
> +};
> +
> +static void perf_copy_callback(void *data)
> +{
> +       struct pthr_ctx *pctx = data;
> +
> +       atomic_dec(&pctx->dma_sync);
> +}
> +
> +static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
> +                        char *src, size_t size)
> +{
> +       struct perf_ctx *perf = pctx->perf;
> +       struct dma_async_tx_descriptor *txd;
> +       struct dma_chan *chan = pctx->dma_chan;
> +       struct dma_device *device;
> +       struct dmaengine_unmap_data *unmap;
> +       dma_cookie_t cookie;
> +       size_t src_off, dst_off;
> +       struct perf_mw *mw = &perf->mw;
> +       u64 vbase, dst_vaddr;
> +       dma_addr_t dst_phys;
> +       int retries = 0;
> +
> +       if (!use_dma) {
> +               memcpy_toio(dst, src, size);
> +               return size;
> +       }
> +
> +       if (!chan) {
> +               dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
> +               return -EINVAL;
> +       }
> +
> +       device = chan->device;
> +       src_off = (size_t)src & ~PAGE_MASK;
> +       dst_off = (size_t)dst & ~PAGE_MASK;
> +
> +       if (!is_dma_copy_aligned(device, src_off, dst_off, size))
> +               return -ENODEV;
> +
> +       vbase = (u64)(u64 *)mw->vbase;
> +       dst_vaddr = (u64)(u64 *)dst;
> +       dst_phys = mw->phys_addr + (dst_vaddr - vbase);
> +
> +       unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
> +       if (!unmap)
> +               return -ENOMEM;
> +
> +       unmap->len = size;
> +       unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
> +                                     src_off, size, DMA_TO_DEVICE);
> +       if (dma_mapping_error(device->dev, unmap->addr[0]))
> +               goto err_get_unmap;
> +
> +       unmap->to_cnt = 1;
> +
> +       do {
> +               txd = device->device_prep_dma_memcpy(chan, dst_phys,
> +                                                    unmap->addr[0],
> +                                                    size, DMA_PREP_INTERRUPT);
> +               if (!txd) {
> +                       set_current_state(TASK_INTERRUPTIBLE);
> +                       schedule_timeout(DMA_OUT_RESOURCE_TO);
> +               }
> +       } while (!txd && (++retries < DMA_RETRIES));
> +
> +       if (!txd) {
> +               pctx->dma_prep_err++;
> +               goto err_get_unmap;
> +       }
> +
> +       txd->callback = perf_copy_callback;
> +       txd->callback_param = pctx;
> +       dma_set_unmap(txd, unmap);
> +
> +       cookie = dmaengine_submit(txd);
> +       if (dma_submit_error(cookie))
> +               goto err_set_unmap;
> +
> +       atomic_inc(&pctx->dma_sync);
> +       dma_async_issue_pending(chan);
> +
> +       return size;
> +
> +err_set_unmap:
> +       dmaengine_unmap_put(unmap);
> +err_get_unmap:
> +       dmaengine_unmap_put(unmap);
> +       return 0;
> +}
> +
> +static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
> +                         u64 buf_size, u64 win_size, u64 total)
> +{
> +       int chunks, total_chunks, i;
> +       int copied_chunks = 0;
> +       u64 copied = 0, result;
> +       char *tmp = dst;
> +       u64 perf, diff_us;
> +       ktime_t kstart, kstop, kdiff;
> +
> +       chunks = win_size / buf_size;
> +       total_chunks = total / buf_size;
> +       kstart = ktime_get();
> +
> +       for (i = 0; i < total_chunks; i++) {
> +               result = perf_copy(pctx, tmp, src, buf_size);
> +               copied += result;
> +               copied_chunks++;
> +               if (copied_chunks == chunks) {
> +                       tmp = dst;
> +                       copied_chunks = 0;
> +               } else
> +                       tmp += buf_size;
> +
> +               /* probably should schedule every 4GB? */
> +               if (((copied % (1ULL << 32)) == 0) && !use_dma) {

I'd prefer you #define SZ_4G and use it above
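
Something along these lines (untested sketch; SZ_4G is not in
<linux/sizes.h>, so it would need a local define):

	#define SZ_4G	(1ULL << 32)
	...
		if (((copied % SZ_4G) == 0) && !use_dma) {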

> +                       set_current_state(TASK_INTERRUPTIBLE);
> +                       schedule();
> +               }
> +       }
> +
> +       if (use_dma)
> +               pr_info("%s: All DMA descriptors submitted\n", current->comm);
> +
> +       while (atomic_read(&pctx->dma_sync) != 0)
> +               msleep(1);
> +
> +       kstop = ktime_get();
> +       kdiff = ktime_sub(kstop, kstart);
> +       diff_us = ktime_to_us(kdiff);
> +
> +       pr_info("%s: copied %Lu bytes\n", current->comm, copied);
> +
> +       pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);
> +
> +       perf = copied / diff_us;
> +
> +       pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);
> +
> +       return 0;
> +}
> +
> +static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
> +{
> +       return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
> +}
> +
> +static int ntb_perf_thread(void *data)
> +{
> +       struct pthr_ctx *pctx = data;
> +       struct perf_ctx *perf = pctx->perf;
> +       struct pci_dev *pdev = perf->ntb->pdev;
> +       struct perf_mw *mw = &perf->mw;
> +       char *dst;
> +       u64 win_size, buf_size, total;
> +       void *src;
> +       int rc, node, i;
> +       struct dma_chan *dma_chan = NULL;
> +
> +       pr_info("kthread %s starting...\n", current->comm);
> +
> +       node = dev_to_node(&pdev->dev);
> +
> +       if (use_dma && !pctx->dma_chan) {
> +               dma_cap_mask_t dma_mask;
> +
> +               dma_cap_zero(dma_mask);
> +               dma_cap_set(DMA_MEMCPY, dma_mask);
> +               dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
> +                                              (void *)(unsigned long)node);
> +               if (!dma_chan) {
> +                       pr_warn("%s: cannot acquire DMA channel, quitting\n",
> +                               current->comm);
> +                       return -ENODEV;
> +               }
> +               pctx->dma_chan = dma_chan;
> +       }
> +
> +       for (i = 0; i < MAX_SRCS; i++) {
> +               pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
> +               if (!pctx->srcs[i]) {
> +                       rc = -ENOMEM;
> +                       goto err;
> +               }
> +       }
> +
> +       win_size = mw->phys_size;
> +       buf_size = 1ULL << seg_order;
> +       total = 1ULL << run_order;
> +
> +       if (buf_size > MAX_TEST_SIZE)
> +               buf_size = MAX_TEST_SIZE;
> +
> +       dst = (char *)mw->vbase;
> +
> +       atomic_inc(&perf->tsync);
> +       while (atomic_read(&perf->tsync) != perf->perf_threads)
> +               schedule();
> +
> +       src = pctx->srcs[pctx->src_idx];
> +       pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);
> +
> +       rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
> +
> +       atomic_dec(&perf->tsync);
> +
> +       if (rc < 0) {
> +               pr_err("%s: failed\n", current->comm);
> +               rc = -ENXIO;
> +               goto err;
> +       }
> +
> +       for (i = 0; i < MAX_SRCS; i++) {
> +               if (pctx->srcs[i]) {
> +                       kfree(pctx->srcs[i]);
> +                       pctx->srcs[i] = NULL;
> +               }
> +       }
> +
> +       return 0;
> +
> +err:
> +       for (i = 0; i < MAX_SRCS; i++) {
> +               if (pctx->srcs[i]) {
> +                       kfree(pctx->srcs[i]);
> +                       pctx->srcs[i] = NULL;
> +               }
> +       }
> +
> +       if (dma_chan) {
> +               dma_release_channel(dma_chan);
> +               pctx->dma_chan = NULL;
> +       }
> +
> +       return rc;
> +}
> +
> +static void perf_free_mw(struct perf_ctx *perf)
> +{
> +       struct perf_mw *mw = &perf->mw;
> +       struct pci_dev *pdev = perf->ntb->pdev;
> +
> +       if (!mw->virt_addr)
> +               return;
> +
> +       ntb_mw_clear_trans(perf->ntb, 0);
> +       dma_free_coherent(&pdev->dev, mw->buf_size,
> +                         mw->virt_addr, mw->dma_addr);
> +       mw->xlat_size = 0;
> +       mw->buf_size = 0;
> +       mw->virt_addr = NULL;
> +}
> +
> +static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
> +{
> +       struct perf_mw *mw = &perf->mw;
> +       size_t xlat_size, buf_size;
> +
> +       if (!size)
> +               return -EINVAL;
> +
> +       xlat_size = round_up(size, mw->xlat_align_size);
> +       buf_size = round_up(size, mw->xlat_align);
> +
> +       if (mw->xlat_size == xlat_size)
> +               return 0;
> +
> +       if (mw->buf_size)
> +               perf_free_mw(perf);
> +
> +       mw->xlat_size = xlat_size;
> +       mw->buf_size = buf_size;
> +
> +       mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
> +                                          &mw->dma_addr, GFP_KERNEL);
> +       if (!mw->virt_addr) {
> +               mw->xlat_size = 0;
> +               mw->buf_size = 0;
> +       }
> +
> +       return 0;
> +}
> +
> +static void perf_link_work(struct work_struct *work)
> +{
> +       struct perf_ctx *perf =
> +               container_of(work, struct perf_ctx, link_work.work);
> +       struct ntb_dev *ndev = perf->ntb;
> +       struct pci_dev *pdev = ndev->pdev;
> +       u32 val;
> +       u64 size;
> +       int rc;
> +
> +       dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
> +
> +       size = perf->mw.phys_size;
> +       ntb_peer_spad_write(ndev, MW_SZ_HIGH, (u32)(size >> 32));

Please use upper_32_bits(n)

> +       ntb_peer_spad_write(ndev, MW_SZ_LOW, (u32)size);

Please use lower_32_bits(n)
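
i.e. (untested):

	ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
	ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));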

> +       ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
> +
> +       /* now read what peer wrote */
> +       val = ntb_spad_read(ndev, VERSION);
> +       dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
> +       if (val != PERF_VERSION)

It might be better to move the debug print below the if check and
make it an error/warning.  That way it becomes more obvious if this is
ever triggered.
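
Something like (untested sketch):

	val = ntb_spad_read(ndev, VERSION);
	if (val != PERF_VERSION) {
		dev_warn(&pdev->dev, "Remote version mismatch: %#x\n", val);
		goto out;
	}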

> +               goto out;
> +
> +       val = ntb_spad_read(ndev, MW_SZ_HIGH);
> +       size = (u64)val << 32;
> +
> +       val = ntb_spad_read(ndev, MW_SZ_LOW);
> +       size |= val;
> +
> +       dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);
> +
> +       rc = perf_set_mw(perf, size);
> +       if (rc)
> +               goto out1;
> +
> +       perf->link_is_up = true;
> +
> +       return;
> +
> +out1:
> +       perf_free_mw(perf);
> +
> +out:
> +       if (ntb_link_is_up(ndev, NULL, NULL) == 1)
> +               schedule_delayed_work(&perf->link_work,
> +                                     msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
> +}
> +
> +static void perf_link_cleanup(struct work_struct *work)
> +{
> +       struct perf_ctx *perf = container_of(work,
> +                                            struct perf_ctx,
> +                                            link_cleanup);
> +       int i;
> +
> +       dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
> +
> +       if (!perf->link_is_up)
> +               cancel_delayed_work_sync(&perf->link_work);
> +
> +       for (i = 0; i < MAX_SPAD; i++)

Complete aside here, but we might need to add logic to the core NTB
code to query the number of available SPADs and error out if we try to
get more than that.

> +               ntb_spad_write(perf->ntb, i, 0);
> +}
> +
> +static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
> +{
> +       struct perf_mw *mw;
> +       int rc;
> +
> +       mw = &perf->mw;
> +
> +       rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
> +                             &mw->xlat_align, &mw->xlat_align_size);
> +       if (rc)
> +               return rc;
> +
> +       perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
> +       if (!mw->vbase)
> +               return -ENOMEM;
> +
> +       return 0;
> +}
> +
> +static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
> +                               size_t count, loff_t *offp)
> +{
> +       struct perf_ctx *perf = filp->private_data;
> +       char *buf;
> +       ssize_t ret, out_offset;
> +
> +       if (!perf)
> +               return 0;
> +
> +       buf = kmalloc(64, GFP_KERNEL);
> +       out_offset = snprintf(buf, 64, "%d\n", perf->run);
> +       ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
> +       kfree(buf);
> +
> +       return ret;
> +}
> +
> +static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
> +                                size_t count, loff_t *offp)
> +{
> +       struct perf_ctx *perf = filp->private_data;
> +       int node, i;
> +
> +       if (perf->link_is_up == false)

Style suggestion, !perf->link_is_up

> +               return 0;
> +
> +       if (perf->perf_threads == 0)
> +               return 0;
> +
> +       if (atomic_read(&perf->tsync) == 0)
> +               perf->run = false;
> +
> +       if (perf->run == true) {

Style suggestion, "if (perf->run) {"

> +               /* lets stop the threads */
> +               perf->run = false;
> +               for (i = 0; i < MAX_THREADS; i++) {
> +                       if (perf->pthr_ctx[i].thread) {
> +                               kthread_stop(perf->pthr_ctx[i].thread);
> +                               perf->pthr_ctx[i].thread = NULL;
> +                       } else
> +                               break;
> +               }
> +       } else {
> +               perf->run = true;
> +
> +               if (perf->perf_threads > MAX_THREADS) {
> +                       perf->perf_threads = MAX_THREADS;
> +                       pr_info("Reset total threads to: %u\n", MAX_THREADS);
> +               }
> +
> +               /* no greater than 1M */
> +               if (seg_order > 20) {

SZ_1M / seg size?
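
i.e. express the bound in terms of the segment size instead of a magic
20, something like (untested; ilog2() is from <linux/log2.h>):

	if (seg_order > ilog2(SZ_1M)) {
		seg_order = ilog2(SZ_1M);
		pr_info("Fix seg_order to %u\n", seg_order);
	}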

> +                       seg_order = 20;
> +                       pr_info("Fix seg_order to %u\n", seg_order);
> +               }
> +
> +               if (run_order < seg_order) {
> +                       run_order = seg_order;
> +                       pr_info("Fix run_order to %u\n", run_order);
> +               }
> +
> +               node = dev_to_node(&perf->ntb->dev);
> +               /* launch kernel thread */
> +               for (i = 0; i < perf->perf_threads; i++) {
> +                       struct pthr_ctx *pctx;
> +
> +                       pctx = &perf->pthr_ctx[i];
> +                       atomic_set(&pctx->dma_sync, 0);
> +                       pctx->perf = perf;
> +                       pctx->thread =
> +                               kthread_create_on_node(ntb_perf_thread,
> +                                                      (void *)pctx,
> +                                                      node, "ntb_perf %d", i);
> +                       if (pctx->thread)
> +                               wake_up_process(pctx->thread);
> +                       else {
> +                               perf->run = false;
> +                               for (i = 0; i < MAX_THREADS; i++) {
> +                                       if (pctx->thread) {
> +                                               kthread_stop(pctx->thread);
> +                                               pctx->thread = NULL;
> +                                       } else
> +                                               break;

It might be simpler to simply run through them all and stop them
(instead of checking for the first empty and quitting)
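
Something like (untested sketch; this also avoids reusing the stale
pctx from the outer loop):

	perf->run = false;
	for (i = 0; i < MAX_THREADS; i++) {
		if (perf->pthr_ctx[i].thread) {
			kthread_stop(perf->pthr_ctx[i].thread);
			perf->pthr_ctx[i].thread = NULL;
		}
	}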

> +                               }
> +                       }
> +
> +                       if (perf->run == false)
> +                               return -ENXIO;
> +               }
> +
> +       }
> +
> +       return count;
> +}
> +
> +static const struct file_operations ntb_perf_debugfs_run = {
> +       .owner = THIS_MODULE,
> +       .open = simple_open,
> +       .read = debugfs_run_read,
> +       .write = debugfs_run_write,
> +};
> +
> +static int perf_debugfs_setup(struct perf_ctx *perf)
> +{
> +       struct pci_dev *pdev = perf->ntb->pdev;
> +
> +       if (!perf_debugfs_dir)
> +               return -ENODEV;
> +
> +       perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
> +                                                   perf_debugfs_dir);
> +       if (!perf->debugfs_node_dir)
> +               return -ENODEV;
> +
> +       perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
> +                                               perf->debugfs_node_dir, perf,
> +                                               &ntb_perf_debugfs_run);
> +       if (!perf->debugfs_run)
> +               return -ENODEV;
> +
> +       perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
> +                                                 perf->debugfs_node_dir,
> +                                                 &perf->perf_threads);
> +       if (!perf->debugfs_threads)
> +               return -ENODEV;
> +
> +       return 0;
> +}
> +
> +static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
> +{
> +       struct pci_dev *pdev = ntb->pdev;
> +       struct perf_ctx *perf;
> +       int node;
> +       int rc = 0;
> +
> +       node = dev_to_node(&pdev->dev);
> +
> +       perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
> +       if (!perf) {
> +               rc = -ENOMEM;
> +               goto err_perf;
> +       }
> +
> +       perf->ntb = ntb;
> +       perf->perf_threads = 1;
> +       atomic_set(&perf->tsync, 0);
> +       perf->run = false;
> +       spin_lock_init(&perf->db_lock);
> +       perf_setup_mw(ntb, perf);
> +       INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
> +       INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
> +
> +       rc = ntb_set_ctx(ntb, perf, &perf_ops);
> +       if (rc)
> +               goto err_ctx;
> +
> +       perf->link_is_up = false;
> +       ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
> +       ntb_link_event(ntb);
> +
> +       if (debugfs_initialized() && !perf_debugfs_dir) {
> +               perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
> +               if (!perf_debugfs_dir)
> +                       goto err_ctx;
> +
> +               rc = perf_debugfs_setup(perf);
> +               if (rc)
> +                       goto err_ctx;
> +       }
> +
> +       return 0;
> +
> +err_ctx:
> +       cancel_delayed_work_sync(&perf->link_work);
> +       cancel_work_sync(&perf->link_cleanup);
> +       kfree(perf);
> +err_perf:
> +       return rc;
> +}
> +
> +static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
> +{
> +       struct perf_ctx *perf = ntb->ctx;
> +       int i;
> +
> +       dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
> +
> +       cancel_delayed_work_sync(&perf->link_work);
> +       cancel_work_sync(&perf->link_cleanup);
> +
> +       ntb_clear_ctx(ntb);
> +       ntb_link_disable(ntb);
> +
> +       if (perf_debugfs_dir) {
> +               debugfs_remove_recursive(perf_debugfs_dir);
> +               perf_debugfs_dir = NULL;
> +       }
> +
> +       if (use_dma) {
> +               for (i = 0; i < MAX_THREADS; i++) {
> +                       struct pthr_ctx *pctx = &perf->pthr_ctx[i];
> +                       if (pctx->dma_chan)
> +                               dma_release_channel(pctx->dma_chan);
> +               }
> +       }
> +
> +       kfree(perf);
> +}
> +
> +static struct ntb_client perf_client = {
> +       .ops = {
> +               .probe = perf_probe,
> +               .remove = perf_remove,
> +       },
> +};
> +module_ntb_client(perf_client);

A few minor things, but overall looks good.  Please fix them up and resubmit.

Thanks,
Jon


* RE: [PATCH] ntb: ntb perf tool
  2016-01-06 15:05 ` Jon Mason
@ 2016-01-06 16:37   ` Allen Hubbe
  2016-01-06 17:45     ` Jon Mason
  2016-01-06 16:57   ` Jiang, Dave
  1 sibling, 1 reply; 5+ messages in thread
From: Allen Hubbe @ 2016-01-06 16:37 UTC (permalink / raw)
  To: 'Jon Mason', 'Dave Jiang'; +Cc: linux-ntb

Jon Mason <jdmason@kudzu.us>:
> On Tue, Jan 5, 2016 at 6:37 PM, Dave Jiang <dave.jiang@intel.com> wrote:

> > +static void perf_link_event(void *ctx)
> > +{
> > +       struct perf_ctx *perf = ctx;
> > +
> > +       if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
> 
> The comparison to one seems unnecessary
> 

The API doc says it returns one, zero, or negative to indicate an error.  The Intel NTB driver does not return an error.  Would you prefer to change the API, to make the return simply a boolean, and have some other mechanism to query an error?
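
For reference, a caller honoring the current contract would look
something like (untested):

	rc = ntb_link_is_up(perf->ntb, NULL, NULL);
	if (rc < 0)
		dev_err(&perf->ntb->dev, "link state query failed: %d\n", rc);
	else if (rc == 1)
		schedule_delayed_work(&perf->link_work, 0);
	else
		schedule_work(&perf->link_cleanup);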

> > +       if (!perf->link_is_up)
> > +               cancel_delayed_work_sync(&perf->link_work);
> > +
> > +       for (i = 0; i < MAX_SPAD; i++)
> 
> Complete aside here, but we might need to add logic to the core NTB
> code to query the number of available SPADs and error out of we try to
> get more than that.

Use ntb_spad_count(), and the count does vary.  For example, there are half as many spads in RP/TB topology as in B2B.
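
e.g. (untested):

	count = ntb_spad_count(perf->ntb);
	for (i = 0; i < count; i++)
		ntb_spad_write(perf->ntb, i, 0);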




* Re: [PATCH] ntb: ntb perf tool
  2016-01-06 15:05 ` Jon Mason
  2016-01-06 16:37   ` Allen Hubbe
@ 2016-01-06 16:57   ` Jiang, Dave
  1 sibling, 0 replies; 5+ messages in thread
From: Jiang, Dave @ 2016-01-06 16:57 UTC (permalink / raw)
  To: jdmason; +Cc: allen.hubbe, linux-ntb

On Wed, 2016-01-06 at 10:05 -0500, Jon Mason wrote:
> On Tue, Jan 5, 2016 at 6:37 PM, Dave Jiang <dave.jiang@intel.com>
> wrote:
> > Provide raw performance data via a tool that directly accesses data
> > from the NTB without any software overhead. This allows measurement
> > of the hardware performance limit. In revision one we only support
> > single-direction CPU and DMA writes. Bi-directional writes will be
> > added eventually.
> > 
> > Measuring NTB performance with a DMA engine does not capture the
> > raw performance of the DMA engine over NTB, due to software
> > overhead, but it should show the peak performance attainable
> > through the Linux DMA driver.
> > 
> > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > ---
> >  drivers/ntb/test/Kconfig    |    8
> >  drivers/ntb/test/Makefile   |    1
> >  drivers/ntb/test/ntb_perf.c |  755
> > +++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 764 insertions(+)
> >  create mode 100644 drivers/ntb/test/ntb_perf.c
> > 
> > diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
> > index 01852f9..aad598a 100644
> > --- a/drivers/ntb/test/Kconfig
> > +++ b/drivers/ntb/test/Kconfig
> > @@ -17,3 +17,11 @@ config NTB_TOOL
> >          functioning at a basic level.
> > 
> >          If unsure, say N.
> > +
> > +config NTB_PERF
> > +       tristate "NTB RAW Perf Measuring Tool"
> > +       help
> > +        This is a tool to measure raw NTB performance by
> > transferring data
> > +        to and from the window without additional software
> > interaction.
> > +
> > +        If unsure, say N.
> > diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
> > index 0ea32a3..9e77e0b 100644
> > --- a/drivers/ntb/test/Makefile
> > +++ b/drivers/ntb/test/Makefile
> > @@ -1,2 +1,3 @@
> >  obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
> >  obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
> > +obj-$(CONFIG_NTB_PERF) += ntb_perf.o
> > diff --git a/drivers/ntb/test/ntb_perf.c
> > b/drivers/ntb/test/ntb_perf.c
> > new file mode 100644
> > index 0000000..fe0f13c
> > --- /dev/null
> > +++ b/drivers/ntb/test/ntb_perf.c
> > @@ -0,0 +1,755 @@
> > +/*
> > + * This file is provided under a dual BSD/GPLv2 license.  When
> > using or
> > + *   redistributing this file, you may do so under either license.
> > + *
> > + *   GPL LICENSE SUMMARY
> > + *
> > + *   Copyright(c) 2015 Intel Corporation. All rights reserved.
> > + *
> > + *   This program is free software; you can redistribute it and/or
> > modify
> > + *   it under the terms of version 2 of the GNU General Public
> > License as
> > + *   published by the Free Software Foundation.
> > + *
> > + *   BSD LICENSE
> > + *
> > + *   Copyright(c) 2015 Intel Corporation. All rights reserved.
> > + *
> > + *   Redistribution and use in source and binary forms, with or
> > without
> > + *   modification, are permitted provided that the following
> > conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above
> > copyright
> > + *       notice, this list of conditions and the following
> > disclaimer.
> > + *     * Redistributions in binary form must reproduce the above
> > copyright
> > + *       notice, this list of conditions and the following
> > disclaimer in
> > + *       the documentation and/or other materials provided with
> > the
> > + *       distribution.
> > + *     * Neither the name of Intel Corporation nor the names of
> > its
> > + *       contributors may be used to endorse or promote products
> > derived
> > + *       from this software without specific prior written
> > permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> > CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> > NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> > FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> > COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> > INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> > NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> > OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> > AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
> > OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> > THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> > DAMAGE.
> > + *
> > + *   PCIe NTB Perf Linux driver
> > + */
> > +
> > +#include <linux/init.h>
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/kthread.h>
> > +#include <linux/time.h>
> > +#include <linux/timer.h>
> > +#include <linux/dma-mapping.h>
> > +#include <linux/pci.h>
> > +#include <linux/slab.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/debugfs.h>
> > +#include <linux/dmaengine.h>
> > +#include <linux/delay.h>
> > +
> 
> nit, remove the new line above
> 
> > +#include <linux/ntb.h>
> > +
> > +#define DRIVER_NAME            "ntb_perf"
> > +#define DRIVER_DESCRIPTION     "PCIe NTB Performance Measurement
> > Tool"
> > +
> > +#define DRIVER_LICENSE         "Dual BSD/GPL"
> > +#define DRIVER_VERSION         "1.0"
> > +#define DRIVER_AUTHOR          "Dave Jiang <dave.jiang@intel.com>"
> > +
> > +#define PERF_LINK_DOWN_TIMEOUT 10
> > +#define PERF_VERSION           0xffff0001
> > +#define MAX_THREADS            32
> > +#define MAX_TEST_SIZE          1024*1024       /* 1M */
> 
> using SZ_1M would be better than defining it here.
> 
> > +#define MAX_SRCS               32
> 
> Is this related to max threads above?  If so, make it more obvious
> and tie them together.

No, these are arbitrary and not related.


> 
> > +#define DMA_OUT_RESOURCE_TO    50
> > +#define DMA_RETRIES            20
> > +
> > +MODULE_LICENSE(DRIVER_LICENSE);
> > +MODULE_VERSION(DRIVER_VERSION);
> > +MODULE_AUTHOR(DRIVER_AUTHOR);
> > +MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
> > +
> > +static struct dentry *perf_debugfs_dir;
> > +
> > +static unsigned int seg_order = 19; /* 512K */
> > +module_param(seg_order, uint, 0644);
> > +MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment
> > for testing");
> > +
> > +static unsigned int run_order = 32; /* 4G */
> > +module_param(run_order, uint, 0644);
> > +MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");
> > +
> > +static bool use_dma = false;
> > +module_param(use_dma, bool, 0644);
> > +MODULE_PARM_DESC(use_dma, "Using DMA engine to measure performance");
> > +
> > +struct perf_mw {
> > +       phys_addr_t     phys_addr;
> > +       resource_size_t phys_size;
> > +       resource_size_t xlat_align;
> > +       resource_size_t xlat_align_size;
> > +       void __iomem    *vbase;
> > +       size_t          xlat_size;
> > +       size_t          buf_size;
> > +       void            *virt_addr;
> > +       dma_addr_t      dma_addr;
> > +};
> > +
> > +struct perf_ctx;
> > +
> > +struct pthr_ctx {
> > +       struct task_struct      *thread;
> > +       struct perf_ctx         *perf;
> > +       atomic_t                dma_sync;
> > +       struct dma_chan         *dma_chan;
> > +       int                     dma_prep_err;
> > +       int                     src_idx;
> > +       void                    *srcs[MAX_SRCS];
> > +};
> > +
> > +struct perf_ctx {
> > +       struct ntb_dev          *ntb;
> > +       spinlock_t              db_lock;
> > +       struct perf_mw          mw;
> > +       bool                    link_is_up;
> > +       struct work_struct      link_cleanup;
> > +       struct delayed_work     link_work;
> > +       struct dentry           *debugfs_node_dir;
> > +       struct dentry           *debugfs_run;
> > +       struct dentry           *debugfs_threads;
> > +       u8                      perf_threads;
> > +       bool                    run;
> > +       struct pthr_ctx         pthr_ctx[MAX_THREADS];
> > +       atomic_t                tsync;
> > +};
> > +
> > +enum {
> > +       VERSION = 0,
> > +       MW_SZ_HIGH,
> > +       MW_SZ_LOW,
> > +       SPAD_MSG,
> > +       SPAD_ACK,
> > +       MAX_SPAD
> > +};
> > +
> > +static void perf_link_event(void *ctx)
> > +{
> > +       struct perf_ctx *perf = ctx;
> > +
> > +       if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
> 
> The comparison to one seems unnecessary
> 
> > +               schedule_delayed_work(&perf->link_work, 0);
> > +       else
> > +               schedule_work(&perf->link_cleanup);
> > +}
> > +
> > +static void perf_db_event(void *ctx, int vec)
> > +{
> > +       struct perf_ctx *perf = ctx;
> > +       u64 db_bits, db_mask;
> > +
> > +       db_mask = ntb_db_vector_mask(perf->ntb, vec);
> > +       db_bits = ntb_db_read(perf->ntb);
> > +
> > +       dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
> > +               vec, db_mask, db_bits);
> > +}
> > +
> > +static const struct ntb_ctx_ops perf_ops = {
> > +       .link_event = perf_link_event,
> > +       .db_event = perf_db_event,
> > +};
> > +
> > +static void perf_copy_callback(void *data)
> > +{
> > +       struct pthr_ctx *pctx = data;
> > +
> > +       atomic_dec(&pctx->dma_sync);
> > +}
> > +
> > +static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
> > +                        char *src, size_t size)
> > +{
> > +       struct perf_ctx *perf = pctx->perf;
> > +       struct dma_async_tx_descriptor *txd;
> > +       struct dma_chan *chan = pctx->dma_chan;
> > +       struct dma_device *device;
> > +       struct dmaengine_unmap_data *unmap;
> > +       dma_cookie_t cookie;
> > +       size_t src_off, dst_off;
> > +       struct perf_mw *mw = &perf->mw;
> > +       u64 vbase, dst_vaddr;
> > +       dma_addr_t dst_phys;
> > +       int retries = 0;
> > +
> > +       if (!use_dma) {
> > +               memcpy_toio(dst, src, size);
> > +               return size;
> > +       }
> > +
> > +       if (!chan) {
> > +               dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
> > +               return -EINVAL;
> > +       }
> > +
> > +       device = chan->device;
> > +       src_off = (size_t)src & ~PAGE_MASK;
> > +       dst_off = (size_t)dst & ~PAGE_MASK;
> > +
> > +       if (!is_dma_copy_aligned(device, src_off, dst_off, size))
> > +               return -ENODEV;
> > +
> > +       vbase = (u64)(u64 *)mw->vbase;
> > +       dst_vaddr = (u64)(u64 *)dst;
> > +       dst_phys = mw->phys_addr + (dst_vaddr - vbase);
> > +
> > +       unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
> > +       if (!unmap)
> > +               return -ENOMEM;
> > +
> > +       unmap->len = size;
> > +       unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
> > +                                     src_off, size, DMA_TO_DEVICE);
> > +       if (dma_mapping_error(device->dev, unmap->addr[0]))
> > +               goto err_get_unmap;
> > +
> > +       unmap->to_cnt = 1;
> > +
> > +       do {
> > +               txd = device->device_prep_dma_memcpy(chan, dst_phys,
> > +                                                    unmap->addr[0], size,
> > +                                                    DMA_PREP_INTERRUPT);
> > +               if (!txd) {
> > +                       set_current_state(TASK_INTERRUPTIBLE);
> > +                       schedule_timeout(DMA_OUT_RESOURCE_TO);
> > +               }
> > +       } while (!txd && (++retries < DMA_RETRIES));
> > +
> > +       if (!txd) {
> > +               pctx->dma_prep_err++;
> > +               goto err_get_unmap;
> > +       }
> > +
> > +       txd->callback = perf_copy_callback;
> > +       txd->callback_param = pctx;
> > +       dma_set_unmap(txd, unmap);
> > +
> > +       cookie = dmaengine_submit(txd);
> > +       if (dma_submit_error(cookie))
> > +               goto err_set_unmap;
> > +
> > +       atomic_inc(&pctx->dma_sync);
> > +       dma_async_issue_pending(chan);
> > +
> > +       return size;
> > +
> > +err_set_unmap:
> > +       dmaengine_unmap_put(unmap);
> > +err_get_unmap:
> > +       dmaengine_unmap_put(unmap);
> > +       return 0;
> > +}
> > +
> > +static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
> > +                         u64 buf_size, u64 win_size, u64 total)
> > +{
> > +       int chunks, total_chunks, i;
> > +       int copied_chunks = 0;
> > +       u64 copied = 0, result;
> > +       char *tmp = dst;
> > +       u64 perf, diff_us;
> > +       ktime_t kstart, kstop, kdiff;
> > +
> > +       chunks = win_size / buf_size;
> > +       total_chunks = total / buf_size;
> > +       kstart = ktime_get();
> > +
> > +       for (i = 0; i < total_chunks; i++) {
> > +               result = perf_copy(pctx, tmp, src, buf_size);
> > +               copied += result;
> > +               copied_chunks++;
> > +               if (copied_chunks == chunks) {
> > +                       tmp = dst;
> > +                       copied_chunks = 0;
> > +               } else
> > +                       tmp += buf_size;
> > +
> > +               /* probably should schedule every 4GB? */
> > +               if (((copied % (1ULL << 32)) == 0) && !use_dma) {
> 
> I'd prefer you #define SZ_4G and use it above
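> 
> Untested sketch; assuming SZ_4G isn't already provided by
> <linux/sizes.h>, it would be a local define:
> 
> 	#define SZ_4G	(1ULL << 32)
> 
> 	if (((copied % SZ_4G) == 0) && !use_dma) {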
> 
> > +                       set_current_state(TASK_INTERRUPTIBLE);
> > +                       schedule();
> > +               }
> > +       }
> > +
> > +       if (use_dma)
> > +               pr_info("%s: All DMA descriptors submitted\n",
> > current->comm);
> > +
> > +       while (atomic_read(&pctx->dma_sync) != 0)
> > +               msleep(1);
> > +
> > +       kstop = ktime_get();
> > +       kdiff = ktime_sub(kstop, kstart);
> > +       diff_us = ktime_to_us(kdiff);
> > +
> > +       pr_info("%s: copied %Lu bytes\n", current->comm, copied);
> > +
> > +       pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);
> > +
> > +       perf = copied / diff_us;
> > +
> > +       pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);
> > +
> > +       return 0;
> > +}
> > +
> > +static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
> > +{
> > +       return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
> > +}
> > +
> > +static int ntb_perf_thread(void *data)
> > +{
> > +       struct pthr_ctx *pctx = data;
> > +       struct perf_ctx *perf = pctx->perf;
> > +       struct pci_dev *pdev = perf->ntb->pdev;
> > +       struct perf_mw *mw = &perf->mw;
> > +       char *dst;
> > +       u64 win_size, buf_size, total;
> > +       void *src;
> > +       int rc, node, i;
> > +       struct dma_chan *dma_chan = NULL;
> > +
> > +       pr_info("kthread %s starting...\n", current->comm);
> > +
> > +       node = dev_to_node(&pdev->dev);
> > +
> > +       if (use_dma && !pctx->dma_chan) {
> > +               dma_cap_mask_t dma_mask;
> > +
> > +               dma_cap_zero(dma_mask);
> > +               dma_cap_set(DMA_MEMCPY, dma_mask);
> > +               dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
> > +                                              (void *)(unsigned long)node);
> > +               if (!dma_chan) {
> > +                       pr_warn("%s: cannot acquire DMA channel,
> > quitting\n",
> > +                               current->comm);
> > +                       return -ENODEV;
> > +               }
> > +               pctx->dma_chan = dma_chan;
> > +       }
> > +
> > +       for (i = 0; i < MAX_SRCS; i++) {
> > +               pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
> > +               if (!pctx->srcs[i]) {
> > +                       rc = -ENOMEM;
> > +                       goto err;
> > +               }
> > +       }
> > +
> > +       win_size = mw->phys_size;
> > +       buf_size = 1ULL << seg_order;
> > +       total = 1ULL << run_order;
> > +
> > +       if (buf_size > MAX_TEST_SIZE)
> > +               buf_size = MAX_TEST_SIZE;
> > +
> > +       dst = (char *)mw->vbase;
> > +
> > +       atomic_inc(&perf->tsync);
> > +       while (atomic_read(&perf->tsync) != perf->perf_threads)
> > +               schedule();
> > +
> > +       src = pctx->srcs[pctx->src_idx];
> > +       pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);
> > +
> > +       rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
> > +
> > +       atomic_dec(&perf->tsync);
> > +
> > +       if (rc < 0) {
> > +               pr_err("%s: failed\n", current->comm);
> > +               rc = -ENXIO;
> > +               goto err;
> > +       }
> > +
> > +       for (i = 0; i < MAX_SRCS; i++) {
> > +               if (pctx->srcs[i]) {
> > +                       kfree(pctx->srcs[i]);
> > +                       pctx->srcs[i] = NULL;
> > +               }
> > +       }
> > +
> > +       return 0;
> > +
> > +err:
> > +       for (i = 0; i < MAX_SRCS; i++) {
> > +               if (pctx->srcs[i]) {
> > +                       kfree(pctx->srcs[i]);
> > +                       pctx->srcs[i] = NULL;
> > +               }
> > +       }
> > +
> > +       if (dma_chan) {
> > +               dma_release_channel(dma_chan);
> > +               pctx->dma_chan = NULL;
> > +       }
> > +
> > +       return rc;
> > +}
> > +
> > +static void perf_free_mw(struct perf_ctx *perf)
> > +{
> > +       struct perf_mw *mw = &perf->mw;
> > +       struct pci_dev *pdev = perf->ntb->pdev;
> > +
> > +       if (!mw->virt_addr)
> > +               return;
> > +
> > +       ntb_mw_clear_trans(perf->ntb, 0);
> > +       dma_free_coherent(&pdev->dev, mw->buf_size,
> > +                         mw->virt_addr, mw->dma_addr);
> > +       mw->xlat_size = 0;
> > +       mw->buf_size = 0;
> > +       mw->virt_addr = NULL;
> > +}
> > +
> > +static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
> > +{
> > +       struct perf_mw *mw = &perf->mw;
> > +       size_t xlat_size, buf_size;
> > +
> > +       if (!size)
> > +               return -EINVAL;
> > +
> > +       xlat_size = round_up(size, mw->xlat_align_size);
> > +       buf_size = round_up(size, mw->xlat_align);
> > +
> > +       if (mw->xlat_size == xlat_size)
> > +               return 0;
> > +
> > +       if (mw->buf_size)
> > +               perf_free_mw(perf);
> > +
> > +       mw->xlat_size = xlat_size;
> > +       mw->buf_size = buf_size;
> > +
> > +       mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
> > +                                          &mw->dma_addr, GFP_KERNEL);
> > +       if (!mw->virt_addr) {
> > +               mw->xlat_size = 0;
> > +               mw->buf_size = 0;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static void perf_link_work(struct work_struct *work)
> > +{
> > +       struct perf_ctx *perf =
> > +               container_of(work, struct perf_ctx, link_work.work);
> > +       struct ntb_dev *ndev = perf->ntb;
> > +       struct pci_dev *pdev = ndev->pdev;
> > +       u32 val;
> > +       u64 size;
> > +       int rc;
> > +
> > +       dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
> > +
> > +       size = perf->mw.phys_size;
> > +       ntb_peer_spad_write(ndev, MW_SZ_HIGH, (u32)(size >> 32));
> 
> Please use upper_32_bits(n)
> 
> > +       ntb_peer_spad_write(ndev, MW_SZ_LOW, (u32)size);
> 
> Please use lower_32_bits(n)
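> 
> i.e. something like (both helpers come from <linux/kernel.h>):
> 
> 	ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
> 	ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));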
> 
> > +       ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
> > +
> > +       /* now read what peer wrote */
> > +       val = ntb_spad_read(ndev, VERSION);
> > +       dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
> > +       if (val != PERF_VERSION)
> 
> It might be better to move the debug output below the if check and
> make it an error/warning.  That way it becomes more obvious if this
> is ever triggered.

You'll flood the dmesg if you do that because it's checked continuously
until both sides sync up. I moved it under the if check and kept it as
dev_dbg(). 
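
So it ends up roughly like this (sketch of the respin):

	val = ntb_spad_read(ndev, VERSION);
	if (val != PERF_VERSION) {
		/* only log while we are still waiting for the peer */
		dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
		goto out;
	}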


> 
> > +               goto out;
> > +
> > +       val = ntb_spad_read(ndev, MW_SZ_HIGH);
> > +       size = (u64)val << 32;
> > +
> > +       val = ntb_spad_read(ndev, MW_SZ_LOW);
> > +       size |= val;
> > +
> > +       dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);
> > +
> > +       rc = perf_set_mw(perf, size);
> > +       if (rc)
> > +               goto out1;
> > +
> > +       perf->link_is_up = true;
> > +
> > +       return;
> > +
> > +out1:
> > +       perf_free_mw(perf);
> > +
> > +out:
> > +       if (ntb_link_is_up(ndev, NULL, NULL) == 1)
> > +               schedule_delayed_work(&perf->link_work,
> > +                                     msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
> > +}
> > +
> > +static void perf_link_cleanup(struct work_struct *work)
> > +{
> > +       struct perf_ctx *perf = container_of(work,
> > +                                            struct perf_ctx,
> > +                                            link_cleanup);
> > +       int i;
> > +
> > +       dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
> > +
> > +       if (!perf->link_is_up)
> > +               cancel_delayed_work_sync(&perf->link_work);
> > +
> > +       for (i = 0; i < MAX_SPAD; i++)
> 
> Complete aside here, but we might need to add logic to the core NTB
> code to query the number of available SPADs and error out if we try
> to get more than that.
> 
> > +               ntb_spad_write(perf->ntb, i, 0);
> > +}
> > +
> > +static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
> > +{
> > +       struct perf_mw *mw;
> > +       int rc;
> > +
> > +       mw = &perf->mw;
> > +
> > +       rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
> > +                             &mw->xlat_align, &mw->xlat_align_size);
> > +       if (rc)
> > +               return rc;
> > +
> > +       perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
> > +       if (!mw->vbase)
> > +               return -ENOMEM;
> > +
> > +       return 0;
> > +}
> > +
> > +static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
> > +                               size_t count, loff_t *offp)
> > +{
> > +       struct perf_ctx *perf = filp->private_data;
> > +       char *buf;
> > +       ssize_t ret, out_offset;
> > +
> > +       if (!perf)
> > +               return 0;
> > +
> > +       buf = kmalloc(64, GFP_KERNEL);
> > +       out_offset = snprintf(buf, 64, "%d\n", perf->run);
> > +       ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
> > +       kfree(buf);
> > +
> > +       return ret;
> > +}
> > +
> > +static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
> > +                                size_t count, loff_t *offp)
> > +{
> > +       struct perf_ctx *perf = filp->private_data;
> > +       int node, i;
> > +
> > +       if (perf->link_is_up == false)
> 
> Style suggestion, !perf->link_is_up
> 
> > +               return 0;
> > +
> > +       if (perf->perf_threads == 0)
> > +               return 0;
> > +
> > +       if (atomic_read(&perf->tsync) == 0)
> > +               perf->run = false;
> > +
> > +       if (perf->run == true) {
> 
> Style suggestion, "if (perf->run) {"
> 
> > +               /* lets stop the threads */
> > +               perf->run = false;
> > +               for (i = 0; i < MAX_THREADS; i++) {
> > +                       if (perf->pthr_ctx[i].thread) {
> > +                               kthread_stop(perf->pthr_ctx[i].thread);
> > +                               perf->pthr_ctx[i].thread = NULL;
> > +                       } else
> > +                               break;
> > +               }
> > +       } else {
> > +               perf->run = true;
> > +
> > +               if (perf->perf_threads > MAX_THREADS) {
> > +                       perf->perf_threads = MAX_THREADS;
> > +                       pr_info("Reset total threads to: %u\n",
> > MAX_THREADS);
> > +               }
> > +
> > +               /* no greater than 1M */
> > +               if (seg_order > 20) {
> 
> SZ_1M / seg size?
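> 
> i.e. derive the limit rather than hard-coding 20?  Untested, using
> ilog2() from <linux/log2.h>:
> 
> 	if (seg_order > ilog2(SZ_1M)) {
> 		seg_order = ilog2(SZ_1M);	/* = 20 */
> 		pr_info("Fix seg_order to %u\n", seg_order);
> 	}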
> 
> > +                       seg_order = 20;
> > +                       pr_info("Fix seg_order to %u\n",
> > seg_order);
> > +               }
> > +
> > +               if (run_order < seg_order) {
> > +                       run_order = seg_order;
> > +                       pr_info("Fix run_order to %u\n",
> > run_order);
> > +               }
> > +
> > +               node = dev_to_node(&perf->ntb->dev);
> > +               /* launch kernel thread */
> > +               for (i = 0; i < perf->perf_threads; i++) {
> > +                       struct pthr_ctx *pctx;
> > +
> > +                       pctx = &perf->pthr_ctx[i];
> > +                       atomic_set(&pctx->dma_sync, 0);
> > +                       pctx->perf = perf;
> > +                       pctx->thread =
> > +                               kthread_create_on_node(ntb_perf_thread,
> > +                                                      (void *)pctx, node,
> > +                                                      "ntb_perf %d", i);
> > +                       if (pctx->thread)
> > +                               wake_up_process(pctx->thread);
> > +                       else {
> > +                               perf->run = false;
> > +                               for (i = 0; i < MAX_THREADS; i++) {
> > +                                       if (pctx->thread) {
> > +                                               kthread_stop(pctx->thread);
> > +                                               pctx->thread = NULL;
> > +                                       } else
> > +                                               break;
> 
> It might be simpler to just run through them all and stop them
> (instead of stopping at the first empty one and quitting)
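> 
> Untested sketch:
> 
> 	for (i = 0; i < MAX_THREADS; i++) {
> 		/* stop whatever did get started */
> 		if (perf->pthr_ctx[i].thread) {
> 			kthread_stop(perf->pthr_ctx[i].thread);
> 			perf->pthr_ctx[i].thread = NULL;
> 		}
> 	}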
> 
> > +                               }
> > +                       }
> > +
> > +                       if (perf->run == false)
> > +                               return -ENXIO;
> > +               }
> > +
> > +       }
> > +
> > +       return count;
> > +}
> > +
> > +static const struct file_operations ntb_perf_debugfs_run = {
> > +       .owner = THIS_MODULE,
> > +       .open = simple_open,
> > +       .read = debugfs_run_read,
> > +       .write = debugfs_run_write,
> > +};
> > +
> > +static int perf_debugfs_setup(struct perf_ctx *perf)
> > +{
> > +       struct pci_dev *pdev = perf->ntb->pdev;
> > +
> > +       if (!perf_debugfs_dir)
> > +               return -ENODEV;
> > +
> > +       perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
> > +                                                   perf_debugfs_dir);
> > +       if (!perf->debugfs_node_dir)
> > +               return -ENODEV;
> > +
> > +       perf->debugfs_run = debugfs_create_file("run", S_IRUSR |
> > S_IWUSR,
> > +                                               perf
> > ->debugfs_node_dir, perf,
> > +                                              
> >  &ntb_perf_debugfs_run);
> > +       if (!perf->debugfs_run)
> > +               return -ENODEV;
> > +
> > +       perf->debugfs_threads = debugfs_create_u8("threads",
> > S_IRUSR | S_IWUSR,
> > +                                                 perf
> > ->debugfs_node_dir,
> > +                                                 &perf
> > ->perf_threads);
> > +       if (!perf->debugfs_threads)
> > +               return -ENODEV;
> > +
> > +       return 0;
> > +}
> > +
> > +static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
> > +{
> > +       struct pci_dev *pdev = ntb->pdev;
> > +       struct perf_ctx *perf;
> > +       int node;
> > +       int rc = 0;
> > +
> > +       node = dev_to_node(&pdev->dev);
> > +
> > +       perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
> > +       if (!perf) {
> > +               rc = -ENOMEM;
> > +               goto err_perf;
> > +       }
> > +
> > +       perf->ntb = ntb;
> > +       perf->perf_threads = 1;
> > +       atomic_set(&perf->tsync, 0);
> > +       perf->run = false;
> > +       spin_lock_init(&perf->db_lock);
> > +       perf_setup_mw(ntb, perf);
> > +       INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
> > +       INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
> > +
> > +       rc = ntb_set_ctx(ntb, perf, &perf_ops);
> > +       if (rc)
> > +               goto err_ctx;
> > +
> > +       perf->link_is_up = false;
> > +       ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
> > +       ntb_link_event(ntb);
> > +
> > +       if (debugfs_initialized() && !perf_debugfs_dir) {
> > +               perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
> > +               if (!perf_debugfs_dir)
> > +                       goto err_ctx;
> > +
> > +               rc = perf_debugfs_setup(perf);
> > +               if (rc)
> > +                       goto err_ctx;
> > +       }
> > +
> > +       return 0;
> > +
> > +err_ctx:
> > +       cancel_delayed_work_sync(&perf->link_work);
> > +       cancel_work_sync(&perf->link_cleanup);
> > +       kfree(perf);
> > +err_perf:
> > +       return rc;
> > +}
> > +
> > +static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
> > +{
> > +       struct perf_ctx *perf = ntb->ctx;
> > +       int i;
> > +
> > +       dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
> > +
> > +       cancel_delayed_work_sync(&perf->link_work);
> > +       cancel_work_sync(&perf->link_cleanup);
> > +
> > +       ntb_clear_ctx(ntb);
> > +       ntb_link_disable(ntb);
> > +
> > +       if (perf_debugfs_dir) {
> > +               debugfs_remove_recursive(perf_debugfs_dir);
> > +               perf_debugfs_dir = NULL;
> > +       }
> > +
> > +       if (use_dma) {
> > +               for (i = 0; i < MAX_THREADS; i++) {
> > +                       struct pthr_ctx *pctx = &perf->pthr_ctx[i];
> > +                       if (pctx->dma_chan)
> > +                               dma_release_channel(pctx->dma_chan);
> > +               }
> > +       }
> > +
> > +       kfree(perf);
> > +}
> > +
> > +static struct ntb_client perf_client = {
> > +       .ops = {
> > +               .probe = perf_probe,
> > +               .remove = perf_remove,
> > +       },
> > +};
> > +module_ntb_client(perf_client);
> 
> A few minor things, but overall looks good.  Please fix them up and
> resubmit.
> 
> Thanks,
> Jon
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ntb: ntb perf tool
  2016-01-06 16:37   ` Allen Hubbe
@ 2016-01-06 17:45     ` Jon Mason
  0 siblings, 0 replies; 5+ messages in thread
From: Jon Mason @ 2016-01-06 17:45 UTC (permalink / raw)
  To: Allen Hubbe; +Cc: Dave Jiang, linux-ntb

On Wed, Jan 6, 2016 at 11:37 AM, Allen Hubbe <Allen.Hubbe@emc.com> wrote:
> Jon Mason <jdmason@kudzu.us>:
>> On Tue, Jan 5, 2016 at 6:37 PM, Dave Jiang <dave.jiang@intel.com> wrote:
>
>> > +static void perf_link_event(void *ctx)
>> > +{
>> > +       struct perf_ctx *perf = ctx;
>> > +
>> > +       if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
>>
>> The comparison to one seems unnecessary
>>
>
> The api doc says it returns one, zero, or negative to indicate an error.  The Intel NTB driver does not return an error.  Would you prefer to change the api, to make the return simply a boolean, and have some other mechanism to query an error?

Naa, this is fine.  Sorry for the confusion.

>> > +       if (!perf->link_is_up)
>> > +               cancel_delayed_work_sync(&perf->link_work);
>> > +
>> > +       for (i = 0; i < MAX_SPAD; i++)
>>
>> Complete aside here, but we might need to add logic to the core NTB
>> code to query the number of available SPADs and error out if we try to
>> get more than that.
>
> Use ntb_spad_count(), and the count does vary.  For example, there are half as many spads in RP/TB topology as in B2B.

Well, I think this code is saying it only needs to count up to
MAX_SPAD here, but there needs to be a check in the probe function to
verify that MAX_SPAD isn't > ntb_spad_count() (if not already there).
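
Untested sketch of the check I have in mind:

	/* bail out of probe if the hardware has fewer spads than we use */
	if (ntb_spad_count(ntb) < MAX_SPAD) {
		dev_err(&ntb->dev, "Not enough scratchpad registers\n");
		return -EINVAL;
	}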

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2016-01-06 17:45 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-05 23:37 [PATCH] ntb: ntb perf tool Dave Jiang
2016-01-06 15:05 ` Jon Mason
2016-01-06 16:37   ` Allen Hubbe
2016-01-06 17:45     ` Jon Mason
2016-01-06 16:57   ` Jiang, Dave
