All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
@ 2021-02-04 22:58 ` Barry Song
  0 siblings, 0 replies; 6+ messages in thread
From: Barry Song @ 2021-02-04 22:58 UTC (permalink / raw)
  To: m.szyprowski, hch, robin.murphy, iommu; +Cc: linux-kernel, linuxarm, Barry Song

In a real dma mapping user case, after dma_map is done, data will be
transmit. Thus, in multi-threaded user scenario, IOMMU contention
should not be that severe. For example, if users enable multiple
threads to send network packets through 1G/10G/100Gbps NIC, usually
the steps will be: map -> transmission -> unmap.  Transmission delay
reduces the contention of IOMMU. Here a delay is added to simulate
the transmission for TX case so that the tested result could be
more accurate.

RX case would be much more tricky. It is not supported yet.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/dma/map_benchmark.c                      | 11 +++++++++++
 tools/testing/selftests/dma/dma_map_benchmark.c | 17 +++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 1b1b8ff875cb..1976db7e34e4 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -21,6 +21,7 @@
 #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
 #define DMA_MAP_MAX_THREADS	1024
 #define DMA_MAP_MAX_SECONDS	300
+#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
 
 #define DMA_MAP_BIDIRECTIONAL	0
 #define DMA_MAP_TO_DEVICE	1
@@ -36,6 +37,7 @@ struct map_benchmark {
 	__s32 node; /* which numa node this benchmark will run on */
 	__u32 dma_bits; /* DMA addressing capability */
 	__u32 dma_dir; /* DMA data direction */
+	__u32 dma_trans_ns; /* time for DMA transmission in ns */
 	__u64 expansion[10];	/* For future use */
 };
 
@@ -87,6 +89,10 @@ static int map_benchmark_thread(void *data)
 		map_etime = ktime_get();
 		map_delta = ktime_sub(map_etime, map_stime);
 
+		/* Pretend DMA is transmitting */
+		if (map->dir != DMA_FROM_DEVICE)
+			ndelay(map->bparam.dma_trans_ns);
+
 		unmap_stime = ktime_get();
 		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
 		unmap_etime = ktime_get();
@@ -218,6 +224,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
 			return -EINVAL;
 		}
 
+		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
+			pr_err("invalid transmission delay\n");
+			return -EINVAL;
+		}
+
 		if (map->bparam.node != NUMA_NO_NODE &&
 		    !node_possible(map->bparam.node)) {
 			pr_err("invalid numa node\n");
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
index 7065163a8388..dbf426e2fb7f 100644
--- a/tools/testing/selftests/dma/dma_map_benchmark.c
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -14,6 +14,7 @@
 #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
 #define DMA_MAP_MAX_THREADS	1024
 #define DMA_MAP_MAX_SECONDS     300
+#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
 
 #define DMA_MAP_BIDIRECTIONAL	0
 #define DMA_MAP_TO_DEVICE	1
@@ -35,6 +36,7 @@ struct map_benchmark {
 	__s32 node; /* which numa node this benchmark will run on */
 	__u32 dma_bits; /* DMA addressing capability */
 	__u32 dma_dir; /* DMA data direction */
+	__u32 dma_trans_ns; /* delay for DMA transmission in ns */
 	__u64 expansion[10];	/* For future use */
 };
 
@@ -45,12 +47,12 @@ int main(int argc, char **argv)
 	/* default single thread, run 20 seconds on NUMA_NO_NODE */
 	int threads = 1, seconds = 20, node = -1;
 	/* default dma mask 32bit, bidirectional DMA */
-	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
+	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
 
 	int cmd = DMA_MAP_BENCHMARK;
 	char *p;
 
-	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
+	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
 		switch (opt) {
 		case 't':
 			threads = atoi(optarg);
@@ -67,6 +69,9 @@ int main(int argc, char **argv)
 		case 'd':
 			dir = atoi(optarg);
 			break;
+		case 'x':
+			xdelay = atoi(optarg);
+			break;
 		default:
 			return -1;
 		}
@@ -84,6 +89,12 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
+		fprintf(stderr, "invalid transmit delay, must be in 0-%d\n",
+			DMA_MAP_MAX_TRANS_DELAY);
+		exit(1);
+	}
+
 	/* suppose the mininum DMA zone is 1MB in the world */
 	if (bits < 20 || bits > 64) {
 		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
@@ -107,6 +118,8 @@ int main(int argc, char **argv)
 	map.node = node;
 	map.dma_bits = bits;
 	map.dma_dir = dir;
+	map.dma_trans_ns = xdelay;
+
 	if (ioctl(fd, cmd, &map)) {
 		perror("ioctl");
 		exit(1);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
@ 2021-02-04 22:58 ` Barry Song
  0 siblings, 0 replies; 6+ messages in thread
From: Barry Song @ 2021-02-04 22:58 UTC (permalink / raw)
  To: m.szyprowski, hch, robin.murphy, iommu; +Cc: linux-kernel, linuxarm

In a real dma mapping user case, after dma_map is done, data will be
transmit. Thus, in multi-threaded user scenario, IOMMU contention
should not be that severe. For example, if users enable multiple
threads to send network packets through 1G/10G/100Gbps NIC, usually
the steps will be: map -> transmission -> unmap.  Transmission delay
reduces the contention of IOMMU. Here a delay is added to simulate
the transmission for TX case so that the tested result could be
more accurate.

RX case would be much more tricky. It is not supported yet.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/dma/map_benchmark.c                      | 11 +++++++++++
 tools/testing/selftests/dma/dma_map_benchmark.c | 17 +++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 1b1b8ff875cb..1976db7e34e4 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -21,6 +21,7 @@
 #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
 #define DMA_MAP_MAX_THREADS	1024
 #define DMA_MAP_MAX_SECONDS	300
+#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
 
 #define DMA_MAP_BIDIRECTIONAL	0
 #define DMA_MAP_TO_DEVICE	1
@@ -36,6 +37,7 @@ struct map_benchmark {
 	__s32 node; /* which numa node this benchmark will run on */
 	__u32 dma_bits; /* DMA addressing capability */
 	__u32 dma_dir; /* DMA data direction */
+	__u32 dma_trans_ns; /* time for DMA transmission in ns */
 	__u64 expansion[10];	/* For future use */
 };
 
@@ -87,6 +89,10 @@ static int map_benchmark_thread(void *data)
 		map_etime = ktime_get();
 		map_delta = ktime_sub(map_etime, map_stime);
 
+		/* Pretend DMA is transmitting */
+		if (map->dir != DMA_FROM_DEVICE)
+			ndelay(map->bparam.dma_trans_ns);
+
 		unmap_stime = ktime_get();
 		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
 		unmap_etime = ktime_get();
@@ -218,6 +224,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
 			return -EINVAL;
 		}
 
+		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
+			pr_err("invalid transmission delay\n");
+			return -EINVAL;
+		}
+
 		if (map->bparam.node != NUMA_NO_NODE &&
 		    !node_possible(map->bparam.node)) {
 			pr_err("invalid numa node\n");
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
index 7065163a8388..dbf426e2fb7f 100644
--- a/tools/testing/selftests/dma/dma_map_benchmark.c
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -14,6 +14,7 @@
 #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
 #define DMA_MAP_MAX_THREADS	1024
 #define DMA_MAP_MAX_SECONDS     300
+#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
 
 #define DMA_MAP_BIDIRECTIONAL	0
 #define DMA_MAP_TO_DEVICE	1
@@ -35,6 +36,7 @@ struct map_benchmark {
 	__s32 node; /* which numa node this benchmark will run on */
 	__u32 dma_bits; /* DMA addressing capability */
 	__u32 dma_dir; /* DMA data direction */
+	__u32 dma_trans_ns; /* delay for DMA transmission in ns */
 	__u64 expansion[10];	/* For future use */
 };
 
@@ -45,12 +47,12 @@ int main(int argc, char **argv)
 	/* default single thread, run 20 seconds on NUMA_NO_NODE */
 	int threads = 1, seconds = 20, node = -1;
 	/* default dma mask 32bit, bidirectional DMA */
-	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
+	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
 
 	int cmd = DMA_MAP_BENCHMARK;
 	char *p;
 
-	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
+	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
 		switch (opt) {
 		case 't':
 			threads = atoi(optarg);
@@ -67,6 +69,9 @@ int main(int argc, char **argv)
 		case 'd':
 			dir = atoi(optarg);
 			break;
+		case 'x':
+			xdelay = atoi(optarg);
+			break;
 		default:
 			return -1;
 		}
@@ -84,6 +89,12 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
+	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
+		fprintf(stderr, "invalid transmit delay, must be in 0-%d\n",
+			DMA_MAP_MAX_TRANS_DELAY);
+		exit(1);
+	}
+
 	/* suppose the mininum DMA zone is 1MB in the world */
 	if (bits < 20 || bits > 64) {
 		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
@@ -107,6 +118,8 @@ int main(int argc, char **argv)
 	map.node = node;
 	map.dma_bits = bits;
 	map.dma_dir = dir;
+	map.dma_trans_ns = xdelay;
+
 	if (ioctl(fd, cmd, &map)) {
 		perror("ioctl");
 		exit(1);
-- 
2.25.1

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
  2021-02-04 22:58 ` Barry Song
@ 2021-02-04 23:50   ` Robin Murphy
  -1 siblings, 0 replies; 6+ messages in thread
From: Robin Murphy @ 2021-02-04 23:50 UTC (permalink / raw)
  To: Barry Song, m.szyprowski, hch, iommu; +Cc: linux-kernel, linuxarm

On 2021-02-04 22:58, Barry Song wrote:
> In a real dma mapping user case, after dma_map is done, data will be
> transmit. Thus, in multi-threaded user scenario, IOMMU contention
> should not be that severe. For example, if users enable multiple
> threads to send network packets through 1G/10G/100Gbps NIC, usually
> the steps will be: map -> transmission -> unmap.  Transmission delay
> reduces the contention of IOMMU. Here a delay is added to simulate
> the transmission for TX case so that the tested result could be
> more accurate.
> 
> RX case would be much more tricky. It is not supported yet.

I guess it might be a reasonable approximation to map several pages, 
then unmap them again after a slightly more random delay. Or maybe 
divide the threads into pairs of mappers and unmappers respectively 
filling up and draining proper little buffer pools.

> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
>   kernel/dma/map_benchmark.c                      | 11 +++++++++++
>   tools/testing/selftests/dma/dma_map_benchmark.c | 17 +++++++++++++++--
>   2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
> index 1b1b8ff875cb..1976db7e34e4 100644
> --- a/kernel/dma/map_benchmark.c
> +++ b/kernel/dma/map_benchmark.c
> @@ -21,6 +21,7 @@
>   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
>   #define DMA_MAP_MAX_THREADS	1024
>   #define DMA_MAP_MAX_SECONDS	300
> +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */

Using MSEC_PER_SEC might be sufficiently self-documenting?

>   #define DMA_MAP_BIDIRECTIONAL	0
>   #define DMA_MAP_TO_DEVICE	1
> @@ -36,6 +37,7 @@ struct map_benchmark {
>   	__s32 node; /* which numa node this benchmark will run on */
>   	__u32 dma_bits; /* DMA addressing capability */
>   	__u32 dma_dir; /* DMA data direction */
> +	__u32 dma_trans_ns; /* time for DMA transmission in ns */
>   	__u64 expansion[10];	/* For future use */
>   };
>   
> @@ -87,6 +89,10 @@ static int map_benchmark_thread(void *data)
>   		map_etime = ktime_get();
>   		map_delta = ktime_sub(map_etime, map_stime);
>   
> +		/* Pretend DMA is transmitting */
> +		if (map->dir != DMA_FROM_DEVICE)
> +			ndelay(map->bparam.dma_trans_ns);

TBH I think the option of a fixed delay between map and unmap might be a 
handy thing in general, so having the direction check at all seems 
needlessly restrictive. As long as the driver implements all the basic 
building blocks, combining them to simulate specific traffic patterns 
can be left up to the benchmark tool.

Robin.

> +
>   		unmap_stime = ktime_get();
>   		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
>   		unmap_etime = ktime_get();
> @@ -218,6 +224,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
>   			return -EINVAL;
>   		}
>   
> +		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
> +			pr_err("invalid transmission delay\n");
> +			return -EINVAL;
> +		}
> +
>   		if (map->bparam.node != NUMA_NO_NODE &&
>   		    !node_possible(map->bparam.node)) {
>   			pr_err("invalid numa node\n");
> diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
> index 7065163a8388..dbf426e2fb7f 100644
> --- a/tools/testing/selftests/dma/dma_map_benchmark.c
> +++ b/tools/testing/selftests/dma/dma_map_benchmark.c
> @@ -14,6 +14,7 @@
>   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
>   #define DMA_MAP_MAX_THREADS	1024
>   #define DMA_MAP_MAX_SECONDS     300
> +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
>   
>   #define DMA_MAP_BIDIRECTIONAL	0
>   #define DMA_MAP_TO_DEVICE	1
> @@ -35,6 +36,7 @@ struct map_benchmark {
>   	__s32 node; /* which numa node this benchmark will run on */
>   	__u32 dma_bits; /* DMA addressing capability */
>   	__u32 dma_dir; /* DMA data direction */
> +	__u32 dma_trans_ns; /* delay for DMA transmission in ns */
>   	__u64 expansion[10];	/* For future use */
>   };
>   
> @@ -45,12 +47,12 @@ int main(int argc, char **argv)
>   	/* default single thread, run 20 seconds on NUMA_NO_NODE */
>   	int threads = 1, seconds = 20, node = -1;
>   	/* default dma mask 32bit, bidirectional DMA */
> -	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
> +	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
>   
>   	int cmd = DMA_MAP_BENCHMARK;
>   	char *p;
>   
> -	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
> +	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
>   		switch (opt) {
>   		case 't':
>   			threads = atoi(optarg);
> @@ -67,6 +69,9 @@ int main(int argc, char **argv)
>   		case 'd':
>   			dir = atoi(optarg);
>   			break;
> +		case 'x':
> +			xdelay = atoi(optarg);
> +			break;
>   		default:
>   			return -1;
>   		}
> @@ -84,6 +89,12 @@ int main(int argc, char **argv)
>   		exit(1);
>   	}
>   
> +	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
> +		fprintf(stderr, "invalid transmit delay, must be in 0-%d\n",
> +			DMA_MAP_MAX_TRANS_DELAY);
> +		exit(1);
> +	}
> +
>   	/* suppose the mininum DMA zone is 1MB in the world */
>   	if (bits < 20 || bits > 64) {
>   		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
> @@ -107,6 +118,8 @@ int main(int argc, char **argv)
>   	map.node = node;
>   	map.dma_bits = bits;
>   	map.dma_dir = dir;
> +	map.dma_trans_ns = xdelay;
> +
>   	if (ioctl(fd, cmd, &map)) {
>   		perror("ioctl");
>   		exit(1);
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
@ 2021-02-04 23:50   ` Robin Murphy
  0 siblings, 0 replies; 6+ messages in thread
From: Robin Murphy @ 2021-02-04 23:50 UTC (permalink / raw)
  To: Barry Song, m.szyprowski, hch, iommu; +Cc: linux-kernel, linuxarm

On 2021-02-04 22:58, Barry Song wrote:
> In a real dma mapping user case, after dma_map is done, data will be
> transmit. Thus, in multi-threaded user scenario, IOMMU contention
> should not be that severe. For example, if users enable multiple
> threads to send network packets through 1G/10G/100Gbps NIC, usually
> the steps will be: map -> transmission -> unmap.  Transmission delay
> reduces the contention of IOMMU. Here a delay is added to simulate
> the transmission for TX case so that the tested result could be
> more accurate.
> 
> RX case would be much more tricky. It is not supported yet.

I guess it might be a reasonable approximation to map several pages, 
then unmap them again after a slightly more random delay. Or maybe 
divide the threads into pairs of mappers and unmappers respectively 
filling up and draining proper little buffer pools.

> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
>   kernel/dma/map_benchmark.c                      | 11 +++++++++++
>   tools/testing/selftests/dma/dma_map_benchmark.c | 17 +++++++++++++++--
>   2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
> index 1b1b8ff875cb..1976db7e34e4 100644
> --- a/kernel/dma/map_benchmark.c
> +++ b/kernel/dma/map_benchmark.c
> @@ -21,6 +21,7 @@
>   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
>   #define DMA_MAP_MAX_THREADS	1024
>   #define DMA_MAP_MAX_SECONDS	300
> +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */

Using MSEC_PER_SEC might be sufficiently self-documenting?

>   #define DMA_MAP_BIDIRECTIONAL	0
>   #define DMA_MAP_TO_DEVICE	1
> @@ -36,6 +37,7 @@ struct map_benchmark {
>   	__s32 node; /* which numa node this benchmark will run on */
>   	__u32 dma_bits; /* DMA addressing capability */
>   	__u32 dma_dir; /* DMA data direction */
> +	__u32 dma_trans_ns; /* time for DMA transmission in ns */
>   	__u64 expansion[10];	/* For future use */
>   };
>   
> @@ -87,6 +89,10 @@ static int map_benchmark_thread(void *data)
>   		map_etime = ktime_get();
>   		map_delta = ktime_sub(map_etime, map_stime);
>   
> +		/* Pretend DMA is transmitting */
> +		if (map->dir != DMA_FROM_DEVICE)
> +			ndelay(map->bparam.dma_trans_ns);

TBH I think the option of a fixed delay between map and unmap might be a 
handy thing in general, so having the direction check at all seems 
needlessly restrictive. As long as the driver implements all the basic 
building blocks, combining them to simulate specific traffic patterns 
can be left up to the benchmark tool.

Robin.

> +
>   		unmap_stime = ktime_get();
>   		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
>   		unmap_etime = ktime_get();
> @@ -218,6 +224,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
>   			return -EINVAL;
>   		}
>   
> +		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
> +			pr_err("invalid transmission delay\n");
> +			return -EINVAL;
> +		}
> +
>   		if (map->bparam.node != NUMA_NO_NODE &&
>   		    !node_possible(map->bparam.node)) {
>   			pr_err("invalid numa node\n");
> diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
> index 7065163a8388..dbf426e2fb7f 100644
> --- a/tools/testing/selftests/dma/dma_map_benchmark.c
> +++ b/tools/testing/selftests/dma/dma_map_benchmark.c
> @@ -14,6 +14,7 @@
>   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
>   #define DMA_MAP_MAX_THREADS	1024
>   #define DMA_MAP_MAX_SECONDS     300
> +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
>   
>   #define DMA_MAP_BIDIRECTIONAL	0
>   #define DMA_MAP_TO_DEVICE	1
> @@ -35,6 +36,7 @@ struct map_benchmark {
>   	__s32 node; /* which numa node this benchmark will run on */
>   	__u32 dma_bits; /* DMA addressing capability */
>   	__u32 dma_dir; /* DMA data direction */
> +	__u32 dma_trans_ns; /* delay for DMA transmission in ns */
>   	__u64 expansion[10];	/* For future use */
>   };
>   
> @@ -45,12 +47,12 @@ int main(int argc, char **argv)
>   	/* default single thread, run 20 seconds on NUMA_NO_NODE */
>   	int threads = 1, seconds = 20, node = -1;
>   	/* default dma mask 32bit, bidirectional DMA */
> -	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
> +	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
>   
>   	int cmd = DMA_MAP_BENCHMARK;
>   	char *p;
>   
> -	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
> +	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
>   		switch (opt) {
>   		case 't':
>   			threads = atoi(optarg);
> @@ -67,6 +69,9 @@ int main(int argc, char **argv)
>   		case 'd':
>   			dir = atoi(optarg);
>   			break;
> +		case 'x':
> +			xdelay = atoi(optarg);
> +			break;
>   		default:
>   			return -1;
>   		}
> @@ -84,6 +89,12 @@ int main(int argc, char **argv)
>   		exit(1);
>   	}
>   
> +	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
> +		fprintf(stderr, "invalid transmit delay, must be in 0-%d\n",
> +			DMA_MAP_MAX_TRANS_DELAY);
> +		exit(1);
> +	}
> +
>   	/* suppose the mininum DMA zone is 1MB in the world */
>   	if (bits < 20 || bits > 64) {
>   		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
> @@ -107,6 +118,8 @@ int main(int argc, char **argv)
>   	map.node = node;
>   	map.dma_bits = bits;
>   	map.dma_dir = dir;
> +	map.dma_trans_ns = xdelay;
> +
>   	if (ioctl(fd, cmd, &map)) {
>   		perror("ioctl");
>   		exit(1);
> 
_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
  2021-02-04 23:50   ` Robin Murphy
@ 2021-02-05  1:18     ` Song Bao Hua (Barry Song)
  -1 siblings, 0 replies; 6+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-02-05  1:18 UTC (permalink / raw)
  To: Robin Murphy, m.szyprowski, hch, iommu; +Cc: linux-kernel, linuxarm



> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy@arm.com]
> Sent: Friday, February 5, 2021 12:51 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>;
> m.szyprowski@samsung.com; hch@lst.de; iommu@lists.linux-foundation.org
> Cc: linux-kernel@vger.kernel.org; linuxarm@openeuler.org
> Subject: Re: [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
> 
> On 2021-02-04 22:58, Barry Song wrote:
> > In a real dma mapping user case, after dma_map is done, data will be
> > transmit. Thus, in multi-threaded user scenario, IOMMU contention
> > should not be that severe. For example, if users enable multiple
> > threads to send network packets through 1G/10G/100Gbps NIC, usually
> > the steps will be: map -> transmission -> unmap.  Transmission delay
> > reduces the contention of IOMMU. Here a delay is added to simulate
> > the transmission for TX case so that the tested result could be
> > more accurate.
> >
> > RX case would be much more tricky. It is not supported yet.
> 
> I guess it might be a reasonable approximation to map several pages,
> then unmap them again after a slightly more random delay. Or maybe
> divide the threads into pairs of mappers and unmappers respectively
> filling up and draining proper little buffer pools.

Yes. Good suggestions. I am actually thinking about how to support
cases like networks. There is a pre-mapped list of pages, each page
is bound with some hardware DMA block descriptor(BD). So if Linux can
consume the packets in time, those buffers are always re-used. Only
when the page bound with BD is full and OS can't consume it in time,
another temp page will be allocated and mapped, BD will switch to use
this temp page, then finally unmap it if it is not needed any more.
On the other hand, the pre-mapped pages are never unmapped.

For things like filesystem and disk driver, RX is always requested by
users. The model would be simpler: map -> rx -> unmap. For networks,
RX transmission can come spontaneously.

Anyway, I'll put this into TBD. For this moment, mainly handle TX path.
Or maybe the current code has been able to handle simple RX model :-)

> 
> > Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> > ---
> >   kernel/dma/map_benchmark.c                      | 11 +++++++++++
> >   tools/testing/selftests/dma/dma_map_benchmark.c | 17 +++++++++++++++--
> >   2 files changed, 26 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
> > index 1b1b8ff875cb..1976db7e34e4 100644
> > --- a/kernel/dma/map_benchmark.c
> > +++ b/kernel/dma/map_benchmark.c
> > @@ -21,6 +21,7 @@
> >   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
> >   #define DMA_MAP_MAX_THREADS	1024
> >   #define DMA_MAP_MAX_SECONDS	300
> > +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
> 
> Using MSEC_PER_SEC might be sufficiently self-documenting?

Yes, I guess you mean NSEC_PER_MSEC. will move to it.

> 
> >   #define DMA_MAP_BIDIRECTIONAL	0
> >   #define DMA_MAP_TO_DEVICE	1
> > @@ -36,6 +37,7 @@ struct map_benchmark {
> >   	__s32 node; /* which numa node this benchmark will run on */
> >   	__u32 dma_bits; /* DMA addressing capability */
> >   	__u32 dma_dir; /* DMA data direction */
> > +	__u32 dma_trans_ns; /* time for DMA transmission in ns */
> >   	__u64 expansion[10];	/* For future use */
> >   };
> >
> > @@ -87,6 +89,10 @@ static int map_benchmark_thread(void *data)
> >   		map_etime = ktime_get();
> >   		map_delta = ktime_sub(map_etime, map_stime);
> >
> > +		/* Pretend DMA is transmitting */
> > +		if (map->dir != DMA_FROM_DEVICE)
> > +			ndelay(map->bparam.dma_trans_ns);
> 
> TBH I think the option of a fixed delay between map and unmap might be a
> handy thing in general, so having the direction check at all seems
> needlessly restrictive. As long as the driver implements all the basic
> building blocks, combining them to simulate specific traffic patterns
> can be left up to the benchmark tool.

Sensible, will remove the condition check.

> 
> Robin.
> 
> > +
> >   		unmap_stime = ktime_get();
> >   		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
> >   		unmap_etime = ktime_get();
> > @@ -218,6 +224,11 @@ static long map_benchmark_ioctl(struct file *file,
> unsigned int cmd,
> >   			return -EINVAL;
> >   		}
> >
> > +		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
> > +			pr_err("invalid transmission delay\n");
> > +			return -EINVAL;
> > +		}
> > +
> >   		if (map->bparam.node != NUMA_NO_NODE &&
> >   		    !node_possible(map->bparam.node)) {
> >   			pr_err("invalid numa node\n");
> > diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c
> b/tools/testing/selftests/dma/dma_map_benchmark.c
> > index 7065163a8388..dbf426e2fb7f 100644
> > --- a/tools/testing/selftests/dma/dma_map_benchmark.c
> > +++ b/tools/testing/selftests/dma/dma_map_benchmark.c
> > @@ -14,6 +14,7 @@
> >   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
> >   #define DMA_MAP_MAX_THREADS	1024
> >   #define DMA_MAP_MAX_SECONDS     300
> > +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
> >
> >   #define DMA_MAP_BIDIRECTIONAL	0
> >   #define DMA_MAP_TO_DEVICE	1
> > @@ -35,6 +36,7 @@ struct map_benchmark {
> >   	__s32 node; /* which numa node this benchmark will run on */
> >   	__u32 dma_bits; /* DMA addressing capability */
> >   	__u32 dma_dir; /* DMA data direction */
> > +	__u32 dma_trans_ns; /* delay for DMA transmission in ns */
> >   	__u64 expansion[10];	/* For future use */
> >   };
> >
> > @@ -45,12 +47,12 @@ int main(int argc, char **argv)
> >   	/* default single thread, run 20 seconds on NUMA_NO_NODE */
> >   	int threads = 1, seconds = 20, node = -1;
> >   	/* default dma mask 32bit, bidirectional DMA */
> > -	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
> > +	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
> >
> >   	int cmd = DMA_MAP_BENCHMARK;
> >   	char *p;
> >
> > -	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
> > +	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
> >   		switch (opt) {
> >   		case 't':
> >   			threads = atoi(optarg);
> > @@ -67,6 +69,9 @@ int main(int argc, char **argv)
> >   		case 'd':
> >   			dir = atoi(optarg);
> >   			break;
> > +		case 'x':
> > +			xdelay = atoi(optarg);
> > +			break;
> >   		default:
> >   			return -1;
> >   		}
> > @@ -84,6 +89,12 @@ int main(int argc, char **argv)
> >   		exit(1);
> >   	}
> >
> > +	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
> > +		fprintf(stderr, "invalid transmit delay, must be in 0-%d\n",
> > +			DMA_MAP_MAX_TRANS_DELAY);
> > +		exit(1);
> > +	}
> > +
> >   	/* suppose the mininum DMA zone is 1MB in the world */
> >   	if (bits < 20 || bits > 64) {
> >   		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
> > @@ -107,6 +118,8 @@ int main(int argc, char **argv)
> >   	map.node = node;
> >   	map.dma_bits = bits;
> >   	map.dma_dir = dir;
> > +	map.dma_trans_ns = xdelay;
> > +
> >   	if (ioctl(fd, cmd, &map)) {
> >   		perror("ioctl");
> >   		exit(1);
> >

Thanks
Barry


^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
@ 2021-02-05  1:18     ` Song Bao Hua (Barry Song)
  0 siblings, 0 replies; 6+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-02-05  1:18 UTC (permalink / raw)
  To: Robin Murphy, m.szyprowski, hch, iommu; +Cc: linux-kernel, linuxarm



> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy@arm.com]
> Sent: Friday, February 5, 2021 12:51 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>;
> m.szyprowski@samsung.com; hch@lst.de; iommu@lists.linux-foundation.org
> Cc: linux-kernel@vger.kernel.org; linuxarm@openeuler.org
> Subject: Re: [PATCH] dma-mapping: benchmark: pretend DMA is transmitting
> 
> On 2021-02-04 22:58, Barry Song wrote:
> > In a real dma mapping user case, after dma_map is done, data will be
> > transmit. Thus, in multi-threaded user scenario, IOMMU contention
> > should not be that severe. For example, if users enable multiple
> > threads to send network packets through 1G/10G/100Gbps NIC, usually
> > the steps will be: map -> transmission -> unmap.  Transmission delay
> > reduces the contention of IOMMU. Here a delay is added to simulate
> > the transmission for TX case so that the tested result could be
> > more accurate.
> >
> > RX case would be much more tricky. It is not supported yet.
> 
> I guess it might be a reasonable approximation to map several pages,
> then unmap them again after a slightly more random delay. Or maybe
> divide the threads into pairs of mappers and unmappers respectively
> filling up and draining proper little buffer pools.

Yes. Good suggestions. I am actually thinking about how to support
cases like networks. There is a pre-mapped list of pages, each page
is bound with some hardware DMA block descriptor(BD). So if Linux can
consume the packets in time, those buffers are always re-used. Only
when the page bound with BD is full and OS can't consume it in time,
another temp page will be allocated and mapped, BD will switch to use
this temp page, then finally unmap it if it is not needed any more.
On the other hand, the pre-mapped pages are never unmapped.

For things like filesystem and disk driver, RX is always requested by
users. The model would be simpler: map -> rx -> unmap. For networks,
RX transmission can come spontaneously.

Anyway, I'll put this into TBD. For this moment, mainly handle TX path.
Or maybe the current code has been able to handle simple RX model :-)

> 
> > Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> > ---
> >   kernel/dma/map_benchmark.c                      | 11 +++++++++++
> >   tools/testing/selftests/dma/dma_map_benchmark.c | 17 +++++++++++++++--
> >   2 files changed, 26 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
> > index 1b1b8ff875cb..1976db7e34e4 100644
> > --- a/kernel/dma/map_benchmark.c
> > +++ b/kernel/dma/map_benchmark.c
> > @@ -21,6 +21,7 @@
> >   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
> >   #define DMA_MAP_MAX_THREADS	1024
> >   #define DMA_MAP_MAX_SECONDS	300
> > +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
> 
> Using MSEC_PER_SEC might be sufficiently self-documenting?

Yes, I guess you mean NSEC_PER_MSEC. will move to it.

> 
> >   #define DMA_MAP_BIDIRECTIONAL	0
> >   #define DMA_MAP_TO_DEVICE	1
> > @@ -36,6 +37,7 @@ struct map_benchmark {
> >   	__s32 node; /* which numa node this benchmark will run on */
> >   	__u32 dma_bits; /* DMA addressing capability */
> >   	__u32 dma_dir; /* DMA data direction */
> > +	__u32 dma_trans_ns; /* time for DMA transmission in ns */
> >   	__u64 expansion[10];	/* For future use */
> >   };
> >
> > @@ -87,6 +89,10 @@ static int map_benchmark_thread(void *data)
> >   		map_etime = ktime_get();
> >   		map_delta = ktime_sub(map_etime, map_stime);
> >
> > +		/* Pretend DMA is transmitting */
> > +		if (map->dir != DMA_FROM_DEVICE)
> > +			ndelay(map->bparam.dma_trans_ns);
> 
> TBH I think the option of a fixed delay between map and unmap might be a
> handy thing in general, so having the direction check at all seems
> needlessly restrictive. As long as the driver implements all the basic
> building blocks, combining them to simulate specific traffic patterns
> can be left up to the benchmark tool.

Sensible, will remove the condition check.

> 
> Robin.
> 
> > +
> >   		unmap_stime = ktime_get();
> >   		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
> >   		unmap_etime = ktime_get();
> > @@ -218,6 +224,11 @@ static long map_benchmark_ioctl(struct file *file,
> unsigned int cmd,
> >   			return -EINVAL;
> >   		}
> >
> > +		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
> > +			pr_err("invalid transmission delay\n");
> > +			return -EINVAL;
> > +		}
> > +
> >   		if (map->bparam.node != NUMA_NO_NODE &&
> >   		    !node_possible(map->bparam.node)) {
> >   			pr_err("invalid numa node\n");
> > diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c
> b/tools/testing/selftests/dma/dma_map_benchmark.c
> > index 7065163a8388..dbf426e2fb7f 100644
> > --- a/tools/testing/selftests/dma/dma_map_benchmark.c
> > +++ b/tools/testing/selftests/dma/dma_map_benchmark.c
> > @@ -14,6 +14,7 @@
> >   #define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
> >   #define DMA_MAP_MAX_THREADS	1024
> >   #define DMA_MAP_MAX_SECONDS     300
> > +#define DMA_MAP_MAX_TRANS_DELAY	(10 * 1000 * 1000) /* 10ms */
> >
> >   #define DMA_MAP_BIDIRECTIONAL	0
> >   #define DMA_MAP_TO_DEVICE	1
> > @@ -35,6 +36,7 @@ struct map_benchmark {
> >   	__s32 node; /* which numa node this benchmark will run on */
> >   	__u32 dma_bits; /* DMA addressing capability */
> >   	__u32 dma_dir; /* DMA data direction */
> > +	__u32 dma_trans_ns; /* delay for DMA transmission in ns */
> >   	__u64 expansion[10];	/* For future use */
> >   };
> >
> > @@ -45,12 +47,12 @@ int main(int argc, char **argv)
> >   	/* default single thread, run 20 seconds on NUMA_NO_NODE */
> >   	int threads = 1, seconds = 20, node = -1;
> >   	/* default dma mask 32bit, bidirectional DMA */
> > -	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
> > +	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
> >
> >   	int cmd = DMA_MAP_BENCHMARK;
> >   	char *p;
> >
> > -	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
> > +	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
> >   		switch (opt) {
> >   		case 't':
> >   			threads = atoi(optarg);
> > @@ -67,6 +69,9 @@ int main(int argc, char **argv)
> >   		case 'd':
> >   			dir = atoi(optarg);
> >   			break;
> > +		case 'x':
> > +			xdelay = atoi(optarg);
> > +			break;
> >   		default:
> >   			return -1;
> >   		}
> > @@ -84,6 +89,12 @@ int main(int argc, char **argv)
> >   		exit(1);
> >   	}
> >
> > +	if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) {
> > +		fprintf(stderr, "invalid transmit delay, must be in 0-%d\n",
> > +			DMA_MAP_MAX_TRANS_DELAY);
> > +		exit(1);
> > +	}
> > +
> >   	/* suppose the mininum DMA zone is 1MB in the world */
> >   	if (bits < 20 || bits > 64) {
> >   		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
> > @@ -107,6 +118,8 @@ int main(int argc, char **argv)
> >   	map.node = node;
> >   	map.dma_bits = bits;
> >   	map.dma_dir = dir;
> > +	map.dma_trans_ns = xdelay;
> > +
> >   	if (ioctl(fd, cmd, &map)) {
> >   		perror("ioctl");
> >   		exit(1);
> >

Thanks
Barry

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-02-05  1:19 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-04 22:58 [PATCH] dma-mapping: benchmark: pretend DMA is transmitting Barry Song
2021-02-04 22:58 ` Barry Song
2021-02-04 23:50 ` Robin Murphy
2021-02-04 23:50   ` Robin Murphy
2021-02-05  1:18   ` Song Bao Hua (Barry Song)
2021-02-05  1:18     ` Song Bao Hua (Barry Song)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.