io-uring.vger.kernel.org archive mirror
* Is io_uring framework becoming bloated gradually? and introduces performance regression
@ 2020-05-08 15:18 Xiaoguang Wang
  2020-05-08 16:37 ` Jens Axboe
  0 siblings, 1 reply; 6+ messages in thread
From: Xiaoguang Wang @ 2020-05-08 15:18 UTC (permalink / raw)
  To: io-uring; +Cc: Pavel Begunkov, axboe, joseph qi, Jiufei Xue

[-- Attachment #1: Type: text/plain, Size: 2099 bytes --]

hi,

This issue was found when I tested the IORING_FEAT_FAST_POLL feature with the newest
upstream code. Indeed, I find that io_uring's performance improvement over epoll is not
obvious in my test environment; most of the time they perform similarly.
The test cases basically come from:
   https://github.com/frevib/io_uring-echo-server/blob/io-uring-feat-fast-poll/benchmarks/benchmarks.md.
In the above URL, the author's test results show that io_uring gets a big performance
improvement compared to epoll. I'm still looking into why I don't see that big improvement,
but meanwhile I have found an obvious regression issue.

I wrote a simple tool based on the io_uring nop operation to evaluate the io_uring framework
in v5.1 and 5.7.0-rc4+ (Jens's io_uring-5.7 branch), and I see an obvious performance regression:
v5.1 kernel:
     $sudo taskset -c 60 ./io_uring_nop_stress -r 300 # run 300 seconds
     total ios: 1832524960
     IOPS:      6108416
5.7.0-rc4+
     $sudo taskset -c 60 ./io_uring_nop_stress -r 300
     total ios: 1597672304
     IOPS:      5325574
That is about a 12% performance regression.
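
(For reference, the test tool is attached below. Assuming liburing and its development
headers are installed, it can be built with something like:
     gcc -O2 -o io_uring_nop_stress io_uring_nop_stress.c -luring
and then run pinned to a single CPU with taskset as shown above.)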

Using perf, I can see many performance bottlenecks; io_submit_sqes() is one of them.
I haven't done much analysis yet, but looking at io_submit_sqes(), there are many
assignment operations in io_init_req(), and I'm not sure they are all needed when the
req does not have to be punted to io-wq. For example:
     INIT_IO_WORK(&req->work, io_wq_submit_work); # a whole-struct assignment
According to perf annotate, this is an expensive operation. I think reqs that use the
fast-poll feature go through the task-work path, so the INIT_IO_WORK may not be necessary.
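
As a rough sketch of what I mean (hypothetical helper and flag names, not actual
mainline code), the expensive init could be deferred until a request is really
punted to io-wq, something like:

     /* illustrative only: do the whole-struct work init lazily, at most once */
     static inline void io_req_init_async(struct io_kiocb *req)
     {
             if (req->flags & REQ_F_WORK_INITIALIZED)
                     return;
             INIT_IO_WORK(&req->work, io_wq_submit_work);
             req->flags |= REQ_F_WORK_INITIALIZED;
     }

and the io-wq punt path would call this helper, instead of io_init_req() doing the
assignment unconditionally for every request.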

The above is just one issue. What I worry about is whether io_uring is gradually becoming
more bloated, and whether it will end up not that much better than aio. In
https://kernel.dk/io_uring.pdf, it says that io_uring eliminates a 104-byte copy compared
to aio, but looking at the current io_init_req(), io_uring may copy even more, introducing
more overhead. Or do we need to carefully re-design struct io_kiocb to reduce this
overhead as soon as possible?


Regards,
Xiaoguang Wang

[-- Attachment #2: io_uring_nop_stress.c --]
[-- Type: text/plain, Size: 2690 bytes --]

/*
 * Description: run various io_uring nop tests
 * xiaoguang.wang@linux.alibaba.com
 *
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <fcntl.h>

#include "liburing.h"

static char *myprog;
static int batch_count = 16;
static int force_async = 0;
static int runtime = 30;
static unsigned long long ios;
static volatile int stop = 0;

static int test_nop(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	int i, ret;

	for (i = 0; i < batch_count; i++) {
		sqe = io_uring_get_sqe(ring);
		if (!sqe) {
			fprintf(stderr, "get sqe failed\n");
			goto err;
		}
		io_uring_prep_nop(sqe);
		if (force_async)
			sqe->flags |= IOSQE_ASYNC;
	}

	ret = io_uring_submit(ring);
	if (ret <= 0) {
		fprintf(stderr, "sqe submit failed: %d\n", ret);
		goto err;
	}

	for (i = 0; i < batch_count; i++) {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "wait completion %d\n", ret);
			goto err;
		}
		io_uring_cqe_seen(ring, cqe);
	}
	ios += batch_count;
	return 0;
err:
	return 1;
}

static void usage(void)
{
	printf("Usage: %s -H   or\n", myprog);
	printf("       %s [-b batch][-a][-r runtime]\n", myprog);
	printf("   -b  batch    submission batch count, default 16\n");
	printf("   -r  runtime  run time, default 30\n");
	printf("   -a           force asynchronous submission\n");
	printf("   -H           prints usage and exits\n");
}

static void alarm_handler(int signum)
{
	(void)signum;
	stop = 1;
}

int main(int argc, char *argv[])
{
	struct io_uring ring;
	struct sigaction sa;
	int ret, c;
	const char *opts = "b:ar:H";

	myprog = argv[0];
	while ((c = getopt(argc, argv, opts)) != -1) {
		switch (c) {
		case 'b':
			batch_count = atoi(optarg);
			break;
		case 'a':
			force_async = 1;
			break;
		case 'r':
			runtime = atoi(optarg);
			break;
		case 'H':
			usage();
			exit(1);
		}
	}

	if (!batch_count) {
		fprintf(stderr, "batch count should be greater than 0\n");
		exit(1);
	}

	if (!runtime) {
		printf("run time is zero, are you sure?\n");
		return 0;
	}

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = alarm_handler;
	sigemptyset(&sa.sa_mask);
	ret = sigaction(SIGALRM, &sa, NULL);
	if (ret < 0) {
		fprintf(stderr, "sigaction failed: %s", strerror(errno));
		exit(1);
	}
	alarm(runtime);

	ret = io_uring_queue_init(batch_count, &ring, 0);
	if (ret) {
		fprintf(stderr, "ring setup failed: %d\n", ret);
		return 1;
	}

	while (!stop) {
		ret = test_nop(&ring);
		if (ret) {
			fprintf(stderr, "test_nop failed\n");
			return ret;
		}
	}
	printf("total ios: %llu\n", ios);
	printf("IOPS:      %llu\n", ios / runtime);

	return 0;
}

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Is io_uring framework becoming bloated gradually? and introduces performance regression
  2020-05-08 15:18 Is io_uring framework becoming bloated gradually? and introduces performance regression Xiaoguang Wang
@ 2020-05-08 16:37 ` Jens Axboe
  2020-05-10 18:10   ` H. de Vries
  2020-05-11  3:30   ` Xiaoguang Wang
  0 siblings, 2 replies; 6+ messages in thread
From: Jens Axboe @ 2020-05-08 16:37 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: Pavel Begunkov, joseph qi, Jiufei Xue

On 5/8/20 9:18 AM, Xiaoguang Wang wrote:
> hi,
> 
> This issue was found when I tested the IORING_FEAT_FAST_POLL feature
> with the newest upstream code. Indeed, I find that io_uring's
> performance improvement over epoll is not obvious in my test
> environment; most of the time they perform similarly. The test cases
> basically come from:
> https://github.com/frevib/io_uring-echo-server/blob/io-uring-feat-fast-poll/benchmarks/benchmarks.md.
> In the above URL, the author's test results show that io_uring gets a
> big performance improvement compared to epoll. I'm still looking into
> why I don't see that big improvement, but meanwhile I have found an
> obvious regression issue.
> 
> I wrote a simple tool based on the io_uring nop operation to evaluate
> the io_uring framework in v5.1 and 5.7.0-rc4+ (Jens's io_uring-5.7
> branch), and I see an obvious performance regression:
>
> v5.1 kernel:
>      $sudo taskset -c 60 ./io_uring_nop_stress -r 300 # run 300 seconds
>      total ios: 1832524960
>      IOPS:      6108416
> 5.7.0-rc4+
>      $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>      total ios: 1597672304
>      IOPS:      5325574
> That is about a 12% performance regression.

For sure there's a bit more bloat in 5.7+ compared to the initial slim
version, and focus has been on features to a certain extent recently.
The poll rework for 5.7 will really improve performance for the
networked side though, so it's not like it's just piling on features
that add bloat.

That said, I do think it's time for a revisit on overhead. It's been a
while since I've done my disk IO testing. The nop testing isn't _that_
interesting by itself, as a micro benchmark it does yield some results
though. Are you running on bare metal or in a VM?

> Using perf, I can see many performance bottlenecks; io_submit_sqes()
> is one of them. I haven't done much analysis yet, but looking at
> io_submit_sqes(), there are many assignment operations in
> io_init_req(), and I'm not sure they are all needed when the req does
> not have to be punted to io-wq. For example:
> INIT_IO_WORK(&req->work, io_wq_submit_work); # a whole-struct
> assignment. According to perf annotate, this is an expensive
> operation. I think reqs that use the fast-poll feature go through the
> task-work path, so the INIT_IO_WORK may not be necessary.

I'm sure there's some low hanging fruit there, and I'd love to take
patches for it.

> The above is just one issue. What I worry about is whether io_uring
> is gradually becoming more bloated, and whether it will end up not
> that much better than aio. In https://kernel.dk/io_uring.pdf, it says
> that io_uring eliminates a 104-byte copy compared to aio, but looking
> at the current io_init_req(), io_uring may copy even more, introducing
> more overhead. Or do we need to carefully re-design struct io_kiocb
> to reduce this overhead as soon as possible?

The copy refers to the data structures coming in and out; both io_uring
and aio initialize their main io_kiocb/aio_kiocb structure as well. The
io_uring one is slightly bigger, but not by much, and it's the same
number of cachelines, so there should not be a huge difference there. The
copying on the aio side is basically first the pointer copy, then the
user-side kiocb structure. io_uring doesn't need to do that. The
completion side is also slimmer. We also don't need as many system calls
to do the same thing, for example.

So no, we should always be substantially slimmer than aio, just by the
very nature of the API.

One major thing I've been thinking about for io_uring is io_kiocb
recycling. We're hitting the memory allocator for alloc+free for each
request, even though that can be somewhat amortized by doing batched
submissions, and polling for instance can also do batched frees. But I'm
pretty sure we can find some gains here by having some io_kiocb caching
that is persistent across operations.

Outside of that, runtime analysis and speeding up the normal path
through io_uring will probably also easily yield us an extra 5% (or
more).

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Is io_uring framework becoming bloated gradually? and introduces performance regression
  2020-05-08 16:37 ` Jens Axboe
@ 2020-05-10 18:10   ` H. de Vries
  2020-05-11  5:43     ` Xiaoguang Wang
  2020-05-11  3:30   ` Xiaoguang Wang
  1 sibling, 1 reply; 6+ messages in thread
From: H. de Vries @ 2020-05-10 18:10 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring
  Cc: Pavel Begunkov, joseph qi, Jiufei Xue

Hi Xiaoguang,

Why exactly you don't get the performance that [1] refers to could have many reasons. Your nop test code seems quite different from the echo server used for [1]; [1] employs an event loop (single thread) specifically for networking, which saves an expensive call to epoll in each iteration of the loop. I glanced at your test program; what is the performance difference with IOSQE_ASYNC vs. without IOSQE_ASYNC?

[1]: https://github.com/frevib/io_uring-echo-server/blob/io-uring-feat-fast-poll/benchmarks/benchmarks.md.

--
Hielke de Vries



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Is io_uring framework becoming bloated gradually? and introduces performance regression
  2020-05-08 16:37 ` Jens Axboe
  2020-05-10 18:10   ` H. de Vries
@ 2020-05-11  3:30   ` Xiaoguang Wang
  2020-05-11 18:08     ` Jens Axboe
  1 sibling, 1 reply; 6+ messages in thread
From: Xiaoguang Wang @ 2020-05-11  3:30 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: Pavel Begunkov, joseph qi, Jiufei Xue

hi,

> On 5/8/20 9:18 AM, Xiaoguang Wang wrote:
>> hi,
>>
>> This issue was found when I tested the IORING_FEAT_FAST_POLL feature
>> with the newest upstream code. Indeed, I find that io_uring's
>> performance improvement over epoll is not obvious in my test
>> environment; most of the time they perform similarly. The test cases
>> basically come from:
>> https://github.com/frevib/io_uring-echo-server/blob/io-uring-feat-fast-poll/benchmarks/benchmarks.md.
>> In the above URL, the author's test results show that io_uring gets a
>> big performance improvement compared to epoll. I'm still looking into
>> why I don't see that big improvement, but meanwhile I have found an
>> obvious regression issue.
>>
>> I wrote a simple tool based on the io_uring nop operation to evaluate
>> the io_uring framework in v5.1 and 5.7.0-rc4+ (Jens's io_uring-5.7
>> branch), and I see an obvious performance regression:
>>
>> v5.1 kernel:
>>       $sudo taskset -c 60 ./io_uring_nop_stress -r 300 # run 300 seconds
>>       total ios: 1832524960
>>       IOPS:      6108416
>> 5.7.0-rc4+
>>       $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>       total ios: 1597672304
>>       IOPS:      5325574
>> That is about a 12% performance regression.
> 
> For sure there's a bit more bloat in 5.7+ compared to the initial slim
> version, and focus has been on features to a certain extent recently.
> The poll rework for 5.7 will really improve performance for the
> networked side though, so it's not like it's just piling on features
> that add bloat.
> 
> That said, I do think it's time for a revisit on overhead. It's been a
> while since I've done my disk IO testing. The nop testing isn't _that_
> interesting by itself, as a micro benchmark it does yield some results
> though. Are you running on bare metal or in a VM?
I ran it on bare metal.

> 
>> Using perf, I can see many performance bottlenecks; io_submit_sqes()
>> is one of them. I haven't done much analysis yet, but looking at
>> io_submit_sqes(), there are many assignment operations in
>> io_init_req(), and I'm not sure they are all needed when the req does
>> not have to be punted to io-wq. For example:
>> INIT_IO_WORK(&req->work, io_wq_submit_work); # a whole-struct
>> assignment. According to perf annotate, this is an expensive
>> operation. I think reqs that use the fast-poll feature go through the
>> task-work path, so the INIT_IO_WORK may not be necessary.
> 
> I'm sure there's some low hanging fruit there, and I'd love to take
> patches for it.
Ok, I'll try to improve it a bit, thanks.

> 
>> The above is just one issue. What I worry about is whether io_uring
>> is gradually becoming more bloated, and whether it will end up not
>> that much better than aio. In https://kernel.dk/io_uring.pdf, it says
>> that io_uring eliminates a 104-byte copy compared to aio, but looking
>> at the current io_init_req(), io_uring may copy even more, introducing
>> more overhead. Or do we need to carefully re-design struct io_kiocb
>> to reduce this overhead as soon as possible?
> 
> The copy refers to the data structures coming in and out; both io_uring
> and aio initialize their main io_kiocb/aio_kiocb structure as well. The
> io_uring one is slightly bigger, but not by much, and it's the same
> number of cachelines, so there should not be a huge difference there.
> The copying on the aio side is basically first the pointer copy, then
> the user-side kiocb structure. io_uring doesn't need to do that. The
> completion side is also slimmer. We also don't need as many system calls
> to do the same thing, for example.
> 
> So no, we should always be substantially slimmer than aio, just by the
> very nature of the API.
Thanks a lot for the detailed explanations; now I see, and I feel better about it :)

> 
> One major thing I've been thinking about for io_uring is io_kiocb
> recycling. We're hitting the memory allocator for alloc+free for each
> request, even though that can be somewhat amortized by doing batched
> submissions, and polling for instance can also do batched frees. But I'm
> pretty sure we can find some gains here by having some io_kiocb caching
> that is persistent across operations.
Yes, agreed. In my nop tests above, perf shows that kmem_cache_free()
and kmem_cache_alloc_bulk() introduce considerable overhead.
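
(In case it is useful, one way to see that overhead, assuming perf is available,
is something like:
     sudo perf record -g -C 60 -- sleep 10   # while the nop test runs pinned to CPU 60
     sudo perf report
though the exact split will of course depend on the machine.)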

Regards,
Xiaoguang Wang

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Is io_uring framework becoming bloated gradually? and introduces performance regression
  2020-05-10 18:10   ` H. de Vries
@ 2020-05-11  5:43     ` Xiaoguang Wang
  0 siblings, 0 replies; 6+ messages in thread
From: Xiaoguang Wang @ 2020-05-11  5:43 UTC (permalink / raw)
  To: H. de Vries, Jens Axboe, io-uring; +Cc: Pavel Begunkov, joseph qi, Jiufei Xue

hi,

> Hi Xiaoguang,
> 
> Why exactly you don't get the performance that [1] refers to could have many reasons.
Yes, I think so, and I'll keep looking into it.

> Your nop test code seems quite different from the echo server used for [1]; [1] employs an event loop (single thread) specifically for networking, which saves an expensive call to epoll in each iteration of the loop.
This nop test case was just written to evaluate the io_uring framework itself.
I added code to count the number of syscalls in io_uring_echo_server.c, and io_uring
really does reduce syscalls tremendously. So my question is why I still don't see an
obvious improvement even though io_uring eliminates so many syscalls; that's why I
wrote this mail to discuss the io_uring framework's own overhead :)

> I glanced at your test program; what is the performance difference with IOSQE_ASYNC vs. without IOSQE_ASYNC?
If IOSQE_ASYNC is set, io_uring will complete the nop operations in its internal
io-wq thread pool, so it will be a bit slower.

> 
> [1]: https://github.com/frevib/io_uring-echo-server/blob/io-uring-feat-fast-poll/benchmarks/benchmarks.md.
Do you have time to run a test against the newest upstream kernel in your test environment?
Or do you still remember the head kernel commit you tested?

And in your io-uring-feat-fast-poll branch, you use:

    // tell the kernel we have put an sqe on the submission ring
    io_uring_submit(&ring);

    // wait for a new cqe to become available
    ret = io_uring_wait_cqe(&ring, &cqe);
    if (ret != 0) {
        perror("Error io_uring_wait_cqe\n");
        exit(1);
    }

Instead, you could use:

    ret = io_uring_submit_and_wait(&ring, 1);
    if (ret < 0) {
        perror("Error io_uring_submit_and_wait\n");
        exit(1);
    }

so that we can reduce the number of syscalls even further.

Regards,
Xiaoguang Wang


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Is io_uring framework becoming bloated gradually? and introduces performance regression
  2020-05-11  3:30   ` Xiaoguang Wang
@ 2020-05-11 18:08     ` Jens Axboe
  0 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2020-05-11 18:08 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: Pavel Begunkov, joseph qi, Jiufei Xue

On 5/10/20 9:30 PM, Xiaoguang Wang wrote:
>> One major thing I've been thinking about for io_uring is io_kiocb
>> recycling. We're hitting the memory allocator for alloc+free for each
>> request, even though that can be somewhat amortized by doing batched
>> submissions, and polling for instance can also do batched frees. But I'm
>> pretty sure we can find some gains here by having some io_kiocb caching
>> that is persistent across operations.
> Yes, agreed. In my nop tests above, perf shows that kmem_cache_free()
> and kmem_cache_alloc_bulk() introduce considerable overhead.

I just did a quick'n dirty patch where we have a global per-cpu cache of
io_kiocbs, so we can recycle them.

I'm using my laptop for this testing, running in kvm. This isn't ideal,
as the spinlock/irq overhead is much larger there, but it's the same
across all runs.

Baseline performance, this is running your test app as-is, no changes:

# taskset -c 0 ./io_uring_nop_stress 
total ios: 294158288
IOPS:      9805276

Then I disabled the bulk alloc optimization, so we don't get any
benefit from allocating 16 requests at a time:

# taskset -c 0 ./io_uring_nop_stress 
total ios: 285341456
IOPS:      9511381

As expected, we're down a bit at that point, as we're allocating each
io_kiocb individually.

Then I tried with the quick'n dirty patch:

# taskset -c 0 ./io_uring_nop_stress 
total ios: 325797040
IOPS:      10859901

and it's quite a bit faster. This is cheating a little bit, since these
are just nops and we don't have to disable local irqs while grabbing a
recycled entry. But even with that added, it should be safe to use, and
it probably won't make much of a difference on actual hardware.

The main concern for me is how to manage this cache. We could just limit
it to X entries per cpu or something like that, but it'd be nice to be
able to respond to memory pressure.

Quick'n dirty below.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c2fe06ae20b..b6a6d0f0f4e5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -673,6 +673,13 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+struct io_kiocb_cache {
+	struct list_head	list;
+	unsigned		nr_free;
+};
+
+static struct io_kiocb_cache *alloc_cache;
+
 struct io_op_def {
 	/* needs req->io allocated for deferral/async */
 	unsigned		async_ctx : 1;
@@ -1293,6 +1300,29 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
+static struct io_kiocb *io_cache_alloc(void)
+{
+	struct io_kiocb_cache *cache;
+	struct io_kiocb *req = NULL;
+
+	cache = this_cpu_ptr(alloc_cache);
+	if (!list_empty(&cache->list)) {
+		req = list_first_entry(&cache->list, struct io_kiocb, list);
+		list_del(&req->list);
+		return req;
+	}
+
+	return NULL;
+}
+
+static void io_cache_free(struct io_kiocb *req)
+{
+	struct io_kiocb_cache *cache;
+
+	cache = this_cpu_ptr(alloc_cache);
+	list_add(&req->list, &cache->list);
+}
+
 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 				     struct io_submit_state *state)
 {
@@ -1300,6 +1330,9 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 	struct io_kiocb *req;
 
 	if (!state) {
+		req = io_cache_alloc();
+		if (req)
+			return req;
 		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
 			goto fallback;
@@ -1372,7 +1405,7 @@ static void __io_free_req(struct io_kiocb *req)
 
 	percpu_ref_put(&req->ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
+		io_cache_free(req);
 	else
 		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
@@ -5842,7 +5875,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			  struct file *ring_file, int ring_fd, bool async)
 {
-	struct io_submit_state state, *statep = NULL;
+	//struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
 	int i, submitted = 0;
 
@@ -5859,10 +5892,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
 		return -EAGAIN;
 
+#if 0
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, nr);
 		statep = &state;
 	}
+#endif
 
 	ctx->ring_fd = ring_fd;
 	ctx->ring_file = ring_file;
@@ -5877,14 +5912,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
+		req = io_alloc_req(ctx, NULL);
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
 			break;
 		}
 
-		err = io_init_req(ctx, req, sqe, statep, async);
+		err = io_init_req(ctx, req, sqe, NULL, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
@@ -5898,7 +5933,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
 						true, async);
-		err = io_submit_sqe(req, sqe, statep, &link);
+		err = io_submit_sqe(req, sqe, NULL, &link);
 		if (err)
 			goto fail_req;
 	}
@@ -5910,8 +5945,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	}
 	if (link)
 		io_queue_link_head(link);
+#if 0
 	if (statep)
 		io_submit_state_end(&state);
+#endif
 
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
 	io_commit_sqring(ctx);
@@ -8108,6 +8145,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 
 static int __init io_uring_init(void)
 {
+	int cpu;
+
 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
@@ -8147,6 +8186,15 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+	alloc_cache = alloc_percpu(struct io_kiocb_cache);
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache;
+
+		cache = per_cpu_ptr(alloc_cache, cpu);
+		INIT_LIST_HEAD(&cache->list);
+		cache->nr_free = 0;
+	}
 	return 0;
 };
 __initcall(io_uring_init);

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-05-11 18:08 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-08 15:18 Is io_uring framework becoming bloated gradually? and introduces performance regression Xiaoguang Wang
2020-05-08 16:37 ` Jens Axboe
2020-05-10 18:10   ` H. de Vries
2020-05-11  5:43     ` Xiaoguang Wang
2020-05-11  3:30   ` Xiaoguang Wang
2020-05-11 18:08     ` Jens Axboe
