All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
@ 2020-03-02  4:18 Xiaoguang Wang
  2020-03-02 14:05 ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Xiaoguang Wang @ 2020-03-02  4:18 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi, Xiaoguang Wang

When a user application uses a programming model such as submitting one sqe
and then waiting for its completion event, __io_uring_get_cqe() will result
in many unnecessary syscalls; see the test program below:

    /*
     * Test program: read the file named by argv[1] in 4096-byte O_DIRECT
     * chunks, submitting exactly one readv sqe and waiting for its cqe per
     * loop iteration, then print the total number of bytes read.
     *
     * Returns 0 on success and 1 on any setup, submit, wait or read error.
     * The buffer, file descriptor and ring are released on every exit path
     * taken after they have been acquired.
     */
    int main(int argc, char *argv[])
    {
            struct io_uring ring;
            int fd, ret;
            int status = 1;
            struct io_uring_sqe *sqe;
            struct io_uring_cqe *cqe;
            struct iovec iov;
            off_t offset, filesize = 0;
            void *buf;

            if (argc < 2) {
                    printf("%s: file\n", argv[0]);
                    return 1;
            }

            ret = io_uring_queue_init(4, &ring, 0);
            if (ret < 0) {
                    fprintf(stderr, "queue_init: %s\n", strerror(-ret));
                    return 1;
            }

            fd = open(argv[1], O_RDONLY | O_DIRECT);
            if (fd < 0) {
                    perror("open");
                    goto out_ring;
            }

            if (posix_memalign(&buf, 4096, 4096))
                    goto out_close;
            iov.iov_base = buf;
            iov.iov_len = 4096;

            /* Setup is done; from here on only I/O errors flip status back. */
            status = 0;
            offset = 0;
            do {
                    sqe = io_uring_get_sqe(&ring);
                    if (!sqe) {
                            printf("here\n");
                            break;
                    }
                    io_uring_prep_readv(sqe, fd, &iov, 1, offset);

                    ret = io_uring_submit(&ring);
                    if (ret < 0) {
                            fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
                            status = 1;
                            goto out_free;
                    }

                    ret = io_uring_wait_cqe(&ring, &cqe);
                    if (ret < 0) {
                            fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
                            status = 1;
                            goto out_free;
                    }

                    /* res == 0 ends the loop normally; res < 0 is a failed read. */
                    if (cqe->res <= 0) {
                            if (cqe->res < 0) {
                                    fprintf(stderr, "got eror: %d\n", cqe->res);
                                    status = 1;
                            }
                            io_uring_cqe_seen(&ring, cqe);
                            break;
                    }
                    offset += cqe->res;
                    filesize += cqe->res;
                    io_uring_cqe_seen(&ring, cqe);
            } while (1);

            /* off_t is not guaranteed to be long; print it via long long. */
            printf("filesize: %lld\n", (long long)filesize);
    out_free:
            free(buf);
    out_close:
            close(fd);
    out_ring:
            io_uring_queue_exit(&ring);
            return status;
    }

dd if=/dev/zero of=testfile bs=4096 count=16
./test  testfile
and use bpftrace to trace io_uring_enter syscalls, in original codes,
[lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
Attaching 1 probe...
@c[11184]: 49
The above test issues 49 syscalls, which is counterintuitive. After looking
into the code, it's because __io_uring_get_cqe issues one more syscall;
indeed, when __io_uring_get_cqe issues the first syscall, one cqe should
already be ready, so we don't need to wait again.

To fix this issue, after the first syscall, set wait_nr to zero. With
this patch, bpftrace shows the number of io_uring_enter syscalls is 33.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 src/queue.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/queue.c b/src/queue.c
index ef2cc2b..99a4a0c 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -53,6 +53,8 @@ int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
 		if (wait_nr || submit)
 			ret = __sys_io_uring_enter(ring->ring_fd, submit,
 						   wait_nr, flags, sigmask);
+		if (wait_nr)
+			wait_nr = 0;
 		if (ret < 0)
 			err = -errno;
 		submit -= ret;
-- 
2.17.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-02  4:18 [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls Xiaoguang Wang
@ 2020-03-02 14:05 ` Jens Axboe
  2020-03-02 15:24   ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2020-03-02 14:05 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
> When user applis programming mode, like sumbit one sqe and wait its
> completion event, __io_uring_get_cqe() will result in many unnecessary
> syscalls, see below test program:
> 
>     int main(int argc, char *argv[])
>     {
>             struct io_uring ring;
>             int fd, ret;
>             struct io_uring_sqe *sqe;
>             struct io_uring_cqe *cqe;
>             struct iovec iov;
>             off_t offset, filesize = 0;
>             void *buf;
> 
>             if (argc < 2) {
>                     printf("%s: file\n", argv[0]);
>                     return 1;
>             }
> 
>             ret = io_uring_queue_init(4, &ring, 0);
>             if (ret < 0) {
>                     fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>                     return 1;
>             }
> 
>             fd = open(argv[1], O_RDONLY | O_DIRECT);
>             if (fd < 0) {
>                     perror("open");
>                     return 1;
>             }
> 
>             if (posix_memalign(&buf, 4096, 4096))
>                     return 1;
>             iov.iov_base = buf;
>             iov.iov_len = 4096;
> 
>             offset = 0;
>             do {
>                     sqe = io_uring_get_sqe(&ring);
>                     if (!sqe) {
>                             printf("here\n");
>                             break;
>                     }
>                     io_uring_prep_readv(sqe, fd, &iov, 1, offset);
> 
>                     ret = io_uring_submit(&ring);
>                     if (ret < 0) {
>                             fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>                             return 1;
>                     }
> 
>                     ret = io_uring_wait_cqe(&ring, &cqe);
>                     if (ret < 0) {
>                             fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>                             return 1;
>                     }
> 
>                     if (cqe->res <= 0) {
>                             if (cqe->res < 0) {
>                                     fprintf(stderr, "got eror: %d\n", cqe->res);
>                                     ret = 1;
>                             }
>                             io_uring_cqe_seen(&ring, cqe);
>                             break;
>                     }
>                     offset += cqe->res;
>                     filesize += cqe->res;
>                     io_uring_cqe_seen(&ring, cqe);
>             } while (1);
> 
>             printf("filesize: %ld\n", filesize);
>             close(fd);
>             io_uring_queue_exit(&ring);
>             return 0;
>     }
> 
> dd if=/dev/zero of=testfile bs=4096 count=16
> ./test  testfile
> and use bpftrace to trace io_uring_enter syscalls, in original codes,
> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
> Attaching 1 probe...
> @c[11184]: 49
> Above test issues 49 syscalls, it's counterintuitive. After looking
> into the codes, it's because __io_uring_get_cqe issue one more syscall,
> indded when __io_uring_get_cqe issues the first syscall, one cqe should
> already be ready, we don't need to wait again.
> 
> To fix this issue, after the first syscall, set wait_nr to be zero, with
> tihs patch, bpftrace shows the number of io_uring_enter syscall is 33.

Thanks, that's a nice fix, we definitely don't want to be doing
50% more system calls than we have to...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-02 14:05 ` Jens Axboe
@ 2020-03-02 15:24   ` Jens Axboe
  2020-03-02 15:37     ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2020-03-02 15:24 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 3/2/20 7:05 AM, Jens Axboe wrote:
> On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
>> When user applis programming mode, like sumbit one sqe and wait its
>> completion event, __io_uring_get_cqe() will result in many unnecessary
>> syscalls, see below test program:
>>
>>     int main(int argc, char *argv[])
>>     {
>>             struct io_uring ring;
>>             int fd, ret;
>>             struct io_uring_sqe *sqe;
>>             struct io_uring_cqe *cqe;
>>             struct iovec iov;
>>             off_t offset, filesize = 0;
>>             void *buf;
>>
>>             if (argc < 2) {
>>                     printf("%s: file\n", argv[0]);
>>                     return 1;
>>             }
>>
>>             ret = io_uring_queue_init(4, &ring, 0);
>>             if (ret < 0) {
>>                     fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>>                     return 1;
>>             }
>>
>>             fd = open(argv[1], O_RDONLY | O_DIRECT);
>>             if (fd < 0) {
>>                     perror("open");
>>                     return 1;
>>             }
>>
>>             if (posix_memalign(&buf, 4096, 4096))
>>                     return 1;
>>             iov.iov_base = buf;
>>             iov.iov_len = 4096;
>>
>>             offset = 0;
>>             do {
>>                     sqe = io_uring_get_sqe(&ring);
>>                     if (!sqe) {
>>                             printf("here\n");
>>                             break;
>>                     }
>>                     io_uring_prep_readv(sqe, fd, &iov, 1, offset);
>>
>>                     ret = io_uring_submit(&ring);
>>                     if (ret < 0) {
>>                             fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>>                             return 1;
>>                     }
>>
>>                     ret = io_uring_wait_cqe(&ring, &cqe);
>>                     if (ret < 0) {
>>                             fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>>                             return 1;
>>                     }
>>
>>                     if (cqe->res <= 0) {
>>                             if (cqe->res < 0) {
>>                                     fprintf(stderr, "got eror: %d\n", cqe->res);
>>                                     ret = 1;
>>                             }
>>                             io_uring_cqe_seen(&ring, cqe);
>>                             break;
>>                     }
>>                     offset += cqe->res;
>>                     filesize += cqe->res;
>>                     io_uring_cqe_seen(&ring, cqe);
>>             } while (1);
>>
>>             printf("filesize: %ld\n", filesize);
>>             close(fd);
>>             io_uring_queue_exit(&ring);
>>             return 0;
>>     }
>>
>> dd if=/dev/zero of=testfile bs=4096 count=16
>> ./test  testfile
>> and use bpftrace to trace io_uring_enter syscalls, in original codes,
>> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
>> Attaching 1 probe...
>> @c[11184]: 49
>> Above test issues 49 syscalls, it's counterintuitive. After looking
>> into the codes, it's because __io_uring_get_cqe issue one more syscall,
>> indded when __io_uring_get_cqe issues the first syscall, one cqe should
>> already be ready, we don't need to wait again.
>>
>> To fix this issue, after the first syscall, set wait_nr to be zero, with
>> tihs patch, bpftrace shows the number of io_uring_enter syscall is 33.
> 
> Thanks, that's a nice fix, we definitely don't want to be doing
> 50% more system calls than we have to...

Actually, don't think the fix is quite safe. For one, if we get an error
on the __io_uring_enter(), then we may not have waited for entries. Or if
we submitted less than we thought we would, we would not have waited
either. So we need to check for full success before deeming it safe to
clear wait_nr.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-02 15:24   ` Jens Axboe
@ 2020-03-02 15:37     ` Jens Axboe
  2020-03-03 13:11       ` Xiaoguang Wang
  2020-03-04 13:27       ` Xiaoguang Wang
  0 siblings, 2 replies; 8+ messages in thread
From: Jens Axboe @ 2020-03-02 15:37 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 3/2/20 8:24 AM, Jens Axboe wrote:
> On 3/2/20 7:05 AM, Jens Axboe wrote:
>> On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
>>> When user applis programming mode, like sumbit one sqe and wait its
>>> completion event, __io_uring_get_cqe() will result in many unnecessary
>>> syscalls, see below test program:
>>>
>>>     int main(int argc, char *argv[])
>>>     {
>>>             struct io_uring ring;
>>>             int fd, ret;
>>>             struct io_uring_sqe *sqe;
>>>             struct io_uring_cqe *cqe;
>>>             struct iovec iov;
>>>             off_t offset, filesize = 0;
>>>             void *buf;
>>>
>>>             if (argc < 2) {
>>>                     printf("%s: file\n", argv[0]);
>>>                     return 1;
>>>             }
>>>
>>>             ret = io_uring_queue_init(4, &ring, 0);
>>>             if (ret < 0) {
>>>                     fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>>>                     return 1;
>>>             }
>>>
>>>             fd = open(argv[1], O_RDONLY | O_DIRECT);
>>>             if (fd < 0) {
>>>                     perror("open");
>>>                     return 1;
>>>             }
>>>
>>>             if (posix_memalign(&buf, 4096, 4096))
>>>                     return 1;
>>>             iov.iov_base = buf;
>>>             iov.iov_len = 4096;
>>>
>>>             offset = 0;
>>>             do {
>>>                     sqe = io_uring_get_sqe(&ring);
>>>                     if (!sqe) {
>>>                             printf("here\n");
>>>                             break;
>>>                     }
>>>                     io_uring_prep_readv(sqe, fd, &iov, 1, offset);
>>>
>>>                     ret = io_uring_submit(&ring);
>>>                     if (ret < 0) {
>>>                             fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>>>                             return 1;
>>>                     }
>>>
>>>                     ret = io_uring_wait_cqe(&ring, &cqe);
>>>                     if (ret < 0) {
>>>                             fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>>>                             return 1;
>>>                     }
>>>
>>>                     if (cqe->res <= 0) {
>>>                             if (cqe->res < 0) {
>>>                                     fprintf(stderr, "got eror: %d\n", cqe->res);
>>>                                     ret = 1;
>>>                             }
>>>                             io_uring_cqe_seen(&ring, cqe);
>>>                             break;
>>>                     }
>>>                     offset += cqe->res;
>>>                     filesize += cqe->res;
>>>                     io_uring_cqe_seen(&ring, cqe);
>>>             } while (1);
>>>
>>>             printf("filesize: %ld\n", filesize);
>>>             close(fd);
>>>             io_uring_queue_exit(&ring);
>>>             return 0;
>>>     }
>>>
>>> dd if=/dev/zero of=testfile bs=4096 count=16
>>> ./test  testfile
>>> and use bpftrace to trace io_uring_enter syscalls, in original codes,
>>> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
>>> Attaching 1 probe...
>>> @c[11184]: 49
>>> Above test issues 49 syscalls, it's counterintuitive. After looking
>>> into the codes, it's because __io_uring_get_cqe issue one more syscall,
>>> indded when __io_uring_get_cqe issues the first syscall, one cqe should
>>> already be ready, we don't need to wait again.
>>>
>>> To fix this issue, after the first syscall, set wait_nr to be zero, with
>>> tihs patch, bpftrace shows the number of io_uring_enter syscall is 33.
>>
>> Thanks, that's a nice fix, we definitely don't want to be doing
>> 50% more system calls than we have to...
> 
> Actually, don't think the fix is quite safe. For one, if we get an error
> on the __io_uring_enter(), then we may not have waited for entries. Or if
> we submitted less than we thought we would, we would not have waited
> either. So we need to check for full success before deeming it safe to
> clear wait_nr.

Unrelated fix:

https://git.kernel.dk/cgit/liburing/commit/?id=0edcef5700fd558d2548532e0e5db26cb74d19ca

and then a fix for your patch on top:

https://git.kernel.dk/cgit/liburing/commit/?id=dc14e30a086082b6aebc3130948e2453e3bd3b2a

Can you double check that your original test case still produces the
same amount of system calls with the fix in place?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-02 15:37     ` Jens Axboe
@ 2020-03-03 13:11       ` Xiaoguang Wang
  2020-03-03 14:35         ` Jens Axboe
  2020-03-04 13:27       ` Xiaoguang Wang
  1 sibling, 1 reply; 8+ messages in thread
From: Xiaoguang Wang @ 2020-03-03 13:11 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 3/2/20 8:24 AM, Jens Axboe wrote:
>> On 3/2/20 7:05 AM, Jens Axboe wrote:
>>> On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
>>>> When user applis programming mode, like sumbit one sqe and wait its
>>>> completion event, __io_uring_get_cqe() will result in many unnecessary
>>>> syscalls, see below test program:
>>>>
>>>>      int main(int argc, char *argv[])
>>>>      {
>>>>              struct io_uring ring;
>>>>              int fd, ret;
>>>>              struct io_uring_sqe *sqe;
>>>>              struct io_uring_cqe *cqe;
>>>>              struct iovec iov;
>>>>              off_t offset, filesize = 0;
>>>>              void *buf;
>>>>
>>>>              if (argc < 2) {
>>>>                      printf("%s: file\n", argv[0]);
>>>>                      return 1;
>>>>              }
>>>>
>>>>              ret = io_uring_queue_init(4, &ring, 0);
>>>>              if (ret < 0) {
>>>>                      fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>>>>                      return 1;
>>>>              }
>>>>
>>>>              fd = open(argv[1], O_RDONLY | O_DIRECT);
>>>>              if (fd < 0) {
>>>>                      perror("open");
>>>>                      return 1;
>>>>              }
>>>>
>>>>              if (posix_memalign(&buf, 4096, 4096))
>>>>                      return 1;
>>>>              iov.iov_base = buf;
>>>>              iov.iov_len = 4096;
>>>>
>>>>              offset = 0;
>>>>              do {
>>>>                      sqe = io_uring_get_sqe(&ring);
>>>>                      if (!sqe) {
>>>>                              printf("here\n");
>>>>                              break;
>>>>                      }
>>>>                      io_uring_prep_readv(sqe, fd, &iov, 1, offset);
>>>>
>>>>                      ret = io_uring_submit(&ring);
>>>>                      if (ret < 0) {
>>>>                              fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>>>>                              return 1;
>>>>                      }
>>>>
>>>>                      ret = io_uring_wait_cqe(&ring, &cqe);
>>>>                      if (ret < 0) {
>>>>                              fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>>>>                              return 1;
>>>>                      }
>>>>
>>>>                      if (cqe->res <= 0) {
>>>>                              if (cqe->res < 0) {
>>>>                                      fprintf(stderr, "got eror: %d\n", cqe->res);
>>>>                                      ret = 1;
>>>>                              }
>>>>                              io_uring_cqe_seen(&ring, cqe);
>>>>                              break;
>>>>                      }
>>>>                      offset += cqe->res;
>>>>                      filesize += cqe->res;
>>>>                      io_uring_cqe_seen(&ring, cqe);
>>>>              } while (1);
>>>>
>>>>              printf("filesize: %ld\n", filesize);
>>>>              close(fd);
>>>>              io_uring_queue_exit(&ring);
>>>>              return 0;
>>>>      }
>>>>
>>>> dd if=/dev/zero of=testfile bs=4096 count=16
>>>> ./test  testfile
>>>> and use bpftrace to trace io_uring_enter syscalls, in original codes,
>>>> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
>>>> Attaching 1 probe...
>>>> @c[11184]: 49
>>>> Above test issues 49 syscalls, it's counterintuitive. After looking
>>>> into the codes, it's because __io_uring_get_cqe issue one more syscall,
>>>> indded when __io_uring_get_cqe issues the first syscall, one cqe should
>>>> already be ready, we don't need to wait again.
>>>>
>>>> To fix this issue, after the first syscall, set wait_nr to be zero, with
>>>> tihs patch, bpftrace shows the number of io_uring_enter syscall is 33.
>>>
>>> Thanks, that's a nice fix, we definitely don't want to be doing
>>> 50% more system calls than we have to...
>>
>> Actually, don't think the fix is quite safe. For one, if we get an error
>> on the __io_uring_enter(), then we may not have waited for entries. Or if
>> we submitted less than we thought we would, we would not have waited
>> either. So we need to check for full success before deeming it safe to
>> clear wait_nr.
> 
> Unrelated fix:
> 
> https://git.kernel.dk/cgit/liburing/commit/?id=0edcef5700fd558d2548532e0e5db26cb74d19ca
> 
> and then a fix for your patch on top:
> 
> https://git.kernel.dk/cgit/liburing/commit/?id=dc14e30a086082b6aebc3130948e2453e3bd3b2a
> 
> Can you double check that your original test case still produces the
> same amount of system calls with the fix in place?
Yes, it still produces the same amount of system calls.
Thanks for explanation and right fix.

Regards,
Xiaoguang Wang
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-03 13:11       ` Xiaoguang Wang
@ 2020-03-03 14:35         ` Jens Axboe
  0 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2020-03-03 14:35 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 3/3/20 6:11 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 3/2/20 8:24 AM, Jens Axboe wrote:
>>> On 3/2/20 7:05 AM, Jens Axboe wrote:
>>>> On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
>>>>> When user applis programming mode, like sumbit one sqe and wait its
>>>>> completion event, __io_uring_get_cqe() will result in many unnecessary
>>>>> syscalls, see below test program:
>>>>>
>>>>>      int main(int argc, char *argv[])
>>>>>      {
>>>>>              struct io_uring ring;
>>>>>              int fd, ret;
>>>>>              struct io_uring_sqe *sqe;
>>>>>              struct io_uring_cqe *cqe;
>>>>>              struct iovec iov;
>>>>>              off_t offset, filesize = 0;
>>>>>              void *buf;
>>>>>
>>>>>              if (argc < 2) {
>>>>>                      printf("%s: file\n", argv[0]);
>>>>>                      return 1;
>>>>>              }
>>>>>
>>>>>              ret = io_uring_queue_init(4, &ring, 0);
>>>>>              if (ret < 0) {
>>>>>                      fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>>>>>                      return 1;
>>>>>              }
>>>>>
>>>>>              fd = open(argv[1], O_RDONLY | O_DIRECT);
>>>>>              if (fd < 0) {
>>>>>                      perror("open");
>>>>>                      return 1;
>>>>>              }
>>>>>
>>>>>              if (posix_memalign(&buf, 4096, 4096))
>>>>>                      return 1;
>>>>>              iov.iov_base = buf;
>>>>>              iov.iov_len = 4096;
>>>>>
>>>>>              offset = 0;
>>>>>              do {
>>>>>                      sqe = io_uring_get_sqe(&ring);
>>>>>                      if (!sqe) {
>>>>>                              printf("here\n");
>>>>>                              break;
>>>>>                      }
>>>>>                      io_uring_prep_readv(sqe, fd, &iov, 1, offset);
>>>>>
>>>>>                      ret = io_uring_submit(&ring);
>>>>>                      if (ret < 0) {
>>>>>                              fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>>>>>                              return 1;
>>>>>                      }
>>>>>
>>>>>                      ret = io_uring_wait_cqe(&ring, &cqe);
>>>>>                      if (ret < 0) {
>>>>>                              fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>>>>>                              return 1;
>>>>>                      }
>>>>>
>>>>>                      if (cqe->res <= 0) {
>>>>>                              if (cqe->res < 0) {
>>>>>                                      fprintf(stderr, "got eror: %d\n", cqe->res);
>>>>>                                      ret = 1;
>>>>>                              }
>>>>>                              io_uring_cqe_seen(&ring, cqe);
>>>>>                              break;
>>>>>                      }
>>>>>                      offset += cqe->res;
>>>>>                      filesize += cqe->res;
>>>>>                      io_uring_cqe_seen(&ring, cqe);
>>>>>              } while (1);
>>>>>
>>>>>              printf("filesize: %ld\n", filesize);
>>>>>              close(fd);
>>>>>              io_uring_queue_exit(&ring);
>>>>>              return 0;
>>>>>      }
>>>>>
>>>>> dd if=/dev/zero of=testfile bs=4096 count=16
>>>>> ./test  testfile
>>>>> and use bpftrace to trace io_uring_enter syscalls, in original codes,
>>>>> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
>>>>> Attaching 1 probe...
>>>>> @c[11184]: 49
>>>>> Above test issues 49 syscalls, it's counterintuitive. After looking
>>>>> into the codes, it's because __io_uring_get_cqe issue one more syscall,
>>>>> indded when __io_uring_get_cqe issues the first syscall, one cqe should
>>>>> already be ready, we don't need to wait again.
>>>>>
>>>>> To fix this issue, after the first syscall, set wait_nr to be zero, with
>>>>> tihs patch, bpftrace shows the number of io_uring_enter syscall is 33.
>>>>
>>>> Thanks, that's a nice fix, we definitely don't want to be doing
>>>> 50% more system calls than we have to...
>>>
>>> Actually, don't think the fix is quite safe. For one, if we get an error
>>> on the __io_uring_enter(), then we may not have waited for entries. Or if
>>> we submitted less than we thought we would, we would not have waited
>>> either. So we need to check for full success before deeming it safe to
>>> clear wait_nr.
>>
>> Unrelated fix:
>>
>> https://git.kernel.dk/cgit/liburing/commit/?id=0edcef5700fd558d2548532e0e5db26cb74d19ca
>>
>> and then a fix for your patch on top:
>>
>> https://git.kernel.dk/cgit/liburing/commit/?id=dc14e30a086082b6aebc3130948e2453e3bd3b2a
>>
>> Can you double check that your original test case still produces the
>> same amount of system calls with the fix in place?
> Yes, it still produces the same amount of system calls.
> Thanks for explanation and right fix.

Great, thanks for confirming!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-02 15:37     ` Jens Axboe
  2020-03-03 13:11       ` Xiaoguang Wang
@ 2020-03-04 13:27       ` Xiaoguang Wang
  2020-03-04 13:57         ` Jens Axboe
  1 sibling, 1 reply; 8+ messages in thread
From: Xiaoguang Wang @ 2020-03-04 13:27 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 3/2/20 8:24 AM, Jens Axboe wrote:
>> On 3/2/20 7:05 AM, Jens Axboe wrote:
>>> On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
>>>> When user applis programming mode, like sumbit one sqe and wait its
>>>> completion event, __io_uring_get_cqe() will result in many unnecessary
>>>> syscalls, see below test program:
>>>>
>>>>      int main(int argc, char *argv[])
>>>>      {
>>>>              struct io_uring ring;
>>>>              int fd, ret;
>>>>              struct io_uring_sqe *sqe;
>>>>              struct io_uring_cqe *cqe;
>>>>              struct iovec iov;
>>>>              off_t offset, filesize = 0;
>>>>              void *buf;
>>>>
>>>>              if (argc < 2) {
>>>>                      printf("%s: file\n", argv[0]);
>>>>                      return 1;
>>>>              }
>>>>
>>>>              ret = io_uring_queue_init(4, &ring, 0);
>>>>              if (ret < 0) {
>>>>                      fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>>>>                      return 1;
>>>>              }
>>>>
>>>>              fd = open(argv[1], O_RDONLY | O_DIRECT);
>>>>              if (fd < 0) {
>>>>                      perror("open");
>>>>                      return 1;
>>>>              }
>>>>
>>>>              if (posix_memalign(&buf, 4096, 4096))
>>>>                      return 1;
>>>>              iov.iov_base = buf;
>>>>              iov.iov_len = 4096;
>>>>
>>>>              offset = 0;
>>>>              do {
>>>>                      sqe = io_uring_get_sqe(&ring);
>>>>                      if (!sqe) {
>>>>                              printf("here\n");
>>>>                              break;
>>>>                      }
>>>>                      io_uring_prep_readv(sqe, fd, &iov, 1, offset);
>>>>
>>>>                      ret = io_uring_submit(&ring);
>>>>                      if (ret < 0) {
>>>>                              fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>>>>                              return 1;
>>>>                      }
>>>>
>>>>                      ret = io_uring_wait_cqe(&ring, &cqe);
>>>>                      if (ret < 0) {
>>>>                              fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>>>>                              return 1;
>>>>                      }
>>>>
>>>>                      if (cqe->res <= 0) {
>>>>                              if (cqe->res < 0) {
>>>>                                      fprintf(stderr, "got eror: %d\n", cqe->res);
>>>>                                      ret = 1;
>>>>                              }
>>>>                              io_uring_cqe_seen(&ring, cqe);
>>>>                              break;
>>>>                      }
>>>>                      offset += cqe->res;
>>>>                      filesize += cqe->res;
>>>>                      io_uring_cqe_seen(&ring, cqe);
>>>>              } while (1);
>>>>
>>>>              printf("filesize: %ld\n", filesize);
>>>>              close(fd);
>>>>              io_uring_queue_exit(&ring);
>>>>              return 0;
>>>>      }
>>>>
>>>> dd if=/dev/zero of=testfile bs=4096 count=16
>>>> ./test  testfile
>>>> and use bpftrace to trace io_uring_enter syscalls, in original codes,
>>>> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
>>>> Attaching 1 probe...
>>>> @c[11184]: 49
>>>> Above test issues 49 syscalls, it's counterintuitive. After looking
>>>> into the codes, it's because __io_uring_get_cqe issue one more syscall,
>>>> indded when __io_uring_get_cqe issues the first syscall, one cqe should
>>>> already be ready, we don't need to wait again.
>>>>
>>>> To fix this issue, after the first syscall, set wait_nr to be zero, with
>>>> tihs patch, bpftrace shows the number of io_uring_enter syscall is 33.
>>>
>>> Thanks, that's a nice fix, we definitely don't want to be doing
>>> 50% more system calls than we have to...
>>
>> Actually, don't think the fix is quite safe. For one, if we get an error
>> on the __io_uring_enter(), then we may not have waited for entries. Or if
>> we submitted less than we thought we would, we would not have waited
>> either. So we need to check for full success before deeming it safe to
>> clear wait_nr.
> 
> Unrelated fix:
> 
> https://git.kernel.dk/cgit/liburing/commit/?id=0edcef5700fd558d2548532e0e5db26cb74d19ca
> 
> and then a fix for your patch on top:
> 
> https://git.kernel.dk/cgit/liburing/commit/?id=dc14e30a086082b6aebc3130948e2453e3bd3b2a
In this patch, it seems that you forgot to delete:
     if (wait_nr)
         wait_nr = 0;

With these two lines removed, my original test case still produces the same number
of io_uring_enter syscalls, so you can just remove them safely.

Regards,
Xiaoguang Wang



> 
> Can you double check that your original test case still produces the
> same amount of system calls with the fix in place?
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls
  2020-03-04 13:27       ` Xiaoguang Wang
@ 2020-03-04 13:57         ` Jens Axboe
  0 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2020-03-04 13:57 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 3/4/20 6:27 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 3/2/20 8:24 AM, Jens Axboe wrote:
>>> On 3/2/20 7:05 AM, Jens Axboe wrote:
>>>> On 3/1/20 9:18 PM, Xiaoguang Wang wrote:
>>>>> When the user applies a programming model such as submitting one sqe and
>>>>> then waiting for its completion event, __io_uring_get_cqe() will result in
>>>>> many unnecessary syscalls; see the test program below:
>>>>>
>>>>>      int main(int argc, char *argv[])
>>>>>      {
>>>>>              struct io_uring ring;
>>>>>              int fd, ret;
>>>>>              struct io_uring_sqe *sqe;
>>>>>              struct io_uring_cqe *cqe;
>>>>>              struct iovec iov;
>>>>>              off_t offset, filesize = 0;
>>>>>              void *buf;
>>>>>
>>>>>              if (argc < 2) {
>>>>>                      printf("%s: file\n", argv[0]);
>>>>>                      return 1;
>>>>>              }
>>>>>
>>>>>              ret = io_uring_queue_init(4, &ring, 0);
>>>>>              if (ret < 0) {
>>>>>                      fprintf(stderr, "queue_init: %s\n", strerror(-ret));
>>>>>                      return 1;
>>>>>              }
>>>>>
>>>>>              fd = open(argv[1], O_RDONLY | O_DIRECT);
>>>>>              if (fd < 0) {
>>>>>                      perror("open");
>>>>>                      return 1;
>>>>>              }
>>>>>
>>>>>              if (posix_memalign(&buf, 4096, 4096))
>>>>>                      return 1;
>>>>>              iov.iov_base = buf;
>>>>>              iov.iov_len = 4096;
>>>>>
>>>>>              offset = 0;
>>>>>              do {
>>>>>                      sqe = io_uring_get_sqe(&ring);
>>>>>                      if (!sqe) {
>>>>>                              printf("here\n");
>>>>>                              break;
>>>>>                      }
>>>>>                      io_uring_prep_readv(sqe, fd, &iov, 1, offset);
>>>>>
>>>>>                      ret = io_uring_submit(&ring);
>>>>>                      if (ret < 0) {
>>>>>                              fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
>>>>>                              return 1;
>>>>>                      }
>>>>>
>>>>>                      ret = io_uring_wait_cqe(&ring, &cqe);
>>>>>                      if (ret < 0) {
>>>>>                              fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
>>>>>                              return 1;
>>>>>                      }
>>>>>
>>>>>                      if (cqe->res <= 0) {
>>>>>                              if (cqe->res < 0) {
>>>>>                                      fprintf(stderr, "got eror: %d\n", cqe->res);
>>>>>                                      ret = 1;
>>>>>                              }
>>>>>                              io_uring_cqe_seen(&ring, cqe);
>>>>>                              break;
>>>>>                      }
>>>>>                      offset += cqe->res;
>>>>>                      filesize += cqe->res;
>>>>>                      io_uring_cqe_seen(&ring, cqe);
>>>>>              } while (1);
>>>>>
>>>>>              printf("filesize: %ld\n", filesize);
>>>>>              close(fd);
>>>>>              io_uring_queue_exit(&ring);
>>>>>              return 0;
>>>>>      }
>>>>>
>>>>> dd if=/dev/zero of=testfile bs=4096 count=16
>>>>> ./test  testfile
>>>>> and use bpftrace to trace io_uring_enter syscalls, in original codes,
>>>>> [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}"
>>>>> Attaching 1 probe...
>>>>> @c[11184]: 49
>>>>> The above test issues 49 syscalls, which is counterintuitive. After looking
>>>>> into the code, it's because __io_uring_get_cqe issues one more syscall;
>>>>> indeed, when __io_uring_get_cqe issues the first syscall, one cqe should
>>>>> already be ready, so we don't need to wait again.
>>>>>
>>>>> To fix this issue, after the first syscall, set wait_nr to zero. With
>>>>> this patch, bpftrace shows the number of io_uring_enter syscalls is 33.
>>>>
>>>> Thanks, that's a nice fix, we definitely don't want to be doing
>>>> 50% more system calls than we have to...
>>>
>>> Actually, don't think the fix is quite safe. For one, if we get an error
>>> on the __io_uring_enter(), then we may not have waited for entries. Or if
>>> we submitted less than we thought we would, we would not have waited
>>> either. So we need to check for full success before deeming it safe to
>>> clear wait_nr.
>>
>> Unrelated fix:
>>
>> https://git.kernel.dk/cgit/liburing/commit/?id=0edcef5700fd558d2548532e0e5db26cb74d19ca
>>
>> and then a fix for your patch on top:
>>
>> https://git.kernel.dk/cgit/liburing/commit/?id=dc14e30a086082b6aebc3130948e2453e3bd3b2a
> In this patch, it seems that you forgot to delete:
>      if (wait_nr)
>          wait_nr = 0;
> 
> With these two lines removed, my original test case still produces the same number
> of io_uring_enter syscalls, so you can safely remove them.

Ah indeed, thanks for letting me know! Killed those two lines.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2020-03-04 13:57 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-02  4:18 [PATCH] __io_uring_get_cqe: eliminate unnecessary io_uring_enter() syscalls Xiaoguang Wang
2020-03-02 14:05 ` Jens Axboe
2020-03-02 15:24   ` Jens Axboe
2020-03-02 15:37     ` Jens Axboe
2020-03-03 13:11       ` Xiaoguang Wang
2020-03-03 14:35         ` Jens Axboe
2020-03-04 13:27       ` Xiaoguang Wang
2020-03-04 13:57         ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.