On 1/7/22 3:33 PM, Praveen Kumar wrote: > On 07-01-2022 02:08, Ammar Faizi wrote: >> On Thu, 6 Jan 2022 at 23:01:59 +0530, Praveen Kumar wrote: >>> On 30-12-2021 23:22, Ammar Faizi wrote: >>>> This adds sendto(2) and recvfrom(2) support for io_uring. >>>> >>>> New opcodes: >>>> IORING_OP_SENDTO >>>> IORING_OP_RECVFROM >>>> >>>> Cc: Nugra >>>> Tested-by: Nugra >>>> Link: https://github.com/axboe/liburing/issues/397 >>>> Signed-off-by: Ammar Faizi >>>> --- >>>> >>>> v3: >>>> - Fix build error when CONFIG_NET is undefined should be done in >>>> the first patch, not this patch. >>>> >>>> - Add Tested-by tag from Nugra. >>>> >>>> v2: >>>> - In `io_recvfrom()`, mark the error check of `move_addr_to_user()` >>>> call as unlikely. >>>> >>>> - Fix build error when CONFIG_NET is undefined. >>>> >>>> - Added Nugra to CC list (tester). >>>> --- >>>> fs/io_uring.c | 80 +++++++++++++++++++++++++++++++++-- >>>> include/uapi/linux/io_uring.h | 2 + >>>> 2 files changed, 78 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>> index 7adcb591398f..3726958f8f58 100644 >>>> --- a/fs/io_uring.c >>>> +++ b/fs/io_uring.c >>>> @@ -575,7 +575,15 @@ struct io_sr_msg { >>>> union { >>>> struct compat_msghdr __user *umsg_compat; >>>> struct user_msghdr __user *umsg; >>>> - void __user *buf; >>>> + >>>> + struct { >>>> + void __user *buf; >>>> + struct sockaddr __user *addr; >>>> + union { >>>> + int sendto_addr_len; >>>> + int __user *recvfrom_addr_len; >>>> + }; >>>> + }; >>>> }; >>>> int msg_flags; >>>> int bgid; >>>> @@ -1133,6 +1141,19 @@ static const struct io_op_def io_op_defs[] = { >>>> .needs_file = 1 >>>> }, >>>> [IORING_OP_GETXATTR] = {}, >>>> + [IORING_OP_SENDTO] = { >>>> + .needs_file = 1, >>>> + .unbound_nonreg_file = 1, >>>> + .pollout = 1, >>>> + .audit_skip = 1, >>>> + }, >>>> + [IORING_OP_RECVFROM] = { >>>> + .needs_file = 1, >>>> + .unbound_nonreg_file = 1, >>>> + .pollin = 1, >>>> + .buffer_select = 1, >>>> + .audit_skip = 1, >>>> + }, >>>> }; >>>> >>>> /* requests with any of those set should undergo io_disarm_next() */ >>>> @@ -5216,12 +5237,24 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) >>>> return -EINVAL; >>>> >>>> + /* >>>> + * For IORING_OP_SEND{,TO}, the assignment to @sr->umsg >>>> + * is equivalent to an assignment to @sr->buf. >>>> + */ >>>> sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); >>>> + >>>> sr->len = READ_ONCE(sqe->len); >>>> sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; >>>> if (sr->msg_flags & MSG_DONTWAIT) >>>> req->flags |= REQ_F_NOWAIT; >>>> >>>> + if (req->opcode == IORING_OP_SENDTO) { >>>> + sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); >>>> + sr->sendto_addr_len = READ_ONCE(sqe->addr3); >>>> + } else { >>>> + sr->addr = (struct sockaddr __user *) NULL; >>> >>> Let's have sendto_addr_len = 0 >> >> Will do in the RFC v5. >> >>> >>>> + } >>>> + >>>> #ifdef CONFIG_COMPAT >>>> if (req->ctx->compat) >>>> sr->msg_flags |= MSG_CMSG_COMPAT; >>>> @@ -5275,6 +5308,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) >>>> >>>> static int io_sendto(struct io_kiocb *req, unsigned int issue_flags) >>>> { >>>> + struct sockaddr_storage address; >>>> struct io_sr_msg *sr = &req->sr_msg; >>>> struct msghdr msg; >>>> struct iovec iov; >>>> @@ -5291,10 +5325,20 @@ static int io_sendto(struct io_kiocb *req, unsigned int issue_flags) >>>> if (unlikely(ret)) >>>> return ret; >>>> >>>> - msg.msg_name = NULL; >>>> + >>>> msg.msg_control = NULL; >>>> msg.msg_controllen = 0; >>>> - msg.msg_namelen = 0; >>>> + if (sr->addr) { >>>> + ret = move_addr_to_kernel(sr->addr, sr->sendto_addr_len, >>>> + &address); >>>> + if (unlikely(ret < 0)) >>>> + goto fail; >>>> + msg.msg_name = (struct sockaddr *) &address; >>>> + msg.msg_namelen = sr->sendto_addr_len; >>>> + } else { >>>> + msg.msg_name = NULL; >>>> + msg.msg_namelen = 0; >>>> + } >>>> >>>> flags = req->sr_msg.msg_flags; >>>> if (issue_flags & IO_URING_F_NONBLOCK) >>>> @@ -5309,6 +5353,7 @@ static int io_sendto(struct io_kiocb *req, unsigned int issue_flags) >>>> return -EAGAIN; >>>> if (ret == -ERESTARTSYS) >>>> ret = -EINTR; >>>> + fail: >>>> req_set_fail(req); >>> >>> I think there is a problem with "fail" goto statement. Not getting >>> full clarity on this change. With latest kernel, I see >>> req_set_fail(req) inside if check, which I don't see here. Can you >>> please resend the patch on latest kernel version. Thanks. >> >> I will send the v5 on top of "for-next" branch in Jens' tree soon. >> >> That is already inside an "if check" anyway. We go to that label when >> the move_addr_to_kernel() fails (most of the time it is -EFAULT or >> -EINVAL). >> >> That part looks like this (note the if check before the goto): >> ---------------------------------------------------------------------- >> msg.msg_control = NULL; >> msg.msg_controllen = 0; >> if (sr->addr) { >> ret = move_addr_to_kernel(sr->addr, sr->sendto_addr_len, >> &address); >> if (unlikely(ret < 0)) >> goto fail; >> msg.msg_name = (struct sockaddr *) &address; >> msg.msg_namelen = sr->sendto_addr_len; >> } else { >> msg.msg_name = NULL; >> msg.msg_namelen = 0; >> } >> >> flags = req->sr_msg.msg_flags; >> if (issue_flags & IO_URING_F_NONBLOCK) >> flags |= MSG_DONTWAIT; >> if (flags & MSG_WAITALL) >> min_ret = iov_iter_count(&msg.msg_iter); >> >> msg.msg_flags = flags; >> ret = sock_sendmsg(sock, &msg); >> if (ret < min_ret) { >> if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) >> return -EAGAIN; >> if (ret == -ERESTARTSYS) >> ret = -EINTR; >> fail: > > Thanks for sending this. IMO this goto label should be just before > the "if (ret < min_ret)" statement. AFAICT, error returned by move_addr_to_kernel are only -EFAULT and -EINVAL, so the check against -EAGAIN and -ERESTARTSYS is unnecessary for that case. We can skip that. RFC v4 here (rebased on top of "for-next" Jens' tree): https://lore.kernel.org/io-uring/20220107000006.1194026-1-ammarfaizi2@gnuweeb.org/T/ > >> req_set_fail(req); >> } >> __io_req_complete(req, issue_flags, ret, 0); >> return 0; >> ---------------------------------------------------------------------- >> >>>> } >>>> __io_req_complete(req, issue_flags, ret, 0); >>>> @@ -5427,13 +5472,25 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) >>>> return -EINVAL; >>>> >>>> + /* >>>> + * For IORING_OP_RECV{,FROM}, the assignment to @sr->umsg >>>> + * is equivalent to an assignment to @sr->buf. >>>> + */ >>>> sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); >>>> + >>>> sr->len = READ_ONCE(sqe->len); >>>> sr->bgid = READ_ONCE(sqe->buf_group); >>>> sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; >>>> if (sr->msg_flags & MSG_DONTWAIT) >>>> req->flags |= REQ_F_NOWAIT; >>>> >>>> + if (req->opcode == IORING_OP_RECVFROM) { >>>> + sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); >>>> + sr->recvfrom_addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr3)); >>>> + } else { >>>> + sr->addr = (struct sockaddr __user *) NULL; >>> >>> I think recvfrom_addr_len should also be pointed to NULL, instead of >>> garbage for this case. >> >> Will do in the RFC v5. >> >>> >>>> + } >>>> + >>>> #ifdef CONFIG_COMPAT >>>> if (req->ctx->compat) >>>> sr->msg_flags |= MSG_CMSG_COMPAT; >>>> @@ -5509,6 +5566,7 @@ static int io_recvfrom(struct io_kiocb *req, unsigned int issue_flags) >>>> struct iovec iov; >>>> unsigned flags; >>>> int ret, min_ret = 0; >>>> + struct sockaddr_storage address; >>>> bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; >>>> >>>> sock = sock_from_file(req->file); >>>> @@ -5526,7 +5584,7 @@ static int io_recvfrom(struct io_kiocb *req, unsigned int issue_flags) >>>> if (unlikely(ret)) >>>> goto out_free; >>>> >>>> - msg.msg_name = NULL; >>>> + msg.msg_name = sr->addr ? (struct sockaddr *) &address : NULL; >>>> msg.msg_control = NULL; >>>> msg.msg_controllen = 0; >>>> msg.msg_namelen = 0; >>> >>> I think namelen should also be updated ? >> >> It doesn't have to be updated. From net/socket.c there is a comment >> like this: >> >> /* We assume all kernel code knows the size of sockaddr_storage */ >> msg.msg_namelen = 0; >> >> Full __sys_recvfrom() source code, see here: >> https://github.com/torvalds/linux/blob/v5.16-rc8/net/socket.c#L2085-L2088 >> >> I will add the same comment in next series to clarify this one. >> >>> >>>> @@ -5540,6 +5598,16 @@ static int io_recvfrom(struct io_kiocb *req, unsigned int issue_flags) >>>> min_ret = iov_iter_count(&msg.msg_iter); >>>> >>>> ret = sock_recvmsg(sock, &msg, flags); >>>> + >>>> + if (ret >= 0 && sr->addr != NULL) { >>>> + int tmp; >>>> + >>>> + tmp = move_addr_to_user(&address, msg.msg_namelen, sr->addr, >>>> + sr->recvfrom_addr_len); >>>> + if (unlikely(tmp < 0)) >>>> + ret = tmp; >>>> + } >>>> + >>>> out_free: >>>> if (ret < min_ret) { >>>> if (ret == -EAGAIN && force_nonblock) >>>> @@ -6778,9 +6846,11 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>> case IORING_OP_SYNC_FILE_RANGE: >>>> return io_sfr_prep(req, sqe); >>>> case IORING_OP_SENDMSG: >>>> + case IORING_OP_SENDTO: >>>> case IORING_OP_SEND: >>>> return io_sendmsg_prep(req, sqe); >>>> case IORING_OP_RECVMSG: >>>> + case IORING_OP_RECVFROM: >>>> case IORING_OP_RECV: >>>> return io_recvmsg_prep(req, sqe); >>>> case IORING_OP_CONNECT: >>>> @@ -7060,12 +7130,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) >>>> case IORING_OP_SENDMSG: >>>> ret = io_sendmsg(req, issue_flags); >>>> break; >>>> + case IORING_OP_SENDTO: >>>> case IORING_OP_SEND: >>>> ret = io_sendto(req, issue_flags); >>>> break; >>>> case IORING_OP_RECVMSG: >>>> ret = io_recvmsg(req, issue_flags); >>>> break; >>>> + case IORING_OP_RECVFROM: >>>> case IORING_OP_RECV: >>>> ret = io_recvfrom(req, issue_flags); >>>> break; >>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>> index efc7ac9b3a6b..a360069d1e8e 100644 >>>> --- a/include/uapi/linux/io_uring.h >>>> +++ b/include/uapi/linux/io_uring.h >>>> @@ -150,6 +150,8 @@ enum { >>>> IORING_OP_SETXATTR, >>>> IORING_OP_FGETXATTR, >>>> IORING_OP_GETXATTR, >>>> + IORING_OP_SENDTO, >>>> + IORING_OP_RECVFROM, >>>> >>>> /* this goes last, obviously */ >>>> IORING_OP_LAST, >>> >>> >>> Regards, >>> >>> ~Praveen. >>> >> >> Thanks for the review. I will send the RFC v5 soon. >> > -- Ammar Faizi