io-uring.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] io_uring: add EPOLLEXCLUSIVE flag for POLL_ADD operation
@ 2020-06-12  2:30 Jiufei Xue
  2020-06-12  2:30 ` [PATCH v3 1/2] io_uring: change the poll events to be 32-bits Jiufei Xue
  2020-06-12  2:30 ` [PATCH v3 2/2] io_uring: use EPOLLEXCLUSIVE flag to aoid thundering herd type behavior Jiufei Xue
  0 siblings, 2 replies; 16+ messages in thread
From: Jiufei Xue @ 2020-06-12  2:30 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi

Applications can use this flag to avoid accept thundering herd type
behavior.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-12  2:30 [PATCH v3] io_uring: add EPOLLEXCLUSIVE flag for POLL_ADD operation Jiufei Xue
@ 2020-06-12  2:30 ` Jiufei Xue
  2020-06-12 14:58   ` Jens Axboe
  2020-06-12  2:30 ` [PATCH v3 2/2] io_uring: use EPOLLEXCLUSIVE flag to aoid thundering herd type behavior Jiufei Xue
  1 sibling, 1 reply; 16+ messages in thread
From: Jiufei Xue @ 2020-06-12  2:30 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi

poll events should be 32-bits to cover EPOLLEXCLUSIVE.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
---
 fs/io_uring.c                 | 4 ++--
 include/uapi/linux/io_uring.h | 2 +-
 tools/io_uring/liburing.h     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 47790a2..6250227 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_poll_iocb *poll = &req->poll;
-	u16 events;
+	u32 events;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
-	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
+	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
 	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c2269..afc7edd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -31,7 +31,7 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
-		__u16		poll_events;
+		__u32		poll_events;
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;
diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h
index 5f305c8..094b9ec 100644
--- a/tools/io_uring/liburing.h
+++ b/tools/io_uring/liburing.h
@@ -145,7 +145,7 @@ static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
 }
 
 static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd,
-					  short poll_mask)
+					  unsigned poll_mask)
 {
 	memset(sqe, 0, sizeof(*sqe));
 	sqe->opcode = IORING_OP_POLL_ADD;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v3 2/2] io_uring: use EPOLLEXCLUSIVE flag to aoid thundering herd type behavior
  2020-06-12  2:30 [PATCH v3] io_uring: add EPOLLEXCLUSIVE flag for POLL_ADD operation Jiufei Xue
  2020-06-12  2:30 ` [PATCH v3 1/2] io_uring: change the poll events to be 32-bits Jiufei Xue
@ 2020-06-12  2:30 ` Jiufei Xue
  1 sibling, 0 replies; 16+ messages in thread
From: Jiufei Xue @ 2020-06-12  2:30 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi

Applications can pass this flag in to avoid accept thundering herd.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
---
 fs/io_uring.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6250227..03951ec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4289,7 +4289,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
 
 	pt->error = 0;
 	poll->head = head;
-	add_wait_queue(head, &poll->wait);
+
+	if (poll->events & EPOLLEXCLUSIVE)
+		add_wait_queue_exclusive(head, &poll->wait);
+	else
+		add_wait_queue(head, &poll->wait);
 }
 
 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
@@ -4612,7 +4616,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 		return -EBADF;
 
 	events = READ_ONCE(sqe->poll_events);
-	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
+		       (events & EPOLLEXCLUSIVE);
 
 	get_task_struct(current);
 	req->task = current;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-12  2:30 ` [PATCH v3 1/2] io_uring: change the poll events to be 32-bits Jiufei Xue
@ 2020-06-12 14:58   ` Jens Axboe
  2020-06-12 16:48     ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2020-06-12 14:58 UTC (permalink / raw)
  To: Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/11/20 8:30 PM, Jiufei Xue wrote:
> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
> 
> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
> ---
>  fs/io_uring.c                 | 4 ++--
>  include/uapi/linux/io_uring.h | 2 +-
>  tools/io_uring/liburing.h     | 2 +-
>  3 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 47790a2..6250227 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
>  	struct io_poll_iocb *poll = &req->poll;
> -	u16 events;
> +	u32 events;
>  
>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>  		return -EINVAL;
> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c2269..afc7edd 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>  	union {
>  		__kernel_rwf_t	rw_flags;
>  		__u32		fsync_flags;
> -		__u16		poll_events;
> +		__u32		poll_events;
>  		__u32		sync_range_flags;
>  		__u32		msg_flags;
>  		__u32		timeout_flags;

We obviously have the space in there as most other flag members are 32-bits, but
I'd want to double check if we're not changing the ABI here. Is this always
going to be safe, on any platform, regardless of endianess etc?


-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-12 14:58   ` Jens Axboe
@ 2020-06-12 16:48     ` Jens Axboe
  2020-06-15  2:49       ` Jiufei Xue
  0 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2020-06-12 16:48 UTC (permalink / raw)
  To: Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/12/20 8:58 AM, Jens Axboe wrote:
> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>
>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>> ---
>>  fs/io_uring.c                 | 4 ++--
>>  include/uapi/linux/io_uring.h | 2 +-
>>  tools/io_uring/liburing.h     | 2 +-
>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 47790a2..6250227 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>  {
>>  	struct io_poll_iocb *poll = &req->poll;
>> -	u16 events;
>> +	u32 events;
>>  
>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>  		return -EINVAL;
>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 92c2269..afc7edd 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>  	union {
>>  		__kernel_rwf_t	rw_flags;
>>  		__u32		fsync_flags;
>> -		__u16		poll_events;
>> +		__u32		poll_events;
>>  		__u32		sync_range_flags;
>>  		__u32		msg_flags;
>>  		__u32		timeout_flags;
> 
> We obviously have the space in there as most other flag members are 32-bits, but
> I'd want to double check if we're not changing the ABI here. Is this always
> going to be safe, on any platform, regardless of endianess etc?

Double checked, and as I feared, we can't safely do this. We'll have to
do something like the below, grabbing an unused bit of the poll mask
space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
So probably best to turn this just into one patch, since it doesn't make
a lot of sense to do it as a prep patch at that point.

This does have the benefit of not growing io_poll_iocb. With your patch,
it'd go beyond a cacheline, and hence bump the size of the entire
io_iocb as well, which would be very unfortunate.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d830ddb..64a98bf11943 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -350,6 +350,7 @@ struct io_poll_iocb {
 		u64			addr;
 	};
 	__poll_t			events;
+	bool				exclusive;
 	bool				done;
 	bool				canceled;
 	struct wait_queue_entry		wait;
@@ -4543,7 +4544,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_poll_iocb *poll = &req->poll;
-	u16 events;
+	u32 events;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -4553,6 +4554,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 		return -EBADF;
 
 	events = READ_ONCE(sqe->poll_events);
+	if ((events & IORING_POLL_32BIT) &&
+	    (sqe->poll32_events & EPOLLEXCLUSIVE))
+		poll->exclusive = true;
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
 	get_task_struct(current);
@@ -8155,6 +8159,7 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
 	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
+	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
 	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..16d473d909eb 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -32,6 +32,7 @@ struct io_uring_sqe {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
 		__u16		poll_events;
+		__u32		poll32_events;
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;
@@ -60,6 +61,8 @@ struct io_uring_sqe {
 	};
 };
 
+#define IORING_POLL_32BIT	(1U << 15)
+
 enum {
 	IOSQE_FIXED_FILE_BIT,
 	IOSQE_IO_DRAIN_BIT,

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-12 16:48     ` Jens Axboe
@ 2020-06-15  2:49       ` Jiufei Xue
  2020-06-15 15:09         ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Jiufei Xue @ 2020-06-15  2:49 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi

Hi Jens,

On 2020/6/13 上午12:48, Jens Axboe wrote:
> On 6/12/20 8:58 AM, Jens Axboe wrote:
>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>
>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>> ---
>>>  fs/io_uring.c                 | 4 ++--
>>>  include/uapi/linux/io_uring.h | 2 +-
>>>  tools/io_uring/liburing.h     | 2 +-
>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 47790a2..6250227 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>  {
>>>  	struct io_poll_iocb *poll = &req->poll;
>>> -	u16 events;
>>> +	u32 events;
>>>  
>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>  		return -EINVAL;
>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>> index 92c2269..afc7edd 100644
>>> --- a/include/uapi/linux/io_uring.h
>>> +++ b/include/uapi/linux/io_uring.h
>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>  	union {
>>>  		__kernel_rwf_t	rw_flags;
>>>  		__u32		fsync_flags;
>>> -		__u16		poll_events;
>>> +		__u32		poll_events;
>>>  		__u32		sync_range_flags;
>>>  		__u32		msg_flags;
>>>  		__u32		timeout_flags;
>>
>> We obviously have the space in there as most other flag members are 32-bits, but
>> I'd want to double check if we're not changing the ABI here. Is this always
>> going to be safe, on any platform, regardless of endianess etc?
> 
> Double checked, and as I feared, we can't safely do this. We'll have to
> do something like the below, grabbing an unused bit of the poll mask
> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
> So probably best to turn this just into one patch, since it doesn't make
> a lot of sense to do it as a prep patch at that point.
>
Yes, Agree about that. But I also fear that if the unused bit is used in the
feature, it will bring unexpected behavior.

> This does have the benefit of not growing io_poll_iocb. With your patch,
> it'd go beyond a cacheline, and hence bump the size of the entire
> io_iocb as well, which would be very unfortunate.
>
events in io_poll_iocb is 32-bits already, so why it will bump the size of the io_iocb
structure with my patch? 

Thanks,
Jiufei

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 155f3d830ddb..64a98bf11943 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -350,6 +350,7 @@ struct io_poll_iocb {
>  		u64			addr;
>  	};
>  	__poll_t			events;
> +	bool				exclusive;
>  	bool				done;
>  	bool				canceled;
>  	struct wait_queue_entry		wait;
> @@ -4543,7 +4544,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
>  	struct io_poll_iocb *poll = &req->poll;
> -	u16 events;
> +	u32 events;
>  
>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>  		return -EINVAL;
> @@ -4553,6 +4554,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
>  		return -EBADF;
>  
>  	events = READ_ONCE(sqe->poll_events);
> +	if ((events & IORING_POLL_32BIT) &&
> +	    (sqe->poll32_events & EPOLLEXCLUSIVE))
> +		poll->exclusive = true;
>  	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
>  
>  	get_task_struct(current);
> @@ -8155,6 +8159,7 @@ static int __init io_uring_init(void)
>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c22699a5a7..16d473d909eb 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -32,6 +32,7 @@ struct io_uring_sqe {
>  		__kernel_rwf_t	rw_flags;
>  		__u32		fsync_flags;
>  		__u16		poll_events;
> +		__u32		poll32_events;
>  		__u32		sync_range_flags;
>  		__u32		msg_flags;
>  		__u32		timeout_flags;
> @@ -60,6 +61,8 @@ struct io_uring_sqe {
>  	};
>  };
>  
> +#define IORING_POLL_32BIT	(1U << 15)
> +
>  enum {
>  	IOSQE_FIXED_FILE_BIT,
>  	IOSQE_IO_DRAIN_BIT,
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-15  2:49       ` Jiufei Xue
@ 2020-06-15 15:09         ` Jens Axboe
  2020-06-16  3:04           ` Jiufei Xue
  0 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2020-06-15 15:09 UTC (permalink / raw)
  To: Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/14/20 8:49 PM, Jiufei Xue wrote:
> Hi Jens,
> 
> On 2020/6/13 上午12:48, Jens Axboe wrote:
>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>
>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>> ---
>>>>  fs/io_uring.c                 | 4 ++--
>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index 47790a2..6250227 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>  {
>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>> -	u16 events;
>>>> +	u32 events;
>>>>  
>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>  		return -EINVAL;
>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>> index 92c2269..afc7edd 100644
>>>> --- a/include/uapi/linux/io_uring.h
>>>> +++ b/include/uapi/linux/io_uring.h
>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>  	union {
>>>>  		__kernel_rwf_t	rw_flags;
>>>>  		__u32		fsync_flags;
>>>> -		__u16		poll_events;
>>>> +		__u32		poll_events;
>>>>  		__u32		sync_range_flags;
>>>>  		__u32		msg_flags;
>>>>  		__u32		timeout_flags;
>>>
>>> We obviously have the space in there as most other flag members are 32-bits, but
>>> I'd want to double check if we're not changing the ABI here. Is this always
>>> going to be safe, on any platform, regardless of endianess etc?
>>
>> Double checked, and as I feared, we can't safely do this. We'll have to
>> do something like the below, grabbing an unused bit of the poll mask
>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>> So probably best to turn this just into one patch, since it doesn't make
>> a lot of sense to do it as a prep patch at that point.
>>
> Yes, Agree about that. But I also fear that if the unused bit is used
> in the feature, it will bring unexpected behavior.

Yeah, it's certainly not the prettiest and could potentially be fragile.
I'm open to suggestions, we need some way of signaling that the 32-bit
variant of the poll_events should be used. We could potentially make
this work by doing explicit layout for big endian vs little endian, that
might be prettier and wouldn't suffer from the "grab some random bit"
issue.

>> This does have the benefit of not growing io_poll_iocb. With your patch,
>> it'd go beyond a cacheline, and hence bump the size of the entire
>> io_iocb as well, which would be very unfortunate.
>>
> events in io_poll_iocb is 32-bits already, so why it will bump the
> size of the io_iocb structure with my patch? 

It's not 32-bits already, it's a __poll_t type which is 16-bits only.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-15 15:09         ` Jens Axboe
@ 2020-06-16  3:04           ` Jiufei Xue
  2020-06-16 13:58             ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Jiufei Xue @ 2020-06-16  3:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph.qi



On 2020/6/15 下午11:09, Jens Axboe wrote:
> On 6/14/20 8:49 PM, Jiufei Xue wrote:
>> Hi Jens,
>>
>> On 2020/6/13 上午12:48, Jens Axboe wrote:
>>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>>
>>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>>> ---
>>>>>  fs/io_uring.c                 | 4 ++--
>>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>> index 47790a2..6250227 100644
>>>>> --- a/fs/io_uring.c
>>>>> +++ b/fs/io_uring.c
>>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>  {
>>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>>> -	u16 events;
>>>>> +	u32 events;
>>>>>  
>>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>>  		return -EINVAL;
>>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>> index 92c2269..afc7edd 100644
>>>>> --- a/include/uapi/linux/io_uring.h
>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>>  	union {
>>>>>  		__kernel_rwf_t	rw_flags;
>>>>>  		__u32		fsync_flags;
>>>>> -		__u16		poll_events;
>>>>> +		__u32		poll_events;
>>>>>  		__u32		sync_range_flags;
>>>>>  		__u32		msg_flags;
>>>>>  		__u32		timeout_flags;
>>>>
>>>> We obviously have the space in there as most other flag members are 32-bits, but
>>>> I'd want to double check if we're not changing the ABI here. Is this always
>>>> going to be safe, on any platform, regardless of endianess etc?
>>>
>>> Double checked, and as I feared, we can't safely do this. We'll have to
>>> do something like the below, grabbing an unused bit of the poll mask
>>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>>> So probably best to turn this just into one patch, since it doesn't make
>>> a lot of sense to do it as a prep patch at that point.
>>>
>> Yes, Agree about that. But I also fear that if the unused bit is used
>> in the feature, it will bring unexpected behavior.
> 
> Yeah, it's certainly not the prettiest and could potentially be fragile.
> I'm open to suggestions, we need some way of signaling that the 32-bit
> variant of the poll_events should be used. We could potentially make
> this work by doing explicit layout for big endian vs little endian, that
> might be prettier and wouldn't suffer from the "grab some random bit"
> issue.
> 
Thank you for your suggestion, I will think about it.

>>> This does have the benefit of not growing io_poll_iocb. With your patch,
>>> it'd go beyond a cacheline, and hence bump the size of the entire
>>> io_iocb as well, which would be very unfortunate.
>>>
>> events in io_poll_iocb is 32-bits already, so why it will bump the
>> size of the io_iocb structure with my patch? 
> 
> It's not 32-bits already, it's a __poll_t type which is 16-bits only.
> 
Yes, it is a __poll_t type, but I found that __poll_t type is 32-bits
with the definition below:

typedef unsigned __bitwise __poll_t;

And I also investigate it with crash:
crash> io_poll_iocb -ox
struct io_poll_iocb {
   [0x0] struct file *file;
         union {
   [0x8]     struct wait_queue_head *head;
   [0x8]     u64 addr;
         };
  [0x10] __poll_t events;
  [0x14] bool done;
  [0x15] bool canceled;
  [0x18] struct wait_queue_entry wait;
}

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16  3:04           ` Jiufei Xue
@ 2020-06-16 13:58             ` Jens Axboe
  2020-06-16 18:45               ` Jens Axboe
  0 siblings, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2020-06-16 13:58 UTC (permalink / raw)
  To: Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/15/20 9:04 PM, Jiufei Xue wrote:
> 
> 
> On 2020/6/15 下午11:09, Jens Axboe wrote:
>> On 6/14/20 8:49 PM, Jiufei Xue wrote:
>>> Hi Jens,
>>>
>>> On 2020/6/13 上午12:48, Jens Axboe wrote:
>>>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>>>
>>>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>>>> ---
>>>>>>  fs/io_uring.c                 | 4 ++--
>>>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>> index 47790a2..6250227 100644
>>>>>> --- a/fs/io_uring.c
>>>>>> +++ b/fs/io_uring.c
>>>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>  {
>>>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>>>> -	u16 events;
>>>>>> +	u32 events;
>>>>>>  
>>>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>>>  		return -EINVAL;
>>>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>> index 92c2269..afc7edd 100644
>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>>>  	union {
>>>>>>  		__kernel_rwf_t	rw_flags;
>>>>>>  		__u32		fsync_flags;
>>>>>> -		__u16		poll_events;
>>>>>> +		__u32		poll_events;
>>>>>>  		__u32		sync_range_flags;
>>>>>>  		__u32		msg_flags;
>>>>>>  		__u32		timeout_flags;
>>>>>
>>>>> We obviously have the space in there as most other flag members are 32-bits, but
>>>>> I'd want to double check if we're not changing the ABI here. Is this always
>>>>> going to be safe, on any platform, regardless of endianess etc?
>>>>
>>>> Double checked, and as I feared, we can't safely do this. We'll have to
>>>> do something like the below, grabbing an unused bit of the poll mask
>>>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>>>> So probably best to turn this just into one patch, since it doesn't make
>>>> a lot of sense to do it as a prep patch at that point.
>>>>
>>> Yes, Agree about that. But I also fear that if the unused bit is used
>>> in the feature, it will bring unexpected behavior.
>>
>> Yeah, it's certainly not the prettiest and could potentially be fragile.
>> I'm open to suggestions, we need some way of signaling that the 32-bit
>> variant of the poll_events should be used. We could potentially make
>> this work by doing explicit layout for big endian vs little endian, that
>> might be prettier and wouldn't suffer from the "grab some random bit"
>> issue.
>>
> Thank you for your suggestion, I will think about it.
> 
>>>> This does have the benefit of not growing io_poll_iocb. With your patch,
>>>> it'd go beyond a cacheline, and hence bump the size of the entire
>>>> io_iocb as well, which would be very unfortunate.
>>>>
>>> events in io_poll_iocb is 32-bits already, so why it will bump the
>>> size of the io_iocb structure with my patch? 
>>
>> It's not 32-bits already, it's a __poll_t type which is 16-bits only.
>>
> Yes, it is a __poll_t type, but I found that __poll_t type is 32-bits
> with the definition below:
> 
> typedef unsigned __bitwise __poll_t;
> 
> And I also investigate it with crash:
> crash> io_poll_iocb -ox
> struct io_poll_iocb {
>    [0x0] struct file *file;
>          union {
>    [0x8]     struct wait_queue_head *head;
>    [0x8]     u64 addr;
>          };
>   [0x10] __poll_t events;
>   [0x14] bool done;
>   [0x15] bool canceled;
>   [0x18] struct wait_queue_entry wait;
> }

Yeah you're right, not sure why I figured it was 16-bits. But just
checking on my default build:

axboe@x1 ~/gi/linux-block (block-5.8)> pahole -C io_poll_iocb fs/io_uring.o
struct io_poll_iocb {
	struct file *              file;                 /*     0     8 */
	union {
		struct wait_queue_head * head;           /*     8     8 */
		u64                addr;                 /*     8     8 */
	};                                               /*     8     8 */
	__poll_t                   events;               /*    16     4 */
	bool                       done;                 /*    20     1 */
	bool                       canceled;             /*    21     1 */

	/* XXX 2 bytes hole, try to pack */

	struct wait_queue_entry wait;                    /*    24    40 */

	/* size: 64, cachelines: 1, members: 6 */
	/* sum members: 62, holes: 1, sum holes: 2 */
};

and it's definitely 64-bytes in total size (as it should be), and the
'events' is indeed 32-bits as well.

So at least that's good, we don't need anything extra in there. If we
can solve the endian issue, then it should be trivial to use the full
32-bits for the flags in the sqe.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 13:58             ` Jens Axboe
@ 2020-06-16 18:45               ` Jens Axboe
  2020-06-16 19:20                 ` Pavel Begunkov
  2020-06-16 19:21                 ` Jens Axboe
  0 siblings, 2 replies; 16+ messages in thread
From: Jens Axboe @ 2020-06-16 18:45 UTC (permalink / raw)
  To: Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/16/20 7:58 AM, Jens Axboe wrote:
> On 6/15/20 9:04 PM, Jiufei Xue wrote:
>>
>>
>> On 2020/6/15 下午11:09, Jens Axboe wrote:
>>> On 6/14/20 8:49 PM, Jiufei Xue wrote:
>>>> Hi Jens,
>>>>
>>>> On 2020/6/13 上午12:48, Jens Axboe wrote:
>>>>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>>>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>>>>
>>>>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>>>>> ---
>>>>>>>  fs/io_uring.c                 | 4 ++--
>>>>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>> index 47790a2..6250227 100644
>>>>>>> --- a/fs/io_uring.c
>>>>>>> +++ b/fs/io_uring.c
>>>>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>>  {
>>>>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>>>>> -	u16 events;
>>>>>>> +	u32 events;
>>>>>>>  
>>>>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>>>>  		return -EINVAL;
>>>>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>> index 92c2269..afc7edd 100644
>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>>>>  	union {
>>>>>>>  		__kernel_rwf_t	rw_flags;
>>>>>>>  		__u32		fsync_flags;
>>>>>>> -		__u16		poll_events;
>>>>>>> +		__u32		poll_events;
>>>>>>>  		__u32		sync_range_flags;
>>>>>>>  		__u32		msg_flags;
>>>>>>>  		__u32		timeout_flags;
>>>>>>
>>>>>> We obviously have the space in there as most other flag members are 32-bits, but
>>>>>> I'd want to double check if we're not changing the ABI here. Is this always
>>>>>> going to be safe, on any platform, regardless of endianess etc?
>>>>>
>>>>> Double checked, and as I feared, we can't safely do this. We'll have to
>>>>> do something like the below, grabbing an unused bit of the poll mask
>>>>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>>>>> So probably best to turn this just into one patch, since it doesn't make
>>>>> a lot of sense to do it as a prep patch at that point.
>>>>>
>>>> Yes, Agree about that. But I also fear that if the unused bit is used
>>>> in the feature, it will bring unexpected behavior.
>>>
>>> Yeah, it's certainly not the prettiest and could potentially be fragile.
>>> I'm open to suggestions, we need some way of signaling that the 32-bit
>>> variant of the poll_events should be used. We could potentially make
>>> this work by doing explicit layout for big endian vs little endian, that
>>> might be prettier and wouldn't suffer from the "grab some random bit"
>>> issue.
>>>
>> Thank you for your suggestion, I will think about it.
>>
>>>>> This does have the benefit of not growing io_poll_iocb. With your patch,
>>>>> it'd go beyond a cacheline, and hence bump the size of the entire
>>>>> io_iocb as well, which would be very unfortunate.
>>>>>
>>>> events in io_poll_iocb is 32-bits already, so why it will bump the
>>>> size of the io_iocb structure with my patch? 
>>>
>>> It's not 32-bits already, it's a __poll_t type which is 16-bits only.
>>>
>> Yes, it is a __poll_t type, but I found that __poll_t type is 32-bits
>> with the definition below:
>>
>> typedef unsigned __bitwise __poll_t;
>>
>> And I also investigate it with crash:
>> crash> io_poll_iocb -ox
>> struct io_poll_iocb {
>>    [0x0] struct file *file;
>>          union {
>>    [0x8]     struct wait_queue_head *head;
>>    [0x8]     u64 addr;
>>          };
>>   [0x10] __poll_t events;
>>   [0x14] bool done;
>>   [0x15] bool canceled;
>>   [0x18] struct wait_queue_entry wait;
>> }
> 
> Yeah you're right, not sure why I figured it was 16-bits. But just
> checking on my default build:
> 
> axboe@x1 ~/gi/linux-block (block-5.8)> pahole -C io_poll_iocb fs/io_uring.o
> struct io_poll_iocb {
> 	struct file *              file;                 /*     0     8 */
> 	union {
> 		struct wait_queue_head * head;           /*     8     8 */
> 		u64                addr;                 /*     8     8 */
> 	};                                               /*     8     8 */
> 	__poll_t                   events;               /*    16     4 */
> 	bool                       done;                 /*    20     1 */
> 	bool                       canceled;             /*    21     1 */
> 
> 	/* XXX 2 bytes hole, try to pack */
> 
> 	struct wait_queue_entry wait;                    /*    24    40 */
> 
> 	/* size: 64, cachelines: 1, members: 6 */
> 	/* sum members: 62, holes: 1, sum holes: 2 */
> };
> 
> and it's definitely 64-bytes in total size (as it should be), and the
> 'events' is indeed 32-bits as well.
> 
> So at least that's good, we don't need anything extra in there. If we
> can solve the endian issue, then it should be trivial to use the full
> 32-bits for the flags in the sqe.

To get back to something that can move us forward, something like the
below I _think_ will work. Then applications just use poll32_events and
we don't have to handle this in any special way. Care to base on that
and re-send the change?


diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..5b3f6bd59437 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -31,7 +31,16 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
-		__u16		poll_events;
+		struct {
+#ifdef __BIG_ENDIAN_BITFIELD
+			__u16	__unused_poll;
+			__u16	poll_events;
+#else
+			__u16	poll_events;
+			__u16	__unused_poll;
+#endif
+		};
+		__u32		poll32_events;
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 18:45               ` Jens Axboe
@ 2020-06-16 19:20                 ` Pavel Begunkov
  2020-06-16 19:27                   ` Jens Axboe
  2020-06-16 19:21                 ` Jens Axboe
  1 sibling, 1 reply; 16+ messages in thread
From: Pavel Begunkov @ 2020-06-16 19:20 UTC (permalink / raw)
  To: Jens Axboe, Jiufei Xue, io-uring; +Cc: joseph.qi

On 16/06/2020 21:45, Jens Axboe wrote:
> On 6/16/20 7:58 AM, Jens Axboe wrote:
>> On 6/15/20 9:04 PM, Jiufei Xue wrote:
>>>
>>>
>>> On 2020/6/15 下午11:09, Jens Axboe wrote:
>>>> On 6/14/20 8:49 PM, Jiufei Xue wrote:
>>>>> Hi Jens,
>>>>>
>>>>> On 2020/6/13 上午12:48, Jens Axboe wrote:
>>>>>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>>>>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>>>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>>>>>
>>>>>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>>>>>> ---
>>>>>>>>  fs/io_uring.c                 | 4 ++--
>>>>>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>> index 47790a2..6250227 100644
>>>>>>>> --- a/fs/io_uring.c
>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>>>  {
>>>>>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>>>>>> -	u16 events;
>>>>>>>> +	u32 events;
>>>>>>>>  
>>>>>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>>>>>  		return -EINVAL;
>>>>>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>>>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>>>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>>> index 92c2269..afc7edd 100644
>>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>>>>>  	union {
>>>>>>>>  		__kernel_rwf_t	rw_flags;
>>>>>>>>  		__u32		fsync_flags;
>>>>>>>> -		__u16		poll_events;
>>>>>>>> +		__u32		poll_events;
>>>>>>>>  		__u32		sync_range_flags;
>>>>>>>>  		__u32		msg_flags;
>>>>>>>>  		__u32		timeout_flags;
>>>>>>>
>>>>>>> We obviously have the space in there as most other flag members are 32-bits, but
>>>>>>> I'd want to double check if we're not changing the ABI here. Is this always
>>>>>>> going to be safe, on any platform, regardless of endianess etc?
>>>>>>
>>>>>> Double checked, and as I feared, we can't safely do this. We'll have to
>>>>>> do something like the below, grabbing an unused bit of the poll mask
>>>>>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>>>>>> So probably best to turn this just into one patch, since it doesn't make
>>>>>> a lot of sense to do it as a prep patch at that point.
>>>>>>
>>>>> Yes, Agree about that. But I also fear that if the unused bit is used
>>>>> in the feature, it will bring unexpected behavior.
>>>>
>>>> Yeah, it's certainly not the prettiest and could potentially be fragile.
>>>> I'm open to suggestions, we need some way of signaling that the 32-bit
>>>> variant of the poll_events should be used. We could potentially make
>>>> this work by doing explicit layout for big endian vs little endian, that
>>>> might be prettier and wouldn't suffer from the "grab some random bit"
>>>> issue.
>>>>
>>> Thank you for your suggestion, I will think about it.
>>>
>>>>>> This does have the benefit of not growing io_poll_iocb. With your patch,
>>>>>> it'd go beyond a cacheline, and hence bump the size of the entire
>>>>>> io_iocb as well, which would be very unfortunate.
>>>>>>
>>>>> events in io_poll_iocb is 32-bits already, so why it will bump the
>>>>> size of the io_iocb structure with my patch? 
>>>>
>>>> It's not 32-bits already, it's a __poll_t type which is 16-bits only.
>>>>
>>> Yes, it is a __poll_t type, but I found that __poll_t type is 32-bits
>>> with the definition below:
>>>
>>> typedef unsigned __bitwise __poll_t;
>>>
>>> And I also investigate it with crash:
>>> crash> io_poll_iocb -ox
>>> struct io_poll_iocb {
>>>    [0x0] struct file *file;
>>>          union {
>>>    [0x8]     struct wait_queue_head *head;
>>>    [0x8]     u64 addr;
>>>          };
>>>   [0x10] __poll_t events;
>>>   [0x14] bool done;
>>>   [0x15] bool canceled;
>>>   [0x18] struct wait_queue_entry wait;
>>> }
>>
>> Yeah you're right, not sure why I figured it was 16-bits. But just
>> checking on my default build:
>>
>> axboe@x1 ~/gi/linux-block (block-5.8)> pahole -C io_poll_iocb fs/io_uring.o
>> struct io_poll_iocb {
>> 	struct file *              file;                 /*     0     8 */
>> 	union {
>> 		struct wait_queue_head * head;           /*     8     8 */
>> 		u64                addr;                 /*     8     8 */
>> 	};                                               /*     8     8 */
>> 	__poll_t                   events;               /*    16     4 */
>> 	bool                       done;                 /*    20     1 */
>> 	bool                       canceled;             /*    21     1 */
>>
>> 	/* XXX 2 bytes hole, try to pack */
>>
>> 	struct wait_queue_entry wait;                    /*    24    40 */
>>
>> 	/* size: 64, cachelines: 1, members: 6 */
>> 	/* sum members: 62, holes: 1, sum holes: 2 */
>> };
>>
>> and it's definitely 64-bytes in total size (as it should be), and the
>> 'events' is indeed 32-bits as well.
>>
>> So at least that's good, we don't need anything extra in there. If we
>> can solve the endian issue, then it should be trivial to use the full
>> 32-bits for the flags in the sqe.
> 
> To get back to something that can move us forward, something like the
> below I _think_ will work. Then applications just use poll32_events and
> we don't have to handle this in any special way. Care to base on that
> and re-send the change?
> 
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c22699a5a7..5b3f6bd59437 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -31,7 +31,16 @@ struct io_uring_sqe {
>  	union {
>  		__kernel_rwf_t	rw_flags;
>  		__u32		fsync_flags;
> -		__u16		poll_events;
> +		struct {
> +#ifdef __BIG_ENDIAN_BITFIELD
> +			__u16	__unused_poll;
> +			__u16	poll_events;
> +#else
> +			__u16	poll_events;
> +			__u16	__unused_poll;
> +#endif
> +		};
> +		__u32		poll32_events;
>  		__u32		sync_range_flags;
>  		__u32		msg_flags;
>  		__u32		timeout_flags;
> 

That changes layout for big endian, isn't it? It's not ABI compatible.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 18:45               ` Jens Axboe
  2020-06-16 19:20                 ` Pavel Begunkov
@ 2020-06-16 19:21                 ` Jens Axboe
  2020-06-16 21:46                   ` Pavel Begunkov
  1 sibling, 1 reply; 16+ messages in thread
From: Jens Axboe @ 2020-06-16 19:21 UTC (permalink / raw)
  To: Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/16/20 12:45 PM, Jens Axboe wrote:
> On 6/16/20 7:58 AM, Jens Axboe wrote:
>> On 6/15/20 9:04 PM, Jiufei Xue wrote:
>>>
>>>
>>> On 2020/6/15 下午11:09, Jens Axboe wrote:
>>>> On 6/14/20 8:49 PM, Jiufei Xue wrote:
>>>>> Hi Jens,
>>>>>
>>>>> On 2020/6/13 上午12:48, Jens Axboe wrote:
>>>>>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>>>>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>>>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>>>>>
>>>>>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>>>>>> ---
>>>>>>>>  fs/io_uring.c                 | 4 ++--
>>>>>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>> index 47790a2..6250227 100644
>>>>>>>> --- a/fs/io_uring.c
>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>>>  {
>>>>>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>>>>>> -	u16 events;
>>>>>>>> +	u32 events;
>>>>>>>>  
>>>>>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>>>>>  		return -EINVAL;
>>>>>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>>>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>>>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>>> index 92c2269..afc7edd 100644
>>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>>>>>  	union {
>>>>>>>>  		__kernel_rwf_t	rw_flags;
>>>>>>>>  		__u32		fsync_flags;
>>>>>>>> -		__u16		poll_events;
>>>>>>>> +		__u32		poll_events;
>>>>>>>>  		__u32		sync_range_flags;
>>>>>>>>  		__u32		msg_flags;
>>>>>>>>  		__u32		timeout_flags;
>>>>>>>
>>>>>>> We obviously have the space in there as most other flag members are 32-bits, but
>>>>>>> I'd want to double check if we're not changing the ABI here. Is this always
>>>>>>> going to be safe, on any platform, regardless of endianess etc?
>>>>>>
>>>>>> Double checked, and as I feared, we can't safely do this. We'll have to
>>>>>> do something like the below, grabbing an unused bit of the poll mask
>>>>>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>>>>>> So probably best to turn this just into one patch, since it doesn't make
>>>>>> a lot of sense to do it as a prep patch at that point.
>>>>>>
>>>>> Yes, Agree about that. But I also fear that if the unused bit is used
>>>>> in the feature, it will bring unexpected behavior.
>>>>
>>>> Yeah, it's certainly not the prettiest and could potentially be fragile.
>>>> I'm open to suggestions, we need some way of signaling that the 32-bit
>>>> variant of the poll_events should be used. We could potentially make
>>>> this work by doing explicit layout for big endian vs little endian, that
>>>> might be prettier and wouldn't suffer from the "grab some random bit"
>>>> issue.
>>>>
>>> Thank you for your suggestion, I will think about it.
>>>
>>>>>> This does have the benefit of not growing io_poll_iocb. With your patch,
>>>>>> it'd go beyond a cacheline, and hence bump the size of the entire
>>>>>> io_iocb as well, which would be very unfortunate.
>>>>>>
>>>>> events in io_poll_iocb is 32-bits already, so why it will bump the
>>>>> size of the io_iocb structure with my patch? 
>>>>
>>>> It's not 32-bits already, it's a __poll_t type which is 16-bits only.
>>>>
>>> Yes, it is a __poll_t type, but I found that __poll_t type is 32-bits
>>> with the definition below:
>>>
>>> typedef unsigned __bitwise __poll_t;
>>>
>>> And I also investigate it with crash:
>>> crash> io_poll_iocb -ox
>>> struct io_poll_iocb {
>>>    [0x0] struct file *file;
>>>          union {
>>>    [0x8]     struct wait_queue_head *head;
>>>    [0x8]     u64 addr;
>>>          };
>>>   [0x10] __poll_t events;
>>>   [0x14] bool done;
>>>   [0x15] bool canceled;
>>>   [0x18] struct wait_queue_entry wait;
>>> }
>>
>> Yeah you're right, not sure why I figured it was 16-bits. But just
>> checking on my default build:
>>
>> axboe@x1 ~/gi/linux-block (block-5.8)> pahole -C io_poll_iocb fs/io_uring.o
>> struct io_poll_iocb {
>> 	struct file *              file;                 /*     0     8 */
>> 	union {
>> 		struct wait_queue_head * head;           /*     8     8 */
>> 		u64                addr;                 /*     8     8 */
>> 	};                                               /*     8     8 */
>> 	__poll_t                   events;               /*    16     4 */
>> 	bool                       done;                 /*    20     1 */
>> 	bool                       canceled;             /*    21     1 */
>>
>> 	/* XXX 2 bytes hole, try to pack */
>>
>> 	struct wait_queue_entry wait;                    /*    24    40 */
>>
>> 	/* size: 64, cachelines: 1, members: 6 */
>> 	/* sum members: 62, holes: 1, sum holes: 2 */
>> };
>>
>> and it's definitely 64-bytes in total size (as it should be), and the
>> 'events' is indeed 32-bits as well.
>>
>> So at least that's good, we don't need anything extra in there. If we
>> can solve the endian issue, then it should be trivial to use the full
>> 32-bits for the flags in the sqe.
> 
> To get back to something that can move us forward, something like the
> below I _think_ will work. Then applications just use poll32_events and
> we don't have to handle this in any special way. Care to base on that
> and re-send the change?
> 
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c22699a5a7..5b3f6bd59437 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -31,7 +31,16 @@ struct io_uring_sqe {
>  	union {
>  		__kernel_rwf_t	rw_flags;
>  		__u32		fsync_flags;
> -		__u16		poll_events;
> +		struct {
> +#ifdef __BIG_ENDIAN_BITFIELD
> +			__u16	__unused_poll;
> +			__u16	poll_events;
> +#else
> +			__u16	poll_events;
> +			__u16	__unused_poll;
> +#endif
> +		};
> +		__u32		poll32_events;
>  		__u32		sync_range_flags;
>  		__u32		msg_flags;
>  		__u32		timeout_flags;
> 

Nah this won't work, as the BE layout will be different. So how about
this, just add a 16-bit poll_events_hi instead. App/liburing will set
upper bits there. Something like the below, then just needs the
exclusive wait change on top.

Only downside I can see is that newer applications on older kernels will
set EPOLLEXCLUSIVE but the kernel will ignore it. That's not a huge
concern for this particular bit, but could be a concern if there are
others that prove useful.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index de1175206807..a9d74330ad6b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4809,6 +4809,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
+	if (READ_ONCE(sqe->poll_events_hi) & EPOLLEXCLUSIVE)
+		poll->events |= EPOLLEXCLUSIVE;
+
 	io_get_req_task(req);
 	return 0;
 }
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..e6856d8e068f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -31,7 +31,10 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
-		__u16		poll_events;
+		struct {
+			__u16	poll_events;
+			__u16	poll_events_hi;
+		};
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 19:20                 ` Pavel Begunkov
@ 2020-06-16 19:27                   ` Jens Axboe
  0 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2020-06-16 19:27 UTC (permalink / raw)
  To: Pavel Begunkov, Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/16/20 1:20 PM, Pavel Begunkov wrote:
> On 16/06/2020 21:45, Jens Axboe wrote:
>> On 6/16/20 7:58 AM, Jens Axboe wrote:
>>> On 6/15/20 9:04 PM, Jiufei Xue wrote:
>>>>
>>>>
>>>> On 2020/6/15 下午11:09, Jens Axboe wrote:
>>>>> On 6/14/20 8:49 PM, Jiufei Xue wrote:
>>>>>> Hi Jens,
>>>>>>
>>>>>> On 2020/6/13 上午12:48, Jens Axboe wrote:
>>>>>>> On 6/12/20 8:58 AM, Jens Axboe wrote:
>>>>>>>> On 6/11/20 8:30 PM, Jiufei Xue wrote:
>>>>>>>>> poll events should be 32-bits to cover EPOLLEXCLUSIVE.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
>>>>>>>>> ---
>>>>>>>>>  fs/io_uring.c                 | 4 ++--
>>>>>>>>>  include/uapi/linux/io_uring.h | 2 +-
>>>>>>>>>  tools/io_uring/liburing.h     | 2 +-
>>>>>>>>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>>> index 47790a2..6250227 100644
>>>>>>>>> --- a/fs/io_uring.c
>>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>>> @@ -4602,7 +4602,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
>>>>>>>>>  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>>>>  {
>>>>>>>>>  	struct io_poll_iocb *poll = &req->poll;
>>>>>>>>> -	u16 events;
>>>>>>>>> +	u32 events;
>>>>>>>>>  
>>>>>>>>>  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>>>>>>>>  		return -EINVAL;
>>>>>>>>> @@ -8196,7 +8196,7 @@ static int __init io_uring_init(void)
>>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
>>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
>>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
>>>>>>>>> -	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
>>>>>>>>> +	BUILD_BUG_SQE_ELEM(28, __u32,  poll_events);
>>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
>>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
>>>>>>>>>  	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>>>> index 92c2269..afc7edd 100644
>>>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>>>> @@ -31,7 +31,7 @@ struct io_uring_sqe {
>>>>>>>>>  	union {
>>>>>>>>>  		__kernel_rwf_t	rw_flags;
>>>>>>>>>  		__u32		fsync_flags;
>>>>>>>>> -		__u16		poll_events;
>>>>>>>>> +		__u32		poll_events;
>>>>>>>>>  		__u32		sync_range_flags;
>>>>>>>>>  		__u32		msg_flags;
>>>>>>>>>  		__u32		timeout_flags;
>>>>>>>>
>>>>>>>> We obviously have the space in there as most other flag members are 32-bits, but
>>>>>>>> I'd want to double check if we're not changing the ABI here. Is this always
>>>>>>>> going to be safe, on any platform, regardless of endianess etc?
>>>>>>>
>>>>>>> Double checked, and as I feared, we can't safely do this. We'll have to
>>>>>>> do something like the below, grabbing an unused bit of the poll mask
>>>>>>> space and if that's set, then store the fact that EPOLLEXCLUSIVE is set.
>>>>>>> So probably best to turn this just into one patch, since it doesn't make
>>>>>>> a lot of sense to do it as a prep patch at that point.
>>>>>>>
>>>>>> Yes, Agree about that. But I also fear that if the unused bit is used
>>>>>> in the feature, it will bring unexpected behavior.
>>>>>
>>>>> Yeah, it's certainly not the prettiest and could potentially be fragile.
>>>>> I'm open to suggestions, we need some way of signaling that the 32-bit
>>>>> variant of the poll_events should be used. We could potentially make
>>>>> this work by doing explicit layout for big endian vs little endian, that
>>>>> might be prettier and wouldn't suffer from the "grab some random bit"
>>>>> issue.
>>>>>
>>>> Thank you for your suggestion, I will think about it.
>>>>
>>>>>>> This does have the benefit of not growing io_poll_iocb. With your patch,
>>>>>>> it'd go beyond a cacheline, and hence bump the size of the entire
>>>>>>> io_iocb as well, which would be very unfortunate.
>>>>>>>
>>>>>> events in io_poll_iocb is 32-bits already, so why it will bump the
>>>>>> size of the io_iocb structure with my patch? 
>>>>>
>>>>> It's not 32-bits already, it's a __poll_t type which is 16-bits only.
>>>>>
>>>> Yes, it is a __poll_t type, but I found that __poll_t type is 32-bits
>>>> with the definition below:
>>>>
>>>> typedef unsigned __bitwise __poll_t;
>>>>
>>>> And I also investigate it with crash:
>>>> crash> io_poll_iocb -ox
>>>> struct io_poll_iocb {
>>>>    [0x0] struct file *file;
>>>>          union {
>>>>    [0x8]     struct wait_queue_head *head;
>>>>    [0x8]     u64 addr;
>>>>          };
>>>>   [0x10] __poll_t events;
>>>>   [0x14] bool done;
>>>>   [0x15] bool canceled;
>>>>   [0x18] struct wait_queue_entry wait;
>>>> }
>>>
>>> Yeah you're right, not sure why I figured it was 16-bits. But just
>>> checking on my default build:
>>>
>>> axboe@x1 ~/gi/linux-block (block-5.8)> pahole -C io_poll_iocb fs/io_uring.o
>>> struct io_poll_iocb {
>>> 	struct file *              file;                 /*     0     8 */
>>> 	union {
>>> 		struct wait_queue_head * head;           /*     8     8 */
>>> 		u64                addr;                 /*     8     8 */
>>> 	};                                               /*     8     8 */
>>> 	__poll_t                   events;               /*    16     4 */
>>> 	bool                       done;                 /*    20     1 */
>>> 	bool                       canceled;             /*    21     1 */
>>>
>>> 	/* XXX 2 bytes hole, try to pack */
>>>
>>> 	struct wait_queue_entry wait;                    /*    24    40 */
>>>
>>> 	/* size: 64, cachelines: 1, members: 6 */
>>> 	/* sum members: 62, holes: 1, sum holes: 2 */
>>> };
>>>
>>> and it's definitely 64-bytes in total size (as it should be), and the
>>> 'events' is indeed 32-bits as well.
>>>
>>> So at least that's good, we don't need anything extra in there. If we
>>> can solve the endian issue, then it should be trivial to use the full
>>> 32-bits for the flags in the sqe.
>>
>> To get back to something that can move us forward, something like the
>> below I _think_ will work. Then applications just use poll32_events and
>> we don't have to handle this in any special way. Care to base on that
>> and re-send the change?
>>
>>
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 92c22699a5a7..5b3f6bd59437 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -31,7 +31,16 @@ struct io_uring_sqe {
>>  	union {
>>  		__kernel_rwf_t	rw_flags;
>>  		__u32		fsync_flags;
>> -		__u16		poll_events;
>> +		struct {
>> +#ifdef __BIG_ENDIAN_BITFIELD
>> +			__u16	__unused_poll;
>> +			__u16	poll_events;
>> +#else
>> +			__u16	poll_events;
>> +			__u16	__unused_poll;
>> +#endif
>> +		};
>> +		__u32		poll32_events;
>>  		__u32		sync_range_flags;
>>  		__u32		msg_flags;
>>  		__u32		timeout_flags;
>>
> 
> That changes layout for big endian, isn't it? It's not ABI compatible.

Right, see followup message, mid-air collision on that one.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 19:21                 ` Jens Axboe
@ 2020-06-16 21:46                   ` Pavel Begunkov
  2020-06-17  0:06                     ` Jens Axboe
  2020-06-17  1:39                     ` Jiufei Xue
  0 siblings, 2 replies; 16+ messages in thread
From: Pavel Begunkov @ 2020-06-16 21:46 UTC (permalink / raw)
  To: Jens Axboe, Jiufei Xue, io-uring; +Cc: joseph.qi

On 16/06/2020 22:21, Jens Axboe wrote:
> 
> Nah this won't work, as the BE layout will be different. So how about
> this, just add a 16-bit poll_events_hi instead. App/liburing will set
> upper bits there. Something like the below, then just needs the
> exclusive wait change on top.
> 
> Only downside I can see is that newer applications on older kernels will
> set EPOLLEXCLUSIVE but the kernel will ignore it. That's not a huge
> concern for this particular bit, but could be a concern if there are
> others that prove useful.
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index de1175206807..a9d74330ad6b 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -4809,6 +4809,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
>  	events = READ_ONCE(sqe->poll_events);
>  	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
>  
> +	if (READ_ONCE(sqe->poll_events_hi) & EPOLLEXCLUSIVE)

poll_events_hi is 16 bit, EPOLLEXCLUSIVE is (1 << 28). It's always false.
Do you look for something like below?


union {
	...
	__u32		fsync_flags;
	__u16		poll_events;  /* compatibility */
	__u32		poll32_events; /* word-reversed for BE */
};

u32 evs = READ_ONCE(poll32_events);
if (BIG_ENDIAN)
	evs = swahw32(evs); // swap 16-bit halves

// use as always, e.g. if (evs & EPOLLEXCLUSIVE) { ... }

> +		poll->events |= EPOLLEXCLUSIVE;
> +
>  	io_get_req_task(req);
>  	return 0;
>  }
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c22699a5a7..e6856d8e068f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -31,7 +31,10 @@ struct io_uring_sqe {
>  	union {
>  		__kernel_rwf_t	rw_flags;
>  		__u32		fsync_flags;
> -		__u16		poll_events;
> +		struct {
> +			__u16	poll_events;
> +			__u16	poll_events_hi;
> +		};
>  		__u32		sync_range_flags;
>  		__u32		msg_flags;
>  		__u32		timeout_flags;
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 21:46                   ` Pavel Begunkov
@ 2020-06-17  0:06                     ` Jens Axboe
  2020-06-17  1:39                     ` Jiufei Xue
  1 sibling, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2020-06-17  0:06 UTC (permalink / raw)
  To: Pavel Begunkov, Jiufei Xue, io-uring; +Cc: joseph.qi

On 6/16/20 3:46 PM, Pavel Begunkov wrote:
> On 16/06/2020 22:21, Jens Axboe wrote:
>>
>> Nah this won't work, as the BE layout will be different. So how about
>> this, just add a 16-bit poll_events_hi instead. App/liburing will set
>> upper bits there. Something like the below, then just needs the
>> exclusive wait change on top.
>>
>> Only downside I can see is that newer applications on older kernels will
>> set EPOLLEXCLUSIVE but the kernel will ignore it. That's not a huge
>> concern for this particular bit, but could be a concern if there are
>> others that prove useful.
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index de1175206807..a9d74330ad6b 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -4809,6 +4809,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
>>  	events = READ_ONCE(sqe->poll_events);
>>  	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
>>  
>> +	if (READ_ONCE(sqe->poll_events_hi) & EPOLLEXCLUSIVE)
> 
> poll_events_hi is 16 bit, EPOLLEXCLUSIVE is (1 << 28). It's always false.
> Do you look for something like below?
> 
> 
> union {
> 	...
> 	__u32		fsync_flags;
> 	__u16		poll_events;  /* compatibility */
> 	__u32		poll32_events; /* word-reversed for BE */
> };
> 
> u32 evs = READ_ONCE(poll32_events);
> if (BIG_ENDIAN)
> 	evs = swahw32(evs); // swap 16-bit halves
> 
> // use as always, e.g. if (evs & EPOLLEXCLUSIVE) { ... }

Duh yes, that's much better. Thanks for the clue bat.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v3 1/2] io_uring: change the poll events to be 32-bits
  2020-06-16 21:46                   ` Pavel Begunkov
  2020-06-17  0:06                     ` Jens Axboe
@ 2020-06-17  1:39                     ` Jiufei Xue
  1 sibling, 0 replies; 16+ messages in thread
From: Jiufei Xue @ 2020-06-17  1:39 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring; +Cc: joseph.qi

On 2020/6/17 上午5:46, Pavel Begunkov wrote:
> On 16/06/2020 22:21, Jens Axboe wrote:
>>
>> Nah this won't work, as the BE layout will be different. So how about
>> this, just add a 16-bit poll_events_hi instead. App/liburing will set
>> upper bits there. Something like the below, then just needs the
>> exclusive wait change on top.
>>
>> Only downside I can see is that newer applications on older kernels will
>> set EPOLLEXCLUSIVE but the kernel will ignore it. That's not a huge
>> concern for this particular bit, but could be a concern if there are
>> others that prove useful.
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index de1175206807..a9d74330ad6b 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -4809,6 +4809,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
>>  	events = READ_ONCE(sqe->poll_events);
>>  	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
>>  
>> +	if (READ_ONCE(sqe->poll_events_hi) & EPOLLEXCLUSIVE)
> 
> poll_events_hi is 16 bit, EPOLLEXCLUSIVE is (1 << 28). It's always false.
> Do you look for something like below?
> 
> 
> union {
> 	...
> 	__u32		fsync_flags;
> 	__u16		poll_events;  /* compatibility */
> 	__u32		poll32_events; /* word-reversed for BE */
> };
> 
> u32 evs = READ_ONCE(poll32_events);
> if (BIG_ENDIAN)
> 	evs = swahw32(evs); // swap 16-bit halves
> 
> // use as always, e.g. if (evs & EPOLLEXCLUSIVE) { ... }
>
Cool, That's clear. And we should do the reverse thing in liburing
for big endian.

I also want to add another feature bit in newer kernel, so that
the newer application should check the feature first before setting
the EPOLLEXCLUSIVE.

Thanks,
JIufei.

>> +		poll->events |= EPOLLEXCLUSIVE;
>> +
>>  	io_get_req_task(req);
>>  	return 0;
>>  }
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 92c22699a5a7..e6856d8e068f 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -31,7 +31,10 @@ struct io_uring_sqe {
>>  	union {
>>  		__kernel_rwf_t	rw_flags;
>>  		__u32		fsync_flags;
>> -		__u16		poll_events;
>> +		struct {
>> +			__u16	poll_events;
>> +			__u16	poll_events_hi;
>> +		};
>>  		__u32		sync_range_flags;
>>  		__u32		msg_flags;
>>  		__u32		timeout_flags;
>>
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2020-06-17  1:40 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-12  2:30 [PATCH v3] io_uring: add EPOLLEXCLUSIVE flag for POLL_ADD operation Jiufei Xue
2020-06-12  2:30 ` [PATCH v3 1/2] io_uring: change the poll events to be 32-bits Jiufei Xue
2020-06-12 14:58   ` Jens Axboe
2020-06-12 16:48     ` Jens Axboe
2020-06-15  2:49       ` Jiufei Xue
2020-06-15 15:09         ` Jens Axboe
2020-06-16  3:04           ` Jiufei Xue
2020-06-16 13:58             ` Jens Axboe
2020-06-16 18:45               ` Jens Axboe
2020-06-16 19:20                 ` Pavel Begunkov
2020-06-16 19:27                   ` Jens Axboe
2020-06-16 19:21                 ` Jens Axboe
2020-06-16 21:46                   ` Pavel Begunkov
2020-06-17  0:06                     ` Jens Axboe
2020-06-17  1:39                     ` Jiufei Xue
2020-06-12  2:30 ` [PATCH v3 2/2] io_uring: use EPOLLEXCLUSIVE flag to aoid thundering herd type behavior Jiufei Xue

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).