* [PATCH 1/4] timerfd: add support for keyed wakeups
2018-08-06 8:30 aio poll V22 (aka 2.0) Christoph Hellwig
@ 2018-08-06 8:30 ` Christoph Hellwig
2018-08-06 8:30 ` [PATCH 2/4] aio: add a iocb refcount Christoph Hellwig
` (3 subsequent siblings)
4 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-06 8:30 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
This prepares timerfd for use with aio poll.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Avi Kivity <avi@scylladb.com>
---
fs/timerfd.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index cdad49da3ff7..f6c54fd56645 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -66,7 +66,7 @@ static void timerfd_triggered(struct timerfd_ctx *ctx)
spin_lock_irqsave(&ctx->wqh.lock, flags);
ctx->expired = 1;
ctx->ticks++;
- wake_up_locked(&ctx->wqh);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
}
@@ -107,7 +107,7 @@ void timerfd_clock_was_set(void)
if (ctx->moffs != moffs) {
ctx->moffs = KTIME_MAX;
ctx->ticks++;
- wake_up_locked(&ctx->wqh);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
}
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
}
@@ -345,7 +345,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
spin_lock_irq(&ctx->wqh.lock);
if (!timerfd_canceled(ctx)) {
ctx->ticks = ticks;
- wake_up_locked(&ctx->wqh);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
} else
ret = -ECANCELED;
spin_unlock_irq(&ctx->wqh.lock);
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 2/4] aio: add a iocb refcount
2018-08-06 8:30 aio poll V22 (aka 2.0) Christoph Hellwig
2018-08-06 8:30 ` [PATCH 1/4] timerfd: add support for keyed wakeups Christoph Hellwig
@ 2018-08-06 8:30 ` Christoph Hellwig
2018-08-06 8:30 ` [PATCH 3/4] aio: implement IOCB_CMD_POLL Christoph Hellwig
` (2 subsequent siblings)
4 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-06 8:30 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
This is needed to prevent races caused by the way the ->poll API works.
To avoid introducing overhead for other users of the iocbs we initialize
it to zero and only do refcount operations if it is non-zero in the
completion path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Avi Kivity <avi@scylladb.com>
---
fs/aio.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 27454594e37a..fe2018ada32c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -18,6 +18,7 @@
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
+#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/sched/signal.h>
@@ -178,6 +179,7 @@ struct aio_kiocb {
struct list_head ki_list; /* the aio core uses this
* for cancellation */
+ refcount_t ki_refcnt;
/*
* If the aio_resfd field of the userspace iocb is not zero,
@@ -1015,6 +1017,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
percpu_ref_get(&ctx->reqs);
INIT_LIST_HEAD(&req->ki_list);
+ refcount_set(&req->ki_refcnt, 0);
req->ki_ctx = ctx;
return req;
out_put:
@@ -1049,6 +1052,15 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
return ret;
}
+static inline void iocb_put(struct aio_kiocb *iocb)
+{
+ if (refcount_read(&iocb->ki_refcnt) == 0 ||
+ refcount_dec_and_test(&iocb->ki_refcnt)) {
+ percpu_ref_put(&iocb->ki_ctx->reqs);
+ kmem_cache_free(kiocb_cachep, iocb);
+ }
+}
+
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
@@ -1118,8 +1130,6 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
eventfd_ctx_put(iocb->ki_eventfd);
}
- kmem_cache_free(kiocb_cachep, iocb);
-
/*
* We have to order our ring_info tail store above and test
* of the wait list below outside the wait lock. This is
@@ -1130,8 +1140,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
-
- percpu_ref_put(&ctx->reqs);
+ iocb_put(iocb);
}
/* aio_read_events_ring
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-06 8:30 aio poll V22 (aka 2.0) Christoph Hellwig
2018-08-06 8:30 ` [PATCH 1/4] timerfd: add support for keyed wakeups Christoph Hellwig
2018-08-06 8:30 ` [PATCH 2/4] aio: add a iocb refcount Christoph Hellwig
@ 2018-08-06 8:30 ` Christoph Hellwig
2018-08-06 8:30 ` [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups Christoph Hellwig
2018-08-06 16:49 ` aio poll V22 (aka 2.0) Linus Torvalds
4 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-06 8:30 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
Simple one-shot poll through the io_submit() interface. To poll for
a file descriptor the application should submit an iocb of type
IOCB_CMD_POLL. It will poll the fd for the events specified in the
first 32 bits of the aio_buf field of the iocb.
Unlike poll or epoll without EPOLLONESHOT this interface always works
in one shot mode, that is once the iocb is completed, it will have to be
resubmitted.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Avi Kivity <avi@scylladb.com>
---
fs/aio.c | 178 +++++++++++++++++++++++++++++++++++
include/uapi/linux/aio_abi.h | 6 +-
2 files changed, 180 insertions(+), 4 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index fe2018ada32c..2fd19521d8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
* Implements an efficient asynchronous io interface.
*
* Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright 2018 Christoph Hellwig.
*
* See ../COPYING for licensing terms.
*/
@@ -165,10 +166,21 @@ struct fsync_iocb {
bool datasync;
};
+struct poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+ bool woken;
+ bool cancelled;
+ struct wait_queue_entry wait;
+ struct work_struct work;
+};
+
struct aio_kiocb {
union {
struct kiocb rw;
struct fsync_iocb fsync;
+ struct poll_iocb poll;
};
struct kioctx *ki_ctx;
@@ -1601,6 +1613,169 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
return 0;
}
+static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+ struct file *file = iocb->poll.file;
+
+ aio_complete(iocb, mangle_poll(mask), 0);
+ fput(file);
+}
+
+static void aio_poll_complete_work(struct work_struct *work)
+{
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+ struct poll_table_struct pt = { ._key = req->events };
+ struct kioctx *ctx = iocb->ki_ctx;
+ __poll_t mask = 0;
+
+ if (!READ_ONCE(req->cancelled))
+ mask = vfs_poll(req->file, &pt) & req->events;
+
+ /*
+ * Note that ->ki_cancel callers also delete iocb from active_reqs after
+ * calling ->ki_cancel. We need the ctx_lock roundtrip here to
+ * synchronize with them. In the cancellation case the list_del_init
+ * itself is not actually needed, but harmless so we keep it in to
+ * avoid further branches in the fast path.
+ */
+ spin_lock_irq(&ctx->ctx_lock);
+ if (!mask && !READ_ONCE(req->cancelled)) {
+ add_wait_queue(req->head, &req->wait);
+ spin_unlock_irq(&ctx->ctx_lock);
+ return;
+ }
+ list_del_init(&iocb->ki_list);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ aio_poll_complete(iocb, mask);
+}
+
+/* assumes we are called with irqs disabled */
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+ struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+ struct poll_iocb *req = &aiocb->poll;
+
+ spin_lock(&req->head->lock);
+ WRITE_ONCE(req->cancelled, true);
+ if (!list_empty(&req->wait.entry)) {
+ list_del_init(&req->wait.entry);
+ schedule_work(&aiocb->poll.work);
+ }
+ spin_unlock(&req->head->lock);
+
+ return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ __poll_t mask = key_to_poll(key);
+
+ req->woken = true;
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & req->events))
+ return 0;
+
+ list_del_init(&req->wait.entry);
+ schedule_work(&req->work);
+ return 1;
+}
+
+struct aio_poll_table {
+ struct poll_table_struct pt;
+ struct aio_kiocb *iocb;
+ int error;
+};
+
+static void
+aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
+
+ /* multiple wait queues per file are not supported */
+ if (unlikely(pt->iocb->poll.head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ pt->iocb->poll.head = head;
+ add_wait_queue(head, &pt->iocb->poll.wait);
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+ struct aio_poll_table apt;
+ __poll_t mask;
+
+ /* reject any unknown events outside the normal event mask. */
+ if ((u16)iocb->aio_buf != iocb->aio_buf)
+ return -EINVAL;
+ /* reject fields that are not defined for poll */
+ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+ return -EINVAL;
+
+ INIT_WORK(&req->work, aio_poll_complete_work);
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+
+ apt.pt._qproc = aio_poll_queue_proc;
+ apt.pt._key = req->events;
+ apt.iocb = aiocb;
+ apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+ /* initialized the list so that we can do list_empty checks */
+ INIT_LIST_HEAD(&req->wait.entry);
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+ /* one for removal from waitqueue, one for this function */
+ refcount_set(&aiocb->ki_refcnt, 2);
+
+ mask = vfs_poll(req->file, &apt.pt) & req->events;
+ if (unlikely(!req->head)) {
+ /* we did not manage to set up a waitqueue, done */
+ goto out;
+ }
+
+ spin_lock_irq(&ctx->ctx_lock);
+ spin_lock(&req->head->lock);
+ if (req->woken) {
+ /* wake_up context handles the rest */
+ mask = 0;
+ apt.error = 0;
+ } else if (mask || apt.error) {
+ /* if we get an error or a mask we are done */
+ WARN_ON_ONCE(list_empty(&req->wait.entry));
+ list_del_init(&req->wait.entry);
+ } else {
+ /* actually waiting for an event */
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ aiocb->ki_cancel = aio_poll_cancel;
+ }
+ spin_unlock(&req->head->lock);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+out:
+ if (unlikely(apt.error)) {
+ fput(req->file);
+ return apt.error;
+ }
+
+ if (mask)
+ aio_poll_complete(aiocb, mask);
+ iocb_put(aiocb);
+ return 0;
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
bool compat)
{
@@ -1674,6 +1849,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
case IOCB_CMD_FDSYNC:
ret = aio_fsync(&req->fsync, &iocb, true);
break;
+ case IOCB_CMD_POLL:
+ ret = aio_poll(req, &iocb);
+ break;
default:
pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
ret = -EINVAL;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d4593a6062ef..ce43d340f010 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -38,10 +38,8 @@ enum {
IOCB_CMD_PWRITE = 1,
IOCB_CMD_FSYNC = 2,
IOCB_CMD_FDSYNC = 3,
- /* These two are experimental.
- * IOCB_CMD_PREADX = 4,
- * IOCB_CMD_POLL = 5,
- */
+ /* 4 was the experimental IOCB_CMD_PREADX */
+ IOCB_CMD_POLL = 5,
IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups
2018-08-06 8:30 aio poll V22 (aka 2.0) Christoph Hellwig
` (2 preceding siblings ...)
2018-08-06 8:30 ` [PATCH 3/4] aio: implement IOCB_CMD_POLL Christoph Hellwig
@ 2018-08-06 8:30 ` Christoph Hellwig
2018-08-06 22:27 ` Andrew Morton
2018-08-07 11:44 ` [PATCH 4/4 v2] " Christoph Hellwig
2018-08-06 16:49 ` aio poll V22 (aka 2.0) Linus Torvalds
4 siblings, 2 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-06 8:30 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
If we get a keyed wakeup for a aio poll waitqueue and wake can acquire the
ctx_lock without spinning we can just complete the iocb straight from the
wakeup callback to avoid a context switch.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Avi Kivity <avi@scylladb.com>
---
fs/aio.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 2fd19521d8a8..29f2b5b57d32 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1672,13 +1672,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
req->woken = true;
/* for instances that support it check for an event match first: */
- if (mask && !(mask & req->events))
- return 0;
+ if (mask) {
+ if (!(mask & req->events))
+ return 0;
+
+ /* try to complete the iocb inline if we can: */
+ if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ list_del(&iocb->ki_list);
+ spin_unlock(&iocb->ki_ctx->ctx_lock);
+
+ list_del_init(&req->wait.entry);
+ aio_poll_complete(iocb, mask);
+ return 1;
+ }
+ }
list_del_init(&req->wait.entry);
schedule_work(&req->work);
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups
2018-08-06 8:30 ` [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups Christoph Hellwig
@ 2018-08-06 22:27 ` Andrew Morton
2018-08-07 7:25 ` Christoph Hellwig
2018-08-07 11:44 ` [PATCH 4/4 v2] " Christoph Hellwig
1 sibling, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2018-08-06 22:27 UTC (permalink / raw)
To: Christoph Hellwig
Cc: viro, Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
On Mon, 6 Aug 2018 10:30:58 +0200 Christoph Hellwig <hch@lst.de> wrote:
> If we get a keyed wakeup for a aio poll waitqueue and wake can acquire the
> ctx_lock without spinning we can just complete the iocb straight from the
> wakeup callback to avoid a context switch.
Why do we try to avoid spinning on the lock?
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1672,13 +1672,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
> void *key)
> {
> struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
> + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
> __poll_t mask = key_to_poll(key);
>
> req->woken = true;
>
> /* for instances that support it check for an event match first: */
> - if (mask && !(mask & req->events))
> - return 0;
> + if (mask) {
> + if (!(mask & req->events))
> + return 0;
> +
> + /* try to complete the iocb inline if we can: */
ie, this comment explains 'what" but not "why".
(There's a typo in Subject:, btw)
> + if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
> + list_del(&iocb->ki_list);
> + spin_unlock(&iocb->ki_ctx->ctx_lock);
> +
> + list_del_init(&req->wait.entry);
> + aio_poll_complete(iocb, mask);
> + return 1;
> + }
> + }
>
> list_del_init(&req->wait.entry);
> schedule_work(&req->work);
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups
2018-08-06 22:27 ` Andrew Morton
@ 2018-08-07 7:25 ` Christoph Hellwig
2018-08-07 16:04 ` Andrew Morton
0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-07 7:25 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, viro, Avi Kivity, Linus Torvalds, linux-aio,
linux-fsdevel, linux-kernel
On Mon, Aug 06, 2018 at 03:27:05PM -0700, Andrew Morton wrote:
> On Mon, 6 Aug 2018 10:30:58 +0200 Christoph Hellwig <hch@lst.de> wrote:
>
> > If we get a keyed wakeup for a aio poll waitqueue and wake can acquire the
> > ctx_lock without spinning we can just complete the iocb straight from the
> > wakeup callback to avoid a context switch.
>
> Why do we try to avoid spinning on the lock?
Because we are called with the lock on the waitqueue called, which
nests inside it.
> > + /* try to complete the iocb inline if we can: */
>
> ie, this comment explains 'what" but not "why".
>
> (There's a typo in Subject:, btw)
Because it is faster obviously. I can update the comment.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups
2018-08-07 7:25 ` Christoph Hellwig
@ 2018-08-07 16:04 ` Andrew Morton
2018-08-08 9:57 ` Christoph Hellwig
0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2018-08-07 16:04 UTC (permalink / raw)
To: Christoph Hellwig
Cc: viro, Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
On Tue, 7 Aug 2018 09:25:55 +0200 Christoph Hellwig <hch@lst.de> wrote:
> On Mon, Aug 06, 2018 at 03:27:05PM -0700, Andrew Morton wrote:
> > On Mon, 6 Aug 2018 10:30:58 +0200 Christoph Hellwig <hch@lst.de> wrote:
> >
> > > If we get a keyed wakeup for a aio poll waitqueue and wake can acquire the
> > > ctx_lock without spinning we can just complete the iocb straight from the
> > > wakeup callback to avoid a context switch.
> >
> > Why do we try to avoid spinning on the lock?
>
> Because we are called with the lock on the waitqueue called, which
> nests inside it.
Ah.
> > > + /* try to complete the iocb inline if we can: */
> >
> > ie, this comment explains 'what" but not "why".
> >
> > (There's a typo in Subject:, btw)
>
> Because it is faster obviously. I can update the comment.
I meant the comment could explain why it's a trylock instead of a
spin_lock().
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups
2018-08-07 16:04 ` Andrew Morton
@ 2018-08-08 9:57 ` Christoph Hellwig
0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-08 9:57 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, viro, Avi Kivity, Linus Torvalds, linux-aio,
linux-fsdevel, linux-kernel
On Tue, Aug 07, 2018 at 09:04:41AM -0700, Andrew Morton wrote:
> > Because it is faster obviously. I can update the comment.
>
> I meant the comment could explain why it's a trylock instead of a
> spin_lock().
We could do something like this, as in the patch below.
Al, do you want me to resend or can you just fold it in?
diff --git a/fs/aio.c b/fs/aio.c
index 5943098a87c6..84df2c2bf80b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1684,7 +1684,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
/*
* Try to complete the iocb inline if we can to avoid a costly
- * context switch.
+ * context switch. As the waitqueue lock nests inside the ctx
+ * lock we can only do that if we can get it without waiting.
*/
if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
list_del(&iocb->ki_list);
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 4/4 v2] aio: allow direct aio poll comletions for keyed wakeups
2018-08-06 8:30 ` [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups Christoph Hellwig
2018-08-06 22:27 ` Andrew Morton
@ 2018-08-07 11:44 ` Christoph Hellwig
1 sibling, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-07 11:44 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, Linus Torvalds, linux-aio, linux-fsdevel, linux-kernel
If we get a keyed wakeup for a aio poll waitqueue and wake can acquire the
ctx_lock without spinning we can just complete the iocb straight from the
wakeup callback to avoid a context switch.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Avi Kivity <avi@scylladb.com>
---
Fix a subject line typo and improve a comment.
fs/aio.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 2fd19521d8a8..5943098a87c6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1672,13 +1672,29 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
req->woken = true;
/* for instances that support it check for an event match first: */
- if (mask && !(mask & req->events))
- return 0;
+ if (mask) {
+ if (!(mask & req->events))
+ return 0;
+
+ /*
+ * Try to complete the iocb inline if we can to avoid a costly
+ * context switch.
+ */
+ if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ list_del(&iocb->ki_list);
+ spin_unlock(&iocb->ki_ctx->ctx_lock);
+
+ list_del_init(&req->wait.entry);
+ aio_poll_complete(iocb, mask);
+ return 1;
+ }
+ }
list_del_init(&req->wait.entry);
schedule_work(&req->work);
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: aio poll V22 (aka 2.0)
2018-08-06 8:30 aio poll V22 (aka 2.0) Christoph Hellwig
` (3 preceding siblings ...)
2018-08-06 8:30 ` [PATCH 4/4] aio: allow direct aio poll comletions for keyed wakeups Christoph Hellwig
@ 2018-08-06 16:49 ` Linus Torvalds
2018-08-07 7:27 ` Christoph Hellwig
4 siblings, 1 reply; 23+ messages in thread
From: Linus Torvalds @ 2018-08-06 16:49 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Al Viro, Avi Kivity, linux-aio, linux-fsdevel, Linux Kernel Mailing List
On Mon, Aug 6, 2018 at 1:31 AM Christoph Hellwig <hch@lst.de> wrote:
>
> As our dear leader didn't like the ->poll_mask method this tries to
> implement the behavior using plain old ->poll which is rather painful.
I'm not seeing what's painful for this. Looking at the patches, this
is *much* more straightforward than your previous patch,
It adds refcounting to aio_iocb, but that's *much* better than messing
up every other subsystem.
Or is there some follow-up patches that are pending but you didn't
post that are the painful part? Because the diffstat says that this
second version is *way* less painful, at about 200 lines of code in a
couple of files, mostly aio, vs ~700 lines of changes all over the
place, together with a performance regression.
Linus
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: aio poll V22 (aka 2.0)
2018-08-06 16:49 ` aio poll V22 (aka 2.0) Linus Torvalds
@ 2018-08-07 7:27 ` Christoph Hellwig
0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-07 7:27 UTC (permalink / raw)
To: Linus Torvalds
Cc: Christoph Hellwig, Al Viro, Avi Kivity, linux-aio, linux-fsdevel,
Linux Kernel Mailing List
On Mon, Aug 06, 2018 at 09:49:24AM -0700, Linus Torvalds wrote:
> I'm not seeing what's painful for this. Looking at the patches, this
> is *much* more straightforward than your previous patch,
>
> It adds refcounting to aio_iocb, but that's *much* better than messing
> up every other subsystem.
>
> Or is there some follow-up patches that are pending but you didn't
> post that are the painful part? Because the diffstat says that this
> second version is *way* less painful, at about 200 lines of code in a
> couple of files, mostly aio, vs ~700 lines of changes all over the
> place, together with a performance regression.
It requires additional lock roundtrips and very strange life time
rules. But we've already established that our preference here are
different, so I'm not surprised by your different view.
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-07-30 7:15 aio poll and a new in-kernel poll API V21 " Christoph Hellwig
@ 2018-07-30 7:15 ` Christoph Hellwig
2018-08-01 23:54 ` Al Viro
2018-08-02 0:21 ` Al Viro
0 siblings, 2 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-07-30 7:15 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
Simple one-shot poll through the io_submit() interface. To poll for
a file descriptor the application should submit an iocb of type
IOCB_CMD_POLL. It will poll the fd for the events specified in the
first 32 bits of the aio_buf field of the iocb.
Unlike poll or epoll without EPOLLONESHOT this interface always works
in one shot mode, that is once the iocb is completed, it will have to be
resubmitted.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/aio.c | 178 +++++++++++++++++++++++++++++++++++
include/uapi/linux/aio_abi.h | 6 +-
2 files changed, 180 insertions(+), 4 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index fe2018ada32c..6993684d0665 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
* Implements an efficient asynchronous io interface.
*
* Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright 2018 Christoph Hellwig.
*
* See ../COPYING for licensing terms.
*/
@@ -165,10 +166,21 @@ struct fsync_iocb {
bool datasync;
};
+struct poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+ bool cancelled;
+ bool done;
+ struct wait_queue_entry wait;
+ struct work_struct work;
+};
+
struct aio_kiocb {
union {
struct kiocb rw;
struct fsync_iocb fsync;
+ struct poll_iocb poll;
};
struct kioctx *ki_ctx;
@@ -1601,6 +1613,169 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
return 0;
}
+static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+ struct file *file = iocb->poll.file;
+
+ aio_complete(iocb, mangle_poll(mask), 0);
+ fput(file);
+}
+
+static void aio_poll_complete_work(struct work_struct *work)
+{
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+ struct poll_table_struct pt = { ._key = req->events };
+ struct kioctx *ctx = iocb->ki_ctx;
+ __poll_t mask;
+
+ if (READ_ONCE(req->cancelled)) {
+ /* synchronize with ki_list removal in the callers: */
+ spin_lock_irq(&ctx->ctx_lock);
+ WARN_ON_ONCE(!list_empty(&iocb->ki_list));
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ aio_poll_complete(iocb, 0);
+ return;
+ }
+
+ mask = vfs_poll(req->file, &pt) & req->events;
+ if (!mask) {
+ add_wait_queue(req->head, &req->wait);
+ return;
+ }
+
+ spin_lock_irq(&ctx->ctx_lock);
+ req->done = true;
+ list_del(&iocb->ki_list);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ aio_poll_complete(iocb, mask);
+}
+
+/* assumes we are called with irqs disabled */
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+ struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+ struct poll_iocb *req = &aiocb->poll;
+
+ spin_lock(&req->head->lock);
+ if (!list_empty(&req->wait.entry)) {
+ WRITE_ONCE(req->cancelled, true);
+ list_del_init(&req->wait.entry);
+ schedule_work(&aiocb->poll.work);
+ }
+ spin_unlock(&req->head->lock);
+
+ return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ __poll_t mask = key_to_poll(key);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & req->events))
+ return 0;
+
+ list_del_init(&req->wait.entry);
+ schedule_work(&req->work);
+ return 1;
+}
+
+struct aio_poll_table {
+ struct poll_table_struct pt;
+ struct aio_kiocb *iocb;
+ int error;
+};
+
+static void
+aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
+
+ /* multiple wait queues per file are not supported */
+ if (unlikely(pt->iocb->poll.head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ pt->iocb->poll.head = head;
+ add_wait_queue(head, &pt->iocb->poll.wait);
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+ struct aio_poll_table apt;
+ __poll_t mask;
+
+ /* reject any unknown events outside the normal event mask. */
+ if ((u16)iocb->aio_buf != iocb->aio_buf)
+ return -EINVAL;
+ /* reject fields that are not defined for poll */
+ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+ return -EINVAL;
+
+ INIT_WORK(&req->work, aio_poll_complete_work);
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+
+ apt.pt._qproc = aio_poll_queue_proc;
+ apt.pt._key = req->events;
+ apt.iocb = aiocb;
+ apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+ /* initialized the list so that we can do list_empty checks */
+ INIT_LIST_HEAD(&req->wait.entry);
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+ /* one for removal from waitqueue, one for this function */
+ refcount_set(&aiocb->ki_refcnt, 2);
+
+ mask = vfs_poll(req->file, &apt.pt) & req->events;
+ if (mask || apt.error) {
+ bool removed = false;
+
+ /* we did not manage to set up a waitqueue, done */
+ if (unlikely(!req->head))
+ goto out_fput;
+
+ spin_lock_irq(&req->head->lock);
+ if (!list_empty(&req->wait.entry)) {
+ list_del_init(&req->wait.entry);
+ removed = true;
+ }
+ spin_unlock_irq(&req->head->lock);
+
+ if (removed) {
+ if (apt.error)
+ goto out_fput;
+ aio_poll_complete(aiocb, mask);
+ }
+ } else {
+ spin_lock_irq(&ctx->ctx_lock);
+ if (!req->done) {
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ aiocb->ki_cancel = aio_poll_cancel;
+ }
+ spin_unlock_irq(&ctx->ctx_lock);
+ }
+
+ iocb_put(aiocb);
+ return 0;
+out_fput:
+ fput(req->file);
+ return apt.error;
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
bool compat)
{
@@ -1674,6 +1849,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
case IOCB_CMD_FDSYNC:
ret = aio_fsync(&req->fsync, &iocb, true);
break;
+ case IOCB_CMD_POLL:
+ ret = aio_poll(req, &iocb);
+ break;
default:
pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
ret = -EINVAL;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d4593a6062ef..ce43d340f010 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -38,10 +38,8 @@ enum {
IOCB_CMD_PWRITE = 1,
IOCB_CMD_FSYNC = 2,
IOCB_CMD_FDSYNC = 3,
- /* These two are experimental.
- * IOCB_CMD_PREADX = 4,
- * IOCB_CMD_POLL = 5,
- */
+ /* 4 was the experimental IOCB_CMD_PREADX */
+ IOCB_CMD_POLL = 5,
IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-07-30 7:15 ` [PATCH 3/4] aio: implement IOCB_CMD_POLL Christoph Hellwig
@ 2018-08-01 23:54 ` Al Viro
2018-08-02 9:00 ` Christoph Hellwig
2018-08-02 0:21 ` Al Viro
1 sibling, 1 reply; 23+ messages in thread
From: Al Viro @ 2018-08-01 23:54 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Mon, Jul 30, 2018 at 09:15:43AM +0200, Christoph Hellwig wrote:
> + apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
> + mask = vfs_poll(req->file, &apt.pt) & req->events;
> + if (mask || apt.error) {
> + bool removed = false;
> +
> + /* we did not manage to set up a waitqueue, done */
> + if (unlikely(!req->head))
> + goto out_fput;
> +out_fput:
> + fput(req->file);
> + return apt.error;
Ugh... So anything that simply returns a constant value, without
even bothering to do poll_wait() (on the theory that no matter how
much you wait, nothing will change) is going to get -EINVAL?
What am I missing here?
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-01 23:54 ` Al Viro
@ 2018-08-02 9:00 ` Christoph Hellwig
0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-02 9:00 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 12:54:12AM +0100, Al Viro wrote:
> On Mon, Jul 30, 2018 at 09:15:43AM +0200, Christoph Hellwig wrote:
>
> > + apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
>
> > + mask = vfs_poll(req->file, &apt.pt) & req->events;
> > + if (mask || apt.error) {
> > + bool removed = false;
> > +
> > + /* we did not manage to set up a waitqueue, done */
> > + if (unlikely(!req->head))
> > + goto out_fput;
>
> > +out_fput:
> > + fput(req->file);
> > + return apt.error;
>
> Ugh... So anything that simply returns a constant value, without
> even bothering to do poll_wait() (on the theory that no matter how
> much you wait, nothing will change) is going to get -EINVAL?
> What am I missing here?
I can change it to return that constant value, but is it really going
to be useful to keep resubmitting an iocb for something that we can't
actually poll? I don't think we help the application with that at all.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-07-30 7:15 ` [PATCH 3/4] aio: implement IOCB_CMD_POLL Christoph Hellwig
2018-08-01 23:54 ` Al Viro
@ 2018-08-02 0:21 ` Al Viro
2018-08-02 9:22 ` Christoph Hellwig
1 sibling, 1 reply; 23+ messages in thread
From: Al Viro @ 2018-08-02 0:21 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Mon, Jul 30, 2018 at 09:15:43AM +0200, Christoph Hellwig wrote:
> +static void aio_poll_complete_work(struct work_struct *work)
> +{
> + struct poll_iocb *req = container_of(work, struct poll_iocb, work);
> + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
> + struct poll_table_struct pt = { ._key = req->events };
> + struct kioctx *ctx = iocb->ki_ctx;
> + __poll_t mask;
> +
> + if (READ_ONCE(req->cancelled)) {
....
> + }
> +
> + mask = vfs_poll(req->file, &pt) & req->events;
> + if (!mask) {
> + add_wait_queue(req->head, &req->wait);
> + return;
> + }
....
> +}
> +/* assumes we are called with irqs disabled */
> +static int aio_poll_cancel(struct kiocb *iocb)
> +{
> + struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
> + struct poll_iocb *req = &aiocb->poll;
> +
> + spin_lock(&req->head->lock);
> + if (!list_empty(&req->wait.entry)) {
> + WRITE_ONCE(req->cancelled, true);
> + list_del_init(&req->wait.entry);
> + schedule_work(&aiocb->poll.work);
> + }
> + spin_unlock(&req->head->lock);
> +
> + return 0;
> +}
> +static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
> + void *key)
> +{
> + struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
> + __poll_t mask = key_to_poll(key);
> +
> + /* for instances that support it check for an event match first: */
> + if (mask && !(mask & req->events))
> + return 0;
> +
> + list_del_init(&req->wait.entry);
> + schedule_work(&req->work);
> + return 1;
> +}
> +static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
> +{
> + struct kioctx *ctx = aiocb->ki_ctx;
> + struct poll_iocb *req = &aiocb->poll;
> + struct aio_poll_table apt;
> + __poll_t mask;
> + mask = vfs_poll(req->file, &apt.pt) & req->events;
> + if (mask || apt.error) {
> + } else {
> + spin_lock_irq(&ctx->ctx_lock);
> + if (!req->done) {
> + list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
> + aiocb->ki_cancel = aio_poll_cancel;
> + }
> + spin_unlock_irq(&ctx->ctx_lock);
> + }
So what happens if
* we call aio_poll(), add the sucker to queue and see that we need
to wait
* add to ->active_refs just as the wakeup comes
* wakeup removes from queue and hits schedule_work()
* io_cancel() is called, triggering aio_poll_cancel(), which sees that
we are not from queue and buggers off. We are gone from ->active_refs.
* aio_poll_complete_work() is called, sees no ->cancelled
* aio_poll_complete_work() calls vfs_poll(), sees nothing interesting
and puts us back on the queue.
Unless I'm misreading it, cancel will end up with iocb still around and now
impossible to cancel... What am I missing?
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-02 0:21 ` Al Viro
@ 2018-08-02 9:22 ` Christoph Hellwig
2018-08-02 16:00 ` Al Viro
0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-02 9:22 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 01:21:22AM +0100, Al Viro wrote:
> So what happens if
> * we call aio_poll(), add the sucker to queue and see that we need
> to wait
> * add to ->active_refs just as the wakeup comes
active_reqs I guess..
> * wakeup removes from queue and hits schedule_work()
> * io_cancel() is called, triggering aio_poll_cancel(), which sees that
> we are not from queue and buggers off. We are gone from ->active_refs.
> * aio_poll_complete_work() is called, sees no ->cancelled
> * aio_poll_complete_work() calls vfs_poll(), sees nothing interesting
> and puts us back on the queue.
So let me draw this up, we start with the following:
THREAD 1 THREAD 2
aio_poll
vfs_poll(...)
add_wait_queue()
(no pending mask)
spin_lock_irq(&ctx->ctx_lock);
list_add_tail(..., &ctx->active_reqs) aio_poll_wake
spin_unlock_irq(&ctx->ctx_lock);
(spin_trylock failed)
list_del_init(&req->wait.entry);
schedule_work(&req->work);
Now switching to two new threads:
io_cancel thread worker thread
vfs_poll()
(mask = 0)
aio_poll_cancel
(not on waitqueue, done)
remove from active_reqs
add_wait_queue()
iocb still around
>
> Unless I'm misreading it, cancel will end up with iocb still around and now
> impossible to cancel... What am I missing?
Yes, I think you are right. I'll see how I could handle that case.
One of the easiest options would be to just support aio poll on
file ops that support keyed wakeups, we'd just need to pass that
information up.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-02 9:22 ` Christoph Hellwig
@ 2018-08-02 16:00 ` Al Viro
2018-08-02 16:08 ` Christoph Hellwig
0 siblings, 1 reply; 23+ messages in thread
From: Al Viro @ 2018-08-02 16:00 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 11:22:34AM +0200, Christoph Hellwig wrote:
> Yes, I think you are right. I'll see how I could handle that case.
> One of the easiest options would be to just support aio poll on
> file ops that support keyed wakeups, we'd just need to pass that
> information up.
BTW, what happens if we insert into one queue and immediately get
woken up, even before the damn thing gets to the end of ->poll(),
which proceeds to call poll_wait() again (on another queue)?
AFAICS, apt.error will be set by the second callback and completely
ignored. And so will the return value of ->poll()...
Sigh... Analysis of that thing is bloody painful, mostly because
it's hard to describe the state...
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-02 16:00 ` Al Viro
@ 2018-08-02 16:08 ` Christoph Hellwig
2018-08-02 16:08 ` Al Viro
0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-02 16:08 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 05:00:32PM +0100, Al Viro wrote:
> BTW, what happens if we insert into one queue and immediately get
> woken up, even before the damn thing gets to the end of ->poll(),
> which proceeds to call poll_wait() again (on another queue)?
> AFAICS, apt.error will be set by the second callback and completely
> ignored. And so will the return value of ->poll()...
>
> Sigh... Analysis of that thing is bloody painful, mostly because
> it's hard to describe the state...
That's the problem with the ->poll interface. We call it, then
have magic happen underneath where it might or might not get added
to one (or more if we didn't exclude that) waitqueues, and might
have actually been woken before return. I can't really think of
a good way to do that entirely sanely.
Best I can think of is to only allow using file ops that do keyed
wakeups and rely on the keyed wakeups alone. I've started coming
up with a version of that, but it won't be until tomorrow at least
that I can post it.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-02 16:08 ` Christoph Hellwig
@ 2018-08-02 16:08 ` Al Viro
2018-08-02 16:16 ` Christoph Hellwig
0 siblings, 1 reply; 23+ messages in thread
From: Al Viro @ 2018-08-02 16:08 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 06:08:16PM +0200, Christoph Hellwig wrote:
> On Thu, Aug 02, 2018 at 05:00:32PM +0100, Al Viro wrote:
> > BTW, what happens if we insert into one queue and immediately get
> > woken up, even before the damn thing gets to the end of ->poll(),
> > which proceeds to call poll_wait() again (on another queue)?
> > AFAICS, apt.error will be set by the second callback and completely
> > ignored. And so will the return value of ->poll()...
> >
> > Sigh... Analysis of that thing is bloody painful, mostly because
> > it's hard to describe the state...
>
> That's the problem with the ->poll interface. We call it, then
> have magic happen underneath where it might or might not get added
> to one (or more if we didn't exclude that) waitqueues, and might
> have actually been woken before return. I can't really think of
> a good way to do that entirely sanely.
>
> Best I can think of is to only allow using file ops that do keyed
> wakeups and rely on the keyed wakeups alone. I've started coming
> up with a version of that, but it won't be until tomorrow at least
> that I can post it.
What does it buy you? You still have to deal with trylock failures
in wakeup...
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-02 16:08 ` Al Viro
@ 2018-08-02 16:16 ` Christoph Hellwig
2018-08-02 21:48 ` Al Viro
0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2018-08-02 16:16 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 05:08:38PM +0100, Al Viro wrote:
> On Thu, Aug 02, 2018 at 06:08:16PM +0200, Christoph Hellwig wrote:
> > On Thu, Aug 02, 2018 at 05:00:32PM +0100, Al Viro wrote:
> > > BTW, what happens if we insert into one queue and immediately get
> > > woken up, even before the damn thing gets to the end of ->poll(),
> > > which proceeds to call poll_wait() again (on another queue)?
> > > AFAICS, apt.error will be set by the second callback and completely
> > > ignored. And so will the return value of ->poll()...
> > >
> > > Sigh... Analysis of that thing is bloody painful, mostly because
> > > it's hard to describe the state...
> >
> > That's the problem with the ->poll interface. We call it, then
> > have magic happen underneath where it might or might not get added
> > to one (or more if we didn't exclude that) waitqueues, and might
> > have actually been woken before return. I can't really think of
> > a good way to do that entirely sanely.
> >
> > Best I can think of is to only allow using file ops that do keyed
> > wakeups and rely on the keyed wakeups alone. I've started coming
> > up with a version of that, but it won't be until tomorrow at least
> > that I can post it.
>
> What does it buy you? You still have to deal with trylock failures
> in wakeup...
But we'll never re-add an iocb once it has been removed from the
waitqueue.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-08-02 16:16 ` Christoph Hellwig
@ 2018-08-02 21:48 ` Al Viro
0 siblings, 0 replies; 23+ messages in thread
From: Al Viro @ 2018-08-02 21:48 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
On Thu, Aug 02, 2018 at 06:16:48PM +0200, Christoph Hellwig wrote:
> On Thu, Aug 02, 2018 at 05:08:38PM +0100, Al Viro wrote:
> > On Thu, Aug 02, 2018 at 06:08:16PM +0200, Christoph Hellwig wrote:
> > > On Thu, Aug 02, 2018 at 05:00:32PM +0100, Al Viro wrote:
> > > > BTW, what happens if we insert into one queue and immediately get
> > > > woken up, even before the damn thing gets to the end of ->poll(),
> > > > which proceeds to call poll_wait() again (on another queue)?
> > > > AFAICS, apt.error will be set by the second callback and completely
> > > > ignored. And so will the return value of ->poll()...
> > > >
> > > > Sigh... Analysis of that thing is bloody painful, mostly because
> > > > it's hard to describe the state...
> > >
> > > That's the problem with the ->poll interface. We call it, then
> > > have magic happen underneath where it might or might not get added
> > > to one (or more if we didn't exclude that) waitqueues, and might
> > > have actually been woken before return. I can't really think of
> > > a good way to do that entirely sanely.
> > >
> > > Best I can think of is to only allow using file ops that do keyed
> > > wakeups and rely on the keyed wakeups alone. I've started coming
> > > up with a version of that, but it won't be until tomorrow at least
> > > that I can post it.
> >
> > What does it buy you? You still have to deal with trylock failures
> > in wakeup...
>
> But we'll never re-add an iocb once it has been removed from the
> waitqueue.
Umm... Frankly, I wonder if the right approach is to declare that if
wakeup has happened at all, submit gives up any responsibility.
IOW, have aio_poll
* create iocb, feed to ->poll()
* check if ->head is NULL; if it is, the mask we've got is
*all* we are going to get; complete and bugger off.
* lock ioctx
* insert the sucker into ->active_reqs
* lock the queue
* check if wakeup has already happened.
* if it has - go away, it's submitted
* check apt.error; if set, fail with -EINVAL.
* check the mask; if nothing of interest is in there, go away
* dequeue, unlist, complete and bugger off
On cancel side
* lock queue
* mark it cancelled
* removed from queue if it was there
* unlock queue
On wakeup
* mark it woken
* complete or schedule completion, which would be where we check
if it's marked cancelled - both before vfs_poll() (to avoid calling it
if already set) and after (treating cancel during vfs_poll() as "complete
and bugger off").
Comments?
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 3/4] aio: implement IOCB_CMD_POLL
2018-07-26 8:28 aio poll and a new in-kernel poll API V20 (aka 2.0) Christoph Hellwig
@ 2018-07-26 8:29 ` Christoph Hellwig
0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2018-07-26 8:29 UTC (permalink / raw)
To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, linux-kernel
Simple one-shot poll through the io_submit() interface. To poll for
a file descriptor the application should submit an iocb of type
IOCB_CMD_POLL. It will poll the fd for the events specified in
the first 32 bits of the aio_buf field of the iocb.
Unlike poll or epoll without EPOLLONESHOT this interface always works
in one shot mode, that is once the iocb is completed, it will have to be
resubmitted.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/aio.c | 178 +++++++++++++++++++++++++++++++++++
include/uapi/linux/aio_abi.h | 6 +-
2 files changed, 180 insertions(+), 4 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 7f3c159b3e2e..cf364d75abe9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
* Implements an efficient asynchronous io interface.
*
* Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright 2018 Christoph Hellwig.
*
* See ../COPYING for licensing terms.
*/
@@ -164,10 +165,21 @@ struct fsync_iocb {
bool datasync;
};
+struct poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+ bool cancelled;
+ bool done;
+ struct wait_queue_entry wait;
+ struct work_struct work;
+};
+
struct aio_kiocb {
union {
struct kiocb rw;
struct fsync_iocb fsync;
+ struct poll_iocb poll;
};
struct kioctx *ki_ctx;
@@ -1600,6 +1612,169 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
return 0;
}
+static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+ struct file *file = iocb->poll.file;
+
+ aio_complete(iocb, mangle_poll(mask), 0);
+ fput(file);
+}
+
+static void aio_poll_complete_work(struct work_struct *work)
+{
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+ struct poll_table_struct pt = { ._key = req->events };
+ struct kioctx *ctx = iocb->ki_ctx;
+ __poll_t mask;
+
+ if (READ_ONCE(req->cancelled)) {
+ /* synchronize with ki_list removal in the callers: */
+ spin_lock_irq(&ctx->ctx_lock);
+ WARN_ON_ONCE(!list_empty(&iocb->ki_list));
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ aio_poll_complete(iocb, 0);
+ return;
+ }
+
+ mask = vfs_poll(req->file, &pt) & req->events;
+ if (!mask) {
+ add_wait_queue(req->head, &req->wait);
+ return;
+ }
+
+ spin_lock_irq(&ctx->ctx_lock);
+ req->done = true;
+ list_del(&iocb->ki_list);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ aio_poll_complete(iocb, mask);
+}
+
+/* assumes we are called with irqs disabled */
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+ struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+ struct poll_iocb *req = &aiocb->poll;
+
+ spin_lock(&req->head->lock);
+ if (!list_empty(&req->wait.entry)) {
+ WRITE_ONCE(req->cancelled, true);
+ list_del_init(&req->wait.entry);
+ schedule_work(&aiocb->poll.work);
+ }
+ spin_unlock(&req->head->lock);
+
+ return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ __poll_t mask = key_to_poll(key);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & req->events))
+ return 0;
+
+ list_del_init(&req->wait.entry);
+ schedule_work(&req->work);
+ return 1;
+}
+
+struct aio_poll_table {
+ struct poll_table_struct pt;
+ struct aio_kiocb *iocb;
+ int error;
+};
+
+static void
+aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
+
+ /* multiple wait queues per file are not supported */
+ if (unlikely(pt->iocb->poll.head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ pt->iocb->poll.head = head;
+ add_wait_queue(head, &pt->iocb->poll.wait);
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+ struct aio_poll_table apt;
+ __poll_t mask;
+
+ /* reject any unknown events outside the normal event mask. */
+ if ((u16)iocb->aio_buf != iocb->aio_buf)
+ return -EINVAL;
+ /* reject fields that are not defined for poll */
+ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+ return -EINVAL;
+
+ INIT_WORK(&req->work, aio_poll_complete_work);
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+
+ apt.pt._qproc = aio_poll_queue_proc;
+ apt.pt._key = req->events;
+ apt.iocb = aiocb;
+ apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+ /* initialized the list so that we can do list_empty checks */
+ INIT_LIST_HEAD(&req->wait.entry);
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+ /* one for removal from waitqueue, one for this function */
+ atomic_set(&aiocb->ki_refcnt, 2);
+
+ mask = vfs_poll(req->file, &apt.pt) & req->events;
+ if (mask || apt.error) {
+ bool removed = false;
+
+ /* we did not manage to set up a waitqueue, done */
+ if (unlikely(!req->head))
+ goto out_fput;
+
+ spin_lock_irq(&req->head->lock);
+ if (!list_empty(&req->wait.entry)) {
+ list_del_init(&req->wait.entry);
+ removed = true;
+ }
+ spin_unlock_irq(&req->head->lock);
+
+ if (removed) {
+ if (apt.error)
+ goto out_fput;
+ aio_poll_complete(aiocb, mask);
+ }
+ } else {
+ spin_lock_irq(&ctx->ctx_lock);
+ if (!req->done) {
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ aiocb->ki_cancel = aio_poll_cancel;
+ }
+ spin_unlock_irq(&ctx->ctx_lock);
+ }
+
+ iocb_put(aiocb);
+ return 0;
+out_fput:
+ fput(req->file);
+ return apt.error;
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
bool compat)
{
@@ -1673,6 +1848,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
case IOCB_CMD_FDSYNC:
ret = aio_fsync(&req->fsync, &iocb, true);
break;
+ case IOCB_CMD_POLL:
+ ret = aio_poll(req, &iocb);
+ break;
default:
pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
ret = -EINVAL;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d4593a6062ef..ce43d340f010 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -38,10 +38,8 @@ enum {
IOCB_CMD_PWRITE = 1,
IOCB_CMD_FSYNC = 2,
IOCB_CMD_FDSYNC = 3,
- /* These two are experimental.
- * IOCB_CMD_PREADX = 4,
- * IOCB_CMD_POLL = 5,
- */
+ /* 4 was the experimental IOCB_CMD_PREADX */
+ IOCB_CMD_POLL = 5,
IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
--
2.18.0
^ permalink raw reply related [flat|nested] 23+ messages in thread