All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] split struct kiocb
@ 2015-01-27 17:55 Christoph Hellwig
  2015-01-27 17:55 ` [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete Christoph Hellwig
                   ` (4 more replies)
  0 siblings, 5 replies; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-27 17:55 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

This series cuts down the amount of fiels in the public iocb that is
allocated on stack for every synchronous I/O, both by removing fields
from it, and by adding a aio-specific iocb that is only allocated
for aio requests.

Additionally it cleans up various corner cases in the aio completion
code and adds a simple in-kernel async read/write interface.

Note that there still are some issues with fuse, see the first patch
for details.


^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete
  2015-01-27 17:55 [RFC] split struct kiocb Christoph Hellwig
@ 2015-01-27 17:55 ` Christoph Hellwig
  2015-01-28 15:30   ` Miklos Szeredi
  2015-01-27 17:55 ` [PATCH 2/5] fs: saner aio_complete prototype Christoph Hellwig
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-27 17:55 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

The AIO interface is fairly complex because it tries to allow
filesystems to always work async and then wakeup a synchronous
caller through aio_complete.  It turns out that basically no one
does this to avoid the complexity and context switches, with the
solve exception of fuse, so remove that possibility to simplify
the code.

XXX: fix up fuse.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/usb/gadget/function/f_fs.c |  4 ++--
 fs/aio.c                           | 24 +-----------------------
 fs/ecryptfs/file.c                 |  6 ------
 fs/fuse/file.c                     |  7 +++++--
 fs/read_write.c                    | 26 ++++++++------------------
 include/linux/aio.h                |  4 ----
 net/socket.c                       |  9 +++------
 7 files changed, 19 insertions(+), 61 deletions(-)

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 63314ed..debd78b 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -969,7 +969,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
 	if (unlikely(!io_data))
 		return -ENOMEM;
 
-	io_data->aio = true;
+	io_data->aio = !is_sync_kiocb(kiocb);
 	io_data->read = false;
 	io_data->kiocb = kiocb;
 	io_data->iovec = iovec;
@@ -1005,7 +1005,7 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
 		return -ENOMEM;
 	}
 
-	io_data->aio = true;
+	io_data->aio = !is_sync_kiocb(kiocb);
 	io_data->read = true;
 	io_data->kiocb = kiocb;
 	io_data->iovec = iovec_copy;
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893e..0ee25d6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -791,22 +791,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 	return 0;
 }
 
-/* wait_on_sync_kiocb:
- *	Waits on the given sync kiocb to complete.
- */
-ssize_t wait_on_sync_kiocb(struct kiocb *req)
-{
-	while (!req->ki_ctx) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (req->ki_ctx)
-			break;
-		io_schedule();
-	}
-	__set_current_state(TASK_RUNNING);
-	return req->ki_user_data;
-}
-EXPORT_SYMBOL(wait_on_sync_kiocb);
-
 /*
  * exit_aio: called when the last user of mm goes away.  At this point, there is
  * no way for any new requests to be submited or any of the io_* syscalls to be
@@ -1038,13 +1022,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	 *    ref, no other paths have a way to get another ref
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
-	if (is_sync_kiocb(iocb)) {
-		iocb->ki_user_data = res;
-		smp_wmb();
-		iocb->ki_ctx = ERR_PTR(-EXDEV);
-		wake_up_process(iocb->ki_obj.tsk);
-		return;
-	}
+	BUG_ON(is_sync_kiocb(iocb));
 
 	if (iocb->ki_list.next) {
 		unsigned long flags;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 6f4e659..a36da88 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -52,12 +52,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 
 	rc = generic_file_read_iter(iocb, to);
-	/*
-	 * Even though this is a async interface, we need to wait
-	 * for IO to finish to update atime
-	 */
-	if (-EIOCBQUEUED == rc)
-		rc = wait_on_sync_kiocb(iocb);
 	if (rc >= 0) {
 		path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
 		touch_atime(path);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c5..9c27312 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2841,8 +2841,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	/*
 	 * By default, we want to optimize all I/Os with async request
 	 * submission to the client filesystem if supported.
+	 *
+	 * XXX: need to add back support for this mode..
 	 */
-	io->async = async_dio;
+	io->async = async_dio && !is_sync_kiocb(iocb);
 	io->iocb = iocb;
 
 	/*
@@ -2865,7 +2867,8 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 		if (!is_sync_kiocb(iocb))
 			return -EIOCBQUEUED;
 
-		ret = wait_on_sync_kiocb(iocb);
+		// XXX: need fuse specific replacement
+//		ret = wait_on_sync_kiocb(iocb);
 	} else {
 		kfree(io);
 	}
diff --git a/fs/read_write.c b/fs/read_write.c
index ab4f26a..adab608 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -347,9 +347,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
 
 	iter->type |= READ;
 	ret = file->f_op->read_iter(&kiocb, iter);
-	if (ret == -EIOCBQUEUED)
-		ret = wait_on_sync_kiocb(&kiocb);
-
+	BUG_ON(ret == -EIOCBQUEUED);
 	if (ret > 0)
 		*ppos = kiocb.ki_pos;
 	return ret;
@@ -370,9 +368,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
 
 	iter->type |= WRITE;
 	ret = file->f_op->write_iter(&kiocb, iter);
-	if (ret == -EIOCBQUEUED)
-		ret = wait_on_sync_kiocb(&kiocb);
-
+	BUG_ON(ret == -EIOCBQUEUED);
 	if (ret > 0)
 		*ppos = kiocb.ki_pos;
 	return ret;
@@ -429,8 +425,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	kiocb.ki_nbytes = len;
 
 	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
+	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
 }
@@ -450,8 +445,7 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
 	iov_iter_init(&iter, READ, &iov, 1, len);
 
 	ret = filp->f_op->read_iter(&kiocb, &iter);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
+	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
 }
@@ -513,8 +507,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	kiocb.ki_nbytes = len;
 
 	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
+	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
 }
@@ -534,8 +527,7 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
 	iov_iter_init(&iter, WRITE, &iov, 1, len);
 
 	ret = filp->f_op->write_iter(&kiocb, &iter);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
+	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
 }
@@ -723,8 +715,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove
 
 	iov_iter_init(&iter, rw, iov, nr_segs, len);
 	ret = fn(&kiocb, &iter);
-	if (ret == -EIOCBQUEUED)
-		ret = wait_on_sync_kiocb(&kiocb);
+	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
 }
@@ -740,8 +731,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 	kiocb.ki_nbytes = len;
 
 	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
-	if (ret == -EIOCBQUEUED)
-		ret = wait_on_sync_kiocb(&kiocb);
+	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
 }
diff --git a/include/linux/aio.h b/include/linux/aio.h
index d9c92da..57c86c0 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -37,7 +37,6 @@ struct kiocb {
 
 	union {
 		void __user		*user;
-		struct task_struct	*tsk;
 	} ki_obj;
 
 	__u64			ki_user_data;	/* user's data for completion */
@@ -64,13 +63,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 	*kiocb = (struct kiocb) {
 			.ki_ctx = NULL,
 			.ki_filp = filp,
-			.ki_obj.tsk = current,
 		};
 }
 
 /* prototypes */
 #ifdef CONFIG_AIO
-extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
 extern void aio_complete(struct kiocb *iocb, long res, long res2);
 struct mm_struct;
 extern void exit_aio(struct mm_struct *mm);
@@ -78,7 +75,6 @@ extern long do_io_submit(aio_context_t ctx_id, long nr,
 			 struct iocb __user *__user *iocbpp, bool compat);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
 #else
-static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
 static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
 struct mm_struct;
 static inline void exit_aio(struct mm_struct *mm) { }
diff --git a/net/socket.c b/net/socket.c
index a2c33a4..4032931 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -642,8 +642,7 @@ static int do_sock_sendmsg(struct socket *sock, struct msghdr *msg,
 	iocb.private = &siocb;
 	ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) :
 		      __sock_sendmsg(&iocb, sock, msg, size);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&iocb);
+	BUG_ON(-EIOCBQUEUED == ret);
 	return ret;
 }
 
@@ -785,8 +784,7 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 	init_sync_kiocb(&iocb, NULL);
 	iocb.private = &siocb;
 	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&iocb);
+	BUG_ON(-EIOCBQUEUED == ret);
 	return ret;
 }
 EXPORT_SYMBOL(sock_recvmsg);
@@ -801,8 +799,7 @@ static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 	init_sync_kiocb(&iocb, NULL);
 	iocb.private = &siocb;
 	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&iocb);
+	BUG_ON(-EIOCBQUEUED == ret);
 	return ret;
 }
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 2/5] fs: saner aio_complete prototype
  2015-01-27 17:55 [RFC] split struct kiocb Christoph Hellwig
  2015-01-27 17:55 ` [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete Christoph Hellwig
@ 2015-01-27 17:55 ` Christoph Hellwig
  2015-01-31 10:04   ` Al Viro
  2015-01-27 17:55 ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-27 17:55 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

Pass the result as a ssize_t, which is what we use for these returns
all over the kernel.  Remove the res2 argument that all relevant
callers always pass as '0'.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/usb/gadget/function/f_fs.c |  2 +-
 drivers/usb/gadget/legacy/inode.c  |  5 ++---
 fs/aio.c                           | 11 +++++------
 fs/direct-io.c                     |  2 +-
 fs/fuse/file.c                     |  2 +-
 fs/nfs/direct.c                    |  6 ++----
 include/linux/aio.h                |  4 ++--
 7 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index debd78b..c30d125 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -671,7 +671,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
 		unuse_mm(io_data->mm);
 	}
 
-	aio_complete(io_data->kiocb, ret, ret);
+	aio_complete(io_data->kiocb, ret);
 
 	usb_ep_free_request(io_data->ep, io_data->req);
 
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index db49ec4..987a461 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -582,7 +582,7 @@ static void ep_user_copy_worker(struct work_struct *work)
 	unuse_mm(mm);
 
 	/* completing the iocb can drop the ctx and mm, don't touch mm after */
-	aio_complete(iocb, ret, ret);
+	aio_complete(iocb, ret);
 
 	kfree(priv->buf);
 	kfree(priv);
@@ -608,8 +608,7 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 		kfree(priv);
 		iocb->private = NULL;
 		/* aio_complete() reports bytes-transferred _and_ faults */
-		aio_complete(iocb, req->actual ? req->actual : req->status,
-				req->status);
+		aio_complete(iocb, req->actual ? req->actual : req->status);
 	} else {
 		/* ep_copy_to_user() won't report both; we hide some faults */
 		if (unlikely(0 != req->status))
diff --git a/fs/aio.c b/fs/aio.c
index 0ee25d6..52f36ee 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1007,7 +1007,7 @@ out:
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-void aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, ssize_t res)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
@@ -1051,14 +1051,13 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
 	event->data = iocb->ki_user_data;
 	event->res = res;
-	event->res2 = res2;
+	event->res2 = 0;
 
 	kunmap_atomic(ev_page);
 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 
-	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
-		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
-		 res, res2);
+	pr_debug("%p[%u]: %p: %p %Lx %zx\n",
+		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, res);
 
 	/* after flagging the request as done, we
 	 * must never even look at it again
@@ -1475,7 +1474,7 @@ rw_common:
 			     ret == -ERESTARTNOHAND ||
 			     ret == -ERESTART_RESTARTBLOCK))
 			ret = -EINTR;
-		aio_complete(req, ret, 0);
+		aio_complete(req, ret);
 	}
 
 	return 0;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b..10bea2b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -265,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
 				ret = err;
 		}
 
-		aio_complete(dio->iocb, ret, 0);
+		aio_complete(dio->iocb, ret);
 	}
 
 	kmem_cache_free(dio_cache, dio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9c27312..b670e30 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -578,7 +578,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			}
 		}
 
-		aio_complete(io->iocb, res, 0);
+		aio_complete(io->iocb, res);
 		kfree(io);
 	}
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 10bf072..9441c4c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -330,10 +330,8 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 	inode_dio_done(inode);
 
 	if (dreq->iocb) {
-		long res = (long) dreq->error;
-		if (!res)
-			res = (long) dreq->count;
-		aio_complete(dreq->iocb, res, 0);
+		aio_complete(dreq->iocb,
+			     dreq->error ? dreq->error : dreq->count);
 	}
 
 	complete_all(&dreq->completion);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 57c86c0..5657bd2 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -68,14 +68,14 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 
 /* prototypes */
 #ifdef CONFIG_AIO
-extern void aio_complete(struct kiocb *iocb, long res, long res2);
+extern void aio_complete(struct kiocb *iocb, ssize_t ret);
 struct mm_struct;
 extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
 			 struct iocb __user *__user *iocbpp, bool compat);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
 #else
-static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
+static inline void aio_complete(struct kiocb *iocb, ssize_t ret) { }
 struct mm_struct;
 static inline void exit_aio(struct mm_struct *mm) { }
 static inline long do_io_submit(aio_context_t ctx_id, long nr,
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 3/5] fs: remove ki_nbytes
  2015-01-27 17:55 [RFC] split struct kiocb Christoph Hellwig
  2015-01-27 17:55 ` [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete Christoph Hellwig
  2015-01-27 17:55 ` [PATCH 2/5] fs: saner aio_complete prototype Christoph Hellwig
@ 2015-01-27 17:55 ` Christoph Hellwig
  2015-01-31  6:08   ` Al Viro
  2015-01-27 17:55 ` [PATCH 4/5] fs: split generic and aio kiocb Christoph Hellwig
  2015-01-27 17:55 ` [PATCH 5/5] fs: add async read/write interfaces Christoph Hellwig
  4 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-27 17:55 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

There is no need to pass the total request length in the kiocb.  We
already have it in the iov_iter, and for those few methods not converted
to iov_iter we can easily calculate it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/staging/android/logger.c   |  2 +-
 drivers/usb/gadget/function/f_fs.c |  4 ++--
 drivers/usb/gadget/legacy/inode.c  | 10 +++++-----
 fs/aio.c                           | 34 ++++++++++++++++++----------------
 fs/ceph/file.c                     |  2 +-
 fs/nfs/direct.c                    |  2 +-
 fs/ocfs2/file.c                    |  8 +++-----
 fs/read_write.c                    |  8 --------
 fs/udf/file.c                      |  2 +-
 include/linux/aio.h                |  1 -
 kernel/printk/printk.c             |  2 +-
 mm/page_io.c                       |  1 -
 net/socket.c                       |  3 +--
 13 files changed, 34 insertions(+), 45 deletions(-)

diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
index a673ffa..b0dfda1 100644
--- a/drivers/staging/android/logger.c
+++ b/drivers/staging/android/logger.c
@@ -422,7 +422,7 @@ static ssize_t logger_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct timespec now;
 	size_t len, count, w_off;
 
-	count = min_t(size_t, iocb->ki_nbytes, LOGGER_ENTRY_MAX_PAYLOAD);
+	count = min_t(size_t, iov_iter_count(from), LOGGER_ENTRY_MAX_PAYLOAD);
 
 	now = current_kernel_time();
 
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index c30d125..7fc43c7 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -974,7 +974,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
 	io_data->kiocb = kiocb;
 	io_data->iovec = iovec;
 	io_data->nr_segs = nr_segs;
-	io_data->len = kiocb->ki_nbytes;
+	io_data->len = iov_length(iovec, nr_segs);
 	io_data->mm = current->mm;
 
 	kiocb->private = io_data;
@@ -1010,7 +1010,7 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
 	io_data->kiocb = kiocb;
 	io_data->iovec = iovec_copy;
 	io_data->nr_segs = nr_segs;
-	io_data->len = kiocb->ki_nbytes;
+	io_data->len = iov_length(iovec, nr_segs);
 	io_data->mm = current->mm;
 
 	kiocb->private = io_data;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 987a461..25a4e6f 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -700,16 +700,17 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t o)
 {
 	struct ep_data		*epdata = iocb->ki_filp->private_data;
+	size_t			len = iov_length(iov, nr_segs);
 	char			*buf;
 
 	if (unlikely(usb_endpoint_dir_in(&epdata->desc)))
 		return -EINVAL;
 
-	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
+	buf = kmalloc(len, GFP_KERNEL);
 	if (unlikely(!buf))
 		return -ENOMEM;
 
-	return ep_aio_rwtail(iocb, buf, iocb->ki_nbytes, epdata, iov, nr_segs);
+	return ep_aio_rwtail(iocb, buf, len, epdata, iov, nr_segs);
 }
 
 static ssize_t
@@ -717,14 +718,14 @@ ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t o)
 {
 	struct ep_data		*epdata = iocb->ki_filp->private_data;
+	size_t			len = iov_length(iov, nr_segs);
 	char			*buf;
-	size_t			len = 0;
 	int			i = 0;
 
 	if (unlikely(!usb_endpoint_dir_in(&epdata->desc)))
 		return -EINVAL;
 
-	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
+	buf = kmalloc(len, GFP_KERNEL);
 	if (unlikely(!buf))
 		return -ENOMEM;
 
@@ -734,7 +735,6 @@ ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			kfree(buf);
 			return -EFAULT;
 		}
-		len += iov[i].iov_len;
 	}
 	return ep_aio_rwtail(iocb, buf, len, epdata, NULL, 0);
 }
diff --git a/fs/aio.c b/fs/aio.c
index 52f36ee..fa079bc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1327,12 +1327,13 @@ typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
 static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 				     int rw, char __user *buf,
 				     unsigned long *nr_segs,
+				     size_t *len,
 				     struct iovec **iovec,
 				     bool compat)
 {
 	ssize_t ret;
 
-	*nr_segs = kiocb->ki_nbytes;
+	*nr_segs = *len;
 
 #ifdef CONFIG_COMPAT
 	if (compat)
@@ -1347,21 +1348,22 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 	if (ret < 0)
 		return ret;
 
-	/* ki_nbytes now reflect bytes instead of segs */
-	kiocb->ki_nbytes = ret;
+	/* len now reflect bytes instead of segs */
+	*len = ret;
 	return 0;
 }
 
 static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 				       int rw, char __user *buf,
 				       unsigned long *nr_segs,
+				       size_t len,
 				       struct iovec *iovec)
 {
-	if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
+	if (unlikely(!access_ok(!rw, buf, len)))
 		return -EFAULT;
 
 	iovec->iov_base = buf;
-	iovec->iov_len = kiocb->ki_nbytes;
+	iovec->iov_len = len;
 	*nr_segs = 1;
 	return 0;
 }
@@ -1371,7 +1373,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
  *	Performs the initial checks and io submission.
  */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
-			    char __user *buf, bool compat)
+			    char __user *buf, size_t len, bool compat)
 {
 	struct file *file = req->ki_filp;
 	ssize_t ret;
@@ -1406,21 +1408,21 @@ rw_common:
 		if (!rw_op && !iter_op)
 			return -EINVAL;
 
-		ret = (opcode == IOCB_CMD_PREADV ||
-		       opcode == IOCB_CMD_PWRITEV)
-			? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
-						&iovec, compat)
-			: aio_setup_single_vector(req, rw, buf, &nr_segs,
-						  iovec);
+		if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+			ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+						&len, &iovec, compat);
+		else
+			ret = aio_setup_single_vector(req, rw, buf, &nr_segs,
+						  len, iovec);
 		if (!ret)
-			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+			ret = rw_verify_area(rw, file, &req->ki_pos, len);
 		if (ret < 0) {
 			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
 
-		req->ki_nbytes = ret;
+		len = ret;
 
 		/* XXX: move/kill - rw_verify_area()? */
 		/* This matches the pread()/pwrite() logic */
@@ -1433,7 +1435,7 @@ rw_common:
 			file_start_write(file);
 
 		if (iter_op) {
-			iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
+			iov_iter_init(&iter, rw, iovec, nr_segs, len);
 			ret = iter_op(req, &iter);
 		} else {
 			ret = rw_op(req, iovec, nr_segs, req->ki_pos);
@@ -1536,10 +1538,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_obj.user = user_iocb;
 	req->ki_user_data = iocb->aio_data;
 	req->ki_pos = iocb->aio_offset;
-	req->ki_nbytes = iocb->aio_nbytes;
 
 	ret = aio_run_iocb(req, iocb->aio_lio_opcode,
 			   (char __user *)(unsigned long)iocb->aio_buf,
+			   iocb->aio_nbytes,
 			   compat);
 	if (ret)
 		goto out_put_req;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b39..0c93f62 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -807,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *filp = iocb->ki_filp;
 	struct ceph_file_info *fi = filp->private_data;
-	size_t len = iocb->ki_nbytes;
+	size_t len = iov_iter_count(to);
 	struct inode *inode = file_inode(filp);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct page *pinned_page = NULL;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9441c4c..c416e84 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -218,7 +218,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
 
 	return -EINVAL;
 #else
-	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
 	if (rw == READ)
 		return nfs_file_direct_read(iocb, iter, pos);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693..5f3fbecf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2254,7 +2254,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 		file->f_path.dentry->d_name.name,
 		(unsigned int)from->nr_segs);	/* GRRRRR */
 
-	if (iocb->ki_nbytes == 0)
+	if (count == 0)
 		return 0;
 
 	appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -2304,8 +2304,7 @@ relock:
 	}
 
 	can_do_direct = direct_io;
-	ret = ocfs2_prepare_inode_for_write(file, ppos,
-					    iocb->ki_nbytes, appending,
+	ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending,
 					    &can_do_direct, &has_refcount);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2313,8 +2312,7 @@ relock:
 	}
 
 	if (direct_io && !is_sync_kiocb(iocb))
-		unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
-						      *ppos);
+		unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos);
 
 	/*
 	 * We can't complete the direct I/O as requested, fall back to
diff --git a/fs/read_write.c b/fs/read_write.c
index adab608..955cad3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -343,7 +343,6 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
 
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = iov_iter_count(iter);
 
 	iter->type |= READ;
 	ret = file->f_op->read_iter(&kiocb, iter);
@@ -364,7 +363,6 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
 
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = iov_iter_count(iter);
 
 	iter->type |= WRITE;
 	ret = file->f_op->write_iter(&kiocb, iter);
@@ -422,7 +420,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = len;
 
 	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 	BUG_ON(ret == -EIOCBQUEUED);
@@ -441,7 +438,6 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = len;
 	iov_iter_init(&iter, READ, &iov, 1, len);
 
 	ret = filp->f_op->read_iter(&kiocb, &iter);
@@ -504,7 +500,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = len;
 
 	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 	BUG_ON(ret == -EIOCBQUEUED);
@@ -523,7 +518,6 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = len;
 	iov_iter_init(&iter, WRITE, &iov, 1, len);
 
 	ret = filp->f_op->write_iter(&kiocb, &iter);
@@ -711,7 +705,6 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = len;
 
 	iov_iter_init(&iter, rw, iov, nr_segs, len);
 	ret = fn(&kiocb, &iter);
@@ -728,7 +721,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_nbytes = len;
 
 	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 	BUG_ON(ret == -EIOCBQUEUED);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bb15771..c3d7ce0 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	int err, pos;
-	size_t count = iocb->ki_nbytes;
+	size_t count = iov_iter_count(from);
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	mutex_lock(&inode->i_mutex);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 5657bd2..46ead10 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -41,7 +41,6 @@ struct kiocb {
 
 	__u64			ki_user_data;	/* user's data for completion */
 	loff_t			ki_pos;
-	size_t			ki_nbytes;	/* copy of iocb->aio_nbytes */
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02d6b6d..a87301c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 	int i;
 	int level = default_message_loglevel;
 	int facility = 1;	/* LOG_USER */
-	size_t len = iocb->ki_nbytes;
+	size_t len = iov_iter_count(from);
 	ssize_t ret = len;
 
 	if (len > LOG_LINE_MAX)
diff --git a/mm/page_io.c b/mm/page_io.c
index e604580..7ef2157 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -274,7 +274,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
 		init_sync_kiocb(&kiocb, swap_file);
 		kiocb.ki_pos = page_file_offset(page);
-		kiocb.ki_nbytes = PAGE_SIZE;
 
 		set_page_writeback(page);
 		unlock_page(page);
diff --git a/net/socket.c b/net/socket.c
index 4032931..121b976 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -903,10 +903,9 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	if (pos != 0)
 		return -ESPIPE;
 
-	if (iocb->ki_nbytes == 0)	/* Match SYS5 behaviour */
+	if (!iov_length(iov, nr_segs))	/* Match SYS5 behaviour */
 		return 0;
 
-
 	x = alloc_sock_iocb(iocb, &siocb);
 	if (!x)
 		return -ENOMEM;
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 4/5] fs: split generic and aio kiocb
  2015-01-27 17:55 [RFC] split struct kiocb Christoph Hellwig
                   ` (2 preceding siblings ...)
  2015-01-27 17:55 ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
@ 2015-01-27 17:55 ` Christoph Hellwig
  2015-01-27 17:55 ` [PATCH 5/5] fs: add async read/write interfaces Christoph Hellwig
  4 siblings, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-27 17:55 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

Most callers in the kernel want to perform synchronous file I/O, but
still have to bloat the stack with a full struct kiocb.  Split out
the parts needed in filesystem code from those in the aio code, and
only allocate those needed to pass down argument on the stack.  The
aio code embedds the generic iocb in the one it allocates and can
easily get back to it by using container_of.

Also add a ->complete method to struct kiocb, this is used to call
into the aio code and thus removes the dependency on aio for filesystems
impementing asynchronous operations.  It will also allow other callers
to substitute their own completion callback.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/usb/gadget/function/f_fs.c |  2 +-
 drivers/usb/gadget/legacy/inode.c  |  4 +-
 fs/aio.c                           | 90 ++++++++++++++++++++++++++------------
 fs/direct-io.c                     |  4 +-
 fs/fuse/file.c                     |  2 +-
 fs/nfs/direct.c                    |  2 +-
 include/linux/aio.h                | 43 ++----------------
 7 files changed, 72 insertions(+), 75 deletions(-)

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 7fc43c7..d00042e 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -671,7 +671,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
 		unuse_mm(io_data->mm);
 	}
 
-	aio_complete(io_data->kiocb, ret);
+	io_data->kiocb->complete(io_data->kiocb, ret);
 
 	usb_ep_free_request(io_data->ep, io_data->req);
 
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 25a4e6f..6141158 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -582,7 +582,7 @@ static void ep_user_copy_worker(struct work_struct *work)
 	unuse_mm(mm);
 
 	/* completing the iocb can drop the ctx and mm, don't touch mm after */
-	aio_complete(iocb, ret);
+	iocb->complete(iocb, ret);
 
 	kfree(priv->buf);
 	kfree(priv);
@@ -608,7 +608,7 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 		kfree(priv);
 		iocb->private = NULL;
 		/* aio_complete() reports bytes-transferred _and_ faults */
-		aio_complete(iocb, req->actual ? req->actual : req->status);
+		iocb->complete(iocb, req->actual ? req->actual : req->status);
 	} else {
 		/* ep_copy_to_user() won't report both; we hide some faults */
 		if (unlikely(0 != req->status))
diff --git a/fs/aio.c b/fs/aio.c
index fa079bc..228afef 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -151,6 +151,38 @@ struct kioctx {
 	unsigned		id;
 };
 
+/*
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+ */
+#define KIOCB_CANCELLED		((void *) (~0ULL))
+
+struct aio_kiocb {
+	struct kiocb		common;
+
+	struct kioctx		*ki_ctx;
+	kiocb_cancel_fn		*ki_cancel;
+
+	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
+	__u64			ki_user_data;	/* user's data for completion */
+
+	struct list_head	ki_list;	/* the aio core uses this
+						 * for cancellation */
+
+	/*
+	 * If the aio_resfd field of the userspace iocb is not zero,
+	 * this is the underlying eventfd context to deliver events to.
+	 */
+	struct eventfd_ctx	*ki_eventfd;
+};
+
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;		/* current system wide number of aio requests */
@@ -233,7 +265,7 @@ static int __init aio_setup(void)
 	if (bdi_init(&aio_fs_backing_dev_info))
 		panic("Failed to init aio fs backing dev info.");
 
-	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@ -493,8 +525,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
+	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
@@ -509,7 +542,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct kiocb *kiocb)
+static int kiocb_cancel(struct aio_kiocb *kiocb)
 {
 	kiocb_cancel_fn *old, *cancel;
 
@@ -527,7 +560,7 @@ static int kiocb_cancel(struct kiocb *kiocb)
 		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
 	} while (cancel != old);
 
-	return cancel(kiocb);
+	return cancel(&kiocb->common);
 }
 
 static void free_ioctx(struct work_struct *work)
@@ -563,13 +596,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 static void free_ioctx_users(struct percpu_ref *ref)
 {
 	struct kioctx *ctx = container_of(ref, struct kioctx, users);
-	struct kiocb *req;
+	struct aio_kiocb *req;
 
 	spin_lock_irq(&ctx->ctx_lock);
 
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
-				       struct kiocb, ki_list);
+				       struct aio_kiocb, ki_list);
 
 		list_del_init(&req->ki_list);
 		kiocb_cancel(req);
@@ -945,9 +978,9 @@ static void user_refill_reqs_available(struct kioctx *ctx)
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
  */
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 {
-	struct kiocb *req;
+	struct aio_kiocb *req;
 
 	if (!get_reqs_available(ctx)) {
 		user_refill_reqs_available(ctx);
@@ -968,10 +1001,10 @@ out_put:
 	return NULL;
 }
 
-static void kiocb_free(struct kiocb *req)
+static void kiocb_free(struct aio_kiocb *req)
 {
-	if (req->ki_filp)
-		fput(req->ki_filp);
+	if (req->common.ki_filp)
+		fput(req->common.ki_filp);
 	if (req->ki_eventfd != NULL)
 		eventfd_ctx_put(req->ki_eventfd);
 	kmem_cache_free(kiocb_cachep, req);
@@ -1007,8 +1040,9 @@ out:
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-void aio_complete(struct kiocb *iocb, ssize_t res)
+static void aio_complete(struct kiocb *kiocb, ssize_t res)
 {
+	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
@@ -1022,7 +1056,7 @@ void aio_complete(struct kiocb *iocb, ssize_t res)
 	 *    ref, no other paths have a way to get another ref
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
-	BUG_ON(is_sync_kiocb(iocb));
+	BUG_ON(is_sync_kiocb(kiocb));
 
 	if (iocb->ki_list.next) {
 		unsigned long flags;
@@ -1048,7 +1082,7 @@ void aio_complete(struct kiocb *iocb, ssize_t res)
 	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
 
-	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
 	event->data = iocb->ki_user_data;
 	event->res = res;
 	event->res2 = 0;
@@ -1057,7 +1091,7 @@ void aio_complete(struct kiocb *iocb, ssize_t res)
 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 
 	pr_debug("%p[%u]: %p: %p %Lx %zx\n",
-		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, res);
+		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, res);
 
 	/* after flagging the request as done, we
 	 * must never even look at it again
@@ -1485,7 +1519,7 @@ rw_common:
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
-	struct kiocb *req;
+	struct aio_kiocb *req;
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
@@ -1508,11 +1542,13 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	req->ki_filp = fget(iocb->aio_fildes);
-	if (unlikely(!req->ki_filp)) {
+	req->common.ki_filp = fget(iocb->aio_fildes);
+	if (unlikely(!req->common.ki_filp)) {
 		ret = -EBADF;
 		goto out_put_req;
 	}
+	req->common.ki_pos = iocb->aio_offset;
+	req->common.complete = aio_complete;
 
 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
@@ -1535,11 +1571,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 	}
 
-	req->ki_obj.user = user_iocb;
+	req->ki_user_iocb = user_iocb;
 	req->ki_user_data = iocb->aio_data;
-	req->ki_pos = iocb->aio_offset;
 
-	ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+	ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
 			   (char __user *)(unsigned long)iocb->aio_buf,
 			   iocb->aio_nbytes,
 			   compat);
@@ -1628,10 +1663,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 /* lookup_kiocb
  *	Finds a given iocb for cancellation.
  */
-static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
-				  u32 key)
+static struct aio_kiocb *
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
 {
-	struct list_head *pos;
+	struct aio_kiocb *kiocb;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
@@ -1639,9 +1674,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 		return NULL;
 
 	/* TODO: use a hash or array, this sucks. */
-	list_for_each(pos, &ctx->active_reqs) {
-		struct kiocb *kiocb = list_kiocb(pos);
-		if (kiocb->ki_obj.user == iocb)
+	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+		if (kiocb->ki_user_iocb == iocb)
 			return kiocb;
 	}
 	return NULL;
@@ -1661,7 +1695,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		struct io_event __user *, result)
 {
 	struct kioctx *ctx;
-	struct kiocb *kiocb;
+	struct aio_kiocb *kiocb;
 	u32 key;
 	int ret;
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 10bea2b..73ebef0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -265,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
 				ret = err;
 		}
 
-		aio_complete(dio->iocb, ret);
+		dio->iocb->complete(dio->iocb, ret);
 	}
 
 	kmem_cache_free(dio_cache, dio);
@@ -1056,7 +1056,7 @@ static inline int drop_refcount(struct dio *dio)
 	 * operation.  AIO can if it was a broken operation described above or
 	 * in fact if all the bios race to complete before we get here.  In
 	 * that case dio_complete() translates the EIOCBQUEUED into the proper
-	 * return code that the caller will hand to aio_complete().
+	 * return code that the caller will hand to ->complete().
 	 *
 	 * This is managed by the bio_lock instead of being an atomic_t so that
 	 * completion paths can drop their ref and use the remaining count to
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b670e30..d2be147 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -578,7 +578,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			}
 		}
 
-		aio_complete(io->iocb, res);
+		io->iocb->complete(io->iocb, res);
 		kfree(io);
 	}
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c416e84..3c67a56 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -330,7 +330,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 	inode_dio_done(inode);
 
 	if (dreq->iocb) {
-		aio_complete(dreq->iocb,
+		dreq->iocb->complete(dreq->iocb,
 			     dreq->error ? dreq->error : dreq->count);
 	}
 
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 46ead10..00d6e0c 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -14,67 +14,35 @@ struct kiocb;
 
 #define KIOCB_KEY		0
 
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED		((void *) (~0ULL))
-
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
 struct kiocb {
 	struct file		*ki_filp;
-	struct kioctx		*ki_ctx;	/* NULL for sync ops */
-	kiocb_cancel_fn		*ki_cancel;
-	void			*private;
-
-	union {
-		void __user		*user;
-	} ki_obj;
-
-	__u64			ki_user_data;	/* user's data for completion */
 	loff_t			ki_pos;
-
-	struct list_head	ki_list;	/* the aio core uses this
-						 * for cancellation */
-
-	/*
-	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying eventfd context to deliver events to.
-	 */
-	struct eventfd_ctx	*ki_eventfd;
+	void (*complete)(struct kiocb *iocb, ssize_t ret);
+	void			*private;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
 {
-	return kiocb->ki_ctx == NULL;
+	return kiocb->complete == NULL;
 }
 
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
 	*kiocb = (struct kiocb) {
-			.ki_ctx = NULL,
 			.ki_filp = filp,
 		};
 }
 
 /* prototypes */
 #ifdef CONFIG_AIO
-extern void aio_complete(struct kiocb *iocb, ssize_t ret);
 struct mm_struct;
 extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
 			 struct iocb __user *__user *iocbpp, bool compat);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
 #else
-static inline void aio_complete(struct kiocb *iocb, ssize_t ret) { }
 struct mm_struct;
 static inline void exit_aio(struct mm_struct *mm) { }
 static inline long do_io_submit(aio_context_t ctx_id, long nr,
@@ -84,11 +52,6 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
-static inline struct kiocb *list_kiocb(struct list_head *h)
-{
-	return list_entry(h, struct kiocb, ki_list);
-}
-
 /* for sysctl: */
 extern unsigned long aio_nr;
 extern unsigned long aio_max_nr;
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 5/5] fs: add async read/write interfaces
  2015-01-27 17:55 [RFC] split struct kiocb Christoph Hellwig
                   ` (3 preceding siblings ...)
  2015-01-27 17:55 ` [PATCH 4/5] fs: split generic and aio kiocb Christoph Hellwig
@ 2015-01-27 17:55 ` Christoph Hellwig
  2015-01-31  6:29   ` Al Viro
  4 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-27 17:55 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/read_write.c    | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  5 +++++
 2 files changed, 42 insertions(+)

diff --git a/fs/read_write.c b/fs/read_write.c
index 955cad3..3add15e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -373,6 +373,43 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
 }
 EXPORT_SYMBOL(vfs_iter_write);
 
+void init_kernel_kiocb(struct kiocb *iocb, struct file *file, loff_t pos,
+		void (*complete)(struct kiocb *iocb, ssize_t ret))
+{
+	iocb->ki_filp = file;
+	iocb->ki_pos = pos;
+	iocb->complete = complete;
+}
+EXPORT_SYMBOL_GPL(init_kernel_kiocb);
+
+ssize_t vfs_iter_read_async(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t ret;
+
+	if (!iocb->ki_filp->f_op->read_iter)
+		return -EINVAL;
+
+	ret = iocb->ki_filp->f_op->read_iter(iocb, iter);
+	if (ret != -EIOCBQUEUED)
+		iocb->complete(iocb, ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfs_iter_read_async);
+
+ssize_t vfs_iter_write_async(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t ret;
+
+	if (!iocb->ki_filp->f_op->write_iter)
+		return -EINVAL;
+
+	ret = iocb->ki_filp->f_op->write_iter(iocb, iter);
+	if (ret != -EIOCBQUEUED)
+		iocb->complete(iocb, ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfs_iter_write_async);
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 175ed09..3be55d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2491,6 +2491,11 @@ extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos);
 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos);
+ssize_t vfs_iter_read_async(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t vfs_iter_write_async(struct kiocb *iocb, struct iov_iter *iter);
+
+void init_kernel_kiocb(struct kiocb *iocb, struct file *file, loff_t pos,
+		void (*complete)(struct kiocb *iocb, ssize_t ret));
 
 /* fs/block_dev.c */
 extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete
  2015-01-27 17:55 ` [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete Christoph Hellwig
@ 2015-01-28 15:30   ` Miklos Szeredi
  2015-01-28 16:57     ` Christoph Hellwig
  0 siblings, 1 reply; 45+ messages in thread
From: Miklos Szeredi @ 2015-01-28 15:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Al Viro, linux-aio, linux-fsdevel, Maxim Patlasov

Adding Maxim Patlasov, who did the AIO mode in fuse.


On Tue, 2015-01-27 at 18:55 +0100, Christoph Hellwig wrote:
> The AIO interface is fairly complex because it tries to allow
> filesystems to always work async and then wakeup a synchronous
> caller through aio_complete.  It turns out that basically no one
> does this to avoid the complexity and context switches, with the
> solve exception of fuse, so remove that possibility to simplify
> the code.
> 
> XXX: fix up fuse.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/usb/gadget/function/f_fs.c |  4 ++--
>  fs/aio.c                           | 24 +-----------------------
>  fs/ecryptfs/file.c                 |  6 ------
>  fs/fuse/file.c                     |  7 +++++--
>  fs/read_write.c                    | 26 ++++++++------------------
>  include/linux/aio.h                |  4 ----
>  net/socket.c                       |  9 +++------
>  7 files changed, 19 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
> index 63314ed..debd78b 100644
> --- a/drivers/usb/gadget/function/f_fs.c
> +++ b/drivers/usb/gadget/function/f_fs.c
> @@ -969,7 +969,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
>  	if (unlikely(!io_data))
>  		return -ENOMEM;
>  
> -	io_data->aio = true;
> +	io_data->aio = !is_sync_kiocb(kiocb);
>  	io_data->read = false;
>  	io_data->kiocb = kiocb;
>  	io_data->iovec = iovec;
> @@ -1005,7 +1005,7 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
>  		return -ENOMEM;
>  	}
>  
> -	io_data->aio = true;
> +	io_data->aio = !is_sync_kiocb(kiocb);
>  	io_data->read = true;
>  	io_data->kiocb = kiocb;
>  	io_data->iovec = iovec_copy;
> diff --git a/fs/aio.c b/fs/aio.c
> index 1b7893e..0ee25d6 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -791,22 +791,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
>  	return 0;
>  }
>  
> -/* wait_on_sync_kiocb:
> - *	Waits on the given sync kiocb to complete.
> - */
> -ssize_t wait_on_sync_kiocb(struct kiocb *req)
> -{
> -	while (!req->ki_ctx) {
> -		set_current_state(TASK_UNINTERRUPTIBLE);
> -		if (req->ki_ctx)
> -			break;
> -		io_schedule();
> -	}
> -	__set_current_state(TASK_RUNNING);
> -	return req->ki_user_data;
> -}
> -EXPORT_SYMBOL(wait_on_sync_kiocb);
> -
>  /*
>   * exit_aio: called when the last user of mm goes away.  At this point, there is
>   * no way for any new requests to be submited or any of the io_* syscalls to be
> @@ -1038,13 +1022,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
>  	 *    ref, no other paths have a way to get another ref
>  	 *  - the sync task helpfully left a reference to itself in the iocb
>  	 */
> -	if (is_sync_kiocb(iocb)) {
> -		iocb->ki_user_data = res;
> -		smp_wmb();
> -		iocb->ki_ctx = ERR_PTR(-EXDEV);
> -		wake_up_process(iocb->ki_obj.tsk);
> -		return;
> -	}
> +	BUG_ON(is_sync_kiocb(iocb));
>  
>  	if (iocb->ki_list.next) {
>  		unsigned long flags;
> diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
> index 6f4e659..a36da88 100644
> --- a/fs/ecryptfs/file.c
> +++ b/fs/ecryptfs/file.c
> @@ -52,12 +52,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
>  	struct file *file = iocb->ki_filp;
>  
>  	rc = generic_file_read_iter(iocb, to);
> -	/*
> -	 * Even though this is a async interface, we need to wait
> -	 * for IO to finish to update atime
> -	 */
> -	if (-EIOCBQUEUED == rc)
> -		rc = wait_on_sync_kiocb(iocb);
>  	if (rc >= 0) {
>  		path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
>  		touch_atime(path);
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 760b2c5..9c27312 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -2841,8 +2841,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
>  	/*
>  	 * By default, we want to optimize all I/Os with async request
>  	 * submission to the client filesystem if supported.
> +	 *
> +	 * XXX: need to add back support for this mode..
>  	 */
> -	io->async = async_dio;
> +	io->async = async_dio && !is_sync_kiocb(iocb);
>  	io->iocb = iocb;
>  
>  	/*
> @@ -2865,7 +2867,8 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
>  		if (!is_sync_kiocb(iocb))
>  			return -EIOCBQUEUED;
>  
> -		ret = wait_on_sync_kiocb(iocb);
> +		// XXX: need fuse specific replacement
> +//		ret = wait_on_sync_kiocb(iocb);
>  	} else {
>  		kfree(io);
>  	}
> diff --git a/fs/read_write.c b/fs/read_write.c
> index ab4f26a..adab608 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -347,9 +347,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
>  
>  	iter->type |= READ;
>  	ret = file->f_op->read_iter(&kiocb, iter);
> -	if (ret == -EIOCBQUEUED)
> -		ret = wait_on_sync_kiocb(&kiocb);
> -
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	if (ret > 0)
>  		*ppos = kiocb.ki_pos;
>  	return ret;
> @@ -370,9 +368,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
>  
>  	iter->type |= WRITE;
>  	ret = file->f_op->write_iter(&kiocb, iter);
> -	if (ret == -EIOCBQUEUED)
> -		ret = wait_on_sync_kiocb(&kiocb);
> -
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	if (ret > 0)
>  		*ppos = kiocb.ki_pos;
>  	return ret;
> @@ -429,8 +425,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
>  	kiocb.ki_nbytes = len;
>  
>  	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&kiocb);
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	*ppos = kiocb.ki_pos;
>  	return ret;
>  }
> @@ -450,8 +445,7 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
>  	iov_iter_init(&iter, READ, &iov, 1, len);
>  
>  	ret = filp->f_op->read_iter(&kiocb, &iter);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&kiocb);
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	*ppos = kiocb.ki_pos;
>  	return ret;
>  }
> @@ -513,8 +507,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
>  	kiocb.ki_nbytes = len;
>  
>  	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&kiocb);
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	*ppos = kiocb.ki_pos;
>  	return ret;
>  }
> @@ -534,8 +527,7 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
>  	iov_iter_init(&iter, WRITE, &iov, 1, len);
>  
>  	ret = filp->f_op->write_iter(&kiocb, &iter);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&kiocb);
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	*ppos = kiocb.ki_pos;
>  	return ret;
>  }
> @@ -723,8 +715,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove
>  
>  	iov_iter_init(&iter, rw, iov, nr_segs, len);
>  	ret = fn(&kiocb, &iter);
> -	if (ret == -EIOCBQUEUED)
> -		ret = wait_on_sync_kiocb(&kiocb);
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	*ppos = kiocb.ki_pos;
>  	return ret;
>  }
> @@ -740,8 +731,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
>  	kiocb.ki_nbytes = len;
>  
>  	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
> -	if (ret == -EIOCBQUEUED)
> -		ret = wait_on_sync_kiocb(&kiocb);
> +	BUG_ON(ret == -EIOCBQUEUED);
>  	*ppos = kiocb.ki_pos;
>  	return ret;
>  }
> diff --git a/include/linux/aio.h b/include/linux/aio.h
> index d9c92da..57c86c0 100644
> --- a/include/linux/aio.h
> +++ b/include/linux/aio.h
> @@ -37,7 +37,6 @@ struct kiocb {
>  
>  	union {
>  		void __user		*user;
> -		struct task_struct	*tsk;
>  	} ki_obj;
>  
>  	__u64			ki_user_data;	/* user's data for completion */
> @@ -64,13 +63,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
>  	*kiocb = (struct kiocb) {
>  			.ki_ctx = NULL,
>  			.ki_filp = filp,
> -			.ki_obj.tsk = current,
>  		};
>  }
>  
>  /* prototypes */
>  #ifdef CONFIG_AIO
> -extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
>  extern void aio_complete(struct kiocb *iocb, long res, long res2);
>  struct mm_struct;
>  extern void exit_aio(struct mm_struct *mm);
> @@ -78,7 +75,6 @@ extern long do_io_submit(aio_context_t ctx_id, long nr,
>  			 struct iocb __user *__user *iocbpp, bool compat);
>  void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
>  #else
> -static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
>  static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
>  struct mm_struct;
>  static inline void exit_aio(struct mm_struct *mm) { }
> diff --git a/net/socket.c b/net/socket.c
> index a2c33a4..4032931 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -642,8 +642,7 @@ static int do_sock_sendmsg(struct socket *sock, struct msghdr *msg,
>  	iocb.private = &siocb;
>  	ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) :
>  		      __sock_sendmsg(&iocb, sock, msg, size);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&iocb);
> +	BUG_ON(-EIOCBQUEUED == ret);
>  	return ret;
>  }
>  
> @@ -785,8 +784,7 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
>  	init_sync_kiocb(&iocb, NULL);
>  	iocb.private = &siocb;
>  	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&iocb);
> +	BUG_ON(-EIOCBQUEUED == ret);
>  	return ret;
>  }
>  EXPORT_SYMBOL(sock_recvmsg);
> @@ -801,8 +799,7 @@ static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
>  	init_sync_kiocb(&iocb, NULL);
>  	iocb.private = &siocb;
>  	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
> -	if (-EIOCBQUEUED == ret)
> -		ret = wait_on_sync_kiocb(&iocb);
> +	BUG_ON(-EIOCBQUEUED == ret);
>  	return ret;
>  }
>  


--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete
  2015-01-28 15:30   ` Miklos Szeredi
@ 2015-01-28 16:57     ` Christoph Hellwig
  2015-01-31  3:01       ` Maxim Patlasov
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-01-28 16:57 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Al Viro, linux-aio, linux-fsdevel, Maxim Patlasov

On Wed, Jan 28, 2015 at 04:30:25PM +0100, Miklos Szeredi wrote:
> Adding Maxim Patlasov, who did the AIO mode in fuse.

FYI, I don't think fuse should be too hard, it just needs a completion
in struct fuse_io_priv.  I just didn't bother before getting some higher
level review for the concept.

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete
  2015-01-28 16:57     ` Christoph Hellwig
@ 2015-01-31  3:01       ` Maxim Patlasov
  0 siblings, 0 replies; 45+ messages in thread
From: Maxim Patlasov @ 2015-01-31  3:01 UTC (permalink / raw)
  To: hch; +Cc: linux-fsdevel, linux-aio, mszeredi, viro

On 01/28/2015 08:57 AM, Christoph Hellwig wrote:
> On Wed, Jan 28, 2015 at 04:30:25PM +0100, Miklos Szeredi wrote:
>> Adding Maxim Patlasov, who did the AIO mode in fuse.
>
> FYI, I don't think fuse should be too hard, it just needs a completion
> in struct fuse_io_priv.  I just didn't bother before getting some higher
> level review for the concept.

Yes, I agree. Here is an example implementation below.

---
 fs/fuse/file.c   |   75 ++++++++++++++++++++++++++++++++++++++----------------
 fs/fuse/fuse_i.h |    1 +
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d2be147..b82207a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -528,6 +528,18 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 	}
 }
 
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+	if (io->err)
+		return io->err;
+
+	if (io->bytes >= 0 && io->write)
+		return -EIO;
+
+	return io->bytes < 0 ? io->size : io->bytes;
+
+}
+
 /**
  * In case of short read, the caller sets 'pos' to the position of
  * actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -547,6 +559,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 {
 	int left;
+	bool is_sync = is_sync_kiocb(io->iocb);
 
 	spin_lock(&io->lock);
 	if (err)
@@ -555,27 +568,25 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 		io->bytes = pos;
 
 	left = --io->reqs;
-	spin_unlock(&io->lock);
 
-	if (!left) {
-		long res;
+	if (!left && is_sync) {
+		if (io->waiter)
+			wake_up_process(io->waiter);
+	}
 
-		if (io->err)
-			res = io->err;
-		else if (io->bytes >= 0 && io->write)
-			res = -EIO;
-		else {
-			res = io->bytes < 0 ? io->size : io->bytes;
+	spin_unlock(&io->lock);
 
-			if (!is_sync_kiocb(io->iocb)) {
-				struct inode *inode = file_inode(io->iocb->ki_filp);
-				struct fuse_conn *fc = get_fuse_conn(inode);
-				struct fuse_inode *fi = get_fuse_inode(inode);
+	if (!left && !is_sync) {
+		ssize_t res = fuse_get_res_by_io(io);
 
-				spin_lock(&fc->lock);
-				fi->attr_version = ++fc->attr_version;
-				spin_unlock(&fc->lock);
-			}
+		if (res >= 0) {
+			struct inode *inode = file_inode(io->iocb->ki_filp);
+			struct fuse_conn *fc = get_fuse_conn(inode);
+			struct fuse_inode *fi = get_fuse_inode(inode);
+
+			spin_lock(&fc->lock);
+			fi->attr_version = ++fc->attr_version;
+			spin_unlock(&fc->lock);
 		}
 
 		io->iocb->complete(io->iocb, res);
@@ -2798,6 +2809,29 @@ static inline loff_t fuse_round_up(loff_t off)
 	return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 }
 
+static ssize_t fuse_dio_wait(struct fuse_io_priv *io)
+{
+	ssize_t res;
+
+	spin_lock(&io->lock);
+
+	while (io->reqs) {
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		io->waiter = current;
+		spin_unlock(&io->lock);
+		io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		spin_lock(&io->lock);
+		io->waiter = NULL;
+	}
+
+	spin_unlock(&io->lock);
+
+	res = fuse_get_res_by_io(io);
+	kfree(io);
+	return res;
+}
+
 static ssize_t
 fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 			loff_t offset)
@@ -2841,10 +2875,8 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	/*
 	 * By default, we want to optimize all I/Os with async request
 	 * submission to the client filesystem if supported.
-	 *
-	 * XXX: need to add back support for this mode..
 	 */
-	io->async = async_dio && !is_sync_kiocb(iocb);
+ 	io->async = async_dio;
 	io->iocb = iocb;
 
 	/*
@@ -2867,8 +2899,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 		if (!is_sync_kiocb(iocb))
 			return -EIOCBQUEUED;
 
-		// XXX: need fuse specific replacement
-//		ret = wait_on_sync_kiocb(iocb);
+		ret = fuse_dio_wait(io);
 	} else {
 		kfree(io);
 	}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1cdfb07..8d4872b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -263,6 +263,7 @@ struct fuse_io_priv {
 	int err;
 	struct kiocb *iocb;
 	struct file *file;
+	struct task_struct *waiter;
 };
 
 /**

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-01-27 17:55 ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
@ 2015-01-31  6:08   ` Al Viro
  2015-02-02  8:07     ` Christoph Hellwig
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-01-31  6:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

> @@ -717,14 +718,14 @@ ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  		unsigned long nr_segs, loff_t o)
>  {
>  	struct ep_data		*epdata = iocb->ki_filp->private_data;
> +	size_t			len = iov_length(iov, nr_segs);
>  	char			*buf;
> -	size_t			len = 0;
>  	int			i = 0;
>  
>  	if (unlikely(!usb_endpoint_dir_in(&epdata->desc)))
>  		return -EINVAL;
>  
> -	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
> +	buf = kmalloc(len, GFP_KERNEL);
>  	if (unlikely(!buf))
>  		return -ENOMEM;
>  
> @@ -734,7 +735,6 @@ ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  			kfree(buf);
>  			return -EFAULT;
>  		}
> -		len += iov[i].iov_len;
>  	}
>  	return ep_aio_rwtail(iocb, buf, len, epdata, NULL, 0);

WTF bother?  Just switch it to ->write_iter():

static ssize_t
ep_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct ep_data *epdata = iocb->ki_filp->private_data;
	size_t len = iov_iter_count(from);
	void *buf;

	if (unlikely(!usb_endpoint_dir_in(&epdata->desc)))
		return -EINVAL;

	buf = kmalloc(iov_iter_count(from), GFP_KERNEL);
	if (unlikely(!buf))
		return -ENOMEM;

	if (copy_from_iter(buf, from, len) != len) {
		kfree(buf);
		return -EFAULT;
	}

	return ep_aio_rwtail(iocb, buf, len, epdata, NULL, 0);
}

and be done with that.  I'll put drivers/usb/gadget patches into a stable
branch and ask use folks to pull from it - that's the simplest of this
series, actually...

> @@ -903,10 +903,9 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
>  	if (pos != 0)
>  		return -ESPIPE;
>  
> -	if (iocb->ki_nbytes == 0)	/* Match SYS5 behaviour */
> +	if (!iov_length(iov, nr_segs))	/* Match SYS5 behaviour */
>  		return 0;

FWIW, it's switched to ->read_iter/->write_iter in vfs.git#for-davem.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 5/5] fs: add async read/write interfaces
  2015-01-27 17:55 ` [PATCH 5/5] fs: add async read/write interfaces Christoph Hellwig
@ 2015-01-31  6:29   ` Al Viro
  0 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-01-31  6:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

On Tue, Jan 27, 2015 at 06:55:13PM +0100, Christoph Hellwig wrote:
> +void init_kernel_kiocb(struct kiocb *iocb, struct file *file, loff_t pos,
> +		void (*complete)(struct kiocb *iocb, ssize_t ret))
> +{
> +	iocb->ki_filp = file;
> +	iocb->ki_pos = pos;
> +	iocb->complete = complete;
> +}
> +EXPORT_SYMBOL_GPL(init_kernel_kiocb);

Could we please stop that nonsense?  Any non-GPL module that decides to
use that will simply open-code this oh-so-valuable piece of intellectual
property - all three assignments worth of it.

EXPORT_SYMBOL_GPL() is usually silly posturing - it's borderline
defensible if you are exporting deep guts of otherwise internal objects
and want to limit the scope of damage (but in that case you'd better
have a very good reason for having an export at all), but in cases
like this it's something better kept to alt.sex.masturbation.

I'm not fond of non-GPL modules, but this is simply ridiculous.
Use normal export.  The same goes for other two exports here.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 2/5] fs: saner aio_complete prototype
  2015-01-27 17:55 ` [PATCH 2/5] fs: saner aio_complete prototype Christoph Hellwig
@ 2015-01-31 10:04   ` Al Viro
  0 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-01-31 10:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

On Tue, Jan 27, 2015 at 06:55:10PM +0100, Christoph Hellwig wrote:
> Pass the result as a ssize_t, which is what we use for these returns
> all over the kernel.  Remove the res2 argument that all relevant
> callers always pass as '0'.

AFAICS, ->res2 is a part of user-visible ABI and drivers/usb/gadget ones
sure as hell don't look like as if it was 0:

> -	aio_complete(io_data->kiocb, ret, ret);
> -	aio_complete(iocb, ret, ret);
> -		aio_complete(iocb, req->actual ? req->actual : req->status,
> -				req->status);

What am I missing here?

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-01-31  6:08   ` Al Viro
@ 2015-02-02  8:07     ` Christoph Hellwig
  2015-02-02  8:11       ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-02-02  8:07 UTC (permalink / raw)
  To: Al Viro; +Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel

On Sat, Jan 31, 2015 at 06:08:41AM +0000, Al Viro wrote:
> and be done with that.  I'll put drivers/usb/gadget patches into a stable
> branch and ask use folks to pull from it - that's the simplest of this
> series, actually...

use folks?  Btw, does this mean you patches to switch it over, or do
you want me to finish my conversion.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-02  8:07     ` Christoph Hellwig
@ 2015-02-02  8:11       ` Al Viro
  2015-02-02  8:14         ` Al Viro
  2015-02-02 14:20         ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
  0 siblings, 2 replies; 45+ messages in thread
From: Al Viro @ 2015-02-02  8:11 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

On Mon, Feb 02, 2015 at 09:07:29AM +0100, Christoph Hellwig wrote:
> On Sat, Jan 31, 2015 at 06:08:41AM +0000, Al Viro wrote:
> > and be done with that.  I'll put drivers/usb/gadget patches into a stable
> > branch and ask use folks to pull from it - that's the simplest of this
> > series, actually...
> 
> use folks?  Btw, does this mean you patches to switch it over, or do

usb.

> you want me to finish my conversion.

I have f_fs.c conversion and partial legacy/inode.c one.  Will post tomorrow,
about to fall asleep right now...

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-02  8:11       ` Al Viro
@ 2015-02-02  8:14         ` Al Viro
  2015-02-02 14:26           ` Christoph Hellwig
  2015-02-02 14:20         ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
  1 sibling, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-02  8:14 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Miklos Szeredi, linux-aio, linux-fsdevel

On Mon, Feb 02, 2015 at 08:11:12AM +0000, Al Viro wrote:
> On Mon, Feb 02, 2015 at 09:07:29AM +0100, Christoph Hellwig wrote:
> > On Sat, Jan 31, 2015 at 06:08:41AM +0000, Al Viro wrote:
> > > and be done with that.  I'll put drivers/usb/gadget patches into a stable
> > > branch and ask use folks to pull from it - that's the simplest of this
> > > series, actually...
> > 
> > use folks?  Btw, does this mean you patches to switch it over, or do
> 
> usb.
> 
> > you want me to finish my conversion.
> 
> I have f_fs.c conversion and partial legacy/inode.c one.  Will post tomorrow,
> about to fall asleep right now...

FWIW, here's the current situation with read/write methods:

1) a bunch of file_operations instances have only ->read() and/or ->write();
no ->aio_... or ->..._iter versions, no ->splice_write().  Those are basically
device drivers:
	readv is equivalent to loop over segments, with read on each.
	writev is equivalent to loop over segments, with write on each.
	AIO read and write are all synchronous
	splice_write is does kmap on each buffer in turn and write that
Note that splitting a buffer into several adjacent pieces and submitting
a vectored IO is *not* guaranteed to be equivalent to plain IO on the entire
buffer - it has datagram-like semantics and boundaries do matter.

2) regular ->read_iter/->write_iter users:
	->read is new_sync_read() (or NULL, if ->read_iter is NULL)
	->write is new_sync_write() (or NULL, if ->write_iter is NULL)
	->aio_read and ->aio_write are NULL
	->splice_write is iter_splice_write (or NULL, if ->write_iter is NULL)
Those are stream-style ones; what matter is concatenation of the buffers, not
the boundaries.  Right now the following is true: ->read_iter/->write_iter on
a sync kiocb never returns EIOCBQUEUED and iterator argument is advanced
exactly by the amount of data transferred.

3) ones like (2), but with NULL ->splice_write() in spite of present
->write_iter().  AFAICS they can be bulk-converted to (2).  This is what the
bulk of irregularities boil down to.

4) ones like (2) or (3), except ->read() and ->write() are left NULL instead
of doing new_sync_{read,write}().  Mostly equivalent to (2); some codepaths
call ->read() or ->write() directly, instead of going through vfs_... wrappers,
and for those it can be a bit of a headache.  In any case, there are very
few such instances (only 3) and they are trivial to convert to (2).

5) Two instances in fs/9p have ->write_iter() *and* ->write(), the latter not
being new_sync_write().  Ditto for ->read_iter() and ->read().  No other
instance has such combinations.  FWIW, they are *almost* regular - their
->read() and ->write() are new_sync_... unless it's an O_DIRECT file.
It might make sense to try and convert those to ->direct_IO(); as it is, their
O_DIRECT is ignored for readv()/writev(), which is arguably a bug.

6) drivers/char/mem.c stuff; they are equivalent to (2), but somewhat
optimised.  Some of that might make sense to convert to (2); there _are_ hot
paths involved, so we'd better be careful there.

7) bad_file_ops.  It is equivalent to (2); it's also a very special
case.  FWIW, I'm not at all sure that we *need* most of the methods in
there.  We never replace ->f_op of an opened file with that, so if we
end up with that sucker, it happens on fresh open of something that had
its ->i_fop replaced with bad_file_ops.  And that will instantly fail in
bad_file_open().  Why do we need the rest of the methods in there?
AFAICS, we should drop all but ->open().

8) hypfs - AFAICS, converts to (2) easily; I'll do that.

9) FUSE - ->aio_read/->aio_write user, with unusual ->splice_write as
well.  I _think_ it can be converted to (2), but that'll take a bit of
work.

10) sockets; conversion to ->read_iter/->write_iter had been posted to
netdev, ->splice_write() (the only user of ->sendpage(), BTW) is a work
for the next cycle.

11) NTFS with rw support enabled.  ->aio_write() user, needs to be converted
to ->write_iter().  AFAICS, it wasn't particularly high priority for Anton
(along with all rw stuff in fs/ntfs); it doesn't look terribly hard, but then
it wasn't a high priority for me either.

12) virtio_console.  Interesting ->splice_write() there; hadn't looked deep
enough.

13) two odd drivers/usb/gadget instances.  I have conversion for f_fs.c,
but legacy/inode.c (ep_read() et.al.) is trickier.  The problem in there
is that writev() on a single-element vector is *not* equivalent to plain
write().  The former treats the wrong-direction endpoint as EINVAL; the
latter does
                if (usb_endpoint_xfer_isoc(&data->desc)) {
                        mutex_unlock(&data->lock);
                        return -EINVAL;
                }
                DBG (data->dev, "%s halt\n", data->name);
                spin_lock_irq (&data->dev->lock);
                if (likely (data->ep != NULL))
                        usb_ep_set_halt (data->ep);
                spin_unlock_irq (&data->dev->lock);
                mutex_unlock(&data->lock);
                return -EBADMSG;
instead.  IOW, for isochronous endpoints behaviour is the same, but the
rest behaves differently.  If not for that, that sucker would convert
to (3) easily; ->splice_write() semantics is potentially interesting -
the question is where do we want transfer boundaries.  As it is, writev()
collects the entire iovec and shoves it into single transfer; splice() to
that thing ends up with each pipe_buf going in a separate transfer, mergable
or not.  I would really appreciate comments on semantics of that thing from
USB folks...

14) ipathfs and qibfs: seriously different semantics for write and writev/AIO
write.  As in "different set of commands recognized"; AIO write plays like
writev, whether it's vectored or not (and it's always synchronous).
I've no idea who had come up with that... highly innovative API or why
hadn't they simply added two files (it's all on their virtual filesystem,
so they had full control of layout) rather that multiplexing two different
command sets in such a fashion.

15) /dev/snd/pcmC*D*[cp].  Again, different semantics for write and writev,
with the latter wanting nr_seqs equal to the number of channels.  AIO
non-vectored write fails unless there's only one channel.  Not sure how
ALSA userland uses that thing; AIO side is always synchronous, so it might
be simply never used.  FWIW, I'm not sure that write() on a single-channel
one is equivalent to 1-element writev() - silencing-related logics seem to
differ.

That's it.

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-02  8:11       ` Al Viro
  2015-02-02  8:14         ` Al Viro
@ 2015-02-02 14:20         ` Christoph Hellwig
  1 sibling, 0 replies; 45+ messages in thread
From: Christoph Hellwig @ 2015-02-02 14:20 UTC (permalink / raw)
  To: Al Viro; +Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel

On Mon, Feb 02, 2015 at 08:11:12AM +0000, Al Viro wrote:
> > you want me to finish my conversion.
> 
> I have f_fs.c conversion and partial legacy/inode.c one.  Will post tomorrow,
> about to fall asleep right now...

Already.  I've already fixed up the remaining commets on the series,
so if you post it soon I'll have the series one for this merge window.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-02  8:14         ` Al Viro
@ 2015-02-02 14:26           ` Christoph Hellwig
  2015-02-04  8:34             ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-02-02 14:26 UTC (permalink / raw)
  To: Al Viro; +Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel

On Mon, Feb 02, 2015 at 08:14:31AM +0000, Al Viro wrote:
> 13) two odd drivers/usb/gadget instances.  I have conversion for f_fs.c,
> but legacy/inode.c (ep_read() et.al.) is trickier.  The problem in there
> is that writev() on a single-element vector is *not* equivalent to plain
> write().  The former treats the wrong-direction endpoint as EINVAL; the
> latter does
>                 if (usb_endpoint_xfer_isoc(&data->desc)) {
>                         mutex_unlock(&data->lock);
>                         return -EINVAL;
>                 }
>                 DBG (data->dev, "%s halt\n", data->name);
>                 spin_lock_irq (&data->dev->lock);
>                 if (likely (data->ep != NULL))
>                         usb_ep_set_halt (data->ep);
>                 spin_unlock_irq (&data->dev->lock);
>                 mutex_unlock(&data->lock);
>                 return -EBADMSG;
> instead.  IOW, for isochronous endpoints behaviour is the same, but the
> rest behaves differently.  If not for that, that sucker would convert
> to (3) easily;

I would bet the behavior difference is a bug, might be worth to Cc the
usb folks on this issue.  I bet we'd want the more complex behavior
for both variants.

> 14) ipathfs and qibfs: seriously different semantics for write and writev/AIO
> write.  As in "different set of commands recognized"; AIO write plays like
> writev, whether it's vectored or not (and it's always synchronous).
> I've no idea who had come up with that... highly innovative API or why
> hadn't they simply added two files (it's all on their virtual filesystem,
> so they had full control of layout) rather that multiplexing two different
> command sets in such a fashion.
> 
> 15) /dev/snd/pcmC*D*[cp].  Again, different semantics for write and writev,
> with the latter wanting nr_seqs equal to the number of channels.  AIO
> non-vectored write fails unless there's only one channel.  Not sure how
> ALSA userland uses that thing; AIO side is always synchronous, so it might
> be simply never used.  FWIW, I'm not sure that write() on a single-channel
> one is equivalent to 1-element writev() - silencing-related logics seem to
> differ.

For these weirdos we can pass down a flag in the kiocb about the source
of the I/O.  We'll need that flags field for non-blocking buffered reads
and per-I/O O_SYNC anyway, and it will be very useful for fixing the
races around changing the O_DIRECT flag at run time.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-02 14:26           ` Christoph Hellwig
@ 2015-02-04  8:34             ` Al Viro
  2015-02-04 18:17               ` Alan Stern
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-04  8:34 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Miklos Szeredi, linux-aio, linux-fsdevel, Felipe Balbi, linux-usb

[USB folks Cc'd]

On Mon, Feb 02, 2015 at 03:26:17PM +0100, Christoph Hellwig wrote:

> I would bet the behavior difference is a bug, might be worth to Cc the
> usb folks on this issue.  I bet we'd want the more complex behavior
> for both variants.

[Context for USB people: The difference in question is what ep_read() does
when it is called on write endpoint that isn't isochronous; it halts the
sucker and fails with EBADMSG, while ep_aio_read() handles all write endpoints
as isochronous ones - fails with EINVAL; FWIW, I agree that it's probably
a bug]

Sadly, that's not the only problem in there ;-/  _This_ one really has
the "what if single-segment AIO read tries to dereference iovec after
the caller is gone" bug you suspected in fs/direct-io.c; we have
static void ep_user_copy_worker(struct work_struct *work)
{
        struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
        struct mm_struct *mm = priv->mm;
        struct kiocb *iocb = priv->iocb;
        size_t ret;

        use_mm(mm); 
        ret = ep_copy_to_user(priv);
        unuse_mm(mm);

        /* completing the iocb can drop the ctx and mm, don't touch mm after */
        aio_complete(iocb, ret, ret);

        kfree(priv->buf);
        kfree(priv);
}
called via schedule_work() from ->complete() of usb_request allocated and
queued by ->aio_read().  It very definitely _can_ be executed after return
from ->aio_read() and aio_run_iocb().  And ep_copy_to_user() dereferences
the iovec given to ->aio_read(); _not_ its copy as f_fs.c does.

Do io_submit(2) with several IOCB_CMD_PREAD requests, and you'll almost
certainly get the data from the first one copied to the destination of
the second one instead.  It shouldn't be hard to reproduce.  And that,
of course, is not the worst possible outcome...

I'm going to add copying of iovec in async read case.  And AFAICS, that one
is -stable fodder.  See vfs.git#gadget for f_fs.c conversion; I haven't
pushed legacy/inode.c stuff yet - I need to pull the fix of the bug above
into the beginning of that pile first.

FWIW, I don't believe that it makes sense to do iovec copying in
aio_run_iocb(); note that most of the instances will be done with
iovec before they return there.  These two were the sole exceptions;
function/f_fs.c did copying, legacy/inode.c didn't.  Most of the
->aio_read/->read_iter instances (including ones that *do* return
EIOCBQUEUED) only access iovec synchronously; usually that's done
by grabbing the pages to copy into before we get aronud to starting
IO.  legacy/inode.c is the only instance to step into that kind of bug.
function/f_fs.c also had a fun bug, BTW - failure in AIO ended up leaking
io_data (plus iovec copy in case of aio_read()).  Looks like another
-stable fodder, if less serious one...  See b17d2ded6 (gadget/function/f_fs.c:
close leaks) in vfs.git#gadget for that one.

I plan to pull the fix for use-after-free in the beginning of that queue
(in an easy to backport form) and then have ep_aio_read/ep_aio_write
start doing the halt-related bits as in ep_read/ep_write.  With that it's
trivial to convert that sucker along the same lines as function/f_fs.c.

All of that, assuming that anybody gives a damn about the driver in question.
The things like
                        spin_lock_irq (&dev->lock);
			....
// FIXME don't call this with the spinlock held ...
                                if (copy_to_user (buf, dev->req->buf, len))
seem to indicate that nobody does, seeing that this bug had been there
since 2003, complete with FIXME ;-/

If nobody cares about that sucker, git rm would be a better solution, IMO...

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-04  8:34             ` Al Viro
@ 2015-02-04 18:17               ` Alan Stern
  2015-02-04 19:06                 ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Alan Stern @ 2015-02-04 18:17 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On Wed, 4 Feb 2015, Al Viro wrote:

> [USB folks Cc'd]

Incidentally, Al, have you seen this email?

	http://marc.info/?l=linux-usb&m=142295011402339&w=2

I encouraged the writer to send in a patch but so far there has been no 
reply.

> On Mon, Feb 02, 2015 at 03:26:17PM +0100, Christoph Hellwig wrote:
> 
> > I would bet the behavior difference is a bug, might be worth to Cc the
> > usb folks on this issue.  I bet we'd want the more complex behavior
> > for both variants.
> 
> [Context for USB people: The difference in question is what ep_read() does
> when it is called on write endpoint that isn't isochronous;

You're talking about drivers/usb/gadget/legacy/inode.c, right?

>  it halts the
> sucker and fails with EBADMSG, while ep_aio_read() handles all write endpoints
> as isochronous ones - fails with EINVAL; FWIW, I agree that it's probably
> a bug]

It's not a bug; it's by design.  That's how you halt an endpoint in 
gadgetfs -- by doing a synchronous I/O call in the "wrong" direction.

> Sadly, that's not the only problem in there ;-/  _This_ one really has
> the "what if single-segment AIO read tries to dereference iovec after
> the caller is gone" bug you suspected in fs/direct-io.c; we have
> static void ep_user_copy_worker(struct work_struct *work)
> {
>         struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
>         struct mm_struct *mm = priv->mm;
>         struct kiocb *iocb = priv->iocb;
>         size_t ret;
> 
>         use_mm(mm); 
>         ret = ep_copy_to_user(priv);
>         unuse_mm(mm);
> 
>         /* completing the iocb can drop the ctx and mm, don't touch mm after */
>         aio_complete(iocb, ret, ret);
> 
>         kfree(priv->buf);
>         kfree(priv);
> }
> called via schedule_work() from ->complete() of usb_request allocated and
> queued by ->aio_read().  It very definitely _can_ be executed after return
> from ->aio_read() and aio_run_iocb().  And ep_copy_to_user() dereferences
> the iovec given to ->aio_read(); _not_ its copy as f_fs.c does.
> 
> Do io_submit(2) with several IOCB_CMD_PREAD requests, and you'll almost
> certainly get the data from the first one copied to the destination of
> the second one instead.  It shouldn't be hard to reproduce.  And that,
> of course, is not the worst possible outcome...
> 
> I'm going to add copying of iovec in async read case.  And AFAICS, that one
> is -stable fodder.  See vfs.git#gadget for f_fs.c conversion; I haven't
> pushed legacy/inode.c stuff yet - I need to pull the fix of the bug above
> into the beginning of that pile first.
> 
> FWIW, I don't believe that it makes sense to do iovec copying in
> aio_run_iocb(); note that most of the instances will be done with
> iovec before they return there.

That's true even for gadgetfs in the write case.

>  These two were the sole exceptions;
> function/f_fs.c did copying, legacy/inode.c didn't.  Most of the
> ->aio_read/->read_iter instances (including ones that *do* return
> EIOCBQUEUED) only access iovec synchronously; usually that's done
> by grabbing the pages to copy into before we get aronud to starting
> IO.  legacy/inode.c is the only instance to step into that kind of bug.
> function/f_fs.c also had a fun bug, BTW - failure in AIO ended up leaking
> io_data (plus iovec copy in case of aio_read()).  Looks like another
> -stable fodder, if less serious one...  See b17d2ded6 (gadget/function/f_fs.c:
> close leaks) in vfs.git#gadget for that one.
> 
> I plan to pull the fix for use-after-free in the beginning of that queue
> (in an easy to backport form) and then have ep_aio_read/ep_aio_write
> start doing the halt-related bits as in ep_read/ep_write.  With that it's
> trivial to convert that sucker along the same lines as function/f_fs.c.

I don't think there's any need to make the async routines do the
halt-related stuff.  After all, it's silly for users to call an async
I/O routine to perform a synchronous action like halting an endpoint.

On the other hand, it would be reasonable to replace the -EBADMSG with
some massaged version of the return code from usb_ep_set_halt(), which
is supposed to return -EAGAIN under some circumstances.  But that would
be an API change, so we probably shouldn't do it...

> All of that, assuming that anybody gives a damn about the driver in question.
> The things like
>                         spin_lock_irq (&dev->lock);
> 			....
> // FIXME don't call this with the spinlock held ...
>                                 if (copy_to_user (buf, dev->req->buf, len))
> seem to indicate that nobody does, seeing that this bug had been there
> since 2003, complete with FIXME ;-/
> 
> If nobody cares about that sucker, git rm would be a better solution, IMO...

It is a legacy driver after all, but some people still use it.

Alan Stern

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-04 18:17               ` Alan Stern
@ 2015-02-04 19:06                 ` Al Viro
  2015-02-04 20:30                   ` Alan Stern
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-04 19:06 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On Wed, Feb 04, 2015 at 01:17:30PM -0500, Alan Stern wrote:
> On Wed, 4 Feb 2015, Al Viro wrote:
> 
> > [USB folks Cc'd]
> 
> Incidentally, Al, have you seen this email?
> 
> 	http://marc.info/?l=linux-usb&m=142295011402339&w=2
> 
> I encouraged the writer to send in a patch but so far there has been no 
> reply.

Yecchhh...  Anything that changes ->f_op *after* return from ->open() is
doing a nasty, nasty thing.  What's to guarantee that any checks for
NULL fields will stay valid, etc.?

FWIW, in all the tree there are only 4 places where that would be happening;
	* i810_map_buffer() screwing around with having vm_mmap() done,
only it wants its own thing called as ->mmap() (and a bit of extra data
stashed for it).  Racy as hell (if another thread calls mmap() on the
same file, you'll get a nasty surprise).  Driver's too old and brittle to
touch, according to drm folks...
	* TTY hangup logics.  Nasty (and might be broken around ->fasync()),
but it's a very special case.
	* snd_card_disconnect().  Analogue of TTY hangup, actually; both are
trying to do a form of revoke().
	* this one.  Note that you are not guaranteed that ep_config() won't
be called more than once - two threads might race in write(2), with the loser
getting through mutex_lock_interruptible(&data->lock); in ep_config() only
after the winner has already gotten through write(), switched ->f_op, returned
to userland and started doing read()/write()/etc.  If nothing else,
the contents of data->desc and data->hs_desc can be buggered by arbitrary
data, no matter how bogus, right as the first thread is doing IO.

> > [Context for USB people: The difference in question is what ep_read() does
> > when it is called on write endpoint that isn't isochronous;
> 
> You're talking about drivers/usb/gadget/legacy/inode.c, right?

Yes.

> >  it halts the
> > sucker and fails with EBADMSG, while ep_aio_read() handles all write endpoints
> > as isochronous ones - fails with EINVAL; FWIW, I agree that it's probably
> > a bug]
> 
> It's not a bug; it's by design.  That's how you halt an endpoint in 
> gadgetfs -- by doing a synchronous I/O call in the "wrong" direction.

Yes, but you have readv() on single-element vector behave different from
read(), which is surprising, to put it mildly.

> > I plan to pull the fix for use-after-free in the beginning of that queue
> > (in an easy to backport form) and then have ep_aio_read/ep_aio_write
> > start doing the halt-related bits as in ep_read/ep_write.  With that it's
> > trivial to convert that sucker along the same lines as function/f_fs.c.
> 
> I don't think there's any need to make the async routines do the
> halt-related stuff.  After all, it's silly for users to call an async
> I/O routine to perform a synchronous action like halting an endpoint.

Um...  readv() is also going through ->aio_read().  I can tie that to
sync vs. async, though - is_sync_kiocb() will do just that, if you are
OK with having readv() act the same as read() in that respect.

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-04 19:06                 ` Al Viro
@ 2015-02-04 20:30                   ` Alan Stern
  2015-02-04 23:07                     ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Alan Stern @ 2015-02-04 20:30 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On Wed, 4 Feb 2015, Al Viro wrote:

> On Wed, Feb 04, 2015 at 01:17:30PM -0500, Alan Stern wrote:
> > On Wed, 4 Feb 2015, Al Viro wrote:
> > 
> > > [USB folks Cc'd]
> > 
> > Incidentally, Al, have you seen this email?
> > 
> > 	http://marc.info/?l=linux-usb&m=142295011402339&w=2
> > 
> > I encouraged the writer to send in a patch but so far there has been no 
> > reply.
> 
> Yecchhh...  Anything that changes ->f_op *after* return from ->open() is
> doing a nasty, nasty thing.  What's to guarantee that any checks for
> NULL fields will stay valid, etc.?
> 
> FWIW, in all the tree there are only 4 places where that would be happening;
> 	* i810_map_buffer() screwing around with having vm_mmap() done,
> only it wants its own thing called as ->mmap() (and a bit of extra data
> stashed for it).  Racy as hell (if another thread calls mmap() on the
> same file, you'll get a nasty surprise).  Driver's too old and brittle to
> touch, according to drm folks...
> 	* TTY hangup logics.  Nasty (and might be broken around ->fasync()),
> but it's a very special case.
> 	* snd_card_disconnect().  Analogue of TTY hangup, actually; both are
> trying to do a form of revoke().
> 	* this one.  Note that you are not guaranteed that ep_config() won't
> be called more than once - two threads might race in write(2), with the loser
> getting through mutex_lock_interruptible(&data->lock); in ep_config() only
> after the winner has already gotten through write(), switched ->f_op, returned
> to userland and started doing read()/write()/etc.  If nothing else,
> the contents of data->desc and data->hs_desc can be buggered by arbitrary
> data, no matter how bogus, right as the first thread is doing IO.

Well, this one certainly can be fixed to avoid altering ->f_op, at the 
cost of adding an extra check at the start of each I/O operation.

> > >  it halts the
> > > sucker and fails with EBADMSG, while ep_aio_read() handles all write endpoints
> > > as isochronous ones - fails with EINVAL; FWIW, I agree that it's probably
> > > a bug]
> > 
> > It's not a bug; it's by design.  That's how you halt an endpoint in 
> > gadgetfs -- by doing a synchronous I/O call in the "wrong" direction.
> 
> Yes, but you have readv() on single-element vector behave different from
> read(), which is surprising, to put it mildly.
> 
> > > I plan to pull the fix for use-after-free in the beginning of that queue
> > > (in an easy to backport form) and then have ep_aio_read/ep_aio_write
> > > start doing the halt-related bits as in ep_read/ep_write.  With that it's
> > > trivial to convert that sucker along the same lines as function/f_fs.c.
> > 
> > I don't think there's any need to make the async routines do the
> > halt-related stuff.  After all, it's silly for users to call an async
> > I/O routine to perform a synchronous action like halting an endpoint.
> 
> Um...  readv() is also going through ->aio_read().

Why does readv() do this but not read()?  Wouldn't it make more sense 
to have all the read* calls use the same internal interface?

>  I can tie that to
> sync vs. async, though - is_sync_kiocb() will do just that, if you are
> OK with having readv() act the same as read() in that respect.

I don't really care one way or the other.  In fact, it doesn't matter
if the same behavior applies to all the async calls as well as the sync
calls -- I just doubt that anybody will ever use them.

Alan Stern

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-04 20:30                   ` Alan Stern
@ 2015-02-04 23:07                     ` Al Viro
  2015-02-05  8:24                       ` Robert Baldyga
       [not found]                       ` <20150204230733.GK29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  0 siblings, 2 replies; 45+ messages in thread
From: Al Viro @ 2015-02-04 23:07 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On Wed, Feb 04, 2015 at 03:30:32PM -0500, Alan Stern wrote:
> > 	* this one.  Note that you are not guaranteed that ep_config() won't
> > be called more than once - two threads might race in write(2), with the loser
> > getting through mutex_lock_interruptible(&data->lock); in ep_config() only
> > after the winner has already gotten through write(), switched ->f_op, returned
> > to userland and started doing read()/write()/etc.  If nothing else,
> > the contents of data->desc and data->hs_desc can be buggered by arbitrary
> > data, no matter how bogus, right as the first thread is doing IO.
> 
> Well, this one certainly can be fixed to avoid altering ->f_op, at the 
> cost of adding an extra check at the start of each I/O operation.
 
> > Um...  readv() is also going through ->aio_read().
> 
> Why does readv() do this but not read()?  Wouldn't it make more sense 
> to have all the read* calls use the same internal interface?

Because there are two partially overlapping classes wrt vector IO semantics:
	1) datagram-style.  Vectored read/write is equivalent to simple
read/write done on each vector component.  And IO boundaries matter - if
your driver treats any write() as datagram that starts e.g. with
fixed-sized table in the beginning + arbitrary amount of data following
it, you will get very different results from write(fd, buf, 200) and
writev(fd, (struct iovec[2]){{buf, 100}, {buf+100, 100}}, 2).  A _lot_ of
drivers are like that - they supply ->read() and ->write() for single-range
IO and VFS construct the rest of operations out of those.

	2) stream-style.  Vectored read is guaranteed to behave the same
way as simple read on a range with size being the sum of vector element
sizes, except that the data ends in ranges covered by vector elements instead
of a single array.  Vectored write is guaranteed to behave the same way
as simple write from a buffer containing the concatenation of ranges covered
by vector elements.  Boundaries between the elements do not matter at all.
Regular files on storage filesystems are like that.  So are FIFOs and pipes
and so are sockets.  Even for datagram protocols, boundaries between the
vector elements are ignored; boundaries between syscalls provide the datagram
boundaries, but you can e.g. do writev(udp_socket_fd, (struct iovec[3]){
{const_header, sizeof(const_header)}, {&n, 4}, {s, strlen(s)}}, 3) and have
only one UDP packet sent.  IOW, it's general-purpose scatter-gather for read
and write.

	The last example shows that (2) isn't a subset of (1) - it's not
always possible to call ->write() in loop and get the right behaviour.
For regular files (and pure stream sockets, etc.) it would work, but for
stuff like UDP sockets it would break.  Moreover, even for regular files on
storage filesystems it would be quite inefficient - we'd need to acquire and
release a bunch of locks, poke through metadata, etc., for each segment.

	As the result, there was a couple of new methods added, inventively
called ->readv() and ->writev().  do_sync_read() was supposed to be used
as ->read() instance - it's "feed a single-element vector to ->readv()" and
similar for s/read/write/.

	Note that both (1) and (2) satisfy the following sanity requirement -
single-element readv() is always equivalent to single-element() read().  You
could violate that, by providing completely unrelated ->read() and ->readv(),
but very few drivers went for that - too insane.

	Then, when AIO had been added, those had grown an argument pointing
to iocb (instead of file and ppos - for those we use iocb->ki_filp and
&iocb->ki_pos resp.) and they got renamed into ->aio_read() and ->aio_write().
Note that non-vectored AIO uses the same methods - ->read() and ->write() had
too many instances to convert and most of those would end up just using those
two iocb fields instead of the old arguments - tons of churn for no good
reason.  ->readv() and ->writev() had fewer instances (very common was the
use of generic_file_aio_{read,write}()) and conversion was less painful.
So there was no ->aio_read() and ->aio_write().  That, in principle, was a
bit of constraint - you couldn't make single-element AIO_PREADV behave
different from AIO_PREAD, but nobody had been insane enough to ask for that.

	Moreover, keeping ->readv() and ->writev() was pointless. There is
cheap way to tell whether ->aio_{read,write}() call is due to io_submit(2)
or to readv()/writev() - is_sync_kiocb(iocb) tells which one it is, so if
driver really wanted different semantics for sync vs. async, it could check
that.

	So we ended up with ->read/->write for sync non-vectored and
->aio_read()/->aio_write() for sync vectored *and* async anything.  Usually
you provide one or the other - NULL ->aio_... means loop calling ->read/write
on each element, NULL ->read/write (or do_sync_... for them - it's the same
thing) means feeding sync iocb and single-element vector to ->aio_....
You *can* provide both, but that's rare and almost always pointless.

	These days ->aio_{read,write} is almost gone - replaced with
->{read,write}_iter().  That sucker is equivalent, except that you
get a pointer struct iov_iter instead iovec, nr_seg, size triple.  And
you use linux/uio.h primitives for accessing the data (it might be backed
by something other than userland ranges).  The primitives in there are
not harder to use than copy_..._user(), and you avoid any need to loop over
iovec array, etc.  Those primitives generally don't give a damn about the
range boundaries; the only exception is iov_iter_single_seg_count(), which
tells how much data is left to be consumed in the current segment; very
few things are using it.  is_sync_kiocb() is still available to tell io_submit
from synchronous syscalls.

	I don't believe that it's worth switching the (1) \setminus (2) to
those; it's still too much churn, plus we'd need the loop over segments
somewhere to keep the semantics.  It *can* be expressed by ->read_iter()
and ->write_iter(), but it'll be tons of boilerplate code in a lot of
drivers.  So ->read() and ->write() are there to stay.  However, I think
that we can get to the situation when it's really either one or another -
if you have ->read_iter()/->write_iter(), don't mess with custom ->read()
and ->write().

	Both function/f_fs.c and legacy/inode.c are in class (2) - they
gather data from iovec into temp buffer in ->aio_write() and pass the buffer to 
the code doing actual IO, and on ->aio_read() side they give a buffer to
read into, then arrange for it to be scattered into our iovec upon IO
completion.  And they are doing non-trivial async stuff, so I'd prefer to
switch them completely to ->read_iter/->write_iter.  The only obstacle is
read vs. single-element readv and write vs. single-element writev behaviour
difference.

	For some files it really can't be helped - {ipath,qib}fs has completely
unrelated sets of commands accepted by write and writev, including the
single-element one.  And /dev/snd/pcm*, while somewhat milder, has non-trivial
behaviour differences.  I think that trick suggested by hch (put several
flags in iocb, encoding the reason for the call; that would allow to tell
one from another) is the best way to deal with those.  But I would really
prefer to avoid that; IMO those examples are simply bad userland APIs.
legacy/inode.c is the third (and last) beast like those two.  And there the
difference is small enough to simply change readv/writev to be like read/write
in that respect, i.e. also halt the endpoint when called on the isochronous
one with wrong direction.

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-04 23:07                     ` Al Viro
@ 2015-02-05  8:24                       ` Robert Baldyga
  2015-02-05  8:47                         ` Al Viro
       [not found]                       ` <20150204230733.GK29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  1 sibling, 1 reply; 45+ messages in thread
From: Robert Baldyga @ 2015-02-05  8:24 UTC (permalink / raw)
  To: Al Viro, Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On 02/05/2015 12:07 AM, Al Viro wrote:
> On Wed, Feb 04, 2015 at 03:30:32PM -0500, Alan Stern wrote:
>>> 	* this one.  Note that you are not guaranteed that ep_config() won't
>>> be called more than once - two threads might race in write(2), with the loser
>>> getting through mutex_lock_interruptible(&data->lock); in ep_config() only
>>> after the winner has already gotten through write(), switched ->f_op, returned
>>> to userland and started doing read()/write()/etc.  If nothing else,
>>> the contents of data->desc and data->hs_desc can be buggered by arbitrary
>>> data, no matter how bogus, right as the first thread is doing IO.
>>
>> Well, this one certainly can be fixed to avoid altering ->f_op, at the 
>> cost of adding an extra check at the start of each I/O operation.
>  
>>> Um...  readv() is also going through ->aio_read().
>>
>> Why does readv() do this but not read()?  Wouldn't it make more sense 
>> to have all the read* calls use the same internal interface?
> 
> Because there are two partially overlapping classes wrt vector IO semantics:
> 	1) datagram-style.  Vectored read/write is equivalent to simple
> read/write done on each vector component.  And IO boundaries matter - if
> your driver treats any write() as datagram that starts e.g. with
> fixed-sized table in the beginning + arbitrary amount of data following
> it, you will get very different results from write(fd, buf, 200) and
> writev(fd, (struct iovec[2]){{buf, 100}, {buf+100, 100}}, 2).  A _lot_ of
> drivers are like that - they supply ->read() and ->write() for single-range
> IO and VFS construct the rest of operations out of those.
> 
> 	2) stream-style.  Vectored read is guaranteed to behave the same
> way as simple read on a range with size being the sum of vector element
> sizes, except that the data ends in ranges covered by vector elements instead
> of a single array.  Vectored write is guaranteed to behave the same way
> as simple write from a buffer containing the concatenation of ranges covered
> by vector elements.  Boundaries between the elements do not matter at all.
> Regular files on storage filesystems are like that.  So are FIFOs and pipes
> and so are sockets.  Even for datagram protocols, boundaries between the
> vector elements are ignored; boundaries between syscalls provide the datagram
> boundaries, but you can e.g. do writev(udp_socket_fd, (struct iovec[3]){
> {const_header, sizeof(const_header)}, {&n, 4}, {s, strlen(s)}}, 3) and have
> only one UDP packet sent.  IOW, it's general-purpose scatter-gather for read
> and write.
> 
> 	The last example shows that (2) isn't a subset of (1) - it's not
> always possible to call ->write() in loop and get the right behaviour.
> For regular files (and pure stream sockets, etc.) it would work, but for
> stuff like UDP sockets it would break.  Moreover, even for regular files on
> storage filesystems it would be quite inefficient - we'd need to acquire and
> release a bunch of locks, poke through metadata, etc., for each segment.
> 
> 	As the result, there was a couple of new methods added, inventively
> called ->readv() and ->writev().  do_sync_read() was supposed to be used
> as ->read() instance - it's "feed a single-element vector to ->readv()" and
> similar for s/read/write/.
> 
> 	Note that both (1) and (2) satisfy the following sanity requirement -
> single-element readv() is always equivalent to single-element() read().  You
> could violate that, by providing completely unrelated ->read() and ->readv(),
> but very few drivers went for that - too insane.
> 
> 	Then, when AIO had been added, those had grown an argument pointing
> to iocb (instead of file and ppos - for those we use iocb->ki_filp and
> &iocb->ki_pos resp.) and they got renamed into ->aio_read() and ->aio_write().
> Note that non-vectored AIO uses the same methods - ->read() and ->write() had
> too many instances to convert and most of those would end up just using those
> two iocb fields instead of the old arguments - tons of churn for no good
> reason.  ->readv() and ->writev() had fewer instances (very common was the
> use of generic_file_aio_{read,write}()) and conversion was less painful.
> So there was no ->aio_read() and ->aio_write().  That, in principle, was a
> bit of constraint - you couldn't make single-element AIO_PREADV behave
> different from AIO_PREAD, but nobody had been insane enough to ask for that.
> 
> 	Moreover, keeping ->readv() and ->writev() was pointless. There is
> cheap way to tell whether ->aio_{read,write}() call is due to io_submit(2)
> or to readv()/writev() - is_sync_kiocb(iocb) tells which one it is, so if
> driver really wanted different semantics for sync vs. async, it could check
> that.
> 
> 	So we ended up with ->read/->write for sync non-vectored and
> ->aio_read()/->aio_write() for sync vectored *and* async anything.  Usually
> you provide one or the other - NULL ->aio_... means loop calling ->read/write
> on each element, NULL ->read/write (or do_sync_... for them - it's the same
> thing) means feeding sync iocb and single-element vector to ->aio_....
> You *can* provide both, but that's rare and almost always pointless.
> 
> 	These days ->aio_{read,write} is almost gone - replaced with
> ->{read,write}_iter().  That sucker is equivalent, except that you
> get a pointer struct iov_iter instead iovec, nr_seg, size triple.  And
> you use linux/uio.h primitives for accessing the data (it might be backed
> by something other than userland ranges).  The primitives in there are
> not harder to use than copy_..._user(), and you avoid any need to loop over
> iovec array, etc.  Those primitives generally don't give a damn about the
> range boundaries; the only exception is iov_iter_single_seg_count(), which
> tells how much data is left to be consumed in the current segment; very
> few things are using it.  is_sync_kiocb() is still available to tell io_submit
> from synchronous syscalls.
> 
> 	I don't believe that it's worth switching the (1) \setminus (2) to
> those; it's still too much churn, plus we'd need the loop over segments
> somewhere to keep the semantics.  It *can* be expressed by ->read_iter()
> and ->write_iter(), but it'll be tons of boilerplate code in a lot of
> drivers.  So ->read() and ->write() are there to stay.  However, I think
> that we can get to the situation when it's really either one or another -
> if you have ->read_iter()/->write_iter(), don't mess with custom ->read()
> and ->write().
> 
> 	Both function/f_fs.c and legacy/inode.c are in class (2) - they
> gather data from iovec into temp buffer in ->aio_write() and pass the buffer to 
> the code doing actual IO, and on ->aio_read() side they give a buffer to
> read into, then arrange for it to be scattered into our iovec upon IO
> completion.  And they are doing non-trivial async stuff, so I'd prefer to
> switch them completely to ->read_iter/->write_iter.  The only obstacle is
> read vs. single-element readv and write vs. single-element writev behaviour
> difference.

No, function/f_fs.c and legacy/inode.c are in class (1). They have
datagram semantics - each vector element is submitted in separate USB
request. Each single request is sent in separate USB data packet (for
bulk endpoints it can be more than one packet). In fact sync
read()/write() also will give different results while called once with
some block of data or in loop with the same block of data splitted into
a few parts.

> 
> 	For some files it really can't be helped - {ipath,qib}fs has completely
> unrelated sets of commands accepted by write and writev, including the
> single-element one.  And /dev/snd/pcm*, while somewhat milder, has non-trivial
> behaviour differences.  I think that trick suggested by hch (put several
> flags in iocb, encoding the reason for the call; that would allow to tell
> one from another) is the best way to deal with those.  But I would really
> prefer to avoid that; IMO those examples are simply bad userland APIs.
> legacy/inode.c is the third (and last) beast like those two.  And there the
> difference is small enough to simply change readv/writev to be like read/write
> in that respect, i.e. also halt the endpoint when called on the isochronous
> one with wrong direction.

It shouldn't be a big problem to halt endpoints in
ep_aio_write()/ep_aio_read() to have similar behaviour for
single-element readv()/writev() and read()/write() operations.

Robert Baldyga

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-05  8:24                       ` Robert Baldyga
@ 2015-02-05  8:47                         ` Al Viro
  2015-02-05  9:03                           ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-05  8:47 UTC (permalink / raw)
  To: Robert Baldyga
  Cc: Alan Stern, Christoph Hellwig, Miklos Szeredi, linux-aio,
	linux-fsdevel, Felipe Balbi, linux-usb

On Thu, Feb 05, 2015 at 09:24:32AM +0100, Robert Baldyga wrote:

> No, function/f_fs.c and legacy/inode.c are in class (1). They have
> datagram semantics - each vector element is submitted in separate USB
> request. Each single request is sent in separate USB data packet (for
> bulk endpoints it can be more than one packet). In fact sync
> read()/write() also will give different results while called once with
> some block of data or in loop with the same block of data splitted into
> a few parts.

No, they don't.  This is from ffs_epfile_io():

                data = kmalloc(data_len, GFP_KERNEL);
                if (unlikely(!data))
                        return -ENOMEM;
                if (io_data->aio && !io_data->read) {
                        int i;
                        size_t pos = 0;
                        for (i = 0; i < io_data->nr_segs; i++) {
                                if (unlikely(copy_from_user(&data[pos],
                                             io_data->iovec[i].iov_base,
                                             io_data->iovec[i].iov_len))) {
                                        ret = -EFAULT;
                                        goto error;
                                }
                                pos += io_data->iovec[i].iov_len;
                        }

and that's the last point where it looks at iovec.  After that all work
is done to the copy in data, where no information about the boundaries
survives.  And ep_aio_write() (in legacy/inode.c) is the same way.

	You are confusing datagram-per-syscall (which they are) with
datagram-per-iovec (which they are definitely not).  IOW, they behave
as UDP sockets - writev() is purely scatter-gather variant of write(),
with datagram per syscall and all vector elements silently concatenated.
That's class 2, and _not_ in its intersection with class 1.

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-05  8:47                         ` Al Viro
@ 2015-02-05  9:03                           ` Al Viro
  2015-02-05  9:15                             ` Robert Baldyga
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-05  9:03 UTC (permalink / raw)
  To: Robert Baldyga
  Cc: Alan Stern, Christoph Hellwig, Miklos Szeredi, linux-aio,
	linux-fsdevel, Felipe Balbi, linux-usb

On Thu, Feb 05, 2015 at 08:47:29AM +0000, Al Viro wrote:
> 	You are confusing datagram-per-syscall (which they are) with
> datagram-per-iovec (which they are definitely not).  IOW, they behave
> as UDP sockets - writev() is purely scatter-gather variant of write(),
> with datagram per syscall and all vector elements silently concatenated.
> That's class 2, and _not_ in its intersection with class 1.

PS: you want class 1, look at something like /proc/sys/kernel/domainname
(or any other sysctl of that sort).  write "foobar" there and
cat /proc/sys/kernel/domainname will print foorbat.  writev an array consisting
of "foo" and "bar", and you'll see bar afterwards, same as you would
after writing first "foo", then "bar".  There the iovec boundaries affect
the result - ->no aio_write() for that sucker, so we get two calls of
->write(), with expected results.  And there are character devices like that
as well.  _That_ is class 1 outside of intersection with class 2.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-05  9:03                           ` Al Viro
@ 2015-02-05  9:15                             ` Robert Baldyga
  0 siblings, 0 replies; 45+ messages in thread
From: Robert Baldyga @ 2015-02-05  9:15 UTC (permalink / raw)
  To: Al Viro
  Cc: Alan Stern, Christoph Hellwig, Miklos Szeredi, linux-aio,
	linux-fsdevel, Felipe Balbi, linux-usb

On 02/05/2015 10:03 AM, Al Viro wrote:
> On Thu, Feb 05, 2015 at 08:47:29AM +0000, Al Viro wrote:
>> 	You are confusing datagram-per-syscall (which they are) with
>> datagram-per-iovec (which they are definitely not).  IOW, they behave
>> as UDP sockets - writev() is purely scatter-gather variant of write(),
>> with datagram per syscall and all vector elements silently concatenated.
>> That's class 2, and _not_ in its intersection with class 1.
> 
> PS: you want class 1, look at something like /proc/sys/kernel/domainname
> (or any other sysctl of that sort).  write "foobar" there and
> cat /proc/sys/kernel/domainname will print foorbat.  writev an array consisting
> of "foo" and "bar", and you'll see bar afterwards, same as you would
> after writing first "foo", then "bar".  There the iovec boundaries affect
> the result - ->no aio_write() for that sucker, so we get two calls of
> ->write(), with expected results.  And there are character devices like that
> as well.  _That_ is class 1 outside of intersection with class 2.
> 

Oh, I see. Thanks.

So we only need to add endpoint halting to aio_read()/aio_write(), to
make their behaviour similar to sync ones, right?

Robert Baldyga

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
       [not found]                       ` <20150204230733.GK29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2015-02-05 15:29                         ` Alan Stern
  2015-02-06  7:03                           ` Al Viro
  2015-02-07  5:44                           ` Al Viro
  0 siblings, 2 replies; 45+ messages in thread
From: Alan Stern @ 2015-02-05 15:29 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Miklos Szeredi,
	linux-aio-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Felipe Balbi,
	linux-usb-u79uwXL29TY76Z2rM5mHXA

On Wed, 4 Feb 2015, Al Viro wrote:

> > > Um...  readv() is also going through ->aio_read().
> > 
> > Why does readv() do this but not read()?  Wouldn't it make more sense 
> > to have all the read* calls use the same internal interface?
> 
> Because there are two partially overlapping classes wrt vector IO semantics:
...

Thanks for the detailed explanation.  It appears to boil down to a 
series of historical accidents.

In any case, feel free to copy the non-isochronous behavior of the
synchronous routines in the async routines.  It certainly won't hurt 
anything.

Alan Stern

--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-05 15:29                         ` Alan Stern
@ 2015-02-06  7:03                           ` Al Viro
       [not found]                             ` <20150206070350.GX29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  2015-02-07  5:44                           ` Al Viro
  1 sibling, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-06  7:03 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On Thu, Feb 05, 2015 at 10:29:42AM -0500, Alan Stern wrote:
> On Wed, 4 Feb 2015, Al Viro wrote:
> 
> > > > Um...  readv() is also going through ->aio_read().
> > > 
> > > Why does readv() do this but not read()?  Wouldn't it make more sense 
> > > to have all the read* calls use the same internal interface?
> > 
> > Because there are two partially overlapping classes wrt vector IO semantics:
> ...
> 
> Thanks for the detailed explanation.  It appears to boil down to a 
> series of historical accidents.
> 
> In any case, feel free to copy the non-isochronous behavior of the
> synchronous routines in the async routines.  It certainly won't hurt 
> anything.

Hmm...  What happens if f_fs.c successfully queues struct usb_request, returns
-EIOCBQUEUED and then gets hit by io_cancel(2)?  AFAICS, you get
ffs_aio_cancel() called, which dequeues usb_request and buggers off.
The thing is, freeing io_data and stuff hanging off it would be done by
ffs_user_copy_worker(), which would be scheduled via schedule_work() by
ffs_epfile_async_io_complete(), i.e. usb_request ->complete() callback.
And usb_ep_dequeue() (aka. ep->ops->dequeue()) has tons of instances, but
AFAICS some of them might not trigger usb_gadget_giveback_request(), which
would normally call ->complete()...

Example:
net2272_dequeue(struct usb_ep *_ep, struct usb_request *_req)
{
        struct net2272_ep *ep;
        struct net2272_request *req;
        unsigned long flags;
        int stopped;

        ep = container_of(_ep, struct net2272_ep, ep);
        if (!_ep || (!ep->desc && ep->num != 0) || !_req)
                return -EINVAL;

        spin_lock_irqsave(&ep->dev->lock, flags);
        stopped = ep->stopped;
        ep->stopped = 1;

        /* make sure it's still queued on this endpoint */
        list_for_each_entry(req, &ep->queue, queue) {
                if (&req->req == _req)
                        break;
        }
        if (&req->req != _req) {
                spin_unlock_irqrestore(&ep->dev->lock, flags);
                return -EINVAL;
        }

        /* queue head may be partially complete */
        if (ep->queue.next == &req->queue) {
                dev_dbg(ep->dev->dev, "unlink (%s) pio\n", _ep->name);
                net2272_done(ep, req, -ECONNRESET);
        }
        req = NULL;
        ep->stopped = stopped;

        spin_unlock_irqrestore(&ep->dev->lock, flags);
        return 0;
}

Note that net2272_done(), which would call usb_gadget_giveback_request(),
is only called if the victim happens to be queue head.  Is that just a
net2272.c bug, or am I missing something subtle here?  Looks like
at least on that hardware io_cancel() could leak io_data and everything
that hangs off it...

FWIW, net2272.c was the first one I looked at (happened to be on the last
line of screen during git grep for \.dequeue in drivers/usb/gadget ;-)
and after checking several more it seems that it's a Sod's Law in action -
I'd checked about 5 of them and they seem to call usb_gadget_giveback_request()
as long as they find the sucker in queue, head or no head.  OTOH, there's
a lot more of those guys, so that observation is not worth much...

IOW, is that a net2272.c bug, or a f_fs.c one? 

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
       [not found]                             ` <20150206070350.GX29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2015-02-06  8:44                               ` Robert Baldyga
  0 siblings, 0 replies; 45+ messages in thread
From: Robert Baldyga @ 2015-02-06  8:44 UTC (permalink / raw)
  To: Al Viro, Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi,
	linux-aio-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Felipe Balbi,
	linux-usb-u79uwXL29TY76Z2rM5mHXA

On 02/06/2015 08:03 AM, Al Viro wrote:
> On Thu, Feb 05, 2015 at 10:29:42AM -0500, Alan Stern wrote:
>> On Wed, 4 Feb 2015, Al Viro wrote:
>>
>>>>> Um...  readv() is also going through ->aio_read().
>>>>
>>>> Why does readv() do this but not read()?  Wouldn't it make more sense 
>>>> to have all the read* calls use the same internal interface?
>>>
>>> Because there are two partially overlapping classes wrt vector IO semantics:
>> ...
>>
>> Thanks for the detailed explanation.  It appears to boil down to a 
>> series of historical accidents.
>>
>> In any case, feel free to copy the non-isochronous behavior of the
>> synchronous routines in the async routines.  It certainly won't hurt 
>> anything.
> 
> Hmm...  What happens if f_fs.c successfully queues struct usb_request, returns
> -EIOCBQUEUED and then gets hit by io_cancel(2)?  AFAICS, you get
> ffs_aio_cancel() called, which dequeues usb_request and buggers off.
> The thing is, freeing io_data and stuff hanging off it would be done by
> ffs_user_copy_worker(), which would be scheduled via schedule_work() by
> ffs_epfile_async_io_complete(), i.e. usb_request ->complete() callback.
> And usb_ep_dequeue() (aka. ep->ops->dequeue()) has tons of instances, but
> AFAICS some of them might not trigger usb_gadget_giveback_request(), which
> would normally call ->complete()...
> 
> Example:
> net2272_dequeue(struct usb_ep *_ep, struct usb_request *_req)
> {
>         struct net2272_ep *ep;
>         struct net2272_request *req;
>         unsigned long flags;
>         int stopped;
> 
>         ep = container_of(_ep, struct net2272_ep, ep);
>         if (!_ep || (!ep->desc && ep->num != 0) || !_req)
>                 return -EINVAL;
> 
>         spin_lock_irqsave(&ep->dev->lock, flags);
>         stopped = ep->stopped;
>         ep->stopped = 1;
> 
>         /* make sure it's still queued on this endpoint */
>         list_for_each_entry(req, &ep->queue, queue) {
>                 if (&req->req == _req)
>                         break;
>         }
>         if (&req->req != _req) {
>                 spin_unlock_irqrestore(&ep->dev->lock, flags);
>                 return -EINVAL;
>         }
> 
>         /* queue head may be partially complete */
>         if (ep->queue.next == &req->queue) {
>                 dev_dbg(ep->dev->dev, "unlink (%s) pio\n", _ep->name);
>                 net2272_done(ep, req, -ECONNRESET);
>         }
>         req = NULL;
>         ep->stopped = stopped;
> 
>         spin_unlock_irqrestore(&ep->dev->lock, flags);
>         return 0;
> }
> 
> Note that net2272_done(), which would call usb_gadget_giveback_request(),
> is only called if the victim happens to be queue head.  Is that just a
> net2272.c bug, or am I missing something subtle here?  Looks like
> at least on that hardware io_cancel() could leak io_data and everything
> that hangs off it...
> 
> FWIW, net2272.c was the first one I looked at (happened to be on the last
> line of screen during git grep for \.dequeue in drivers/usb/gadget ;-)
> and after checking several more it seems that it's a Sod's Law in action -
> I'd checked about 5 of them and they seem to call usb_gadget_giveback_request()
> as long as they find the sucker in queue, head or no head.  OTOH, there's
> a lot more of those guys, so that observation is not worth much...
> 
> IOW, is that a net2272.c bug, or a f_fs.c one? 

AFAIK usb request should be completed in all cases, and many gadget
drivers make assumptions that complete() of each request will be called,
so it's definitely bug in net2272 driver.

Robert Baldyga
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH 3/5] fs: remove ki_nbytes
  2015-02-05 15:29                         ` Alan Stern
  2015-02-06  7:03                           ` Al Viro
@ 2015-02-07  5:44                           ` Al Viro
  2015-02-07  5:48                             ` [PATCH 1/6] new helper: dup_iter() Al Viro
                                               ` (5 more replies)
  1 sibling, 6 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:44 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

On Thu, Feb 05, 2015 at 10:29:42AM -0500, Alan Stern wrote:
> On Wed, 4 Feb 2015, Al Viro wrote:
> 
> > > > Um...  readv() is also going through ->aio_read().
> > > 
> > > Why does readv() do this but not read()?  Wouldn't it make more sense 
> > > to have all the read* calls use the same internal interface?
> > 
> > Because there are two partially overlapping classes wrt vector IO semantics:
> ...
> 
> Thanks for the detailed explanation.  It appears to boil down to a 
> series of historical accidents.
> 
> In any case, feel free to copy the non-isochronous behavior of the
> synchronous routines in the async routines.  It certainly won't hurt 
> anything.

OK, I've limited it to sync ones, actually.  Preliminary series in followups.
Those who prefer to read in git, see
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #gadget

WARNING: completely untested

Al Viro (6):
      new helper: dup_iter()
      gadget/function/f_fs.c: close leaks
      gadget/function/f_fs.c: use put iov_iter into io_data
      gadget/function/f_fs.c: switch to ->{read,write}_iter()
      gadgetfs: use-after-free in ->aio_read()
      gadget: switch ep_io_operations to ->read_iter/->write_iter

 drivers/usb/gadget/function/f_fs.c | 204 +++++++++-------------
 drivers/usb/gadget/legacy/inode.c  | 346 +++++++++++++++----------------------
 include/linux/uio.h                |   2 +
 mm/iov_iter.c                      |  15 ++
 4 files changed, 237 insertions(+), 330 deletions(-)

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH 1/6] new helper: dup_iter()
  2015-02-07  5:44                           ` Al Viro
@ 2015-02-07  5:48                             ` Al Viro
  2015-02-07  5:48                             ` [PATCH 2/6] gadget/function/f_fs.c: close leaks Al Viro
                                               ` (4 subsequent siblings)
  5 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:48 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

From: Al Viro <viro@zeniv.linux.org.uk>

Copy iter and kmemdup the underlying array for the copy.  Returns
a pointer to result of kmemdup() to be kfree()'d later.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  2 ++
 mm/iov_iter.c       | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index b447402..e325cb1 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -98,6 +98,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
 
+const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
+
 static inline size_t iov_iter_count(struct iov_iter *i)
 {
 	return i->count;
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 8277320..9d96e283 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -751,3 +751,18 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 	return npages;
 }
 EXPORT_SYMBOL(iov_iter_npages);
+
+const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
+{
+	*new = *old;
+	if (new->type & ITER_BVEC)
+		return new->bvec = kmemdup(new->bvec,
+				    new->nr_segs * sizeof(struct bio_vec),
+				    flags);
+	else
+		/* iovec and kvec have identical layout */
+		return new->iov = kmemdup(new->iov,
+				   new->nr_segs * sizeof(struct iovec),
+				   flags);
+}
+EXPORT_SYMBOL(dup_iter);
-- 
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 2/6] gadget/function/f_fs.c: close leaks
  2015-02-07  5:44                           ` Al Viro
  2015-02-07  5:48                             ` [PATCH 1/6] new helper: dup_iter() Al Viro
@ 2015-02-07  5:48                             ` Al Viro
  2015-02-07  5:48                             ` [PATCH 3/6] gadget/function/f_fs.c: use put iov_iter into io_data Al Viro
                                               ` (3 subsequent siblings)
  5 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:48 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

From: Al Viro <viro@zeniv.linux.org.uk>

If ffs_epfile_io() fails in AIO case, we end up leaking io_data
(and iovec_copy in case of AIO read).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/gadget/function/f_fs.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 63314ed..0c120ad 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -962,6 +962,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
 				    unsigned long nr_segs, loff_t loff)
 {
 	struct ffs_io_data *io_data;
+	ssize_t res;
 
 	ENTER();
 
@@ -981,7 +982,10 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
 
 	kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
 
-	return ffs_epfile_io(kiocb->ki_filp, io_data);
+	res = ffs_epfile_io(kiocb->ki_filp, io_data);
+	if (res != -EIOCBQUEUED)
+		kfree(io_data);
+	return res;
 }
 
 static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
@@ -990,6 +994,7 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
 {
 	struct ffs_io_data *io_data;
 	struct iovec *iovec_copy;
+	ssize_t res;
 
 	ENTER();
 
@@ -1017,7 +1022,12 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
 
 	kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
 
-	return ffs_epfile_io(kiocb->ki_filp, io_data);
+	res = ffs_epfile_io(kiocb->ki_filp, io_data);
+	if (res != -EIOCBQUEUED) {
+		kfree(io_data);
+		kfree(iovec_copy);
+	}
+	return res;
 }
 
 static int
-- 
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 3/6] gadget/function/f_fs.c: use put iov_iter into io_data
  2015-02-07  5:44                           ` Al Viro
  2015-02-07  5:48                             ` [PATCH 1/6] new helper: dup_iter() Al Viro
  2015-02-07  5:48                             ` [PATCH 2/6] gadget/function/f_fs.c: close leaks Al Viro
@ 2015-02-07  5:48                             ` Al Viro
  2015-02-07  5:48                             ` [PATCH 4/6] gadget/function/f_fs.c: switch to ->{read,write}_iter() Al Viro
                                               ` (2 subsequent siblings)
  5 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:48 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

From: Al Viro <viro@zeniv.linux.org.uk>

both on aio and non-aio sides

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/gadget/function/f_fs.c | 86 +++++++++++---------------------------
 1 file changed, 25 insertions(+), 61 deletions(-)

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 0c120ad..11704e7 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -143,10 +143,9 @@ struct ffs_io_data {
 	bool read;
 
 	struct kiocb *kiocb;
-	const struct iovec *iovec;
-	unsigned long nr_segs;
-	char __user *buf;
-	size_t len;
+	struct iov_iter data;
+	const void *to_free;
+	char *buf;
 
 	struct mm_struct *mm;
 	struct work_struct work;
@@ -645,29 +644,10 @@ static void ffs_user_copy_worker(struct work_struct *work)
 					 io_data->req->actual;
 
 	if (io_data->read && ret > 0) {
-		int i;
-		size_t pos = 0;
-
-		/*
-		 * Since req->length may be bigger than io_data->len (after
-		 * being rounded up to maxpacketsize), we may end up with more
-		 * data then user space has space for.
-		 */
-		ret = min_t(int, ret, io_data->len);
-
 		use_mm(io_data->mm);
-		for (i = 0; i < io_data->nr_segs; i++) {
-			size_t len = min_t(size_t, ret - pos,
-					io_data->iovec[i].iov_len);
-			if (!len)
-				break;
-			if (unlikely(copy_to_user(io_data->iovec[i].iov_base,
-						 &io_data->buf[pos], len))) {
-				ret = -EFAULT;
-				break;
-			}
-			pos += len;
-		}
+		ret = copy_to_iter(io_data->buf, ret, &io_data->data);
+		if (iov_iter_count(&io_data->data))
+			ret = -EFAULT;
 		unuse_mm(io_data->mm);
 	}
 
@@ -677,7 +657,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
 
 	io_data->kiocb->private = NULL;
 	if (io_data->read)
-		kfree(io_data->iovec);
+		kfree(io_data->to_free);
 	kfree(io_data->buf);
 	kfree(io_data);
 }
@@ -736,6 +716,7 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data)
 		 * before the waiting completes, so do not assign to 'gadget' earlier
 		 */
 		struct usb_gadget *gadget = epfile->ffs->gadget;
+		size_t copied;
 
 		spin_lock_irq(&epfile->ffs->eps_lock);
 		/* In the meantime, endpoint got disabled or changed. */
@@ -743,34 +724,21 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data)
 			spin_unlock_irq(&epfile->ffs->eps_lock);
 			return -ESHUTDOWN;
 		}
+		data_len = iov_iter_count(&io_data->data);
 		/*
 		 * Controller may require buffer size to be aligned to
 		 * maxpacketsize of an out endpoint.
 		 */
-		data_len = io_data->read ?
-			   usb_ep_align_maybe(gadget, ep->ep, io_data->len) :
-			   io_data->len;
+		if (io_data->read)
+			data_len = usb_ep_align_maybe(gadget, ep->ep, data_len);
 		spin_unlock_irq(&epfile->ffs->eps_lock);
 
 		data = kmalloc(data_len, GFP_KERNEL);
 		if (unlikely(!data))
 			return -ENOMEM;
-		if (io_data->aio && !io_data->read) {
-			int i;
-			size_t pos = 0;
-			for (i = 0; i < io_data->nr_segs; i++) {
-				if (unlikely(copy_from_user(&data[pos],
-					     io_data->iovec[i].iov_base,
-					     io_data->iovec[i].iov_len))) {
-					ret = -EFAULT;
-					goto error;
-				}
-				pos += io_data->iovec[i].iov_len;
-			}
-		} else {
-			if (!io_data->read &&
-			    unlikely(__copy_from_user(data, io_data->buf,
-						      io_data->len))) {
+		if (!io_data->read) {
+			copied = copy_from_iter(data, data_len, &io_data->data);
+			if (copied != data_len) {
 				ret = -EFAULT;
 				goto error;
 			}
@@ -868,10 +836,8 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data)
 				 */
 				ret = ep->status;
 				if (io_data->read && ret > 0) {
-					ret = min_t(size_t, ret, io_data->len);
-
-					if (unlikely(copy_to_user(io_data->buf,
-						data, ret)))
+					ret = copy_to_iter(data, ret, &io_data->data);
+					if (unlikely(iov_iter_count(&io_data->data)))
 						ret = -EFAULT;
 				}
 			}
@@ -895,13 +861,13 @@ ffs_epfile_write(struct file *file, const char __user *buf, size_t len,
 		 loff_t *ptr)
 {
 	struct ffs_io_data io_data;
+	struct iovec iov = {.iov_base = buf, .iov_len = len};
 
 	ENTER();
 
 	io_data.aio = false;
 	io_data.read = false;
-	io_data.buf = (char * __user)buf;
-	io_data.len = len;
+	iov_iter_init(&io_data.data, WRITE, &iov, 1, len);
 
 	return ffs_epfile_io(file, &io_data);
 }
@@ -910,13 +876,14 @@ static ssize_t
 ffs_epfile_read(struct file *file, char __user *buf, size_t len, loff_t *ptr)
 {
 	struct ffs_io_data io_data;
+	struct iovec iov = {.iov_base = buf, .iov_len = len};
 
 	ENTER();
 
 	io_data.aio = false;
 	io_data.read = true;
-	io_data.buf = buf;
-	io_data.len = len;
+	io_data.to_free = NULL;
+	iov_iter_init(&io_data.data, READ, &iov, 1, len);
 
 	return ffs_epfile_io(file, &io_data);
 }
@@ -973,9 +940,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
 	io_data->aio = true;
 	io_data->read = false;
 	io_data->kiocb = kiocb;
-	io_data->iovec = iovec;
-	io_data->nr_segs = nr_segs;
-	io_data->len = kiocb->ki_nbytes;
+	iov_iter_init(&io_data->data, WRITE, iovec, nr_segs, kiocb->ki_nbytes);
 	io_data->mm = current->mm;
 
 	kiocb->private = io_data;
@@ -1013,9 +978,8 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
 	io_data->aio = true;
 	io_data->read = true;
 	io_data->kiocb = kiocb;
-	io_data->iovec = iovec_copy;
-	io_data->nr_segs = nr_segs;
-	io_data->len = kiocb->ki_nbytes;
+	io_data->to_free = iovec_copy;
+	iov_iter_init(&io_data->data, READ, iovec_copy, nr_segs, kiocb->ki_nbytes);
 	io_data->mm = current->mm;
 
 	kiocb->private = io_data;
@@ -1024,8 +988,8 @@ static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
 
 	res = ffs_epfile_io(kiocb->ki_filp, io_data);
 	if (res != -EIOCBQUEUED) {
+		kfree(io_data->to_free);
 		kfree(io_data);
-		kfree(iovec_copy);
 	}
 	return res;
 }
-- 
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 4/6] gadget/function/f_fs.c: switch to ->{read,write}_iter()
  2015-02-07  5:44                           ` Al Viro
                                               ` (2 preceding siblings ...)
  2015-02-07  5:48                             ` [PATCH 3/6] gadget/function/f_fs.c: use put iov_iter into io_data Al Viro
@ 2015-02-07  5:48                             ` Al Viro
  2015-02-07  5:48                             ` [PATCH 5/6] gadgetfs: use-after-free in ->aio_read() Al Viro
  2015-02-07  5:48                             ` [PATCH 6/6] gadget: switch ep_io_operations to ->read_iter/->write_iter Al Viro
  5 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:48 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/gadget/function/f_fs.c | 136 ++++++++++++++++---------------------
 1 file changed, 58 insertions(+), 78 deletions(-)

diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 11704e7..04c6542 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -856,38 +856,6 @@ error:
 	return ret;
 }
 
-static ssize_t
-ffs_epfile_write(struct file *file, const char __user *buf, size_t len,
-		 loff_t *ptr)
-{
-	struct ffs_io_data io_data;
-	struct iovec iov = {.iov_base = buf, .iov_len = len};
-
-	ENTER();
-
-	io_data.aio = false;
-	io_data.read = false;
-	iov_iter_init(&io_data.data, WRITE, &iov, 1, len);
-
-	return ffs_epfile_io(file, &io_data);
-}
-
-static ssize_t
-ffs_epfile_read(struct file *file, char __user *buf, size_t len, loff_t *ptr)
-{
-	struct ffs_io_data io_data;
-	struct iovec iov = {.iov_base = buf, .iov_len = len};
-
-	ENTER();
-
-	io_data.aio = false;
-	io_data.read = true;
-	io_data.to_free = NULL;
-	iov_iter_init(&io_data.data, READ, &iov, 1, len);
-
-	return ffs_epfile_io(file, &io_data);
-}
-
 static int
 ffs_epfile_open(struct inode *inode, struct file *file)
 {
@@ -924,72 +892,84 @@ static int ffs_aio_cancel(struct kiocb *kiocb)
 	return value;
 }
 
-static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
-				    const struct iovec *iovec,
-				    unsigned long nr_segs, loff_t loff)
+static ssize_t ffs_epfile_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 {
-	struct ffs_io_data *io_data;
+	struct ffs_io_data io_data, *p = &io_data;
 	ssize_t res;
 
 	ENTER();
 
-	io_data = kmalloc(sizeof(*io_data), GFP_KERNEL);
-	if (unlikely(!io_data))
-		return -ENOMEM;
+	if (!is_sync_kiocb(kiocb)) {
+		p = kmalloc(sizeof(io_data), GFP_KERNEL);
+		if (unlikely(!p))
+			return -ENOMEM;
+		p->aio = true;
+	} else {
+		p->aio = false;
+	}
 
-	io_data->aio = true;
-	io_data->read = false;
-	io_data->kiocb = kiocb;
-	iov_iter_init(&io_data->data, WRITE, iovec, nr_segs, kiocb->ki_nbytes);
-	io_data->mm = current->mm;
+	p->read = false;
+	p->kiocb = kiocb;
+	p->data = *from;
+	p->mm = current->mm;
 
-	kiocb->private = io_data;
+	kiocb->private = p;
 
 	kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
 
-	res = ffs_epfile_io(kiocb->ki_filp, io_data);
-	if (res != -EIOCBQUEUED)
-		kfree(io_data);
+	res = ffs_epfile_io(kiocb->ki_filp, p);
+	if (res == -EIOCBQUEUED)
+		return res;
+	if (p->aio)
+		kfree(p);
+	else
+		*from = p->data;
 	return res;
 }
 
-static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
-				   const struct iovec *iovec,
-				   unsigned long nr_segs, loff_t loff)
+static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to)
 {
-	struct ffs_io_data *io_data;
-	struct iovec *iovec_copy;
+	struct ffs_io_data io_data, *p = &io_data;
 	ssize_t res;
 
 	ENTER();
 
-	iovec_copy = kmalloc_array(nr_segs, sizeof(*iovec_copy), GFP_KERNEL);
-	if (unlikely(!iovec_copy))
-		return -ENOMEM;
-
-	memcpy(iovec_copy, iovec, sizeof(struct iovec)*nr_segs);
-
-	io_data = kmalloc(sizeof(*io_data), GFP_KERNEL);
-	if (unlikely(!io_data)) {
-		kfree(iovec_copy);
-		return -ENOMEM;
+	if (!is_sync_kiocb(kiocb)) {
+		p = kmalloc(sizeof(io_data), GFP_KERNEL);
+		if (unlikely(!p))
+			return -ENOMEM;
+		p->aio = true;
+	} else {
+		p->aio = false;
 	}
 
-	io_data->aio = true;
-	io_data->read = true;
-	io_data->kiocb = kiocb;
-	io_data->to_free = iovec_copy;
-	iov_iter_init(&io_data->data, READ, iovec_copy, nr_segs, kiocb->ki_nbytes);
-	io_data->mm = current->mm;
+	p->read = true;
+	p->kiocb = kiocb;
+	if (p->aio) {
+		p->to_free = dup_iter(&p->data, to, GFP_KERNEL);
+		if (!p->to_free) {
+			kfree(p);
+			return -ENOMEM;
+		}
+	} else {
+		p->data = *to;
+		p->to_free = NULL;
+	}
+	p->mm = current->mm;
 
-	kiocb->private = io_data;
+	kiocb->private = p;
 
 	kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
 
-	res = ffs_epfile_io(kiocb->ki_filp, io_data);
-	if (res != -EIOCBQUEUED) {
-		kfree(io_data->to_free);
-		kfree(io_data);
+	res = ffs_epfile_io(kiocb->ki_filp, p);
+	if (res == -EIOCBQUEUED)
+		return res;
+
+	if (p->aio) {
+		kfree(p->to_free);
+		kfree(p);
+	} else {
+		*to = p->data;
 	}
 	return res;
 }
@@ -1071,10 +1051,10 @@ static const struct file_operations ffs_epfile_operations = {
 	.llseek =	no_llseek,
 
 	.open =		ffs_epfile_open,
-	.write =	ffs_epfile_write,
-	.read =		ffs_epfile_read,
-	.aio_write =	ffs_epfile_aio_write,
-	.aio_read =	ffs_epfile_aio_read,
+	.write =	new_sync_write,
+	.read =		new_sync_read,
+	.write_iter =	ffs_epfile_write_iter,
+	.read_iter =	ffs_epfile_read_iter,
 	.release =	ffs_epfile_release,
 	.unlocked_ioctl =	ffs_epfile_ioctl,
 };
-- 
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 5/6] gadgetfs: use-after-free in ->aio_read()
  2015-02-07  5:44                           ` Al Viro
                                               ` (3 preceding siblings ...)
  2015-02-07  5:48                             ` [PATCH 4/6] gadget/function/f_fs.c: switch to ->{read,write}_iter() Al Viro
@ 2015-02-07  5:48                             ` Al Viro
  2015-02-07  5:48                             ` [PATCH 6/6] gadget: switch ep_io_operations to ->read_iter/->write_iter Al Viro
  5 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:48 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb, stable

From: Al Viro <viro@zeniv.linux.org.uk>

AIO_PREAD requests call ->aio_read() with iovec on caller's stack, so if
we are going to access it asynchronously, we'd better get ourselves
a copy - the one on kernel stack of aio_run_iocb() won't be there
anymore.  function/f_fs.c take care of doing that, legacy/inode.c
doesn't...

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/gadget/legacy/inode.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index db49ec4..9fbbaa0 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -566,7 +566,6 @@ static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
 		if (total == 0)
 			break;
 	}
-
 	return len;
 }
 
@@ -585,6 +584,7 @@ static void ep_user_copy_worker(struct work_struct *work)
 	aio_complete(iocb, ret, ret);
 
 	kfree(priv->buf);
+	kfree(priv->iv);
 	kfree(priv);
 }
 
@@ -605,6 +605,7 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 	 */
 	if (priv->iv == NULL || unlikely(req->actual == 0)) {
 		kfree(req->buf);
+		kfree(priv->iv);
 		kfree(priv);
 		iocb->private = NULL;
 		/* aio_complete() reports bytes-transferred _and_ faults */
@@ -640,7 +641,7 @@ ep_aio_rwtail(
 	struct usb_request	*req;
 	ssize_t			value;
 
-	priv = kmalloc(sizeof *priv, GFP_KERNEL);
+	priv = kzalloc(sizeof *priv, GFP_KERNEL);
 	if (!priv) {
 		value = -ENOMEM;
 fail:
@@ -649,7 +650,14 @@ fail:
 	}
 	iocb->private = priv;
 	priv->iocb = iocb;
-	priv->iv = iv;
+	if (iv) {
+		priv->iv = kmemdup(iv, nr_segs * sizeof(struct iovec),
+				   GFP_KERNEL);
+		if (!priv->iv) {
+			kfree(priv);
+			goto fail;
+		}
+	}
 	priv->nr_segs = nr_segs;
 	INIT_WORK(&priv->work, ep_user_copy_worker);
 
@@ -689,6 +697,7 @@ fail:
 	mutex_unlock(&epdata->lock);
 
 	if (unlikely(value)) {
+		kfree(priv->iv);
 		kfree(priv);
 		put_ep(epdata);
 	} else
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH 6/6] gadget: switch ep_io_operations to ->read_iter/->write_iter
  2015-02-07  5:44                           ` Al Viro
                                               ` (4 preceding siblings ...)
  2015-02-07  5:48                             ` [PATCH 5/6] gadgetfs: use-after-free in ->aio_read() Al Viro
@ 2015-02-07  5:48                             ` Al Viro
  5 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-07  5:48 UTC (permalink / raw)
  To: Alan Stern
  Cc: Christoph Hellwig, Miklos Szeredi, linux-aio, linux-fsdevel,
	Felipe Balbi, linux-usb

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/gadget/legacy/inode.c | 355 +++++++++++++++-----------------------
 1 file changed, 141 insertions(+), 214 deletions(-)

diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 9fbbaa0..b825edc 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -363,97 +363,6 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
 	return value;
 }
 
-
-/* handle a synchronous OUT bulk/intr/iso transfer */
-static ssize_t
-ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr)
-{
-	struct ep_data		*data = fd->private_data;
-	void			*kbuf;
-	ssize_t			value;
-
-	if ((value = get_ready_ep (fd->f_flags, data)) < 0)
-		return value;
-
-	/* halt any endpoint by doing a "wrong direction" i/o call */
-	if (usb_endpoint_dir_in(&data->desc)) {
-		if (usb_endpoint_xfer_isoc(&data->desc)) {
-			mutex_unlock(&data->lock);
-			return -EINVAL;
-		}
-		DBG (data->dev, "%s halt\n", data->name);
-		spin_lock_irq (&data->dev->lock);
-		if (likely (data->ep != NULL))
-			usb_ep_set_halt (data->ep);
-		spin_unlock_irq (&data->dev->lock);
-		mutex_unlock(&data->lock);
-		return -EBADMSG;
-	}
-
-	/* FIXME readahead for O_NONBLOCK and poll(); careful with ZLPs */
-
-	value = -ENOMEM;
-	kbuf = kmalloc (len, GFP_KERNEL);
-	if (unlikely (!kbuf))
-		goto free1;
-
-	value = ep_io (data, kbuf, len);
-	VDEBUG (data->dev, "%s read %zu OUT, status %d\n",
-		data->name, len, (int) value);
-	if (value >= 0 && copy_to_user (buf, kbuf, value))
-		value = -EFAULT;
-
-free1:
-	mutex_unlock(&data->lock);
-	kfree (kbuf);
-	return value;
-}
-
-/* handle a synchronous IN bulk/intr/iso transfer */
-static ssize_t
-ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
-{
-	struct ep_data		*data = fd->private_data;
-	void			*kbuf;
-	ssize_t			value;
-
-	if ((value = get_ready_ep (fd->f_flags, data)) < 0)
-		return value;
-
-	/* halt any endpoint by doing a "wrong direction" i/o call */
-	if (!usb_endpoint_dir_in(&data->desc)) {
-		if (usb_endpoint_xfer_isoc(&data->desc)) {
-			mutex_unlock(&data->lock);
-			return -EINVAL;
-		}
-		DBG (data->dev, "%s halt\n", data->name);
-		spin_lock_irq (&data->dev->lock);
-		if (likely (data->ep != NULL))
-			usb_ep_set_halt (data->ep);
-		spin_unlock_irq (&data->dev->lock);
-		mutex_unlock(&data->lock);
-		return -EBADMSG;
-	}
-
-	/* FIXME writebehind for O_NONBLOCK and poll(), qlen = 1 */
-
-	value = -ENOMEM;
-	kbuf = memdup_user(buf, len);
-	if (IS_ERR(kbuf)) {
-		value = PTR_ERR(kbuf);
-		kbuf = NULL;
-		goto free1;
-	}
-
-	value = ep_io (data, kbuf, len);
-	VDEBUG (data->dev, "%s write %zu IN, status %d\n",
-		data->name, len, (int) value);
-free1:
-	mutex_unlock(&data->lock);
-	kfree (kbuf);
-	return value;
-}
-
 static int
 ep_release (struct inode *inode, struct file *fd)
 {
@@ -517,8 +426,8 @@ struct kiocb_priv {
 	struct mm_struct	*mm;
 	struct work_struct	work;
 	void			*buf;
-	const struct iovec	*iv;
-	unsigned long		nr_segs;
+	struct iov_iter		to;
+	const void		*to_free;
 	unsigned		actual;
 };
 
@@ -541,34 +450,6 @@ static int ep_aio_cancel(struct kiocb *iocb)
 	return value;
 }
 
-static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
-{
-	ssize_t			len, total;
-	void			*to_copy;
-	int			i;
-
-	/* copy stuff into user buffers */
-	total = priv->actual;
-	len = 0;
-	to_copy = priv->buf;
-	for (i=0; i < priv->nr_segs; i++) {
-		ssize_t this = min((ssize_t)(priv->iv[i].iov_len), total);
-
-		if (copy_to_user(priv->iv[i].iov_base, to_copy, this)) {
-			if (len == 0)
-				len = -EFAULT;
-			break;
-		}
-
-		total -= this;
-		len += this;
-		to_copy += this;
-		if (total == 0)
-			break;
-	}
-	return len;
-}
-
 static void ep_user_copy_worker(struct work_struct *work)
 {
 	struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
@@ -577,14 +458,16 @@ static void ep_user_copy_worker(struct work_struct *work)
 	size_t ret;
 
 	use_mm(mm);
-	ret = ep_copy_to_user(priv);
+	ret = copy_to_iter(priv->buf, priv->actual, &priv->to);
 	unuse_mm(mm);
+	if (!ret)
+		ret = -EFAULT;
 
 	/* completing the iocb can drop the ctx and mm, don't touch mm after */
 	aio_complete(iocb, ret, ret);
 
 	kfree(priv->buf);
-	kfree(priv->iv);
+	kfree(priv->to_free);
 	kfree(priv);
 }
 
@@ -603,9 +486,9 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 	 * don't need to copy anything to userspace, so we can
 	 * complete the aio request immediately.
 	 */
-	if (priv->iv == NULL || unlikely(req->actual == 0)) {
+	if (priv->to_free == NULL || unlikely(req->actual == 0)) {
 		kfree(req->buf);
-		kfree(priv->iv);
+		kfree(priv->to_free);
 		kfree(priv);
 		iocb->private = NULL;
 		/* aio_complete() reports bytes-transferred _and_ faults */
@@ -619,6 +502,7 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 
 		priv->buf = req->buf;
 		priv->actual = req->actual;
+		INIT_WORK(&priv->work, ep_user_copy_worker);
 		schedule_work(&priv->work);
 	}
 	spin_unlock(&epdata->dev->lock);
@@ -627,45 +511,17 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 	put_ep(epdata);
 }
 
-static ssize_t
-ep_aio_rwtail(
-	struct kiocb	*iocb,
-	char		*buf,
-	size_t		len,
-	struct ep_data	*epdata,
-	const struct iovec *iv,
-	unsigned long	nr_segs
-)
+static ssize_t ep_aio(struct kiocb *iocb,
+		      struct kiocb_priv *priv,
+		      struct ep_data *epdata,
+		      char *buf,
+		      size_t len)
 {
-	struct kiocb_priv	*priv;
-	struct usb_request	*req;
-	ssize_t			value;
+	struct usb_request *req;
+	ssize_t value;
 
-	priv = kzalloc(sizeof *priv, GFP_KERNEL);
-	if (!priv) {
-		value = -ENOMEM;
-fail:
-		kfree(buf);
-		return value;
-	}
 	iocb->private = priv;
 	priv->iocb = iocb;
-	if (iv) {
-		priv->iv = kmemdup(iv, nr_segs * sizeof(struct iovec),
-				   GFP_KERNEL);
-		if (!priv->iv) {
-			kfree(priv);
-			goto fail;
-		}
-	}
-	priv->nr_segs = nr_segs;
-	INIT_WORK(&priv->work, ep_user_copy_worker);
-
-	value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
-	if (unlikely(value < 0)) {
-		kfree(priv);
-		goto fail;
-	}
 
 	kiocb_set_cancel_fn(iocb, ep_aio_cancel);
 	get_ep(epdata);
@@ -677,76 +533,147 @@ fail:
 	 * allocate or submit those if the host disconnected.
 	 */
 	spin_lock_irq(&epdata->dev->lock);
-	if (likely(epdata->ep)) {
-		req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC);
-		if (likely(req)) {
-			priv->req = req;
-			req->buf = buf;
-			req->length = len;
-			req->complete = ep_aio_complete;
-			req->context = iocb;
-			value = usb_ep_queue(epdata->ep, req, GFP_ATOMIC);
-			if (unlikely(0 != value))
-				usb_ep_free_request(epdata->ep, req);
-		} else
-			value = -EAGAIN;
-	} else
-		value = -ENODEV;
-	spin_unlock_irq(&epdata->dev->lock);
+	value = -ENODEV;
+	if (unlikely(epdata->ep))
+		goto fail;
 
-	mutex_unlock(&epdata->lock);
+	req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC);
+	value = -ENOMEM;
+	if (unlikely(!req))
+		goto fail;
 
-	if (unlikely(value)) {
-		kfree(priv->iv);
-		kfree(priv);
-		put_ep(epdata);
-	} else
-		value = -EIOCBQUEUED;
+	priv->req = req;
+	req->buf = buf;
+	req->length = len;
+	req->complete = ep_aio_complete;
+	req->context = iocb;
+	value = usb_ep_queue(epdata->ep, req, GFP_ATOMIC);
+	if (unlikely(0 != value)) {
+		usb_ep_free_request(epdata->ep, req);
+		goto fail;
+	}
+	spin_unlock_irq(&epdata->dev->lock);
+	return -EIOCBQUEUED;
+
+fail:
+	spin_unlock_irq(&epdata->dev->lock);
+	kfree(priv->to_free);
+	kfree(priv);
+	put_ep(epdata);
 	return value;
 }
 
 static ssize_t
-ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t o)
+ep_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-	struct ep_data		*epdata = iocb->ki_filp->private_data;
-	char			*buf;
+	struct file *file = iocb->ki_filp;
+	struct ep_data *epdata = file->private_data;
+	size_t len = iov_iter_count(to);
+	ssize_t value;
+	char *buf;
 
-	if (unlikely(usb_endpoint_dir_in(&epdata->desc)))
-		return -EINVAL;
+	if ((value = get_ready_ep(file->f_flags, epdata)) < 0)
+		return value;
 
-	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
-	if (unlikely(!buf))
-		return -ENOMEM;
+	/* halt any endpoint by doing a "wrong direction" i/o call */
+	if (usb_endpoint_dir_in(&epdata->desc)) {
+		if (usb_endpoint_xfer_isoc(&epdata->desc) ||
+		    !is_sync_kiocb(iocb)) {
+			mutex_unlock(&epdata->lock);
+			return -EINVAL;
+		}
+		DBG (epdata->dev, "%s halt\n", epdata->name);
+		spin_lock_irq(&epdata->dev->lock);
+		if (likely(epdata->ep != NULL))
+			usb_ep_set_halt(epdata->ep);
+		spin_unlock_irq(&epdata->dev->lock);
+		mutex_unlock(&epdata->lock);
+		return -EBADMSG;
+	}
 
-	return ep_aio_rwtail(iocb, buf, iocb->ki_nbytes, epdata, iov, nr_segs);
+	buf = kmalloc(len, GFP_KERNEL);
+	if (unlikely(!buf)) {
+		mutex_unlock(&epdata->lock);
+		return -ENOMEM;
+	}
+	if (is_sync_kiocb(iocb)) {
+		value = ep_io(epdata, buf, len);
+		if (value >= 0 && copy_to_iter(buf, value, to))
+			value = -EFAULT;
+	} else {
+		struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL);
+		value = -ENOMEM;
+		if (!priv)
+			goto fail;
+		priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL);
+		if (!priv->to_free) {
+			kfree(priv);
+			goto fail;
+		}
+		value = ep_aio(iocb, priv, epdata, buf, len);
+		if (value == -EIOCBQUEUED)
+			buf = NULL;
+	}
+fail:
+	kfree(buf);
+	mutex_unlock(&epdata->lock);
+	return value;
 }
 
 static ssize_t
-ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t o)
+ep_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct ep_data		*epdata = iocb->ki_filp->private_data;
-	char			*buf;
-	size_t			len = 0;
-	int			i = 0;
+	struct file *file = iocb->ki_filp;
+	struct ep_data *epdata = file->private_data;
+	size_t len = iov_iter_count(from);
+	ssize_t value;
+	char *buf;
 
-	if (unlikely(!usb_endpoint_dir_in(&epdata->desc)))
-		return -EINVAL;
+	if ((value = get_ready_ep(file->f_flags, epdata)) < 0)
+		return value;
 
-	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
-	if (unlikely(!buf))
+	/* halt any endpoint by doing a "wrong direction" i/o call */
+	if (!usb_endpoint_dir_in(&epdata->desc)) {
+		if (usb_endpoint_xfer_isoc(&epdata->desc) ||
+		    !is_sync_kiocb(iocb)) {
+			mutex_unlock(&epdata->lock);
+			return -EINVAL;
+		}
+		DBG (epdata->dev, "%s halt\n", epdata->name);
+		spin_lock_irq(&epdata->dev->lock);
+		if (likely(epdata->ep != NULL))
+			usb_ep_set_halt(epdata->ep);
+		spin_unlock_irq(&epdata->dev->lock);
+		mutex_unlock(&epdata->lock);
+		return -EBADMSG;
+	}
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (unlikely(!buf)) {
+		mutex_unlock(&epdata->lock);
 		return -ENOMEM;
+	}
 
-	for (i=0; i < nr_segs; i++) {
-		if (unlikely(copy_from_user(&buf[len], iov[i].iov_base,
-				iov[i].iov_len) != 0)) {
-			kfree(buf);
-			return -EFAULT;
+	if (unlikely(copy_from_iter(buf, len, from) != len)) {
+		value = -EFAULT;
+		goto out;
+	}
+
+	if (is_sync_kiocb(iocb)) {
+		value = ep_io(epdata, buf, len);
+	} else {
+		struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL);
+		value = -ENOMEM;
+		if (priv) {
+			value = ep_aio(iocb, priv, epdata, buf, len);
+			if (value == -EIOCBQUEUED)
+				buf = NULL;
 		}
-		len += iov[i].iov_len;
 	}
-	return ep_aio_rwtail(iocb, buf, len, epdata, NULL, 0);
+out:
+	kfree(buf);
+	mutex_unlock(&epdata->lock);
+	return value;
 }
 
 /*----------------------------------------------------------------------*/
@@ -756,13 +683,13 @@ static const struct file_operations ep_io_operations = {
 	.owner =	THIS_MODULE,
 	.llseek =	no_llseek,
 
-	.read =		ep_read,
-	.write =	ep_write,
+	.read =		new_sync_read,
+	.write =	new_sync_write,
 	.unlocked_ioctl = ep_ioctl,
 	.release =	ep_release,
 
-	.aio_read =	ep_aio_read,
-	.aio_write =	ep_aio_write,
+	.read_iter =	ep_read_iter,
+	.write_iter =	ep_write_iter,
 };
 
 /* ENDPOINT INITIALIZATION
-- 
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:20 ` Al Viro
  2015-02-23 21:22   ` Christoph Hellwig
  2015-02-23 21:23   ` Felipe Balbi
@ 2015-02-25 17:13   ` Felipe Balbi
  2 siblings, 0 replies; 45+ messages in thread
From: Felipe Balbi @ 2015-02-25 17:13 UTC (permalink / raw)
  To: Al Viro, David Cohen
  Cc: Christoph Hellwig, Maxim Patlasov, Robert Baldyga,
	Michal Nazarewicz, Felipe Balbi, linux-aio, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1744 bytes --]

Hi,

On Mon, Feb 23, 2015 at 09:20:59PM +0000, Al Viro wrote:
> On Mon, Feb 23, 2015 at 10:00:24AM -0800, Christoph Hellwig wrote:
> > This series cuts down the amount of fiels in the public iocb that is
> > allocated on stack for every synchronous I/O, both by removing fields
> > from it, and by adding a aio-specific iocb that is only allocated
> > for aio requests.
> > 
> > Additionally it cleans up various corner cases in the aio completion
> > code and allowes for adding a simple in-kernel async read/write
> > interface.
> > 
> > The first few patches are from Al's gadget branch and reposted
> > here because they are needed for the rest of the series.
> 
> FWIW, I would really like to hear from USB folks concerning those patches
> (gadgetfs ones, that is).  I don't have any way to test them beyond "does
> it compile" - no hardware that could run Linux and act as USB slave and
> no idea if there are any sane emulator setups (e.g. qemu doesn't seem to
> emulate anything drivers/usb/gadget/udc/* stuff would understand).
> 
> I'm not happy about the idea of having it merged into vfs.git#for-next
> with zero testing and no comments from the people actually using the
> drivers in question, _especially_ if it becomes a never-rebase branch
> used as prereq for other development.
> 
> Now that the merge window is closed, could USB folks review and comment
> on the stuff in
> git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git gadget
> ?
> 
> PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...

I very much like those patches.

David, any chance you can test the branch above on your adb setup just
to make sure we're not breaking anything ?

cheers

-- 
balbi

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:39     ` Al Viro
@ 2015-02-24  3:47       ` Al Viro
  0 siblings, 0 replies; 45+ messages in thread
From: Al Viro @ 2015-02-24  3:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Maxim Patlasov, Robert Baldyga, Michal Nazarewicz, Felipe Balbi,
	linux-aio, linux-fsdevel

On Mon, Feb 23, 2015 at 09:39:47PM +0000, Al Viro wrote:
> On Mon, Feb 23, 2015 at 10:22:24PM +0100, Christoph Hellwig wrote:
> > On Mon, Feb 23, 2015 at 09:20:59PM +0000, Al Viro wrote:
> > > PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...
> > 
> > Btw, any chance to sneak dup_iter to Linus for this release?  I ended
> > up adding an opencoded version to the block layer for this merge window,
> > and I'd prefer to get these consolidated ASAP.
> 
> Hmm...  Let's wait for USB folks to review those fixes - dup_iter() is
> helper needed for a fix of easily triggered use-after-free bug, after
> all, and beginning of the cycle is obviously OK for such.

While we are at it - once the situation with USB patches gets resolved,
I'm going to put that into vfs.git.  If nothing else, O_DIRECT, O_APPEND and
probably O_NDELAY as well, need to be mirrored into ->ki_flags, to deal with
races on flipping those suckers during ->{read,write}_iter().  That, and
followup patches to affected filesystems out to go through vfs.git.

Another thing is that rw_copy_check_uvector() should be split in two helpers -
one to have iov_iter_init() folded into it, another being equivalent to
passing CHECK_IOVEC_ONLY (sg_io() and foreign address space treatment in
process_vm_readv()).  Also vfs.git fodder, obviously, and should be on
top of that stuff.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:42     ` Al Viro
@ 2015-02-23 21:50       ` Felipe Balbi
  0 siblings, 0 replies; 45+ messages in thread
From: Felipe Balbi @ 2015-02-23 21:50 UTC (permalink / raw)
  To: Al Viro
  Cc: Felipe Balbi, Christoph Hellwig, Maxim Patlasov, Robert Baldyga,
	Michal Nazarewicz, linux-aio, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2141 bytes --]

Hi,

On Mon, Feb 23, 2015 at 09:42:13PM +0000, Al Viro wrote:
> On Mon, Feb 23, 2015 at 03:23:43PM -0600, Felipe Balbi wrote:
> > > FWIW, I would really like to hear from USB folks concerning those patches
> > > (gadgetfs ones, that is).  I don't have any way to test them beyond "does
> > > it compile" - no hardware that could run Linux and act as USB slave and
> > > no idea if there are any sane emulator setups (e.g. qemu doesn't seem to
> > > emulate anything drivers/usb/gadget/udc/* stuff would understand).
> > > 
> > > I'm not happy about the idea of having it merged into vfs.git#for-next
> > > with zero testing and no comments from the people actually using the
> > > drivers in question, _especially_ if it becomes a never-rebase branch
> > > used as prereq for other development.
> > > 
> > > Now that the merge window is closed, could USB folks review and comment
> > > on the stuff in
> > > git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git gadget
> > > ?
> > > 
> > > PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...
> > 
> > give me a couple days, quite busy myself ;-)
> 
> Sure, no problem...  BTW, how does one normally test in that area?  IOW,
> which emulators and/or real hardware that could be found on e.g. ebay can
> handle the current kernels?

well, you can use dummy_hcd, for example. It's completely done in SW.

If you want HW, there are quite a few you can use: beaglebone black is
cheap (but MUSB is a terrible controller), OMAP5 uEVM ain't that cheap,
but a better controller, BeagleBoard X15 is also good, but not that
cheap either. There's also gumstix overo which is the same controller as
Beaglebone Black.

AM437x SK (which is not yet available for purchase unfortunately), has
a good USB controller, works just fine with mainline (since 3.17) and
it's one of the boards I use daily. Unfortunately, not yet released for
public consumption :-(

There are other boards like raspberry pi (*real* cheap), or minnowboard
but I don't have those and I can't really say whether they "Just
Work(TM)" with upstream.

cheers

-- 
balbi

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:23   ` Felipe Balbi
@ 2015-02-23 21:42     ` Al Viro
  2015-02-23 21:50       ` Felipe Balbi
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-23 21:42 UTC (permalink / raw)
  To: Felipe Balbi
  Cc: Christoph Hellwig, Maxim Patlasov, Robert Baldyga,
	Michal Nazarewicz, linux-aio, linux-fsdevel

On Mon, Feb 23, 2015 at 03:23:43PM -0600, Felipe Balbi wrote:
> > FWIW, I would really like to hear from USB folks concerning those patches
> > (gadgetfs ones, that is).  I don't have any way to test them beyond "does
> > it compile" - no hardware that could run Linux and act as USB slave and
> > no idea if there are any sane emulator setups (e.g. qemu doesn't seem to
> > emulate anything drivers/usb/gadget/udc/* stuff would understand).
> > 
> > I'm not happy about the idea of having it merged into vfs.git#for-next
> > with zero testing and no comments from the people actually using the
> > drivers in question, _especially_ if it becomes a never-rebase branch
> > used as prereq for other development.
> > 
> > Now that the merge window is closed, could USB folks review and comment
> > on the stuff in
> > git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git gadget
> > ?
> > 
> > PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...
> 
> give me a couple days, quite busy myself ;-)

Sure, no problem...  BTW, how does one normally test in that area?  IOW,
which emulators and/or real hardware that could be found on e.g. ebay can
handle the current kernels?

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:22   ` Christoph Hellwig
@ 2015-02-23 21:39     ` Al Viro
  2015-02-24  3:47       ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Al Viro @ 2015-02-23 21:39 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Maxim Patlasov, Robert Baldyga, Michal Nazarewicz, Felipe Balbi,
	linux-aio, linux-fsdevel

On Mon, Feb 23, 2015 at 10:22:24PM +0100, Christoph Hellwig wrote:
> On Mon, Feb 23, 2015 at 09:20:59PM +0000, Al Viro wrote:
> > PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...
> 
> Btw, any chance to sneak dup_iter to Linus for this release?  I ended
> up adding an opencoded version to the block layer for this merge window,
> and I'd prefer to get these consolidated ASAP.

Hmm...  Let's wait for USB folks to review those fixes - dup_iter() is
helper needed for a fix of easily triggered use-after-free bug, after
all, and beginning of the cycle is obviously OK for such.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:20 ` Al Viro
  2015-02-23 21:22   ` Christoph Hellwig
@ 2015-02-23 21:23   ` Felipe Balbi
  2015-02-23 21:42     ` Al Viro
  2015-02-25 17:13   ` Felipe Balbi
  2 siblings, 1 reply; 45+ messages in thread
From: Felipe Balbi @ 2015-02-23 21:23 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Maxim Patlasov, Robert Baldyga,
	Michal Nazarewicz, Felipe Balbi, linux-aio, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1629 bytes --]

Hi,

On Mon, Feb 23, 2015 at 09:20:59PM +0000, Al Viro wrote:
> On Mon, Feb 23, 2015 at 10:00:24AM -0800, Christoph Hellwig wrote:
> > This series cuts down the amount of fiels in the public iocb that is
> > allocated on stack for every synchronous I/O, both by removing fields
> > from it, and by adding a aio-specific iocb that is only allocated
> > for aio requests.
> > 
> > Additionally it cleans up various corner cases in the aio completion
> > code and allowes for adding a simple in-kernel async read/write
> > interface.
> > 
> > The first few patches are from Al's gadget branch and reposted
> > here because they are needed for the rest of the series.
> 
> FWIW, I would really like to hear from USB folks concerning those patches
> (gadgetfs ones, that is).  I don't have any way to test them beyond "does
> it compile" - no hardware that could run Linux and act as USB slave and
> no idea if there are any sane emulator setups (e.g. qemu doesn't seem to
> emulate anything drivers/usb/gadget/udc/* stuff would understand).
> 
> I'm not happy about the idea of having it merged into vfs.git#for-next
> with zero testing and no comments from the people actually using the
> drivers in question, _especially_ if it becomes a never-rebase branch
> used as prereq for other development.
> 
> Now that the merge window is closed, could USB folks review and comment
> on the stuff in
> git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git gadget
> ?
> 
> PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...

give me a couple days, quite busy myself ;-)

-- 
balbi

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 21:20 ` Al Viro
@ 2015-02-23 21:22   ` Christoph Hellwig
  2015-02-23 21:39     ` Al Viro
  2015-02-23 21:23   ` Felipe Balbi
  2015-02-25 17:13   ` Felipe Balbi
  2 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-02-23 21:22 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Maxim Patlasov, Robert Baldyga,
	Michal Nazarewicz, Felipe Balbi, linux-aio, linux-fsdevel

On Mon, Feb 23, 2015 at 09:20:59PM +0000, Al Viro wrote:
> PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...

Btw, any chance to sneak dup_iter to Linus for this release?  I ended
up adding an opencoded version to the block layer for this merge window,
and I'd prefer to get these consolidated ASAP.

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [RFC] split struct kiocb
  2015-02-23 18:00 [RFC] split struct kiocb Christoph Hellwig
@ 2015-02-23 21:20 ` Al Viro
  2015-02-23 21:22   ` Christoph Hellwig
                     ` (2 more replies)
  0 siblings, 3 replies; 45+ messages in thread
From: Al Viro @ 2015-02-23 21:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Maxim Patlasov, Robert Baldyga, Michal Nazarewicz, Felipe Balbi,
	linux-aio, linux-fsdevel

On Mon, Feb 23, 2015 at 10:00:24AM -0800, Christoph Hellwig wrote:
> This series cuts down the amount of fiels in the public iocb that is
> allocated on stack for every synchronous I/O, both by removing fields
> from it, and by adding a aio-specific iocb that is only allocated
> for aio requests.
> 
> Additionally it cleans up various corner cases in the aio completion
> code and allowes for adding a simple in-kernel async read/write
> interface.
> 
> The first few patches are from Al's gadget branch and reposted
> here because they are needed for the rest of the series.

FWIW, I would really like to hear from USB folks concerning those patches
(gadgetfs ones, that is).  I don't have any way to test them beyond "does
it compile" - no hardware that could run Linux and act as USB slave and
no idea if there are any sane emulator setups (e.g. qemu doesn't seem to
emulate anything drivers/usb/gadget/udc/* stuff would understand).

I'm not happy about the idea of having it merged into vfs.git#for-next
with zero testing and no comments from the people actually using the
drivers in question, _especially_ if it becomes a never-rebase branch
used as prereq for other development.

Now that the merge window is closed, could USB folks review and comment
on the stuff in
git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git gadget
?

PS: I would prefer to rebase #iov_iter and #gadget to -rc1, actually...

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [RFC] split struct kiocb
@ 2015-02-23 18:00 Christoph Hellwig
  2015-02-23 21:20 ` Al Viro
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Hellwig @ 2015-02-23 18:00 UTC (permalink / raw)
  To: Al Viro
  Cc: Maxim Patlasov, Robert Baldyga, Michal Nazarewicz, Felipe Balbi,
	linux-aio, linux-fsdevel

This series cuts down the amount of fiels in the public iocb that is
allocated on stack for every synchronous I/O, both by removing fields
from it, and by adding a aio-specific iocb that is only allocated
for aio requests.

Additionally it cleans up various corner cases in the aio completion
code and allowes for adding a simple in-kernel async read/write
interface.

The first few patches are from Al's gadget branch and reposted
here because they are needed for the rest of the series.


^ permalink raw reply	[flat|nested] 45+ messages in thread

end of thread, other threads:[~2015-02-25 17:13 UTC | newest]

Thread overview: 45+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-27 17:55 [RFC] split struct kiocb Christoph Hellwig
2015-01-27 17:55 ` [PATCH 1/5] fs: don't allow to complete sync iocbs through aio_complete Christoph Hellwig
2015-01-28 15:30   ` Miklos Szeredi
2015-01-28 16:57     ` Christoph Hellwig
2015-01-31  3:01       ` Maxim Patlasov
2015-01-27 17:55 ` [PATCH 2/5] fs: saner aio_complete prototype Christoph Hellwig
2015-01-31 10:04   ` Al Viro
2015-01-27 17:55 ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
2015-01-31  6:08   ` Al Viro
2015-02-02  8:07     ` Christoph Hellwig
2015-02-02  8:11       ` Al Viro
2015-02-02  8:14         ` Al Viro
2015-02-02 14:26           ` Christoph Hellwig
2015-02-04  8:34             ` Al Viro
2015-02-04 18:17               ` Alan Stern
2015-02-04 19:06                 ` Al Viro
2015-02-04 20:30                   ` Alan Stern
2015-02-04 23:07                     ` Al Viro
2015-02-05  8:24                       ` Robert Baldyga
2015-02-05  8:47                         ` Al Viro
2015-02-05  9:03                           ` Al Viro
2015-02-05  9:15                             ` Robert Baldyga
     [not found]                       ` <20150204230733.GK29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2015-02-05 15:29                         ` Alan Stern
2015-02-06  7:03                           ` Al Viro
     [not found]                             ` <20150206070350.GX29656-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2015-02-06  8:44                               ` Robert Baldyga
2015-02-07  5:44                           ` Al Viro
2015-02-07  5:48                             ` [PATCH 1/6] new helper: dup_iter() Al Viro
2015-02-07  5:48                             ` [PATCH 2/6] gadget/function/f_fs.c: close leaks Al Viro
2015-02-07  5:48                             ` [PATCH 3/6] gadget/function/f_fs.c: use put iov_iter into io_data Al Viro
2015-02-07  5:48                             ` [PATCH 4/6] gadget/function/f_fs.c: switch to ->{read,write}_iter() Al Viro
2015-02-07  5:48                             ` [PATCH 5/6] gadgetfs: use-after-free in ->aio_read() Al Viro
2015-02-07  5:48                             ` [PATCH 6/6] gadget: switch ep_io_operations to ->read_iter/->write_iter Al Viro
2015-02-02 14:20         ` [PATCH 3/5] fs: remove ki_nbytes Christoph Hellwig
2015-01-27 17:55 ` [PATCH 4/5] fs: split generic and aio kiocb Christoph Hellwig
2015-01-27 17:55 ` [PATCH 5/5] fs: add async read/write interfaces Christoph Hellwig
2015-01-31  6:29   ` Al Viro
2015-02-23 18:00 [RFC] split struct kiocb Christoph Hellwig
2015-02-23 21:20 ` Al Viro
2015-02-23 21:22   ` Christoph Hellwig
2015-02-23 21:39     ` Al Viro
2015-02-24  3:47       ` Al Viro
2015-02-23 21:23   ` Felipe Balbi
2015-02-23 21:42     ` Al Viro
2015-02-23 21:50       ` Felipe Balbi
2015-02-25 17:13   ` Felipe Balbi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.