* async-io API registration for 2.5.29
@ 2002-07-30  5:41 Andrea Arcangeli
  2002-07-30  8:11 ` Christoph Hellwig
  ` (3 more replies)
  0 siblings, 4 replies; 83+ messages in thread

From: Andrea Arcangeli @ 2002-07-30  5:41 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Benjamin LaHaise, linux-kernel, linux-aio

Hello,

this patch against 2.5.29 adds the async-io API from Ben's latest patch. I find the dynamic syscall approach in some vendor kernels out there that implement a /proc/libredhat unacceptable, since it's not forward compatible with 2.5:

@@ -636,6 +637,12 @@
 	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for fremovexattr */
 	.long SYMBOL_NAME(sys_tkill)

+	.rept __NR_sys_dynamic_syscall-(.-sys_call_table)/4
+	.long SYMBOL_NAME(sys_ni_syscall)
+	.endr
+	.long SYMBOL_NAME(sys_dynamic_syscall)
+	.long SYMBOL_NAME(sys_io_submit)
+
 	.rept NR_syscalls-(.-sys_call_table)/4
 	.long SYMBOL_NAME(sys_ni_syscall)
 	.endr
diff -urN v2.4.19-pre5/include/asm-i386/unistd.h linux.diff/include/asm-i386/unistd.h
--- v2.4.19-pre5/include/asm-i386/unistd.h	Wed Apr  3 21:04:38 2002
+++ linux.diff/include/asm-i386/unistd.h	Sat May 18 11:44:01 2002
@@ -245,6 +245,9 @@
 #define __NR_tkill		238

+#define __NR_sys_dynamic_syscall	250
+#define __NR_io_submit			251
+
 /* user-visible error numbers are in the range -1 - -124: see
  * <asm-i386/errno.h> */

To try not to execute random code they use a magic number chosen at compile time from /dev/urandom, so the probability of executing random code is low, but there's still a chance.
For io_submit I'm not even sure it's still using the magic (I guess checking the cookie payload was a showstopper performance hit; in some older patch the io_submit operation passed through the slowdown of the dynamic syscall, but in fact the new code does this:

+asmlinkage long vsys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+	long res;
+	__asm__ volatile ("int $0x80"
+		: "=a" (res)
+		: "0" (__NR_io_submit), "b" (ctx_id), "c" (nr),
+		  "d" (iocbpp));
+	return res;
+}

). So I would ask if you could merge the below interface into 2.5, so we can ship a real async-io with real syscalls in 2.4; there's not much time to change it, given this is already used in production userspace today. I prepared a patch against 2.5.29. Ben, I would appreciate it if you could review it and confirm you're fine with it too.

BTW, I'm not the author of the API, and personally I dislike the sys_io_submit approach; the worst part is the multiplexing of course:

+	if (IOCB_CMD_PREAD == tmp.aio_lio_opcode) {
+		op = file->f_op->aio_read;
+		if (unlikely(!(file->f_mode & FMODE_READ)))
+			goto out_put_req;
+	} else if (IOCB_CMD_PREADX == tmp.aio_lio_opcode) {
+		op = file->f_op->aio_readx;
+		if (unlikely(!(file->f_mode & FMODE_READ)))
+			goto out_put_req;
+	} else if (IOCB_CMD_PWRITE == tmp.aio_lio_opcode) {
+		op = file->f_op->aio_write;
+		if (unlikely(!(file->f_mode & FMODE_WRITE)))
+			goto out_put_req;
+	} else if (IOCB_CMD_FSYNC == tmp.aio_lio_opcode) {
+		op = file->f_op->aio_fsync;
+	} else if (IOCB_CMD_POLL == tmp.aio_lio_opcode) {
+		op = generic_aio_poll;
+	} else
+		op = NULL;

Instead of separate syscalls for the various async-io PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry point, and a parameter specifies the operation.
But this is what the current userspace expects, and I wouldn't have much time to change it anyway, because then I would also break all the userspace libs (I break them already because of the true syscalls instead of passing through the /proc/libredhat that calls into the dynamic syscall, but that's not too painful to adapt). And after all, even io_submit isn't too bad besides the above slowdown in the multiplexing (at least it shares some icache for the top/bottom of the functionality).

I checked that it still compiles fine on x86 (all other archs should keep compiling too). Also available from here:

	http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/2.5.29/aio-api-1

Comments are welcome, many thanks.

diff -urNp 2.5.29/arch/i386/kernel/entry.S aio-api-1/arch/i386/kernel/entry.S
--- 2.5.29/arch/i386/kernel/entry.S	Sat Jul 27 06:07:21 2002
+++ aio-api-1/arch/i386/kernel/entry.S	Tue Jul 30 05:23:46 2002
@@ -753,6 +753,12 @@ ENTRY(sys_call_table)
 	.long sys_sched_setaffinity
 	.long sys_sched_getaffinity
 	.long sys_set_thread_area
+	.long sys_io_setup
+	.long sys_io_destroy		/* 245 */
+	.long sys_io_submit
+	.long sys_io_cancel
+	.long sys_io_wait
+	.long sys_io_getevents

 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long sys_ni_syscall
diff -urNp 2.5.29/fs/Makefile aio-api-1/fs/Makefile
--- 2.5.29/fs/Makefile	Wed Jul 17 02:13:47 2002
+++ aio-api-1/fs/Makefile	Tue Jul 30 05:25:03 2002
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o f
 	namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
 	dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
 	filesystems.o namespace.o seq_file.o xattr.o libfs.o \
-	fs-writeback.o mpage.o direct-io.o
+	fs-writeback.o mpage.o direct-io.o aio.o

 ifneq ($(CONFIG_NFSD),n)
 ifneq ($(CONFIG_NFSD),)
diff -urNp 2.5.29/fs/aio.c aio-api-1/fs/aio.c
--- 2.5.29/fs/aio.c	Thu Jan  1 01:00:00 1970
+++ aio-api-1/fs/aio.c	Tue Jul 30 05:33:20 2002
@@ -0,0 +1,38 @@
+#include <linux/kernel.h>
+#include <linux/aio.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+
+asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_io_destroy(aio_context_t ctx)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_io_wait(aio_context_t ctx_id, struct iocb *iocb,
+			const struct timespec *timeout)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_io_getevents(aio_context_t ctx_id,
+			long nr,
+			struct io_event *events,
+			const struct timespec *timeout)
+{
+	return -ENOSYS;
+}
diff -urNp 2.5.29/include/asm-i386/unistd.h aio-api-1/include/asm-i386/unistd.h
--- 2.5.29/include/asm-i386/unistd.h	Sun Apr 14 22:09:06 2002
+++ aio-api-1/include/asm-i386/unistd.h	Tue Jul 30 05:22:38 2002
@@ -247,6 +247,13 @@
 #define __NR_futex		240
 #define __NR_sched_setaffinity	241
 #define __NR_sched_getaffinity	242
+#define __NR_set_thread_area	243
+#define __NR_io_setup		244
+#define __NR_io_destroy		245
+#define __NR_io_submit		246
+#define __NR_io_cancel		247
+#define __NR_io_wait		248
+#define __NR_io_getevents	249

 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -urNp 2.5.29/include/linux/aio.h aio-api-1/include/linux/aio.h
--- 2.5.29/include/linux/aio.h	Thu Jan  1 01:00:00 1970
+++ aio-api-1/include/linux/aio.h	Tue Jul 30 05:32:30 2002
@@ -0,0 +1,6 @@
+#ifndef __LINUX__AIO_H
+#define __LINUX__AIO_H
+
+#include <linux/aio_abi.h>
+
+#endif /* __LINUX__AIO_H */
diff -urNp 2.5.29/include/linux/aio_abi.h aio-api-1/include/linux/aio_abi.h
--- 2.5.29/include/linux/aio_abi.h	Thu Jan  1 01:00:00 1970
+++ aio-api-1/include/linux/aio_abi.h	Tue Jul 30 05:57:23 2002
@@ -0,0 +1,86 @@
+/* linux/aio_abi.h
+ *
+ * Copyright 2000,2001,2002 Red Hat.
+ *
+ * Written by Benjamin LaHaise <bcrl@redhat.com>
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation is hereby granted, provided that the above copyright
+ * notice appears in all copies.  This software is provided without any
+ * warranty, express or implied.  Red Hat makes no representations about
+ * the suitability of this software for any purpose.
+ *
+ * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+ * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND
+ * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ */
+#ifndef __LINUX__AIO_ABI_H
+#define __LINUX__AIO_ABI_H
+
+#include <asm/byteorder.h>
+
+typedef unsigned long	aio_context_t;
+
+enum {
+	IOCB_CMD_PREAD = 0,
+	IOCB_CMD_PWRITE = 1,
+	IOCB_CMD_FSYNC = 2,
+	IOCB_CMD_FDSYNC = 3,
+	IOCB_CMD_PREADX = 4,
+	IOCB_CMD_POLL = 5,
+	IOCB_CMD_NOOP = 6,
+};
+
+/* read() from /dev/aio returns these structures. */
+struct io_event {
+	__u64	data;	/* the data field from the iocb */
+	__u64	obj;	/* what iocb this event came from */
+	__s64	res;	/* result code for this event */
+	__s64	res2;	/* secondary result */
+};
+
+#if defined(__LITTLE_ENDIAN)
+#define PADDED(x,y)	x, y
+#elif defined(__BIG_ENDIAN)
+#define PADDED(x,y)	y, x
+#else
+#error edit for your odd byteorder.
+#endif
+
+/*
+ * we always use a 64bit off_t when communicating
+ * with userland.  its up to libraries to do the
+ * proper padding and aio_error abstraction
+ */
+
+struct iocb {
+	/* these are internal to the kernel/libc. */
+	__u64	aio_data;	/* data to be returned in event's data */
+	__u32	PADDED(aio_key, aio_reserved1);
+				/* the kernel sets aio_key to the req # */
+
+	/* common fields */
+	__u16	aio_lio_opcode;	/* see IOCB_CMD_ above */
+	__s16	aio_reqprio;
+	__u32	aio_fildes;
+
+	__u64	aio_buf;
+	__u64	aio_nbytes;
+	__s64	aio_offset;
+
+	/* extra parameters */
+	__u64	aio_reserved2;
+	__u64	aio_reserved3;
+}; /* 64 bytes */
+
+#undef IFBIG
+#undef IFLITTLE
+
+#endif /* __LINUX__AIO_ABI_H */

Andrea

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29
  2002-07-30  5:41 async-io API registration for 2.5.29 Andrea Arcangeli
@ 2002-07-30  8:11 ` Christoph Hellwig
  2002-07-30 13:40   ` Linus Torvalds
  2002-07-30 16:43   ` Andrea Arcangeli
  2002-07-30 10:50 ` Rik van Riel
  ` (2 subsequent siblings)
  3 siblings, 2 replies; 83+ messages in thread

From: Christoph Hellwig @ 2002-07-30  8:11 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Linus Torvalds, Benjamin LaHaise, linux-kernel, linux-aio

On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> I find the dynamic syscall approach in some vendor kernel out there
> that implements a /proc/libredhat unacceptable since it's not forward
> compatible with 2.5:

What is /proc/libredhat supposed to be?  It hasn't ever been part of the AIO patches.

> ). So I would ask if you could merge the below interface into 2.5 so we can
> ship a real async-io with real syscalls in 2.4, there's not much time to
> change it given this is just used in production userspace today.  I
> prepared a patch against 2.5.29.  Ben, I would appreciate if you could
> review and confirm you're fine with it too.

Please don't.  First, Ben indicated at the kernel summit that the ABI might change, and I think it's a bad idea to lock him into the old ABI just because SuSE doesn't want to have something called libredhat.so* in /lib.  Alternate suggestion: rename it to libunited.so.

And even if there is a syscall reservation, the way to do it is not to add the real syscall names to entry.S and implement stubs, but to use sys_ni_syscall.

> BTW, I'm not the author of the API, and personally I dislike the
> sys_io_submit approach, the worst part is the multiplexing of course:

Okay.  So you think the API is stupid but want it to get in without discussion??

If you really want to ship the old-style AIO (of which I remember Ben saying it is broken for everything post-2.4.9) please stick to the patch Ben has around, otherwise wait for the proper 2.5 solution.  I have my doubts that it is backportable, though.
* Re: async-io API registration for 2.5.29
  2002-07-30  8:11 ` Christoph Hellwig
@ 2002-07-30 13:40   ` Linus Torvalds
  2002-07-30 13:52     ` Benjamin LaHaise
  1 sibling, 1 reply; 83+ messages in thread

From: Linus Torvalds @ 2002-07-30 13:40 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Andrea Arcangeli, Benjamin LaHaise, linux-kernel, linux-aio

On Tue, 30 Jul 2002, Christoph Hellwig wrote:
>
> And even if there is a syscall reservation the way to do it is not to add
> the real syscall names to entry.S and implement stubs but to use
> sys_ni_syscall.

Note that something needs to get moving on this rsn, I'm not interested in getting aio patches on Oct 30th.  The feature freeze may be on Halloween, but if I get some big feature just days before, I'm likely to just say "screw it".

I think we can still change the stuff in 2.5.x, but I really want to start seeing some code, so that I'm not taken by surprise by something that obviously sucks.

Is there any activity on linux-aio?  I haven't heard anything since Ottawa.

		Linus
* Re: async-io API registration for 2.5.29
  2002-07-30 13:40 ` Linus Torvalds
@ 2002-07-30 13:52   ` Benjamin LaHaise
  0 siblings, 0 replies; 83+ messages in thread

From: Benjamin LaHaise @ 2002-07-30 13:52 UTC (permalink / raw)
To: Linus Torvalds
Cc: Christoph Hellwig, Andrea Arcangeli, linux-kernel, linux-aio

On Tue, Jul 30, 2002 at 06:40:43AM -0700, Linus Torvalds wrote:
> I think we can still change the stuff in 2.5.x, but I really want to start
> seeing some code, so that I'm not taken by surprise by something that
> obviously sucks.

Sorry, I was away last week.  I'm updating patches to 2.5.29, and should have them ready by the afternoon for people to comment on.  There are a couple of things to check on ia64 and x86-64 ABI-wise, and people need to comment on the in-kernel f_ops->read/write changes.

		-ben
-- 
"You will be reincarnated as a toad; and you will be much happier."
* Re: async-io API registration for 2.5.29
  2002-07-30  8:11 ` Christoph Hellwig
@ 2002-07-30 16:43   ` Andrea Arcangeli
  2002-07-30 16:59     ` Benjamin LaHaise
  1 sibling, 1 reply; 83+ messages in thread

From: Andrea Arcangeli @ 2002-07-30 16:43 UTC (permalink / raw)
To: Christoph Hellwig, Linus Torvalds, Benjamin LaHaise, linux-kernel, linux-aio

On Tue, Jul 30, 2002 at 09:11:40AM +0100, Christoph Hellwig wrote:
> On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> > I find the dynamic syscall approach in some vendor kernel out there
> > that implements a /proc/libredhat unacceptable since it's not forward
> > compatible with 2.5:
>
> What is /proc/libredhat supposed to be?  It hasn't ever been part of the
> AIO patches.

You should read the code then (from the latest aio-20020619.diff).

diff -urN v2.4.19-pre5/Makefile linux.diff/Makefile
--- v2.4.19-pre5/Makefile	Wed Apr  3 21:04:25 2002
+++ linux.diff/Makefile	Fri Apr 19 20:57:16 2002
@@ -226,7 +226,7 @@
 	drivers/sound/pndsperm.c \
 	drivers/sound/pndspini.c \
 	drivers/atm/fore200e_*_fw.c drivers/atm/.fore200e_*.fw \
-	.version .config* config.in config.old \
+	.uniquebytes .version .config* config.in config.old \
 	scripts/tkparse scripts/kconfig.tk scripts/kconfig.tmp \
 	scripts/lxdialog/*.o scripts/lxdialog/lxdialog \
 	.menuconfig.log \
@@ -268,6 +268,7 @@
 		--end-group \
 		-o vmlinux
 	$(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map
+	@$(MAKE) -C ulib

 symlinks:
 	rm -f include/asm
@@ -296,7 +297,7 @@
 linuxsubdirs: $(patsubst %, _dir_%, $(SUBDIRS))

-$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/version.h include/config/MARKER
+$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/compile.h include/config/MARKER
 	$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C $(patsubst _dir_%, %, $@)

 $(TOPDIR)/include/linux/version.h: include/linux/version.h
@@ -322,6 +323,11 @@
 	  echo \#define LINUX_COMPILE_DOMAIN ; \
 	 fi >> .ver
 	@echo \#define LINUX_COMPILER \"`$(CC) $(CFLAGS) -v 2>&1 | tail -1`\" >> .ver
+	@rm -f .uniquebytes
+	@dd if=/dev/urandom of=.uniquebytes bs=1 count=16
+	@echo -n \#"define LINUX_UNIQUE_BYTES " >>.ver
+	@hexdump -v -e '1/1 "0x%02x, "' .uniquebytes | sed -e 's/, $$//g' >>.ver
+	@echo "" >>.ver
 	@mv -f .ver $@

 include/linux/version.h: ./Makefile
@@ -404,6 +410,8 @@
 .PHONY: $(patsubst %, _modinst_%, $(SUBDIRS))
 $(patsubst %, _modinst_%, $(SUBDIRS)) :
 	$(MAKE) -C $(patsubst _modinst_%, %, $@) modules_install
+	mkdir -p $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/
+	install -m 755 ulib/libredhat-kernel.so.1.0.1 $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/

 # modules disabled....
diff -urN v2.4.19-pre5/ulib/Makefile linux.diff/ulib/Makefile
--- v2.4.19-pre5/ulib/Makefile	Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/Makefile	Fri Apr 19 20:58:01 2002
@@ -0,0 +1,50 @@
+# Makefile - libredhat-kernel.so build code.
+#
+# Copyright 2002 Red Hat, Inc.  All Rights Reserved.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+#
+all: libredhat-kernel.so
+
+ASFLAGS=-D__KERNEL__ -D__ASSEMBLY__ -I../include -nostdlib -nostartfiles
+CFLAGS=-D__KERNEL__ -I../include -nostdlib -nostartfiles
+
+so_objs=vsysaddr.o kso_init.o
+
+vsysaddr.S: ../System.map stub.S Makefile
+	rm -f vsysaddr.S
+	echo '#include "stub.S"' >vsysaddr.S
+	awk -- "/^00000000bfff.* vsys_/ { print \"dynamic_syscall(\"\$$3 \",0x\" \$$1 \")\"; }" <../System.map >>vsysaddr.S
+	awk -- "/^bfff.* vsys_/ { print \"dynamic_syscall(\"\$$3 \",0x\" \$$1 \")\"; }" <../System.map >>vsysaddr.S
+
+vsysaddr.o: vsysaddr.S
+
+kso_init.o: ../include/linux/compile.h
+
+libredhat-kernel.so.1.0.1: $(so_objs) libredhat-kernel.map
+	gcc -nostdlib -nostartfiles -shared -Wl,--version-script=libredhat-kernel.map -Wl,-soname=libredhat-kernel.so.1 -o $@ $(so_objs)
+	cp $@ $@.save
+	strip $@
+
+libredhat-kernel.so: libredhat-kernel.so.1.0.1
+	ln -sf $< $@
+
+clean:
+	rm -f *.o libredhat-kernel.so myln libredhat-kernel.so.1* vsysaddr.S
+
+# test app
+myln: myln.c libredhat-kernel.so Makefile
+	cc -g -o myln myln.c -L. -lredhat-kernel
diff -urN v2.4.19-pre5/ulib/README linux.diff/ulib/README
--- v2.4.19-pre5/ulib/README	Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/README	Fri Apr 19 20:54:05 2002
@@ -0,0 +1,2 @@
+The libredhat-kernel code is provided under the terms of the LGPL.
+See the file COPYING for details.
diff -urN v2.4.19-pre5/ulib/kso_init.c linux.diff/ulib/kso_init.c
--- v2.4.19-pre5/ulib/kso_init.c	Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/kso_init.c	Fri Apr 19 20:54:05 2002
@@ -0,0 +1,67 @@
+/* kso_init.c - libredhat-kernel.so startup code.
+
+   Copyright 2002 Red Hat, Inc.  All Rights Reserved.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ */
+#include <linux/compile.h>
+#include <linux/types.h>
+#include <asm/unistd.h>
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+#include <asm/a.out.h>
+
+char libredhat_kernel_enosys = 1;	/* the asm in stub.S depends on this */
+
+long _init(void)
+{
+	static char unique[] = { LINUX_UNIQUE_BYTES };
+	int errno;
+	long addr;
+	int fd;
+	int i;
+
+	_syscall6(int, mmap2, unsigned long, addr, unsigned long, len,
+		  unsigned long, prot, unsigned long, flags,
+		  unsigned long, fd, unsigned long, pgoff)
+	_syscall2(long, munmap, unsigned long, addr, size_t, len)
+	_syscall2(int, open, const char *, name, int, flags)
+	_syscall1(int, close, int, fd)
+
+	if (sizeof(unique) != 16)
+		return -1;
+
+	fd = open("/dev/vsys", O_RDONLY);
+	if (-1 == fd)
+		return -1;
+
+	addr = mmap2(0, VSYSCALL_SIZE, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
+	if (-1 == addr)
+		return -1;
+
+	close(fd);
+
+	for (i=0; i<sizeof(unique); i++)
+		if (unique[i] != ((char *)addr)[i]) {
+			munmap(addr, VSYSCALL_SIZE);
+			return -1;
+		}
+
+	/* okay, all the syscalls we provide are now good */
+	libredhat_kernel_enosys = 0;
+	return 0;
+}
+
diff -urN v2.4.19-pre5/ulib/libredhat-kernel.map linux.diff/ulib/libredhat-kernel.map
--- v2.4.19-pre5/ulib/libredhat-kernel.map	Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/libredhat-kernel.map	Tue Apr  2 18:56:58 2002
@@ -0,0 +1,11 @@
+REDHAT_0.90 {
+	global:
+		vsys_io_setup;
+		vsys_io_destroy;
+		vsys_io_submit;
+		vsys_io_cancel;
+		vsys_io_wait;
+		vsys_io_getevents;
+	local:
+		*;
+};
diff -urN v2.4.19-pre5/ulib/myln.c linux.diff/ulib/myln.c
--- v2.4.19-pre5/ulib/myln.c	Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/myln.c	Tue Apr  2 18:56:58 2002
@@ -0,0 +1,25 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+int main ()
+{
+	long ctx = 0;
+	extern long vsys_io_setup(long, long *);
+	unsigned char *bob = (void*)&vsys_io_setup;
+	long ret;
+	int i;
+	printf("%p\n", bob);
+	//printf("%p\n", mmap(0, 65536, PROT_READ | PROT_EXEC, MAP_SHARED,
+	//		open("/dev/vsys", O_RDONLY), 0));
+	//for (i=0; i<16; i++)
+	//	printf(" %02x\n", bob[i]);
+	//printf("\n");
+
+	ret = vsys_io_setup(100, &ctx);
+
+	printf("ret=%ld, ctx=0x%lx\n", ret, ctx);
+	return 0;
+}
diff -urN v2.4.19-pre5/ulib/stub.S linux.diff/ulib/stub.S
--- v2.4.19-pre5/ulib/stub.S	Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/stub.S	Fri Apr 19 20:54:05 2002
@@ -0,0 +1,38 @@
+/* stub.S - libredhat-kernel.so jump code.
+
+   Copyright 2002 Red Hat, Inc.  All Rights Reserved.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ */
+/* stub.S */
+#include <asm/segment.h>
+#include <asm/errno.h>
+
+	.text
+
+#define dynamic_syscall(x,a) \
+	.globl x ;\
+	.type x, @function ;\
+	.align 16 ;\
+	x: ;\
+	cmpb $0,libredhat_kernel_enosys ;\
+	jne 1f ;\
+	ljmp $__USER_CS, $a ;\
+	1: ;\
+	movl $-ENOSYS,%eax ;\
+	ret ;\
+	.size x,.-x

and the other funny parts:

+long sys_dynamic_syscall(struct pt_regs regs)
+{
+	struct dummy_args dummy_args;
+	struct vsyscall_entry *ent = (void *)regs.edx;
+	void *args = (void *)regs.ecx;
+	long ret;
+
+	pr_debug("ent = %p  args = %p\n", ent, args);
+	pr_debug("eip = 0x%08lx\n", regs.eip);
+
+	if (unlikely(!current->mm->vsys_mapped))
+		goto err;

@@ -231,6 +232,10 @@
 	/* Architecture-specific MM context */
 	mm_context_t context;
+
+	struct kioctx	*ioctx_list;
+	unsigned long	new_ioctx_id;
+	int		vsys_mapped;
 };

@@ -243,6 +248,7 @@
 	mm_count:	ATOMIC_INIT(1),			\
 	mmap_sem:	__RWSEM_INITIALIZER(name.mmap_sem), \
 	page_table_lock: SPIN_LOCK_UNLOCKED,		\
+	vsys_mapped:	0,				\
 	mmlist:		LIST_HEAD_INIT(name.mmlist),	\
 }

etc... (oh yeah, it may not be in /proc, but the location of this bytecode doesn't change what it does)

> > ). So I would ask if you could merge the below interface into 2.5 so we can
> > ship a real async-io with real syscalls in 2.4, there's not much time to
> > change it given this is just used in production userspace today.  I
> > prepared a patch against 2.5.29.  Ben, I would appreciate if you could
> > review and confirm you're fine with it too.
>
> Please don't.  First Ben has indicated on kernel summit that the abi might
> change and I think it's a bad idea to lock him into the old ABI just because

What I heard, and what I remember crystal clear, is that Ben indicated the API isn't changing for a long time, and it's been stable so far; I can imagine why.

> suse doesn't want to have something called libredhat.so* in /lib.

I don't mind about the libredhat name; it's the above nonsense overhead code that isn't going to be included into my tree.  It's just pure overhead and complication that will only make things worse.

> Alternate suggestion: rename it to libunited.so.

That's not the problem, sorry.

> And even if there is a syscall reservation the way to do it is not to add
> the real syscall names to entry.S and implement stubs but to use
> sys_ni_syscall.

If there is a syscall reservation, the way to do it is the way I did it, as far as I can tell; I don't see what you mean by using sys_ni_syscall.  Of course right now we miss the syscall reservation, and that's why I'm trying to register the API.

> > BTW, I'm not the author of the API, and personally I dislike the
> > sys_io_submit approach, the worst part is the multiplexing of course:
>
> Okay.  So you think the API is stupid but want it to get in without
> discussion??

I didn't say it's stupid; you said that, not me.  I only said I would have preferred sys_aio_read, sys_aio_write, sys_aio_fsync, etc., rather than a sys_io_submit that takes read/write/fsync as a parameter.
But I don't mind; it's a minor difference that's not going to hurt performance too much.  What I'm trying to do is have an API registered within one or two months; which API it is matters least to me, as long as it's not stupid (and the current one isn't stupid, unlike you said).  I can live with sys_io_submit or with sys_aio_read/write/fsync.  Since everybody is using the current API at the moment, I pushed for it: I assume people are just used to it (userspace is), it would have been an additional crusade to also try to submit a different API (as expected, Ben is advocating for his API and showing its good points), and as said it's not too bad (and after all it's not us who have to deal with this API in the first place; we almost never use async-io anyways, and I don't see that changing tomorrow).  If it had been stupid I would have rewritten it, like I did with the part of the internals that gave you a root shell, etc.; I'm not completely blind :).

> If you really want to ship the old-style AIO (of which I remember Ben
> saying it is broken for everything post-2.4.9) please stick to the patch
> Ben has around, otherwise wait for the proper 2.5 solution.  I have my
> doubts that it is backportable, though.

I'm trying to do my best to avoid having to merge the code I quoted above, which is disgusting, and since the API isn't going to change anyways, as Ben said, I'm trying to do the right thing to avoid clashes with syscall 250 as well.  Hope this clears up my point.

One more thing: I will be completely fine if my patch goes to /dev/null because Ben wants to make some change, or because he wants to submit it together with a full-blown implementation, or even better because we change the API after a collective thought.  I just wanted to raise the discussion and get people focused on the API.  If it takes a few weeks or one month to get an API registered, that's of course fine.  If it takes more, I'll be forced to merge the code I quoted above, and as said I'm trying to avoid that.
As far as I'm concerned, only the API matters, because the 2.4 internals that I merged aren't going to be the right thing long term anyways.  As Ben pointed out, this async-io isn't even asynchronous in submit_bh, so you cannot even use it to fill multiple request queues when dealing with multiple spindles.  Maybe that's fixed in the 2.5 version; just grep for changes in ll_rw_block, and if there's some change there's a chance that issue is addressed.  Blocking in the VM is perfectly fine; I don't think we should change that, we have to block there as Linus said.  What we shouldn't do is block in submit_bh, because that prevents users from using async-io to fill multiple queues.

Really last thing: one of the major reasons I don't like the above code, besides the overhead and complexity it introduces, is that it doesn't guarantee 100% that it will be forward compatible with 2.5 applications (syscall 250 looks not to check even for the payload; I guess they changed it because it was too slow to be forward compatible in most cases).  The /dev/urandom payload may match the user arguments if you're unlucky, and since we can guarantee correct operation by doing a syscall registration, I don't see why we should make it work by luck.

Andrea
* Re: async-io API registration for 2.5.29
  2002-07-30 16:43 ` Andrea Arcangeli
@ 2002-07-30 16:59   ` Benjamin LaHaise
  2002-07-30 19:10     ` Jeff Dike
  2002-07-30 21:26     ` Andrea Arcangeli
  0 siblings, 2 replies; 83+ messages in thread

From: Benjamin LaHaise @ 2002-07-30 16:59 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Hellwig, Linus Torvalds, linux-kernel, linux-aio

On Tue, Jul 30, 2002 at 06:43:20PM +0200, Andrea Arcangeli wrote:
> > Please don't.  First Ben has indicated on kernel summit that the abi might
> > change and I think it's a bad idea to lock him into the old ABI just because
>
> What I heard, and what I remember crystal clear, is that Ben indicated
> the API isn't changing for a long time, and it's been stable so far;
> I can imagine why.

I suspect what Christoph is remembering is that the in-kernel API was still in flux and up for discussion.

> I'm trying to do my best to avoid having to merge the code I quoted
> above, which is disgusting, and since the API isn't going to change anyways,
> as Ben said, I'm trying to do the right thing to avoid clashes with
> syscall 250 as well.

Syscall 250 isn't used in anything Red Hat shipped; that was a matter of experimentation I was doing in recent aio development trees (which is what the 2.4.18 patches are, as they still cause the VM to OOM under rather trivial io patterns).

> Really last thing: one of the major reasons I don't like the above code,
> besides the overhead and complexity it introduces, is that it doesn't
> guarantee 100% that it will be forward compatible with 2.5 applications
> (syscall 250 looks not to check even for the payload; I guess they
> changed it because it was too slow to be forward compatible in most
> cases).  The /dev/urandom payload may match the user arguments if you're
> unlucky, and since we can guarantee correct operation by doing a syscall
> registration, I don't see why we should make it work by luck.

You haven't looked at the code very closely, then.  It checks that the payload matches, and that the caller is coming from the vsyscall pages.  Yes, the dynamic syscall thing is a horrific kludge that shouldn't be used, but the vsyscall technique is rather useful.  This is something that x86-64 gets wrong by not requiring the vsyscall page to be mmap'd into the user's address space: UML cannot emulate vsyscalls by faking the mmap.

		-ben
-- 
"You will be reincarnated as a toad; and you will be much happier."
* Re: async-io API registration for 2.5.29
  2002-07-30 16:59 ` Benjamin LaHaise
@ 2002-07-30 19:10   ` Jeff Dike
  2002-07-30 18:09     ` Benjamin LaHaise
  1 sibling, 1 reply; 83+ messages in thread

From: Jeff Dike @ 2002-07-30 19:10 UTC (permalink / raw)
To: Benjamin LaHaise, Andrea Arcangeli, Christoph Hellwig, Linus Torvalds, linux-kernel, linux-aio

bcrl@redhat.com said:
> This is something that x86-64 gets wrong by not requiring the
> vsyscall page to need an mmap into the user's address space: UML
> cannot emulate vsyscalls by faking the mmap.

Andrea and I talked about this a bit at KS.  IIRC, he wants vsyscall addresses to be hardcoded constants in libc.  He doesn't want the overhead of doing an indirect call through whatever address you get from the vsyscall_mmap() syscall.

At first glance, that breaks any hope of UML being able to virtualize that.  Any vsyscall executed by a UML process will go straight into the host kernel, completely bypassing UML.

We did come up with a scheme that sounded to me like it would work.

/me tries to remember what it was :-)

I think it was that we provide a syscall to move the vsyscall page.  UML will use that to relocate the host vsyscalls and map its own page there.  The final piece is that UML would be linked with a different vsyscall address.  Andrea, does that sound right?

I don't particularly like this scheme (the get-the-address-at-runtime approach is far cleaner), but it does satisfy Andrea's need for speed.

				Jeff
* Re: async-io API registration for 2.5.29 2002-07-30 19:10 ` Jeff Dike @ 2002-07-30 18:09 ` Benjamin LaHaise 2002-07-30 18:15 ` Linus Torvalds 2002-07-30 20:47 ` Jeff Dike 0 siblings, 2 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-07-30 18:09 UTC (permalink / raw) To: Jeff Dike Cc: Andrea Arcangeli, Christoph Hellwig, Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 02:10:35PM -0500, Jeff Dike wrote: > We did come up with a scheme that sounded to me like it would work. A constant address is still an option with an mmap'd device. Just do an mmap of the device and assert that it is the correct value. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 18:09 ` Benjamin LaHaise @ 2002-07-30 18:15 ` Linus Torvalds 2002-07-30 18:31 ` Benjamin LaHaise 2002-07-30 20:47 ` Jeff Dike 1 sibling, 1 reply; 83+ messages in thread From: Linus Torvalds @ 2002-07-30 18:15 UTC (permalink / raw) To: Benjamin LaHaise Cc: Jeff Dike, Andrea Arcangeli, Christoph Hellwig, linux-kernel, linux-aio On Tue, 30 Jul 2002, Benjamin LaHaise wrote: > On Tue, Jul 30, 2002 at 02:10:35PM -0500, Jeff Dike wrote: > > We did come up with a scheme that sounded to me like it would work. > > A constant address is still an option with an mmap'd device. Just do > an mmap of the device and assert that it is the correct value. That still doesn't get the TLB advantages of a globally shared page at the same address.. It also has the overhead of mapping it, which you don't have if the thing is just always in the address space, and all processes just get created with that page mapped. That can be a big deal for process startup latency for small processes. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 18:15 ` Linus Torvalds @ 2002-07-30 18:31 ` Benjamin LaHaise 2002-07-30 20:57 ` Jeff Dike 0 siblings, 1 reply; 83+ messages in thread From: Benjamin LaHaise @ 2002-07-30 18:31 UTC (permalink / raw) To: Linus Torvalds Cc: Jeff Dike, Andrea Arcangeli, Christoph Hellwig, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 11:15:26AM -0700, Linus Torvalds wrote: > That still doesn't get the TLB advantages of a globally shared page at the > same address.. It also has the overhead of mapping it, which you don't > have if the thing is just always in the address space, and all processes > just get created with that page mapped. That can be a big deal for process > startup latency for small processes. That might be a concern once glibc startup can occur with less than a few dozen calls to grope through the local files. ;-) Hmmm, it would be possible to make the vsyscall page mapped by default and leave the global bit enabled until UML forcibly unmapped it (and then clear the global bit and do a global invalidate). Would that be acceptable? -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 18:31 ` Benjamin LaHaise @ 2002-07-30 20:57 ` Jeff Dike 0 siblings, 0 replies; 83+ messages in thread From: Jeff Dike @ 2002-07-30 20:57 UTC (permalink / raw) To: Benjamin LaHaise Cc: Linus Torvalds, Andrea Arcangeli, Christoph Hellwig, linux-kernel, linux-aio bcrl@redhat.com said: > Hmmm, it would be possible to make the vsyscall page mapped by > default and leave the global bit enabled until UML forcibly unmapped > it (and then clear the global bit and do a global invalidate). Would > that be acceptable? That sounds like it would work for me. Jeff ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 18:09 ` Benjamin LaHaise 2002-07-30 18:15 ` Linus Torvalds @ 2002-07-30 20:47 ` Jeff Dike 1 sibling, 0 replies; 83+ messages in thread From: Jeff Dike @ 2002-07-30 20:47 UTC (permalink / raw) To: Benjamin LaHaise Cc: Andrea Arcangeli, Christoph Hellwig, Linus Torvalds, linux-kernel, linux-aio bcrl@redhat.com said: > A constant address is still an option with an mmap'd device. Just do > an mmap of the device and assert that it is the correct value. Yeah, but the point of mmapping it is to allow the kernel to choose where it goes. The host kernel will choose one place for its processes. UML will choose a different place for its processes. Everything is nice and virtualizable. Jeff ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 16:59 ` Benjamin LaHaise 2002-07-30 19:10 ` Jeff Dike @ 2002-07-30 21:26 ` Andrea Arcangeli 1 sibling, 0 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-07-30 21:26 UTC (permalink / raw) To: Benjamin LaHaise Cc: Christoph Hellwig, Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 12:59:43PM -0400, Benjamin LaHaise wrote: > is what the 2.4.18 patches are, as they still cause that VM to OOM under > rather trivial io patterns). I would like it if you could reproduce with the aio in my tree after an: echo 1 >/proc/sys/vm/vm_gfp_debug that will give you stack traces that you should send back to me, and I will tell you exactly what the problem is (if you can reproduce). > > > Really last thing: one of the major reasons I don't like the above code > > besides the overhead and complexity it introduces is that it doesn't > > guarantee 100% that it will be forward compatible with 2.5 applications > > (the syscall 250 looks not to check even for the payload, I guess they > > changed it because it was too slow to be forward compatible in most > > cases), the /dev/urandom payload may match the user arguments if you're > > unlucky and since we can guarantee correct operations by doing a syscall > > registration, I don't see why we should make it work by luck. > > You haven't looked at the code very closely then. It checks that the > payload matches, and that the caller is coming from the vsyscall pages. I hadn't noticed that the caller needs to come from the vsyscall pages; that makes it safer, but it's still a huge amount of complexity that you apparently disabled in your test tree because it was hurting performance. > that x86-64 gets wrong by not requiring the vsyscall page to need an > mmap into the user's address space: UML cannot emulate vsyscalls by I don't want vma overhead in the rbtree, nor in the mm_struct, nor do I want mmap in general to deal with vsyscalls, for obvious performance reasons. 
> faking the mmap. The fix for UML is trivial; the simplest approach is to add a prctl that disables vsyscalls for a certain process and that cannot be re-enabled by userspace (so a one-way prctl). The vsyscall will be swapped with a vsyscall that invokes the real syscall, and UML will trap the gettimeofday syscall like it does on x86. We also discussed some more complicated and sophisticated approaches, but I like the prctl that forces the gettimeofday/time syscalls because it could be used trivially for strace too (of course ltrace will just show the gettimeofday call because we pass through glibc; in fact UML for 99% of cases could simply use LD_PRELOAD, but Jeff didn't like it for good reasons: it's not transparent enough for userspace and of course it doesn't work with statically linked binaries). In short, the prctl that redirects the program and all its children to use the real syscall would be my preferred approach. As said, the UML kernel itself would still be able to use vgettimeofday; only the children (the userspace running under the UML kernel) would be executed with the prctl enabled, and the fact that userspace cannot disable the prctl (once enabled before execve) guarantees the system will function correctly. It will require per-task information and a switch_to hack that changes the fixmap entry and does an invlpg if needed. Now I don't remember anymore if I just suggested the above prctl way to Jeff and he found a weakness in it that could make it not feasible for UML, but in that case he will remind me about it now :) In fact we will use the same technique of a vsyscall that redirects to a real syscall for all kinds of vsyscalls that on some hardware may need to know what cpu they are running on to return the result; this has never been needed so far, but it was one of the possibilities that our vsyscall design offered. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
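Andrea's one-way prctl can be modelled in a few lines of userspace C. This is an illustrative sketch, not kernel code: the `model_` names and the flag variable are assumptions standing in for the per-task flag and the vsyscall stub that falls back to the real syscall.

```c
#include <assert.h>

/* Illustrative model of the proposed one-way prctl (all names hypothetical).
 * In the real design this flag would live in the task struct and be checked
 * by the vsyscall stub; here it is a plain process-wide variable. */
static int no_vsyscall;

/* Enable-only: once set (e.g. by UML before execve of its guest processes),
 * userspace has no way to clear it again. */
static int model_prctl_set_no_vsyscall(int enable)
{
	if (!enable)
		return -1;	/* one-way: refuse to disable */
	no_vsyscall = 1;
	return 0;
}

/* The gettimeofday stub: 0 = vsyscall fast path (no kernel entry),
 * 1 = real syscall via int $0x80, which UML or strace can intercept. */
static int model_gettimeofday_path(void)
{
	return no_vsyscall ? 1 : 0;
}
```

With the flag set before execve and inherited by children, every guest process takes the traceable slow path, while the UML kernel itself keeps using the fast vgettimeofday.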
* Re: async-io API registration for 2.5.29 2002-07-30 5:41 async-io API registration for 2.5.29 Andrea Arcangeli 2002-07-30 8:11 ` Christoph Hellwig @ 2002-07-30 10:50 ` Rik van Riel 2002-07-30 12:49 ` Benjamin LaHaise 2002-07-30 13:34 ` Linus Torvalds 3 siblings, 0 replies; 83+ messages in thread From: Rik van Riel @ 2002-07-30 10:50 UTC (permalink / raw) To: Andrea Arcangeli Cc: Linus Torvalds, Benjamin LaHaise, linux-kernel, linux-aio On Tue, 30 Jul 2002, Andrea Arcangeli wrote: > I find the dynamic syscall approach in some vendor kernel out there > that implements a /proc/libredhat unacceptable since it's not forward > compatible with 2.5: How do you know this one will be compatible with 2.5 ? You yourself had suggestions for improving the interface and I wouldn't be surprised if at least some of those would get merged for 2.5 and would end up changing the interface ;) regards, Rik -- Bravely reimplemented by the knights who say "NIH". http://www.surriel.com/ http://distro.conectiva.com/ ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 5:41 async-io API registration for 2.5.29 Andrea Arcangeli 2002-07-30 8:11 ` Christoph Hellwig 2002-07-30 10:50 ` Rik van Riel @ 2002-07-30 12:49 ` Benjamin LaHaise 2002-07-30 13:29 ` Suparna Bhattacharya 2002-07-30 21:41 ` Andrea Arcangeli 2002-07-30 13:34 ` Linus Torvalds 3 siblings, 2 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-07-30 12:49 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote: > instead of separate syscalls for the various async_io > PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry > point and a parameter specifies the operation. But this is what the > current userspace expects and I wouldn't have too much time to change it > anyways because then I would break all the userspace libs too (I just > break them because of the true syscalls instead of passing through the > /proc/libredhat that calls into the dynamic syscall, but that's not > too painful to adapt). And after all even the io_submit isn't too bad > besides the above slowdown in the multiplexing (at least it's sharing > some icache for top/bottom of the functionality). What would you suggest as an alternative API? The main point of multiplexing is that ios can be submitted in batches, which can't be done if the ios are submitted via individual syscalls, not to mention the overlap with the posix aio api. > checked that it still compiles fine on x86 (all other archs should keep > compiling too). available also from here: > > http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/2.5.29/aio-api-1 > > Comments are welcome, many thanks. That's the old cancellation API. Anyways, the core is pretty much ready, so don't bother with this patch. -ben ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 12:49 ` Benjamin LaHaise @ 2002-07-30 13:29 ` Suparna Bhattacharya 2002-07-30 21:41 ` Andrea Arcangeli 1 sibling, 0 replies; 83+ messages in thread From: Suparna Bhattacharya @ 2002-07-30 13:29 UTC (permalink / raw) To: Benjamin LaHaise Cc: Andrea Arcangeli, Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote: > > Anyways, the core is pretty much ready, so Hey Ben, That sounds great. Have been looking forward to it to find out how much has changed and if you've left anything for us to do :) (other than docs and driver fixes :( ) I did have an updated version of the bio traversal patch (for 2.5.29) that avoids modifications to the bv_offset/bv_len fields by the block layer, though I don't know if you still need it. Besides, you probably wouldn't run into those cases often, as the partial request completions are probably rare. But just as a fyi ... Regards Suparna > ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: async-io API registration for 2.5.29 2002-07-30 12:49 ` Benjamin LaHaise 2002-07-30 13:29 ` Suparna Bhattacharya @ 2002-07-30 21:41 ` Andrea Arcangeli 2002-07-30 21:54 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise 2002-07-31 1:20 ` async-io API registration for 2.5.29 Rik van Riel 1 sibling, 2 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-07-30 21:41 UTC (permalink / raw) To: Benjamin LaHaise; +Cc: Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote: > On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote: > > instead of separate syscalls for the various async_io > > PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry > > point and a parameter specifies the operation. But this is what the > > current userspace expects and I wouldn't have too much time to change it > > anyways because then I would break all the userspace libs too (I just > > break them because of the true syscalls instead of passing through the > > /proc/libredhat that calls into the dynamic syscall, but that's not > > too painful to adapt). And after all even the io_submit isn't too bad > > besides the above slowdown in the multiplexing (at least it's sharing > > some icache for top/bottom of the functionality). > > What would you suggest as an alternative API? The main point of multiplexing > is that ios can be submitted in batches, which can't be done if the ios are > submitted via individual syscalls, not to mention the overlap with the posix > aio api. Yes, sys_io_submit has the advantage that you can mix read/write/fsync etc. in the same array of iocbs. But by the same argument we could as well have a submit_io instead of sys_read/sys_write/sys_fsync. It's a matter of dropping that big if/else chain and scaling with a syscall-table lookup instead. 
So I'd still prefer to nuke sys_io_submit and iocb.aio_lio_opcode and replace them with sys_aio_read/sys_aio_readx/sys_aio_write/sys_aio_fsync/sys_aio_poll, but as you said, if it's very common to generate huge arrays of iocbs with mixed commands, the current API would pay off thanks to the reduced number of kernel entries/exits, at the expense of the cheaper if/else checks. So I'm pretty much fine either way, and that's why I didn't propose a modified API in the first place. > > checked that it still compiles fine on x86 (all other archs should keep > > compiling too). available also from here: > > > > http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/2.5.29/aio-api-1 > > > > Comments are welcome, many thanks. > > That's the old cancellation API. Anyways, the core is pretty much ready, so > don't bother with this patch. Can you point me to a patch with the new cancellation API that you agree with for merging in 2.5, so I can synchronize? I'm reading your very latest patch uploaded to some site in June. That would be really helpful, many thanks! Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
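The table-driven dispatch Andrea alludes to can be sketched as follows. The opcode values and handlers here are hypothetical stand-ins, not the actual aio ABI; the point is only that an array indexed by opcode, guarded by one bounds check, replaces the if/else chain while keeping a single batched entry point.

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical opcodes, standing in for IOCB_CMD_PREAD and friends. */
enum { CMD_PREAD, CMD_PWRITE, CMD_FSYNC, CMD_NR };

typedef int (*aio_op_t)(int fd);

/* Dummy handlers standing in for f_op->aio_read etc.; they just return
 * distinct values so dispatch can be observed. */
static int do_pread(int fd)  { (void)fd; return 1; }
static int do_pwrite(int fd) { (void)fd; return 2; }
static int do_fsync(int fd)  { (void)fd; return 3; }

/* One table lookup instead of a chain of opcode comparisons,
 * the same structure as the syscall table itself. */
static const aio_op_t aio_ops[CMD_NR] = {
	[CMD_PREAD]  = do_pread,
	[CMD_PWRITE] = do_pwrite,
	[CMD_FSYNC]  = do_fsync,
};

/* Bounds-checked lookup; NULL plays the role of -EINVAL. */
static aio_op_t lookup_op(unsigned opcode)
{
	return opcode < CMD_NR ? aio_ops[opcode] : NULL;
}
```

The batching advantage of io_submit is preserved: the submit loop walks the iocb array and indexes this table once per request.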
* [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-07-30 21:41 ` Andrea Arcangeli @ 2002-07-30 21:54 ` Benjamin LaHaise 2002-07-31 0:44 ` Andrea Arcangeli 2002-07-31 1:20 ` async-io API registration for 2.5.29 Rik van Riel 1 sibling, 1 reply; 83+ messages in thread From: Benjamin LaHaise @ 2002-07-30 21:54 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 11:41:16PM +0200, Andrea Arcangeli wrote: > Can you point me out to a patch with the new cancellation API that you > agree with for merging in 2.5 so I can synchronize? I'm reading your > very latest patch loaded on some site in June. that will be really > helpful, many thanks! Here is what I've got for the aio core that has the cancellation change to return the completion event. The other slight change that I meant to get in before going into the mainstream is to have the timeout io_getevents takes be an absolute timeout, which helps for applications that have specific deadlines they are attempting to schedule to (think video playback). This drop is untested, but I'd like it if people could provide comments on it. -ben -- "You will be reincarnated as a toad; and you will be much happier." 
:r ~/patches/v2.5/v2.5.29-aio-core-A0.diff diff -urN v2.5.29/arch/i386/kernel/entry.S aio-v2.5.29.diff/arch/i386/kernel/entry.S --- v2.5.29/arch/i386/kernel/entry.S Tue Jul 30 10:24:32 2002 +++ aio-v2.5.29.diff/arch/i386/kernel/entry.S Tue Jul 30 17:23:04 2002 @@ -753,6 +753,11 @@ .long sys_sched_setaffinity .long sys_sched_getaffinity .long sys_set_thread_area + .long sys_io_setup + .long sys_io_destroy /* 245 */ + .long sys_io_getevents_abs + .long sys_io_submit + .long sys_io_cancel .rept NR_syscalls-(.-sys_call_table)/4 .long sys_ni_syscall diff -urN v2.5.29/fs/Makefile aio-v2.5.29.diff/fs/Makefile --- v2.5.29/fs/Makefile Tue Jul 30 09:46:21 2002 +++ aio-v2.5.29.diff/fs/Makefile Tue Jul 30 10:33:45 2002 @@ -8,14 +8,14 @@ O_TARGET := fs.o export-objs := filesystems.o open.o dcache.o buffer.o bio.o inode.o dquot.o \ - mpage.o + mpage.o aio.o obj-y := open.o read_write.o devices.o file_table.o buffer.o \ bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o namespace.o seq_file.o xattr.o libfs.o \ - fs-writeback.o mpage.o direct-io.o + fs-writeback.o mpage.o direct-io.o aio.o ifneq ($(CONFIG_NFSD),n) ifneq ($(CONFIG_NFSD),) diff -urN v2.5.29/fs/aio.c aio-v2.5.29.diff/fs/aio.c --- v2.5.29/fs/aio.c Wed Dec 31 19:00:00 1969 +++ aio-v2.5.29.diff/fs/aio.c Tue Jul 30 17:22:43 2002 @@ -0,0 +1,1160 @@ +/* fs/aio.c + * An async IO implementation for Linux + * Written by Benjamin LaHaise <bcrl@redhat.com> + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +//#define DEBUG 1 + +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/vmalloc.h> +#include <linux/iobuf.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/brlock.h> +#include <linux/aio.h> +#include <linux/smp_lock.h> +#include <linux/compiler.h> +#include <linux/brlock.h> +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <linux/highmem.h> + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +atomic_t aio_nr = ATOMIC_INIT(0); /* current system wide number of aio requests */ +unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +/*----end sysctl variables---*/ + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kioctx_cachep; + +/* Used for rare fput completion. */ +static void aio_fput_routine(void *); +static struct tq_struct fput_tqueue = { + routine: aio_fput_routine, +}; + +static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(fput_head); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. 
+ */ +static int __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache\n"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} + +static void ioctx_free_reqs(struct kioctx *ctx) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, &ctx->free_reqs) { + struct kiocb *iocb = list_kiocb(pos); + list_del(&iocb->ki_list); + kmem_cache_free(kiocb_cachep, iocb); + } +} + +static void aio_free_ring(struct kioctx *ctx) +{ + struct aio_ring_info *info = &ctx->ring_info; + long i; + + for (i=0; i<info->nr_pages; i++) + put_page(info->ring_pages[i]); + + if (info->mmap_size) { + down_write(&ctx->mm->mmap_sem); + do_munmap(ctx->mm, info->mmap_base, info->mmap_size); + up_write(&ctx->mm->mmap_sem); + } + + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + info->ring_pages = NULL; + info->nr = 0; +} + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + struct aio_ring_info *info = &ctx->ring_info; + unsigned nr_reqs = ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_reqs += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_reqs; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages < 0) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_reqs = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, 
GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + down_write(&ctx->mm->mmap_sem); + info->mmap_base = do_mmap(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + 0); + if (IS_ERR((void *)info->mmap_base)) { + up_write(&ctx->mm->mmap_sem); + printk("mmap err: %ld\n", -info->mmap_base); + info->mmap_size = 0; + aio_free_ring(ctx); + return -EAGAIN; + } + + dprintk("mmap address: 0x%08lx\n", info->mmap_base); + info->nr_pages = get_user_pages(current, ctx->mm, + info->mmap_base, info->mmap_size, + 1, 0, info->ring_pages, NULL); + up_write(&ctx->mm->mmap_sem); + + if (unlikely(info->nr_pages != nr_pages)) { + aio_free_ring(ctx); + return -EAGAIN; + } + + ctx->user_id = info->mmap_base; + + info->nr = nr_reqs; /* trusted copy */ + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring->nr = nr_reqs; /* user copy */ + ring->id = ctx->user_id; + ring->head = ring->tail = 0; + ring->magic = AIO_RING_MAGIC; + ring->compat_features = AIO_RING_COMPAT_FEATURES; + ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; + ring->header_length = sizeof(struct aio_ring); + kunmap_atomic(ring, KM_USER0); + + return 0; +} + +/* aio_ring_event: returns a pointer to the event at the given index from + * kmap_atomic(, km). 
Release the pointer with put_aio_ring_event(); + */ +static inline struct io_event *aio_ring_event(struct aio_ring_info *info, int nr, enum km_type km) +{ + struct io_event *events; +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) + + if (nr < AIO_EVENTS_FIRST_PAGE) { + struct aio_ring *ring; + ring = kmap_atomic(info->ring_pages[0], km); + return &ring->io_events[nr]; + } + nr -= AIO_EVENTS_FIRST_PAGE; + + events = kmap_atomic(info->ring_pages[1 + nr / AIO_EVENTS_PER_PAGE], km); + + return events + (nr % AIO_EVENTS_PER_PAGE); +} + +static inline void put_aio_ring_event(struct io_event *event, enum km_type km) +{ + void *p = (void *)((unsigned long)event & PAGE_MASK); + kunmap_atomic(p, km); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_reqs) +{ + struct mm_struct *mm; + struct kioctx *ctx; + unsigned i; + + /* Prevent overflows */ + if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) || + (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_reqs too high\n"); + return ERR_PTR(-EINVAL); + } + + if (nr_reqs > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + ctx->max_reqs = nr_reqs; + mm = ctx->mm = current->mm; + atomic_inc(&mm->mm_count); + + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->ctx_lock); + spin_lock_init(&ctx->ring_info.ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->free_reqs); + INIT_LIST_HEAD(&ctx->active_reqs); + + if (aio_setup_ring(ctx) < 0) + goto out_freectx; + + /* Allocate nr_reqs iocbs for io. Free iocbs are on the + * ctx->free_reqs list. When active they migrate to the + * active_reqs list. 
During completion and cancellation + * the request may temporarily not be on any list. + */ + for (i=0; i<nr_reqs; i++) { + struct kiocb *iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (!iocb) + goto out_freering; + memset(iocb, 0, sizeof(*iocb)); + iocb->ki_key = i; + iocb->ki_users = 0; + list_add(&iocb->ki_list, &ctx->free_reqs); + } + + /* now link into global list. kludge. FIXME */ + atomic_add(ctx->max_reqs, &aio_nr); /* undone by __put_ioctx */ + if (unlikely(atomic_read(&aio_nr) > aio_max_nr)) + goto out_cleanup; + write_lock(&mm->ioctx_list_lock); + ctx->next = mm->ioctx_list; + mm->ioctx_list = ctx; + write_unlock(&mm->ioctx_list_lock); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring_info.ring->nr); + return ctx; + +out_cleanup: + atomic_sub(ctx->max_reqs, &aio_nr); /* undone by __put_ioctx */ + ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */ + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freering: + aio_free_ring(ctx); + ioctx_free_reqs(ctx); +out_freectx: + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. 
+ */ +static void aio_cancel_all(struct kioctx *ctx) +{ + int (*cancel)(struct kiocb *, struct io_event *); + struct io_event res; + spin_lock_irq(&ctx->ctx_lock); + ctx->dead = 1; + while (!list_empty(&ctx->active_reqs)) { + struct list_head *pos = ctx->active_reqs.next; + struct kiocb *iocb = list_kiocb(pos); + list_del_init(&iocb->ki_list); + cancel = iocb->ki_cancel; + if (cancel) + iocb->ki_users++; + spin_unlock_irq(&ctx->ctx_lock); + if (cancel) + cancel(iocb, &res); + spin_lock_irq(&ctx->ctx_lock); + } + spin_unlock_irq(&ctx->ctx_lock); +} + +void wait_for_all_aios(struct kioctx *ctx) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + if (!ctx->reqs_active) + return; + + add_wait_queue(&ctx->wait, &wait); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (ctx->reqs_active) { + printk("ctx->reqs_active = %d\n", ctx->reqs_active); + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); +} + +/* exit_aio: called when the last user of mm goes away. At this point, + * there is no way for any new requests to be submited or any of the + * io_* syscalls to be called on the context. However, there may be + * outstanding requests which hold references to the context; as they + * go away, they will call put_ioctx and release any pinned memory + * associated with the request (held via struct page * references). + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx *ctx = mm->ioctx_list; + mm->ioctx_list = NULL; + while (ctx) { + struct kioctx *next = ctx->next; + ctx->next = NULL; + aio_cancel_all(ctx); + + wait_for_all_aios(ctx); + + if (1 != atomic_read(&ctx->users)) + printk(KERN_DEBUG + "exit_aio:ioctx still alive: %d %d %d\n", + atomic_read(&ctx->users), ctx->dead, + ctx->reqs_active); + put_ioctx(ctx); + ctx = next; + } +} + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. 
+ */ +void __put_ioctx(struct kioctx *ctx) +{ + unsigned nr_reqs = ctx->max_reqs; + + if (unlikely(ctx->reqs_active)) + BUG(); + + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + ioctx_free_reqs(ctx); + kmem_cache_free(kioctx_cachep, ctx); + + atomic_sub(nr_reqs, &aio_nr); +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns -EAGAIN if no requests are free. + */ +static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx)); +static struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + struct aio_ring *ring; + + /* Check if the completion queue has enough free space to + * accept an event from this io. + */ + spin_lock_irq(&ctx->ctx_lock); + ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + if (likely(!list_empty(&ctx->free_reqs) && + (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)))) { + req = list_kiocb(ctx->free_reqs.next); + list_del(&req->ki_list); + list_add(&req->ki_list, &ctx->active_reqs); + ctx->reqs_active++; + req->ki_user_obj = NULL; + get_ioctx(ctx); + + if (unlikely(req->ki_ctx != NULL)) + BUG(); + req->ki_ctx = ctx; + if (unlikely(req->ki_users)) + BUG(); + req->ki_users = 1; + } + kunmap_atomic(ring, KM_USER0); + spin_unlock_irq(&ctx->ctx_lock); + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. 
+ */ + req = __aio_get_req(ctx); + if (unlikely(NULL == req)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + req->ki_ctx = NULL; + req->ki_filp = NULL; + req->ki_user_obj = NULL; + ctx->reqs_active--; + list_add(&req->ki_list, &ctx->free_reqs); + + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up(&ctx->wait); +} + +static void aio_fput_routine(void *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ki_ctx; + + list_del(&req->ki_list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput */ + __fput(req->ki_filp); + + /* Link the iocb into the context's free list */ + spin_lock_irq(&ctx->ctx_lock); + really_put_req(ctx, req); + spin_unlock_irq(&ctx->ctx_lock); + + put_ioctx(ctx); + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. + */ +static inline int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", + req, atomic_read(&req->ki_filp->f_count)); + + req->ki_users --; + if (unlikely(req->ki_users < 0)) + BUG(); + if (likely(req->ki_users)) + return 0; + list_del(&req->ki_list); /* remove from active_reqs */ + req->ki_cancel = NULL; + + /* Must be done under the lock to serialise against cancellation. + * Call this aio_fput as it duplicates fput via the fput_tqueue. + */ + if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { + get_ioctx(ctx); + spin_lock(&fput_lock); + list_add(&req->ki_list, &fput_head); + spin_unlock(&fput_lock); + schedule_task(&fput_tqueue); + } else + really_put_req(ctx, req); + return 1; +} + +/* aio_put_req + * Returns true if this put was the last user of the kiocb, + * false if the request is still in use. 
+ */ +int aio_put_req(struct kiocb *req) +{ + struct kioctx *ctx = req->ki_ctx; + int ret; + spin_lock_irq(&ctx->ctx_lock); + ret = __aio_put_req(ctx, req); + spin_unlock_irq(&ctx->ctx_lock); + if (ret) + put_ioctx(ctx); + return ret; +} + +/* Lookup an ioctx id. ioctx_list is lockless for reads. + * FIXME: this is O(n) and is only suitable for development. + */ +static inline struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct kioctx *ioctx; + struct mm_struct *mm; + + mm = current->mm; + read_lock(&mm->ioctx_list_lock); + for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) + if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { + get_ioctx(ioctx); + break; + } + read_unlock(&mm->ioctx_list_lock); + + return ioctx; +} + +/* aio_complete + * Called when the io request on the given iocb is complete. + * Returns true if this is the last user of the request. The + * only other user of the request can be the cancellation code. + */ +int aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ki_ctx; + struct aio_ring_info *info = &ctx->ring_info; + struct aio_ring *ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + int ret; + + /* add a completion event to the ring buffer. + * must be done holding ctx->ctx_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. 
+ */ + spin_lock_irqsave(&ctx->ctx_lock, flags); + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + event = aio_ring_event(info, tail, KM_IRQ0); + tail = (tail + 1) % info->nr; + + event->obj = (u64)(unsigned long)iocb->ki_user_obj; + event->data = iocb->ki_user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_user_obj, iocb->ki_user_data, + res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + info->tail = tail; + ring->tail = tail; + + wmb(); + put_aio_ring_event(event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); + + /* everything turned out well, dispose of the aiocb. */ + ret = __aio_put_req(ctx, iocb); + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + if (ret) + put_ioctx(ctx); + + return ret; +} + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched (0 or 1 ;-) + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
+ */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring *ring; + unsigned long head; + int ret = 0; + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->nr); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&info->ring_lock); + + head = ring->head % info->nr; + if (head != ring->tail) { + struct io_event *evp = aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + barrier(); + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } + spin_unlock(&info->ring_lock); + +out: + kunmap_atomic(ring, KM_USER0); + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + struct task_struct *p; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up_process(to->p); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + to->p = current; +} + +static inline void ts_subtract_now(struct timespec *ts) +{ + struct timeval tv; + do_gettimeofday(&tv); + ts->tv_sec -= tv.tv_sec; + ts->tv_nsec -= tv.tv_usec * 1000; + if (ts->tv_nsec < 0) { + ts->tv_nsec += 1000000000; + ts->tv_sec --; + } +} + +static inline void set_timeout(struct timeout *to, const struct timespec *ts) +{ + unsigned long how_long; + + if (ts->tv_sec < 0 || (!ts->tv_sec && !ts->tv_nsec)) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + 
add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, int nr, struct io_event *event, + const struct timespec *timeout) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct timeout to; + + /* needed to zero any padding within an entry (there shouldn't be + * any, but C is fun!) + */ + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (likely(i < nr)) { + ret = aio_read_evt(ctx, &ent); + if (unlikely(ret <= 0)) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* Could we split the check in two? */ + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + ret = 0; + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (i) + return i; + if (ret) + return ret; + + /* End fast path */ + + /* init unconditionally: the wait loop below tests to.timed_out + * even when no timeout was supplied */ + init_timeout(&to); + + if (timeout) { + struct timespec ts; + ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + goto out; + + ts_subtract_now(&ts); + set_timeout(&to, &ts); + if (to.timed_out) + timeout = 0; + } + + while (likely(i < nr)) { + add_wait_queue_exclusive(&ctx->wait, &wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx, &ent); + if (ret) + break; + if (i) + break; + ret = 0; + if (to.timed_out) /* Only check after read evt */ + break; + schedule(); + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + /*ret = aio_read_evt(ctx, &ent);*/ + } while (1) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + + if (unlikely(ret <= 0)) + break; + + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + + /* Good, event copied to userland, update counts. 
*/ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + return i ? i : ret; +} + +/* Take an ioctx and remove it from the list of ioctx's. Protects + * against races with itself via ->dead. + */ +static void io_destroy(struct kioctx *ioctx) +{ + struct mm_struct *mm = current->mm; + struct kioctx **tmp; + int was_dead; + + /* delete the entry from the list if someone else hasn't already */ + write_lock(&mm->ioctx_list_lock); + was_dead = ioctx->dead; + ioctx->dead = 1; + for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; + tmp = &(*tmp)->next) + ; + if (*tmp) + *tmp = ioctx->next; + write_unlock(&mm->ioctx_list_lock); + + dprintk("aio_release(%p)\n", ioctx); + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ + + aio_cancel_all(ioctx); + wait_for_all_aios(ioctx); + put_ioctx(ioctx); /* once for the lookup */ +} + +asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) { + pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n"); + goto out; + } + + ioctx = ioctx_alloc(nr_reqs); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) + return 0; + io_destroy(ioctx); + } + +out: + return ret; +} + +/* aio_release + * Release the kioctx associated with the userspace handle. 
+ */ +asmlinkage long sys_io_destroy(aio_context_t ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, + struct iocb *iocb)); +static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, + struct iocb *iocb) +{ + struct kiocb *req; + struct file *file; + ssize_t ret; + char *buf; + + /* enforce forwards compatibility on users */ + if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 || + iocb->aio_reserved3)) { + pr_debug("EINVAL: io_submit: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: io_submit: overflow check\n"); + return -EINVAL; + } + + file = fget(iocb->aio_fildes); + if (unlikely(!file)) + return -EBADF; + + req = aio_get_req(ctx); + if (unlikely(!req)) { + fput(file); + return -EAGAIN; + } + + req->ki_filp = file; + iocb->aio_key = req->ki_key; + ret = put_user(iocb->aio_key, &user_iocb->aio_key); + if (unlikely(ret)) { + dprintk("EFAULT: aio_key\n"); + goto out_put_req; + } + + req->ki_user_obj = user_iocb; + req->ki_user_data = iocb->aio_data; + + buf = (char *)(unsigned long)iocb->aio_buf; + + switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_put_req; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes))) + goto out_put_req; + ret = -EINVAL; + if (file->f_op->aio_read) + ret = file->f_op->aio_read(req, buf, + iocb->aio_nbytes, iocb->aio_offset); + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_put_req; + ret = -EFAULT; + if 
(unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes))) + goto out_put_req; + ret = -EINVAL; + if (file->f_op->aio_write) + ret = file->f_op->aio_write(req, buf, + iocb->aio_nbytes, iocb->aio_offset); + break; + case IOCB_CMD_FDSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(req, 1); + break; + case IOCB_CMD_FSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(req, 0); + break; + default: + dprintk("EINVAL: io_submit: no operation provided\n"); + ret = -EINVAL; + } + + if (likely(EIOCBQUEUED == ret)) + return 0; + if (ret >= 0) { + aio_complete(req, ret, 0); + return 0; + } + +out_put_req: + aio_put_req(req); + return ret; +} + +/* sys_io_submit + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write semantics). + */ +asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp) +{ + struct kioctx *ctx; + long ret = 0; + int i; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: io_submit: invalid context id\n"); + return -EINVAL; + } + + for (i=0; i<nr; i++) { + struct iocb *user_iocb, tmp; + + if (unlikely(__get_user(user_iocb, iocbpp + i))) { + ret = -EFAULT; + break; + } + + if (unlikely(__copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { + ret = -EFAULT; + break; + } + + ret = io_submit_one(ctx, user_iocb, &tmp); + if (ret) + break; + } + + put_ioctx(ctx); + return i ? i : ret; +} + +/* lookup_kiocb + * Finds a given iocb for cancellation. + * MUST be called with ctx->ctx_lock held. 
+ */ +struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key) +{ + struct list_head *pos; + /* TODO: use a hash or array, this sucks. */ + list_for_each(pos, &ctx->active_reqs) { + struct kiocb *kiocb = list_kiocb(pos); + if (kiocb->ki_user_obj == iocb && kiocb->ki_key == key) + return kiocb; + } + return NULL; +} + +/* sys_io_cancel + * Cancels the io previously submitted via iocb. If successful, + * returns 0 and places the resulting event in res. Otherwise, + * return -somerror. + */ +asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb, + struct io_event *u_res) +{ + int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct kioctx *ctx; + struct kiocb *kiocb; + struct io_event result; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->ctx_lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->ki_cancel) { + cancel = kiocb->ki_cancel; + kiocb->ki_users ++; + } else + cancel = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + if (NULL != cancel) { + printk("calling cancel\n"); + ret = cancel(kiocb, &result); + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(u_res, &result, sizeof(result))) + ret = -EFAULT; + } + } else + printk(KERN_DEBUG "iocb has no cancel operation\n"); + + put_ioctx(ctx); + + return ret; +} + +/* sys_io_getevents_abs: + * Reads at most nr completion events into the array pointed to by + * events from the io context ctx_id. 
+ */ +asmlinkage long sys_io_getevents_abs(aio_context_t ctx_id, + long nr, + struct io_event *events, + const struct timespec *when) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(NULL != ioctx)) { + ret = read_events(ioctx, nr, events, when); + put_ioctx(ioctx); + } + + return ret; +} + +/* vsys_io_getevents: runs in userspace to fetch what io events are + * available. + */ +#if 0 +__attribute__((section(".vsyscall_text"))) +asmlinkage long vsys_io_getevents(aio_context_t ctx_id, + long nr, + struct io_event *events, + const struct timespec *when) +{ + struct aio_ring *ring = (struct aio_ring *)ctx_id; + long i = 0; + + while (i < nr) { + unsigned head; + + head = ring->head; + if (head == ring->tail) + break; + + *events++ = ring->io_events[head]; + head = (head + 1) % ring->nr; + ring->head = head; + i++; + } + + if (i) + return i; + return vsys_io_getevents_slow(ctx_id, nr, events, when); +} +#endif + +__initcall(aio_setup); + +EXPORT_SYMBOL(aio_complete); +EXPORT_SYMBOL(aio_put_req); diff -urN v2.5.29/include/asm-i386/kmap_types.h aio-v2.5.29.diff/include/asm-i386/kmap_types.h --- v2.5.29/include/asm-i386/kmap_types.h Tue Jun 18 23:22:22 2002 +++ aio-v2.5.29.diff/include/asm-i386/kmap_types.h Tue Jul 30 10:38:30 2002 @@ -19,7 +19,9 @@ D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_TYPE_NR +D(9) KM_IRQ0, +D(10) KM_IRQ1, +D(11) KM_TYPE_NR }; #undef D diff -urN v2.5.29/include/asm-i386/unistd.h aio-v2.5.29.diff/include/asm-i386/unistd.h --- v2.5.29/include/asm-i386/unistd.h Thu Jun 6 00:35:32 2002 +++ aio-v2.5.29.diff/include/asm-i386/unistd.h Tue Jul 30 17:22:47 2002 @@ -247,6 +247,12 @@ #define __NR_futex 240 #define __NR_sched_setaffinity 241 #define __NR_sched_getaffinity 242 +#define __NR_set_thread_area 243 +#define __NR_io_setup 244 +#define __NR_io_destroy 245 +#define __NR_io_getevents_abs 246 +#define __NR_io_submit 247 +#define __NR_io_cancel 248 /* user-visible error numbers are in the range -1 - 
-124: see <asm-i386/errno.h> */ diff -urN v2.5.29/include/linux/aio.h aio-v2.5.29.diff/include/linux/aio.h --- v2.5.29/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ aio-v2.5.29.diff/include/linux/aio.h Tue Jul 30 17:40:17 2002 @@ -0,0 +1,118 @@ +#ifndef __LINUX__AIO_H +#define __LINUX__AIO_H + +#include <linux/tqueue.h> +#include <linux/list.h> +#include <asm/atomic.h> + +#include <linux/aio_abi.h> + +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kioctx; + +/* Notes on cancelling a kiocb: + * If a kiocb is cancelled, aio_complete may return 0 to indicate + * that cancel has not yet disposed of the kiocb. All cancel + * operations *must* call aio_put_req to dispose of the kiocb + * to guard against races with the completion code. + */ +#define KIOCB_C_CANCELLED 0x01 +#define KIOCB_C_COMPLETE 0x02 + +struct kiocb { + struct list_head ki_list; + + struct file *ki_filp; + void *ki_data; /* for use by the file */ + + struct kioctx *ki_ctx; + int ki_users; + + void *ki_user_obj; + __u64 ki_user_data; + + unsigned ki_key; /* id of this request */ + int (*ki_cancel)(struct kiocb *, struct io_event *); +}; + +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) + +#define AIO_RING_PAGES 8 +struct aio_ring_info { + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + spinlock_t ring_lock; + long nr_pages; + + unsigned nr, tail; + + struct page *internal_pages[AIO_RING_PAGES]; +}; + +struct kioctx { + atomic_t users; + int dead; + 
struct mm_struct *mm; + + /* This needs improving */ + unsigned long user_id; + struct kioctx *next; + + wait_queue_head_t wait; + + spinlock_t ctx_lock; + + int reqs_active; + struct list_head free_reqs; + struct list_head active_reqs; /* used for cancellation */ + + unsigned max_reqs; + + struct aio_ring_info ring_info; +}; + +/* prototypes */ +extern unsigned aio_max_size; + +extern int FASTCALL(aio_put_req(struct kiocb *iocb)); +extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2)); +extern void FASTCALL(__put_ioctx(struct kioctx *ctx)); +struct mm_struct; +extern void FASTCALL(exit_aio(struct mm_struct *mm)); + +#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) +#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) + +#include <linux/aio_abi.h> + +static inline struct kiocb *list_kiocb(struct list_head *h) +{ + return list_entry(h, struct kiocb, ki_list); +} + +/* for sysctl: */ +extern unsigned aio_max_nr, aio_max_size, aio_max_pinned; + +#endif /* __LINUX__AIO_H */ diff -urN v2.5.29/include/linux/aio_abi.h aio-v2.5.29.diff/include/linux/aio_abi.h --- v2.5.29/include/linux/aio_abi.h Wed Dec 31 19:00:00 1969 +++ aio-v2.5.29.diff/include/linux/aio_abi.h Tue Jul 30 17:37:40 2002 @@ -0,0 +1,89 @@ +/* linux/aio_abi.h + * + * Copyright 2000,2001,2002 Red Hat. + * + * Written by Benjamin LaHaise <bcrl@redhat.com> + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. Red Hat makes no representations about + * the suitability of this software for any purpose. 
+ * + * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, + * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND + * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + */ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +#include <asm/byteorder.h> + +typedef unsigned long aio_context_t; + +enum { + IOCB_CMD_PREAD = 0, + IOCB_CMD_PWRITE = 1, + IOCB_CMD_FSYNC = 2, + IOCB_CMD_FDSYNC = 3, + /* These two are experimental. + * IOCB_CMD_PREADX = 4, + * IOCB_CMD_POLL = 5, + */ + IOCB_CMD_NOOP = 6, +}; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#if defined(__LITTLE_ENDIAN) +#define PADDED(x,y) x, y +#elif defined(__BIG_ENDIAN) +#define PADDED(x,y) y, x +#else +#error edit for your odd byteorder. +#endif + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + */ + +struct iocb { + /* these are internal to the kernel/libc. 
*/ + __u64 aio_data; /* data to be returned in event's data */ + __u32 PADDED(aio_key, aio_reserved1); + /* the kernel sets aio_key to the req # */ + + /* common fields */ + __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ + __s16 aio_reqprio; + __u32 aio_fildes; + + __u64 aio_buf; + __u64 aio_nbytes; + __s64 aio_offset; + + /* extra parameters */ + __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */ + __u64 aio_reserved3; +}; /* 64 bytes */ + +#undef IFBIG +#undef IFLITTLE + +#endif /* __LINUX__AIO_ABI_H */ + diff -urN v2.5.29/include/linux/errno.h aio-v2.5.29.diff/include/linux/errno.h --- v2.5.29/include/linux/errno.h Fri Feb 9 17:46:13 2001 +++ aio-v2.5.29.diff/include/linux/errno.h Tue Jul 30 14:36:09 2002 @@ -10,6 +10,7 @@ #define ERESTARTNOINTR 513 #define ERESTARTNOHAND 514 /* restart if no handler.. */ #define ENOIOCTLCMD 515 /* No ioctl command */ +#define EIOCBQUEUED 516 /* Async operation is queued. */ /* Defined for the NFSv3 protocol */ #define EBADHANDLE 521 /* Illegal NFS file handle */ diff -urN v2.5.29/include/linux/fs.h aio-v2.5.29.diff/include/linux/fs.h --- v2.5.29/include/linux/fs.h Tue Jul 30 10:24:33 2002 +++ aio-v2.5.29.diff/include/linux/fs.h Tue Jul 30 10:32:38 2002 @@ -740,6 +740,7 @@ * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. 
*/ +struct kiocb; struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -759,6 +760,12 @@ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + + ssize_t (*aio_read)(struct kiocb *, char *, size_t, loff_t); + ssize_t (*aio_write)(struct kiocb *, const char *, size_t, loff_t); + int (*aio_fsync)(struct kiocb *, int datasync); + + struct kmem_cache_s *kiocb_slab; }; struct inode_operations { diff -urN v2.5.29/include/linux/sched.h aio-v2.5.29.diff/include/linux/sched.h --- v2.5.29/include/linux/sched.h Tue Jul 30 10:24:33 2002 +++ aio-v2.5.29.diff/include/linux/sched.h Tue Jul 30 10:32:38 2002 @@ -166,6 +166,7 @@ /* Maximum number of active map areas.. This is a random (large) number */ #define MAX_MAP_COUNT (65536) +struct kioctx; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; @@ -194,6 +195,11 @@ /* Architecture-specific MM context */ mm_context_t context; + + /* aio bits that have to be shared between threads */ + rwlock_t ioctx_list_lock; + struct kioctx *ioctx_list; + unsigned long new_ioctx_id; }; extern int mmlist_nr; diff -urN v2.5.29/kernel/fork.c aio-v2.5.29.diff/kernel/fork.c --- v2.5.29/kernel/fork.c Tue Jul 30 10:24:19 2002 +++ aio-v2.5.29.diff/kernel/fork.c Tue Jul 30 12:13:00 2002 @@ -25,6 +25,7 @@ #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/security.h> +#include <linux/aio.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -265,6 +266,7 @@ atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); mm->page_table_lock = SPIN_LOCK_UNLOCKED; + rwlock_init(&mm->ioctx_list_lock); mm->pgd = pgd_alloc(mm); if (mm->pgd) return mm; @@ -310,6 +312,12 @@ list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); + exit_aio(mm); /* This is 
partially wrong: it should be called + * when the last thread in a group exits to + * block exit until all IOs are cancelled. + * Here, we could block some random /proc user + * instead. Yuck. FIXME. + */ exit_mmap(mm); mmdrop(mm); } ^ permalink raw reply [flat|nested] 83+ messages in thread
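The interface posted above maps almost one-to-one onto the AIO ABI that later shipped in mainline, so its use from userspace can be sketched against the modern headers. The following is an illustration, not part of the patch: it uses `<linux/aio_abi.h>` and the `SYS_io_*` numbers from `<sys/syscall.h>` rather than the hard-coded 244-248 slots of the 2.5.29 diff, and the relative-timeout io_getevents that ultimately replaced io_getevents_abs. Linux-only.

```c
/* Sketch: submit one asynchronous write through the io_* syscalls and
 * wait for its completion event. Error handling is deliberately thin. */
#include <fcntl.h>
#include <linux/aio_abi.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns the byte count reported in the io_event, or -1 on error. */
long aio_write_once(const char *path, const char *buf, size_t len)
{
    aio_context_t ctx = 0;
    if (syscall(SYS_io_setup, 8, &ctx) < 0)   /* room for 8 in-flight iocbs */
        return -1;

    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
    if (fd < 0)
        return -1;

    struct iocb cb;
    memset(&cb, 0, sizeof(cb));               /* reserved fields must be 0 */
    cb.aio_lio_opcode = IOCB_CMD_PWRITE;
    cb.aio_fildes = fd;
    cb.aio_buf = (unsigned long)buf;
    cb.aio_nbytes = len;
    cb.aio_offset = 0;

    struct iocb *cbs[1] = { &cb };
    if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
        return -1;

    struct io_event ev;
    /* min_nr = nr = 1; NULL timeout means block until the event arrives */
    if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
        return -1;

    close(fd);
    syscall(SYS_io_destroy, ctx);
    return (long)ev.res;
}
```

Note how the multiplexing Andrea complains about surfaces here: the operation is selected by `aio_lio_opcode` inside the iocb, not by which syscall is invoked.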
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-07-30 21:54 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise @ 2002-07-31 0:44 ` Andrea Arcangeli 2002-07-31 14:46 ` Benjamin LaHaise ` (2 more replies) 0 siblings, 3 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-07-31 0:44 UTC (permalink / raw) To: Benjamin LaHaise; +Cc: Linus Torvalds, linux-kernel, linux-aio On Tue, Jul 30, 2002 at 05:54:21PM -0400, Benjamin LaHaise wrote: > On Tue, Jul 30, 2002 at 11:41:16PM +0200, Andrea Arcangeli wrote: > > Can you point me out to a patch with the new cancellation API that you > > agree with for merging in 2.5 so I can synchronize? I'm reading your > > very latest patch loaded on some site in June. that will be really > > helpful, many thanks! > > Here is what I've got for the aio core that has the cancellation > change to return the completion event. The other slight change that > I meant to get in before going into the mainstream is to have the > timeout io_getevents takes be an absolute timeout, which helps for > applications that have specific deadlines they are attempting to > schedule to (think video playback). This drop is untested, but I'd are you sure this is a good idea? this adds an implicit gettimeofday (though with no kernel entry/exit) to every getevents syscall with a "when" specified, so the user may now need to do a gettimeofday both externally and internally to get the previous "timeout" behaviour (since the kernel can only delay by a relative timeout, it now has to calculate that timeout internally). I guess I prefer the previous version that had the "timeout" information instead of "when". Also, many soft-realtime multimedia apps expect only a relative "timeout": if a frame skips they'll just slow down the frame rate, so they won't be real time but you'll see something on the screen/audio. 
Otherwise they can keep timing out endlessly if they cannot keep up with the stream, and they will show nothing rather than showing a low frame rate. So I'm not very excited about this change, I would prefer the previous version. Also consider that, with the vsyscall, doing the gettimeofday calculation in userspace based on "when" rather than in-kernel isn't going to be more expensive than your new API, even for applications that really want the "when" behaviour instead of the "timeout". Meanwhile, applications that want the "timeout" behaviour will be forced into a vgettimeofday in userspace plus another gettimeofday in the kernel, which is pure overhead for them. So unless anybody can see a flaw in my reasoning, I would suggest you back out the "when" change and resend to Linus. Also, the vsyscall sections would be better deleted than left under #if 0. Everything else looks great, thanks! Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
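Andrea's argument that the absolute-to-relative conversion belongs in userspace amounts to mirroring the patch's in-kernel ts_subtract_now() outside the kernel. A minimal sketch of such a helper follows; the name deadline_to_timeout is invented for illustration, and it takes the current time as a parameter so the subtraction itself is clock-agnostic:

```c
#include <time.h>

/* Convert an absolute deadline into the relative timeout a plain
 * "timeout"-style io_getevents would take, clamping already-expired
 * deadlines to zero (i.e. poll without sleeping). */
struct timespec deadline_to_timeout(struct timespec now, struct timespec deadline)
{
    struct timespec rel;
    rel.tv_sec = deadline.tv_sec - now.tv_sec;
    rel.tv_nsec = deadline.tv_nsec - now.tv_nsec;
    if (rel.tv_nsec < 0) {            /* borrow from the seconds field */
        rel.tv_nsec += 1000000000L;
        rel.tv_sec--;
    }
    if (rel.tv_sec < 0) {             /* deadline already passed */
        rel.tv_sec = 0;
        rel.tv_nsec = 0;
    }
    return rel;
}
```

Ben's follow-up below points out the weakness of this scheme: between computing `rel` and the kernel acting on it, the task can be preempted or the clock can move, so the conversion is inherently racy wherever it is done.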
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-07-31 0:44 ` Andrea Arcangeli @ 2002-07-31 14:46 ` Benjamin LaHaise 2002-07-31 16:31 ` Charles 'Buck' Krasic 2002-08-01 10:30 ` Pavel Machek 2 siblings, 0 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-07-31 14:46 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: Linus Torvalds, linux-kernel, linux-aio On Wed, Jul 31, 2002 at 02:44:51AM +0200, Andrea Arcangeli wrote: > So I'm not very excited about this change, I would prefer the previous > version. Also consider with the vsyscall doing the gettimeofday > calculation in userspace based on "when" rather than in-kernel isn't > going to be more expensive than your new API even of applications that > really want the "when" behaviour instead of the "timeout". While the > applications that wants the "timeout" this way we'll be forced to a > vgettimeofday in userspace and one in kernel which is a pure overhead > for them. That's still racy. There are several hundred instructions from the time the timeout is calculated until the kernel actually uses the timeout to calculate an offset relative to jiffies, during which a task switch may occur. I suppose that this could be handled via a separate timer interface (we should probably implement posix timers anyways). I can see the arguments, and I guess it's easier to just revert it. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-07-31 0:44 ` Andrea Arcangeli 2002-07-31 14:46 ` Benjamin LaHaise @ 2002-07-31 16:31 ` Charles 'Buck' Krasic 2002-08-01 10:30 ` Pavel Machek 2 siblings, 0 replies; 83+ messages in thread From: Charles 'Buck' Krasic @ 2002-07-31 16:31 UTC (permalink / raw) To: Andrea Arcangeli Cc: Benjamin LaHaise, Linus Torvalds, linux-kernel, linux-aio Andrea Arcangeli <andrea@suse.de> writes: > are you sure this is a good idea? this adds an implicit gettimeofday > (thought no entry/exit kernel) to every getevents syscall with a > "when" specificed, so the user may now need to do gettimeofday both > externally and internally to use the previous "timeout" feature (given > the kernel can delay only of a timeout, so the kernel has to calculate > the timeout internally now). I guess I prefer the previous version that > had the "timeout" information instead of "when". Also many soft > multimedia only expect the timeout to take "timeout", and if a frame > skips they'll just slowdown the frame rate, so they won't be real time > but you'll see something on the screen/audio. Otherwise they can keep > timing out endlessy if they cannot keep up with the stream, and they > will show nothing rather than showing a low frame rate. I disagree. If for some reason the multimedia player can not keep up, there will be corresponding changes to subsequent requested timeouts. For example, the pattern of future timeouts will reflect the new lower frame rate (e.g. timeout after 1/15 s instead of 1/30 s). (BTW: I've written adaptive media players, so I'm speaking from experience). How repulsive would it be to add a boolean parameter that indicates whether the supplied timeout value is relative or absolute? -- Buck ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-07-31 0:44 ` Andrea Arcangeli 2002-07-31 14:46 ` Benjamin LaHaise 2002-07-31 16:31 ` Charles 'Buck' Krasic @ 2002-08-01 10:30 ` Pavel Machek 2002-08-01 14:47 ` Benjamin LaHaise 2 siblings, 1 reply; 83+ messages in thread From: Pavel Machek @ 2002-08-01 10:30 UTC (permalink / raw) To: Andrea Arcangeli Cc: Benjamin LaHaise, Linus Torvalds, linux-kernel, linux-aio Hi! > > > Can you point me out to a patch with the new cancellation API that you > > > agree with for merging in 2.5 so I can synchronize? I'm reading your > > > very latest patch loaded on some site in June. that will be really > > > helpful, many thanks! > > > > Here is what I've got for the aio core that has the cancellation > > change to return the completion event. The other slight change that > > I meant to get in before going into the mainstream is to have the > > timeout io_getevents takes be an absolute timeout, which helps for > > applications that have specific deadlines they are attempting to > > schedule to (think video playback). This drop is untested, but I'd > > are you sure this is a good idea? this adds an implicit gettimeofday > (thought no entry/exit kernel) to every getevents syscall with a > "when" specificed, so the user may now need to do gettimeofday both > externally and internally to use the previous "timeout" feature (given > the kernel can delay only of a timeout, so the kernel has to calculate > the timeout internally now). I guess I prefer the previous version that > had the "timeout" information instead of "when". Also many soft > multimedia only expect the timeout to take "timeout", and if a frame > skips they'll just slowdown the frame rate, so they won't be real time > but you'll see something on the screen/audio. Otherwise they can keep > timing out endlessy if they cannot keep up with the stream, and they > will show nothing rather than showing a low frame rate. 
> > So I'm not very excited about this change, I would prefer the previous > version. Also consider with the vsyscall doing the gettimeofday > calculation in userspace based on "when" rather than in-kernel isn't > going to be more expensive than your new API even of applications that > really want the "when" behaviour instead of the "timeout". While the > applications that wants the "timeout" this way we'll be forced to a > vgettimeofday in userspace and one in kernel which is a pure overhead > for them. I believe Linus actually explained why "when" looks way better to him than "timeout". [It does not skew, for example.] Pavel -- I'm pavel@ucw.cz. "In my country we have almost anarchy and I don't care." Panos Katsaloulis describing me w.r.t. patents at discuss@linmodems.org ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-08-01 10:30 ` Pavel Machek @ 2002-08-01 14:47 ` Benjamin LaHaise 2002-08-01 15:00 ` Chris Friesen 0 siblings, 1 reply; 83+ messages in thread From: Benjamin LaHaise @ 2002-08-01 14:47 UTC (permalink / raw) To: Pavel Machek; +Cc: Andrea Arcangeli, Linus Torvalds, linux-kernel, linux-aio On Thu, Aug 01, 2002 at 12:30:11PM +0200, Pavel Machek wrote: > I believe Linus actually explained why "when" looks way better to him > than "timeout". [It does not skew, for example.] After thinking about it further, there is one problem with when that is avoided with timeout: if the system time is changed between the timeout calculation and the time the kernel calculates the jiffies offset, the process could be delayed much longer than desired (and fixing this case is hard enough that it should be avoided in typical code). Tradeoffs... -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-08-01 14:47 ` Benjamin LaHaise @ 2002-08-01 15:00 ` Chris Friesen 2002-08-01 16:09 ` Linus Torvalds 0 siblings, 1 reply; 83+ messages in thread From: Chris Friesen @ 2002-08-01 15:00 UTC (permalink / raw) To: Benjamin LaHaise Cc: Pavel Machek, Andrea Arcangeli, Linus Torvalds, linux-kernel, linux-aio Benjamin LaHaise wrote: > After thinking about it further, there is one problem with when that is > avoided with timeout: if the system time is changed between the timeout > calculation and the time the kernel calculates the jiffies offset, the > process could be delayed much longer than desired (and fixing this case > is hard enough that it should be avoided in typical code). Tradeoffs... Now if we had a constant monotonic source of time--say 64-bit nanoseconds since boot--this wouldn't be a problem. Chris -- Chris Friesen | MailStop: 043/33/F10 Nortel Networks | work: (613) 765-0557 3500 Carling Avenue | fax: (613) 765-2986 Nepean, ON K2H 8E9 Canada | email: cfriesen@nortelnetworks.com ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-08-01 15:00 ` Chris Friesen @ 2002-08-01 16:09 ` Linus Torvalds 2002-08-01 17:30 ` Alan Cox 0 siblings, 1 reply; 83+ messages in thread From: Linus Torvalds @ 2002-08-01 16:09 UTC (permalink / raw) To: Chris Friesen Cc: Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio On Thu, 1 Aug 2002, Chris Friesen wrote: > > Now if we had a constant monotonic source of time--say 64-bit nanoseconds since > boot--this wouldn't be a problem. Well, we do have such a monotonic time sequence already, and that's the one that the kernel always uses internally. It's called "jiffies64". However, "jiffies" are not really real time, they are only a "reasonable abstraction thereof", and while they imply ordering ("time_after()" works fine inside the kernel), they do _not_ imply real time. In other words, there is no way to move from time -> jiffies and back. But we could certainly export jiffies64 as a "nanosecond-like" thing. All it takes is one 32x64-bit multiply. It won't be "true nanoseconds", but it will be a "reasonable approximation" (ie the rate may be off by several percentage points, since nothing is correcting for it. But the "no correction" is part of the _advantage_ too). Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-08-01 16:09 ` Linus Torvalds @ 2002-08-01 17:30 ` Alan Cox 2002-08-01 16:30 ` Linus Torvalds 0 siblings, 1 reply; 83+ messages in thread From: Alan Cox @ 2002-08-01 17:30 UTC (permalink / raw) To: Linus Torvalds Cc: Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio On Thu, 2002-08-01 at 17:09, Linus Torvalds wrote: > However, "jiffies" are not really real time, they are only a "reasonable > abstraction thereof", and while they imply ordering ("time_after()" works > fine inside the kernel), they do _not_ imply real time. > > In other words, there is no way to move from time -> jiffies and back. For a lot of applications like multimedia you actually want a pure counting of time, with no relation to real time except that you can tell how many ticks elapse per second. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-08-01 17:30 ` Alan Cox @ 2002-08-01 16:30 ` Linus Torvalds 2002-08-01 16:41 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for2.5.29) Chris Friesen ` (3 more replies) 0 siblings, 4 replies; 83+ messages in thread From: Linus Torvalds @ 2002-08-01 16:30 UTC (permalink / raw) To: Alan Cox Cc: Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio On 1 Aug 2002, Alan Cox wrote: > > For a lot of applications like multimedia you actually want a counting > of time not any relation to real time except that you can tell how many > ticks elapse a second. Absolutely. I think "jiffies64" is fine (as long as it is converted to some "standard" time-measure like microseconds or nanoseconds so that people don't have to care about internal kernel state) per se. The only thing that I think makes it less than wonderful is really the fact that we cannot give an accurate measure for it. We can _say_ that we count in microseconds, but it might turn out that instead of the perfect 1000000 ticks a second there would really be 983671 ticks. A 2% error may not be a big problem for most people, of course. But it might be a huge problem for others. Those people would have to do their own re-calibration.. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for2.5.29) 2002-08-01 16:30 ` Linus Torvalds @ 2002-08-01 16:41 ` Chris Friesen 2002-08-01 18:01 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise ` (2 subsequent siblings) 3 siblings, 0 replies; 83+ messages in thread From: Chris Friesen @ 2002-08-01 16:41 UTC (permalink / raw) To: Linus Torvalds Cc: Alan Cox, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio Linus Torvalds wrote: > The only thing that I think makes it less than wonderful is really the > fact that we cannot give an accurate measure for it. We can _say_ that > what we count in microseconds, but it might turn out that instead of the > perfect 1000000 ticks a second ther would really be 983671 ticks. Would there be any benefit to using processor timestamps (rdtsc and friends) for this if they are available? I don't know the complexities of the interaction with power management, that might make it infeasible. Chris -- Chris Friesen | MailStop: 043/33/F10 Nortel Networks | work: (613) 765-0557 3500 Carling Avenue | fax: (613) 765-2986 Nepean, ON K2H 8E9 Canada | email: cfriesen@nortelnetworks.com ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) 2002-08-01 16:30 ` Linus Torvalds 2002-08-01 16:41 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for2.5.29) Chris Friesen @ 2002-08-01 18:01 ` Benjamin LaHaise 2002-08-15 23:54 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli 2002-08-01 19:18 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Chris Wedgwood 2002-08-02 8:24 ` Pavel Machek 3 siblings, 1 reply; 83+ messages in thread From: Benjamin LaHaise @ 2002-08-01 18:01 UTC (permalink / raw) To: Linus Torvalds Cc: Alan Cox, Chris Friesen, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio On Thu, Aug 01, 2002 at 09:30:04AM -0700, Linus Torvalds wrote: > Absolutely. I think "jiffies64" is fine (as long as is it converted to > some "standard" time-measure like microseconds or nanoseconds so that > people don't have to care about internal kernel state) per se. Hmmm, it almost sounds like implementing clock_gettime as a syscall and exporting jiffies as CLOCK_MONOTONIC is the way to go, as that gives a nanosecond resolution export of jiffies. Then, it would make sense to use that as the basis for "when" timeouts. Relative timeouts still have a certain simplicity to them that is appealing, though. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-01 18:01 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise @ 2002-08-15 23:54 ` Andrea Arcangeli 2002-08-16 1:42 ` Benjamin LaHaise 2002-08-16 1:53 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io " Dan Kegel 0 siblings, 2 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-15 23:54 UTC (permalink / raw) To: Benjamin LaHaise Cc: Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 01, 2002 at 02:01:12PM -0400, Benjamin LaHaise wrote: > On Thu, Aug 01, 2002 at 09:30:04AM -0700, Linus Torvalds wrote: > > Absolutely. I think "jiffies64" is fine (as long as is it converted to > > some "standard" time-measure like microseconds or nanoseconds so that > > people don't have to care about internal kernel state) per se. > > Hmmm, it almost sounds like implementing clock_gettime as a syscall and > exporting jiffies as CLOCK_MONOTONIC is the way to go, as that gives a > nanosecond resolution export of jiffies. Then, it would make sense to > use that as the basis for "when" timeouts. Relative timeouts still have > a certain simplicity to them that is appealing, though. this is all about reducing the latency window between the read of the gettimeofday in userspace and the "add_timer" executed by the aio syscall. But there will always be a window, because every timer is programmed as a "timeout", not as an absolute time. And most importantly, as said in my previous email (no matter whether jiffies64 or a similar non-second unit is exported to userspace), you will always need some gettimeofday (or clock_gettime) in userspace too, to detect whether your program can keep up with the load or is constantly running out of time. And as soon as you run gettimeofday in userspace, you run into a window similar to the one that you try to decrease (i.e. scheduling).
Of course making the window a kernel window inside a preempt_disable and a __cli() will have a beneficial effect in a number of cases, but I don't think it matters significantly because you still need some gettimeofday in userspace (or clock_gettime if that matters; clock_gettime in fact is even worse than gettimeofday due to its certainly lower resolution). Last but not least, the SuS specifications only contemplate a timeout. So we couldn't take advantage of the "absolute time" (either in second or jiffies64 units) from userspace unless you run into a non standard user (not kernel) API. Now reading the SuS specifications I also like less and less our current kernel API of this submit_io; the SuS does exactly what I suggested originally, that is, aio_read/aio_write/aio_fsync as separate calls. So the merging effect mentioned by Ben cannot be taken advantage of by the kernel anyway, because userspace will issue separate calls for each command. See the SuS API: http://www.opengroup.org/onlinepubs/007908799/xsh/aio.h.html Can you please explain why you did a completely different kernel API and you force glibc to wrap the whole thing internally on top of your different kernel API? Wouldn't it be much simpler to use the clean SuS API for the kernel too, like we do for everything else? Do you expect programmers to completely ignore the aio.h in glibc? I really like the aio.h in glibc to be the default used by the programmers so we follow a standard, and glibc should then rely on the kernel to execute those functions efficiently (instead of using threads). Or is something fundamental missing (i.e. not doable) in the SuS API so that we're forced to take a completely different route to provide the features we need? I only had a short look at the SuS aio API so far and it seems to provide the same functionality at least in the most important paths.
In short the SuS API looks good to me and I don't see why we aren't implementing it straight with kernel support (doing it in the kernel will also be faster because even if we may still implement the main I/O point as a submit_io() function, wrappers in kernel code will run faster, and having an aio_abi.h that basically matches the glibc aio.h looks much cleaner than having this current aio_abi.h intermediate thing that glibc needs to wrap around). Raising this question isn't in my interest because if somebody agrees with me, this will further delay the registration of an API and that's a problem, but I feel raising it anyway is the right thing to do, just to be sure this issue is not ignored (I'll be very happy to hear that my suggestion to implement the SuS API in the kernel is flawed for some reason :). Otherwise, if somebody agrees with me, I'll do my best to avoid adding delays even if we choose to rewrite the API completely. BTW, regardless of whether we want to make the kernel API match the user API (which I think would be the natural thing to do rather than having this special kernel API), I really would like to have "a" kernel API registered in 2.5 ASAP, otherwise I'll be forced to start using the dynamic syscall too and I much prefer to spend my time doing useful things instead. If no patch starts floating around again for submission in 2.5, I will send new patches myself. Comments are welcome (I will probably be offline until 20 August, so replies aren't very urgent), thanks! Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-15 23:54 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli @ 2002-08-16 1:42 ` Benjamin LaHaise 2002-08-16 1:57 ` Andrea Arcangeli 2002-08-16 9:39 ` Suparna Bhattacharya 2002-08-16 1:53 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io " Dan Kegel 1 sibling, 2 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-08-16 1:42 UTC (permalink / raw) To: Andrea Arcangeli Cc: Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 01:54:59AM +0200, Andrea Arcangeli wrote: > the window a kernel window inside a preempt_disable and a __cli() will > have a goodness effect in a number of cases, but I don't think it > matters significantly because you still need some gettimeofday in > userspace (or clock_gettime if that matters, clock_gettime infact is > even worse than gettimeofday due its certainly lower resolution). Yeah, I've come full circle back to the relative timeout point of view. By grabbing a copy of jiffies at the beginning of the function the race with preempt can be avoided. > Now reading the SuS specifications I also like less and less our current > kernel API of this sumbit_io, the SuS does exactly what I suggested > originally that is aio_read/aio_write/aio_fsync as separate calls. So > the merging effect mentioned by Ben cannot be taken advantage of by the > kernel anyways because userspace will issue separate calls for each > command. Read it again. You've totally missed lio_listio. Also keep in mind what happens with 4G/4G split for x86 which are needed to address the kernel virtual memory starvation issues. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 1:42 ` Benjamin LaHaise @ 2002-08-16 1:57 ` Andrea Arcangeli 2002-08-16 2:00 ` Benjamin LaHaise 2002-08-16 9:39 ` Suparna Bhattacharya 1 sibling, 1 reply; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-16 1:57 UTC (permalink / raw) To: Benjamin LaHaise Cc: Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 15, 2002 at 09:42:25PM -0400, Benjamin LaHaise wrote: > Read it again. You've totally missed lio_listio. Also keep in mind what you're saying you prefer glibc to wrap aio_read/write/fsync and to redirect them all to lio_listio after converting the iocb from the user API to the kernel API, right? Still I don't see why we should have different iocbs. I would understand if you said we should simply overwrite aio_lio_opcode inside aio_read(3) in glibc and pass it over to the kernel with a single syscall, if it's low cost to just set the lio_opcode, but having different data structures still doesn't sound the best. I mean, it would be nicer if things were more consistent. > happens with 4G/4G split for x86 which are needed to address the kernel > virtual memory starvation issues. I don't see how the flushing flood is related to this; this is a normal syscall, and any issue that applies to these aio_read/write/fsync should apply to all other syscalls too. Also the 4G starvation will more likely be fixed by x86-64, or in software by using a softpagesize larger than 4k so that the mem_map array doesn't load all of zone_normal. That'll break backwards compatibility w.r.t. the page size offset, but at least it won't generate such a significant regression for syscall performance (again, this is a generic issue, not related to async-io as far as I can tell). Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 1:57 ` Andrea Arcangeli @ 2002-08-16 2:00 ` Benjamin LaHaise 2002-08-16 2:08 ` Linus Torvalds 2002-08-16 2:32 ` Andrea Arcangeli 0 siblings, 2 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-08-16 2:00 UTC (permalink / raw) To: Andrea Arcangeli Cc: Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 03:57:17AM +0200, Andrea Arcangeli wrote: > you're saying you prefer glibc to wrap the aio_read/write/fsync and to > redirect all them to lio_listio after converting the iocb from user API to > kernel API, right? still I don't see why should we have different iocb, > I would understsand if you say we should simply overwrite aio_lio_opcode > inside the aio_read(3) inside glibc and to pass it over to kernel with a > single syscalls if it's low cost to just set the lio_opcode, but having > different data structures doesn't sounds the best still. I mean, it > would be nicer if things would be more consistent. The iocb is as minimally different from the posix aio api as possible. The main reason for the difference is that struct sigevent is unreasonably huge. A lightweight posix aio implementation on top of the kernel API shares the fields between the kernel iocb and the posix aiocb. > I don't see how the flushing flood is related to this, this is a normal > syscall, any issue that applies to these aio_read/write/fsync should > apply to all other syscalls too. Also the 4G starvation will be more > likely fixed by x86-64 or in software by using a softpagesize larger > than 4k so that the mem_map array doesn't load all the zone_normal. A 4G/4G split flushes the TLB on every syscall. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:00 ` Benjamin LaHaise @ 2002-08-16 2:08 ` Linus Torvalds 2002-08-16 2:16 ` Benjamin LaHaise 2002-08-16 2:32 ` Rik van Riel 2002-08-16 2:32 ` Andrea Arcangeli 1 sibling, 2 replies; 83+ messages in thread From: Linus Torvalds @ 2002-08-16 2:08 UTC (permalink / raw) To: Benjamin LaHaise Cc: Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, 15 Aug 2002, Benjamin LaHaise wrote: > > A 4G/4G split flushes the TLB on every syscall. This is just not going to happen. It will have to continue being a 3/1G split, and we'll just either find a way to move stuff to highmem and shrink the "struct page", or we'll just say "screw those 16GB+ machines on x86". Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:08 ` Linus Torvalds @ 2002-08-16 2:16 ` Benjamin LaHaise 2002-08-16 2:40 ` Andrea Arcangeli ` (2 more replies) 2002-08-16 2:32 ` Rik van Riel 1 sibling, 3 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-08-16 2:16 UTC (permalink / raw) To: Linus Torvalds Cc: Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 15, 2002 at 07:08:30PM -0700, Linus Torvalds wrote: > > On Thu, 15 Aug 2002, Benjamin LaHaise wrote: > > > > A 4G/4G split flushes the TLB on every syscall. > > This is just not going to happen. It will have to continue being a 3/1G > split, and we'll just either find a way to move stuff to highmem and > shrink the "struct page", or we'll just say "screw those 16GB+ machines on > x86". I wish life were that simple. Unfortunately, struct page isn't the only problem with these abominations: the system can run out of kvm for vm_area_struct, task_struct, files... Personally, I *never* want to see those data structures being kmap()'d as it would hurt kernel code quality whereas a 4G/4G split is well confined, albeit sickening. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:16 ` Benjamin LaHaise @ 2002-08-16 2:40 ` Andrea Arcangeli 2002-08-16 3:43 ` Linus Torvalds 2002-08-17 3:46 ` Martin J. Bligh 2 siblings, 0 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-16 2:40 UTC (permalink / raw) To: Benjamin LaHaise Cc: Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 15, 2002 at 10:16:47PM -0400, Benjamin LaHaise wrote: > On Thu, Aug 15, 2002 at 07:08:30PM -0700, Linus Torvalds wrote: > > > > On Thu, 15 Aug 2002, Benjamin LaHaise wrote: > > > > > > A 4G/4G split flushes the TLB on every syscall. > > > > This is just not going to happen. It will have to continue being a 3/1G > > split, and we'll just either find a way to move stuff to highmem and > > shrink the "struct page", or we'll just say "screw those 16GB+ machines on > > x86". > > I wish life were that simple. Unfortunately, struct page isn't the only > problem with these abominations: the system can run out of kvm for > vm_area_struct, task_struct, files... Personally, I *never* want to see > those data structures being kmap()'d as it would hurt kernel code quality > whereas a 4G/4G split is well confined, albeit sickening. after the mem_map is gone, there's still the option of CONFIG_2G or even CONFIG_1G if kernel metadata is the problem. Of course it wouldn't be a generic kernel, but I guess a 4G/4G would probably be even less generic. In short we can do little at runtime to be generic. I guess a 16G with large softpagesize should be not too bad now that the pagetables are in highmem, most problematic is >16G. Not that the softpagesize is easy at all to implement (4G/4G is certainly simpler because self contained in the include/arch) but at least it can payoff for the lower mem setups too. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:16 ` Benjamin LaHaise 2002-08-16 2:40 ` Andrea Arcangeli @ 2002-08-16 3:43 ` Linus Torvalds 2002-08-16 3:50 ` Linus Torvalds 2002-08-17 3:46 ` Martin J. Bligh 2 siblings, 1 reply; 83+ messages in thread From: Linus Torvalds @ 2002-08-16 3:43 UTC (permalink / raw) To: Benjamin LaHaise Cc: Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, 15 Aug 2002, Benjamin LaHaise wrote: > I wish life were that simple. Unfortunately, struct page isn't the only > problem with these abominations: the system can run out of kvm for > vm_area_struct, task_struct, files... Personally, I *never* want to see > those data structures being kmap()'d as it would hurt kernel code quality > whereas a 4G/4G split is well confined, albeit sickening. A 4G/4G split will perform _so_ badly that it isn't even funny (it's also technically impossible since you have to have some shared area anyway, but you can get pretty close to it). My bet is that we'll never do it due to performance issues. It's just simpler to make the high pages end up being some special stuff (ie the old "swap victim cache" etc that wouldn't show up to the VM proper). Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 3:43 ` Linus Torvalds @ 2002-08-16 3:50 ` Linus Torvalds 2002-08-16 4:47 ` William Lee Irwin III 0 siblings, 1 reply; 83+ messages in thread From: Linus Torvalds @ 2002-08-16 3:50 UTC (permalink / raw) To: Benjamin LaHaise Cc: Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, 15 Aug 2002, Linus Torvalds wrote: > > My bet is that we'll never do it due to performance issues. It's just > simpler to make the high pages end up being some special stuff (ie the old > "swap victim cache" etc that wouldn't show up to the VM proper). Actually, the simplest scenario is to just make an arbitrary cut-off at 8G or 16G of RAM, and make anything above it default to the hugetlb zone, and make that use a separate hugetlb map (which does refcounts at 2MB granularity). And create fake "struct page" entries for those things that have to have it, along with a separate kmap area that holds a few of the big mappings. There's an almost complete overlap between people who want hugetlb and 64GB x86 machines anyway, so I doubt you'd find people to complain. And the advantage of the hugetlb stuff is exactly the fact that the normal VM doesn't need to worry about it. It's nonswappable, and doesn't get IO done into it through any of the normal paths. Minimal impact. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 3:50 ` Linus Torvalds @ 2002-08-16 4:47 ` William Lee Irwin III 0 siblings, 0 replies; 83+ messages in thread From: William Lee Irwin III @ 2002-08-16 4:47 UTC (permalink / raw) To: Linus Torvalds Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 15, 2002 at 08:50:58PM -0700, Linus Torvalds wrote: > Actually, the simplest schenario is to just make an arbitrary cut-off at > 8G or 16G of RAM, and make anything above it default to the hugetlb zone, > and make that use a separate hugetlb map which does refcounts at 2MB > granularity). And create fake "struct page" entries for those things that > have to have it, along with a separate kmap area that holds a few of the > big mappings. > There's an almost complete overlap between people who want hugetlb and > 64GB x86 machines anyway, so I doubt you'd find people to complain. > And the advantage of the hugetlb stuff is exactly the fact that the normal > VM doesn't need to worry about it. It's nonswappable, and doesn't get IO > done into it through any of the normal paths. > Minimal impact. Ew! 64GB doesn't need or want any of that. It just needs bugfixes more badly than most machines & page clustering to keep mem_map down to a decent size IIRC. The "highmem ratio" is irrelevant, it's just implementing page clustering and bugfixing. The core kernel doesn't need to know the page size is fake, and needs the rest of the bugfixes for e.g. buffer_heads proliferating out of control anyway. No magic. No uglies. 32GB has already been booted & verified to run Linux. The only badness is sizeof(mem_map) and the usual contingent of "buffer_heads are out of control and shrink_cache() can't figure out when to shoot down a slab" issues. 
And, well, performance issues show up now and then but those improvements propagate back to the smaller machines, albeit in smaller proportions. This notion of cutting highmem boxen off at 16GB really does not sound hot at all. At the very least webserving goes on at 32+GB and that has no use whatsoever for the hugetlb stuff. Still other workloads, e.g. client front ends for databases, are in similar positions. People are actually trying to get things done that need more than 32GB pagecache on i386 machines. The big reason the database people are interested in hugetlb stuff is to work around the pagetable proliferation bugs. They got a backdoor to take over the VM, so they're even happier than they would be with a bugfix. But they're not the only workloads around, and the bugfix is still needed for all 64-bit and more general 32-bit workloads (it was not fixed by pte-highmem). Cheers, Bill ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:16 ` Benjamin LaHaise 2002-08-16 2:40 ` Andrea Arcangeli 2002-08-16 3:43 ` Linus Torvalds @ 2002-08-17 3:46 ` Martin J. Bligh 2002-08-17 4:00 ` Linus Torvalds 2 siblings, 1 reply; 83+ messages in thread From: Martin J. Bligh @ 2002-08-17 3:46 UTC (permalink / raw) To: Benjamin LaHaise, Linus Torvalds Cc: Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio >> > A 4G/4G split flushes the TLB on every syscall. >> >> This is just not going to happen. It will have to continue being a 3/1G >> split, and we'll just either find a way to move stuff to highmem and >> shrink the "struct page", or we'll just say "screw those 16GB+ machines on >> x86". > > I wish life were that simple. Unfortunately, struct page isn't the only > problem with these abominations: the system can run out of kvm for > vm_area_struct, task_struct, files... Personally, I *never* want to see > those data structures being kmap()'d as it would hurt kernel code quality > whereas a 4G/4G split is well confined, albeit sickening. At least some of those you don't have to kmap ... at least not in the traditional sense. This sort of thing is a good application for the per-process (or per-task) kernel virtual address area. you just map in the stuff you need for your own task, instead of having to share the global space with everybody. Some things have to be global (well, easier at least) like the task_struct, but the kernel stacks could be moved out with a little work, files, vm_area_structs, etc. That sounds more appealing to me than either kmap or a 4G/4G split. M. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 3:46 ` Martin J. Bligh @ 2002-08-17 4:00 ` Linus Torvalds 2002-08-17 4:15 ` Martin J. Bligh 2002-08-17 4:36 ` William Lee Irwin III 0 siblings, 2 replies; 83+ messages in thread From: Linus Torvalds @ 2002-08-17 4:00 UTC (permalink / raw) To: Martin J. Bligh Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, 16 Aug 2002, Martin J. Bligh wrote: > > At least some of those you don't have to kmap ... at least not in > the traditional sense. This sort of thing is a good application > for the per-process (or per-task) kernel virtual address area. > you just map in the stuff you need for your own task, instead > of having to share the global space with everybody. Careful. The VM space is shared _separately_ from other data structures, which means that you can _not_ use per-VM virtual address areas and expect them to scale with load. And then some VM happens to have thousands of threads, and you're dead. > Some things > have to be global (well, easier at least) like the task_struct, > but the kernel stacks could be moved out with a little work, > files, vm_area_structs, etc. Kernel stacks most certainly can't do this easily, since you'll just hit the scalability problem somewhere else (ie many threads, same VM). And files, for example, can not only be many files for one VM, you can have the reverse too, ie many VM's, one file table. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:00 ` Linus Torvalds @ 2002-08-17 4:15 ` Martin J. Bligh 2002-08-17 4:46 ` Linus Torvalds 2002-08-17 4:36 ` William Lee Irwin III 1 sibling, 1 reply; 83+ messages in thread From: Martin J. Bligh @ 2002-08-17 4:15 UTC (permalink / raw) To: Linus Torvalds Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio >> At least some of those you don't have to kmap ... at least not in >> the traditional sense. This sort of thing is a good application >> for the per-process (or per-task) kernel virtual address area. >> you just map in the stuff you need for your own task, instead >> of having to share the global space with everybody. > > Careful. > > The VM space is shared _separately_ from other data structures, which > means that you can _not_ use per-VM virtual address areas and expect them > to scale with load. And then some VM happens to have thousands of threads, > and you're dead. OK ... not sure I understand the exact scenario you're envisioning, but I can certainly see some problems in that area. There are two different ways we could do this (or a combination of both), and I'm not 100% sure if they solve the problems you mention, but it'd be interesting to see what you think. 1. We have a per-process UKVA (user-kernel virtual address space), which is great for per-process stuff like mapping pagetables. Dave McCracken made an implementation of this that carves off a fixed amount of space between the top of the stack and PAGE_OFFSET. That makes highpte more efficient by saving the kmaps most of the time (or it should). 2. A per task UKVA, that'd probably have to come out of something like the vmalloc space. I think Bill Irwin derived something like that from Dave's work, though I'm not sure it's complete & working as yet. 
Per task things like the kernel stack (minus the task_struct & waitqueues) could go in here. > Kernel stacks most certainly can't do this easily, since you'll just hit > the scalability problem somewhere else (ie many threads, same VM). Does (2) solve some of the thread scalability problems you're worried about? > And files, for example, can not only be many files for one VM, you can > have the reverse too, ie many VM's, one file table. Could we fix this by having multiple tasks map the same page and share it? Still much less vaddr space overhead than global? Hopefully I haven't totally missed your point ... if so, hit me again, but harder and slower ;-) M. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:15 ` Martin J. Bligh @ 2002-08-17 4:46 ` Linus Torvalds 2001-11-02 5:12 ` Pavel Machek ` (3 more replies) 0 siblings, 4 replies; 83+ messages in thread From: Linus Torvalds @ 2002-08-17 4:46 UTC (permalink / raw) To: Martin J. Bligh Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, 16 Aug 2002, Martin J. Bligh wrote: > > 1. We have a per-process UKVA (user-kernel virtual address space), What is your definition of a "process"? Linux doesn't really have any such thing. Linux threads share different amounts of stuff, and a traditional process just happens to share nothing. However, since they _can_ share more, it's damn hard to see what a "per-process" mapping means. > 2. A per task UKVA, that'd probably have to come out of something > like the vmalloc space. I think Bill Irwin derived something like > that from Dave's work, though I'm not sure it's complete & working > as yet. Per task things like the kernel stack (minus the task_struct > & waitqueues) could go in here. And what is your definition of a "task"? You seem to think that a task is one thread ("per task things like the kernel stack"), ie a 1:1 mapping with a "struct task_struct". But if you have such a mapping, then you _cannot_ make a per-task VM space, because many tasks will share the same VM. You cannot even do a per-cpu mapping change (and rewrite the VM on thread switch), since the VM is _shared_ across CPU's, and absolutely has to be in order to work with CPU's that do TLB fill in hardware (eg x86). The fact is, that in order to get the right TLB behaviour, the _only_ thing you can do is to have a "per-MM UKVA". It's not per thread, and it's not per process. It's one per MM, which is _neither_. And this is where the problems come in. Since it is per-MM (and thus shared across CPU's) updates need to be SMP-safe. 
And since it is per-MM, it means that _any_ data structure that might be shared across different MM's is really really dangerous to put in this thing (think virtual caches on some hardware). And since it is per-MM, it means that anything that there can be multiple of per MM (which is pretty much _every_ data structure in the kernel) cannot go at a fixed address or anything like that, but needs to be allocated within the per-MM area dynamically. I suspect that you are used to the traditional UNIX "process" notion, where a "process" has exactly one file table, and has exactly one set of signals, one set of semaphores etc. In that setup it can be quite convenient to map these into the VM address space at magical addresses. You may also be used to per-CPU page tables or software TLB fill situations, where different CPU's can have different TLB contents. That can be used to have per-thread mappings. Again, that doesn't work on Linux due to page table sharing and hw TLB fills. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:46 ` Linus Torvalds @ 2001-11-02 5:12 ` Pavel Machek 2002-08-17 5:04 ` Linus Torvalds ` (2 subsequent siblings) 3 siblings, 0 replies; 83+ messages in thread From: Pavel Machek @ 2001-11-02 5:12 UTC (permalink / raw) To: Linus Torvalds Cc: Martin J. Bligh, Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio Hi! > > But if you have such a mapping, then you _cannot_ make a per-task VM > space, because many tasks will share the same VM. You cannot even do a > per-cpu mapping change (and rewrite the VM on thread switch), since the VM > is _shared_ across CPU's, and absolutely has to be in order to work with > CPU's that do TLB fill in hardware (eg x86). You could have different %cr3 on different CPUs and use page tables as TLBs (emulating software-filled TLBs, basically); but that smells like "bye bye performance". Pavel -- Philips Velo 1: 1"x4"x8", 300gram, 60, 12MB, 40bogomips, linux, mutt, details at http://atrey.karlin.mff.cuni.cz/~pavel/velo/index.html. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:46 ` Linus Torvalds 2001-11-02 5:12 ` Pavel Machek @ 2002-08-17 5:04 ` Linus Torvalds 2002-08-17 5:24 ` lots of mem on 32 bit machines (was: aio-core why not using SuS?) Martin J. Bligh 2002-08-17 5:12 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Martin J. Bligh 2002-08-20 0:35 ` Ingo Molnar 3 siblings, 1 reply; 83+ messages in thread From: Linus Torvalds @ 2002-08-17 5:04 UTC (permalink / raw) To: Martin J. Bligh Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, 16 Aug 2002, Linus Torvalds wrote: > > I suspect that you are used to the traditional UNIX "process" notion, > where a "process" has exactly one file table, and has exactly one set of > signals, one set of semaphores etc. In that setup it can be quite > convenient to map these into the VM address space at magical addresses. Btw, at this point I should say that that doesn't mean that I'm _against_ a per-VM kmap table, I'm just pointing out that it's not a trivial issue. Andrea and I talked about exactly this at OLS, because Andrea would have liked to use it for handling the generic_file_write() kmap case without having to worry about running out of kmap's and the deadlocks we used to have in that space (before the atomic user copy approach). And the thing is, you _can_ use a per-VM kmap setup, but it really only moves the problem from a global kmap space ("everybody shares the same VM") into a slightly smaller subset of it, a global thread kmap ("all threads share the same VM"). So at least in that particular case, by moving it from a global space to a per-VM space, the DoS wrt generic_file_write() didn't actually go away. It just had to be triggered slightly differently (ie using lots of threads). There may be other cases where this is ok. 
Moving to a per-VM kmap space may not _fix_ some fundamental scalability problem, but it might move it further out and make it a non-issue under normal load. Which is why I don't think the idea is fundamentally flawed, I just wanted to point out some of the traps to people since we've already almost fallen into some of them.. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: lots of mem on 32 bit machines (was: aio-core why not using SuS?) 2002-08-17 5:04 ` Linus Torvalds @ 2002-08-17 5:24 ` Martin J. Bligh 0 siblings, 0 replies; 83+ messages in thread From: Martin J. Bligh @ 2002-08-17 5:24 UTC (permalink / raw) To: Linus Torvalds Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio > Btw, at this point I should say that that doesn't mean that I'm _against_ > a per-VM kmap table, I'm just pointing out that it's not a trivial issue. > > Andrea and I talked about exactly this at OLS, because Andrea would have > liked to use it for handling the generic_file_write() kmap case without > having to worry about running out of kmap's and the deadlocks we used to > have in that space (before the atomic user copy approach). I know ;-) We talked over that one a lot ... but I think we ended up just avoiding the problem (every solution we came up with sucked). I wasn't so worried about avoiding kmap pool exhaustion as reducing TLB flushing - if we put 4096 entries in the pool, and can cope with blocking if the silly thing does happen to fill up, that seems OK. The *really* nice thing is that you can do all sorts of smart stuff with the TLB at this point. As we now duplicated the pool per VM, it just got much, much larger. Chances are by the time you come to re-use an entry you've already TLB flushed all the CPUs from context switching. Keep track of that and you *never* tlb flush for kmap. But in the end the two main users of kmap seem to be copy_to/from_user stuff and highpte - using atomic for the former and the per-VM space for the latter seems to fix the problem far enough that nobody could see it anymore, so nobody cares ;-) > And the thing is, you _can_ use a per-VM kmap setup, but it really only > moves the problem from a global kmap space ("everybody shares the same > VM") into a slightly smaller subset of it, a global thread kmap ("all > threads share the same VM"). 
Well, I disagree with the "slightly" in that statement, but otherwise yes ;-) If you're not running heavy threading, it works brilliantly. If you are, it's better, but not necessarily brilliant. > There may be other cases where this is ok. Moving to a per-VM kmap space > may not _fix_ some fundamental scalability problem, but it might move it > further out and make it a non-issue under normal load. Which is why I > don't think the idea is fundamentally flawed, I just wanted to point out > some of the traps to people since we've already almost fallen into some of > them.. Yup, I think we ended up pushing it out by other means, but it's still interesting. Things like this only need to be pushed out until the timely death of 32 bit machines ;-) If I'm not insane about the per-task stuff in the previous email, that could be used for a per-task kmap, which would be much nicer all round. But the chances of me being insane are pretty good ;-) M. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:46 ` Linus Torvalds 2001-11-02 5:12 ` Pavel Machek 2002-08-17 5:04 ` Linus Torvalds @ 2002-08-17 5:12 ` Martin J. Bligh 2002-08-17 17:02 ` Linus Torvalds 2002-08-22 16:12 ` Andrea Arcangeli 2002-08-20 0:35 ` Ingo Molnar 3 siblings, 2 replies; 83+ messages in thread From: Martin J. Bligh @ 2002-08-17 5:12 UTC (permalink / raw) To: Linus Torvalds Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio >> 1. We have a per-process UKVA (user-kernel virtual address space), > > What is your definition of a "process"? Sorry ... per shared address space ... so multiple tasks sharing an mm are 1 process (to me at least). > And what is your definition of a "task"? > > You seem to think that a task is one thread ("per task things like the > kernel stack"), ie a 1:1 mapping with a "struct task_stuct". yup, that's what I meant. task == 1 task_struct. > But if you have such a mapping, then you _cannot_ make a per-task VM > space, because many tasks will share the same VM. You cannot even do a > per-cpu mapping change (and rewrite the VM on thread switch), since the VM > is _shared_ across CPU's, and absolutely has to be in order to work with > CPU's that do TLB fill in hardware (eg x86). I don't see why the area above PAGE_OFFSET has to be global, or per VM (by which I'm assuming you're meaning the set of pagetables per process, aka group of tasks sharing an mm). Assume 3 level page tables and a 3/1 user/kernel split for the sake of argument. The bottom 3 PGD entries point to user PMD pages, and the top 1 to the kernel PMD page. At the moment, the PGDs are per VM, but say we make them per task instead ... each task also gets a copy of the standard kernel PMD (which never changes in the normal course of things). In that PMD, we tweak the top couple of entries to point to a per-task set of entries ... 
but the rest of the PMD entries all point back to a shared set of PTE entries (well, except ZONE_NORMAL is all large pages, so there ain't none for those). Yes, I guess you'd have to TLB flush on the context switch with shared mm's which you don't have to do now, and you'd use an extra couple of pages per task, but that's of phys ram, not vaddr space which is what's precious. I think that all works, but it's kind of late ;-) It also has the advantage that you can wedge kernel text replication for NUMA in the per-task PMD entries. > The fact is, that in order to get the right TLB behaviour, the _only_ > thing you can do is to have a "per-MM UKVA". It's not per thread, and it's > not per process. It's one per MM, which is _neither_. OK, I was defining a process as the set of tasks sharing an MM. You seem to have a different definition - could you clarify for me? > And this is where the problems come in. Since it is per-MM (and thus > shared across CPU's) updates need to be SMP-safe. And since it is per-MM, > it means that _any_ data structure that might be shared across different > MM's are really really dangerous to put in this thing (think virtual > caches on some hardware). OK ... I wasn't thinking virtual caches, I'll admit. But how many crazy 32 bit architectures do we have wanting 64Gb of RAM? ;-) I *think* this is OK for ia32? 64 bit machines don't care about all this nonsense. > And since it is per-MM, it means that anything that there can be multiple > of per MM (which is pretty much _every_ data structure in the kernel) > cannot go at a fixed address or anything like that, but needs to be > allocated within the per-MM area dynamically. right. I didn't mean to imply that it's trivial, or a panacea, but it's an interesting concept. M. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 5:12 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Martin J. Bligh @ 2002-08-17 17:02 ` Linus Torvalds 2002-08-17 21:27 ` 32 bit arch with lots of RAM Martin J. Bligh 2002-08-22 16:15 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli 2002-08-22 16:12 ` Andrea Arcangeli 1 sibling, 2 replies; 83+ messages in thread From: Linus Torvalds @ 2002-08-17 17:02 UTC (permalink / raw) To: Martin J. Bligh Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, 16 Aug 2002, Martin J. Bligh wrote: > > I don't see why the area above PAGE_OFFSET has to be global, or per > VM (by which I'm assuming you're meaning the set of pagetables per > process, aka group of tasks sharing an mm). Basic issue: if the VM's aren't _identical_ (in every way, including the kernel one), they cannot share the page tables in an SMP environment with two threads running on two CPU's at the same time. And once you cannot share the page tables, you're screwed. > Assume 3 level page tables and a 3/1 user/kernel split for the sake > of argument. No, no, that's the wrong way to go about it. You have to show a _portable_ way to do it, not a "if I assume this, I can do it". For example, on x86 with the regular 2-level page tables, if you want to have different kernel mappings, you have to copy the page directory per-CPU, and then on task switch you have to change the PGD appropriately. Which, btw, means that you have to invalidate the TLB for that CPU, even if you would otherwise not have needed to. Look at how the lazy TLB switching works, and realize that two threads can _switch_ CPU's as things stand now, without ever a single TLB invalidate happening. 
They can take over the TLB of the other thread when they move to another CPU. You'd break that, horribly and fundamentally. So to make this work, you'd have to have: - architecture-specific hacks - realize that not all architectures can do it at all, so the places that depend on this would have to have some abstraction that makes it go away when not needed. - fix up lazy TLB switching (conditionally on the hack). It just sounds really messy to me. Linus ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: 32 bit arch with lots of RAM 2002-08-17 17:02 ` Linus Torvalds @ 2002-08-17 21:27 ` Martin J. Bligh 2002-08-22 16:30 ` Andrea Arcangeli 2002-08-22 16:15 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli 1 sibling, 1 reply; 83+ messages in thread From: Martin J. Bligh @ 2002-08-17 21:27 UTC (permalink / raw) To: Linus Torvalds Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio >> Assume 3 level page tables and a 3/1 user/kernel split for the sake >> of argument. > > No, no, that's the wrong way to go about it. You have to show a > _portable_ way to do it, not a "if I assume this, I can do it". All I was doing was trying to illustrate that it's actually possible. I wasn't actually proposing doing it like this, there's probably better ways. Much less was I expecting you to like it - hell, I don't like it either ;-) So when I go on below to try to prove that it does work ... don't think I'm trying to sell it ;-) Maybe this was Washer's idea, maybe it wasn't ... we talked this over a lot for NUMA kernel text replication and I can't remember who came up with it, to be perfectly honest. He might be happier denying it anyway ;-) > Basic issue: if the VM's aren't _identical_ (in every way, including the > kernel one), they cannot share the page tables in an SMP environment with > two threads running on two CPU's at the same time. True ... to an extent. They can't share PGDs. > And once you cannot share the page tables, you're screwed. OK, firstly, I won't deny it's an arch specific hack. But ... I don't think you're screwed, as long as you never have to update the part that's split. Under the ia32 PAE mode, that's pretty much going to be the case, as long as all PMDs are incarnated all the time, which they're going to be anyway - there's text segment in the first Gb, libraries in the second, stack in the third, and kernel in the fourth. 
If for some reason that wasn't true, you'd just create the blank PMD anyway. So we have split the PGD per task, but the PMDs and PTEs for user space are all still shared, which is all that matters, because that's all we ever have to update. The PMD for kernel space is not shared, but we never update that either. > For example, on x86 with the regular 2-level page tables, if you > want to have different kernel mappings, you have to copy the > page directory per-CPU, and then on task switch you have to change > the PGD appropriately. Right, it doesn't work for 2 level pagetables very well. But I can't see people without PAE actually wanting it. In fact, I don't know of anyone but large ia32 PAE machines who'd want it ... would be interested to hear about anyone else who's got this sort of virtual address space pressure. > Which, btw, means that you have to invalidate the TLB for that CPU, even > if you would otherwise not have needed to. Look at how the lazy TLB > switching works, and realize that two threads can _switch_ CPU's as things > stand now, without ever a single TLB invalidate happening. They can take > over the TLB of the other thread when they move to another CPU. You'd > break that, horribly and fundamentally. Yup, I know that, and I won't deny it's horrible ;-) Would be much nicer if we had a flush_tlb_range that worked on that chip, but still. Not good for heavy threading. But bear in mind the alternative we were talking about (well, Ben was talking about, and you didn't want to talk about ;-)) is TLB flushing every system call, not every context switch between threads. Personally, I think that's worse. > It just sounds really messy to me. It is ;-) Implementing it vaguely cleanly would be hard. But I still think it's an intriguing concept ... the other problem I've been looking at is kernel text replication for ia32, and that's hard too. This actually solves both problems, which is probably the only feather in its cap. 
If anyone has any other ways to solve the replication problem I'd be most interested ... (people muttered things about using segmentation once in a dark and dingy corner, but refuse to admit who they were). M. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: 32 bit arch with lots of RAM 2002-08-17 21:27 ` 32 bit arch with lots of RAM Martin J. Bligh @ 2002-08-22 16:30 ` Andrea Arcangeli 2002-08-22 16:36 ` Martin J. Bligh 0 siblings, 1 reply; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-22 16:30 UTC (permalink / raw) To: Martin J. Bligh Cc: Linus Torvalds, Benjamin LaHaise, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Sat, Aug 17, 2002 at 02:27:34PM -0700, Martin J. Bligh wrote: > Not good for heavy threading. But bear in mind the alternative we > were talking about (well, Ben was talking about, and you didn't want > to talk about ;-)) is TLB flushing every system call, not every > context switch between threads. Personally, I think that's worse. yes that's worse but that was meant to enlarge the ZONE_NORMAL, not to reduce the kmap overhead. Even with the per-task VM virtual zone that changes at every switch_to, you'd still have the ZONE_NORMAL shortage problem. > looking at is kernel text replication for ia32, and that's hard too. > This actually solves both problems, which is probably the only feather > in its cap. If anyone has any other ways to solve the replication > problem I'd be most interested ... (people muttered things about using > segmentation once in a dark and dingy corner, but refuse to admit who > they were). actually another way to do it is with .text replicated in the kernel image at different virtual addresses 2M naturally aligned. So then you can have each numa node kernel entry points set at different offsets, and during context switch across nodes you can adjust the regs->eip depending on the next node you're going to run on. Of course page faults fixup exceptions will need to learn about these replicated text offsets too. I'm not 100% sure it's really doable but at the moment I don't see anything fundamental that forbids that. That would avoid the tlb flushes across switch_to. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: 32 bit arch with lots of RAM 2002-08-22 16:30 ` Andrea Arcangeli @ 2002-08-22 16:36 ` Martin J. Bligh 0 siblings, 0 replies; 83+ messages in thread From: Martin J. Bligh @ 2002-08-22 16:36 UTC (permalink / raw) To: Andrea Arcangeli Cc: Linus Torvalds, Benjamin LaHaise, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio > yes that's worse but that was meant to enlarge the ZONE_NORMAL, not to > reduce the kmap overhead. Even with the per-task VM virtual zone that > changes at every switch_to, you'd still have the ZONE_NORMAL shortage > problem. I disagree - the problem with ZONE_NORMAL is that we're stuffing things that aren't global into a global space, and thus using up space*NR_TASKS instead of space, which we would do if we remapped things on a per-task basis. Effectively you're enlarging ZONE_NORMAL by doing this - it's just not global any more. > actually another way to do it is with .text replicated in the kernel > image at different virtual addresses 2M naturally aligned. So then you > can have each numa node kernel entry points set at different offsets, > and during context switch across nodes you can adjust the regs->eip > depending on the next node you're going to run on. Of course page faults > fixup exceptions will need to learn about these replicated text offsets > too. I'm not 100% sure it's really doable but at the moment I don't see > anything fundamental that forbids that. That would avoid the tlb > flushes across switch_to. This would be roughly the plan if we were to do segmentation tricks, but I can't see how you'd avoid rewriting all the pointer stuff (like jumps) within the kernel by just having different virtual addresses. Segmentation (setting CS to the offset) deals with that problem for you I think, but I'm sure you understand this whole area better than I do. More of a problem for 32 bit machines with this is it consumes virtual address space at a rate of kernel_size_rounded_up_to_2M * NR_NODES. 
See discussion above re: ZONE_NORMAL consumption ;-) Kernel text replication is going to be an arch specific hack for every case anyway ... ia64 has some magic to shove fake entries into the TLB I believe, but we don't have that ;-( M. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 17:02 ` Linus Torvalds 2002-08-17 21:27 ` 32 bit arch with lots of RAM Martin J. Bligh @ 2002-08-22 16:15 ` Andrea Arcangeli 1 sibling, 0 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-22 16:15 UTC (permalink / raw) To: Linus Torvalds Cc: Martin J. Bligh, Benjamin LaHaise, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Sat, Aug 17, 2002 at 10:02:23AM -0700, Linus Torvalds wrote: > So to make this work, you'd have to have: > - architecture-specific hacks > - realize that not all architectures can do it at all, so the places that > depend on this would have to have some abstraction that makes it go > away when not needed. > - fix up lazy TLB switching (conditionally on the hack). > > It just sounds really messy to me. Indeed. Assuming this is a hack under a CONFIG_X86_NUMA_HACK hardwired for certain config options and certain architectures, the tlb flushing across threads sounds like the worst part, in particular because it's an x86. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 5:12 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Martin J. Bligh 2002-08-17 17:02 ` Linus Torvalds @ 2002-08-22 16:12 ` Andrea Arcangeli 1 sibling, 0 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-22 16:12 UTC (permalink / raw) To: Martin J. Bligh Cc: Linus Torvalds, Benjamin LaHaise, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 10:12:04PM -0700, Martin J. Bligh wrote: > Yes, I guess you'd have to TLB flush on the context switch with > shared mm's which you don't have to do now, and you'd use an extra Flushing the tlb across thread context switches sounds like the worst part. One of the main reasons to use threads is to avoid tlb flushing across context switches and to reduce the context switch nearly to a longjump through kernel space (i.e. use threads instead of segmentation to skip the tlb flushes across context switches). If we had ASN on the x86 that wouldn't be such a big problem, a tlb flush in the common case would just bump the current ASN, however the main lack of the x86 architecture is the lack of tlb tagging (hope it'll be fixed soon, it's definitely fixable without breaking backwards compatibility so eventually some x86 hardware vendor will wake up and the others will have to follow). So I would guess adding some per-VM locking like a mm->kmap_sem to serialize the use of the per-VM pool of kmaps sounds better unless we get address space numbers on 32bit x86; over days of computations there are going to be many more context switches than page faults, however it also depends on the workload. The point about needing the tlb flush anyways for replicated .text is valid, however not all the SMP highmem boxes are necessarily NUMA boxes too, and if they aren't NUMA I guess they prefer not to flush the tlb over context switches. 
You may probably want to do some measurements about the overhead of tlb flushes by adding a __flush_tlb() in switch_to across two tasks (not across kernel threads though) to simulate the behaviour of your proposed per-task PMD design. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:46 ` Linus Torvalds ` (2 preceding siblings ...) 2002-08-17 5:12 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Martin J. Bligh @ 2002-08-20 0:35 ` Ingo Molnar 3 siblings, 0 replies; 83+ messages in thread From: Ingo Molnar @ 2002-08-20 0:35 UTC (permalink / raw) To: Linus Torvalds Cc: Martin J. Bligh, Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, 16 Aug 2002, Linus Torvalds wrote: > But if you have such a mapping, then you _cannot_ make a per-task VM > space, because many tasks will share the same VM. You cannot even do a > per-cpu mapping change (and rewrite the VM on thread switch), since the > VM is _shared_ across CPU's, and absolutely has to be in order to work > with CPU's that do TLB fill in hardware (eg x86). i'm just trying to insert the notion here that it *is* possible to do 'software TLB fill' on x86 as well - it's just too much pain and very likely not worth it. The pgd entry of the top 4MB mapping can be filled in temporarily, the space accessed (causing a hw TLB fill), and then the pgd entry can be zeroed out again - keeping the 'soft filled TLB' still intact. This assumes that the intermediate pgd value cannot be observed by any other CPU - which can be achieved via either cross-CPU calls (lots of overhead to the TLB miss 'handler'), or the hope that freshly accessed & rewritten, locked cachelines are not seen by other CPUs, yet. (some CPUs do define a certain window of non-observation for locked MESI lines, in which the soft TLB handling stuff can be done, theoretically.) This necessitates the disabling of interrupts, and worse, NMIs, so it's really flaky. once the TLB gets flushed it causes a fault again - at which point the whole 'sw TLB fill' ordeal has to begin again.
so this is not practical at all, but perhaps interesting. If eg. kernel-space used 4MB pages only for this purpose then we would not get many 'TLB misses', because on most (all?) x86 CPUs the large-page TLBs are isolated from the 4K page TLBs. They could even survive TLB flushes via the PGE bit set. but this is so hw-specific that the use of x86 segmentation looks like a highlevel language in comparison :-) Ingo ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-17 4:00 ` Linus Torvalds 2002-08-17 4:15 ` Martin J. Bligh @ 2002-08-17 4:36 ` William Lee Irwin III 1 sibling, 0 replies; 83+ messages in thread From: William Lee Irwin III @ 2002-08-17 4:36 UTC (permalink / raw) To: Linus Torvalds Cc: Martin J. Bligh, Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 09:00:52PM -0700, Linus Torvalds wrote: > Careful. > The VM space is shared _separately_ from other data structures, which > means that you can _not_ user per-VM virtual address areas and expect them > to scale with load. And than some VM happens to have thousands of threads, > and you're dead. This is a clear and present danger, and the strategies to deal with them, although they're not ready for presentation, are in development. At some point in the past, Martin Bligh wrote: >> Some things have to be global (well, easier at least) like the >> task_struct, but the kernel stacks could be moved out with a little >> work, files, vm_area_structs, etc. On Fri, Aug 16, 2002 at 09:00:52PM -0700, Linus Torvalds wrote: > Kernel stacks most certainly can't do this easily, since you'll just hit > the scalability problem somewhere else (ie many threads, same VM). > And files, for example, can not only be many files for one VM, you can > have the reverse too, ie many VM's, one file table. Stacks are probably not the foremost priority here. First and foremost come the bugs that stop smaller workloads cold. But as you've said, scalability problems can and will arise in stacks and still other things mapped in similar ways by the traditional UNIX architecture. The demands 32-bit machines can make of a generic kernel are limited, and this is understood. 
The intention of our developers is not to corrupt the generic kernel with hacks that will not extend beyond 32 bits, but to exploit the hardware to which we have access to expose the largest number of scalability issues possible and address them. And to the best of our abilities we will do the work necessary to address them ourselves as opposed to imposing burdens on others. And ideally we will also benefit common users with the same patches. Please understand we are absolutely not interested in working against your intentions but only working with them and taking on labor ourselves to advance the state of the Linux kernel for all workloads, both large and small. And that means swapping (which TPC/H does not do) and IDE. Yes, I'm making promises that I myself cannot keep. But I am confident that the support is present to such a degree that I am absolutely certain the contributions and contributors to make them happen will be funded, with or without my participation. Cheers, Bill ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:08 ` Linus Torvalds 2002-08-16 2:16 ` Benjamin LaHaise @ 2002-08-16 2:32 ` Rik van Riel 1 sibling, 0 replies; 83+ messages in thread From: Rik van Riel @ 2002-08-16 2:32 UTC (permalink / raw) To: Linus Torvalds Cc: Benjamin LaHaise, Andrea Arcangeli, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, 15 Aug 2002, Linus Torvalds wrote: > On Thu, 15 Aug 2002, Benjamin LaHaise wrote: > > > > A 4G/4G split flushes the TLB on every syscall. > > This is just not going to happen. It will have to continue being a 3/1G > split, and we'll just either find a way to move stuff to highmem and > shrink the "struct page", or we'll just say "screw those 16GB+ machines > on x86". I don't like a 4G/4G split at all, either. But on the other hand, I don't hate it as much as all the kludges that are being pushed into the kernel to support these large machines right now ... As long as it's just these huge machines that suffer, and not the sane systems ;) regards, Rik -- Bravely reimplemented by the knights who say "NIH". http://www.surriel.com/ http://distro.conectiva.com/ ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 2:00 ` Benjamin LaHaise 2002-08-16 2:08 ` Linus Torvalds @ 2002-08-16 2:32 ` Andrea Arcangeli 1 sibling, 0 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-16 2:32 UTC (permalink / raw) To: Benjamin LaHaise Cc: Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 15, 2002 at 10:00:54PM -0400, Benjamin LaHaise wrote: > On Fri, Aug 16, 2002 at 03:57:17AM +0200, Andrea Arcangeli wrote: > > you're saying you prefer glibc to wrap the aio_read/write/fsync and to > > redirect all them to lio_listio after converting the iocb from user API to > > kernel API, right? still I don't see why should we have different iocb, > > I would understsand if you say we should simply overwrite aio_lio_opcode > > inside the aio_read(3) inside glibc and to pass it over to kernel with a > > single syscalls if it's low cost to just set the lio_opcode, but having > > different data structures doesn't sounds the best still. I mean, it > > would be nicer if things would be more consistent. > > The iocb is as minimally different from the posix aio api as possible. The > main reason for the difference is that struct sigevent is unreasonably huge. > A lightweight posix aio implementation on top of the kernel API shares the > fields between the kernel iocb and the posix aiocb. /* extra parameters */ __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */ __u64 aio_reserved3; so you want the conversion to only store the pointer (if any) for the sigevent in the iocb, rather than the whole sigevent, right? This is an argument that makes technical sense and that I can happily buy for having a different iocb. However your argument also depends on whether I/O completion notification via signal is the common case or not. I guess in theory it should be the common case for software designed for best performance.
> > > I don't see how the flushing flood is related to this, this is a normal > > syscall, any issue that applies to these aio_read/write/fsync should > > apply to all other syscalls too. Also the 4G starvation will be more > > likely fixed by x86-64 or in software by using a softpagesize larger > > than 4k so that the mem_map array doesn't load all the zone_normal. > > A 4G/4G split flushes the TLB on every syscall. sure, that's why it's so slow. This applies to read/writes/exceptions/interrupts and everything else kernel side. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 1:42 ` Benjamin LaHaise 2002-08-16 1:57 ` Andrea Arcangeli @ 2002-08-16 9:39 ` Suparna Bhattacharya 2002-08-16 10:03 ` Andrea Arcangeli 2002-08-16 13:43 ` Dan Kegel 1 sibling, 2 replies; 83+ messages in thread From: Suparna Bhattacharya @ 2002-08-16 9:39 UTC (permalink / raw) To: Benjamin LaHaise Cc: Andrea Arcangeli, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Thu, Aug 15, 2002 at 09:42:25PM -0400, Benjamin LaHaise wrote: > > Now reading the SuS specifications I also like less and less our current > > kernel API of this sumbit_io, the SuS does exactly what I suggested > > originally that is aio_read/aio_write/aio_fsync as separate calls. So > > the merging effect mentioned by Ben cannot be taken advantage of by the > > kernel anyways because userspace will issue separate calls for each > > command. > > Read it again. You've totally missed lio_listio. Also keep in mind what > Also, wasn't the fact that the API was designed to support both POSIX and completion port style semantics, another reason for a different (lightweight) in-kernel api? The c10k users of aio are likely to find the latter model (i.e. completion ports) more efficient. Regards Suparna ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 9:39 ` Suparna Bhattacharya @ 2002-08-16 10:03 ` Andrea Arcangeli 2002-08-16 11:23 ` Suparna Bhattacharya 2002-08-16 13:43 ` Dan Kegel 1 sibling, 1 reply; 83+ messages in thread From: Andrea Arcangeli @ 2002-08-16 10:03 UTC (permalink / raw) To: Suparna Bhattacharya Cc: Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote: > Also, wasn't the fact that the API was designed to support both POSIX > and completion port style semantics, another reason for a different > (lightweight) in-kernel api? The c10k users of aio are likely to find > the latter model (i.e. completion ports) more efficient. if it's handy for you, can you post a link to the API defined by POSIX and completion ports so I can read them too and not only SuS? btw, I don't see why there are so many APIs doing the same thing, I think for the goodness of linux it would be nice to standardize on and recommend one of these user APIs so new software will use the API we recommend now, rather than choosing almost randomly every time. So the rest will be backwards compatibility stuff for apps ported from other OS, and it will be worthwhile to have the kernel API match what we recommend as user API. Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 10:03 ` Andrea Arcangeli @ 2002-08-16 11:23 ` Suparna Bhattacharya 2002-08-16 11:28 ` Suparna Bhattacharya 2002-09-02 18:40 ` Andrea Arcangeli 0 siblings, 2 replies; 83+ messages in thread From: Suparna Bhattacharya @ 2002-08-16 11:23 UTC (permalink / raw) To: Andrea Arcangeli Cc: Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 12:03:34PM +0200, Andrea Arcangeli wrote: > On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote: > > Also, wasn't the fact that the API was designed to support both POSIX > > and completion port style semantics, another reason for a different > > (lightweight) in-kernel api? The c10k users of aio are likely to find > > the latter model (i.e. completion ports) more efficient. > > if it's handy for you, can you post a link to the API defined by > POSIX and completion ports so I can read them too and not only SuS? Don't have anything handy atm that's any better than what you could get through doing a google on "IO Completion ports". (See section at the end of this note for some info) Completion port apis aren't really part of any standard, but provided by some operating systems (NT, AS/400), most of which use a similar interface. I personally found it useful to refer to the DAFS completion groups API (DAFS API Spec at www.dafscollaborative.org) just to get an idea of something that takes into account these various existing interfaces to arrive at an interface for async i/o completion (even though this really is all direct user-space api implementation for remote file data access and nothing to do with in kernel i/o interfaces). 
> > btw, I don't see why there are so many API doing the same thing, I think > for the goodness of linux it would be nice to standardize and recommend > one of these user API so new software will use the API we recommend now, > rather than choosing almost randomly every time. So the rest will be > backwards compatibilty stuff for apps ported from other OS, and it will > be worthwhile to have the kernel API to match what we recommend as user > API. Since you are analysing this stuff I wonder if you have by any chance looked through the aio design notes I had posted a while back. I did try to discuss the background in terms of completion apis used elsewhere even though I didn't record the specific details of those interfaces. Am appending that section of the doc below. Regards Suparna ------------------------------------------- 2.5 Completion/Readiness notification: Comment: Readiness notification can be treated as a completion of an asynchronous operation to await readiness. POSIX aio provides for waiting for completion of a particular request, or for an array of requests, either by means of polling, or asynchronously through signals. On some operating systems, there is a notion of an I/O Completion port (IOCP), which provides a flexible and scalable way of grouping completion events. One can associate multiple file descriptors with such a completion port, so that all completion events for requests on those files are sent to the completion port. The application can thus issue a wait on the completion port in order to get notified of any completion event for that group. The level of concurrency can be increased simply by increasing the number of threads waiting on the completion port.
There are also certain additional concurrency control features that can be associated with IOCPs (as on NT), where the system decides how many threads to wake up when completion events occur, depending on the concurrency limits set for the queue, and the actual number of runnable threads at that moment. Keeping the number of runnable threads constant in this manner protects against blocking due to page faults and other operations that cannot be performed asynchronously. On a similar note, the DAFS api spec incorporates completion groups for handling async i/o completion, the design being motivated by VI completion queues, NT IOCPs and the Solaris aiowait interfaces. Association of an i/o with a completion group (NULL would imply the default completion queue) happens at the time of i/o submission, which lets the provider know where to place the event when it completes, contrary to an aio_suspend style interface which specifies the grouping only when waiting on completion. This implementation for Linux makes use of a similar notion to provide support for completion queues. There are apis to set up and destroy such completion queues, specifying the maximum queue lengths that a queue is configured for. Every asynchronous i/o request is associated with a completion queue when it is submitted (like the DAFS interfaces), and an application can issue a wait on a given queue to be notified of a completion event for any request associated with that queue. BSD kqueue (Jonathan Lemon) provides a very generic method for registering for and handling notification of events or conditions based on the concept of filters of different types. This covers a wide range of conditions including file/socket readiness notification (as in poll), directory/file (vnode) change notifications, process create/exit/stop notifications, signal notification, timer notification and also aio completion notification (via SIGEV_EVENT).
The kqueue is equivalent to a completion queue, and the interface allows one to both register for events and wait for (and pick up) any events on the queue within the same call. It is rather flexible in terms of providing for various kinds of event registration/notification requirements, e.g. oneshot or every time, temporary disabling, clearing state if transitions need to be notified, and it supports both edge and level triggered types of filters.

2.5.1 Some Requirements which are addressed:

1. Efficient for large numbers of events and connections

- The interface to register events to wait for should be separate from the interface used to actually poll/wait for the registered events to complete (unlike traditional poll/select), so that registrations can hold across multiple poll waits with minimum user-kernel transfers. (It is better to handle this at interface definition level than through some kind of an internal poll cache.) The i/o submission routine takes a completion queue as a parameter, which associates/registers the events with a given completion group/queue. The application can issue multiple waits on the completion queue using a separate interface.

- Ability to reap many events together (unlike the current sigtimedwait and sigwaitinfo interfaces). The interface used to wait for and retrieve events can return an array of completed events rather than just a single event.

- Scalable/tunable queue limits - at least have a limit per queue rather than system wide limits. Queue limits can be specified when creating a completion group. TBD: A control interface for changing queue parameters/limits (e.g. io_queue_grow) might be useful.

- Room for more flexible/tunable wakeup semantics for better concurrency control. Since the core event queue can be separated from the notification mechanism, the design allows one to provide for alternative wakeup semantics to optimize concurrency and reduce redundant or under-utilized context switches.
Implementing these might require some additional parameters or interfaces to be defined. BTW, it is desirable to provide a unified interface for notification and event retrieval to a caller, to avoid synchronization complexities, even if the core policies are separable underneath in-kernel. [See the discussion in Sec 2.6 on wakeup policies for a more detailed discussion on this]

2. Enable flexible grouping of operations

- Flexible grouping at the time of i/o submission (different operations on the same fd can belong to different groups, operations on different fds can belong to the same group)

- Ability to wait for at least a specified number of operations from a specified group to complete (at least N vs at least 1 helps with batching on the way up, so that the application can perform its post processing activities in a batch, without redundant context switches). The DAFS api supports such a notion, both in its cg_batch_wait interface, which returns when either N events have completed or with less than N events in case of a timeout, and also in the form of a num_completions hint at the time of i/o submission. The latter is a hint that gets sent out to the server as a characteristic of the completion queue or session, so the server can use this hint to batch its responses accordingly. Knowing that the caller is interested only in batch completions helps with appropriate optimizations. Note: The Linux aio implementation today only supports "at least one" and not "at least N" (e.g. the aio_nwait interface on AIX). The tradeoffs between responsiveness and fairness issues tend to get amplified when considering "at least N" type of semantics, and this is one of the main concerns in supporting it. [See discussion on wakeup policies later]

- Support dynamic additions to the group rather than a static or one time list passed through a single call. Multiple i/o submissions can specify the same completion group, enabling events to be added to the group.
[Question: Is the option of the completion group being different from the submission batch/group (i.e. per iocb grouping field) useful to have ? Like POSIX using sigevent as part of iocb] 3. Should also be able to wait for a specific operation to complete (without being very inefficient about it) One could either have low overhead group setup/teardown so such an operation may be assigned a group of its own (costs can be amortized across multiple such operations by reusing the same group if possible) or provide an interface to wait for a specific operation to complete. The latter would be more useful, though it requires a per-request wait queue or something similar. The current implementation has a syscall interface defined for this (io_wait), which hasn't been coded up as yet. The plan is to use hashed wait queues to conserve on space. There are also some semantics issues in terms of possibilities of another waiter on the queue picking up the corresponding completion event for this operation. To address this, the io_wait interface might be modified to include an argument for the returned event. BTW, there is an option of dealing with this using the group primitives either in user space, or even in kernel by waiting in a loop for any event in the group until the desired event occurs, but this could involve some extra interim wakeups / context switches under the covers, and a user level event distribution mechanism for the other events picked up in the meantime. 4. Enable Flexible distribution of responsibility across multiple threads/components Different threads can handle submission for different operations, and another pool of threads could wait on completion. The degree of concurrency can be improved simply by increasing threads in the pool that wait for and process completion of operations for that group. 5. 
Support for Prioritized Event Delivery This involves the basic infrastructure to be able to accord higher priority to the delivery of certain completion events over others, (e.g. depending on the request priority settings of the corresponding request), i.e. if multiple completion events have arrived on the queue, then the events for higher priorities should be picked up first by the application. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 11:23 ` Suparna Bhattacharya @ 2002-08-16 11:28 ` Suparna Bhattacharya 2002-08-16 13:49 ` Dan Kegel 2002-09-02 18:40 ` Andrea Arcangeli 1 sibling, 1 reply; 83+ messages in thread From: Suparna Bhattacharya @ 2002-08-16 11:28 UTC (permalink / raw) To: Andrea Arcangeli Cc: Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Fri, Aug 16, 2002 at 04:53:06PM +0530, Suparna Bhattacharya wrote: > On Fri, Aug 16, 2002 at 12:03:34PM +0200, Andrea Arcangeli wrote: > > On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote: > > > Also, wasn't the fact that the API was designed to support both POSIX > > > and completion port style semantics, another reason for a different > > > (lightweight) in-kernel api? The c10k users of aio are likely to find > > > the latter model (i.e. completion ports) more efficient. > > > > if it's handy for you, can you post a link to the API defined by > > POSIX and completion ports so I can read them too and not only SuS? > > Don't have anything handy atm that's any better than what you could > get through doing a google on "IO Completion ports". (See section at > the end of this note for some info) Oh sorry, I should have mentioned Dan Kegel's site which actually has all the pointers you need. See http://www.kegel.com/c10k.html (It has pointers to links to both NT and OS/400 completion ports) Regards Suparna ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 11:28 ` Suparna Bhattacharya @ 2002-08-16 13:49 ` Dan Kegel 0 siblings, 0 replies; 83+ messages in thread From: Dan Kegel @ 2002-08-16 13:49 UTC (permalink / raw) To: suparna Cc: Andrea Arcangeli, Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio Suparna Bhattacharya wrote: > > On Fri, Aug 16, 2002 at 04:53:06PM +0530, Suparna Bhattacharya wrote: > > On Fri, Aug 16, 2002 at 12:03:34PM +0200, Andrea Arcangeli wrote: > > > On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote: > > > > Also, wasn't the fact that the API was designed to support both POSIX > > > > and completion port style semantics, another reason for a different > > > > (lightweight) in-kernel api? The c10k users of aio are likely to find > > > > the latter model (i.e. completion ports) more efficient. > > > > > > if it's handy for you, can you post a link to the API defined by > > > POSIX and completion ports so I can read them too and not only SuS? > > > > Don't have anything handy atm that's any better than what you could > > get through doing a google on "IO Completion ports". (See section at > > the end of this note for some info) > > Oh sorry, I should have mentioned Dan Kegel's site which actually > has all the pointers you need. See http://www.kegel.com/c10k.html > (It has pointers to links to both NT and OS/400 completion ports) but not yet the DAFS stuff, thanks, I'll link to that. - Dan ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-08-16 11:23 ` Suparna Bhattacharya 2002-08-16 11:28 ` Suparna Bhattacharya @ 2002-09-02 18:40 ` Andrea Arcangeli 2002-09-03 12:04 ` aio-core in 2.5 - io_queue_wait and io_getevents Suparna Bhattacharya 2002-09-05 5:21 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Benjamin LaHaise 1 sibling, 2 replies; 83+ messages in thread From: Andrea Arcangeli @ 2002-09-02 18:40 UTC (permalink / raw) To: Suparna Bhattacharya Cc: Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio Could somebody explain the semantics of the io_queue_wait call in the libaio? If you pass nr == 0 to getevents, getevents will do nothing. I don't see the point of it, so I'm unsure what the right implementation is. Then about the 2.5 API: we have the min_nr that allows "at least min_nr", instead of the previous default of "at least 1", so that it allows implementing the aio_nwait of aix. However the code checks for min_nr being > 0, but a min_nr == 0 does not make sense. So min_nr should always be > 0 (in fact the previous default was at least 1, because as said at least 0 doesn't make sense). Same issue with the nr: nr == 0 also doesn't make sense to me, and I think nr should be > 0 as well (that's my issue with the apparently pointless io_queue_wait too). However as long as the API doesn't change much I'm fine; if there are minor -EINVAL differences with bad inputs there should not be many compatibility issues, and right now we're more permissive, so if anything 2.6 will be less permissive, and that will guarantee apps written for 2.6 work right on current 2.5.
So what I'm doing now is to be in sync with 2.5, and I'm implementing io_queue_wait this way:

int io_queue_wait(io_context_t ctx, const struct timespec *timeout)
{
	return io_getevents(ctx, 0, 0, NULL, timeout);
}

My preferred solution is to kill io_queue_wait, which apparently only generates a spurious lookup of the iocontext in-kernel, and then to force min_nr > 0 and nr > 0. But I need your opinion on this, also because you certainly know the semantics of io_queue_wait that I couldn't easily reverse engineer from the source code (or maybe I overlooked something in the source code, possible). Grepping l-k for io_queue_wait shows no results, google only shows the glibc patches with no comment at all. The regression tests as well never use it. Of course that's not a surprise, since as far as I can tell it cannot do anything in either old or new code, but I need to find out if it is buggy or if it should really be dropped. BTW, the libaio I'm adapting to test on my tree will not have the libredhat thing anymore, and it will use the mainline 2.5 API since the API is registered now and in the very worst case a non backwards compatible API change would happen in late 2.5; replacing libaio.so is not more complex than replacing libredhat.so anyways ;). Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core in 2.5 - io_queue_wait and io_getevents 2002-09-02 18:40 ` Andrea Arcangeli @ 2002-09-03 12:04 ` Suparna Bhattacharya 2002-09-05 5:21 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Benjamin LaHaise 1 sibling, 0 replies; 83+ messages in thread From: Suparna Bhattacharya @ 2002-09-03 12:04 UTC (permalink / raw) To: Andrea Arcangeli Cc: Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio Changed the title to reflect the latest discussion. Just wanted to comment on the nwait bit. On Mon, Sep 02, 2002 at 08:40:43PM +0200, Andrea Arcangeli wrote: > > then about the 2.5 API we have such min_nr that allows the "at least > min_nr", instead of the previous default of "at least 1", so that it > allows implementing the aio_nwait of aix. Partly, in the sense that the implementation still doesn't avoid extra wakeups when less than min_nr events are available at a time (if we are unlucky enough to have the min_nr events dripping in slowly one at a time, we'd still have all those context switches, wouldn't we?), though it saves on the extra user-kernel transitions on those wakeups compared to implementing this in user-space over an at-least-one primitive. It is possible to play around with the implementation later though. The important bit is having "at least N" in the interface exported by the kernel, which is good. Regards Suparna > Andrea ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] 2002-09-02 18:40 ` Andrea Arcangeli 2002-09-03 12:04 ` aio-core in 2.5 - io_queue_wait and io_getevents Suparna Bhattacharya @ 2002-09-05 5:21 ` Benjamin LaHaise 1 sibling, 0 replies; 83+ messages in thread From: Benjamin LaHaise @ 2002-09-05 5:21 UTC (permalink / raw) To: Andrea Arcangeli Cc: Suparna Bhattacharya, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio On Mon, Sep 02, 2002 at 08:40:43PM +0200, Andrea Arcangeli wrote: > Could somebody explain the semantics of the io_queue_wait call in the > libaio? If you pass nr == 0 to getevents, getevents will do nothing. I > don't see the point of it so I'm unsure what's the right implementation. It was supposed to wait for events to be ready. In reality what ended up happening is that people don't actually like to use io_queue_run/wait and the function callbacks as Linus originally suggested in the event model, and instead they prefer to use io_getevents directly. libaio just hasn't been updated to reflect that yet. > then about the 2.5 API we have such min_nr that allows the "at least > min_nr", instead of the previous default of "at least 1", so that it > allows implementing the aio_nwait of aix. It was also required to break source compilation for the timeout update. > BTW, the libaio I'm adapting to test on my tree will not have the > libredhat thing anymore, and it will use the mainline 2.5 API since the > API is registered now and in the very worst case a non backwards > compatible API change would happen in late 2.5, replacing libaio.so is > not more complex than replacing libredhat.so anyways ;). That was already the intent for libaio-0.4.0. -ben -- "You will be reincarnated as a toad; and you will be much happier." ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)]
  2002-08-16 9:39 ` Suparna Bhattacharya
  2002-08-16 10:03 ` Andrea Arcangeli
@ 2002-08-16 13:43 ` Dan Kegel
  2002-08-16 14:21 ` Jamie Lokier
  2002-08-16 15:40 ` John Gardiner Myers
  1 sibling, 2 replies; 83+ messages in thread
From: Dan Kegel @ 2002-08-16 13:43 UTC (permalink / raw)
To: suparna
Cc: Benjamin LaHaise, Andrea Arcangeli, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio

Suparna Bhattacharya wrote:
>
> On Thu, Aug 15, 2002 at 09:42:25PM -0400, Benjamin LaHaise wrote:
> > > Now reading the SuS specifications I also like less and less our current
> > > kernel API of this submit_io, the SuS does exactly what I suggested
> > > originally that is aio_read/aio_write/aio_fsync as separate calls. So
> > > the merging effect mentioned by Ben cannot be taken advantage of by the
> > > kernel anyways because userspace will issue separate calls for each
> > > command.
> >
> > Read it again. You've totally missed lio_listio. Also keep in mind what
>
> Also, wasn't the fact that the API was designed to support both POSIX
> and completion port style semantics, another reason for a different
> (lightweight) in-kernel api? The c10k users of aio are likely to find
> the latter model (i.e. completion ports) more efficient.

You can actually consider posix AIO using sigtimedwait() to pick up completion notices to fit the definition of a completion port, if you squint a bit. (The patented scheduler magic of NT completion ports is just a fun extra...)

- Dan
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)]
  2002-08-16 13:43 ` Dan Kegel
@ 2002-08-16 14:21 ` Jamie Lokier
  2002-08-16 14:42 ` Benjamin LaHaise
  2002-08-16 15:40 ` John Gardiner Myers
  1 sibling, 1 reply; 83+ messages in thread
From: Jamie Lokier @ 2002-08-16 14:21 UTC (permalink / raw)
To: Dan Kegel
Cc: suparna, Benjamin LaHaise, Andrea Arcangeli, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio

Dan Kegel wrote:
> You can actually consider posix AIO using sigtimedwait() to pick up
> completion notices to fit the definition of completion port if you
> squint a bit.

... with the bonus that it fits comfortably into a sigtimedwait() loop that's waiting for non-AIO things too.

-- Jamie
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)]
  2002-08-16 14:21 ` Jamie Lokier
@ 2002-08-16 14:42 ` Benjamin LaHaise
  0 siblings, 0 replies; 83+ messages in thread
From: Benjamin LaHaise @ 2002-08-16 14:42 UTC (permalink / raw)
To: Jamie Lokier
Cc: Dan Kegel, suparna, Andrea Arcangeli, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio

On Fri, Aug 16, 2002 at 03:21:33PM +0100, Jamie Lokier wrote:
> Dan Kegel wrote:
> > You can actually consider posix AIO using sigtimedwait() to pick up
> > completion notices to fit the definition of completion port if you
> > squint a bit.
>
> ... with the bonus that it fits comfortably into a sigtimedwait() loop
> that's waiting for non-AIO things too.

The idea was to make completion events as light weight as possible -- they can be read from the queue without even entering kernel space. Support for getting multiple completion events is also needed (sigtimedwait() only pulls one signal at a time). Nothing is stopping us from adding support to do an async sigtimedwait that provides a completion event when a signal arrives.

-ben
--
"You will be reincarnated as a toad; and you will be much happier."
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)]
  2002-08-16 13:43 ` Dan Kegel
  2002-08-16 14:21 ` Jamie Lokier
@ 2002-08-16 15:40 ` John Gardiner Myers
  2002-08-23 16:11 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re:async-io " Dan Kegel
  1 sibling, 1 reply; 83+ messages in thread
From: John Gardiner Myers @ 2002-08-16 15:40 UTC (permalink / raw)
To: linux-aio; +Cc: linux-kernel

Dan Kegel wrote:

> You can actually consider posix AIO using sigtimedwait() to pick up completion
> notices to fit the definition of completion port if you squint a bit.

Except that signal queues are far too short to be useful for c10k. It's also not possible to allocate a queue (signal number) in a thread-safe manner.

Posix AIO is a horrid interface. Ben has done much better.
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re:async-io API registration for 2.5.29)]
  2002-08-16 15:40 ` John Gardiner Myers
@ 2002-08-23 16:11 ` Dan Kegel
  0 siblings, 0 replies; 83+ messages in thread
From: Dan Kegel @ 2002-08-23 16:11 UTC (permalink / raw)
To: John Gardiner Myers; +Cc: linux-aio, linux-kernel

John Gardiner Myers wrote:
>
> Dan Kegel wrote:
>
> > You can actually consider posix AIO using sigtimedwait() to pick up completion
> > notices to fit the definition of completion port if you squint a bit.
>
> Except that signal queues are far too short to be useful for c10k. It's
> also not possible to allocate a queue (signal number) in a thread safe
> manner.
>
> Posix AIO is a horrid interface. Ben has done much better.

You're quite right. Still, posix AIO with sigtimedwait() might be enough prior art to invalidate Microsoft's patent on completion ports.

- Dan
* Re: aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)]
  2002-08-15 23:54 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli
  2002-08-16 1:42 ` Benjamin LaHaise
@ 2002-08-16 1:53 ` Dan Kegel
  1 sibling, 0 replies; 83+ messages in thread
From: Dan Kegel @ 2002-08-16 1:53 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Benjamin LaHaise, Linus Torvalds, Alan Cox, Chris Friesen, Pavel Machek, linux-kernel, linux-aio

Andrea Arcangeli wrote:
> See the SuS API:
>
> http://www.opengroup.org/onlinepubs/007908799/xsh/aio.h.html

There's a new release of the SuS, see

http://www.opengroup.org/onlinepubs/007904975/basedefs/aio.h.html

Harder to miss lio_listio() with the new one.

- Dan
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)
  2002-08-01 16:30 ` Linus Torvalds
  2002-08-01 16:41 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for2.5.29) Chris Friesen
  2002-08-01 18:01 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise
@ 2002-08-01 19:18 ` Chris Wedgwood
  2002-08-01 19:25 ` Linus Torvalds
  2002-08-02 8:24 ` Pavel Machek
  3 siblings, 1 reply; 83+ messages in thread
From: Chris Wedgwood @ 2002-08-01 19:18 UTC (permalink / raw)
To: Linus Torvalds
Cc: Alan Cox, Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio

On Thu, Aug 01, 2002 at 09:30:04AM -0700, Linus Torvalds wrote:
> A 2% error may not be a big problem for most people, of course. But it
> might be a huge problem for others. Those people would have to do their
> own re-calibration..

How about export the value via a syscall and also export an 'error' which for now could just be set to 5% or something conservative and refined later if necessary or cleanup on other architectures, something like:

	/*
	 * Export a monotonically increasing time value to user space.
	 * kt_ns is a relative value: it does NOT necessarily mean
	 * nanoseconds since boot.  kt_err should be greater than one
	 * standard deviation of the error in kt_ns.
	 */
	struct kern_time {
		__u64	kt_ns;
		__u64	kt_err;
	};

--cw
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)
  2002-08-01 19:18 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Chris Wedgwood
@ 2002-08-01 19:25 ` Linus Torvalds
  2002-08-01 19:31 ` Chris Wedgwood
  0 siblings, 1 reply; 83+ messages in thread
From: Linus Torvalds @ 2002-08-01 19:25 UTC (permalink / raw)
To: Chris Wedgwood
Cc: Alan Cox, Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio

On Thu, 1 Aug 2002, Chris Wedgwood wrote:
>
> How about export the value via a syscall and also export an 'error'
> which for now could just be set to 5% or something conservative and
> refined later if necessary or cleanup on other architectures,

Ugh. That sounds like overdesign, and I hate overdesign.

The error is also rather hard to quantify, and only user land can do that sanely anyway in the long run (ie the same reason why we have things like /etc/adjtime - good guesses depend on history). The kernel really shouldn't be involved in something like this.

I seriously doubt that people really care _that_ much about a precise time source for aio timeouts, and we should spend more time on making it efficient and easy to use than on worrying about the precision.

People who do care can fall back to gettimeofday() and try to correct for it that way.

Linus
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)
  2002-08-01 19:25 ` Linus Torvalds
@ 2002-08-01 19:31 ` Chris Wedgwood
  0 siblings, 0 replies; 83+ messages in thread
From: Chris Wedgwood @ 2002-08-01 19:31 UTC (permalink / raw)
To: Linus Torvalds
Cc: Alan Cox, Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio

On Thu, Aug 01, 2002 at 12:25:06PM -0700, Linus Torvalds wrote:
> I seriously doubt that people really care _that_ much about a precise
> time source for aio timeouts, and we should spend more time on making
> it efficient and easy to use than on worrying about the precision.
>
> People who do care can fall back to gettimeofday() and try to correct
> for it that way.

In that case, define the time to be approximate and nothing more.

The reason for the original suggestion was that it seemed feasible the syscall could in the future be used for other purposes (multimedia synchronisation) *and* be of value if made more precise, without adding yet another syscall at a later stage to do just this.

--cw
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)
  2002-08-01 16:30 ` Linus Torvalds
  ` (2 preceding siblings ...)
  2002-08-01 19:18 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Chris Wedgwood
@ 2002-08-02 8:24 ` Pavel Machek
  2002-08-02 11:59 ` Alan Cox
  3 siblings, 1 reply; 83+ messages in thread
From: Pavel Machek @ 2002-08-02 8:24 UTC (permalink / raw)
To: Linus Torvalds
Cc: Alan Cox, Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio

Hi!

> > For a lot of applications like multimedia you actually want a counting
> > of time not any relation to real time except that you can tell how many
> > ticks elapse a second.
>
> Absolutely. I think "jiffies64" is fine (as long as is it converted to
> some "standard" time-measure like microseconds or nanoseconds so that
> people don't have to care about internal kernel state) per se.
>
> The only thing that I think makes it less than wonderful is really the
> fact that we cannot give an accurate measure for it. We can _say_ that
> what we count in microseconds, but it might turn out that instead of the
> perfect 1000000 ticks a second there would really be 983671 ticks.
>
> A 2% error may not be a big problem for most people, of course. But it
> might be a huge problem for others. Those people would have to do their
> own re-calibration..

I don't think so. Imagine DVD playback. If you have 2% error, your audio is going to get 1 second off each minute. It is going to be off by one minute at the end of an hour. 2% is probably not acceptable.

[I'm not sure how exactly video/audio synchronization works, besides the fact that it does not; but 2% could be a huge problem for something like that.]

Pavel
--
Casualties in World Trade Center: ~3k dead inside the building, cryptography in U.S.A. and free speech in Czech Republic.
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)
  2002-08-02 8:24 ` Pavel Machek
@ 2002-08-02 11:59 ` Alan Cox
  2002-08-02 15:56 ` Linus Torvalds
  0 siblings, 1 reply; 83+ messages in thread
From: Alan Cox @ 2002-08-02 11:59 UTC (permalink / raw)
To: Pavel Machek
Cc: Linus Torvalds, Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio

On Fri, 2002-08-02 at 09:24, Pavel Machek wrote:
> Imagine DVD playback. If you have 2% error, your audio is going to get
> 1 second off each minute. It is going to be off by one minute at the
> end of an hour. 2% is probably not acceptable.

Nobody does DVD synchronization off a timer. You synchronize the video to the audio, because if the audio clock is a bit off it doesn't matter, while if you lock the audio to the video you get nasty clicks and skips.

2% is way too much for a lot of applications. That's 28 minutes a day.
* Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)
  2002-08-02 11:59 ` Alan Cox
@ 2002-08-02 15:56 ` Linus Torvalds
  0 siblings, 0 replies; 83+ messages in thread
From: Linus Torvalds @ 2002-08-02 15:56 UTC (permalink / raw)
To: Alan Cox
Cc: Pavel Machek, Chris Friesen, Benjamin LaHaise, Pavel Machek, Andrea Arcangeli, linux-kernel, linux-aio

On 2 Aug 2002, Alan Cox wrote:
>
> 2% is way too much for a lot of applications. That's 28 minutes a day

Note that _most_ PC clocks are a hell of a lot better than 2% a day, so that was really meant as the worst case for fairly broken hardware. But it apparently does happen.

A more realistic scenario is less than 0.1%, but with the caveat that if the machine goes to sleep, the error goes up to infinity..

(Think of the current "jiffies" update and gettimeofday() _without_ any ntp or /etc/adjtime. For most people it is good enough to use as a wall clock. But some people literally lose or gain a minute every hour. That's the kind of drift I'm talking about).

Linus
* Re: async-io API registration for 2.5.29
  2002-07-30 21:41 ` Andrea Arcangeli
  2002-07-30 21:54 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise
@ 2002-07-31 1:20 ` Rik van Riel
  2002-07-31 1:32 ` Andrea Arcangeli
  1 sibling, 1 reply; 83+ messages in thread
From: Rik van Riel @ 2002-07-31 1:20 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Benjamin LaHaise, Linus Torvalds, linux-kernel, linux-aio

On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote:
> > On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> >
> > What would you suggest as an alternative API? The main point of multiplexing
> > is that ios can be submitted in batches, which can't be done if the ios are
> > submitted via individual syscalls, not to mention the overlap with the posix
> > aio api.
>
> yes, sys_io_submit has the advantage you can mix read/write/fsync etc..
> in the same array of iocb. But by the same argument we could as well
> have a submit_io instead of sys_read/sys_write/sys_fsync.

You can't batch synchronous requests, so your "by the same argument" doesn't work.

Asynchronous requests, OTOH, could be submitted in large bundles since the app doesn't wait on each request.

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
* Re: async-io API registration for 2.5.29
  2002-07-31 1:20 ` async-io API registration for 2.5.29 Rik van Riel
@ 2002-07-31 1:32 ` Andrea Arcangeli
  2002-07-31 8:25 ` Christoph Hellwig
  0 siblings, 1 reply; 83+ messages in thread
From: Andrea Arcangeli @ 2002-07-31 1:32 UTC (permalink / raw)
To: Rik van Riel; +Cc: Benjamin LaHaise, Linus Torvalds, linux-kernel, linux-aio

On Tue, Jul 30, 2002 at 10:20:51PM -0300, Rik van Riel wrote:
> On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> > On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote:
> > > On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> > >
> > > What would you suggest as an alternative API? The main point of multiplexing
> > > is that ios can be submitted in batches, which can't be done if the ios are
> > > submitted via individual syscalls, not to mention the overlap with the posix
> > > aio api.
> >
> > yes, sys_io_submit has the advantage you can mix read/write/fsync etc..
> > in the same array of iocb. But by the same argument we could as well
> > have a submit_io instead of sys_read/sys_write/sys_fsync.
>
> You can't batch synchronous requests, so your "by the same
> argument" doesn't work.
>
> Asynchronous requests, OTOH, could be submitted in large
> bundles since the app doesn't wait on each request.

disagree, merging synchronous requests would make much more sense than merging asynchronous requests in the same syscall: it would make them asynchronous with respect to each other without losing their global synchronous behaviour w.r.t. userspace. With async-io it doesn't matter much how much stuff you merge (except to avoid entering/exiting the kernel, which applies to synchronous operations too).

Andrea
* Re: async-io API registration for 2.5.29
  2002-07-31 1:32 ` Andrea Arcangeli
@ 2002-07-31 8:25 ` Christoph Hellwig
  2002-07-31 13:19 ` Andrea Arcangeli
  0 siblings, 1 reply; 83+ messages in thread
From: Christoph Hellwig @ 2002-07-31 8:25 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Rik van Riel, Benjamin LaHaise, Linus Torvalds, linux-kernel, linux-aio

On Wed, Jul 31, 2002 at 03:32:38AM +0200, Andrea Arcangeli wrote:
> disagree, merging synchronous requests would make much more sense than
> merging asynchronous requests in the same syscall, it would make them
> asynchronous with respect to each other without losing their global
> synchronous behaviour w.r.t. userspace.

readv/writev..
* Re: async-io API registration for 2.5.29
  2002-07-31 8:25 ` Christoph Hellwig
@ 2002-07-31 13:19 ` Andrea Arcangeli
  0 siblings, 0 replies; 83+ messages in thread
From: Andrea Arcangeli @ 2002-07-31 13:19 UTC (permalink / raw)
To: Christoph Hellwig, Rik van Riel, Benjamin LaHaise, Linus Torvalds, linux-kernel, linux-aio

On Wed, Jul 31, 2002 at 09:25:27AM +0100, Christoph Hellwig wrote:
> On Wed, Jul 31, 2002 at 03:32:38AM +0200, Andrea Arcangeli wrote:
> > disagree, merging synchronous requests would make much more sense than
> > merging asynchronous requests in the same syscall, it would make them
> > asynchronous with respect to each other without losing their global
> > synchronous behaviour w.r.t. userspace.
>
> readv/writev..

exactly, that's the same concept, even if it cannot intermix reads, writes, fsyncs and polls in the same call :).

Andrea
* Re: async-io API registration for 2.5.29
  2002-07-30 5:41 async-io API registration for 2.5.29 Andrea Arcangeli
  ` (2 preceding siblings ...)
  2002-07-30 12:49 ` Benjamin LaHaise
@ 2002-07-30 13:34 ` Linus Torvalds
  2002-07-30 16:49 ` Andrea Arcangeli
  3 siblings, 1 reply; 83+ messages in thread
From: Linus Torvalds @ 2002-07-30 13:34 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Benjamin LaHaise, linux-kernel, linux-aio

On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
>
> this patch against 2.5.29 adds the async-io API as from latest Ben's
> patch.

Why not make the io_submit system call number 251 like it apparently is already in 2.4.x? We're really close to it anyway, so if you just re-order the system calls a bit (and leave 250 as sys_ni_syscall), you're basically there.

Other than that it looks good.

Linus
* Re: async-io API registration for 2.5.29
  2002-07-30 13:34 ` Linus Torvalds
@ 2002-07-30 16:49 ` Andrea Arcangeli
  0 siblings, 0 replies; 83+ messages in thread
From: Andrea Arcangeli @ 2002-07-30 16:49 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Benjamin LaHaise, linux-kernel, linux-aio

On Tue, Jul 30, 2002 at 06:34:53AM -0700, Linus Torvalds wrote:
>
> On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> >
> > this patch against 2.5.29 adds the async-io API as from latest Ben's
> > patch.
>
> Why not make the io_submit system call number 251 like it apparently is
> already in 2.4.x? We're really close to it anyway, so if you just re-order
> the system calls a bit (and leave 250 as sys_ni_syscall), you're basically
> there.
>
> Other than that it looks good.

thank you very much for checking it. Since Ben asked to wait for his patch, you can reject my patch, that's really fine with me as long as it doesn't take months for his patch to show up. My patch is in perfect sync with his latest code on the web.

As said, I never claimed the current API is stupid, as Christoph understood; I said I'd have preferred sys_aio_read/write/fsync etc., but I could live fine with sys_io_submit too, it wasn't bad enough to make me rewrite it. With my patch I mainly wanted to raise eyes on this issue so we can hopefully get an API registered in mainline in a few weeks. I'm completely flexible to rewrite the API too if anybody finds good reasons for it (or if you say "sys_io_submit is too ugly, please change to sys_aio_read/write/etc.").

As Ben said, the API is the only thing that has been mostly stable so far; this is one more reason I felt this is the right way to proceed, instead of building the dynamic syscall slowdown overhead layer that at best (unsure for sys_io_submit 250) is forward binary compatible with 2.5 by pure luck.

thanks,

Andrea
end of thread, other threads:[~2002-09-05 5:17 UTC | newest]

Thread overview: 83+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-07-30 5:41 async-io API registration for 2.5.29 Andrea Arcangeli
2002-07-30 8:11 ` Christoph Hellwig
2002-07-30 13:40 ` Linus Torvalds
2002-07-30 13:52 ` Benjamin LaHaise
2002-07-30 16:43 ` Andrea Arcangeli
2002-07-30 16:59 ` Benjamin LaHaise
2002-07-30 19:10 ` Jeff Dike
2002-07-30 18:09 ` Benjamin LaHaise
2002-07-30 18:15 ` Linus Torvalds
2002-07-30 18:31 ` Benjamin LaHaise
2002-07-30 20:57 ` Jeff Dike
2002-07-30 20:47 ` Jeff Dike
2002-07-30 21:26 ` Andrea Arcangeli
2002-07-30 10:50 ` Rik van Riel
2002-07-30 12:49 ` Benjamin LaHaise
2002-07-30 13:29 ` Suparna Bhattacharya
2002-07-30 21:41 ` Andrea Arcangeli
2002-07-30 21:54 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise
2002-07-31 0:44 ` Andrea Arcangeli
2002-07-31 14:46 ` Benjamin LaHaise
2002-07-31 16:31 ` Charles 'Buck' Krasic
2002-08-01 10:30 ` Pavel Machek
2002-08-01 14:47 ` Benjamin LaHaise
2002-08-01 15:00 ` Chris Friesen
2002-08-01 16:09 ` Linus Torvalds
2002-08-01 17:30 ` Alan Cox
2002-08-01 16:30 ` Linus Torvalds
2002-08-01 16:41 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for2.5.29) Chris Friesen
2002-08-01 18:01 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Benjamin LaHaise
2002-08-15 23:54 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli
2002-08-16 1:42 ` Benjamin LaHaise
2002-08-16 1:57 ` Andrea Arcangeli
2002-08-16 2:00 ` Benjamin LaHaise
2002-08-16 2:08 ` Linus Torvalds
2002-08-16 2:16 ` Benjamin LaHaise
2002-08-16 2:40 ` Andrea Arcangeli
2002-08-16 3:43 ` Linus Torvalds
2002-08-16 3:50 ` Linus Torvalds
2002-08-16 4:47 ` William Lee Irwin III
2002-08-17 3:46 ` Martin J. Bligh
2002-08-17 4:00 ` Linus Torvalds
2002-08-17 4:15 ` Martin J. Bligh
2002-08-17 4:46 ` Linus Torvalds
2001-11-02 5:12 ` Pavel Machek
2002-08-17 5:04 ` Linus Torvalds
2002-08-17 5:24 ` lots of mem on 32 bit machines (was: aio-core why not using SuS?) Martin J. Bligh
2002-08-17 5:12 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Martin J. Bligh
2002-08-17 17:02 ` Linus Torvalds
2002-08-17 21:27 ` 32 bit arch with lots of RAM Martin J. Bligh
2002-08-22 16:30 ` Andrea Arcangeli
2002-08-22 16:36 ` Martin J. Bligh
2002-08-22 16:15 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Andrea Arcangeli
2002-08-22 16:12 ` Andrea Arcangeli
2002-08-20 0:35 ` Ingo Molnar
2002-08-17 4:36 ` William Lee Irwin III
2002-08-16 2:32 ` Rik van Riel
2002-08-16 2:32 ` Andrea Arcangeli
2002-08-16 9:39 ` Suparna Bhattacharya
2002-08-16 10:03 ` Andrea Arcangeli
2002-08-16 11:23 ` Suparna Bhattacharya
2002-08-16 11:28 ` Suparna Bhattacharya
2002-08-16 13:49 ` Dan Kegel
2002-09-02 18:40 ` Andrea Arcangeli
2002-09-03 12:04 ` aio-core in 2.5 - io_queue_wait and io_getevents Suparna Bhattacharya
2002-09-05 5:21 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29)] Benjamin LaHaise
2002-08-16 13:43 ` Dan Kegel
2002-08-16 14:21 ` Jamie Lokier
2002-08-16 14:42 ` Benjamin LaHaise
2002-08-16 15:40 ` John Gardiner Myers
2002-08-23 16:11 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re:async-io " Dan Kegel
2002-08-16 1:53 ` aio-core why not using SuS? [Re: [rfc] aio-core for 2.5.29 (Re: async-io " Dan Kegel
2002-08-01 19:18 ` [rfc] aio-core for 2.5.29 (Re: async-io API registration for 2.5.29) Chris Wedgwood
2002-08-01 19:25 ` Linus Torvalds
2002-08-01 19:31 ` Chris Wedgwood
2002-08-02 8:24 ` Pavel Machek
2002-08-02 11:59 ` Alan Cox
2002-08-02 15:56 ` Linus Torvalds
2002-07-31 1:20 ` async-io API registration for 2.5.29 Rik van Riel
2002-07-31 1:32 ` Andrea Arcangeli
2002-07-31 8:25 ` Christoph Hellwig
2002-07-31 13:19 ` Andrea Arcangeli
2002-07-30 13:34 ` Linus Torvalds
2002-07-30 16:49 ` Andrea Arcangeli
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).