* [RFC 1/4] kevent: core files.
@ 2006-07-09 13:24 Evgeniy Polyakov
  2006-07-09 14:59 ` Pekka Enberg
  2006-07-25  6:17 ` David Miller
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-09 13:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialisation
 - notification state machines

It might also include parts from other subsystems (like network-related
syscalls), so it is possible that it will not compile without the other
patches applied.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index af56987..93e23ff 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -316,3 +316,7 @@ ENTRY(sys_call_table)
 	.long sys_sync_file_range
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
+	.long sys_aio_recv
+	.long sys_aio_send
+	.long sys_aio_sendfile
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5a92fed..534d516 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -696,4 +696,8 @@ #endif
 	.quad sys_sync_file_range
 	.quad sys_tee
 	.quad compat_sys_vmsplice
+	.quad sys_aio_recv
+	.quad sys_aio_send
+	.quad sys_aio_sendfile
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index de2ccc1..52f8642 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -322,10 +322,14 @@ #define __NR_splice		313
 #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
+#define __NR_aio_recv		317
+#define __NR_aio_send		318
+#define __NR_aio_sendfile	319
+#define __NR_kevent_ctl		320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 317
+#define NR_syscalls 321
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index f2cdbea..1f31f86 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -49,4 +49,6 @@ #define SO_ACCEPTCONN		30
 
 #define SO_PEERSEC             31
 
+#define SO_ASYNC_SOCK		34
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 0aff22b..352c34b 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -617,11 +617,18 @@ #define __NR_sync_file_range	277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
+#define __NR_aio_recv		279
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send		280
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_aio_sendfile	281
+__SYSCALL(__NR_aio_sendfile, sys_aio_sendfile)
+#define __NR_kevent_ctl		282
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_vmsplice
-
+#define __NR_syscall_max __NR_kevent_ctl
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */

diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..e94a7bf
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,263 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_WAIT		3
+#define	KEVENT_CTL_INIT		4
+
+struct kevent_user_control
+{
+	unsigned int		cmd;			/* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */
+	unsigned int		num;			/* Number of ukevents this structure controls. */
+	unsigned int		timeout;		/* Timeout in milliseconds waiting for "num" events to become ready. */
+};
+
+#define KEVENT_USER_SYMBOL	'K'
+#define KEVENT_USER_CTL		_IOWR(KEVENT_USER_SYMBOL, 0, struct kevent_user_control)
+#define KEVENT_USER_WAIT	_IOWR(KEVENT_USER_SYMBOL, 1, struct kevent_user_control)
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kevent_storage.h>
+#include <asm/semaphore.h>
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+struct kevent
+{
+	struct ukevent		event;
+	spinlock_t		lock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	kevent_callback_t	callback;		/* Is called each time new event has been caught. */
+	kevent_callback_t	enqueue;		/* Is called each time new event is queued. */
+	kevent_callback_t	dequeue;		/* Is called each time event is dequeued. */
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one per poll_wait() call from ->poll().
+							 */
+};
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_list
+{
+	struct list_head	kevent_list;		/* List of all kevents. */
+	spinlock_t 		kevent_lock;		/* Protects all manipulations with queue of kevents. */
+};
+
+struct kevent_user
+{
+	struct kevent_list	kqueue[KEVENT_HASH_MASK+1];
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct semaphore	ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	struct semaphore	wait_mutex;		/* Protects against simultaneous kevent_user waits. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+#define KEVENT_MAX_REQUESTS		(PAGE_SIZE/sizeof(struct kevent))
+
+struct kevent *kevent_alloc(gfp_t mask);
+void kevent_free(struct kevent *k);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+
+#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     prefetch(pos->member.prev), &pos->member != (head); 	\
+	     pos = n, n = list_entry(pos->member.prev, typeof(*pos), member))
+
+int kevent_break(struct kevent *k);
+int kevent_init(struct kevent *k);
+
+int kevent_init_socket(struct kevent *k);
+int kevent_init_inode(struct kevent *k);
+int kevent_init_timer(struct kevent *k);
+int kevent_init_poll(struct kevent *k);
+int kevent_init_naio(struct kevent *k);
+int kevent_init_aio(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	0
+#endif
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..bd891f0
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,12 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	unsigned int		qlen;			/* Number of queued kevents. */
+	spinlock_t		lock;			/* Protects the queue of kevents. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bd67a44..33d436e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -587,4 +587,8 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, size_t size, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index df864a3..6135afc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -185,6 +185,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f..eb057ea 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..88b35af
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,57 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the kevent event queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO callback
+	  invocations, advanced timer notifications and other kernel object
+	  status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistics"
+	depends on KEVENT
+	default n
+	help
+	  This option turns kevent_user statistics collection on.
+	  The statistics include the total number of kevents, the number of
+	  kevents which are ready immediately at insertion time, and the number
+	  of kevents which were removed through readiness completion. The
+	  statistics are printed each time a kevent control descriptor is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications of socket operations through the
+	  KEVENT subsystem, like new packet arrival or ready-for-accept
+	  conditions.
+
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications of inode operations through the
+	  KEVENT subsystem, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables kevent based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO operations.
+	  AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..7dcd651
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o kevent_init.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..f699a13
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,260 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+static kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if the event is ready immediately,
+ * a negative value on error, and zero if the event has been queued.
+ * The ->enqueue() callback must increase the origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	if (!k->enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->enqueue(k);
+}
+
+/*
+ * Removes the event from the appropriate queue.
+ * The ->dequeue() callback must decrease the origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	if (!k->dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->dequeue(k);
+}
+
+/*
+ * Must be called before the event is added into some origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used; kevent_enqueue() would fail
+ * to add it into the origin's queue and would set the KEVENT_RET_BROKEN
+ * flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	int err;
+
+	spin_lock_init(&k->lock);
+	k->kevent_entry.next = LIST_POISON1;
+	k->storage_entry.next = LIST_POISON1;
+	k->ready_entry.next = LIST_POISON1;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	switch (k->event.type) {
+		case KEVENT_NAIO:
+			err = kevent_init_naio(k);
+			break;
+		case KEVENT_SOCKET:
+			err = kevent_init_socket(k);
+			break;
+		case KEVENT_INODE:
+			err = kevent_init_inode(k);
+			break;
+		case KEVENT_TIMER:
+			err = kevent_init_timer(k);
+			break;
+		case KEVENT_POLL:
+			err = kevent_init_poll(k);
+			break;
+		case KEVENT_AIO:
+			err = kevent_init_aio(k);
+			break;
+		default:
+			err = -ENODEV;
+	}
+
+	return err;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail(&k->storage_entry, &st->list);
+	st->qlen++;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeues the kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way
+ * and must be called before the counter is dropped, so the storage
+ * itself is still valid. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->storage_entry.next != LIST_POISON1) {
+		list_del(&k->storage_entry);
+		st->qlen--;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int err, rem = 0;
+	unsigned long flags;
+
+	err = k->callback(k);
+
+	spin_lock_irqsave(&k->lock, flags);
+	if (err > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (err < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!err)
+		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->lock, flags);
+
+	if (err) {
+		if (rem) {
+			list_del(&k->storage_entry);
+			k->st->qlen--;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (k->ready_entry.next == LIST_POISON1) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k, *n;
+
+	spin_lock(&st->lock);
+	list_for_each_entry_safe(k, n, &st->list, storage_entry) {
+		if (ready_callback)
+			ready_callback(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	spin_unlock(&st->lock);
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	st->qlen = 0;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+struct kevent *kevent_alloc(gfp_t mask)
+{
+	struct kevent *k;
+	
+	if (kevent_cache)
+		k = kmem_cache_alloc(kevent_cache, mask);
+	else
+		k = kzalloc(sizeof(struct kevent), mask);
+
+	return k;
+}
+
+void kevent_free(struct kevent *k)
+{
+	memset(k, 0xab, sizeof(struct kevent));
+
+	if (kevent_cache)
+		kmem_cache_free(kevent_cache, k);
+	else
+		kfree(k);
+}
+
+int __init kevent_sys_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, 0, NULL, NULL);
+	if (!kevent_cache)
+		err = -ENOMEM;
+	
+	return err;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_init.c b/kernel/kevent/kevent_init.c
new file mode 100644
index 0000000..ec95114
--- /dev/null
+++ b/kernel/kevent/kevent_init.c
@@ -0,0 +1,85 @@
+/*
+ * 	kevent_init.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/kevent.h>
+
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->lock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->lock, flags);
+	return 0;
+}
+
+#ifndef CONFIG_KEVENT_SOCKET
+int kevent_init_socket(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_INODE
+int kevent_init_inode(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_TIMER
+int kevent_init_timer(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_POLL
+int kevent_init_poll(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_NAIO
+int kevent_init_naio(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_AIO
+int kevent_init_aio(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..566b62b
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,728 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+static struct class *kevent_user_class;
+static char kevent_name[] = "kevent";
+static int kevent_user_major;
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static int kevent_user_ioctl(struct inode *, struct file *, 
+		unsigned int, unsigned long);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.ioctl		= kevent_user_ioctl,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct super_block *kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef);	
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	u->ready_num = 0;
+#ifdef CONFIG_KEVENT_USER_STAT
+	u->wait_num = u->im_num = u->total = 0;
+#endif
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
+		spin_lock_init(&u->kqueue[i].kevent_lock);
+	}
+	u->kevent_num = 0;
+	
+	init_MUTEX(&u->ctl_mutex);
+	init_MUTEX(&u->wait_mutex);
+	init_waitqueue_head(&u->wait);
+	u->max_ready_num = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+#ifdef CONFIG_KEVENT_USER_STAT
+		printk("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+				__func__, u, u->wait_num, u->im_num, u->total);
+#endif
+		kfree(u);
+	}
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * Remove kevent from user's list of all events, 
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int lock, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (lock) {
+		unsigned int hash = kevent_user_hash(&k->event);
+		struct kevent_list *l = &u->kqueue[hash];
+
+		spin_lock_irqsave(&l->kevent_lock, flags);
+		list_del(&k->kevent_entry);
+		u->kevent_num--;
+		spin_unlock_irqrestore(&l->kevent_lock, flags);
+	} else {
+		list_del(&k->kevent_entry);
+		u->kevent_num--;
+	}
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->ready_entry.next != LIST_POISON1) {
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+	
+	kevent_user_put(u);
+	kevent_free(k);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *__kqueue_dequeue_one_ready(struct list_head *q, 
+		unsigned int *qlen)
+{
+	struct kevent *k = NULL;
+	unsigned int len = *qlen;
+	
+	if (len && !list_empty(q)) {
+		k = list_entry(q->next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		*qlen = len - 1;
+	}
+	
+	return k;
+}
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	k = __kqueue_dequeue_one_ready(&u->ready_list, &u->ready_num);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+static struct kevent *__kevent_search(struct kevent_list *l, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k;
+	int found = 0;
+	
+	list_for_each_entry(k, &l->kevent_list, kevent_entry) {
+		spin_lock(&k->lock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			found = 1;
+			spin_unlock(&k->lock);
+			break;
+		}
+		spin_unlock(&k->lock);
+	}
+
+	return (found)?k:NULL;
+}
+
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	struct kevent_list *l = &u->kqueue[hash];
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&l->kevent_lock, flags);
+	k = __kevent_search(l, uk, u);
+	if (k) {
+		spin_lock(&k->lock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->lock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&l->kevent_lock, flags);
+	
+	return err;
+}
+
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	struct kevent_list *l = &u->kqueue[hash];
+	unsigned long flags;
+
+	spin_lock_irqsave(&l->kevent_lock, flags);
+	k = __kevent_search(l, uk, u);
+	if (k) {
+		kevent_finish_user(k, 0, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&l->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * No new entry can be added or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		struct kevent_list *l = &u->kqueue[i];
+		
+		list_for_each_entry_safe(k, n, &l->kevent_list, kevent_entry)
+			kevent_finish_user(k, 1, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+
+	up(&u->ctl_mutex);
+
+	return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+
+	up(&u->ctl_mutex);
+
+	return err;
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kevent_alloc(GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kevent_free(k);
+		goto err_out_exit;
+	}
+	k->user = u;
+#ifdef CONFIG_KEVENT_USER_STAT
+	u->total++;
+#endif
+	{
+		unsigned long flags;
+		unsigned int hash = kevent_user_hash(&k->event);
+		struct kevent_list *l = &u->kqueue[hash];
+		
+		spin_lock_irqsave(&l->kevent_lock, flags);
+		list_add_tail(&k->kevent_entry, &l->kevent_list);
+		u->kevent_num++;
+		kevent_user_get(u);
+		spin_unlock_irqrestore(&l->kevent_lock, flags);
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 1, 0);
+	} 
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copies all ukevents from userspace, allocates a kevent for each one
+ * and adds them into the appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * If something goes wrong, all events will be dequeued and
+ * a negative error will be returned.
+ * Otherwise ctl->num is set to the number of finished events, either
+ * completed or failed.
+ * The array of finished events (struct ukevent) is placed behind the
+ * kevent_user_control structure. The user must run through that array
+ * and check the ret_flags field of each ukevent structure to determine
+ * whether it fired or failed.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, cerr = 0, num = 0, knum = 0, i;
+	void __user *orig, *ctl_addr;
+	struct ukevent uk;
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	orig = arg;
+	ctl_addr = arg - sizeof(struct kevent_user_control);
+#if 1
+	err = -ENFILE;
+	if (u->kevent_num + ctl->num >= 1024)
+		goto err_out_remove;
+#endif
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+#ifdef CONFIG_KEVENT_USER_STAT
+			u->im_num++;
+#endif
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent)))
+				cerr = -EINVAL;
+			orig += sizeof(struct ukevent);
+			num++;
+		} else
+			knum++;
+	}
+
+	if (cerr < 0)
+		goto err_out_remove;
+
+	ctl->num = num;
+	if (copy_to_user(ctl_addr, ctl, sizeof(struct kevent_user_control)))
+		cerr = -EINVAL;
+
+	if (cerr)
+		err = cerr;
+	if (!err)
+		err = num;
+
+err_out_remove:
+	up(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Waits until at least ctl->num events are ready or the timeout expires,
+ * and returns the number of events actually collected.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	struct kevent *k;
+	int cerr = 0, num = 0;
+	void __user *ptr = arg + sizeof(struct kevent_user_control);
+
+	if (down_interruptible(&u->ctl_mutex))
+		return -ERESTARTSYS;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (ctl->timeout)
+			wait_event_interruptible_timeout(u->wait, 
+				u->ready_num >= ctl->num, msecs_to_jiffies(ctl->timeout));
+		else
+			wait_event_interruptible_timeout(u->wait, 
+					u->ready_num > 0, msecs_to_jiffies(1000));
+	}
+	while (num < ctl->num && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(ptr + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent)))
+			cerr = -EINVAL;
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1, 1);
+		++num;
+#ifdef CONFIG_KEVENT_USER_STAT
+		u->wait_num++;
+#endif
+	}
+
+	ctl->num = num;
+	if (copy_to_user(arg, ctl, sizeof(struct kevent_user_control)))
+		cerr = -EINVAL;
+
+	up(&u->ctl_mutex);
+
+	return (cerr)?cerr:num;
+}
+
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;	
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u)
+		return -EINVAL;
+
+	switch (ctl->cmd) {
+		case KEVENT_CTL_ADD:
+			err = kevent_user_ctl_add(u, ctl, 
+					arg+sizeof(struct kevent_user_control));
+			break;
+		case KEVENT_CTL_REMOVE:
+			err = kevent_user_ctl_remove(u, ctl, 
+					arg+sizeof(struct kevent_user_control));
+			break;
+		case KEVENT_CTL_MODIFY:
+			err = kevent_user_ctl_modify(u, ctl, 
+					arg+sizeof(struct kevent_user_control));
+			break;
+		case KEVENT_CTL_WAIT:
+			err = kevent_user_wait(file, u, ctl, arg);
+			break;
+		case KEVENT_CTL_INIT:
+			err = kevent_ctl_init();
+			break;
+		default:
+			err = -EINVAL;
+			break;
+	}
+
+	return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, void __user *arg)
+{
+	int err, fput_needed;
+	struct kevent_user_control ctl;
+	struct file *file;
+
+	if (copy_from_user(&ctl, arg, sizeof(struct kevent_user_control)))
+		return -EINVAL;
+
+	if (ctl.cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = kevent_ctl_process(file, &ctl, arg);
+
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int kevent_user_ioctl(struct inode *inode, struct file *file, 
+		unsigned int cmd, unsigned long arg)
+{
+	int err = -ENODEV;
+	struct kevent_user_control ctl;
+	struct kevent_user *u = file->private_data;
+	void __user *ptr = (void __user *)arg;
+
+	if (copy_from_user(&ctl, ptr, sizeof(struct kevent_user_control)))
+		return -EINVAL;
+
+	switch (cmd) {
+		case KEVENT_USER_CTL:
+			err = kevent_ctl_process(file, &ctl, ptr);
+			break;
+		case KEVENT_USER_WAIT:
+			err = kevent_user_wait(file, u, &ctl, ptr);
+			break;
+		default:
+			break;
+	}
+
+	return err;
+}
+
+static int __devinit kevent_user_init(void)
+{
+	struct class_device *dev;
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount filesystem: err=%ld.\n",
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
+	if (kevent_user_major < 0) {
+		printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", 
+				kevent_name, kevent_user_major);
+		return -ENODEV;
+	}
+
+	kevent_user_class = class_create(THIS_MODULE, "kevent");
+	if (IS_ERR(kevent_user_class)) {
+		printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_user_class));
+		err = PTR_ERR(kevent_user_class);
+		goto err_out_unregister;
+	}
+
+	dev = class_device_create(kevent_user_class, NULL, 
+			MKDEV(kevent_user_major, 0), NULL, kevent_name);
+	if (IS_ERR(dev)) {
+		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
+				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
+		err = PTR_ERR(dev);
+		goto err_out_class_destroy;
+	}
+
+	printk("KEVENT subsystem: chardev helper: major=%d.\n", kevent_user_major);
+
+	return 0;
+
+err_out_class_destroy:
+	class_destroy(kevent_user_class);
+err_out_unregister:
+	unregister_chrdev(kevent_user_major, kevent_name);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	class_device_destroy(kevent_user_class, MKDEV(kevent_user_major, 0));
+	class_destroy(kevent_user_class);
+	unregister_chrdev(kevent_user_major, kevent_name);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195..dcbacf5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -121,6 +121,11 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_aio_sendfile);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);

-- 
	Evgeniy Polyakov
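
As an illustration of the interface this patch adds, here is a minimal
userspace sketch: it obtains a control descriptor with KEVENT_CTL_INIT and
then queues a oneshot socket-read request with KEVENT_CTL_ADD, packing the
ukevent directly behind the control header the way kevent_ctl_process()
expects. The syscall number is the i386 one from this patch; the wrapper
and helper names are invented for the example, so treat it as a sketch
rather than shipped code.

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/kevent.h>

	#define __NR_kevent_ctl	320	/* i386 number added by this patch */

	static long kevent_ctl(int fd, void *buf)
	{
		return syscall(__NR_kevent_ctl, fd, buf);
	}

	/* Queue a oneshot "data arrived" request for @sock; returns the
	 * kevent control descriptor or -1 on error. */
	static int add_socket_recv_event(int sock)
	{
		char buf[sizeof(struct kevent_user_control) +
			 sizeof(struct ukevent)];
		struct kevent_user_control *ctl = (void *)buf;
		struct ukevent *uk = (void *)(buf + sizeof(*ctl));
		int kfd;

		memset(buf, 0, sizeof(buf));

		/* KEVENT_CTL_INIT returns a fresh control descriptor;
		 * the fd argument is ignored for this command. */
		ctl->cmd = KEVENT_CTL_INIT;
		kfd = kevent_ctl(-1, buf);
		if (kfd < 0)
			return -1;

		/* KEVENT_CTL_ADD reads ctl->num ukevents placed right
		 * behind the control structure. */
		ctl->cmd = KEVENT_CTL_ADD;
		ctl->num = 1;
		uk->id.raw[0] = sock;
		uk->type = KEVENT_SOCKET;
		uk->event = KEVENT_SOCKET_RECV;
		uk->req_flags = KEVENT_REQ_ONESHOT;

		if (kevent_ctl(kfd, buf) < 0)
			return -1;

		return kfd;
	}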


* Re: [RFC 1/4] kevent: core files.
  2006-07-09 13:24 [RFC 1/4] kevent: core files Evgeniy Polyakov
@ 2006-07-09 14:59 ` Pekka Enberg
  2006-07-09 15:08   ` Evgeniy Polyakov
  2006-07-25  6:17 ` David Miller
  1 sibling, 1 reply; 160+ messages in thread
From: Pekka Enberg @ 2006-07-09 14:59 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: linux-kernel, netdev

On 7/9/06, Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> +struct kevent *kevent_alloc(gfp_t mask)
> +{
> +       struct kevent *k;
> +
> +       if (kevent_cache)
> +               k = kmem_cache_alloc(kevent_cache, mask);
> +       else
> +               k = kzalloc(sizeof(struct kevent), mask);
> +
> +       return k;
> +}

What's this for? Why would kevent_cache be NULL? Note that you can use
kmem_cache_zalloc() for fixed size allocations that need to be zeroed.
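
For reference, with kmem_cache_zalloc() and a cache that is guaranteed to
exist, the helper collapses to something like this (a sketch that assumes
the NULL-cache fallback is dropped, as suggested):

	struct kevent *kevent_alloc(gfp_t mask)
	{
		/* Zeroed, fixed-size allocation straight from the cache. */
		return kmem_cache_zalloc(kevent_cache, mask);
	}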

On 7/9/06, Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> +
> +void kevent_free(struct kevent *k)
> +{
> +       memset(k, 0xab, sizeof(struct kevent));

Why is slab poisoning not sufficient?

> +       if (kevent_cache)
> +               kmem_cache_free(kevent_cache, k);
> +       else
> +               kfree(k);
> +}


* Re: [RFC 1/4] kevent: core files.
  2006-07-09 14:59 ` Pekka Enberg
@ 2006-07-09 15:08   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-09 15:08 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: linux-kernel, netdev

On Sun, Jul 09, 2006 at 05:59:42PM +0300, Pekka Enberg (penberg@cs.helsinki.fi) wrote:
> On 7/9/06, Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> >+struct kevent *kevent_alloc(gfp_t mask)
> >+{
> >+       struct kevent *k;
> >+
> >+       if (kevent_cache)
> >+               k = kmem_cache_alloc(kevent_cache, mask);
> >+       else
> >+               k = kzalloc(sizeof(struct kevent), mask);
> >+
> >+       return k;
> >+}
> 
> What's this for? Why would kevent_cache be NULL? Note that you can use
> kmem_cache_zalloc() for fixed size allocations that need to be zeroed.

It can work without the cache at all, i.e. if cache creation fails.
Well, it can be removed of course, since it does not hurt anything.

> On 7/9/06, Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> >+
> >+void kevent_free(struct kevent *k)
> >+{
> >+       memset(k, 0xab, sizeof(struct kevent));
> 
> Why is slab poisoning not sufficient?

Because that way the memory is always known to be poisoned, no matter
whether the kernel debugging option is turned on or off.

-- 
	Evgeniy Polyakov


* Re: [RFC 1/4] kevent: core files.
  2006-07-09 13:24 [RFC 1/4] kevent: core files Evgeniy Polyakov
  2006-07-09 14:59 ` Pekka Enberg
@ 2006-07-25  6:17 ` David Miller
  2006-07-25  6:26   ` Evgeniy Polyakov
  2006-07-27 19:18   ` Zach Brown
  1 sibling, 2 replies; 160+ messages in thread
From: David Miller @ 2006-07-25  6:17 UTC (permalink / raw)
  To: johnpol; +Cc: linux-kernel, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Sun, 9 Jul 2006 17:24:46 +0400

> This patch includes core kevent files:
>  - userspace controlling
>  - kernelspace interfaces
>  - initialisation
>  - notification state machines
> 
> It might also include parts from other subsystems (like network-related
> syscalls), so it is possible that it will not compile without the other
> patches applied.
> 
> Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

I like this work a lot, as I've stated before.  The data structures
look like they will scale well and it takes care of all the limitations
that networking in particular seems to have in this area.

I have to say that the user API is not the nicest in the world.  Yet,
at the same time, I cannot think of a better one :)

Please, remove some grot such as this:

> +	if (kevent_cache)
> +		k = kmem_cache_alloc(kevent_cache, mask);
> +	else
> +		k = kzalloc(sizeof(struct kevent), mask);
 ...
> +	if (kevent_cache)
> +		kmem_cache_free(kevent_cache, k);
> +	else
> +		kfree(k);

Instead, make this:

> +	kevent_cache = kmem_cache_create("kevent_cache", 
> +			sizeof(struct kevent), 0, 0, NULL, NULL);
> +	if (!kevent_cache)
> +		err = -ENOMEM;

panic().  This is consistent with how other core subsystems handle
SLAB cache creation failures.
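
One way to follow that convention is the SLAB_PANIC flag, which makes
kmem_cache_create() itself panic on failure; a minimal sketch against the
slab API used in this patch:

	static int __init kevent_sys_init(void)
	{
		/* SLAB_PANIC: no error path here, the kernel panics if
		 * the cache cannot be created, like other core caches. */
		kevent_cache = kmem_cache_create("kevent_cache",
				sizeof(struct kevent), 0, SLAB_PANIC,
				NULL, NULL);
		return 0;
	}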

I also think that if we accept this work, it should be first class
citizen with no config options and no ifdefs scattered all over.
Either this is how we do network AIO or it is not.

I've looked only briefly at Ulrich Drepper's AIO proposal in his OLS
slides, although the DMA bits do not initially strike me as such a hot
idea.  I haven't wrapped my brain much around this new stuff, so I'm
not going to touch on it much more just yet.

The practical advantage kevent has over any new proposal is that 1)
implementation exists :) and 2) several types of test applications and
performance measurements have been made against it which usually
flushes out the worst design issues.


* Re: [RFC 1/4] kevent: core files.
  2006-07-25  6:17 ` David Miller
@ 2006-07-25  6:26   ` Evgeniy Polyakov
  2006-07-27 19:18   ` Zach Brown
  1 sibling, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-25  6:26 UTC (permalink / raw)
  To: David Miller; +Cc: linux-kernel, netdev

On Mon, Jul 24, 2006 at 11:17:08PM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Sun, 9 Jul 2006 17:24:46 +0400
> 
> > This patch includes core kevent files:
> >  - userspace controlling
> >  - kernelspace interfaces
> >  - initialisation
> >  - notification state machines
> > 
> > It might also include parts from other subsystems (like network-related
> > syscalls), so it is possible that it will not compile without the other
> > patches applied.
> > 
> > Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> 
> I like this work a lot, as I've stated before.  The data structures
> look like they will scale well and it takes care of all the limitations
> that networking in particular seems to have in this area.
> 
> I have to say that the user API is not the nicest in the world.  Yet,
> at the same time, I cannot think of a better one :)

Hi David. I see you have a day's backlog of mail to process :)

> Please, remove some grot such as this:
> 
> > +	if (kevent_cache)
> > +		k = kmem_cache_alloc(kevent_cache, mask);
> > +	else
> > +		k = kzalloc(sizeof(struct kevent), mask);
>  ...
> > +	if (kevent_cache)
> > +		kmem_cache_free(kevent_cache, k);
> > +	else
> > +		kfree(k);
> 
> Instead, make this:
> 
> > +	kevent_cache = kmem_cache_create("kevent_cache", 
> > +			sizeof(struct kevent), 0, 0, NULL, NULL);
> > +	if (!kevent_cache)
> > +		err = -ENOMEM;
> 
> panic().  This is consistent with how other core subsystems handle
> SLAB cache creation failures.

Ok.

> I also think that if we accept this work, it should be first class
> citizen with no config options and no ifdefs scattered all over.
> Either this is how we do network AIO or it is not.
> 
> I've looked only briefly at Ulrich Drepper's AIO proposal in his OLS
> slides, although the DMA bits do not initially strike me as such a hot
> idea.  I haven't wrapped my brain much around this new stuff, so I'm
> not going to touch on it much more just yet.

Yes, his idea of DMA allocation is extremely good.
I manage it with quite a big overhead in kevent, unfortunately.
All other topics are fully covered by kevent (except a nice userspace
API, of course :) )

> The practical advantage kevent has over any new proposal is that 1)
> implementation exists :) and 2) several types of test applications and
> performance measurements have been made against it which usually
> flushes out the worst design issues.

I will clean code up and resubmit today.
Thank you.

-- 
	Evgeniy Polyakov


* Re: [RFC 1/4] kevent: core files.
  2006-07-25  6:17 ` David Miller
  2006-07-25  6:26   ` Evgeniy Polyakov
@ 2006-07-27 19:18   ` Zach Brown
  2006-07-27 20:06     ` Evgeniy Polyakov
                       ` (2 more replies)
  1 sibling, 3 replies; 160+ messages in thread
From: Zach Brown @ 2006-07-27 19:18 UTC (permalink / raw)
  To: David Miller; +Cc: Evgeniy Polyakov, linux-kernel, netdev


> I like this work a lot, as I've stated before.

Yeah, me too.  I think we're very close to having a workable system
here.  A few weeks of some restructuring and we all might be very happy.

> The data structures
> look like they will scale well and it takes care of all the limitations
> that networking in particular seems to have in this area.
> 
> I have to say that the user API is not the nicest in the world.  Yet,
> at the same time, I cannot think of a better one :)

I want to first focus on the event collection side of the API because I
think we can definitely do better there :).  I hope we all agree that
there is huge value in having one place where an application can wait
for notification from many different sources.  If we get the collection
side right we can later worry about generating events down the pipe from
subsystems in a way that works best for them.

I'll sort of ramble about my thoughts here.

The easy part is fixing up the somewhat obfuscated collection call.
Instead of coming in through a multiplexer that magically treats a void
* as a struct kevent_user_control followed by N ukevents (as specified
in the kevent_user_control!) we'd turn it into a more explicit
collection syscall:

	int kevent_getevents(int event_fd, struct ukevent *events,
		int min_events, int max_events,
		struct timeval *timeout);

This would look a lot less nutty in strace.  It lets apps specify if
there is some minimum number of events they'd like the opportunity to
process rather than waiting for the timeout to expire before the max
number arrives.  (the latter is what kevent_user_wait() does today).  We
can have the usual argument about whether *timeout is updated on a
partial wake-up :).
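
To make the min/max semantics concrete, a hypothetical caller could look
like this (kevent_getevents() is only proposed above, and process_event()
is an invented placeholder):

	struct ukevent events[64];
	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
	int i, n;

	/* Sleep until at least one event is ready, collect up to 64 of
	 * them, and give up after one second. */
	n = kevent_getevents(event_fd, events, 1, 64, &tv);
	for (i = 0; i < n; ++i)
		process_event(&events[i]);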

That'd be a fine syscall collection interface, but we should try hard to
explore being able to collect events without hitting the kernel.

Say we have a ring of event structs.  AIO has this today, but it sort of
gets it wrong because each event element doesn't specify whether it is
owned by the kernel or userspace.  (It really gets it wrong because it
doesn't flush_dcache_page() after updating the ring via kmap(), but
never mind that!  No one actually uses this mmap() AIO ring.)  In AIO
today there is also a control struct mapped along with the ring that has
head and tail pointers.  We don't want to bounce that cacheline around.
net/packet/af_packet.c gets this right with its tp_status member of
tpacket_hdr.

So as the kernel generates events in the ring it only produces an event
if the ownership field says that userspace has consumed it and in doing
so it sets the ownership field to tell userspace that an event is
waiting.  userspace and the kernel now each follow their index around
the ring as the ownership field lets them produce or consume the event
at their index.  Can someone tell me if the cache coherence costs of
this are extreme?  I'm hoping they're not.
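
As a sketch of the ownership idea (all names here are invented for
illustration, modeled on tpacket_hdr's tp_status):

	#define KEV_OWNER_KERNEL	0	/* slot free; kernel may produce into it */
	#define KEV_OWNER_USER		1	/* slot holds an event for userspace */

	struct kev_ring_slot {
		volatile __u32	owner;	/* who may touch this slot */
		struct ukevent	event;
	};

	/* Userspace consumer: each side walks its own private index around
	 * the ring, so no shared head/tail cacheline is bounced. */
	while (ring[user_idx].owner == KEV_OWNER_USER) {
		consume_event(&ring[user_idx].event);
		ring[user_idx].owner = KEV_OWNER_KERNEL;	/* hand the slot back */
		user_idx = (user_idx + 1) % ring_size;
	}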

So, great, glibc can now find pending events very quickly if they're
waiting in the ring and can fall back to the collection syscall if it
wants to wait and the ring is empty.  If it consumes events via the
syscall it increases its ring index by the number the syscall returned.

There are two things we should address: level events and the notion of
only submitting as much as fits in the ring.

epoll and kevent both have the notion of an event type that always
creates an event at the time of the collection syscall while the event
source is on a ready list.  Think of epoll calling ->poll(POLLOUT) for
an empty socket buffer at every sys_epoll_wait() call.  We can't have
some source constantly spewing into the ring :/.  We could fix this by
the API requiring that level events can *only* be collected through the
syscall interface.  userspace could call into the collection syscall
every N events collected through the ring, say.  N would be tuned to
amortize the syscall cost and still provide fairness or latency for the
level sources.  I'd be fine with that, especially when it's hidden off
in glibc.

Today AIO only allows submission of as many events as there are space in
the ring.  It mostly does this so its completion can drop an event in
the ring from any context.  If we back away from this so that we can
have long-lived source registration generate multiple edge events (and I
think we want to!), we have to be kind of careful.  A source could
generate an event while the ring is full.  The event could go in a list
but if userspace is collecting events from the ring directly the kernel won't be
told when there's space.  We'd first have to check this ready list when
later events are generated so that pending events on the list aren't
overlooked.  Userspace would also want to use the collection syscall as
the ring empties.  Neither seem hard.

So how does this sound?  It wouldn't take me long to build this off of
the current kevent patches.  We could see how it looks..

- z


* Re: [RFC 1/4] kevent: core files.
  2006-07-27 19:18   ` Zach Brown
@ 2006-07-27 20:06     ` Evgeniy Polyakov
  2006-07-27 21:32       ` Zach Brown
  2006-07-27 20:58     ` Benjamin LaHaise
  2006-08-01  1:02     ` David Miller
  2 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-27 20:06 UTC (permalink / raw)
  To: Zach Brown; +Cc: David Miller, linux-kernel, netdev

On Thu, Jul 27, 2006 at 12:18:42PM -0700, Zach Brown (zach.brown@oracle.com) wrote:
> > I have to say that the user API is not the nicest in the world.  Yet,
> > at the same time, I cannot think of a better one :)
> 
> I want to first focus on the event collection side of the API because I
> think we can definitely do better there :).  I hope we all agree that
> there is huge value in having one place where an application can wait
> for notification from many different sources.  If we get the collection
> side right we can later worry about generating events down the pipe from
> subsystems in a way that works best for them.
> 
> I'll sort of ramble about my thoughts here.
> 
> The easy part is fixing up the somewhat obfuscated collection call.
> Instead of coming in through a multiplexer that magically treats a void
> * as a struct kevent_user_control followed by N ukevents (as specified
> in the kevent_user_control!) we'd turn it into a more explicit
> collection syscall:
> 
> 	int kevent_getevents(int event_fd, struct ukevent *events,
> 		int min_events, int max_events,
> 		struct timeval *timeout);

I used only one syscall for all operations; the above syscall is
essentially what kevent_user_wait() does.

> This would look a lot less nutty in strace.  It lets apps specify if
> there is some minimum number of events they'd like the opportunity to
> process rather than waiting for the timeout to expire before the max
> number arrives.  (the latter is what kevent_user_wait() does today).  We
> can have the usual argument about whether *timeout is updated on a
> partial wake-up :).

Sure, it can be moved out into a separate syscall.

> That'd be a fine syscall collection interface, but we should try hard to
> explore being able to collect events without hitting the kernel.
> 
> Say we have a ring of event structs.  AIO has this today, but it sort of
> gets it wrong because each event element doesn't specify whether it is
> owned by the kernel or userspace.  (It really gets it wrong because it
> doesn't flush_dcache_page() after updating the ring via kmap(), but
> never mind that!  No one actually uses this mmap() AIO ring.)  In AIO
> today there is also a control struct mapped along with the ring that has
> head and tail pointers.  We don't want to bounce that cacheline around.
>  net/packet/af_packet.c gets this right with its tp_status member of
> tpacket_hdr.

When a kevent is ready, it is supposed to be "moved" into userspace;
whether it is ready or not is detected through its list entry for the
given list (or by just dequeueing it).
So there is no need to have a ring of all kevents and to select the
ready ones from it (actually kevent does keep a list of all kevents,
and it is possible to scan it and select the ready ones, but a special
ready list was created to speed that process up).
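
In other words, readiness is just membership of the ready list;
roughly (treat the field name as illustrative):

	static inline int kevent_is_ready(struct kevent *k)
	{
		/* on the ready list <=> marked ready; dequeueing it
		 * tests and consumes the readiness in one step */
		return !list_empty(&k->ready_entry);
	}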

> So as the kernel generates events in the ring it only produces an event
> if the ownership field says that userspace has consumed it and in doing
> so it sets the ownership field to tell userspace that an event is
> waiting.  userspace and the kernel now each follow their index around
> the ring as the ownership field lets them produce or consume the event
> at their index.  Can someone tell me if the cache coherence costs of
> this are extreme?  I'm hoping they're not.
> 
> So, great, glibc can now find pending events very quickly if they're
> waiting in the ring and can fall back to the collection syscall if it
> wants to wait and the ring is empty.  If it consumes events via the
> syscall it increases its ring index by the number the syscall returned.

It can get a pending event from the ready list - no need to scan the
whole ring. If one wants to wait on a particular kevent, it is possible
to get it from the "main" list and check its state.
But do we really need that functionality? If the user created several
kevents, it was not done just for the sake of kevent creation - the
user wants them back, so why would he wait on only one particular
event?
I had an idea about priorities for kevents, though; that could be
implemented as several ready lists.

> There's two things we should address: level events and the notion of
> only submitting as much as fits in the ring.
> 
> epoll and kevent both have the notion of an event type that always
> creates an event at the time of the collection syscall while the event
> source is on a ready list.  Think of epoll calling ->poll(POLLOUT) for
> an empty socket buffer at every sys_epoll_wait() call.  We can't have
> some source constantly spewing into the ring :/.  We could fix this by
> the API requiring that level events can *only* be collected through the
> syscall interface.  userspace could call into the collection syscall
> every N events collected through the ring, say.  N would be tuned to
> amortize the syscall cost and still provide fairness or latency for the
> level sources.  I'd be fine with that, especially when it's hidden off
> in glibc.

Hmm, it looks like I'm lost here...
Each kevent can be modified and even reused at any time, no matter
whether it is in the ready list or not.

> Today AIO only allows submission of as many events as there is space in
> the ring.  It mostly does this so its completion can drop an event in
> the ring from any context.  If we back away from this so that we can
> have long-lived source registration generate multiple edge events (and I
> think we want to!), we have to be kind of careful.  A source could
> generate an event while the ring is full.  The event could go in a list
> but if userspace is collecting events in userspace the kernel won't be
> told when there's space.  We'd first have to check this ready list when
> later events are generated so that pending events on the list aren't
> overlooked.  Userspace would also want to use the collection syscall as
> the ring empties.  Neither seem hard.

There is no problem with list lengths - when a kevent is added and the
queue length is sufficient, that automatically means that all lists
will accept the kevent, since all kevents live simultaneously in
several rings. There are no event generators - only existing,
i.e. requested, kevents are confirmed as ready or not, so the system
behaves exactly how the user asked it to work with the provided
requests.
For example, inotify will allocate new events and add them to the ring
each time they fire, but kevent's inode notification works in a
different way - it checks whether there are events for inode
creation/removal and marks those events as ready. Thus there are no
allocations and no problems with queues; the system scales well and
does not eat resources, but it has a problem: not much info can be
delivered to the user when an event is ready (for example, no
filenames; only the inode number can be transferred).

> So how does this sound?  It wouldn't take me long to build this off of
> the current kevent patches.  We could see how it looks..

I especially like the idea about world happiness in a week or so :)

> - z

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 19:18   ` Zach Brown
  2006-07-27 20:06     ` Evgeniy Polyakov
@ 2006-07-27 20:58     ` Benjamin LaHaise
  2006-07-27 21:44       ` Zach Brown
  2006-08-01  1:02     ` David Miller
  2 siblings, 1 reply; 160+ messages in thread
From: Benjamin LaHaise @ 2006-07-27 20:58 UTC (permalink / raw)
  To: Zach Brown; +Cc: David Miller, Evgeniy Polyakov, linux-kernel, netdev

On Thu, Jul 27, 2006 at 12:18:42PM -0700, Zach Brown wrote:
> The easy part is fixing up the somewhat obfuscated collection call.
> Instead of coming in through a multiplexer that magically treats a void
> * as a struct kevent_user_control followed by N ukevents (as specified
> in the kevent_user_control!) we'd turn it into a more explicit
> collection syscall:
> 
> 	int kevent_getevents(int event_fd, struct ukevent *events,
> 		int min_events, int max_events,
> 		struct timeval *timeout);

You've just reinvented io_getevents().  What exactly are we getting from 
reinventing this (aside from breaking existing apps and creating more of 
an API mess)?

> Say we have a ring of event structs.  AIO has this today, but it sort of
> gets it wrong because each event element doesn't specify whether it is
> owned by the kernel or userspace.  (It really gets it wrong because it
> doesn't flush_dcache_page() after updating the ring via kmap(), but
> never mind that!  No one actually uses this mmap() AIO ring.)  In AIO
> today there is also a control struct mapped along with the ring that has
> head and tail pointers.  We don't want to bounce that cacheline around.
>  net/packet/af_packet.c gets this right with its tp_status member of
> tpacket_hdr.

That could be rev'd in the mmap() ring buffer, as there are compat and 
incompat bits for changing the structure layout.  As for bouncing the 
cacheline of head/tail around, I don't think it matters on real machines, 
as the multithreaded/SMP case will hit that cacheline bouncing if the 
user is sharing the event ring between multiple threads on multiple CPUs.  
The only way around that is to use multiple event rings, say one per node, 
at which point you have to do load balancing of io requests explicitly 
between queues (which might be worth it).

> So, great, glibc can now find pending events very quickly if they're
> waiting in the ring and can fall back to the collection syscall if it
> wants to wait and the ring is empty.  If it consumes events via the
> syscall it increases its ring index by the number the syscall returned.
> 
> There's two things we should address: level events and the notion of
> only submitting as much as fits in the ring.
> 
> epoll and kevent both have the notion of an event type that always
> creates an event at the time of the collection syscall while the event
> source is on a ready list.  Think of epoll calling ->poll(POLLOUT) for
> an empty socket buffer at every sys_epoll_wait() call.  We can't have
> some source constantly spewing into the ring :/.  We could fix this by
> the API requiring that level events can *only* be collected through the
> syscall interface.  userspace could call into the collection syscall
> every N events collected through the ring, say.  N would be tuned to
> amortize the syscall cost and still provide fairness or latency for the
> level sources.  I'd be fine with that, especially when it's hidden off
> in glibc.

This is exactly why I think level triggered events are nasty.  It's 
impossible to do cleanly without requiring a syscall.

> Today AIO only allows submission of as many events as there is space in
> the ring.  It mostly does this so its completion can drop an event in
> the ring from any context.  If we back away from this so that we can
> have long-lived source registration generate multiple edge events (and I
> think we want to!), we have to be kind of careful.  A source could
> generate an event while the ring is full.  The event could go in a list
> but if userspace is collecting events in userspace the kernel won't be
> told when there's space.  We'd first have to check this ready list when
> later events are generated so that pending events on the list aren't
> overlooked.  Userspace would also want to use the collection syscall as
> the ring empties.  Neither seem hard.

As soon as you allow queueing events up in kernel space, it becomes 
necessary to do another syscall after pulling events out of the queue, 
which is a waste of CPU cycles when you're under heavy load (exactly the 
point at which you want the system to be its most efficient).  Given that 
growing the ring buffer is easy enough to do, I'm not sure that the hit 
is worth it.  At some point there has to be some form of flow control 
involved, and it is much better if it is explicitly obvious where this 
happens (as opposed to signal queues and our wonderful OOM handling).

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 20:06     ` Evgeniy Polyakov
@ 2006-07-27 21:32       ` Zach Brown
  2006-07-28  5:23         ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Zach Brown @ 2006-07-27 21:32 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: David Miller, linux-kernel, netdev


>> 	int kevent_getevents(int event_fd, struct ukevent *events,
>> 		int min_events, int max_events,
>> 		struct timeval *timeout);
> 
> I used only one syscall for all operations; the above syscall is
> essentially what kevent_user_wait() does.

Essentially, yes, but the differences are important.  It's important to
have a clear syscall interface instead of nesting through multiplexers.
 And we should get the batching/latency inputs right.  (I'm for both
min/max elements and arguably timeouts, but I could understand not
wanting to go *that* far.)
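
For example, a caller that wants low latency but still batches
opportunistically could then do something like this (a sketch against
the prototype above, kfd being the kevent fd):

	struct ukevent evs[64];
	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };

	/* return as soon as at least one event is ready, collect up
	 * to 64 in one go, give up after a second */
	int n = kevent_getevents(kfd, evs, 1, 64, &tv);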

> Hmm, it looks like I'm lost here...

Yeah, it seems my description might not have sunk in :).  We're giving
userspace a way to collect events without performing a system call.

> I especially like the idea about world happiness in a week or so :)

A few weeks! :)

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 20:58     ` Benjamin LaHaise
@ 2006-07-27 21:44       ` Zach Brown
  2006-07-27 22:02         ` Benjamin LaHaise
  0 siblings, 1 reply; 160+ messages in thread
From: Zach Brown @ 2006-07-27 21:44 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: David Miller, Evgeniy Polyakov, linux-kernel, netdev


>> 	int kevent_getevents(int event_fd, struct ukevent *events,
>> 		int min_events, int max_events,
>> 		struct timeval *timeout);
> 
> You've just reinvented io_getevents().

Well, that's certainly one inflammatory way to put it.  I would describe
it as suggesting that the kevents collection interface not lose the
nicer properties of io_getevents().

> What exactly are we getting from 
> reinventing this (aside from breaking existing apps and creating more of 
> an API mess)?

A generic event collection interface that isn't so strongly bound to the
existing semantics of io_setup() and io_submit().  It can be a file
descriptor instead of a mysterious cookie/pointer to the mapped region,
to start.

> incompat bits for changing the structure layout.  As for bouncing the 
> cacheline of head/tail around, I don't think it matters on real machines, 
> as the multithreaded/SMP case will hit that cacheline bouncing if the 
> user is sharing the event ring between multiple threads on multiple CPUs.  
> The only way around that is to use multiple event rings, say one per node, 
> at which point you have to do load balancing of io requests explicitly 
> between queues (which might be worth it).

Sure, so maybe we experiment with these things in the context of the
kevent patches and maybe merge them back into the AIO paths if in the
end that's the right thing to do.  I see no problem with separating
development from the existing code.

>> epoll and kevent both have the notion of an event type that always
>> creates an event at the time of the collection syscall while the event
>> source is on a ready list.  Think of epoll calling ->poll(POLLOUT) for
>> an empty socket buffer at every sys_epoll_wait() call.  We can't have
>> some source constantly spewing into the ring :/.  We could fix this by
>> the API requiring that level events can *only* be collected through the
>> syscall interface.  userspace could call into the collection syscall
>> every N events collected through the ring, say.  N would be tuned to
>> amortize the syscall cost and still provide fairness or latency for the
>> level sources.  I'd be fine with that, especially when it's hidden off
>> in glibc.
> 
> This is exactly why I think level triggered events are nasty.  It's 
> impossible to do cleanly without requiring a syscall.

I'm not convinced that it isn't possible to get a sufficiently clean
interface that involves the mix.

> As soon as you allow queueing events up in kernel space, it becomes 
> necessary to do another syscall after pulling events out of the queue, 
> which is a waste of CPU cycles when you're under heavy load (exactly the 
> point at which you want the system to be its most efficient).

If we've just consumed a full ring worth of events, and done real work
with them, I'm not convinced that an empty syscall is going to be that
painful.  If we're really under load it might well return some newly
arrived events.  It becomes a mix of ring completions and syscall
completions.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 21:44       ` Zach Brown
@ 2006-07-27 22:02         ` Benjamin LaHaise
  2006-07-28  5:39           ` Evgeniy Polyakov
  2006-07-28 19:01           ` Zach Brown
  0 siblings, 2 replies; 160+ messages in thread
From: Benjamin LaHaise @ 2006-07-27 22:02 UTC (permalink / raw)
  To: Zach Brown; +Cc: David Miller, Evgeniy Polyakov, linux-kernel, netdev

On Thu, Jul 27, 2006 at 02:44:50PM -0700, Zach Brown wrote:
> 
> >> 	int kevent_getevents(int event_fd, struct ukevent *events,
> >> 		int min_events, int max_events,
> >> 		struct timeval *timeout);
> > 
> > You've just reinvented io_getevents().
> 
> Well, that's certainly one inflammatory way to put it.  I would describe
> it as suggesting that the kevents collection interface not lose the
> nicer properties of io_getevents().

Perhaps, but there seems to be a lot of talk about introducing new APIs 
where it isn't entirely clear that it is needed.  Sorry if that sounded 
rather acerbic.

> > What exactly are we getting from 
> > reinventing this (aside from breaking existing apps and creating more of 
> > an API mess)?
> 
> A generic event collection interface that isn't so strongly bound to the
> existing semantics of io_setup() and io_submit().  It can be a file
> descriptor instead of a mysterious cookie/pointer to the mapped region,
> to start.

Things were like that at one point in time, but file descriptors turn out 
to introduce a huge gaping security hole with SUID programs.  The problem 
is that any event context is closely tied to the address space of the 
thread issuing the syscalls, and file descriptors do not have this close 
binding.

> Sure, so maybe we experiment with these things in the context of the
> kevent patches and maybe merge them back into the AIO paths if in the
> end that's the right thing to do.  I see no problem with separating
> development from the existing code.

Excellent!

> >> epoll and kevent both have the notion of an event type that always
> >> creates an event at the time of the collection syscall while the event
> >> source is on a ready list.  Think of epoll calling ->poll(POLLOUT) for
> >> an empty socket buffer at every sys_epoll_wait() call.  We can't have
> >> some source constantly spewing into the ring :/.  We could fix this by
> >> the API requiring that level events can *only* be collected through the
> >> syscall interface.  userspace could call into the collection syscall
> >> every N events collected through the ring, say.  N would be tuned to
> >> amortize the syscall cost and still provide fairness or latency for the
> >> level sources.  I'd be fine with that, especially when it's hidden off
> >> in glibc.
> > 
> > This is exactly why I think level triggered events are nasty.  It's 
> > impossible to do cleanly without requiring a syscall.
> 
> I'm not convinced that it isn't possible to get a sufficiently clean
> interface that involves the mix.

My argument is that this approach introduces a slow path into the heavily 
loaded server case.  If you can show me how to avoid that, I'd be happy to 
see such an implementation. =-)

> > As soon as you allow queueing events up in kernel space, it becomes 
> > necessary to do another syscall after pulling events out of the queue, 
> > which is a waste of CPU cycles when you're under heavy load (exactly the 
> > point at which you want the system to be its most efficient).
> 
> If we've just consumed a full ring worth of events, and done real work
> with them, I'm not convinced that an empty syscall is going to be that
> painful.  If we're really under load it might well return some newly
> arrived events.  It becomes a mix of ring completions and syscall
> completions.

Except that you're not usually pulling a full ring worth of events at a 
time, more often just one.  One of the driving forces behind AIO use is 
in realtime apps where you don't want to eat occasional spikes in the 
latency of request processing; one just wants to eat the highest priority 
event, then work on the next.  By keeping each step small and manageable, 
the properties of the system are much easier to predict.  Yes, batching 
can be helpful performance-wise, but it is somewhat opposite to the design 
criteria that need to be considered.  The right way to cope with that may 
be to have two different modes of operation that trade off one way or the 
other on the batching question.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 21:32       ` Zach Brown
@ 2006-07-28  5:23         ` Evgeniy Polyakov
  2006-07-28 18:33           ` Zach Brown
  2006-08-01  1:05           ` [RFC 1/4] kevent: core files David Miller
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-28  5:23 UTC (permalink / raw)
  To: Zach Brown; +Cc: David Miller, linux-kernel, netdev

On Thu, Jul 27, 2006 at 02:32:05PM -0700, Zach Brown (zach.brown@oracle.com) wrote:
> 
> >> 	int kevent_getevents(int event_fd, struct ukevent *events,
> >> 		int min_events, int max_events,
> >> 		struct timeval *timeout);
> > 
> > I used only one syscall for all operations; the above syscall is
> > essentially what kevent_user_wait() does.
> 
> Essentially, yes, but the differences are important.  It's important to
> have a clear syscall interface instead of nesting through multiplexers.
>  And we should get the batching/latency inputs right.  (I'm for both
> min/max elements and arguably timeouts, but I could understand not
> wanting to go *that* far.)

I completely agree that the existing kevent interface is not the best,
so I'm open to any suggestions.
Should kevent creation/removal/modification be separated too?

> > Hmm, it looks like I'm lost here...
> 
> Yeah, it seems my description might not have sunk in :).  We're giving
> userspace a way to collect events without performing a system call.

And why do we want this?
How is glibc supposed to determine that some events have already fired
and that such requests will return immediately, or, for example, how
will timer events be managed?

> > I especially like the idea about world happiness in a week or so :)
> 
> A few weeks! :)

No matter; it's nothing after a couple of million years of human evolution :)

> - z

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 22:02         ` Benjamin LaHaise
@ 2006-07-28  5:39           ` Evgeniy Polyakov
  2006-07-28 19:01           ` Zach Brown
  1 sibling, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-28  5:39 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Zach Brown, David Miller, linux-kernel, netdev

On Thu, Jul 27, 2006 at 06:02:38PM -0400, Benjamin LaHaise (bcrl@kvack.org) wrote:
> On Thu, Jul 27, 2006 at 02:44:50PM -0700, Zach Brown wrote:
> > 
> > >> 	int kevent_getevents(int event_fd, struct ukevent *events,
> > >> 		int min_events, int max_events,
> > >> 		struct timeval *timeout);
> > > 
> > > You've just reinvented io_getevents().
> > 
> > Well, that's certainly one inflammatory way to put it.  I would describe
> > it as suggesting that the kevents collection interface not lose the
> > nicer properties of io_getevents().
> 
> Perhaps, but there seems to be a lot of talk about introducing new APIs 
> where it isn't entirely clear that it is needed.  Sorry if that sounded 
> rather acerbic.

Except that kevent has completely different structures, it is possible
to have many more types of events than AIO has, and there is no need to
specifically carry bytes, offsets and so on.
The magic pointer returned from the AIO syscall is not pollable either.

> > > What exactly are we getting from 
> > > reinventing this (aside from breaking existing apps and creating more of 
> > > an API mess)?
> > 
> > A generic event collection interface that isn't so strongly bound to the
> > existing semantics of io_setup() and io_submit().  It can be a file
> > descriptor instead of a mysterious cookie/pointer to the mapped region,
> > to start.
> 
> Things were like that at one point in time, but file descriptors turn out 
> to introduce a huge gaping security hole with SUID programs.  The problem 
> is that any event context is closely tied to the address space of the 
> thread issuing the syscalls, and file descriptors do not have this close 
> binding.

That is true for all shared resources, no matter whether it is a file
descriptor or a mapped area.

> > Sure, so maybe we experiment with these things in the context of the
> > kevent patches and maybe merge them back into the AIO paths if in the
> > end that's the right thing to do.  I see no problem with separating
> > development from the existing code.
> 
> Excellent!
> 
> > >> epoll and kevent both have the notion of an event type that always
> > >> creates an event at the time of the collection syscall while the event
> > >> source is on a ready list.  Think of epoll calling ->poll(POLLOUT) for
> > >> an empty socket buffer at every sys_epoll_wait() call.  We can't have
> > >> some source constantly spewing into the ring :/.  We could fix this by
> > >> the API requiring that level events can *only* be collected through the
> > >> syscall interface.  userspace could call into the collection syscall
> > >> every N events collected through the ring, say.  N would be tuned to
> > >> amortize the syscall cost and still provide fairness or latency for the
> > >> level sources.  I'd be fine with that, especially when it's hidden off
> > >> in glibc.
> > > 
> > > This is exactly why I think level triggered events are nasty.  It's 
> > > impossible to do cleanly without requiring a syscall.
> > 
> > I'm not convinced that it isn't possible to get a sufficiently clean
> > interface that involves the mix.
> 
> My argument is that this approach introduces a slow path into the heavily 
> loaded server case.  If you can show me how to avoid that, I'd be happy to 
> see such an implementation. =-)

If I understand you correctly, you are talking about level-triggered
events, which arrive continuously? Such events are handled by only one
kevent; if it is ready, that means that at least one such event has
fired. One could add the number of fired events as a hint inside the
kevent.

> > > As soon as you allow queueing events up in kernel space, it becomes 
> > > necessary to do another syscall after pulling events out of the queue, 
> > > which is a waste of CPU cycles when you're under heavy load (exactly the 
> > > point at which you want the system to be its most efficient).
> > 
> > If we've just consumed a full ring worth of events, and done real work
> > with them, I'm not convinced that an empty syscall is going to be that
> > painful.  If we're really under load it might well return some newly
> > arrived events.  It becomes a mix of ring completions and syscall
> > completions.
> 
> Except that you're not usually pulling a full ring worth of events at a 
> time, more often just one.  One of the driving forces behind AIO use is 
> in realtime apps where you don't want to eat occasional spikes in the 
> latency of request processing; one just wants to eat the highest priority 
> event, then work on the next.  By keeping each step small and manageable, 
> the properties of the system are much easier to predict.  Yes, batching 
> can be helpful performance-wise, but it is somewhat opposite to the design 
> criteria that need to be considered.  The right way to cope with that may 
> be to have two different modes of operation that trade off one way or the 
> other on the batching question.

It is the user who should decide whether he wants one, all, or at
least one event. kevent supports all those types of requests; its
behaviour depends on the parameters.

> 		-ben

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28  5:23         ` Evgeniy Polyakov
@ 2006-07-28 18:33           ` Zach Brown
  2006-07-28 18:44             ` Evgeniy Polyakov
  2006-08-01  1:05           ` [RFC 1/4] kevent: core files David Miller
  1 sibling, 1 reply; 160+ messages in thread
From: Zach Brown @ 2006-07-28 18:33 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: David Miller, linux-kernel, netdev


> I completely agree that the existing kevent interface is not the
> best, so I'm open to any suggestions.
> Should kevent creation/removal/modification be separated too?

Yeah, I think so.

>>> Hmm, it looks like I'm lost here...
>> Yeah, it seems my description might not have sunk in :).  We're giving
>> userspace a way to collect events without performing a system call.
> 
> And why do we want this?

So that event collection can be very efficient.

> How is glibc supposed to determine that some events have already
> fired and that such requests will return immediately, or, for
> example, how will timer events be managed?

...

That was what my previous mail was all about!

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28 18:33           ` Zach Brown
@ 2006-07-28 18:44             ` Evgeniy Polyakov
  2006-07-28 19:10               ` Zach Brown
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-28 18:44 UTC (permalink / raw)
  To: Zach Brown; +Cc: David Miller, linux-kernel, netdev

On Fri, Jul 28, 2006 at 11:33:16AM -0700, Zach Brown (zach.brown@oracle.com) wrote:
> 
> > I completely agree that the existing kevent interface is not the
> > best, so I'm open to any suggestions.
> > Should kevent creation/removal/modification be separated too?
> 
> Yeah, I think so.

So, I'm going to create kevent_create/destroy/control and kevent_get_events().
Or are there any better names?

> >>> Hmm, it looks like I'm lost here...
> >> Yeah, it seems my description might not have sunk in :).  We're giving
> >> userspace a way to collect events without performing a system call.
> > 
> > And why do we want this?
> 
> So that event collection can be very efficient.
> 
> > How is glibc supposed to determine that some events have already
> > fired and that such requests will return immediately, or, for
> > example, how will timer events be managed?
> 
> ...
> 
> That was what my previous mail was all about!

Some events are impossible to create in userspace (like a timer
notification, which requires starting a timer and checking when it has
completed).
Actually all events are part of the kernel, since glibc does not have
any knowledge about the in-kernel state machines which are bound to the
appropriate kevents, so each kevent takes at least two syscalls (create
and get ready), and I do not see how, for example, glibc can avoid them
when the user has requested POLLIN or a similar event for network
dataflow.
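
In sketch form, with the split syscalls being discussed in this thread
(the type and flag constants are illustrative, not necessarily the
patch's actual names):

	struct ukevent ev = {
		.type	= KEVENT_SOCKET,	/* illustrative constant */
		.event	= KEVENT_SOCKET_RECV,	/* "data ready to read" */
		.id	= sockfd,
	};

	/* syscall 1: register interest, binding the in-kernel state
	 * machine to this kevent */
	kevent_control(kfd, KEVENT_CTL_ADD, &ev, 1);

	/* syscall 2: collect it once the socket becomes readable */
	kevent_get_events(kfd, &ev, 1, 1, NULL);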

As for syscall speed on Linux, the last time I checked an empty
syscall took about 100ns on an AMD Athlon 3500+.

> - z

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 22:02         ` Benjamin LaHaise
  2006-07-28  5:39           ` Evgeniy Polyakov
@ 2006-07-28 19:01           ` Zach Brown
  2006-07-28 19:24             ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Zach Brown @ 2006-07-28 19:01 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: David Miller, Evgeniy Polyakov, linux-kernel, netdev


> Things were like that at one point in time, but file descriptors turn out 
> to introduce a huge gaping security hole with SUID programs.  The problem 
> is that any event context is closely tied to the address space of the 
> thread issuing the syscalls, and file descriptors do not have this close 
> binding.

Can you go into that hole in more detail?

> Except that you're not usually pulling a full ring worth of events at a 
> time, more often just one.

OK, but then to wait for it you were already sleeping in the kernel, right?

Clearly we should port httpd to kevents and take some measurements :)

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28 18:44             ` Evgeniy Polyakov
@ 2006-07-28 19:10               ` Zach Brown
  2006-07-29  3:38                 ` Ulrich Drepper
  0 siblings, 1 reply; 160+ messages in thread
From: Zach Brown @ 2006-07-28 19:10 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: David Miller, linux-kernel, netdev, Ulrich Drepper


> So, I'm going to create kevent_create/destroy/control and kevent_get_events().
> Or are there any better names?

Yeah, that sounds good.

> Some events are impossible to create in userspace (like a timer
> notification, which requires starting a timer and checking when it
> has completed).

We're not talking about *creating* events in userspace, we're talking
about checking for their completion events in the ring.

> and get ready), and I do not see how, for example, glibc can avoid
> them when the user has requested POLLIN or a similar event for
> network dataflow.

There are events that can be generated by kernel code paths as the
underlying operation completes.  Network sockets have the hooks to do
this with SIGIO; it's very natural for the storage completion paths,
etc.  So that kernel code would update the ring, which userspace could
check.  AIO does this today.
 Userspace would still have to use the syscall to sleep waiting for new
events when the ring is empty.

> As for syscall speed on Linux, the last time I checked an empty
> syscall took about 100ns on an AMD Athlon 3500+.

Oh, sure, but still nice to avoid.

I'm mostly pursuing this because Ulrich seemed so insistent on it in his
paper and talk.  I will be very sad if we don't have aggressive glibc
support for this generic event collection interface and so I want very
much to keep him engaged.  Ulrich, would you be satisfied if we didn't
have the userspace mapped ring on the first pass and only had a
collection syscall?

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28 19:01           ` Zach Brown
@ 2006-07-28 19:24             ` Evgeniy Polyakov
  2006-07-28 19:34               ` Zach Brown
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-28 19:24 UTC (permalink / raw)
  To: Zach Brown; +Cc: Benjamin LaHaise, David Miller, linux-kernel, netdev

On Fri, Jul 28, 2006 at 12:01:28PM -0700, Zach Brown (zach.brown@oracle.com) wrote:
> Clearly we should port httpd to kevents and take some measurements :)

One of my main kevent benchmarks (socket notifications for
accept/receive) is a handmade HTTP server.
I compared it with FreeBSD kqueue, epoll and kevent_poll (the generic
poll/select notifications ported to kevent) based servers (it is the
same server, just with different event functions).

The client was httperf; I ran it with 30k connections in bursts of 3k
connections with a 1 second timeout between bursts.


Here are the results:

kevent:	more than 2600 requests/second
epoll and kevent_poll: about 1600-1800 requests/second
kqueue: an enormous number of connection reset errors (only 62% of
successful connections; likely a misconfiguration - a default FreeBSD
6-something install does not allow such rates at all).

More info can be found on kevent homepage:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

> - z

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28 19:24             ` Evgeniy Polyakov
@ 2006-07-28 19:34               ` Zach Brown
  2006-07-28 19:37                 ` Zach Brown
  0 siblings, 1 reply; 160+ messages in thread
From: Zach Brown @ 2006-07-28 19:34 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Benjamin LaHaise, David Miller, linux-kernel, netdev

Evgeniy Polyakov wrote:
> On Fri, Jul 28, 2006 at 12:01:28PM -0700, Zach Brown (zach.brown@oracle.com) wrote:
>> Clearly we should port httpd to kevents and take some measurements :)
> 
> One of my main kevent benchmarks (socket notifications for
> accept/receive) is handmade http server.

Yeah, so I noticed.  That's a good starting point but I'm more
interested in seeing the work integrated with servers that have to
survive outside of benchmarking runs.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28 19:34               ` Zach Brown
@ 2006-07-28 19:37                 ` Zach Brown
  0 siblings, 0 replies; 160+ messages in thread
From: Zach Brown @ 2006-07-28 19:37 UTC (permalink / raw)
  To: Zach Brown
  Cc: Evgeniy Polyakov, Benjamin LaHaise, David Miller, linux-kernel, netdev


>>> Clearly we should port httpd to kevents and take some measurements :)

oh, I see, I forgot the 't' in 'thttpd'.  My mistake.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28 19:10               ` Zach Brown
@ 2006-07-29  3:38                 ` Ulrich Drepper
  2006-07-29  4:32                   ` Nicholas Miell
  2006-07-29 15:44                   ` Evgeniy Polyakov
  0 siblings, 2 replies; 160+ messages in thread
From: Ulrich Drepper @ 2006-07-29  3:38 UTC (permalink / raw)
  To: Zach Brown; +Cc: Evgeniy Polyakov, David Miller, linux-kernel, netdev

Zach Brown wrote:
> Ulrich, would you be satisfied if we didn't
> have the userspace mapped ring on the first pass and only had a
> collection syscall?

I'm not the one to make a call but why rush things?  Let's do it right
from the start.  Later changes can only lead to problems with users of
the earlier interface.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29  3:38                 ` Ulrich Drepper
@ 2006-07-29  4:32                   ` Nicholas Miell
  2006-07-29 15:48                     ` Evgeniy Polyakov
  2006-07-30  8:08                     ` Ulrich Drepper
  2006-07-29 15:44                   ` Evgeniy Polyakov
  1 sibling, 2 replies; 160+ messages in thread
From: Nicholas Miell @ 2006-07-29  4:32 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Zach Brown, Evgeniy Polyakov, David Miller, linux-kernel, netdev

On Fri, 2006-07-28 at 20:38 -0700, Ulrich Drepper wrote:
> Zach Brown wrote:
> > Ulrich, would you be satisfied if we didn't
> > have the userspace mapped ring on the first pass and only had a
> > collection syscall?
> 
> I'm not the one to make a call but why rush things?  Let's do it right
> from the start.  Later changes can only lead to problems with users of
> the earlier interface.
> 

Speaking of API design choices, I saw your OLS paper and was wondering
if you were familiar with the Solaris port APIs* and, if so, whether
you could please comment on how your proposed event channels are
different/better.


* http://docs.sun.com/app/docs/doc/816-5168/6mbb3hrir?a=view

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29  3:38                 ` Ulrich Drepper
  2006-07-29  4:32                   ` Nicholas Miell
@ 2006-07-29 15:44                   ` Evgeniy Polyakov
  2006-07-29 16:18                     ` Ulrich Drepper
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-29 15:44 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Zach Brown, David Miller, linux-kernel, netdev

On Fri, Jul 28, 2006 at 08:38:02PM -0700, Ulrich Drepper (drepper@redhat.com) wrote:
> Zach Brown wrote:
> > Ulrich, would you be satisfied if we didn't
> > have the userspace mapped ring on the first pass and only had a
> > collection syscall?
> 
> I'm not the one to make a call but why rush things?  Let's do it right
> from the start.  Later changes can only lead to problems with users of
> the earlier interface.

Btw, why do we want a mapped ring of ready events?
If the user requested some events, he definitely wants to get them back
when they are ready, not to check first and then get them.
Could you please explain more about this issue?

> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
> 



-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29  4:32                   ` Nicholas Miell
@ 2006-07-29 15:48                     ` Evgeniy Polyakov
  2006-07-29 20:54                       ` Nicholas Miell
  2006-07-30  8:08                     ` Ulrich Drepper
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-29 15:48 UTC (permalink / raw)
  To: Nicholas Miell
  Cc: Ulrich Drepper, Zach Brown, David Miller, linux-kernel, netdev

On Fri, Jul 28, 2006 at 09:32:42PM -0700, Nicholas Miell (nmiell@comcast.net) wrote:
> Speaking of API design choices, I saw your OLS paper and was
> wondering if you were familiar with the Solaris port APIs* and, if
> so, whether you could please comment on how your proposed event
> channels are different/better.

As far as kevents are concerned, userspace "ports" are just usual
users of kevents, like timer notifications. Add another syscall to
"complete" requested kevents and you get exactly Solaris ports.
It is fairly simple to implement on top of kevents; I just do not see
immediate benefits from it.

> -- 
> Nicholas Miell <nmiell@comcast.net>

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29 15:44                   ` Evgeniy Polyakov
@ 2006-07-29 16:18                     ` Ulrich Drepper
  2006-07-31 10:33                       ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Ulrich Drepper @ 2006-07-29 16:18 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Zach Brown, David Miller, linux-kernel, netdev

Evgeniy Polyakov wrote:
> Btw, why do we want a mapped ring of ready events?
> If the user requested some events, he definitely wants to get them
> back when they are ready, not to check first and then get them.
> Could you please explain more about this issue?

It of course makes no sense to enter the kernel to actually get the
event.  This should be done by storing the event in the ring buffer.
I.e., there are two ways to get an event:

- with a syscall.  This can report as many events at once as the caller
  provides space for.  And no event which is reported in the ring
  buffer should be reported this way.

- if there is space, report it in the ring buffer.  Yes, the buffer
  can be optional; then all events are reported by the system call.


So the use case would be like this:


wait_and_get_event:

  is buffer empty ?

    yes -> make syscall

    no -> get event from buffer


To avoid races, the syscall needs to take a parameter indicating the
last event checked out from the buffer.  If in the meantime the kernel
put another event in the buffer the syscall immediately returns.
Similar to what we do in the futex syscall.
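
In C, that loop might look roughly like this; the last-consumed-index
argument is the anti-race parameter just described, so the exact
signature and the ring helpers are only illustrative:

	struct ukevent ev;

	for (;;) {
		if (ring_consume(ring, &ev))	/* no kernel entry needed */
			break;
		/* hand the kernel our last consumed index; if an event
		 * was published in the meantime it returns immediately
		 * instead of sleeping, futex-style */
		if (kevent_get_events(kfd, &ev, 1, 1,
				      ring->user_index, NULL) > 0)
			break;
	}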

The question is how to best represent the ring buffer.  Zach and some
others had some ready responses in Ottawa.  The important thing is to
avoid cache line ping pong when possible.


Is the ring buffer absolutely necessary?  Probably not.  But it has the
potential to help quite a bit.  Don't look at the problem to solve in
the context of heavy I/O operations, where another syscall here and
there doesn't matter.  With this single event mechanism for every
possible event the kernel can generate, programming can look quite
different.  E.g., every read() call can implicitly be changed into an
async read
call followed by a user-level reschedule.  This rescheduling allows
another thread of execution to run while the read request is processed.
 I.e., it's basically a setjmp() followed by a goto into the inner loop
to get the next event.  And now suddenly the event notification
mechanism really should be as fast as possible.  If we submit basically
every request asynchronously and are not creating dedicated threads for
specific tasks anymore we

a) have a lot more event notifications

b) the probability of an event being reported when we want to receive
   the next one is higher (i.e., the case where no syscall vs. syscall
   makes a difference)

Yes, all this will require changes in the way programs are written, but we
shouldn't limit the way we can write programs unnecessarily.  I think
that given increasing discrepancies in relative speed/latency of the
peripherals and the CPU this is one possible solution to keep the CPUs
busy without resorting to a gazillion separate threads in each program.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29 15:48                     ` Evgeniy Polyakov
@ 2006-07-29 20:54                       ` Nicholas Miell
  0 siblings, 0 replies; 160+ messages in thread
From: Nicholas Miell @ 2006-07-29 20:54 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Ulrich Drepper, Zach Brown, David Miller, linux-kernel, netdev

On Sat, 2006-07-29 at 19:48 +0400, Evgeniy Polyakov wrote:
> On Fri, Jul 28, 2006 at 09:32:42PM -0700, Nicholas Miell (nmiell@comcast.net) wrote:
> > Speaking of API design choices, I saw your OLS paper and was
> > wondering if you were familiar with the Solaris port APIs* and, if
> > so, whether you could please comment on how your proposed event
> > channels are different/better.
> 
> As far as kevents are concerned, userspace "ports" are just usual
> users of kevents, like timer notifications. Add another syscall to
> "complete" requested kevents and you get exactly Solaris ports.
> It is fairly simple to implement on top of kevents; I just do not see
> immediate benefits from it.
> 

Sorry, I wasn't talking about kevent, I was talking about the interfaces
described in "The Need for Asynchronous, Zero-Copy Network I/O" by
Ulrich Drepper -- specifically the ec_t type and related functions and
the modifications to struct sigevent.

-- 
Nicholas Miell <nmiell@comcast.net>


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29  4:32                   ` Nicholas Miell
  2006-07-29 15:48                     ` Evgeniy Polyakov
@ 2006-07-30  8:08                     ` Ulrich Drepper
  1 sibling, 0 replies; 160+ messages in thread
From: Ulrich Drepper @ 2006-07-30  8:08 UTC (permalink / raw)
  To: Nicholas Miell
  Cc: Zach Brown, Evgeniy Polyakov, David Miller, linux-kernel, netdev

Nicholas Miell wrote:
> [...] and was wondering
> if you were familiar with the Solaris port APIs* and,

I wasn't.


> if so, you could
> please comment on how your proposed event channels are different/better.

There indeed is not much difference.  The differences are in the
details.  The way those ports are specified doesn't allow much room for
further optimizations.  E.g., the userlevel ring buffer isn't possible.
But mostly it's the same semantics.  The ec_t type in my text would
also be better as a file descriptor, since otherwise it cannot be
transported via Unix stream sockets.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-29 16:18                     ` Ulrich Drepper
@ 2006-07-31 10:33                       ` Evgeniy Polyakov
  2006-07-31 10:35                         ` Herbert Xu
                                           ` (10 more replies)
  0 siblings, 11 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-31 10:33 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Zach Brown, David Miller, linux-kernel, netdev

On Sat, Jul 29, 2006 at 09:18:47AM -0700, Ulrich Drepper (drepper@redhat.com) wrote:
> Evgeniy Polyakov wrote:
> > Btw, why do we want a mapped ring of ready events?
> > If the user requested some events, he definitely wants to get them
> > back when they are ready, not to check first and then get them.
> > Could you please explain more about this issue?
> 
> It of course makes no sense to enter the kernel to actually get the
> event.  This should be done by storing the event in the ring buffer.
> I.e., there are two ways to get an event:
> 
> - with a syscall.  This can report as many events at once as the caller
>   provides space for.  And no event which is reported in the ring
>   buffer should be reported this way.
> 
> - if there is space, report it in the ring buffer.  Yes, the buffer
>   can be optional; then all events are reported by the system call.

That requires a copy, which can negate the saved syscall overhead.
Do we really want that to be done?
 
> So the use case would be like this:
> 
> 
> wait_and_get_event:
> 
>   is buffer empty ?
> 
>     yes -> make syscall
> 
>     no -> get event from buffer
> 
> 
> To avoid races, the syscall needs to take a parameter indicating the
> last event checked out from the buffer.  If in the meantime the kernel
> put another event in the buffer the syscall immediately returns.
> Similar to what we do in the futex syscall.

And how "misordering" between queue and buffer is going to be managed?
I.e. when buffer is full and events are placed into queue, so syscall
could get them, and then syscall is called to get events from the queue
but not from the buffer - we can endup taking events from buffer while
old are placed in the queue.
And how waiting will be done without syscalls? Will glibc take care of
it?

> The question is how to best represent the ring buffer.  Zach and some
> others had some ready responses in Ottawa.  The important thing is to
> avoid cache line ping pong when possible.
> 
> Is the ring buffer absolutely necessary?  Probably not.  But it has the
> potential to help quite a bit.  Don't look at the problem to solve in
> the context of heavy I/O operations, where another syscall here and
> there doesn't matter.  With this single event mechanism for every
> possible event the kernel can generate, programming can look quite
> different.  E.g., every read() call can implicitly be changed into an
> async read
> call followed by a user-level reschedule.  This rescheduling allows
> another thread of execution to run while the read request is processed.
>  I.e., it's basically a setjmp() followed by a goto into the inner loop
> to get the next event.  And now suddenly the event notification
> mechanism really should be as fast as possible.  If we submit basically
> every request asynchronously and are not creating dedicated threads for
> specific tasks anymore we
> 
> a) have a lot more event notifications
> 
> b) the probability of an event being reported when we want to receive
>    the next one is higher (i.e., the case where no syscall vs. syscall
>    makes a difference)
> 
> Yes, all this will require changes in the way programs are written, but we
> shouldn't limit the way we can write programs unnecessarily.  I think
> that given increasing discrepancies in relative speed/latency of the
> peripherals and the CPU this is one possible solution to keep the CPUs
> busy without resorting to a gazillion separate threads in each program.

Ok, let's do it the following way:
I present a new version of kevent with new syscalls and the previously
mentioned issues fixed; while people look at it, we can settle the
mapped buffer design.
Is that ok?

> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
> 



-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:33                       ` Evgeniy Polyakov
@ 2006-07-31 10:35                         ` Herbert Xu
  2006-07-31 10:50                           ` Evgeniy Polyakov
  2006-07-31 19:41                         ` Evgeniy Polyakov
                                           ` (9 subsequent siblings)
  10 siblings, 1 reply; 160+ messages in thread
From: Herbert Xu @ 2006-07-31 10:35 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: drepper, zach.brown, davem, linux-kernel, netdev

Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
>
>> - if there is space, report it in the ring buffer.  Yes, the buffer
>>   can be optional; then all events are reported by the system call.
> 
> That requires a copy, which can negate the saved syscall overhead.
> Do we really want that to be done?

Please note that we're talking about events here, not actual data.  So
only the event is being copied, which is presumably rather small compared
to the data.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:35                         ` Herbert Xu
@ 2006-07-31 10:50                           ` Evgeniy Polyakov
  2006-07-31 10:57                             ` David Miller
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-31 10:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: drepper, zach.brown, davem, linux-kernel, netdev

On Mon, Jul 31, 2006 at 08:35:55PM +1000, Herbert Xu (herbert@gondor.apana.org.au) wrote:
> Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> >
> >> - if there is space, report it in the ring buffer.  Yes, the buffer
> >>   can be optional; then all events are reported by the system call.
> > 
> > That requires a copy, which can negate the saved syscall overhead.
> > Do we really want that to be done?
> 
> Please note that we're talking about events here, not actual data.  So
> only the event is being copied, which is presumably rather small compared
> to the data.

At syscall time kevent copies 40 bytes for each event + 12 bytes of
header (number of events, timeout and command number). That's likely
two cache lines if only one event is reported.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:50                           ` Evgeniy Polyakov
@ 2006-07-31 10:57                             ` David Miller
  2006-07-31 10:59                               ` Herbert Xu
  0 siblings, 1 reply; 160+ messages in thread
From: David Miller @ 2006-07-31 10:57 UTC (permalink / raw)
  To: johnpol; +Cc: herbert, drepper, zach.brown, linux-kernel, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Mon, 31 Jul 2006 14:50:37 +0400

> At syscall time kevent copies 40 bytes for each event + 12 bytes of
> header (number of events, timeout and command number). That's likely
> two cache lines if only one event is reported.

Do you know how many cachelines are dirtied by system call
entry and exit on a typical system?

On sparc64 it is a minimum of 3 64-byte cachelines just to save and
restore the system-call-time cpu register state.  If the application
is deep in a call chain, register windows might spill, and each such
register window will dirty 2 more cachelines as they are dumped to the
stack.

I am not even talking about the other basic necessities of doing
a system call such as touching various task_struct and thread_info
state to check for pending signals etc.

System call overhead is non-trivial especially when you are using
it to move only a few small objects into and out of the kernel.

So I would say for up to 4 or 5 events, system call overhead alone
touches as many cache lines as the events themselves.
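
(Back-of-the-envelope: 3 dirtied 64-byte lines is 192 bytes for the
register state alone, and at ~40 bytes per event that already matches
the footprint of 4-5 events, before counting the task_struct and
thread_info touches.)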

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:57                             ` David Miller
@ 2006-07-31 10:59                               ` Herbert Xu
  2006-08-01  7:53                                 ` Ulrich Drepper
  0 siblings, 1 reply; 160+ messages in thread
From: Herbert Xu @ 2006-07-31 10:59 UTC (permalink / raw)
  To: David Miller; +Cc: johnpol, drepper, zach.brown, linux-kernel, netdev

On Mon, Jul 31, 2006 at 03:57:16AM -0700, David Miller wrote:
> 
> So I would say for up to 4 or 5 events, system call overhead alone
> touches as many cache lines as the events themselves.

Absolutely.

The other thing to consider is that events don't come from the hardware.
Events are written by the kernel.  So if user-space is just reading
the events that we've written, then there are no cache misses at all.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:33                       ` Evgeniy Polyakov
  2006-07-31 10:35                         ` Herbert Xu
@ 2006-07-31 19:41                         ` Evgeniy Polyakov
  2006-07-31 22:00                           ` David Miller
  2006-07-31 22:46                         ` Zach Brown
                                           ` (8 subsequent siblings)
  10 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-07-31 19:41 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Zach Brown, David Miller, linux-kernel, netdev

On Mon, Jul 31, 2006 at 02:33:22PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> Ok, let's do it in the following way:
> I will present a new version of kevent with new syscalls and fixes for the
> issues mentioned before; while people look at it we can finish the mapped
> buffer design.
> Is that ok?

Since kevents are never generated by the kernel, but only marked as
ready, the length of the main queue acts as flow control, so we can
create a mapped buffer with space equal to the main queue length
multiplied by the size of the structure copied to userspace, plus 16
bits for the start index of the kernel writing side, i.e. it will store
the offset where the oldest event was placed.
Since the queue length is a limiting factor, no new events can be added
when the queue is full; that means the buffer is full too and userspace
must read events. When the syscall to add a new kevent is called and
the offset provided there differs from what the kernel stored, that
means all events from the kernel up to the provided index have been
read and new events can be added. Thus we can even allow a read-only
mapping. The kernel's index is incremented modulo the queue length. If
a kevent is removed after it was marked as ready, its copy stays in the
mapped buffer, but a special flag can be assigned to show that the
kevent is no longer valid.
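
A minimal sketch of what such a read-only mapping could look like from
userspace (the 16-bit index width and queue-length bound are from the
description above; the names, and the placeholder ukevent layout, are
assumptions, not patch code):

	/* Placeholder for the real structure from the kevent headers. */
	struct ukevent { unsigned char raw[40]; };

	/* Hypothetical layout of the mapped area: the kernel's 16-bit
	 * write index followed by one slot per entry of the main queue. */
	struct kevent_ring {
		unsigned short kidx;      /* kernel write index, advances
	                                   * modulo the queue length */
		struct ukevent slots[];   /* queue_len copied events */
	};

	/* Userspace consumes from its own index up to kidx, and reports
	 * that index back on the next kevent-add syscall so the kernel
	 * knows how far the ring has been drained. */
	static inline int ring_has_events(const struct kevent_ring *r,
					  unsigned short uidx)
	{
		return r->kidx != uidx;
	}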


-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 19:41                         ` Evgeniy Polyakov
@ 2006-07-31 22:00                           ` David Miller
  2006-07-31 22:16                             ` Brent Cook
  2006-08-01  6:24                             ` Evgeniy Polyakov
  0 siblings, 2 replies; 160+ messages in thread
From: David Miller @ 2006-07-31 22:00 UTC (permalink / raw)
  To: johnpol; +Cc: drepper, zach.brown, linux-kernel, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Mon, 31 Jul 2006 23:41:43 +0400

> Since kevents are never generated by the kernel, but only marked as
> ready, the length of the main queue acts as flow control, so we can
> create a mapped buffer with space equal to the main queue length
> multiplied by the size of the structure copied to userspace, plus 16
> bits for the start index of the kernel writing side, i.e. it will store
> the offset where the oldest event was placed.
>
> Since the queue length is a limiting factor, no new events can be added
> when the queue is full; that means the buffer is full too and userspace
> must read events. When the syscall to add a new kevent is called and
> the offset provided there differs from what the kernel stored, that
> means all events from the kernel up to the provided index have been
> read and new events can be added. Thus we can even allow a read-only
> mapping. The kernel's index is incremented modulo the queue length. If
> a kevent is removed after it was marked as ready, its copy stays in the
> mapped buffer, but a special flag can be assigned to show that the
> kevent is no longer valid.

This sounds reasonable.

However we must be mindful that the thread of control trying to
add a new event might not be in a position to drain the queue
of pending events when the queue is full.  Usually he will be
trying to add an event in response to handling another event.

So we'd have cases like this, assume we start with a full event
queue:

	thread A		thread B

	dequeue event
	aha, new connection
	accept()
				register new kevent
				queue is now full again
	add kevent on new
	connection

At this point thread A doesn't have very many options when the kevent
add fails.  You cannot force this thread to read more events, since he
may not be in a state where he is easily able to do so.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 22:00                           ` David Miller
@ 2006-07-31 22:16                             ` Brent Cook
  2006-07-31 22:20                               ` David Miller
  2006-08-01  6:24                             ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Brent Cook @ 2006-07-31 22:16 UTC (permalink / raw)
  To: David Miller; +Cc: johnpol, drepper, zach.brown, linux-kernel, netdev

On Monday 31 July 2006 17:00, David Miller wrote:
>
> So we'd have cases like this, assume we start with a full event
> queue:
>
> 	thread A		thread B
>
> 	dequeue event
> 	aha, new connection
> 	accept()
> 				register new kevent
> 				queue is now full again
> 	add kevent on new
> 	connection
>
> At this point thread A doesn't have very many options when the kevent
> add fails.  You cannot force this thread to read more events, since he
> may not be in a state where he is easily able to do so.

There has to be some thread that is responsible for reading events. Perhaps a 
reasonable thing for a blocked thread that cannot process events to do is to 
yield to one that can?


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 22:16                             ` Brent Cook
@ 2006-07-31 22:20                               ` David Miller
  0 siblings, 0 replies; 160+ messages in thread
From: David Miller @ 2006-07-31 22:20 UTC (permalink / raw)
  To: bcook; +Cc: johnpol, drepper, zach.brown, linux-kernel, netdev

From: Brent Cook <bcook@bpointsys.com>
Date: Mon, 31 Jul 2006 17:16:48 -0500

> There has to be some thread that is responsible for reading
> events. Perhaps a reasonable thing for a blocked thread that cannot
> process events to do is to yield to one that can?

The reason one decentralizes event processing into threads is so that
once they are tasked to process some event they need not be concerned
with event state.

They are designed to process their event through to the end, then
return to the top level and say "any more work for me?"

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:33                       ` Evgeniy Polyakov
  2006-07-31 10:35                         ` Herbert Xu
  2006-07-31 19:41                         ` Evgeniy Polyakov
@ 2006-07-31 22:46                         ` Zach Brown
  2006-08-01  9:34                         ` [take2 0/4] kevent: introduction Evgeniy Polyakov
                                           ` (7 subsequent siblings)
  10 siblings, 0 replies; 160+ messages in thread
From: Zach Brown @ 2006-07-31 22:46 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Ulrich Drepper, David Miller, linux-kernel, netdev


> Ok, let's do it in the following way:
> I will present a new version of kevent with new syscalls and fixes for the
> issues mentioned before; while people look at it we can finish the mapped
> buffer design.
> Is that ok?

Yeah, that sounds good.  I'm looking forward to seeing the next set of
patches :).

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-27 19:18   ` Zach Brown
  2006-07-27 20:06     ` Evgeniy Polyakov
  2006-07-27 20:58     ` Benjamin LaHaise
@ 2006-08-01  1:02     ` David Miller
  2006-08-01 17:02       ` Zach Brown
  2 siblings, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-01  1:02 UTC (permalink / raw)
  To: zach.brown; +Cc: johnpol, linux-kernel, netdev

From: Zach Brown <zach.brown@oracle.com>
Date: Thu, 27 Jul 2006 12:18:42 -0700

[ I kept this thread around in my inbox because I wanted to give it
  some deep thought, so sorry for replying to old bits... ]

> So as the kernel generates events in the ring, it only produces an event
> if the ownership field says that userspace has consumed it, and in doing
> so it sets the ownership field to tell userspace that an event is
> waiting.  Userspace and the kernel now each follow their index around
> the ring as the ownership field lets them produce or consume the event
> at their index.  Can someone tell me if the cache coherence costs of
> this are extreme?  I'm hoping they're not.

No need for an owner field, we can use something like a VJ
netchannel data structure for this.  The kernel only writes to the
producer index and the user only writes to the consumer index.

> So, great, glibc can now find pending events very quickly if they're
> waiting in the ring and can fall back to the collection syscall if it
> wants to wait and the ring is empty.  If it consumes events via the
> syscall it increases its ring index by the number the syscall returned.

If we do a ring buffer, I do not think that events should be obtainable
via a syscall at all.  Rather, I think this system call should be
purely "sleep until ring is not empty".

This is actually reasonably simple stuff to implement as Evgeniy
has tried to explain.

Events in kevent live on a ready list once they have triggered.
Existence on that list determines the state, and I think this design,
btw, invalidates some of the arguments against using netlink that
Ulrich mentions in his paper.  If netlink socket queuing fails, well
then the kevent stays on the ready list and that is all, until the
kevent can be successfully published to the user.

I am not advocating netlink at all for this, as the ring buffer idea
is much better.

The ring buffer size, as Evgeniy also tried to describe, is bounded
purely by the number of registered events.  So the event loop of an
application might look something like this:

	struct ukevent cur_event;
	struct timeval timeo;

	setup_timeout(&timeo);
	for (;;) {
		int err;
		while(!(err = ukevent_dequeue(evt_fd, evt_ring,
					      &cur_event, &timeo))) {
			struct my_event_object *o =
				event_to_object(&cur_event);
			o->dispatch(o, &cur_event);
			setup_timeout(&timeo);
		}
		if (err == -ETIMEDOUT)
			timeout_processing();
		else
			event_error_processing(err);
	}

ukevent_dequeue() is perhaps some glibc-implemented routine which does
something like:

	int err;

	for (;;) {
		if (!evt_ring_empty(evt_ring)) {
			struct ukevent *p = evt_ring_consume(evt_ring);
			memcpy(event_p, p, sizeof(struct ukevent));
			return 0;
		}
		err = kevent_wait(evt_fd, timeo_p);
		if (err < 0)
			break;
	}
	return err;

These are just some stupid ideas... we could also choose to expose the
ring buffer layout directly to the user event loop and let it perform the
dequeue operation and the kevent_wait() calls directly.  I don't see why
we shouldn't allow that.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-28  5:23         ` Evgeniy Polyakov
  2006-07-28 18:33           ` Zach Brown
@ 2006-08-01  1:05           ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: David Miller @ 2006-08-01  1:05 UTC (permalink / raw)
  To: johnpol; +Cc: zach.brown, linux-kernel, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Fri, 28 Jul 2006 09:23:12 +0400

> I completely agree that the existing kevent interface is not the best, so
> I'm open to any suggestions.
> Should kevent creation/removal/modification be separated too?

I do not think so; the object for these 3 operations is the same,
so there are no typing issues.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 22:00                           ` David Miller
  2006-07-31 22:16                             ` Brent Cook
@ 2006-08-01  6:24                             ` Evgeniy Polyakov
  1 sibling, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01  6:24 UTC (permalink / raw)
  To: David Miller; +Cc: drepper, zach.brown, linux-kernel, netdev

On Mon, Jul 31, 2006 at 03:00:28PM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Mon, 31 Jul 2006 23:41:43 +0400
> 
> > Since kevents are never generated by the kernel, but only marked as
> > ready, the length of the main queue acts as flow control, so we can
> > create a mapped buffer with space equal to the main queue length
> > multiplied by the size of the structure copied to userspace, plus 16
> > bits for the start index of the kernel writing side, i.e. it will store
> > the offset where the oldest event was placed.
> >
> > Since the queue length is a limiting factor, no new events can be added
> > when the queue is full; that means the buffer is full too and userspace
> > must read events. When the syscall to add a new kevent is called and
> > the offset provided there differs from what the kernel stored, that
> > means all events from the kernel up to the provided index have been
> > read and new events can be added. Thus we can even allow a read-only
> > mapping. The kernel's index is incremented modulo the queue length. If
> > a kevent is removed after it was marked as ready, its copy stays in the
> > mapped buffer, but a special flag can be assigned to show that the
> > kevent is no longer valid.
> 
> This sounds reasonable.
> 
> However we must be mindful that the thread of control trying to
> add a new event might not be in a position to drain the queue
> of pending events when the queue is full.  Usually he will be
> trying to add an event in response to handling another event.
> 
> So we'd have cases like this, assume we start with a full event
> queue:
> 
> 	thread A		thread B
> 
> 	dequeue event
> 	aha, new connection
> 	accept()
> 				register new kevent
> 				queue is now full again
> 	add kevent on new
> 	connection
> 
> At this point thread A doesn't have very many options when the kevent
> add fails.  You cannot force this thread to read more events, since he
> may not be in a state where he is easily able to do so.

By default, kevents are not removed from the queue, so accept events
will stay in the queue and thread B will fail to register a new kevent.
To remove a kevent from the queue, the user should either set the
one-shot flag or do it with a special command.
So if we are in a position where the queue is full and none of the
events are one-shot, the control thread must think about what it is
doing and remove some of them (and next time add them with the one-shot
flag).
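
A rough sketch of the one-shot registration mentioned above (the struct
ukevent fields, KEVENT_SOCKET_ACCEPT and KEVENT_REQ_ONESHOT match the
posted patches; KEVENT_SOCKET, KEVENT_CTL_ADD and the kevent_ctl()
wrapper signature are assumptions):

	#include <string.h>

	/* Assumed userspace wrapper over the kevent_ctl syscall. */
	extern int kevent_ctl(int fd, unsigned cmd, unsigned num,
			      struct ukevent *uk);

	/* Register an accept notification that is removed from the queue
	 * automatically once it has been reported. */
	static int add_oneshot_accept(int ctl_fd, int listen_fd)
	{
		struct ukevent uk;

		memset(&uk, 0, sizeof(uk));
		uk.type      = KEVENT_SOCKET;        /* assumed constant */
		uk.event     = KEVENT_SOCKET_ACCEPT;
		uk.req_flags = KEVENT_REQ_ONESHOT;   /* auto-remove */
		uk.id.raw[0] = listen_fd;

		return kevent_ctl(ctl_fd, KEVENT_CTL_ADD, 1, &uk);
	}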

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-07-31 10:59                               ` Herbert Xu
@ 2006-08-01  7:53                                 ` Ulrich Drepper
  2006-08-01  7:58                                   ` David Miller
  0 siblings, 1 reply; 160+ messages in thread
From: Ulrich Drepper @ 2006-08-01  7:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, johnpol, zach.brown, linux-kernel, netdev

Herbert Xu wrote:
> The other thing to consider is that events don't come from the hardware.
> Events are written by the kernel.  So if user-space is just reading
> the events that we've written, then there are no cache misses at all.

Not quite true.  The ring buffer can be written to from another
processor.  The kernel thread responsible for generating the event
(receiving data from the network or disk, an expired timer) can run
independently on another CPU.

This is the case to keep in mind here.  I thought Zach and the others
involved in the discussions in Ottawa said this has been shown to be a
problem, and that a ring buffer implementation with something other than
simple front and back pointers is preferable.
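
For reference, the per-slot alternative alluded to here (roughly the
ownership-field ring Zach described earlier) might look like this; it is
purely an illustrative sketch, not from any posted patch:

	/* Placeholder for the real structure from the kevent headers. */
	struct ukevent { unsigned char raw[40]; };

	/* Each slot carries its own ownership word, so producer and
	 * consumer never contend on shared front/back pointers; they only
	 * touch the slot itself.  Memory barriers omitted for brevity. */
	struct owned_slot {
		volatile unsigned int owner;   /* 0: free for the kernel,
		                                * 1: ready for userspace */
		struct ukevent ev;
	};

	/* Kernel, at its private index:
	 *   if (slot->owner == 0) { slot->ev = ev; slot->owner = 1; }
	 * Userspace, at its private index:
	 *   if (slot->owner == 1) { consume(&slot->ev); slot->owner = 0; }
	 */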

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-08-01  7:53                                 ` Ulrich Drepper
@ 2006-08-01  7:58                                   ` David Miller
  0 siblings, 0 replies; 160+ messages in thread
From: David Miller @ 2006-08-01  7:58 UTC (permalink / raw)
  To: drepper; +Cc: herbert, johnpol, zach.brown, linux-kernel, netdev

From: Ulrich Drepper <drepper@redhat.com>
Date: Tue, 01 Aug 2006 00:53:10 -0700

> This is the case to keep in mind here.  I thought Zach and the others
> involved in the discussions in Ottawa said this has been shown to be a
> problem, and that a ring buffer implementation with something other than
> simple front and back pointers is preferable.

This is part of why I suggested a VJ-style channel data
structure.  At worst, the cachelines for the entries get
into shared-modified state when the remote userland cpu
reads the slot.
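
The index discipline being suggested, as an illustrative sketch (all the
names here are assumptions, not patch code):

	/* Placeholder for the real structure from the kevent headers. */
	struct ukevent { unsigned char raw[40]; };

	/* VJ-netchannel-style single-producer/single-consumer ring: the
	 * kernel writes only 'prod', userspace writes only 'cons', so each
	 * index cacheline has exactly one writer.  Barriers omitted. */
	struct spsc_ring {
		volatile unsigned int prod;   /* kernel-written only */
		volatile unsigned int cons;   /* user-written only */
		unsigned int mask;            /* size - 1, size a power of two */
		struct ukevent *slots;
	};

	/* Userspace consumer side: */
	static int ring_consume(struct spsc_ring *r, struct ukevent *ev)
	{
		if (r->cons == r->prod)
			return 0;                       /* ring empty */
		*ev = r->slots[r->cons & r->mask];      /* kernel-written slot */
		r->cons++;               /* only the consumer writes this */
		return 1;
	}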

^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take2 0/4] kevent: introduction.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (2 preceding siblings ...)
  2006-07-31 22:46                         ` Zach Brown
@ 2006-08-01  9:34                         ` Evgeniy Polyakov
  2006-08-01  9:34                           ` [take2 1/4] kevent: core files Evgeniy Polyakov
  2006-08-03  9:45                         ` [take3 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
                                           ` (6 subsequent siblings)
  10 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01  9:34 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


I am sending this patchset for comments and review. It still contains the
AIO and aio_sendfile() implementation on top of the get_block() abstraction,
which it was decided to postpone for a while (it is simpler right now to
generate the patchset as a whole; when kevent is ready for merge, I will
generate the patchset without the AIO stuff).
It does not contain the mapped buffer implementation, since its design is
not 100% complete; I will present that implementation in the third patchset.

Changes from previous patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl control
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags);
	a minimal usage sketch follows below
 - use old syscall kevent_ctl for creation/removal, modification and initial kevent
	initialization
 - use mutexes instead of semaphores
 - added a file descriptor check that returns an error if the provided descriptor
	does not match the kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.
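
A minimal userspace sketch of the new collection syscall (the wrapper is
an assumption; the signature is the one listed above, and
__NR_kevent_get_events is whatever number the arch syscall-table patches
assign):

	#include <unistd.h>
	#include <sys/syscall.h>

	/* 'buf' points to an array of up to 'max_nr' struct ukevent from
	 * the kevent headers; the call blocks until at least 'min_nr'
	 * events are ready or the timeout expires. */
	static long kevent_get_events(int fd, unsigned int min_nr,
				      unsigned int max_nr,
				      unsigned int timeout,
				      void *buf, unsigned flags)
	{
		return syscall(__NR_kevent_get_events, fd, min_nr, max_nr,
			       timeout, buf, flags);
	}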

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take2 4/4] kevent: poll/select() notifications. Timer notifications.
  2006-08-01  9:34                             ` [take2 2/4] kevent: network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-01  9:34                               ` Evgeniy Polyakov
  2006-08-01  9:34                                 ` [take2 3/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01  9:34 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


This patch includes generic poll/select and timer notifications.

kevent_poll works similar to epoll and has the same issues (the callback
is invoked not from the internal state machine of the caller, but through
a process wakeup).

Timer notifications can be used for fine-grained per-process time
management, since interval timers are very inconvenient to use, and they
are limited.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..4950e7c
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,223 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	unsigned long flags;
+	u32 revents, event;
+
+	revents = file->f_op->poll(file, NULL);
+	spin_lock_irqsave(&k->lock, flags);
+	event = k->event.event;
+	spin_unlock_irqrestore(&k->lock, flags);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+int kevent_init_poll(struct kevent *k)
+{
+	if (!kevent_poll_container_cache || !kevent_poll_priv_cache)
+		return -ENOMEM;
+
+	k->enqueue = &kevent_poll_enqueue;
+	k->dequeue = &kevent_poll_dequeue;
+	k->callback = &kevent_poll_callback;
+	return 0;
+}
+
+
+static int __init kevent_poll_sys_init(void)
+{
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..53d3bdf
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,112 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+int kevent_init_timer(struct kevent *k)
+{
+	k->enqueue = &kevent_timer_enqueue;
+	k->dequeue = &kevent_timer_dequeue;
+	k->callback = &kevent_timer_callback;
+	return 0;
+}
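
For context, a rough sketch of how userspace might arm such a timer
kevent (KEVENT_TIMER, KEVENT_CTL_ADD and the kevent_ctl() wrapper are
assumptions; the period in id.raw[0] is what kevent_timer_enqueue()
above reads):

	#include <string.h>

	/* Assumed userspace wrapper over the kevent_ctl syscall. */
	extern int kevent_ctl(int fd, unsigned cmd, unsigned num,
			      struct ukevent *uk);

	/* Request a periodic notification every 100 msecs. */
	static int add_timer_event(int ctl_fd)
	{
		struct ukevent uk;

		memset(&uk, 0, sizeof(uk));
		uk.type      = KEVENT_TIMER;   /* assumed constant */
		uk.id.raw[0] = 100;            /* period, msecs */

		return kevent_ctl(ctl_fd, KEVENT_CTL_ADD, 1, &uk);
	}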



^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take2 2/4] kevent: network AIO, socket notifications.
  2006-08-01  9:34                           ` [take2 1/4] kevent: core files Evgeniy Polyakov
@ 2006-08-01  9:34                             ` Evgeniy Polyakov
  2006-08-01  9:34                               ` [take2 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  2006-08-01 13:46                             ` [take2 1/4] kevent: core files James Morris
  2006-08-01 23:56                             ` Zach Brown
  2 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01  9:34 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


This patchset includes socket notifications and network asynchronous IO.
Network AIO is based on kevent and works as a usual kevent storage on top
of the inode.
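
For illustration, the new syscalls would be driven from userspace roughly
like this (the wrapper is an assumption; the argument order matches
sys_aio_recv() below, and __NR_aio_recv is whatever number the arch
syscall-table patches assign):

	#include <unistd.h>
	#include <sys/syscall.h>

	static long aio_recv(int ctl_fd, int s, void *buf, size_t size,
			     unsigned flags)
	{
		return syscall(__NR_aio_recv, ctl_fd, s, buf, size, flags);
	}

	/* Queue an asynchronous receive on socket 's'; SO_ASYNC_SOCK must
	 * be set on the socket, and completion is reported through the
	 * kevent queue behind 'ctl_fd'. */
	static long queue_async_recv(int ctl_fd, int s, void *buf,
				     size_t size)
	{
		return aio_recv(ctl_fd, s, buf, size, 0);
	}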

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 5755d57..d4d2f5c 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC		31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */

diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index b467026..fc2b49d 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC             31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4307e76..9267873 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1283,6 +1283,8 @@ extern struct sk_buff *skb_recv_datagram
 					 int noblock, int *err);
 extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
 				     struct poll_table_struct *wait);
+extern int	       skb_copy_datagram(const struct sk_buff *from, 
+					 int offset, void *dst, int size);
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
 					       int offset, struct iovec *to,
 					       int size);
diff --git a/include/net/sock.h b/include/net/sock.h
index 324b3ea..c43a153 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@ #include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -391,6 +392,8 @@ enum sock_flags {
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_ASYNC,
+	SOCK_ASYNC_INUSE,
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -450,6 +453,21 @@ static inline int sk_stream_memory_free(
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +495,7 @@ static inline void sk_add_backlog(struct
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -548,6 +567,12 @@ struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk, 
 						struct sk_buff *skb);
+	
+	int			(*async_recv) (struct sock *sk, 
+						void *dst, size_t size);
+	int			(*async_send) (struct sock *sk, 
+						struct page **pages, unsigned int poffset, 
+						size_t size);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
@@ -679,21 +704,6 @@ static inline struct kiocb *siocb_to_kio
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0720bdd..5a1899b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -364,6 +364,8 @@ extern int			compat_tcp_setsockopt(struc
 					int level, int optname,
 					char __user *optval, int optlen);
 extern void			tcp_set_keepalive(struct sock *sk, int val);
+extern int			tcp_async_recv(struct sock *sk, void *dst, size_t size);
+extern int			tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t size);
 extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg,
 					    size_t len, int nonblock, 
@@ -857,6 +859,7 @@ static inline int tcp_prequeue(struct so
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
new file mode 100644
index 0000000..1c71021
--- /dev/null
+++ b/kernel/kevent/kevent_naio.c
@@ -0,0 +1,239 @@
+/*
+ * 	kevent_naio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+static int kevent_naio_enqueue(struct kevent *k);
+static int kevent_naio_dequeue(struct kevent *k);
+static int kevent_naio_callback(struct kevent *k);
+
+static int kevent_naio_setup_aio(int ctl_fd, int s, void __user *buf, 
+		size_t size, u32 event)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int err, fput_needed;
+	struct ukevent uk;
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	memset(&uk, 0, sizeof(struct ukevent));
+	uk.type = KEVENT_NAIO;
+	uk.ptr = buf;
+	uk.req_flags = KEVENT_REQ_ONESHOT;
+	uk.event = event;
+	uk.id.raw[0] = s;
+	uk.id.raw[1] = size;
+
+	err = kevent_user_add_ukevent(&uk, u);
+
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_RECV);
+}
+
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_SEND);
+}
+
+static int kevent_naio_enqueue(struct kevent *k)
+{
+	int err, i;
+	struct page **page;
+	void *addr;
+	unsigned int size = k->event.id.raw[1];
+	int num = size/PAGE_SIZE;
+	struct file *file;
+	struct sock *sk = NULL;
+	int fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+
+	sk = SOCKET_I(file->f_dentry->d_inode)->sk;
+
+	err = -ESOCKTNOSUPPORT;
+	if (!sk || !sk->sk_prot->async_recv || !sk->sk_prot->async_send || 
+		!sock_flag(sk, SOCK_ASYNC))
+		goto err_out_fput;
+	
+	addr = k->event.ptr;
+	if (((unsigned long)addr & PAGE_MASK) != (unsigned long)addr)
+		num++;
+
+	page = kmalloc(sizeof(struct page *) * num, GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, (unsigned long)addr, 
+			num, 1, 0, page, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err <= 0)
+		goto err_out_free;
+	num = err;
+
+	k->event.ret_data[0] = num;
+	k->event.ret_data[1] = offset_in_page(k->event.ptr);
+	k->priv = page;
+
+	sk->sk_allocation = GFP_ATOMIC;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	err = kevent_socket_enqueue(k);
+	spin_unlock_bh(&sk->sk_lock.slock);
+	if (err)
+		goto err_out_put_pages;
+
+	fput_light(file, fput_needed);
+
+	return err;
+
+err_out_put_pages:
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+err_out_free:
+	kfree(page);
+err_out_fput:
+	fput_light(file, fput_needed);
+
+	return err;
+}
+
+static int kevent_naio_dequeue(struct kevent *k)
+{
+	int err, i, num;
+	struct page **page = k->priv;
+
+	num = k->event.ret_data[0];
+
+	err = kevent_socket_dequeue(k);
+
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+
+	kfree(k->priv);
+	k->priv = NULL;
+
+	return err;
+}
+
+static int kevent_naio_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	unsigned int size = k->event.id.raw[1];
+	unsigned int off = k->event.ret_data[1];
+	struct page **pages = k->priv, *page;
+	int ready = 0, num = off/PAGE_SIZE, err = 0, send = 0;
+	void *ptr, *optr;
+	unsigned int len;
+
+	if (!sock_flag(sk, SOCK_ASYNC))
+		return -1;
+
+	if (k->event.event & KEVENT_SOCKET_SEND)
+		send = 1;
+	else if (!(k->event.event & KEVENT_SOCKET_RECV))
+		return -EINVAL;
+
+	/*
+	 * sk_prot->async_*() can return either number of bytes processed,
+	 * or negative error value, or zero if socket is closed.
+	 */
+
+	if (!send) {
+		page = pages[num];
+
+		optr = ptr = kmap_atomic(page, KM_IRQ0);
+		if (!ptr)
+			return -ENOMEM;
+
+		ptr += off % PAGE_SIZE;
+		len = min_t(unsigned int, PAGE_SIZE - (ptr - optr), size);
+
+		err = sk->sk_prot->async_recv(sk, ptr, len);
+
+		kunmap_atomic(optr, KM_IRQ0);
+	} else {
+		len = size;
+		err = sk->sk_prot->async_send(sk, pages, off, size);
+	}
+
+	if (err > 0) {
+		num++;
+		size -= err;
+		off += err;
+	}
+
+	k->event.ret_data[1] = off;
+	k->event.id.raw[1] = size;
+
+	if (err == 0 || (err < 0 && err != -EAGAIN))
+		ready = -1;
+
+	if (!size)
+		ready = 1;
+#if 0
+	printk("%s: sk=%p, k=%p, size=%4u, off=%4u, err=%3d, ready=%1d.\n",
+			__func__, sk, k, size, off, err, ready);
+#endif
+
+	return ready;
+}
+
+int kevent_init_naio(struct kevent *k)
+{
+	k->enqueue = &kevent_naio_enqueue;
+	k->dequeue = &kevent_naio_dequeue;
+	k->callback = &kevent_naio_callback;
+	return 0;
+}
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..c230aaa
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,125 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	int rmem;
+	
+	if (k->event.event & KEVENT_SOCKET_RECV) {
+		int ret = 0;
+		
+		if ((rmem = atomic_read(&sk->sk_rmem_alloc)) > 0 || 
+				!skb_queue_empty(&sk->sk_receive_queue))
+			ret = 1;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			ret = 1;
+		if (ret)
+			return ret;
+	}
+	if ((k->event.event & KEVENT_SOCKET_ACCEPT) && 
+		(!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) || 
+		 	reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue))) {
+		k->event.ret_data[1] = reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err, fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	err = k->callback(k);
+	if (err)
+		goto err_out_dequeue;
+
+	fput_light(file, fput_needed);
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+int kevent_init_socket(struct kevent *k)
+{
+	k->enqueue = &kevent_socket_enqueue;
+	k->dequeue = &kevent_socket_dequeue;
+	k->callback = &kevent_socket_callback;
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket && !test_and_set_bit(SOCK_ASYNC_INUSE, &sk->sk_flags)) {
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+		sock_reset_flag(sk, SOCK_ASYNC_INUSE);
+	}
+}
diff --git a/net/core/datagram.c b/net/core/datagram.c
index aecddcc..493245b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -236,6 +236,60 @@ void skb_kill_datagram(struct sock *sk, 
 EXPORT_SYMBOL(skb_kill_datagram);
 
 /**
+ *	skb_copy_datagram - Copy a datagram.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: pointer to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram(const struct sk_buff *skb, int offset,
+			    void *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset, o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy(to, p, copy);
+				kunmap(page);
+			}
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+	return -EFAULT;
+}
+
+/**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from
@@ -530,6 +584,7 @@ unsigned int datagram_poll(struct file *
 
 EXPORT_SYMBOL(datagram_poll);
 EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram);
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 EXPORT_SYMBOL(skb_free_datagram);
 EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/sock.c b/net/core/sock.c
index 51fcfbc..9922373 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -617,6 +617,16 @@ #endif
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+#ifdef CONFIG_KEVENT_SOCKET
+		case SO_ASYNC_SOCK:
+			spin_lock_bh(&sk->sk_lock.slock);
+			if (valbool)
+				sock_set_flag(sk, SOCK_ASYNC);
+			else
+				sock_reset_flag(sk, SOCK_ASYNC);
+			spin_unlock_bh(&sk->sk_lock.slock);
+			break;
+#endif
 
 		case SO_PASSSEC:
 			if (valbool)
@@ -1406,6 +1416,7 @@ static void sock_def_wakeup(struct sock 
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1415,6 +1426,7 @@ static void sock_def_error_report(struct
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1424,6 +1436,7 @@ static void sock_def_readable(struct soc
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1443,6 +1456,7 @@ static void sock_def_write_space(struct 
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1559,8 +1573,10 @@ void fastcall release_sock(struct sock *
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f6a2d92..e878a41 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -206,6 +206,7 @@
  *					lingertime == 0 (RFC 793 ABORT Call)
  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
  *					csum_and_copy_from_user() if possible.
+ *	Evgeniy Polyakov	:	Network asynchronous IO.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -1085,6 +1086,301 @@ int tcp_read_sock(struct sock *sk, read_
 }
 
 /*
+ * Must be called with locked sock.
+ */
+int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+	int err = -EAGAIN;
+	ssize_t copied;
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, 1);
+	size_goal = tp->xmit_size_goal;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || sock_flag(sk, SOCK_DONE) ||
+			(sk->sk_state == TCP_CLOSE) || (atomic_read(&sk->sk_refcnt) == 1))
+		goto do_error;
+
+	while (len > 0) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		int size = min_t(size_t, len, PAGE_SIZE - offset);
+
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_pskb(sk, 0, 0,
+						   sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, tp, skb);
+			copy = size_goal;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (!sk_stream_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+		
+		if (can_coalesce) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk->sk_forward_alloc -= copy;
+		skb->ip_summed = CHECKSUM_HW;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(len -= copy))
+			goto out;
+
+		if (skb->len < mss_now)
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == sk->sk_send_head)
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		if (copied)
+			tcp_push(sk, tp, 0, mss_now, TCP_NAGLE_PUSH);
+
+		err = -EAGAIN;
+		goto do_error;
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, 0, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, 0, err);
+}
+
+/*
+ * Must be called with locked sock.
+ */
+int tcp_async_recv(struct sock *sk, void *dst, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	int copied_early = 0;
+
+	TCP_CHECK_TIMER(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	seq = &tp->copied_seq;
+
+	target = sock_rcvlowat(sk, 0, len);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		do {
+			if (!skb)
+				break;
+
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+				printk(KERN_INFO "async_recv bug: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+				break;
+			}
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (skb->h.th->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (skb->h.th->fin)
+				goto found_fin_ok;
+			skb = skb->next;
+		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+		if (copied)
+			break;
+
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+
+		if (sk->sk_state == TCP_CLOSE) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+				break;
+			}
+			break;
+		}
+
+		copied = -EAGAIN;
+		break;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+#ifdef CONFIG_NET_DMA
+		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+			tp->ucopy.dma_chan = get_softnet_dma();
+
+		if (tp->ucopy.dma_chan) {
+			tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+				tp->ucopy.dma_chan, skb, offset,
+				msg->msg_iov, used,
+				tp->ucopy.pinned_list);
+
+			if (tp->ucopy.dma_cookie < 0) {
+
+				printk(KERN_ALERT "dma_cookie < 0\n");
+
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+			if ((offset + used) == skb->len)
+				copied_early = 1;
+
+		} else
+#endif
+		{
+			err = skb_copy_datagram(skb, offset, dst, used);
+			if (err) {
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+		dst += used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+		if (used + offset < skb->len)
+			continue;
+
+		if (skb->h.th->fin)
+			goto found_fin_ok;
+		sk_eat_skb(sk, skb, copied_early);
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		sk_eat_skb(sk, skb, copied_early);
+		break;
+	} while (len > 0);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
+	return copied;
+
+out:
+	TCP_CHECK_TIMER(sk);
+	return err;
+}
+
+/*
  *	This routine copies from a sock struct into the user buffer.
  *
  *	Technical note: in 2.3 we work on _locked_ socket, so that
@@ -2342,6 +2638,8 @@ EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_async_recv);
+EXPORT_SYMBOL(tcp_async_send);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 738dad9..f70d045 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3112,6 +3112,7 @@ static void tcp_ofo_queue(struct sock *s
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
@@ -3955,7 +3956,8 @@ int tcp_rcv_established(struct sock *sk,
 			int copied_early = 0;
 
 			if (tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len) {
+			    len - tcp_header_len <= tp->ucopy.len &&
+			    !sock_async(sk)) {
 #ifdef CONFIG_NET_DMA
 				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
 					copied_early = 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6f39e8..ae4f23c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@ #include <linux/cache.h>
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -868,6 +869,7 @@ #endif
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
@@ -1108,24 +1110,30 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock_nested(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock_nested(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-		struct tcp_sock *tp = tcp_sk(sk);
-		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
-		if (tp->ucopy.dma_chan)
-			ret = tcp_v4_do_rcv(sk, skb);
-		else
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+				tp->ucopy.dma_chan = get_softnet_dma();
+			if (tp->ucopy.dma_chan)
+				ret = tcp_v4_do_rcv(sk, skb);
+			else
 #endif
-		{
-			if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			{
+				if (!tcp_prequeue(sk, skb))
+					ret = tcp_v4_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 
@@ -1849,6 +1857,8 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= tcp_v4_hash,
 	.unhash			= tcp_unhash,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 923989d..a5d3ac8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1230,22 +1230,28 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-                struct tcp_sock *tp = tcp_sk(sk);
-                if (tp->ucopy.dma_chan)
-                        ret = tcp_v6_do_rcv(sk, skb);
-                else
-#endif
-		{
-			if (!tcp_prequeue(sk, skb))
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (tp->ucopy.dma_chan)
 				ret = tcp_v6_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			else
+#endif
+			{
+				if (!tcp_prequeue(sk, skb))
+					ret = tcp_v6_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 	return ret ? -1 : 0;
@@ -1596,6 +1602,8 @@ struct proto tcpv6_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.hash			= tcp_v6_hash,
 	.unhash			= tcp_unhash,



* [take2 1/4] kevent: core files.
  2006-08-01  9:34                         ` [take2 0/4] kevent: introduction Evgeniy Polyakov
@ 2006-08-01  9:34                           ` Evgeniy Polyakov
  2006-08-01  9:34                             ` [take2 2/4] kevent: network AIO, socket notifications Evgeniy Polyakov
                                               ` (2 more replies)
  0 siblings, 3 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01  9:34 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

It might also include parts from other subsystems (like network-related
syscalls), so it is possible that it will not compile without the other
patches applied.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
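
For illustration, here is a minimal userspace sketch of this interface.
It is an untested sketch, not part of the patch: the syscall numbers are
the i386 ones added below, the structures mirror the include/linux/kevent.h
declarations in this patch, and the timer id semantics (period in
milliseconds in id.raw[0]) are purely an assumption, since timer
notifications live in a separate patch of this series.

#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define __NR_kevent_get_events	320	/* i386, from this patch */
#define __NR_kevent_ctl		321	/* i386, from this patch */

#define KEVENT_CTL_ADD		0
#define KEVENT_CTL_INIT		3

#define KEVENT_TIMER		2
#define KEVENT_TIMER_FIRED	0x1
#define KEVENT_REQ_ONESHOT	0x1

struct kevent_id {
	unsigned int		raw[2];
};

struct ukevent {
	struct kevent_id	id;
	unsigned int		type, event, req_flags, ret_flags;
	unsigned int		ret_data[2];
	union {
		unsigned int	user[2];
		void		*ptr;
	};
};

struct kevent_user_control {
	unsigned int		cmd, num, timeout;
};

int main(void)
{
	/* KEVENT_CTL_ADD expects the control header immediately followed
	 * by the ukevent array in one contiguous userspace buffer. */
	struct {
		struct kevent_user_control ctl;
		struct ukevent uk;
	} req;
	struct kevent_user_control init = { .cmd = KEVENT_CTL_INIT };
	struct ukevent ready;
	int fd, num;

	/* KEVENT_CTL_INIT ignores the descriptor argument and returns a
	 * fresh kevent file descriptor. */
	fd = syscall(__NR_kevent_ctl, -1, &init);
	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.ctl.cmd = KEVENT_CTL_ADD;
	req.ctl.num = 1;
	req.uk.type = KEVENT_TIMER;
	req.uk.event = KEVENT_TIMER_FIRED;
	req.uk.req_flags = KEVENT_REQ_ONESHOT;
	req.uk.id.raw[0] = 1000;	/* assumed: timer period in msec */

	if (syscall(__NR_kevent_ctl, fd, &req) < 0)
		return 1;

	/* Block for up to 2000 msec waiting for at least one ready event. */
	num = syscall(__NR_kevent_get_events, fd, 1, 1, 2000, &ready, 0);
	if (num > 0)
		printf("event: type=%u ret_flags=0x%x\n",
		       ready.type, ready.ret_flags);
	return 0;
}

Note that on KEVENT_CTL_ADD the kernel copies immediately completed or
broken events back to the start of the ukevent array and updates ctl.num,
so a real caller would inspect those entries before waiting.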


diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..0af988a 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,7 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_aio_recv
+	.long sys_aio_send
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..e157ad4 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,8 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_aio_recv
+	.quad sys_aio_send
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		

diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..a76e50d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,14 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_aio_recv		318
+#define __NR_aio_send		319
+#define __NR_kevent_get_events	320
+#define __NR_kevent_ctl		321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 322
 
 /*
  * user-visible error numbers are in the range -1 - -128: see

diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..9e61299 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,18 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_aio_recv		280
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send		281
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_kevent_get_events	282
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		283
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..6c36f3f
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,259 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must exactly match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+struct kevent_user_control
+{
+	unsigned int		cmd;			/* Control command, e.g. KEVENT_CTL_ADD, KEVENT_CTL_REMOVE... */
+	unsigned int		num;			/* Number of ukevents this structure controls. */
+	unsigned int		timeout;		/* Timeout in milliseconds waiting for "num" events to become ready. */
+};
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/kevent_storage.h>
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+struct kevent
+{
+	struct ukevent		event;
+	spinlock_t		lock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	kevent_callback_t	callback;		/* Is called each time new event has been caught. */
+	kevent_callback_t	enqueue;		/* Is called each time new event is queued. */
+	kevent_callback_t	dequeue;		/* Is called each time event is dequeued. */
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one for each ->poll()/poll_wait() call.
+							 */
+};
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_list
+{
+	struct list_head	kevent_list;		/* List of all kevents. */
+	spinlock_t 		kevent_lock;		/* Protects all manipulations with queue of kevents. */
+};
+
+struct kevent_user
+{
+	struct kevent_list	kqueue[KEVENT_HASH_MASK+1];
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		wait_mutex;		/* Protects against simultaneous kevent_user waits. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+#define KEVENT_MAX_REQUESTS		(PAGE_SIZE/sizeof(struct kevent))
+
+struct kevent *kevent_alloc(gfp_t mask);
+void kevent_free(struct kevent *k);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+
+#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     prefetch(pos->member.prev), &pos->member != (head); 	\
+	     pos = n, n = list_entry(pos->member.prev, typeof(*pos), member))
+
+int kevent_break(struct kevent *k);
+int kevent_init(struct kevent *k);
+
+int kevent_init_socket(struct kevent *k);
+int kevent_init_inode(struct kevent *k);
+int kevent_init_timer(struct kevent *k);
+int kevent_init_poll(struct kevent *k);
+int kevent_init_naio(struct kevent *k);
+int kevent_init_aio(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	0
+#endif
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..bd891f0
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,12 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	unsigned int		qlen;			/* Number of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..57e64de 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,9 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, void __user *buf);
 #endif

diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..88b35af
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,57 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the kernel event queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO callback
+	  invocation, advanced timer notification and other kernel object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default N
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents which are ready
+	  immediately at insertion time and number of kevents which were removed through
+	  readiness completion. It will be printed each time control kevent descriptor
+	  is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notification through the KEVENT subsystem of
+	  socket operations, like new packet arrival, ready-for-accept
+	  conditions and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notification through the KEVENT subsystem of
+	  inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO operations.
+	  Only AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..7dcd651
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o kevent_init.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..63d9439
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,248 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+static kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event to the appropriate origin's queue.
+ * Returns a positive value if the event is ready immediately,
+ * a negative value on error, and zero if the event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	if (!k->enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	if (!k->dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->dequeue(k);
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used: kevent_enqueue() will refuse
+ * to add it to the origin's queue and will set the KEVENT_RET_BROKEN flag
+ * in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	int err;
+
+	spin_lock_init(&k->lock);
+	k->kevent_entry.next = LIST_POISON1;
+	k->storage_entry.next = LIST_POISON1;
+	k->ready_entry.next = LIST_POISON1;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	switch (k->event.type) {
+		case KEVENT_NAIO:
+			err = kevent_init_naio(k);
+			break;
+		case KEVENT_SOCKET:
+			err = kevent_init_socket(k);
+			break;
+		case KEVENT_INODE:
+			err = kevent_init_inode(k);
+			break;
+		case KEVENT_TIMER:
+			err = kevent_init_timer(k);
+			break;
+		case KEVENT_POLL:
+			err = kevent_init_poll(k);
+			break;
+		case KEVENT_AIO:
+			err = kevent_init_aio(k);
+			break;
+		default:
+			err = -ENODEV;
+	}
+
+	return err;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail(&k->storage_entry, &st->list);
+	st->qlen++;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue a kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way; it must
+ * be called before that decrease, while the storage is still valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->storage_entry.next != LIST_POISON1) {
+		list_del(&k->storage_entry);
+		st->qlen--;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int err, rem = 0;
+	unsigned long flags;
+
+	err = k->callback(k);
+
+	spin_lock_irqsave(&k->lock, flags);
+	if (err > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (err < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!err)
+		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->lock, flags);
+
+	if (err) {
+		if (rem) {
+			list_del(&k->storage_entry);
+			k->st->qlen--;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (k->ready_entry.next == LIST_POISON1) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k, *n;
+
+	spin_lock(&st->lock);
+	list_for_each_entry_safe(k, n, &st->list, storage_entry) {
+		if (ready_callback)
+			ready_callback(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	spin_unlock(&st->lock);
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	st->qlen = 0;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+struct kevent *kevent_alloc(gfp_t mask)
+{
+	return kmem_cache_alloc(kevent_cache, mask);
+}
+
+void kevent_free(struct kevent *k)
+{
+	kmem_cache_free(kevent_cache, k);
+}
+
+int __init kevent_sys_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, 0, NULL, NULL);
+	if (!kevent_cache)
+		panic("kevent: Unable to create a cache.\n");
+	
+	return err;
+}
+
+late_initcall(kevent_sys_init);

diff --git a/kernel/kevent/kevent_init.c b/kernel/kevent/kevent_init.c
new file mode 100644
index 0000000..ec95114
--- /dev/null
+++ b/kernel/kevent/kevent_init.c
@@ -0,0 +1,85 @@
+/*
+ * 	kevent_init.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/kevent.h>
+
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->lock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->lock, flags);
+	return 0;
+}
+
+#ifndef CONFIG_KEVENT_SOCKET
+int kevent_init_socket(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_INODE
+int kevent_init_inode(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_TIMER
+int kevent_init_timer(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_POLL
+int kevent_init_poll(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_NAIO
+int kevent_init_naio(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
+
+#ifndef CONFIG_KEVENT_AIO
+int kevent_init_aio(struct kevent *k)
+{
+	kevent_break(k);
+	return -ENODEV;
+}
+#endif
diff --git a/kernel/kevent/kevent_inode.c b/kernel/kevent/kevent_inode.c
new file mode 100644
index 0000000..3af0e11
--- /dev/null
+++ b/kernel/kevent/kevent_inode.c
@@ -0,0 +1,110 @@
+/*
+ * 	kevent_inode.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/fs.h>
+
+static int kevent_inode_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err, fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+	
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	fput_light(file, fput_needed);
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int kevent_inode_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+static int kevent_inode_callback(struct kevent *k)
+{
+	return 1;
+}
+
+int kevent_init_inode(struct kevent *k)
+{
+	k->enqueue = &kevent_inode_enqueue;
+	k->dequeue = &kevent_inode_dequeue;
+	k->callback = &kevent_inode_callback;
+	return 0;
+}
+
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+	struct dentry *parent;
+	struct inode *inode;
+	
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+
+	dget(parent);
+	spin_unlock(&dentry->d_lock);
+	kevent_inode_notify(inode, KEVENT_INODE_REMOVE);
+	dput(parent);
+}
+	
+void kevent_inode_remove(struct inode *inode)
+{
+	kevent_storage_fini(&inode->st);
+}
+	
+void kevent_inode_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..7d01c2b
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,713 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+
+static struct class *kevent_user_class;
+static char kevent_name[] = "kevent";
+static int kevent_user_major;
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef, mnt);	
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	u->ready_num = 0;
+#ifdef CONFIG_KEVENT_USER_STAT
+	u->wait_num = u->im_num = u->total = 0;
+#endif
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
+		spin_lock_init(&u->kqueue[i].kevent_lock);
+	}
+	u->kevent_num = 0;
+	
+	mutex_init(&u->ctl_mutex);
+	mutex_init(&u->wait_mutex);
+	init_waitqueue_head(&u->wait);
+	u->max_ready_num = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+#ifdef CONFIG_KEVENT_USER_STAT
+		printk("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+				__func__, u, u->wait_num, u->im_num, u->total);
+#endif
+		kfree(u);
+	}
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * Remove the kevent from the user's list of all events,
+ * dequeue it from its storage and decrease the user's reference counter,
+ * since this kevent no longer exists; that is also why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int lock, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (lock) {
+		unsigned int hash = kevent_user_hash(&k->event);
+		struct kevent_list *l = &u->kqueue[hash];
+
+		spin_lock_irqsave(&l->kevent_lock, flags);
+		list_del(&k->kevent_entry);
+		u->kevent_num--;
+		spin_unlock_irqrestore(&l->kevent_lock, flags);
+	} else {
+		list_del(&k->kevent_entry);
+		u->kevent_num--;
+	}
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->ready_entry.next != LIST_POISON1) {
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+	
+	kevent_user_put(u);
+	kevent_free(k);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *__kqueue_dequeue_one_ready(struct list_head *q, 
+		unsigned int *qlen)
+{
+	struct kevent *k = NULL;
+	unsigned int len = *qlen;
+	
+	if (len && !list_empty(q)) {
+		k = list_entry(q->next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		*qlen = len - 1;
+	}
+	
+	return k;
+}
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	k = __kqueue_dequeue_one_ready(&u->ready_list, &u->ready_num);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+static struct kevent *__kevent_search(struct kevent_list *l, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k;
+	int found = 0;
+	
+	list_for_each_entry(k, &l->kevent_list, kevent_entry) {
+		spin_lock(&k->lock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			found = 1;
+			spin_unlock(&k->lock);
+			break;
+		}
+		spin_unlock(&k->lock);
+	}
+
+	return (found)?k:NULL;
+}
+
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	struct kevent_list *l = &u->kqueue[hash];
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&l->kevent_lock, flags);
+	k = __kevent_search(l, uk, u);
+	if (k) {
+		spin_lock(&k->lock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->lock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&l->kevent_lock, flags);
+	
+	return err;
+}
+
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	struct kevent_list *l = &u->kqueue[hash];
+	unsigned long flags;
+
+	spin_lock_irqsave(&l->kevent_lock, flags);
+	k = __kevent_search(l, uk, u);
+	if (k) {
+		kevent_finish_user(k, 0, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&l->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * No new entry can be added or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		struct kevent_list *l = &u->kqueue[i];
+		
+		list_for_each_entry_safe(k, n, &l->kevent_list, kevent_entry)
+			kevent_finish_user(k, 1, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kevent_alloc(GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kevent_free(k);
+		goto err_out_exit;
+	}
+	k->user = u;
+#ifdef CONFIG_KEVENT_USER_STAT
+	u->total++;
+#endif
+	{
+		unsigned long flags;
+		unsigned int hash = kevent_user_hash(&k->event);
+		struct kevent_list *l = &u->kqueue[hash];
+		
+		spin_lock_irqsave(&l->kevent_lock, flags);
+		list_add_tail(&k->kevent_entry, &l->kevent_list);
+		u->kevent_num++;
+		kevent_user_get(u);
+		spin_unlock_irqrestore(&l->kevent_lock, flags);
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 1, 0);
+	} 
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one 
+ * and add them into appropriate kevent_storages, 
+ * e.g. sockets, inodes and so on...
+ * If something goes wrong, all events will be dequeued and
+ * a negative error will be returned.
+ * Otherwise the number of immediately finished events (either completed
+ * or failed) is returned, and ctl->num is updated to that count.
+ * The array of finished events (struct ukevent) is placed right behind the
+ * kevent_user_control structure; the user must walk that array and check the
+ * ret_flags field of each ukevent to determine whether it fired or failed.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err = 0, cerr = 0, num = 0, knum = 0, i;
+	void __user *orig, *ctl_addr;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	orig = arg;
+	ctl_addr = arg - sizeof(struct kevent_user_control);
+#if 1
+	err = -ENFILE;
+	if (u->kevent_num + ctl->num >= 1024)
+		goto err_out_remove;
+#endif
+	for (i=0; i<ctl->num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+#ifdef CONFIG_KEVENT_USER_STAT
+			u->im_num++;
+#endif
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent)))
+				cerr = -EINVAL;
+			orig += sizeof(struct ukevent);
+			num++;
+		} else
+			knum++;
+	}
+
+	if (cerr < 0)
+		goto err_out_remove;
+
+	ctl->num = num;
+	if (copy_to_user(ctl_addr, ctl, sizeof(struct kevent_user_control)))
+		cerr = -EINVAL;
+
+	if (cerr)
+		err = cerr;
+	if (!err)
+		err = num;
+
+err_out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr
+ * events are ready; if the timeout is zero, it waits no more than one second
+ * or until at least one event is ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int cerr = 0, num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (timeout)
+			wait_event_interruptible_timeout(u->wait, 
+				u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+		else
+			wait_event_interruptible_timeout(u->wait, 
+					u->ready_num > 0, msecs_to_jiffies(1000));
+	}
+	
+	mutex_lock(&u->ctl_mutex);
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+
+		/*
+		 * If it is a one-shot kevent, it has already been removed from
+		 * the origin's queue, so we can safely free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1, 1);
+		++num;
+#ifdef CONFIG_KEVENT_USER_STAT
+		u->wait_num++;
+#endif
+	}
+	mutex_unlock(&u->ctl_mutex);
+
+	return (cerr)?cerr:num;
+}
+
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, 
+		struct kevent_user_control *ctl, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u)
+		return -EINVAL;
+
+	switch (ctl->cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, ctl, 
+				arg+sizeof(struct kevent_user_control));
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, ctl, 
+				arg+sizeof(struct kevent_user_control));
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, ctl, 
+				arg+sizeof(struct kevent_user_control));
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL, fput_needed;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, void __user *arg)
+{
+	int err = -EINVAL, fput_needed;
+	struct kevent_user_control ctl;
+	struct file *file;
+
+	if (copy_from_user(&ctl, arg, sizeof(struct kevent_user_control)))
+		return -EINVAL;
+
+	if (ctl.cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, &ctl, arg);
+
+out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int __devinit kevent_user_init(void)
+{
+	struct class_device *dev;
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
+	if (kevent_user_major < 0) {
+		printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", 
+				kevent_name, kevent_user_major);
+		return -ENODEV;
+	}
+
+	kevent_user_class = class_create(THIS_MODULE, "kevent");
+	if (IS_ERR(kevent_user_class)) {
+		printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_user_class));
+		err = PTR_ERR(kevent_user_class);
+		goto err_out_unregister;
+	}
+
+	dev = class_device_create(kevent_user_class, NULL, 
+			MKDEV(kevent_user_major, 0), NULL, kevent_name);
+	if (IS_ERR(dev)) {
+		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
+				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
+		err = PTR_ERR(dev);
+		goto err_out_class_destroy;
+	}
+
+	printk("KEVENT subsystem: chardev helper: major=%d.\n", kevent_user_major);
+
+	return 0;
+
+err_out_class_destroy:
+	class_destroy(kevent_user_class);
+err_out_unregister:
+	unregister_chrdev(kevent_user_major, kevent_name);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	class_device_destroy(kevent_user_class, MKDEV(kevent_user_major, 0));
+	class_destroy(kevent_user_class);
+	unregister_chrdev(kevent_user_major, kevent_name);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8843cca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,11 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);




* [take2 3/4] kevent: AIO, aio_sendfile() implementation.
  2006-08-01  9:34                               ` [take2 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-01  9:34                                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01  9:34 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


This patch implements asynchronous propagation of a file's data into the
VFS cache and an aio_sendfile() implementation.
Network aio_sendfile() works lazily: it asynchronously populates pages
into the VFS cache (which can be used for various tricks with adaptive
readahead) and then uses the usual ->sendfile() callback.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
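
The lazy scheme can be sketched as follows. This is an illustration only,
not the patch's code: populate_pages_async() is an invented name for the
kevent-driven page-cache population that kernel/kevent/kevent_aio.c below
implements with ->get_block() and raw bios, and file_send_actor is assumed
to be the usual sendfile actor of this kernel generation.

/* A sketch of the lazy network aio_sendfile() flow, under the
 * assumptions named above. */
static ssize_t lazy_aio_sendfile(struct file *file, loff_t *ppos,
				 size_t count, struct file *sock)
{
	int err;

	/* Step 1: asynchronously pull the file's pages into the VFS page
	 * cache; completion is signalled through a kevent, so no thread
	 * blocks on disk I/O. */
	err = populate_pages_async(file, *ppos, count);	/* hypothetical */
	if (err)
		return err;

	/* ... wait for the completion kevent here (elided) ... */

	/* Step 2, once the kevent fires: the pages are uptodate, so the
	 * ordinary ->sendfile() callback streams them to the socket
	 * straight from the page cache. */
	return file->f_op->sendfile(file, ppos, count, file_send_actor, sock);
}

This is also why the patch adds a ->get_block() member to
address_space_operations and wires it up for ext2, ext3 and reiserfs:
kevent_mpage_readpage() below maps file blocks itself and builds bios
directly instead of going through ->readpage().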

diff --git a/fs/bio.c b/fs/bio.c
index 6a0b9ad..a3ee530 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -119,7 +119,7 @@ void bio_free(struct bio *bio, struct bi
 /*
  * default destructor for a bio allocated with bio_alloc_bioset()
  */
-static void bio_fs_destructor(struct bio *bio)
+void bio_fs_destructor(struct bio *bio)
 {
 	bio_free(bio, fs_bio_set);
 }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fb4d322..9316551 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -685,6 +685,7 @@ ext2_writepages(struct address_space *ma
 }
 
 const struct address_space_operations ext2_aops = {
+	.get_block		= ext2_get_block,
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c5ee9f0..d9210d4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1699,6 +1699,7 @@ static int ext3_journalled_set_page_dirt
 }
 
 static const struct address_space_operations ext3_ordered_aops = {
+	.get_block	= ext3_get_block,
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
diff --git a/fs/file_table.c b/fs/file_table.c
index 0131ba0..b649317 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -112,6 +112,9 @@ struct file *get_empty_filp(void)
 	if (security_file_alloc(f))
 		goto fail_sec;
 
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_init(f, &f->st);
+#endif
 	tsk = current;
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_set(&f->f_count, 1);
@@ -159,6 +162,9 @@ void fastcall __fput(struct file *file)
 	might_sleep();
 
 	fsnotify_close(file);
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_fini(&file->st);
+#endif
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index 0bf9f04..fdbd0ba 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@ #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -165,12 +166,18 @@ #endif
 		}
 		memset(&inode->u, 0, sizeof(inode->u));
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 12dfdcf..f8dca72 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3001,6 +3001,7 @@ int reiserfs_setattr(struct dentry *dent
 }
 
 const struct address_space_operations reiserfs_address_space_operations = {
+	.get_block = reiserfs_get_block,
 	.writepage = reiserfs_writepage,
 	.readpage = reiserfs_readpage,
 	.readpages = reiserfs_readpages,

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2561020..65eb438 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -240,6 +240,9 @@ #include <linux/mutex.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_KEVENT
+#include <linux/kevent_storage.h>
+#endif
 
 struct hd_geometry;
 struct iovec;
@@ -352,6 +355,8 @@ struct address_space;
 struct writeback_control;
 
 struct address_space_operations {
+	int  (*get_block)(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create);
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
 	void (*sync_page)(struct page *);
@@ -546,6 +551,10 @@ #ifdef CONFIG_INOTIFY
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#ifdef CONFIG_KEVENT_INODE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -698,6 +707,9 @@ #ifdef CONFIG_EPOLL
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index cc5dec7..0acc8db 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@ #ifdef __KERNEL__
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/audit.h>
 
 /*
@@ -79,6 +80,7 @@ static inline void fsnotify_nameremove(s
 		isdir = IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
 	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	kevent_inode_notify_parent(dentry, KEVENT_INODE_REMOVE);
 }
 
 /*
@@ -88,6 +90,7 @@ static inline void fsnotify_inoderemove(
 {
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
+	kevent_inode_remove(inode);
 }
 
 /*
@@ -96,6 +99,7 @@ static inline void fsnotify_inoderemove(
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
@@ -107,6 +111,7 @@ static inline void fsnotify_create(struc
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
 				  dentry->d_name.name, dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
diff --git a/kernel/kevent/kevent_aio.c b/kernel/kevent/kevent_aio.c
new file mode 100644
index 0000000..fa07a19
--- /dev/null
+++ b/kernel/kevent/kevent_aio.c
@@ -0,0 +1,580 @@
+/*
+ * 	kevent_aio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+
+#define KEVENT_AIO_DEBUG
+
+#ifdef KEVENT_AIO_DEBUG
+#define dprintk(f, a...) printk(f, ##a)
+#else
+#define dprintk(f, a...) do {} while (0)
+#endif
+
+struct kevent_aio_private
+{
+	int			pg_num;
+	size_t			size;
+	loff_t			offset;
+	loff_t			processed;
+	atomic_t		bio_page_num;
+	struct completion	bio_complete;
+	struct file		*file, *sock;
+	struct work_struct	work;
+};
+
+static int kevent_aio_dequeue(struct kevent *k);
+static int kevent_aio_enqueue(struct kevent *k);
+static int kevent_aio_callback(struct kevent *k);
+
+extern void bio_fs_destructor(struct bio *bio);
+
+static void kevent_aio_bio_destructor(struct bio *bio)
+{
+	struct kevent *k = bio->bi_private;
+	struct kevent_aio_private *priv = k->priv;
+
+	dprintk("%s: bio=%p, num=%u, k=%p, inode=%p.\n", __func__, bio, bio->bi_vcnt, k, k->st->origin);
+	schedule_work(&priv->work);
+	bio_fs_destructor(bio);
+}
+
+static void kevent_aio_bio_put(struct kevent *k)
+{
+	struct kevent_aio_private *priv = k->priv;
+	
+	if (atomic_dec_and_test(&priv->bio_page_num))
+		complete(&priv->bio_complete);
+}
+
+static int kevent_mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct kevent *k = bio->bi_private;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_page(page);
+		kevent_aio_bio_put(k);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+static inline struct bio *kevent_mpage_bio_submit(int rw, struct bio *bio)
+{
+	if (bio) {
+		bio->bi_end_io = kevent_mpage_end_io_read;
+		dprintk("%s: bio=%p, num=%u.\n", __func__, bio, bio->bi_vcnt);
+		submit_bio(READ, bio);
+	}
+	return NULL;
+}
+
+static struct bio *kevent_mpage_readpage(struct kevent *k, struct bio *bio,
+		struct page *page, unsigned nr_pages, get_block_t get_block, 
+		loff_t *offset, sector_t *last_block_in_bio)
+{
+	struct inode *inode = k->st->origin;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	struct block_device *bdev = NULL;
+	unsigned first_hole = blocks_per_page;
+	unsigned page_block;
+	sector_t blocks[MAX_BUF_PER_PAGE];
+	struct buffer_head bh;
+	int fully_mapped = 1, length;
+
+	block_in_file = (*offset + blocksize - 1) >> blkbits;
+	last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
+
+	bh.b_page = page;
+	for (page_block = 0; page_block < blocks_per_page; page_block++, block_in_file++) {
+		bh.b_state = 0;
+		if (block_in_file < last_block) {
+			if (get_block(inode, block_in_file, &bh, 0))
+				goto confused;
+		}
+
+		if (!buffer_mapped(&bh)) {
+			fully_mapped = 0;
+			if (first_hole == blocks_per_page)
+				first_hole = page_block;
+			continue;
+		}
+
+		/* some filesystems will copy data into the page during
+		 * the get_block call, in which case we don't want to
+		 * read it again.  map_buffer_to_page copies the data
+		 * we just collected from get_block into the page's buffers
+		 * so readpage doesn't have to repeat the get_block call
+		 */
+		if (buffer_uptodate(&bh)) {
+			BUG();
+			/* map_buffer_to_page(page, &bh, page_block); */
+			goto confused;
+		}
+	
+		if (first_hole != blocks_per_page)
+			goto confused;		/* hole -> non-hole */
+
+		/* Contiguous blocks? */
+		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
+			goto confused;
+		blocks[page_block] = bh.b_blocknr;
+		bdev = bh.b_bdev;
+	}
+
+	if (!bdev)
+		goto confused;
+
+	if (first_hole != blocks_per_page) {
+		char *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + (first_hole << blkbits), 0,
+				PAGE_CACHE_SIZE - (first_hole << blkbits));
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (first_hole == 0) {
+			SetPageUptodate(page);
+			goto out;
+		}
+	} else if (fully_mapped) {
+		SetPageMappedToDisk(page);
+	}
+	
+	/*
+	 * This page will go to BIO.  Do we need to send this BIO off first?
+	 */
+	if (bio && (*last_block_in_bio != blocks[0] - 1))
+		bio = kevent_mpage_bio_submit(READ, bio);
+
+alloc_new:
+	if (bio == NULL) {
+		nr_pages = min_t(unsigned, nr_pages, bio_get_nr_vecs(bdev));
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		if (bio == NULL)
+			goto confused;
+
+		bio->bi_destructor = kevent_aio_bio_destructor;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = blocks[0] << (blkbits - 9);
+		bio->bi_private = k;
+	}
+
+	length = first_hole << blkbits;
+	if (bio_add_page(bio, page, length, 0) < length) {
+		bio = kevent_mpage_bio_submit(READ, bio);
+		dprintk("%s: Failed to add a page: nr_pages=%d, length=%d, page=%p.\n", 
+				__func__, nr_pages, length, page);
+		goto alloc_new;
+	}
+	
+	dprintk("%s: bio=%p, b=%d, m=%d, u=%d, nr_pages=%d, offset=%Lu, "
+			"size=%Lu. page_block=%u, page=%p.\n", 
+			__func__, bio, buffer_boundary(&bh), buffer_mapped(&bh), 
+			buffer_uptodate(&bh), nr_pages, *offset, i_size_read(inode), 
+			page_block, page);
+	
+	*offset = *offset + length;
+
+	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
+		bio = kevent_mpage_bio_submit(READ, bio);
+	else
+		*last_block_in_bio = blocks[blocks_per_page - 1];
+
+out:
+	return bio;
+
+confused:
+	dprintk("%s: confused. bio=%p, nr_pages=%d.\n", __func__, bio, nr_pages);
+	if (bio)
+		bio = kevent_mpage_bio_submit(READ, bio);
+	kevent_aio_bio_put(k);
+	SetPageUptodate(page);
+
+	if (nr_pages == 1) {
+		struct kevent_aio_private *priv = k->priv;
+
+		wait_for_completion(&priv->bio_complete);
+		kevent_storage_ready(k->st, NULL, KEVENT_AIO_BIO);
+		init_completion(&priv->bio_complete);
+		complete(&priv->bio_complete);
+	}
+	goto out;
+}
+
+static int kevent_aio_alloc_cached_page(struct kevent *k, struct page **cached_page)
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct address_space *mapping = priv->file->f_mapping;
+	struct page *page;
+	int err = 0;
+	pgoff_t index = priv->offset >> PAGE_CACHE_SHIFT;
+
+	page = page_cache_alloc_cold(mapping);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	if (err) {
+		if (err == -EEXIST)
+			err = 0;
+		page_cache_release(page);
+		goto out;
+	}
+
+	dprintk("%s: page=%p, offset=%Lu, processed=%Lu, index=%lu, size=%zu.\n",
+			__func__, page, priv->offset, priv->processed, index, priv->size);
+
+	*cached_page = page;
+
+out:
+	return err;
+}
+
+static int kevent_mpage_readpages(struct kevent *k, int first,
+		int (* get_block)(struct inode *inode, sector_t iblock,	
+			struct buffer_head *bh_result, int create))
+{
+	struct bio *bio = NULL;
+	struct kevent_aio_private *priv = k->priv;
+	sector_t last_block_in_bio = 0;
+	int i, err = 0;
+
+	atomic_set(&priv->bio_page_num, priv->pg_num);
+
+	for (i=first; i<priv->pg_num; ++i) {
+		struct page *page = NULL;
+		
+		err = kevent_aio_alloc_cached_page(k, &page);
+		if (err)
+			break;
+
+		/*
+		 * If there is no error and page is NULL, this means
+		 * that someone else added the page into the VFS cache.
+		 * We do not process this page, since whoever added it
+		 * must read its data from disk.
+		 */
+		if (!page)
+			continue;
+
+		bio = kevent_mpage_readpage(k, bio, page, priv->pg_num - i, 
+				get_block, &priv->offset, &last_block_in_bio);
+	}
+
+	if (bio)
+		bio = kevent_mpage_bio_submit(READ, bio);
+
+	return err;
+}
+
+static ssize_t kevent_aio_vfs_read_actor(struct kevent *k, struct page *kpage, size_t len)
+{
+	struct kevent_aio_private *priv = k->priv;
+	ssize_t ret;
+	
+	ret = priv->sock->f_op->sendpage(priv->sock, kpage, 0, len, &priv->sock->f_pos, 1);
+
+	dprintk("%s: k=%p, page=%p, len=%zu, ret=%zd.\n", 
+			__func__, k, kpage, len, ret);
+
+	return ret;
+}
+
+static int kevent_aio_vfs_read(struct kevent *k, 
+		ssize_t (*actor)(struct kevent *, struct page *, size_t))
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct address_space *mapping;
+	size_t isize; ssize_t actor_size;
+	int i;
+
+	mapping = priv->file->f_mapping;
+	isize = i_size_read(priv->file->f_dentry->d_inode);
+	
+	dprintk("%s: start: size_left=%zd, offset=%Lu, processed=%Lu, isize=%zu, pg_num=%d.\n", 
+			__func__, priv->size, priv->offset, priv->processed, isize, priv->pg_num);
+
+	for (i=0; i<priv->pg_num && priv->size; ++i) {
+		struct page *page;
+		size_t nr = PAGE_CACHE_SIZE;
+
+		cond_resched();
+		page = find_get_page(mapping, priv->processed >> PAGE_CACHE_SHIFT);
+		if (unlikely(page == NULL))
+			break;
+		if (!PageUptodate(page)) {
+			dprintk("%s: %2d: page=%p, processed=%Lu, size=%zu not uptodate.\n", 
+					__func__, i, page, priv->processed, priv->size);
+			page_cache_release(page);
+			break;
+		}
+
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		mark_page_accessed(page);
+
+		if (nr + priv->processed > isize)
+			nr = isize - priv->processed;
+		if (nr > priv->size)
+			nr = priv->size;
+
+		actor_size = actor(k, page, nr);
+		if (actor_size < 0) {
+			page_cache_release(page);
+			break;
+		}
+
+		page_cache_release(page);
+
+		priv->processed += actor_size;
+		priv->size -= actor_size;
+	}
+
+	if (!priv->size)
+		i = priv->pg_num;
+
+	if (i != priv->pg_num)
+		priv->offset = priv->processed;
+
+	dprintk("%s: end: next=%d, num=%d, left=%zu, offset=%Lu, procesed=%Lu, ret=%d.\n", 
+			__func__, i, priv->pg_num, 
+			priv->size, priv->offset, priv->processed, i);
+
+	return i;
+}
+
+static int kevent_aio_callback(struct kevent *k)
+{
+	return 1;
+}
+
+static void kevent_aio_work(void *data)
+{
+	struct kevent *k = data;
+	struct kevent_aio_private *priv = k->priv;
+	struct inode *inode = k->st->origin;
+	struct address_space *mapping = priv->file->f_mapping;
+	int err, ready = 0, num;
+
+	dprintk("%s: k=%p, priv=%p, inode=%p.\n", __func__, k, priv, inode);
+
+	init_completion(&priv->bio_complete);
+	
+	num = ready = kevent_aio_vfs_read(k, &kevent_aio_vfs_read_actor);
+	if (ready > 0 && ready != priv->pg_num)
+		ready = 0;
+
+	dprintk("%s: k=%p, ready=%d, size=%zd.\n", __func__, k, ready, priv->size);
+
+	if (!ready) {
+		err = kevent_mpage_readpages(k, num, mapping->a_ops->get_block);
+		if (err) {
+			dprintk("%s: kevent_mpage_readpages failed: err=%d, k=%p, size=%zd.\n",
+					__func__, err, k, priv->size);
+			kevent_break(k);
+			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+		}
+	} else {
+		dprintk("%s: next k=%p, size=%zd.\n", __func__, k, priv->size);
+
+		if (priv->size)
+			schedule_work(&priv->work);
+		else {
+			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+		}
+
+		complete(&priv->bio_complete);
+	}
+}
+
+static int kevent_aio_enqueue(struct kevent *k)
+{
+	int err;
+	struct file *file, *sock;
+	struct inode *inode;
+	struct kevent_aio_private *priv;
+	struct address_space *mapping;
+	int fd = k->event.id.raw[0];
+	int num = k->event.id.raw[1];
+	int s = k->event.ret_data[0];
+	size_t size;
+
+	err = -ENODEV;
+	file = fget(fd);
+	if (!file)
+		goto err_out_exit;
+	
+	sock = fget(s);
+	if (!sock)
+		goto err_out_fput_file;
+	
+	mapping = file->f_mapping;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode || !mapping->a_ops->get_block)
+		goto err_out_fput;
+	if (!sock->f_dentry || !sock->f_dentry->d_inode)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	size = i_size_read(inode);
+	
+	num = (size > num << PAGE_SHIFT) ? num : (size >> PAGE_SHIFT);
+
+	err = -ENOMEM;
+	priv = kzalloc(sizeof(struct kevent_aio_private), GFP_KERNEL);
+	if (!priv)
+		goto err_out_iput;
+
+	priv->pg_num = num;
+	priv->size = size;
+	priv->offset = 0;
+	priv->file = file;
+	priv->sock = sock;
+	INIT_WORK(&priv->work, kevent_aio_work, k);
+	k->priv = priv;
+
+	dprintk("%s: read: k=%p, priv=%p, inode=%p, num=%u, size=%zu, off=%Lu.\n", 
+			__func__, k, priv, inode, priv->pg_num, priv->size, priv->offset);
+	
+	init_completion(&priv->bio_complete);
+	kevent_storage_enqueue(&inode->st, k);
+	schedule_work(&priv->work);
+	
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(sock);
+err_out_fput_file:
+	fput(file);
+err_out_exit:
+
+	return err;
+}
+
+static int kevent_aio_dequeue(struct kevent *k)
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct inode *inode = k->st->origin;
+	struct file *file = priv->file;
+	struct file *sock = priv->sock;
+
+	kevent_storage_dequeue(k->st, k);
+	flush_scheduled_work();
+	wait_for_completion(&priv->bio_complete);
+
+	kfree(k->priv);
+	k->priv = NULL;
+	iput(inode);
+	fput(file);
+	fput(sock);
+
+	return 0;
+}
+
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, 
+		size_t size, unsigned flags)
+{
+	struct ukevent ukread, uksend;
+	struct kevent_user *u;
+	struct file *file;
+	int err, fput_needed;
+	int num = (flags & 7)?(flags & 7):8;
+
+	memset(&ukread, 0, sizeof(struct ukevent));
+	memset(&uksend, 0, sizeof(struct ukevent));
+
+	ukread.type = KEVENT_AIO;
+	ukread.event = KEVENT_AIO_BIO;
+
+	ukread.id.raw[0] = fd;
+	ukread.id.raw[1] = num;
+	ukread.ret_data[0] = s;
+
+	dprintk("%s: fd=%d, s=%d, num=%d.\n", __func__, fd, s, num);
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	err = kevent_user_add_ukevent(&ukread, u);
+	if (err < 0)
+		goto err_out_fput;
+
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+int kevent_init_aio(struct kevent *k)
+{
+	k->enqueue = &kevent_aio_enqueue;
+	k->dequeue = &kevent_aio_dequeue;
+	k->callback = &kevent_aio_callback;
+	return 0;
+}


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01  9:34                           ` [take2 1/4] kevent: core files Evgeniy Polyakov
  2006-08-01  9:34                             ` [take2 2/4] kevent: network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-01 13:46                             ` James Morris
  2006-08-01 13:55                               ` Evgeniy Polyakov
  2006-08-01 23:56                             ` Zach Brown
  2 siblings, 1 reply; 160+ messages in thread
From: James Morris @ 2006-08-01 13:46 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:

> +	u->ready_num = 0;
> +#ifdef CONFIG_KEVENT_USER_STAT
> +	u->wait_num = u->im_num = u->total = 0;
> +#endif

Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
you abstract these out as static inlines?


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01 13:46                             ` [take2 1/4] kevent: core files James Morris
@ 2006-08-01 13:55                               ` Evgeniy Polyakov
  2006-08-01 14:27                                 ` James Morris
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01 13:55 UTC (permalink / raw)
  To: James Morris; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Tue, Aug 01, 2006 at 09:46:58AM -0400, James Morris (jmorris@namei.org) wrote:
> On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:
> 
> > +	u->ready_num = 0;
> > +#ifdef CONFIG_KEVENT_USER_STAT
> > +	u->wait_num = u->im_num = u->total = 0;
> > +#endif
> 
> Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
> you abstract these out as static inlines?

Yes, it is possible.
I would ask whether it is needed at all. It contains the number of
immediately fired events (i.e. those which were ready when the event was
added, so the syscall returned immediately to show readiness), the total
number of events inserted into the given queue, and the number of events
which were marked as ready after they were inserted.
Currently it is a compile-time option which ends up in a printk with the
above info when a kevent queue is removed.
 
> - James
> -- 
> James Morris
> <jmorris@namei.org>

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01 13:55                               ` Evgeniy Polyakov
@ 2006-08-01 14:27                                 ` James Morris
  2006-08-01 14:34                                   ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: James Morris @ 2006-08-01 14:27 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:

> On Tue, Aug 01, 2006 at 09:46:58AM -0400, James Morris (jmorris@namei.org) wrote:
> > On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:
> > 
> > > +	u->ready_num = 0;
> > > +#ifdef CONFIG_KEVENT_USER_STAT
> > > +	u->wait_num = u->im_num = u->total = 0;
> > > +#endif
> > 
> > Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
> > you abstract these out as static inlines?
> 
> Yes, it is possible.
> I would ask is it needed at all?

Yes, please, it is standard kernel development practice.

Otherwise, the kernel will turn into an unmaintainable #ifdef jungle.

> It contains number of immediately fired
> events (i.e. those which were ready when event was added and thus
> syscall returned immediately showing that it is ready), total number of
> events, which were inserted in the given queue and number of events
> which were marked as ready after they were inserted.
> Currently it is compilation option which ends up in printk with above
> info when kevent queue is removed.

Fine, make 

static inline void kevent_user_stat_reset(u);

etc.

which compile to nothing when it's not configured.
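
For illustration, something along these lines would do (a sketch only;
the field names are taken from the quoted hunk):

#ifdef CONFIG_KEVENT_USER_STAT
static inline void kevent_user_stat_reset(struct kevent_user *u)
{
	u->wait_num = u->im_num = u->total = 0;
}
#else
static inline void kevent_user_stat_reset(struct kevent_user *u)
{
}
#endif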


-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01 14:27                                 ` James Morris
@ 2006-08-01 14:34                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-01 14:34 UTC (permalink / raw)
  To: James Morris; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Tue, Aug 01, 2006 at 10:27:36AM -0400, James Morris (jmorris@namei.org) wrote:
> > > > +	u->ready_num = 0;
> > > > +#ifdef CONFIG_KEVENT_USER_STAT
> > > > +	u->wait_num = u->im_num = u->total = 0;
> > > > +#endif
> > > 
> > > Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
> > > you abstract these out as static inlines?
> > 
> > Yes, it is possible.
> > I would ask is it needed at all?
> 
> Yes, please, it is standard kernel development practice.

Will do.
Thanks, James.

> -- 
> James Morris
> <jmorris@namei.org>

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [RFC 1/4] kevent: core files.
  2006-08-01  1:02     ` David Miller
@ 2006-08-01 17:02       ` Zach Brown
  0 siblings, 0 replies; 160+ messages in thread
From: Zach Brown @ 2006-08-01 17:02 UTC (permalink / raw)
  To: David Miller; +Cc: johnpol, linux-kernel, netdev


> I do not think if we do a ring buffer that events should be obtainable
> via a syscall at all.  Rather, I think this system call should be
> purely "sleep until ring is not empty".

Mmm, yeah, of course.  That's much simpler.  I'm looking forward to
Evgeniy's next patch set.

> The ring buffer size, as Evgeniy also tried to describe, is bounded
> purely by the number of registered events.

Yeah.  fwiw, fs/aio.c has this property today.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01  9:34                           ` [take2 1/4] kevent: core files Evgeniy Polyakov
  2006-08-01  9:34                             ` [take2 2/4] kevent: network AIO, socket notifications Evgeniy Polyakov
  2006-08-01 13:46                             ` [take2 1/4] kevent: core files James Morris
@ 2006-08-01 23:56                             ` Zach Brown
  2006-08-02  0:01                               ` David Miller
  2006-08-02  6:39                               ` Evgeniy Polyakov
  2 siblings, 2 replies; 160+ messages in thread
From: Zach Brown @ 2006-08-01 23:56 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev and


OK, here's some of my reactions to the core part.

> +#define KEVENT_SOCKET 		0
> +#define KEVENT_INODE		1
> +#define KEVENT_TIMER		2
> +#define KEVENT_POLL		3
> +#define KEVENT_NAIO		4
> +#define KEVENT_AIO		5

I guess we can't really avoid some form of centralized list of the
constants in the API if we're going for a flat constant namespace.
It'll be irritating to manage this list over time, just like it's
irritating to manage syscall numbers now.

> +/*
> + * Socket/network asynchronous IO events.
> + */
> +#define	KEVENT_SOCKET_RECV	0x1
> +#define	KEVENT_SOCKET_ACCEPT	0x2
> +#define	KEVENT_SOCKET_SEND	0x4

I wonder if these shouldn't live in the subsystems instead of in kevent.h.

> +/*
> + * Poll events.
> + */
> +#define	KEVENT_POLL_POLLIN	0x0001
> +#define	KEVENT_POLL_POLLPRI	0x0002
> +#define	KEVENT_POLL_POLLOUT	0x0004
> +#define	KEVENT_POLL_POLLERR	0x0008
> +#define	KEVENT_POLL_POLLHUP	0x0010
> +#define	KEVENT_POLL_POLLNVAL	0x0020
> +
> +#define	KEVENT_POLL_POLLRDNORM	0x0040
> +#define	KEVENT_POLL_POLLRDBAND	0x0080
> +#define	KEVENT_POLL_POLLWRNORM	0x0100
> +#define	KEVENT_POLL_POLLWRBAND	0x0200
> +#define	KEVENT_POLL_POLLMSG	0x0400
> +#define	KEVENT_POLL_POLLREMOVE	0x1000

And couldn't we just use the existing poll bit definitions for this?

> +struct kevent_id
> +{
> +	__u32		raw[2];
> +};

Why not a simple u64?  Users can play games with packing it into other
types if they want.
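
If 64-bit alignment on 32-bit ABIs is the worry, it can be forced
explicitly, the way the kernel's aligned_u64 helper does (just a
sketch):

struct kevent_id {
	__u64	raw __attribute__((aligned(8)));
};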

> +		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
> +		void		*ptr;
> +	};

Again just a u64 seems like it would be simpler.  userspace library
wrappers can help massage it, but the kernel is just treating it as an
opaque data blob.

> +};
> +
> +#define	KEVENT_CTL_ADD 		0
> +#define	KEVENT_CTL_REMOVE	1
> +#define	KEVENT_CTL_MODIFY	2
> +#define	KEVENT_CTL_INIT		3
> +
> +struct kevent_user_control
> +{
> +	unsigned int		cmd;			/* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */
> +	unsigned int		num;			/* Number of ukevents this structure controls. */
> +	unsigned int		timeout;		/* Timeout in milliseconds waiting for "num" events to become ready. */
> +};

Even if we only have one syscall with a cmd multiplexer (which I'm not
thrilled with), we should at least make these arguments explicit in the
system call.  It's weird to hide them in a struct.  We could also think
about making them u32 or u64 so that we don't need compat wrappers, but
maybe that's overkill.

Also, can we please use a struct timespec for the timeout?  Then the
kernel will have the luxury of using whatever mechanism it wants to
satisfy the user's precision desires.  Just like sys_nanosleep() uses
timespec and so can be implemented with hrtimers.
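
Purely as an illustration of explicit typed arguments (this prototype is
invented here, not a proposal of exact names):

asmlinkage long sys_kevent_wait(int fd, unsigned int min_nr,
		unsigned int max_nr, struct timespec __user *timeout,
		struct ukevent __user *buf);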

> +struct kevent
> +{

(trivial nit, "struct kevent {" is the preferred form.)

> +	struct ukevent		event;
> +	spinlock_t		lock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */


It'd be great if these struct members could get a prefix (ala: inode ->
i_, socket -> sk_) so that it's less painful getting tags helpers to
look up instances for us.  Asking for 'lock' is hilarious.

> +struct kevent_list
> +{
> +	struct list_head	kevent_list;		/* List of all kevents. */
> +	spinlock_t 		kevent_lock;		/* Protects all manipulations with queue of kevents. */
> +};
> +
> +struct kevent_user
> +{
> +	struct kevent_list	kqueue[KEVENT_HASH_MASK+1];

Hmm.  I think the current preference is not to have a lock per bucket.
It doesn't scale nearly as well as it seems like it should as the cache
footprint is higher and as cacheline contention hits as there are
multiple buckets per cacheline.  For now I'd simplify the hash into a
single lock and an array of struct hlist_head.  In the future it could
be another user of some kind of relatively-generic hash implementation
based on rcu that has been talked about for a while.
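
Roughly, as a sketch reusing your names:

struct kevent_user {
	spinlock_t		kevent_lock;	/* one lock for the whole table */
	struct hlist_head	kqueue[KEVENT_HASH_MASK + 1];
	/* ... the rest of the members as before ... */
};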

> +#define KEVENT_MAX_REQUESTS		PAGE_SIZE/sizeof(struct kevent)

This is unused?

> +#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
> +	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
> +		n = list_entry(pos->member.prev, typeof(*pos), member);	\
> +	     prefetch(pos->member.prev), &pos->member != (head); 	\
> +	     pos = n, n = list_entry(pos->member.prev, typeof(*pos), member))

If anyone was calling this they could use
list_for_each_entry_safe_reverse() in list.h but nothing is calling it?
 Either way, it should be removed :).

> +#define sock_async(__sk)	0

It's a minor complaint, but these kinds of ifdefs that drop arguments
can cause unused argument warnings if they're the only user of the given
argument.  It'd be nicer to do something like ({ (void)_sk; 0; }) .
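
That is, for the disabled case, something like:

#define sock_async(__sk)	({ (void)(__sk); 0; })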

> +struct kevent_storage
> +{
> +	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */

Do we really need this pointer?  When the kevent_storage is embedded in
the origin, like struct inode in your patch sets, you can use
container_of() to get back to the inode.  For sources that aren't built
like that, like the timer_list, you could introduce a parent structure
that has the timer_list and the _storage embedded in it.

> +obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
> +obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
> +obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
> +obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
> +obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
> +obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o

I suspect that we won't want this configurable if it gets merged, but I
could be wrong and don't feel strongly about it.

> +	switch (k->event.type) {
> +		case KEVENT_NAIO:
> +			err = kevent_init_naio(k);
> +			break;
> +		case KEVENT_SOCKET:
> +			err = kevent_init_socket(k);
> +			break;

I wonder if it wouldn't be less noisy to have something like

struct kevent_callbacks {
	kevent_callback_t	callback, enqueue, dequeue;
} kev_callbacks[] = {
	[ KEVENT_NAIO ] = &kevent_naio_callbacks,
};

	k->callbacks = kev_callbacks[k->event.type];

Then you'd also have one pointer per kevent instead of three.

> +void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&st->lock, flags);
> +	if (k->storage_entry.next != LIST_POISON1) {
> +		list_del(&k->storage_entry);
> +		st->qlen--;
> +	}

Is this relying on list_del() having set LIST_POISON1?  If so, please
use list_del_init() and list_empty() instead.
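
That is, something like this (assuming the entry is set up with
INIT_LIST_HEAD() when the kevent is created):

	spin_lock_irqsave(&st->lock, flags);
	if (!list_empty(&k->storage_entry)) {
		list_del_init(&k->storage_entry);
		st->qlen--;
	}
	spin_unlock_irqrestore(&st->lock, flags);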

> +static void __kevent_requeue(struct kevent *k, u32 event)
> +{

A few things here.  First, the event argument isn't used?

> +	err = k->callback(k);

This is being called while holding both the kevent_list->kevent_lock and
the k->st->lock.  So ->callback is being called with locks held and
interrupts blocked which is going to greatly restrict what can be done
there.  If that's what we want to do we should document the heck out of it.

> +	spin_lock_irqsave(&k->lock, flags);

> +		spin_lock_irqsave(&k->user->ready_lock, flags);

It also now creates lock nesting three deep, which starts to feel
uncomfortable.  Have you run this under lockdep?  It might be fine, but
this kind of depth means those of us auditing have to stare very very
closely at the locks that paths acquire.  The fewer locks that are held
at a time, the better.

> +void kevent_storage_ready(struct kevent_storage *st, 
> +		kevent_callback_t ready_callback, u32 event)

Hmm, the only caller that provides a callback is using it to call
kevent_break() on each event.  Could that not be done by the helper
itself if the caller provides the right event?  Is there some more
complicated use of the callback on the horizon?  Not a big deal, but
there are those who prefer to avoid code paths that nest lots of
callbacks in sequence.

> +	struct kevent *k, *n;

In general, it's nicer to user longer names please.  You'll notice
throughout the kernel that we use things like dentry, inode, page, sock,
etc, instead of d, i, p, and s.

> +struct kevent *kevent_alloc(gfp_t mask)
> +{
> +	return kmem_cache_alloc(kevent_cache, mask);
> +}
> +
> +void kevent_free(struct kevent *k)
> +{
> +	kmem_cache_free(kevent_cache, k);
> +}

We probably don't need these wrappers, just call the kmem_cache_*
functions directly.

> +static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
> +{
> +	struct kevent_user *u = file->private_data;
> +	unsigned int mask;
> +	
> +	poll_wait(file, &u->wait, wait);
> +	mask = 0;
> +
> +	if (u->ready_num)
> +		mask |= POLLIN | POLLRDNORM;

Shouldn't this be testing ready_num while holding the ready_lock?

> +	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
> +		INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
> +		spin_lock_init(&u->kqueue[i].kevent_lock);
> +	}

ARRAY_SIZE(u->kqueue) should probably be used instead of trying to keep
(KEVENT_HASH_MASK + 1) in sync with the kqueue definition.
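
I.e.:

	for (i = 0; i < ARRAY_SIZE(u->kqueue); ++i) {
		INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
		spin_lock_init(&u->kqueue[i].kevent_lock);
	}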

> +static inline void kevent_user_put(struct kevent_user *u)
> +{
> +	if (atomic_dec_and_test(&u->refcnt)) {
> +#ifdef CONFIG_KEVENT_USER_STAT
> +		printk("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
> +				__func__, u, u->wait_num, u->im_num, u->total);
> +#endif

printk() seems like a poor choice if this is meant to be a more formal
functionality.  If it's just a debugging aid then pr_debug() and DEBUG
might be nicer.

> +/*
> + * Remove kevent from user's list of all events, 
> + * dequeue it from storage and decrease user's reference counter,
> + * since this kevent does not exist anymore. That is why it is freed here.
> + */
> +static void kevent_finish_user(struct kevent *k, int lock, int deq)

How about providing locked and unlocked prototypes instead of an
argument that says whether to lock or not?  You know, the usual:

void __thingy() {
	doit();
}

void thingy() {
	lock();
	__thingy();
	unlock();
}

> +		list_del(&k->kevent_entry);
> +		u->kevent_num--;

I wonder if these shouldn't get micro helpers that then have BUG_ON()s
to test bad conditions.  like BUG_ON(list_empty() && kevent_num), that
sort of thing.

> +static struct kevent *__kqueue_dequeue_one_ready(struct list_head *q, 
> +		unsigned int *qlen)
> +{
> +	struct kevent *k = NULL;
> +	unsigned int len = *qlen;
> +	
> +	if (len && !list_empty(q)) {
> +		k = list_entry(q->next, struct kevent, ready_entry);
> +		list_del(&k->ready_entry);
> +		*qlen = len - 1;
> +	}
> +	
> +	return k;
> +}

Hmm, this is only called in one place?  I'd either make the list_head
and lock into one struct (like struct sk_buff_head) or hoist the code
into the caller.

> +	list_for_each_entry(k, &l->kevent_list, kevent_entry) {
> +		spin_lock(&k->lock);
> +		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
> +				k->event.id.raw[0] == uk->id.raw[0] && 
> +				k->event.id.raw[1] == uk->id.raw[1]) {
> +			found = 1;
> +			spin_unlock(&k->lock);

Ahhh, it's fs/aio.c:lookup_kiocb() all over again :) :).  I guess we'll
get this in a hash, or something, before merging.

> +	mutex_lock(&u->ctl_mutex);
> +
> +	for (i=0; i<ctl->num; ++i) {
> +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> +			err = -EINVAL;
> +			break;
> +		}
> +
> +		if (kevent_modify(&uk, u))

So there are a bunch of these.  The internal list and kevents and such
have their own locks.  What are the mutexes serializing?  Why can't we
rely on the finger-grained object locking to make sure that concurrent
operations behave resonably?  One can imagine wanting to modify two
kevents in a context that have nothing to do with each other and not
wanting to serialize them at the context.

> +int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
> +{
> +	struct kevent *k;
> +	int err;
> +
> +	k = kevent_alloc(GFP_KERNEL);
> +	if (!k) {
> +		err = -ENOMEM;
> +		goto err_out_exit;
> +	}
> +
> +	memcpy(&k->event, uk, sizeof(struct ukevent));

This path is copying the ukevents twice.  First from userspace back up
in kevent_user_ctl_add() and then here into the kevent.  We should
rework things a bit so that we only copy it once.

> +#ifdef CONFIG_KEVENT_USER_STAT
> +	u->total++;
> +#endif

FWIW, this could hide behind some kevent_user_stat_inc(u) that could be
ifdefed away in the header.

> +	{
> +		unsigned long flags;
> +		unsigned int hash = kevent_user_hash(&k->event);
> +		struct kevent_list *l = &u->kqueue[hash];
> +		
> +		spin_lock_irqsave(&l->kevent_lock, flags);
> +		list_add_tail(&k->kevent_entry, &l->kevent_list);
> +		u->kevent_num++;
> +		kevent_user_get(u);
> +		spin_unlock_irqrestore(&l->kevent_lock, flags);
> +	}

Hmm, please don't indent things like this.  Add a little helper function
or hoist the locals up into the main function and lose the braces.

> +static int kevent_user_ctl_add(struct kevent_user *u, 
> +		struct kevent_user_control *ctl, void __user *arg)
> +{

> +	orig = arg;
> +	ctl_addr = arg - sizeof(struct kevent_user_control);

Ugh.  This is more awkwardness that comes from packing the system call
arguments in neighbouring structs behind a void *.  We should really
have explicit typed args.

> +	for (i=0; i<ctl->num; ++i) {
> +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> +			cerr = -EINVAL;
> +			break;
> +		}
> +		arg += sizeof(struct ukevent);
> +
> +		err = kevent_user_add_ukevent(&uk, u);

There are some users that will want to add thousands of events at a
time.  (Like, say, a certain database writing back lots of cached
dirtied database blocks.)  I wonder if we should arrange this so that we
can get some batching done and reduce the amount of lock traffic per
event added.
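
As a sketch (the batch size and the error code choice here are mine):

	struct ukevent batch[16];
	unsigned int n = min_t(unsigned int, ctl->num - i, 16);

	if (copy_from_user(batch, arg, n * sizeof(struct ukevent))) {
		cerr = -EFAULT;
		break;
	}
	/* then insert all n events while taking the locks only once */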

> +/*
> + * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
> + * In blocking mode it waits until timeout or if at least @min_nr events are ready,
> + * if timeout is zero, than it waits no more than 1 second or if at least one event
> + * is ready.

That's odd.  Why not have a timeout of 0 mean a timeout of 0?  Where did
1 second come from? :)  It seems pretty crazy to require programmers to
check that their timer math didn't just end up at 0 and magically tell
the kernel to wait a second.

> +	mutex_lock(&u->ctl_mutex);
> +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> +		if (copy_to_user(buf + num*sizeof(struct ukevent), 
> +					&k->event, sizeof(struct ukevent))) {
> +			cerr = -EINVAL;
> +			break;
> +		}

Again, this a great opportunity to copy more than one at a time with
some refactoring.

> +asmlinkage long sys_kevent_ctl(int fd, void __user *arg)
> +{
> +	int err = -EINVAL, fput_needed;
> +	struct kevent_user_control ctl;
> +	struct file *file;
> +
> +	if (copy_from_user(&ctl, arg, sizeof(struct kevent_user_control)))
> +		return -EINVAL;
> +
> +	if (ctl.cmd == KEVENT_CTL_INIT)
> +		return kevent_ctl_init();

Hmm.  So we can get one of these fds both by opening the device file or
by calling _CTL_INIT (which then magically ignores the fd argument?).
That seems confusing.

Anyway, that's enough for now.  I hope this helps.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01 23:56                             ` Zach Brown
@ 2006-08-02  0:01                               ` David Miller
  2006-08-02  6:43                                 ` Evgeniy Polyakov
  2006-08-02  6:39                               ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-02  0:01 UTC (permalink / raw)
  To: zach.brown; +Cc: johnpol, linux-kernel, drepper, netdev

From: Zach Brown <zach.brown@oracle.com>
Date: Tue, 01 Aug 2006 16:56:59 -0700

> Even if we only have one syscall with a cmd multiplexer (which I'm not
> thrilled with), we should at least make these arguments explicit in the
> system call.  It's weird to hide them in a struct.  We could also think
> about making them u32 or u64 so that we don't need compat wrappers, but
> maybe that's overkill.

I think making the userspace data structure not require any compat
handling is a must, thanks for pointing this out Zach.

> It'd be great if these struct members could get a prefix (ala: inode ->
> i_, socket -> sk_) so that it's less painful getting tags helpers to
> look up instances for us.  Asking for 'lock' is hilarious.

Agreed.

> Hmm.  I think the current preference is not to have a lock per bucket.

Yes, it loses badly, that's why we undid this in the routing cache
and just have a fixed sized array of locks which is hashed into.

For kevents, I think a single spinlock initially is fine and
if we hit performance problems on SMP we can fix it.  We should
not implement complexity we have no proof of needing yet :)
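
For reference, the routing cache scheme looks roughly like this (the
array size is illustrative):

static spinlock_t kevent_locks[16];

static inline spinlock_t *kevent_bucket_lock(unsigned int hash)
{
	return &kevent_locks[hash & (ARRAY_SIZE(kevent_locks) - 1)];
}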

> > +#define KEVENT_MAX_REQUESTS		PAGE_SIZE/sizeof(struct kevent)
> 
> This is unused?

It is probably groundwork for the mmap() ring buffer... :)


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-01 23:56                             ` Zach Brown
  2006-08-02  0:01                               ` David Miller
@ 2006-08-02  6:39                               ` Evgeniy Polyakov
  2006-08-02  7:25                                 ` David Miller
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-02  6:39 UTC (permalink / raw)
  To: Zach Brown; +Cc: lkml, David Miller, Ulrich Drepper, netdev and

On Tue, Aug 01, 2006 at 04:56:59PM -0700, Zach Brown (zach.brown@oracle.com) wrote:
> 
> OK, here's some of my reactions to the core part.

Thanks.

> > +#define KEVENT_SOCKET 		0
> > +#define KEVENT_INODE		1
> > +#define KEVENT_TIMER		2
> > +#define KEVENT_POLL		3
> > +#define KEVENT_NAIO		4
> > +#define KEVENT_AIO		5
> 
> I guess we can't really avoid some form of centralized list of the
> constants in the API if we're going for a flat constant namespace.
> It'll be irritating to manage this list over time, just like it's
> irritating to manage syscall numbers now.
> 
> > +/*
> > + * Socket/network asynchronous IO events.
> > + */
> > +#define	KEVENT_SOCKET_RECV	0x1
> > +#define	KEVENT_SOCKET_ACCEPT	0x2
> > +#define	KEVENT_SOCKET_SEND	0x4
> 
> I wonder if these shouldn't live in the subsystems instead of in kevent.h.

Yes, they could, but that requires including those files in kevent.h,
which is exported to userspace, and it is not always possible to publish
the included files there.

> > +/*
> > + * Poll events.
> > + */
> > +#define	KEVENT_POLL_POLLIN	0x0001
> > +#define	KEVENT_POLL_POLLPRI	0x0002
> > +#define	KEVENT_POLL_POLLOUT	0x0004
> > +#define	KEVENT_POLL_POLLERR	0x0008
> > +#define	KEVENT_POLL_POLLHUP	0x0010
> > +#define	KEVENT_POLL_POLLNVAL	0x0020
> > +
> > +#define	KEVENT_POLL_POLLRDNORM	0x0040
> > +#define	KEVENT_POLL_POLLRDBAND	0x0080
> > +#define	KEVENT_POLL_POLLWRNORM	0x0100
> > +#define	KEVENT_POLL_POLLWRBAND	0x0200
> > +#define	KEVENT_POLL_POLLMSG	0x0400
> > +#define	KEVENT_POLL_POLLREMOVE	0x1000
> 
> And couldn't we just use the existing poll bit definitions for this?

asm/poll.h, I expect.
Is linux/poll.h too heavy, or not?

> > +struct kevent_id
> > +{
> > +	__u32		raw[2];
> > +};
> 
> Why not a simple u64?  Users can play games with packing it into other
> types if they want.
> 
> > +		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
> > +		void		*ptr;
> > +	};
> 
> Again just a u64 seems like it would be simpler.  userspace library
> wrappers can help massage it, but the kernel is just treating it as an
> opaque data blob.

u64 is not aligned the same way on 32- and 64-bit ABIs, so I prefer to
use u32 as much as possible.

> > +};
> > +
> > +#define	KEVENT_CTL_ADD 		0
> > +#define	KEVENT_CTL_REMOVE	1
> > +#define	KEVENT_CTL_MODIFY	2
> > +#define	KEVENT_CTL_INIT		3
> > +
> > +struct kevent_user_control
> > +{
> > +	unsigned int		cmd;			/* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */
> > +	unsigned int		num;			/* Number of ukevents this structure controls. */
> > +	unsigned int		timeout;		/* Timeout in milliseconds waiting for "num" events to become ready. */
> > +};
> 
> Even if we only have one syscall with a cmd multiplexer (which I'm not
> thrilled with), we should at least make these arguments explicit in the
> system call.  It's weird to hide them in a struct.  We could also think
> about making them u32 or u64 so that we don't need compat wrappers, but
> maybe that's overkill.

Ok.

> Also, can we please use a struct timespec for the timeout?  Then the
> kernel will have the luxury of using whatever mechanism it wants to
> satisfy the user's precision desires.  Just like sys_nanosleep() uses
> timespec and so can be implemented with hrtimers.

It has a variable size; I am strongly against such things between kernel
and userspace.

> > +struct kevent
> > +{
> 
> (trivial nit, "struct kevent {" is the preferred form.)

Ok.

> > +	struct ukevent		event;
> > +	spinlock_t		lock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
> 
> 
> It'd be great if these struct members could get a prefix (ala: inode ->
> i_, socket -> sk_) so that it's less painful getting tags helpers to
> look up instances for us.  Asking for 'lock' is hilarious.

But it requires much less typing :)
Will update.

> > +struct kevent_list
> > +{
> > +	struct list_head	kevent_list;		/* List of all kevents. */
> > +	spinlock_t 		kevent_lock;		/* Protects all manipulations with queue of kevents. */
> > +};
> > +
> > +struct kevent_user
> > +{
> > +	struct kevent_list	kqueue[KEVENT_HASH_MASK+1];
> 
> Hmm.  I think the current preference is not to have a lock per bucket.
> It doesn't scale nearly as well as it seems like it should as the cache
> footprint is higher and as cacheline contention hits as there are
> multiple buckets per cacheline.  For now I'd simplify the hash into a
> single lock and an array of struct hlist_head.  In the future it could
> be another user of some kind of relatively-generic hash implementation
> based on rcu that has been talked about for a while.

Well, it scales better than one lock for the whole queue, but we can
see how it looks with one lock.

I used an RCU hash table in kevents, but it scales very badly for things
like inode removal, which could not be done (at least when kevent was
initially created) in RCU callback context, so it required
synchronize_rcu(), which broke latencies to an unacceptable level.

> > +#define KEVENT_MAX_REQUESTS		PAGE_SIZE/sizeof(struct kevent)
> 
> This is unused?

As David mentioned, I expect it to be the base for the mapped ring,
although it should be PAGE_SIZE/sizeof(struct ukevent). I will remove it
for now.

> > +#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
> > +	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
> > +		n = list_entry(pos->member.prev, typeof(*pos), member);	\
> > +	     prefetch(pos->member.prev), &pos->member != (head); 	\
> > +	     pos = n, n = list_entry(pos->member.prev, typeof(*pos), member))
> 
> If anyone was calling this they could use
> list_for_each_entry_safe_reverse() in list.h but nothing is calling it?
>  Either way, it should be removed :).

It is from the past life, I will remove it.

> > +#define sock_async(__sk)	0
> 
> It's a minor complaint, but these kinds of ifdefs that drop arguments
> can cause unused argument warnings if they're the only user of the given
> argument.  It'd be nicer to do something like ({ (void)_sk; 0; }) .

Ok.

> > +struct kevent_storage
> > +{
> > +	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
> 
> Do we really need this pointer?  When the kevent_storage is embedded in
> the origin, like struct inode in your patch sets, you can use
> container_of() to get back to the inode.  For sources that aren't built
> like that, like the timer_list, you could introduce a parent structure
> that has the timer_list and the _storage embedded in it.

Well, the idea was to be able not only to embed kevent_storage, but also
to have a pointer to it, so some users can allocate it as an add-on.
If we firmly decide that it will not be used that way, this pointer can
be removed.

> > +obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
> > +obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
> > +obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
> > +obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
> > +obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
> > +obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
> 
> I suspect that we won't want this configurable if it gets merged, but I
> could be wrong and don't feel strongly about it.

For example, epoll is configurable for embedded systems, so it might be
a good idea to be able to remove something that will not be 100% in
use.

> > +	switch (k->event.type) {
> > +		case KEVENT_NAIO:
> > +			err = kevent_init_naio(k);
> > +			break;
> > +		case KEVENT_SOCKET:
> > +			err = kevent_init_socket(k);
> > +			break;
> 
> I wonder if it wouldn't be less noisy to have something like
> 
> struct kevent_callbacks {
> 	kevent_callback_t	callback, enqueue, dequeue;
> } kev_callbacks[] = {
> 	[ KEVENT_NAIO ] = &kevent_naio_callbacks,
> };
> 
> 	k->callbacks = kev_callbacks[k->event.type];
> 
> Then you'd also have one pointer per kevent instead of three.

Ok, I will create such table.

> > +void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
> > +{
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&st->lock, flags);
> > +	if (k->storage_entry.next != LIST_POISON1) {
> > +		list_del(&k->storage_entry);
> > +		st->qlen--;
> > +	}
> 
> Is this relying on list_del() having set LIST_POISON1?  If so, please
> use list_del_init() and list_empty() instead.

Yes, POISON is a flag showing whether the kevent is in the appropriate
list. It could be done by wasting some bits, but I decided not to do it,
since the list poison is always there.

> > +static void __kevent_requeue(struct kevent *k, u32 event)
> > +{
> 
> A few things here.  First, the event argument isn't used?

Tss, it was used for printk when it was there :)

> > +	err = k->callback(k);
> 
> This is being called while holding both the kevent_list->kevent_lock and
> the k->st->lock.  So ->callback is being called with locks held and
> interrupts blocked which is going to greatly restrict what can be done
> there.  If that's what we want to do we should document the heck out of it.

No, interrupts and bh are not blocked there when it is called from the
origin's state machine.
The locking is as follows:

take storage_lock (interrupts and bh stay in whatever state the origin's
state machine left them in; for example, socket code has bh disabled,
while the block layer has interrupts disabled here)
check whether at least one event in the storage queue is requested for
	that event
call the callback
It is possible to mark an event as broken or done in the callback, so we
need to check the event's flags. That does not actually require an
irqsave lock, since the same kevent cannot live in several storages.
We hold a lock to protect against userspace, which can change those
flags.
If the event is marked as ready, we queue it into the ready list under
the ready list lock. That requires irq disabling, since that queue can
be accessed from any context.
So there are at most two nested locks.

When userspace modifies a kevent, it must protect against access from
the origin, so it disables interrupts.

> > +	spin_lock_irqsave(&k->lock, flags);
> 
> > +		spin_lock_irqsave(&k->user->ready_lock, flags);
> 
> It also now creates lock nesting three deep, which starts to feel
> uncomfortable.  Have you run this under lockdep?  It might be fine, but
> this kind of depth means those of us auditing have to stare very very
> closely at the locks that paths acquire.  The fewer locks that are held
> at a time, the better.

The above sequence is only possible when userspace modifies a kevent
which has not fired yet, and that modification ends up in an immediate
fire. When it is called from the origin's state machine, irqs are
disabled only when the kevent is moved into the ready queue, since that
queue can be accessed from a different CPU or from an irq on the same
one.

> > +void kevent_storage_ready(struct kevent_storage *st, 
> > +		kevent_callback_t ready_callback, u32 event)
> 
> Hmm, the only caller that provides a callback is using it to call
> kevent_break() on each event.  Could that not be done by the helper
> itself if the caller provides the right event?  Is there some more
> complicated use of the callback on the horizon?  Not a big deal, but
> there are those who prefer to avoid code paths that nest lots of
> callbacks in sequence.

kevent users can call any callback they want here; for example, it can
mark all events as ready, not only broken, or mark them all as one-shot
if the origin is going to be removed, and so on.

> > +	struct kevent *k, *n;
> 
> In general, it's nicer to user longer names please.  You'll notice
> throughout the kernel that we use things like dentry, inode, page, sock,
> etc, instead of d, i, p, and s.

But we have 'sk' :)

> > +struct kevent *kevent_alloc(gfp_t mask)
> > +{
> > +	return kmem_cache_alloc(kevent_cache, mask);
> > +}
> > +
> > +void kevent_free(struct kevent *k)
> > +{
> > +	kmem_cache_free(kevent_cache, k);
> > +}
> 
> We probably don't need these wrappers, just call the kmem_cache_*
> functions directly.

Ok.

> > +static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
> > +{
> > +	struct kevent_user *u = file->private_data;
> > +	unsigned int mask;
> > +	
> > +	poll_wait(file, &u->wait, wait);
> > +	mask = 0;
> > +
> > +	if (u->ready_num)
> > +		mask |= POLLIN | POLLRDNORM;
> 
> Shouldn't this be testing ready_num while holding the ready_lock?

Integer read is atomic, so no need to wrap a lock around it.

> > +	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
> > +		INIT_LIST_HEAD(&u->kqueue[i].kevent_list);
> > +		spin_lock_init(&u->kqueue[i].kevent_lock);
> > +	}
> 
> ARRAY_SIZE(u->kqueue) should probably be used instead of trying to keep
> (KEVENT_HASH_MASK + 1) in sync with the kqueue definition.

Ok.

> > +static inline void kevent_user_put(struct kevent_user *u)
> > +{
> > +	if (atomic_dec_and_test(&u->refcnt)) {
> > +#ifdef CONFIG_KEVENT_USER_STAT
> > +		printk("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
> > +				__func__, u, u->wait_num, u->im_num, u->total);
> > +#endif
> 
> printk() seems like a poor choice if this is meant to be a more formal
> functionality.  If it's just a debugging aid then pr_debug() and DEBUG
> might be nicer.

Yes, I will wrap it into some nice function.

> > +/*
> > + * Remove kevent from user's list of all events, 
> > + * dequeue it from storage and decrease user's reference counter,
> > + * since this kevent does not exist anymore. That is why it is freed here.
> > + */
> > +static void kevent_finish_user(struct kevent *k, int lock, int deq)
> 
> How about providing locked and unlocked prototypes instead of an
> argument that says whether to lock or not?  You know, the usual:
> 
> void __thingy() {
> 	doit();
> }
> 
> void thingy() {
> 	lock();
> 	__thingy();
> 	unlock();
> }

Ok.

> > +		list_del(&k->kevent_entry);
> > +		u->kevent_num--;
> 
> I wonder if these shouldn't get micro helpers that then have BUG_ON()s
> to test bad conditions.  like BUG_ON(list_empty() && kevent_num), that
> sort of thing.

Well, if the list is empty, that means kevent_entry has broken links,
which will fire on list_del.
And having a lot of BUGs is not a good sign.
But I do not actually care much about it; let's have a couple...

> > +static struct kevent *__kqueue_dequeue_one_ready(struct list_head *q, 
> > +		unsigned int *qlen)
> > +{
> > +	struct kevent *k = NULL;
> > +	unsigned int len = *qlen;
> > +	
> > +	if (len && !list_empty(q)) {
> > +		k = list_entry(q->next, struct kevent, ready_entry);
> > +		list_del(&k->ready_entry);
> > +		*qlen = len - 1;
> > +	}
> > +	
> > +	return k;
> > +}
> 
> Hmm, this is only called in one place?  I'd either make the list_head
> and lock into one struct (like struct sk_buff_head) or hoist the code
> into the caller.

No problem.

> > +	list_for_each_entry(k, &l->kevent_list, kevent_entry) {
> > +		spin_lock(&k->lock);
> > +		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
> > +				k->event.id.raw[0] == uk->id.raw[0] && 
> > +				k->event.id.raw[1] == uk->id.raw[1]) {
> > +			found = 1;
> > +			spin_unlock(&k->lock);
> 
> Ahhh, it's fs/aio.c:lookup_kiocb() all over again :) :).  I guess we'll
> get this in a hash, or something, before merging.

Please note that it is searching inside a hash bucket :)

> > +	mutex_lock(&u->ctl_mutex);
> > +
> > +	for (i=0; i<ctl->num; ++i) {
> > +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> > +			err = -EINVAL;
> > +			break;
> > +		}
> > +
> > +		if (kevent_modify(&uk, u))
> 
> So there are a bunch of these.  The internal list and kevents and such
> have their own locks.  What are the mutexes serializing?  Why can't we
> rely on the finer-grained object locking to make sure that concurrent
> operations behave reasonably?  One can imagine wanting to modify two
> kevents in a context that have nothing to do with each other and not
> wanting to serialize them at the context.

Each lock is hosted inside a bucket, but the kevent itself is protected
by that mutex. The lock is held for relatively small operations, but the
mutex protects the whole sequence of them (select bucket, search, hash
and so on).

> > +int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
> > +{
> > +	struct kevent *k;
> > +	int err;
> > +
> > +	k = kevent_alloc(GFP_KERNEL);
> > +	if (!k) {
> > +		err = -ENOMEM;
> > +		goto err_out_exit;
> > +	}
> > +
> > +	memcpy(&k->event, uk, sizeof(struct ukevent));
> 
> This path is copying the ukevents twice.  First from userspace back up
> in kevent_user_ctl_add() and then here into the kevent.  We should
> rework things a bit so that we only copy it once.

struct ukevent here can be allocated on the stack and filled by naio or
aio. I would not allow them to allocate and link kevents by themselves,
so I created this function.

> > +#ifdef CONFIG_KEVENT_USER_STAT
> > +	u->total++;
> > +#endif
> 
> FWIW, this could hide behind some kevent_user_stat_inc(u) that could be
> ifdefed away in the header.

Ok.

> > +	{
> > +		unsigned long flags;
> > +		unsigned int hash = kevent_user_hash(&k->event);
> > +		struct kevent_list *l = &u->kqueue[hash];
> > +		
> > +		spin_lock_irqsave(&l->kevent_lock, flags);
> > +		list_add_tail(&k->kevent_entry, &l->kevent_list);
> > +		u->kevent_num++;
> > +		kevent_user_get(u);
> > +		spin_unlock_irqrestore(&l->kevent_lock, flags);
> > +	}
> 
> Hmm, please don't indent things like this.  Add a little helper function
> or hoist the locals up into the main function and lose the braces.

Ok.

> > +static int kevent_user_ctl_add(struct kevent_user *u, 
> > +		struct kevent_user_control *ctl, void __user *arg)
> > +{
> 
> > +	orig = arg;
> > +	ctl_addr = arg - sizeof(struct kevent_user_control);
> 
> Ugh.  This is more awkwardness that comes from packing the system call
> arguments in neighbouring structs behind a void *.  We should really
> have explicit typed args.

kevent_user_control will be replaced with explicit syscall parameters,
so it will be removed.

> > +	for (i=0; i<ctl->num; ++i) {
> > +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> > +			cerr = -EINVAL;
> > +			break;
> > +		}
> > +		arg += sizeof(struct ukevent);
> > +
> > +		err = kevent_user_add_ukevent(&uk, u);
> 
> There are some users that will want to add thousands of events at a
> time.  (Like, say, a certain database writing back lots of cached
> dirtied database blocks.)  I wonder if we should arrange this so that we
> can get some batching done and reduce the amount of lock traffic per
> event added.

Well, it is possible with an additional GFP_KERNEL allocation of the
buffer; do we want that cost?

> > +/*
> > + * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
> > + * In blocking mode it waits until timeout or if at least @min_nr events are ready,
> > + * if timeout is zero, than it waits no more than 1 second or if at least one event
> > + * is ready.
> 
> That's odd.  Why not have a timeout of 0 mean a timeout of 0?  Where did
> 1 second come from? :)  It seems pretty crazy to require programmers to
> check that their timer math didn't just end up at 0 and magically tell
> the kernel to wait a second.

A zero timeout means that we want as much as we have, but not less than one
kevent, so we sleep one second waiting for them :)
I will use min_nr for that, i.e. if it is zero, then it means at least one
and less than max_nr.
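
Roughly (a sketch of the intended semantics, not the patch code):

	/* Treat min_nr == 0 as min_nr = 1 and bound the sleep by the
	 * caller's timeout instead of a magic one second. */
	if (!min_nr)
		min_nr = 1;

	wait_event_interruptible_timeout(u->wait,
			u->ready_num >= min_nr,
			msecs_to_jiffies(timeout));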

> > +	mutex_lock(&u->ctl_mutex);
> > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > +		if (copy_to_user(buf + num*sizeof(struct ukevent), 
> > +					&k->event, sizeof(struct ukevent))) {
> > +			cerr = -EINVAL;
> > +			break;
> > +		}
> 
> Again, this a great opportunity to copy more than one at a time with
> some refactoring.

If we are going to remove the ability to get events by syscall, it will be a
pure memcpy without additional overhead.
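
I.e. with a mapped ring the dequeue path could reduce to something like
this (a very rough sketch; the ring layout is not settled yet, and u->pring
and u->kidx are hypothetical names):

	/* No copy_to_user(): just place the event into the shared
	 * mapping and advance the ring index. */
	struct ukevent *ring = u->pring;

	memcpy(&ring[u->kidx], &k->event, sizeof(struct ukevent));
	u->kidx = (u->kidx + 1) % KEVENT_MAX_EVENTS;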

> > +asmlinkage long sys_kevent_ctl(int fd, void __user *arg)
> > +{
> > +	int err = -EINVAL, fput_needed;
> > +	struct kevent_user_control ctl;
> > +	struct file *file;
> > +
> > +	if (copy_from_user(&ctl, arg, sizeof(struct kevent_user_control)))
> > +		return -EINVAL;
> > +
> > +	if (ctl.cmd == KEVENT_CTL_INIT)
> > +		return kevent_ctl_init();
> 
> Hmm.  So we can get one of these fds both by opening the device file or
> by calling _CTL_INIT (which then magically ignores the fd argument?).
> That seems confusing.

So we want additional syscall? :)

> Anyway, that's enough for now.  I hope this helps.

Thanks Zach.

> - z

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-02  0:01                               ` David Miller
@ 2006-08-02  6:43                                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-02  6:43 UTC (permalink / raw)
  To: David Miller; +Cc: zach.brown, linux-kernel, drepper, netdev

On Tue, Aug 01, 2006 at 05:01:38PM -0700, David Miller (davem@davemloft.net) wrote:
> From: Zach Brown <zach.brown@oracle.com>
> Date: Tue, 01 Aug 2006 16:56:59 -0700
> 
> > Even if we only have one syscall with a cmd multiplexer (which I'm not
> > thrilled with), we should at least make these arguments explicit in the
> > system call.  It's weird to hide them in a struct.  We could also think
> > about making them u32 or u64 so that we don't need compat wrappers, but
> > maybe that's overkill.
> 
> I think making the userspace data structure not require any compat
> handling is a must, thanks for pointing this out Zach.

It does not require compat handling, since unsigned int has the same size on
all normal machines where Linux runs, although in principle it can differ.
Anyway, I will replace it with explicit syscall parameters.
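
Presumably something along these lines (the exact final signature is an
assumption here; the take3 patchset below does drop the struct):

	asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd,
			unsigned int num, void __user *buf);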

> > It'd be great if these struct members could get a prefix (ala: inode ->
> > i_, socket -> sk_) so that it's less painful getting tags helpers to
> > look up instances for us.  Asking for 'lock' is hilarious.
> 
> Agreed.

Heh, it was so much less typing...

> > Hmm.  I think the current preference is not to have a lock per bucket.
> 
> Yes, it loses badly, that's why we undid this in the routing cache
> and just have a fixed sized array of locks which is hashed into.
> 
> For kevents, I think a single spinlock initially is fine and
> if we hit performance problems on SMP we can fix it.  We should
> not implement complexity we have no proof of needing yet :)

Ok, let's see how it will behave.

> > > +#define KEVENT_MAX_REQUESTS		PAGE_SIZE/sizeof(struct kevent)
> > 
> > This is unused?
> 
> It is probably groundwork for the mmap() ring buffer... :)

A lot of work, isn't it? :)

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-02  6:39                               ` Evgeniy Polyakov
@ 2006-08-02  7:25                                 ` David Miller
  2006-08-02  7:46                                   ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-02  7:25 UTC (permalink / raw)
  To: johnpol; +Cc: zach.brown, linux-kernel, drepper, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Wed, 2 Aug 2006 10:39:18 +0400

> u64 is not aligned, so I prefer to use u32 as much as possible.

We have aligned_u64 exactly for this purpose, netfilter makes
use of it to avoid the x86_64 vs. x86 u64 alignment discrepancy.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take2 1/4] kevent: core files.
  2006-08-02  7:25                                 ` David Miller
@ 2006-08-02  7:46                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-02  7:46 UTC (permalink / raw)
  To: David Miller; +Cc: zach.brown, linux-kernel, drepper, netdev

On Wed, Aug 02, 2006 at 12:25:05AM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Wed, 2 Aug 2006 10:39:18 +0400
> 
> > u64 is not aligned, so I prefer to use u32 as much as possible.
> 
> We have aligned_u64 exactly for this purpose, netfilter makes
> use of it to avoid the x86_64 vs. x86 u64 alignment discrepancy.

Ok, I will use that type.
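
For reference, a member declared with that type gets the same layout on
i386 and x86_64 (a sketch; the struct and field names are made up):

	#include <linux/types.h>

	struct example {
		__u32		type;
		/* aligned_u64 forces 8-byte alignment even on i386, so
		 * the layout matches x86_64 and needs no compat code. */
		aligned_u64	user_data;
	};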

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 0/4] kevent: Generic event handling mechanism.
  2006-08-03  9:45                         ` [take3 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-03  9:40                           ` Evgeniy Polyakov
  2006-08-03  9:46                           ` [take3 1/4] kevent: Core files Evgeniy Polyakov
  1 sibling, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:40 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 03, 2006 at 01:45:59PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> Changes from 'take2' patchset:
>  * split kevent_finish_user() to locked and unlocked variants
>  * do not use KEVENT_STAT ifdefs, use inline functions instead
>  * use array of callbacks of each type instead of each kevent callback initialization
>  * changed name of ukevent guarding lock
>  * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
>  * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
>  * various indent cleanups
>  * mapped buffer (initial) implementation (no userspace yet)

Also added an optimisation aimed at the case when a lot of kevents are being
copied from userspace in one syscall.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 4/4] kevent: poll/select() notifications. Timer notifications.
  2006-08-03  9:46                                 ` [take3 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-03  9:43                                   ` Eric Dumazet
  2006-08-03  9:48                                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Eric Dumazet @ 2006-08-03  9:43 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote:
> poll/select() notifications. Timer notifications.
>
> +++ b/kernel/kevent/kevent_poll.c

> +static int kevent_poll_wait_callback(wait_queue_t *wait,
> +		unsigned mode, int sync, void *key)
> +{
> +	struct kevent_poll_wait_container *cont =
> +		container_of(wait, struct kevent_poll_wait_container, wait);
> +	struct kevent *k = cont->k;
> +	struct file *file = k->st->origin;
> +	unsigned long flags;
> +	u32 revents, event;
> +
> +	revents = file->f_op->poll(file, NULL);
> +	spin_lock_irqsave(&k->ulock, flags);
> +	event = k->event.event;
> +	spin_unlock_irqrestore(&k->ulock, flags);

Not sure why you take a spinlock just to read a u32

Eric

^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take3 0/4] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (3 preceding siblings ...)
  2006-08-01  9:34                         ` [take2 0/4] kevent: introduction Evgeniy Polyakov
@ 2006-08-03  9:45                         ` Evgeniy Polyakov
  2006-08-03  9:40                           ` Evgeniy Polyakov
  2006-08-03  9:46                           ` [take3 1/4] kevent: Core files Evgeniy Polyakov
  2006-08-05 13:02                         ` [take4 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
                                           ` (5 subsequent siblings)
  10 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:45 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Generic event handling mechanism.

I send this patchset for comments and review. It still contains the AIO and
aio_sendfile() implementation on top of the get_block() abstraction, which it
was decided to postpone for a while (it is simpler right now to generate the
patchset as a whole; when kevent is ready for merge, I will generate the
patchset without the AIO stuff).
It does not contain the mapped buffer implementation, since its design is not
100% complete; I will present that implementation in the third patchset.

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take3 3/4] kevent: Network AIO, socket notifications.
  2006-08-03  9:46                             ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
@ 2006-08-03  9:46                               ` Evgeniy Polyakov
  2006-08-03  9:46                                 ` [take3 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  2006-08-03  9:54                                 ` [take3 3/4] kevent: Network AIO, socket notifications Eric Dumazet
  2006-08-03 17:04                               ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Badari Pulavarty
  1 sibling, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:46 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Network AIO, socket notifications.

This patchset includes socket notifications and network asynchronous IO.
Network AIO is based on kevent and works as a usual kevent storage on top
of an inode.
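
For illustration, userspace could drive it roughly like this (the syscall
number is the x86_64 one from this take, the flag semantics are an
assumption, and aio_recv() here is just a local wrapper):

	#include <sys/syscall.h>
	#include <unistd.h>

	#define __NR_aio_recv	280	/* x86_64, this patchset */

	/* Queue an asynchronous receive of @size bytes from socket @s
	 * into @buf; completion is reported through the kevent fd @ctl_fd. */
	static long aio_recv(int ctl_fd, int s, void *buf, size_t size,
			unsigned flags)
	{
		return syscall(__NR_aio_recv, ctl_fd, s, buf, size, flags);
	}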

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 5755d57..9300678 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC		31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index b467026..fc2b49d 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC             31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4307e76..9267873 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1283,6 +1283,8 @@ extern struct sk_buff *skb_recv_datagram
 					 int noblock, int *err);
 extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
 				     struct poll_table_struct *wait);
+extern int	       skb_copy_datagram(const struct sk_buff *from, 
+					 int offset, void *dst, int size);
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
 					       int offset, struct iovec *to,
 					       int size);
diff --git a/include/net/sock.h b/include/net/sock.h
index 324b3ea..c43a153 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@ #include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -391,6 +392,8 @@ enum sock_flags {
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_ASYNC,
+	SOCK_ASYNC_INUSE,
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -450,6 +453,21 @@ static inline int sk_stream_memory_free(
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +495,7 @@ static inline void sk_add_backlog(struct
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -548,6 +567,12 @@ struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk, 
 						struct sk_buff *skb);
+	
+	int			(*async_recv) (struct sock *sk, 
+						void *dst, size_t size);
+	int			(*async_send) (struct sock *sk, 
+						struct page **pages, unsigned int poffset, 
+						size_t size);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
@@ -679,21 +704,6 @@ static inline struct kiocb *siocb_to_kio
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0720bdd..5a1899b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -364,6 +364,8 @@ extern int			compat_tcp_setsockopt(struc
 					int level, int optname,
 					char __user *optval, int optlen);
 extern void			tcp_set_keepalive(struct sock *sk, int val);
+extern int			tcp_async_recv(struct sock *sk, void *dst, size_t size);
+extern int			tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t size);
 extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg,
 					    size_t len, int nonblock, 
@@ -857,6 +859,7 @@ static inline int tcp_prequeue(struct so
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
new file mode 100644
index 0000000..71eb6a5
--- /dev/null
+++ b/kernel/kevent/kevent_naio.c
@@ -0,0 +1,242 @@
+/*
+ * 	kevent_naio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+static int kevent_naio_enqueue(struct kevent *k);
+static int kevent_naio_dequeue(struct kevent *k);
+static int kevent_naio_callback(struct kevent *k);
+
+static int kevent_naio_setup_aio(int ctl_fd, int s, void __user *buf, 
+		size_t size, u32 event)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int err, fput_needed;
+	struct ukevent uk;
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	memset(&uk, 0, sizeof(struct ukevent));
+	uk.type = KEVENT_NAIO;
+	uk.ptr = buf;
+	uk.req_flags = KEVENT_REQ_ONESHOT;
+	uk.event = event;
+	uk.id.raw[0] = s;
+	uk.id.raw[1] = size;
+
+	err = kevent_user_add_ukevent(&uk, u);
+
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_RECV);
+}
+
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_SEND);
+}
+
+static int kevent_naio_enqueue(struct kevent *k)
+{
+	int err, i;
+	struct page **page;
+	void *addr;
+	unsigned int size = k->event.id.raw[1];
+	int num = size/PAGE_SIZE;
+	struct file *file;
+	struct sock *sk = NULL;
+	int fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+
+	sk = SOCKET_I(file->f_dentry->d_inode)->sk;
+
+	err = -ESOCKTNOSUPPORT;
+	if (!sk || !sk->sk_prot->async_recv || !sk->sk_prot->async_send || 
+		!sock_flag(sk, SOCK_ASYNC))
+		goto err_out_fput;
+	
+	addr = k->event.ptr;
+	if (((unsigned long)addr & PAGE_MASK) != (unsigned long)addr)
+		num++;
+
+	err = -ENOMEM;
+	page = kmalloc(sizeof(struct page *) * num, GFP_KERNEL);
+	if (!page)
+		goto err_out_fput;
+
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, (unsigned long)addr, 
+			num, 1, 0, page, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err <= 0)
+		goto err_out_free;
+	num = err;
+
+	k->event.ret_data[0] = num;
+	k->event.ret_data[1] = offset_in_page(k->event.ptr);
+	k->priv = page;
+
+	sk->sk_allocation = GFP_ATOMIC;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	err = kevent_socket_enqueue(k);
+	spin_unlock_bh(&sk->sk_lock.slock);
+	if (err)
+		goto err_out_put_pages;
+
+	fput_light(file, fput_needed);
+
+	return err;
+
+err_out_put_pages:
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+err_out_free:
+	kfree(page);
+err_out_fput:
+	fput_light(file, fput_needed);
+
+	return err;
+}
+
+static int kevent_naio_dequeue(struct kevent *k)
+{
+	int err, i, num;
+	struct page **page = k->priv;
+
+	num = k->event.ret_data[0];
+
+	err = kevent_socket_dequeue(k);
+
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+
+	kfree(k->priv);
+	k->priv = NULL;
+
+	return err;
+}
+
+static int kevent_naio_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	unsigned int size = k->event.id.raw[1];
+	unsigned int off = k->event.ret_data[1];
+	struct page **pages = k->priv, *page;
+	int ready = 0, num = off/PAGE_SIZE, err = 0, send = 0;
+	void *ptr, *optr;
+	unsigned int len;
+
+	if (!sock_flag(sk, SOCK_ASYNC))
+		return -1;
+
+	if (k->event.event & KEVENT_SOCKET_SEND)
+		send = 1;
+	else if (!(k->event.event & KEVENT_SOCKET_RECV))
+		return -EINVAL;
+
+	/*
+	 * sk_prot->async_*() can return either number of bytes processed,
+	 * or negative error value, or zero if socket is closed.
+	 */
+
+	if (!send) {
+		page = pages[num];
+
+		optr = ptr = kmap_atomic(page, KM_IRQ0);
+		if (!ptr)
+			return -ENOMEM;
+
+		ptr += off % PAGE_SIZE;
+		len = min_t(unsigned int, PAGE_SIZE - (ptr - optr), size);
+
+		err = sk->sk_prot->async_recv(sk, ptr, len);
+
+		kunmap_atomic(optr, KM_IRQ0);
+	} else {
+		len = size;
+		err = sk->sk_prot->async_send(sk, pages, off, size);
+	}
+
+	if (err > 0) {
+		num++;
+		size -= err;
+		off += err;
+	}
+
+	k->event.ret_data[1] = off;
+	k->event.id.raw[1] = size;
+
+	if (err == 0 || (err < 0 && err != -EAGAIN))
+		ready = -1;
+
+	if (!size)
+		ready = 1;
+#if 0
+	printk("%s: sk=%p, k=%p, size=%4u, off=%4u, err=%3d, ready=%1d.\n",
+			__func__, sk, k, size, off, err, ready);
+#endif
+
+	return ready;
+}
+
+static int __init kevent_init_naio(void)
+{
+	struct kevent_callbacks *nc = &kevent_registered_callbacks[KEVENT_NAIO];
+
+	nc->enqueue = &kevent_naio_enqueue;
+	nc->dequeue = &kevent_naio_dequeue;
+	nc->callback = &kevent_naio_callback;
+	return 0;
+}
+late_initcall(kevent_init_naio);
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..20c9568
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,128 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	int rmem;
+	
+	if (k->event.event & KEVENT_SOCKET_RECV) {
+		int ret = 0;
+		
+		if ((rmem = atomic_read(&sk->sk_rmem_alloc)) > 0 || 
+				!skb_queue_empty(&sk->sk_receive_queue))
+			ret = 1;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			ret = 1;
+		if (ret)
+			return ret;
+	}
+	if ((k->event.event & KEVENT_SOCKET_ACCEPT) && 
+		(!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) || 
+		 	reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue))) {
+		k->event.ret_data[1] = reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err, fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	err = k->callbacks.callback(k);
+	if (err)
+		goto err_out_dequeue;
+
+	fput_light(file, fput_needed);
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket && !test_and_set_bit(SOCK_ASYNC_INUSE, &sk->sk_flags)) {
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+		sock_reset_flag(sk, SOCK_ASYNC_INUSE);
+	}
+}
+
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks *sc = &kevent_registered_callbacks[KEVENT_SOCKET];
+
+	sc->enqueue = &kevent_socket_enqueue;
+	sc->dequeue = &kevent_socket_dequeue;
+	sc->callback = &kevent_socket_callback;
+	return 0;
+}
+late_initcall(kevent_init_socket);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index aecddcc..493245b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -236,6 +236,60 @@ void skb_kill_datagram(struct sock *sk, 
 EXPORT_SYMBOL(skb_kill_datagram);
 
 /**
+ *	skb_copy_datagram - Copy a datagram.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: pointer to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram(const struct sk_buff *skb, int offset,
+			    void *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset, o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy(to, p, copy);
+				kunmap(page);
+			}
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+	return -EFAULT;
+}
+
+/**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from
@@ -530,6 +584,7 @@ unsigned int datagram_poll(struct file *
 
 EXPORT_SYMBOL(datagram_poll);
 EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram);
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 EXPORT_SYMBOL(skb_free_datagram);
 EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/sock.c b/net/core/sock.c
index 51fcfbc..9922373 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -617,6 +617,16 @@ #endif
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+#ifdef CONFIG_KEVENT_SOCKET
+		case SO_ASYNC_SOCK:
+			spin_lock_bh(&sk->sk_lock.slock);
+			if (valbool)
+				sock_set_flag(sk, SOCK_ASYNC);
+			else
+				sock_reset_flag(sk, SOCK_ASYNC);
+			spin_unlock_bh(&sk->sk_lock.slock);
+			break;
+#endif
 
 		case SO_PASSSEC:
 			if (valbool)
@@ -1406,6 +1416,7 @@ static void sock_def_wakeup(struct sock 
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1415,6 +1426,7 @@ static void sock_def_error_report(struct
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1424,6 +1436,7 @@ static void sock_def_readable(struct soc
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1443,6 +1456,7 @@ static void sock_def_write_space(struct 
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1559,8 +1573,10 @@ void fastcall release_sock(struct sock *
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f6a2d92..e878a41 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -206,6 +206,7 @@
  *					lingertime == 0 (RFC 793 ABORT Call)
  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
  *					csum_and_copy_from_user() if possible.
+ *	Evgeniy Polyakov	:	Network asynchronous IO.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -1085,6 +1086,301 @@ int tcp_read_sock(struct sock *sk, read_
 }
 
 /*
+ * Must be called with locked sock.
+ */
+int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+	int err = -EAGAIN;
+	ssize_t copied;
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, 1);
+	size_goal = tp->xmit_size_goal;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || sock_flag(sk, SOCK_DONE) ||
+			(sk->sk_state == TCP_CLOSE) || (atomic_read(&sk->sk_refcnt) == 1))
+		goto do_error;
+
+	while (len > 0) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		int size = min_t(size_t, len, PAGE_SIZE - offset);
+
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_pskb(sk, 0, 0,
+						   sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, tp, skb);
+			copy = size_goal;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (!sk_stream_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+		
+		if (can_coalesce) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk->sk_forward_alloc -= copy;
+		skb->ip_summed = CHECKSUM_HW;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(len -= copy))
+			goto out;
+
+		if (skb->len < mss_now)
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == sk->sk_send_head)
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		if (copied)
+			tcp_push(sk, tp, 0, mss_now, TCP_NAGLE_PUSH);
+
+		err = -EAGAIN;
+		goto do_error;
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, 0, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, 0, err);
+}
+
+/*
+ * Must be called with locked sock.
+ */
+int tcp_async_recv(struct sock *sk, void *dst, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	int copied_early = 0;
+
+	TCP_CHECK_TIMER(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	seq = &tp->copied_seq;
+
+	target = sock_rcvlowat(sk, 0, len);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		do {
+			if (!skb)
+				break;
+
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+				printk(KERN_INFO "async_recv bug: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+				break;
+			}
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (skb->h.th->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (skb->h.th->fin)
+				goto found_fin_ok;
+			skb = skb->next;
+		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+		if (copied)
+			break;
+
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+
+		if (sk->sk_state == TCP_CLOSE) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+				break;
+			}
+			break;
+		}
+
+		copied = -EAGAIN;
+		break;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+#ifdef CONFIG_NET_DMA
+		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+			tp->ucopy.dma_chan = get_softnet_dma();
+
+		if (tp->ucopy.dma_chan) {
+			tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+				tp->ucopy.dma_chan, skb, offset,
+				msg->msg_iov, used,
+				tp->ucopy.pinned_list);
+
+			if (tp->ucopy.dma_cookie < 0) {
+
+				printk(KERN_ALERT "dma_cookie < 0\n");
+
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+			if ((offset + used) == skb->len)
+				copied_early = 1;
+
+		} else
+#endif
+		{
+			err = skb_copy_datagram(skb, offset, dst, used);
+			if (err) {
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+		dst += used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+		if (used + offset < skb->len)
+			continue;
+
+		if (skb->h.th->fin)
+			goto found_fin_ok;
+		sk_eat_skb(sk, skb, copied_early);
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		sk_eat_skb(sk, skb, copied_early);
+		break;
+	} while (len > 0);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
+	return copied;
+
+out:
+	TCP_CHECK_TIMER(sk);
+	return err;
+}
+
+/*
  *	This routine copies from a sock struct into the user buffer.
  *
  *	Technical note: in 2.3 we work on _locked_ socket, so that
@@ -2342,6 +2638,8 @@ EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_async_recv);
+EXPORT_SYMBOL(tcp_async_send);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 738dad9..f70d045 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3112,6 +3112,7 @@ static void tcp_ofo_queue(struct sock *s
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
@@ -3955,7 +3956,8 @@ int tcp_rcv_established(struct sock *sk,
 			int copied_early = 0;
 
 			if (tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len) {
+			    len - tcp_header_len <= tp->ucopy.len &&
+			    !sock_async(sk)) {
 #ifdef CONFIG_NET_DMA
 				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
 					copied_early = 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6f39e8..ae4f23c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@ #include <linux/cache.h>
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -868,6 +869,7 @@ #endif
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
@@ -1108,24 +1110,30 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock_nested(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock_nested(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-		struct tcp_sock *tp = tcp_sk(sk);
-		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
-		if (tp->ucopy.dma_chan)
-			ret = tcp_v4_do_rcv(sk, skb);
-		else
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+				tp->ucopy.dma_chan = get_softnet_dma();
+			if (tp->ucopy.dma_chan)
+				ret = tcp_v4_do_rcv(sk, skb);
+			else
 #endif
-		{
-			if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			{
+				if (!tcp_prequeue(sk, skb))
+				ret = tcp_v4_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 
@@ -1849,6 +1857,8 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= tcp_v4_hash,
 	.unhash			= tcp_unhash,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 923989d..a5d3ac8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1230,22 +1230,28 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v6_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-                struct tcp_sock *tp = tcp_sk(sk);
-                if (tp->ucopy.dma_chan)
-                        ret = tcp_v6_do_rcv(sk, skb);
-                else
-#endif
-		{
-			if (!tcp_prequeue(sk, skb))
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (tp->ucopy.dma_chan)
 				ret = tcp_v6_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			else
+#endif
+			{
+				if (!tcp_prequeue(sk, skb))
+					ret = tcp_v6_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 	return ret ? -1 : 0;
@@ -1596,6 +1602,8 @@ struct proto tcpv6_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.hash			= tcp_v6_hash,
 	.unhash			= tcp_unhash,


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take3 4/4] kevent: poll/select() notifications. Timer notifications.
  2006-08-03  9:46                               ` [take3 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-03  9:46                                 ` Evgeniy Polyakov
  2006-08-03  9:43                                   ` Eric Dumazet
  2006-08-03  9:54                                 ` [take3 3/4] kevent: Network AIO, socket notifications Eric Dumazet
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:46 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similar to epoll and has the same issues (the callback
is invoked not from the internal state machine of the caller, but through
a process wakeup).

Timer notifications can be used for fine-grained per-process time
management, since interval timers are very inconvenient to use,
and they are limited.
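
For example, a periodic timer event could be requested roughly like this
(field usage inferred from kevent_timer_enqueue() below; illustrative only):

	struct ukevent uk;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;
	uk.id.raw[0] = 500;	/* period in msecs, see kevent_timer_enqueue() */
	/* ...then queue it with kevent_ctl(KEVENT_CTL_ADD) like any other type. */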

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..46cc8f0
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,217 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	unsigned long flags;
+	u32 revents, event;
+
+	revents = file->f_op->poll(file, NULL);
+	spin_lock_irqsave(&k->ulock, flags);
+	event = k->event.event;
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..9063cec
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,116 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_timer);


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take3 1/4] kevent: Core files.
  2006-08-03  9:45                         ` [take3 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-08-03  9:40                           ` Evgeniy Polyakov
@ 2006-08-03  9:46                           ` Evgeniy Polyakov
  2006-08-03  9:46                             ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
  2006-08-03 14:40                             ` [take3 1/4] kevent: Core files Eric Dumazet
  1 sibling, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:46 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

It might also include parts from other subsystems (like network-related
syscalls), so it is possible that it will not compile without other
patches applied.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..0af988a 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,7 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_aio_recv
+	.long sys_aio_send
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..e157ad4 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,8 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_aio_recv
+	.quad sys_aio_send
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..a76e50d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,14 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_aio_recv		318
+#define __NR_aio_send		319
+#define __NR_kevent_get_events	320
+#define __NR_kevent_ctl		321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 322
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..9a0b581 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,18 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_aio_recv		280
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send		281
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_kevent_get_events	282
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		283
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..cb09726
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,277 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. KEVENT_SOCKET_ACCEPT, KEVENT_INODE_CREATE, KEVENT_TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time a new event has been caught. */
+/* @enqueue is called each time a new event is queued. */
+/* @dequeue is called each time an event is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+struct kevent
+{
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one for each poll_wait() call made from ->poll().
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		wait_mutex;		/* Protects against simultaneous kevent_user waits. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+struct kevent *kevent_alloc(gfp_t mask);
+void kevent_free(struct kevent *k);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_user_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_user_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_user_stat_increase_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_user_stat_increase_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_user_stat_increase_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_user_stat_print(u)		({ (void) u;})
+#define kevent_user_stat_init(u)		({ (void) u;})
+#define kevent_user_stat_increase_im(u)		({ (void) u;})
+#define kevent_user_stat_increase_wait(u)	({ (void) u;})
+#define kevent_user_stat_increase_total(u)	({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
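
For orientation, a minimal userspace sketch of the intended calling sequence
(not part of the patch: the syscall numbers are the i386 ones from the
unistd.h hunk above, and the meaning of id.raw[0] for KEVENT_POLL - the fd
to watch - is an assumption, since the poll notification source is a
separate patch):

	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdio.h>
	#include <linux/kevent.h>	/* struct ukevent and the constants above */

	#define __NR_kevent_get_events	320
	#define __NR_kevent_ctl		321

	int main(void)
	{
		struct ukevent uk[1];
		int ctl_fd, err;

		/* KEVENT_CTL_INIT ignores the fd argument and returns a new control fd. */
		ctl_fd = syscall(__NR_kevent_ctl, 0, KEVENT_CTL_INIT, 0, NULL);
		if (ctl_fd < 0)
			return 1;

		memset(uk, 0, sizeof(uk));
		uk[0].type = KEVENT_POLL;
		uk[0].event = KEVENT_POLL_POLLIN;
		uk[0].id.raw[0] = 0;	/* assumed: the fd to watch (stdin here) */

		/* Returns the number of immediately ready/failed events copied back. */
		err = syscall(__NR_kevent_ctl, ctl_fd, KEVENT_CTL_ADD, 1, uk);
		if (err < 0)
			return 1;

		/* Wait for at least 1 and at most 1 event, timeout 1000 msec. */
		err = syscall(__NR_kevent_get_events, ctl_fd, 1, 1, 1000, uk, 0);
		if (err > 0 && !(uk[0].ret_flags & KEVENT_RET_BROKEN))
			printf("event 0x%x fired\n", uk[0].event);
		return 0;
	}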
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..bd891f0
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,12 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	unsigned int		qlen;			/* Number of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
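
To illustrate how an origin is expected to embed and drive this structure (a
sketch only; "struct my_origin" is made up here, the real users are the inode
and file hooks in the following patches):

	struct my_origin {
		struct kevent_storage	st;	/* embedded kevent storage */
		/* ... origin's own data ... */
	};

	int my_origin_create(struct my_origin *o)
	{
		/* Tie the storage to its originator. */
		return kevent_storage_init(o, &o->st);
	}

	void my_origin_data_arrived(struct my_origin *o)
	{
		/* Requeue every queued kevent subscribed to this event mask. */
		kevent_storage_ready(&o->st, NULL, KEVENT_SOCKET_RECV);
	}

	void my_origin_destroy(struct my_origin *o)
	{
		/* Breaks and wakes up everything still queued. */
		kevent_storage_fini(&o->st);
	}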
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..143f3b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,9 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..88b35af
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,57 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the kernel event queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO callback
+	  invocations, advanced timer notifications and other kernel object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default n
+	help
+	  This option turns kevent_user statistics collection on.
+	  The data includes the total number of kevents, the number of kevents
+	  which were ready immediately at insertion time and the number of kevents
+	  which were removed through readiness completion. It is printed each time
+	  a control kevent descriptor is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  socket operations, like new packet reception, readiness for accept
+	  and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables kevent based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO operations.
+	  Only AIO read is currently supported.
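
For reference, a .config fragment with everything above enabled would look
like this (sample only, not part of the patch):

	CONFIG_KEVENT=y
	# CONFIG_KEVENT_USER_STAT is not set
	CONFIG_KEVENT_SOCKET=y
	CONFIG_KEVENT_INODE=y
	CONFIG_KEVENT_TIMER=y
	CONFIG_KEVENT_POLL=y
	CONFIG_KEVENT_NAIO=y
	CONFIG_KEVENT_AIO=y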
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..8c71ca9
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,248 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+static kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if this event is ready immediately,
+ * a negative value in case of error and zero if the event has been queued.
+ * The ->enqueue() callback must increase the origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * The ->dequeue() callback must decrease the origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used; kevent_enqueue() would then fail
+ * to add this kevent into the origin's queue and would set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->kevent_entry.next = LIST_POISON1;
+	k->storage_entry.next = LIST_POISON1;
+	k->ready_entry.next = LIST_POISON1;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail(&k->storage_entry, &st->list);
+	st->qlen++;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way;
+ * it must be called before that counter is dropped, so the storage is still valid.
+ * It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->storage_entry.next != LIST_POISON1) {
+		list_del(&k->storage_entry);
+		st->qlen--;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int err, rem = 0;
+	unsigned long flags;
+
+	err = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (err > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (err < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!err)
+		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (err) {
+		if (rem) {
+			list_del(&k->storage_entry);
+			k->st->qlen--;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (k->ready_entry.next == LIST_POISON1) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k, *n;
+
+	spin_lock(&st->lock);
+	list_for_each_entry_safe(k, n, &st->list, storage_entry) {
+		if (ready_callback)
+			ready_callback(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	spin_unlock(&st->lock);
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	st->qlen = 0;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+struct kevent *kevent_alloc(gfp_t mask)
+{
+	return kmem_cache_alloc(kevent_cache, mask);
+}
+
+void kevent_free(struct kevent *k)
+{
+	kmem_cache_free(kevent_cache, k);
+}
+
+static int __init kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, 0, NULL, NULL);
+	if (!kevent_cache)
+		panic("kevent: Unable to create a cache.\n");
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
+
+late_initcall(kevent_sys_init);
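
A per-type notification module is expected to fill its slot in
kevent_registered_callbacks[] at init time. A sketch of such a registration
(illustrative only - the real timer module is a separate patch, and with two
late_initcall()s the ordering against kevent_sys_init() depends on link
order):

	static int kevent_timer_enqueue(struct kevent *k);
	static int kevent_timer_dequeue(struct kevent *k);
	static int kevent_timer_callback(struct kevent *k);

	static int __init kevent_init_timer(void)
	{
		struct kevent_callbacks *tc =
			&kevent_registered_callbacks[KEVENT_TIMER];

		tc->enqueue = &kevent_timer_enqueue;
		tc->dequeue = &kevent_timer_dequeue;
		tc->callback = &kevent_timer_callback;

		return 0;
	}
	late_initcall(kevent_init_timer);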
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..87ac367
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,876 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <asm/io.h>
+
+static struct class *kevent_user_class;
+static char kevent_name[] = "kevent";
+static int kevent_user_major;
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* Arbitrary magic for this pseudo filesystem. */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef, mnt);	
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	unsigned int *idx;
+	
+	idx = (unsigned int *)u->pring[0];
+	idx[0] = num;
+}
+
+/*
+ * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * Take that into account if you want to change the size of struct ukevent.
+ */
+#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int *idx_ptr, idx, pidx, off;
+	struct ukevent *ukev;
+	
+	idx_ptr = (unsigned int *)k->user->pring[0];
+	idx = idx_ptr[0];
+
+	pidx = idx/KEVENTS_ON_PAGE;
+	off = idx%KEVENTS_ON_PAGE;
+
+	if (pidx == 0)
+		ukev = (struct ukevent *)(k->user->pring[pidx] + sizeof(unsigned int));
+	else
+		ukev = (struct ukevent *)(k->user->pring[pidx]);
+
+	memcpy(&ukev[off], &k->event, sizeof(struct ukevent));
+
+	idx++;
+	if (idx >= KEVENT_MAX_EVENTS)
+		idx = 0;
+
+	idx_ptr[0] = idx;
+}
+
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	for (i=0; i<pnum; ++i) {
+		u->pring[i] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[i])
+			break;
+	}
+
+	if (i != pnum) {
+		pnum = i;
+		goto err_out_free;
+	}
+
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+	
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	u->ready_num = 0;
+	kevent_user_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	u->kevent_num = 0;
+	
+	mutex_init(&u->ctl_mutex);
+	mutex_init(&u->wait_mutex);
+	init_waitqueue_head(&u->wait);
+	u->max_ready_num = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_user_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start, psize;
+	int pnum = size/PAGE_SIZE, i;
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	psize = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE);
+
+	if (size + vma->vm_pgoff*PAGE_SIZE != psize)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	for (i=0; i<pnum; ++i) {
+		if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[i+vma->vm_pgoff]), PAGE_SIZE,
+					vma->vm_page_prot))
+			return -EAGAIN;
+		start += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->ready_entry.next != LIST_POISON1) {
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	kevent_free(k);
+}
+
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from the user's list of all events,
+ * dequeue it from its storage and decrease the user's reference counter,
+ * since this kevent no longer exists. That is also why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k;
+	int found = 0;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			found = 1;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return (found)?k:NULL;
+}
+
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * No entry can be added to or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EINVAL;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EINVAL;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kevent_alloc(GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kevent_free(k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_user_stat_increase_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 0);
+	} 
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one 
+ * and add them into appropriate kevent_storages, 
+ * e.g. sockets, inodes and so on...
+ * If something goes wrong, all events will be dequeued and
+ * a negative error will be returned.
+ * On success the number of immediately finished events is returned, and the
+ * array of finished events (struct ukevent) is copied back to the beginning
+ * of the user-supplied buffer. The user must run through that array and check the
+ * ret_flags field of each ukevent structure to determine whether it fired or failed.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -ENFILE;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_user_stat_increase_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EINVAL;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_user_stat_increase_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EINVAL;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or until at least @min_nr
+ * events are ready; if the timeout is zero, it waits no more than 1 second or
+ * until at least one event is ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int cerr = 0, num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (timeout)
+			wait_event_interruptible_timeout(u->wait, 
+				u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+		else
+			wait_event_interruptible_timeout(u->wait, 
+					u->ready_num > 0, msecs_to_jiffies(1000));
+	}
+	
+	mutex_lock(&u->ctl_mutex);
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_user_stat_increase_wait(u);
+	}
+	mutex_unlock(&u->ctl_mutex);
+
+	return (cerr)?cerr:num;
+}
+
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL, fput_needed;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL, fput_needed;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int __devinit kevent_user_init(void)
+{
+	struct class_device *dev;
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
+	if (kevent_user_major < 0) {
+		printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", 
+				kevent_name, kevent_user_major);
+		return -ENODEV;
+	}
+
+	kevent_user_class = class_create(THIS_MODULE, "kevent");
+	if (IS_ERR(kevent_user_class)) {
+		printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_user_class));
+		err = PTR_ERR(kevent_user_class);
+		goto err_out_unregister;
+	}
+
+	dev = class_device_create(kevent_user_class, NULL, 
+			MKDEV(kevent_user_major, 0), NULL, kevent_name);
+	if (IS_ERR(dev)) {
+		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
+				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
+		err = PTR_ERR(dev);
+		goto err_out_class_destroy;
+	}
+
+	printk("KEVENT subsystem: chardev helper: major=%d.\n", kevent_user_major);
+
+	return 0;
+
+err_out_class_destroy:
+	class_destroy(kevent_user_class);
+err_out_unregister:
+	unregister_chrdev(kevent_user_major, kevent_name);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	class_device_destroy(kevent_user_class, MKDEV(kevent_user_major, 0));
+	class_destroy(kevent_user_class);
+	unregister_chrdev(kevent_user_major, kevent_name);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
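
Given the ring layout used by kevent_user_ring_add_event() above (a 4-byte
running index at the start of the first page, followed by packed struct
ukevent entries, KEVENTS_ON_PAGE per page), a userspace reader could walk the
mapping like this (a sketch: it assumes the whole ring was mmap()ed read-only
at offset 0 and leaves out synchronization with the kernel-side writer):

	/* The kernel stores the next write position in the first 4 bytes. */
	unsigned int kevent_ring_index(void *ring)
	{
		return *(unsigned int *)ring;
	}

	struct ukevent *kevent_ring_entry(void *ring, unsigned int idx,
					  unsigned int page_size)
	{
		unsigned int on_page = page_size / sizeof(struct ukevent);
		unsigned int pidx = idx / on_page, off = idx % on_page;
		char *page = (char *)ring + pidx * page_size;

		/* The index shares the first page with the first events. */
		if (pidx == 0)
			page += sizeof(unsigned int);
		return (struct ukevent *)page + off;
	}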
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8843cca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,11 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);



* [take3 2/4] kevent: AIO, aio_sendfile() implementation.
  2006-08-03  9:46                           ` [take3 1/4] kevent: Core files Evgeniy Polyakov
@ 2006-08-03  9:46                             ` Evgeniy Polyakov
  2006-08-03  9:46                               ` [take3 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
  2006-08-03 17:04                               ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Badari Pulavarty
  2006-08-03 14:40                             ` [take3 1/4] kevent: Core files Eric Dumazet
  1 sibling, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:46 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


AIO, aio_sendfile() implementation.

This patch includes asynchronous propagation of a file's data into the VFS
cache and an aio_sendfile() implementation.
Network aio_sendfile() works lazily - it asynchronously populates pages
into the VFS cache (which can be used for various tricks with adaptive
readahead) and then uses the usual ->sendfile() callback.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
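
For orientation, the request encoding consumed by kevent_aio_enqueue() below
maps onto struct ukevent like this (a sketch reusing the control fd and
syscall numbers from the first patch; note that ret_data[0] - nominally
return data - is used here as an input carrying the destination socket):

	struct ukevent uk;
	int err;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_AIO;
	uk.event = KEVENT_AIO_BIO;
	uk.id.raw[0] = file_fd;		/* source file to read */
	uk.id.raw[1] = nr_pages;	/* pages to process per pass */
	uk.ret_data[0] = sock_fd;	/* destination socket */

	err = syscall(__NR_kevent_ctl, ctl_fd, KEVENT_CTL_ADD, 1, &uk);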

diff --git a/fs/bio.c b/fs/bio.c
index 6a0b9ad..a3ee530 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -119,7 +119,7 @@ void bio_free(struct bio *bio, struct bi
 /*
  * default destructor for a bio allocated with bio_alloc_bioset()
  */
-static void bio_fs_destructor(struct bio *bio)
+void bio_fs_destructor(struct bio *bio)
 {
 	bio_free(bio, fs_bio_set);
 }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fb4d322..9316551 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -685,6 +685,7 @@ ext2_writepages(struct address_space *ma
 }
 
 const struct address_space_operations ext2_aops = {
+	.get_block		= ext2_get_block,
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c5ee9f0..d9210d4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1699,6 +1699,7 @@ static int ext3_journalled_set_page_dirt
 }
 
 static const struct address_space_operations ext3_ordered_aops = {
+	.get_block	= ext3_get_block,
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
diff --git a/fs/file_table.c b/fs/file_table.c
index 0131ba0..b649317 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -112,6 +112,9 @@ struct file *get_empty_filp(void)
 	if (security_file_alloc(f))
 		goto fail_sec;
 
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_init(f, &f->st);
+#endif
 	tsk = current;
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_set(&f->f_count, 1);
@@ -159,6 +162,9 @@ void fastcall __fput(struct file *file)
 	might_sleep();
 
 	fsnotify_close(file);
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_fini(&file->st);
+#endif
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index 0bf9f04..fdbd0ba 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@ #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -165,12 +166,18 @@ #endif
 		}
 		memset(&inode->u, 0, sizeof(inode->u));
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 12dfdcf..f8dca72 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3001,6 +3001,7 @@ int reiserfs_setattr(struct dentry *dent
 }
 
 const struct address_space_operations reiserfs_address_space_operations = {
+	.get_block = reiserfs_get_block,
 	.writepage = reiserfs_writepage,
 	.readpage = reiserfs_readpage,
 	.readpages = reiserfs_readpages,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2561020..65eb438 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -240,6 +240,9 @@ #include <linux/mutex.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_KEVENT
+#include <linux/kevent_storage.h>
+#endif
 
 struct hd_geometry;
 struct iovec;
@@ -352,6 +355,8 @@ struct address_space;
 struct writeback_control;
 
 struct address_space_operations {
+	int  (*get_block)(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create);
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
 	void (*sync_page)(struct page *);
@@ -546,6 +551,10 @@ #ifdef CONFIG_INOTIFY
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -698,6 +707,9 @@ #ifdef CONFIG_EPOLL
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index cc5dec7..0acc8db 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@ #ifdef __KERNEL__
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/audit.h>
 
 /*
@@ -79,6 +80,7 @@ static inline void fsnotify_nameremove(s
 		isdir = IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
 	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	kevent_inode_notify_parent(dentry, KEVENT_INODE_REMOVE);
 }
 
 /*
@@ -88,6 +90,7 @@ static inline void fsnotify_inoderemove(
 {
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
+	kevent_inode_remove(inode);
 }
 
 /*
@@ -96,6 +99,7 @@ static inline void fsnotify_inoderemove(
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
@@ -107,6 +111,7 @@ static inline void fsnotify_create(struc
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
 				  dentry->d_name.name, dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
diff --git a/kernel/kevent/kevent_aio.c b/kernel/kevent/kevent_aio.c
new file mode 100644
index 0000000..9cbba69
--- /dev/null
+++ b/kernel/kevent/kevent_aio.c
@@ -0,0 +1,584 @@
+/*
+ * 	kevent_aio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+
+#define KEVENT_AIO_DEBUG
+
+#ifdef KEVENT_AIO_DEBUG
+#define dprintk(f, a...) printk(f, ##a)
+#else
+#define dprintk(f, a...) do {} while (0)
+#endif
+
+struct kevent_aio_private
+{
+	int			pg_num;
+	size_t			size;
+	loff_t			offset;
+	loff_t			processed;
+	atomic_t		bio_page_num;
+	struct completion	bio_complete;
+	struct file		*file, *sock;
+	struct work_struct	work;
+};
+
+static int kevent_aio_dequeue(struct kevent *k);
+static int kevent_aio_enqueue(struct kevent *k);
+static int kevent_aio_callback(struct kevent *k);
+
+extern void bio_fs_destructor(struct bio *bio);
+
+static void kevent_aio_bio_destructor(struct bio *bio)
+{
+	struct kevent *k = bio->bi_private;
+	struct kevent_aio_private *priv = k->priv;
+
+	dprintk("%s: bio=%p, num=%u, k=%p, inode=%p.\n", __func__, bio, bio->bi_vcnt, k, k->st->origin);
+	schedule_work(&priv->work);
+	bio_fs_destructor(bio);
+}
+
+static void kevent_aio_bio_put(struct kevent *k)
+{
+	struct kevent_aio_private *priv = k->priv;
+	
+	if (atomic_dec_and_test(&priv->bio_page_num))
+		complete(&priv->bio_complete);
+}
+
+static int kevent_mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct kevent *k = bio->bi_private;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_page(page);
+		kevent_aio_bio_put(k);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+static inline struct bio *kevent_mpage_bio_submit(int rw, struct bio *bio)
+{
+	if (bio) {
+		bio->bi_end_io = kevent_mpage_end_io_read;
+		dprintk("%s: bio=%p, num=%u.\n", __func__, bio, bio->bi_vcnt);
+		submit_bio(READ, bio);
+	}
+	return NULL;
+}
+
+static struct bio *kevent_mpage_readpage(struct kevent *k, struct bio *bio,
+		struct page *page, unsigned nr_pages, get_block_t get_block, 
+		loff_t *offset, sector_t *last_block_in_bio)
+{
+	struct inode *inode = k->st->origin;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	struct block_device *bdev = NULL;
+	unsigned first_hole = blocks_per_page;
+	unsigned page_block;
+	sector_t blocks[MAX_BUF_PER_PAGE];
+	struct buffer_head bh;
+	int fully_mapped = 1, length;
+
+	block_in_file = (*offset + blocksize - 1) >> blkbits;
+	last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
+
+	bh.b_page = page;
+	for (page_block = 0; page_block < blocks_per_page; page_block++, block_in_file++) {
+		bh.b_state = 0;
+		if (block_in_file < last_block) {
+			if (get_block(inode, block_in_file, &bh, 0))
+				goto confused;
+		}
+
+		if (!buffer_mapped(&bh)) {
+			fully_mapped = 0;
+			if (first_hole == blocks_per_page)
+				first_hole = page_block;
+			continue;
+		}
+
+		/* some filesystems will copy data into the page during
+		 * the get_block call, in which case we don't want to
+		 * read it again.  map_buffer_to_page copies the data
+		 * we just collected from get_block into the page's buffers
+		 * so readpage doesn't have to repeat the get_block call
+		 */
+		if (buffer_uptodate(&bh)) {
+			BUG();
+			//map_buffer_to_page(page, &bh, page_block);
+			goto confused;
+		}
+	
+		if (first_hole != blocks_per_page)
+			goto confused;		/* hole -> non-hole */
+
+		/* Contiguous blocks? */
+		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
+			goto confused;
+		blocks[page_block] = bh.b_blocknr;
+		bdev = bh.b_bdev;
+	}
+
+	if (!bdev)
+		goto confused;
+
+	if (first_hole != blocks_per_page) {
+		char *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + (first_hole << blkbits), 0,
+				PAGE_CACHE_SIZE - (first_hole << blkbits));
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (first_hole == 0) {
+			SetPageUptodate(page);
+			goto out;
+		}
+	} else if (fully_mapped) {
+		SetPageMappedToDisk(page);
+	}
+	
+	/*
+	 * This page will go to BIO.  Do we need to send this BIO off first?
+	 */
+	if (bio && (*last_block_in_bio != blocks[0] - 1))
+		bio = kevent_mpage_bio_submit(READ, bio);
+
+alloc_new:
+	if (bio == NULL) {
+		nr_pages = min_t(unsigned, nr_pages, bio_get_nr_vecs(bdev));
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		if (bio == NULL)
+			goto confused;
+
+		bio->bi_destructor = kevent_aio_bio_destructor;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = blocks[0] << (blkbits - 9);
+		bio->bi_private = k;
+	}
+
+	length = first_hole << blkbits;
+	if (bio_add_page(bio, page, length, 0) < length) {
+		bio = kevent_mpage_bio_submit(READ, bio);
+		dprintk("%s: Failed to add a page: nr_pages=%d, length=%d, page=%p.\n", 
+				__func__, nr_pages, length, page);
+		goto alloc_new;
+	}
+	
+	dprintk("%s: bio=%p, b=%d, m=%d, u=%d, nr_pages=%d, offset=%Lu, "
+			"size=%Lu. page_block=%u, page=%p.\n", 
+			__func__, bio, buffer_boundary(&bh), buffer_mapped(&bh), 
+			buffer_uptodate(&bh), nr_pages, *offset, i_size_read(inode), 
+			page_block, page);
+	
+	*offset = *offset + length;
+
+	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
+		bio = kevent_mpage_bio_submit(READ, bio);
+	else
+		*last_block_in_bio = blocks[blocks_per_page - 1];
+
+out:
+	return bio;
+
+confused:
+	dprintk("%s: confused. bio=%p, nr_pages=%d.\n", __func__, bio, nr_pages);
+	if (bio)
+		bio = kevent_mpage_bio_submit(READ, bio);
+	kevent_aio_bio_put(k);
+	SetPageUptodate(page);
+
+	if (nr_pages == 1) {
+		struct kevent_aio_private *priv = k->priv;
+
+		wait_for_completion(&priv->bio_complete);
+		kevent_storage_ready(k->st, NULL, KEVENT_AIO_BIO);
+		init_completion(&priv->bio_complete);
+		complete(&priv->bio_complete);
+	}
+	goto out;
+}
+
+static int kevent_aio_alloc_cached_page(struct kevent *k, struct page **cached_page)
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct address_space *mapping = priv->file->f_mapping;
+	struct page *page;
+	int err = 0;
+	pgoff_t index = priv->offset >> PAGE_CACHE_SHIFT;
+
+	page = page_cache_alloc_cold(mapping);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	if (err) {
+		if (err == -EEXIST)
+			err = 0;
+		page_cache_release(page);
+		goto out;
+	}
+
+	dprintk("%s: page=%p, offset=%Lu, processed=%Lu, index=%lu, size=%zu.\n",
+			__func__, page, priv->offset, priv->processed, index, priv->size);
+
+	*cached_page = page;
+
+out:
+	return err;
+}
+
+static int kevent_mpage_readpages(struct kevent *k, int first,
+		int (* get_block)(struct inode *inode, sector_t iblock,	
+			struct buffer_head *bh_result, int create))
+{
+	struct bio *bio = NULL;
+	struct kevent_aio_private *priv = k->priv;
+	sector_t last_block_in_bio = 0;
+	int i, err = 0;
+
+	atomic_set(&priv->bio_page_num, priv->pg_num);
+
+	for (i=first; i<priv->pg_num; ++i) {
+		struct page *page = NULL;
+		
+		err = kevent_aio_alloc_cached_page(k, &page);
+		if (err)
+			break;
+
+		/*
+		 * If there is no error and the page is NULL, this means
+		 * that someone else added a page into the VFS cache.
+		 * We will not process this page, since whoever added
+		 * the page must read its data from disk.
+		 */
+		if (!page)
+			continue;
+
+		bio = kevent_mpage_readpage(k, bio, page, priv->pg_num - i, 
+				get_block, &priv->offset, &last_block_in_bio);
+	}
+
+	if (bio)
+		bio = kevent_mpage_bio_submit(READ, bio);
+
+	return err;
+}
+
+static ssize_t kevent_aio_vfs_read_actor(struct kevent *k, struct page *kpage, size_t len)
+{
+	struct kevent_aio_private *priv = k->priv;
+	ssize_t ret;
+	
+	ret = priv->sock->f_op->sendpage(priv->sock, kpage, 0, len, &priv->sock->f_pos, 1);
+
+	dprintk("%s: k=%p, page=%p, len=%zu, ret=%zd.\n", 
+			__func__, k, kpage, len, ret);
+
+	return ret;
+}
+
+static int kevent_aio_vfs_read(struct kevent *k, 
+		ssize_t (*actor)(struct kevent *, struct page *, size_t))
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct address_space *mapping;
+	size_t isize; ssize_t actor_size;	/* signed: the actor may return a negative error */
+	int i;
+
+	mapping = priv->file->f_mapping;
+	isize = i_size_read(priv->file->f_dentry->d_inode);
+	
+	dprintk("%s: start: size_left=%zd, offset=%Lu, processed=%Lu, isize=%zu, pg_num=%d.\n", 
+			__func__, priv->size, priv->offset, priv->processed, isize, priv->pg_num);
+
+	for (i=0; i<priv->pg_num && priv->size; ++i) {
+		struct page *page;
+		size_t nr = PAGE_CACHE_SIZE;
+
+		cond_resched();
+		page = find_get_page(mapping, priv->processed >> PAGE_CACHE_SHIFT);
+		if (unlikely(page == NULL))
+			break;
+		if (!PageUptodate(page)) {
+			dprintk("%s: %2d: page=%p, processed=%Lu, size=%zu not uptodate.\n", 
+					__func__, i, page, priv->processed, priv->size);
+			page_cache_release(page);
+			break;
+		}
+
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		mark_page_accessed(page);
+
+		if (nr + priv->processed > isize)
+			nr = isize - priv->processed;
+		if (nr > priv->size)
+			nr = priv->size;
+
+		actor_size = actor(k, page, nr);
+		if (actor_size < 0) {
+			page_cache_release(page);
+			break;
+		}
+
+		page_cache_release(page);
+
+		priv->processed += actor_size;
+		priv->size -= actor_size;
+	}
+
+	if (!priv->size)
+		i = priv->pg_num;
+
+	if (i != priv->pg_num)
+		priv->offset = priv->processed;
+
+	dprintk("%s: end: next=%d, num=%d, left=%zu, offset=%Lu, procesed=%Lu, ret=%d.\n", 
+			__func__, i, priv->pg_num, 
+			priv->size, priv->offset, priv->processed, i);
+
+	return i;
+}
+
+static int kevent_aio_callback(struct kevent *k)
+{
+	return 1;
+}
+
+static void kevent_aio_work(void *data)
+{
+	struct kevent *k = data;
+	struct kevent_aio_private *priv = k->priv;
+	struct inode *inode = k->st->origin;
+	struct address_space *mapping = priv->file->f_mapping;
+	int err, ready = 0, num;
+
+	dprintk("%s: k=%p, priv=%p, inode=%p.\n", __func__, k, priv, inode);
+
+	init_completion(&priv->bio_complete);
+	
+	num = ready = kevent_aio_vfs_read(k, &kevent_aio_vfs_read_actor);
+	if (ready > 0 && ready != priv->pg_num)
+		ready = 0;
+
+	dprintk("%s: k=%p, ready=%d, size=%zd.\n", __func__, k, ready, priv->size);
+
+	if (!ready) {
+		err = kevent_mpage_readpages(k, num, mapping->a_ops->get_block);
+		if (err) {
+			dprintk("%s: kevent_mpage_readpages failed: err=%d, k=%p, size=%zd.\n",
+					__func__, err, k, priv->size);
+			kevent_break(k);
+			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+		}
+	} else {
+		dprintk("%s: next k=%p, size=%zd.\n", __func__, k, priv->size);
+
+		if (priv->size)
+			schedule_work(&priv->work);
+		else {
+			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+		}
+
+		complete(&priv->bio_complete);
+	}
+}
+
+static int kevent_aio_enqueue(struct kevent *k)
+{
+	int err;
+	struct file *file, *sock;
+	struct inode *inode;
+	struct kevent_aio_private *priv;
+	struct address_space *mapping;
+	int fd = k->event.id.raw[0];
+	int num = k->event.id.raw[1];
+	int s = k->event.ret_data[0];
+	size_t size;
+
+	err = -ENODEV;
+	file = fget(fd);
+	if (!file)
+		goto err_out_exit;
+	
+	sock = fget(s);
+	if (!sock)
+		goto err_out_fput_file;
+	
+	mapping = file->f_mapping;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode || !mapping->a_ops->get_block)
+		goto err_out_fput;
+	if (!sock->f_dentry || !sock->f_dentry->d_inode)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	size = i_size_read(inode);
+	
+	num = (size > num << PAGE_SHIFT) ? num : (size >> PAGE_SHIFT);
+
+	err = -ENOMEM;
+	priv = kzalloc(sizeof(struct kevent_aio_private), GFP_KERNEL);
+	if (!priv)
+		goto err_out_iput;
+
+	priv->pg_num = num;
+	priv->size = size;
+	priv->offset = 0;
+	priv->file = file;
+	priv->sock = sock;
+	INIT_WORK(&priv->work, kevent_aio_work, k);
+	k->priv = priv;
+
+	dprintk("%s: read: k=%p, priv=%p, inode=%p, num=%u, size=%zu, off=%Lu.\n", 
+			__func__, k, priv, inode, priv->pg_num, priv->size, priv->offset);
+	
+	init_completion(&priv->bio_complete);
+	kevent_storage_enqueue(&inode->st, k);
+	schedule_work(&priv->work);
+	
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(sock);
+err_out_fput_file:
+	fput(file);
+err_out_exit:
+
+	return err;
+}
+
+static int kevent_aio_dequeue(struct kevent *k)
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct inode *inode = k->st->origin;
+	struct file *file = priv->file;
+	struct file *sock = priv->sock;
+
+	kevent_storage_dequeue(k->st, k);
+	flush_scheduled_work();
+	wait_for_completion(&priv->bio_complete);
+
+	kfree(k->priv);
+	k->priv = NULL;
+	iput(inode);
+	fput(file);
+	fput(sock);
+
+	return 0;
+}
+
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, 
+		size_t size, unsigned flags)
+{
+	struct ukevent ukread, uksend;
+	struct kevent_user *u;
+	struct file *file;
+	int err, fput_needed;
+	int num = (flags & 7) ? (flags & 7) : 8;
+
+	memset(&ukread, 0, sizeof(struct ukevent));
+	memset(&uksend, 0, sizeof(struct ukevent));
+
+	ukread.type = KEVENT_AIO;
+	ukread.event = KEVENT_AIO_BIO;
+
+	ukread.id.raw[0] = fd;
+	ukread.id.raw[1] = num;
+	ukread.ret_data[0] = s;
+
+	dprintk("%s: fd=%d, s=%d, num=%d.\n", __func__, fd, s, num);
+
+	file = fget_light(ctl_fd, &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	err = kevent_user_add_ukevent(&ukread, u);
+	if (err < 0)
+		goto err_out_fput;
+
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int __init kevent_init_aio(void)
+{
+	struct kevent_callbacks *ac = &kevent_registered_callbacks[KEVENT_AIO];
+
+	ac->enqueue = &kevent_aio_enqueue;
+	ac->dequeue = &kevent_aio_dequeue;
+	ac->callback = &kevent_aio_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_aio);
diff --git a/kernel/kevent/kevent_inode.c b/kernel/kevent/kevent_inode.c
new file mode 100644
index 0000000..1626067
--- /dev/null
+++ b/kernel/kevent/kevent_inode.c
@@ -0,0 +1,114 @@
+/*
+ * 	kevent_inode.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/fs.h>
+
+static int kevent_inode_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err, fput_needed;
+
+	file = fget_light(k->event.id.raw[0], &fput_needed);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+	
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	fput_light(file, fput_needed);
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput_light(file, fput_needed);
+	return err;
+}
+
+static int kevent_inode_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+static int kevent_inode_callback(struct kevent *k)
+{
+	return 1;
+}
+
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+	struct dentry *parent;
+	struct inode *inode;
+	
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+
+	dget(parent);
+	spin_unlock(&dentry->d_lock);
+	kevent_inode_notify(inode, KEVENT_INODE_REMOVE);
+	dput(parent);
+}
+	
+void kevent_inode_remove(struct inode *inode)
+{
+	kevent_storage_fini(&inode->st);
+}
+	
+void kevent_inode_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_inode(void)
+{
+	struct kevent_callbacks *ic = &kevent_registered_callbacks[KEVENT_INODE];
+
+	ic->enqueue = &kevent_inode_enqueue;
+	ic->dequeue = &kevent_inode_dequeue;
+	ic->callback = &kevent_inode_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_inode);


^ permalink raw reply related	[flat|nested] 160+ messages in thread
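
For experimenting with this interface from userspace, a minimal wrapper sketch
follows. It is hypothetical, not part of the patch: the syscall number must be
taken from the patched <asm/unistd.h> (supplied here via -D at build time),
and ctl_fd is assumed to be a kevent control descriptor set up through the
kevent_ctl() interface from the core patch.

	/* build with: cc -D__NR_aio_sendfile=<nr from patched asm/unistd.h> ... */
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_aio_sendfile
	#error "define __NR_aio_sendfile to match the patched kernel"
	#endif

	static long aio_sendfile(int ctl_fd, int file_fd, int sock_fd,
				 size_t size, unsigned batch)
	{
		/* the low three bits of flags select the page batch; 0 means 8 */
		return syscall(__NR_aio_sendfile, ctl_fd, file_fd, sock_fd,
			       size, batch & 7);
	}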

* Re: [take3 4/4] kevent: poll/select() notifications. Timer notifications.
  2006-08-03  9:43                                   ` Eric Dumazet
@ 2006-08-03  9:48                                     ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03  9:48 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 03, 2006 at 11:43:02AM +0200, Eric Dumazet (dada1@cosmosbay.com) wrote:
> On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote:
> > poll/select() notifications. Timer notifications.
> >
> > +++ b/kernel/kevent/kevent_poll.c
> 
> > +static int kevent_poll_wait_callback(wait_queue_t *wait,
> > +		unsigned mode, int sync, void *key)
> > +{
> > +	struct kevent_poll_wait_container *cont =
> > +		container_of(wait, struct kevent_poll_wait_container, wait);
> > +	struct kevent *k = cont->k;
> > +	struct file *file = k->st->origin;
> > +	unsigned long flags;
> > +	u32 revents, event;
> > +
> > +	revents = file->f_op->poll(file, NULL);
> > +	spin_lock_irqsave(&k->ulock, flags);
> > +	event = k->event.event;
> > +	spin_unlock_irqrestore(&k->ulock, flags);
> 
> Not sure why you take a spinlock just to read a u32

You are right, it is not needed there.
Thank you.

> Eric

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread
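
Concretely, the agreed simplification is just to drop the lock pair around
that read; a sketch of the change against the quoted callback (illustration
only - the rest of the function is unchanged):

	-	unsigned long flags;
	 	u32 revents, event;
	 
	 	revents = file->f_op->poll(file, NULL);
	-	spin_lock_irqsave(&k->ulock, flags);
	 	event = k->event.event;
	-	spin_unlock_irqrestore(&k->ulock, flags);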

* Re: [take3 3/4] kevent: Network AIO, socket notifications.
  2006-08-03  9:46                               ` [take3 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
  2006-08-03  9:46                                 ` [take3 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-03  9:54                                 ` Eric Dumazet
  2006-08-03 10:13                                   ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Eric Dumazet @ 2006-08-03  9:54 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote:
> Network AIO, socket notifications.
>
> This patchset includes socket notifications and network asynchronous IO.
> Network AIO is based on kevent and works as usual kevent storage on top
> of inode.
>  						          (3 * TCP_RTO_MIN) / 4,
> diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
+
> +static int kevent_naio_enqueue(struct kevent *k)
> +{
> +	int err, i;
> +	struct page **page;
> +	void *addr;
> +	unsigned int size = k->event.id.raw[1];
> +	int num = size/PAGE_SIZE;
> +	struct file *file;
> +	struct sock *sk = NULL;
> +	int fput_needed;
> +
> +	file = fget_light(k->event.id.raw[0], &fput_needed);
> +	if (!file)
> +		return -ENODEV;
> +
> +	err = -EINVAL;
> +	if (!file->f_dentry || !file->f_dentry->d_inode)
> +		goto err_out_fput;

How can you be 100% sure this file is actually a socket here ?
(Another thread could close the fd and this fd can now point to another file)

You should do
if (file->f_op != &socket_file_ops)
	goto err_out_fput;
sk = file->private_data;  /* set in sock_map_fd */ 

> +
> +	sk = SOCKET_I(file->f_dentry->d_inode)->sk;
> +


Eric

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 3/4] kevent: Network AIO, socket notifications.
  2006-08-03  9:54                                 ` [take3 3/4] kevent: Network AIO, socket notifications Eric Dumazet
@ 2006-08-03 10:13                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03 10:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 03, 2006 at 11:54:26AM +0200, Eric Dumazet (dada1@cosmosbay.com) wrote:
> On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote:
> > Network AIO, socket notifications.
> >
> > This patchset includes socket notifications and network asynchronous IO.
> > Network AIO is based on kevent and works as usual kevent storage on top
> > of inode.

> > +	file = fget_light(k->event.id.raw[0], &fput_needed);
> > +	if (!file)
> > +		return -ENODEV;
> > +
> > +	err = -EINVAL;
> > +	if (!file->f_dentry || !file->f_dentry->d_inode)
> > +		goto err_out_fput;
> 
> How can you be 100% sure this file is actually a socket here ?
> (Another thread could close the fd and this fd can now point to another file)
> 
> You should do
> if (file->f_op != &socket_file_ops)
> 	goto err_out_fput;
> sk = file->private_data;  /* set in sock_map_fd */ 

That will be a struct socket, not a struct sock, but that check is definitely
needed in both the socket and the network aio code.
Thanks Eric.

> Eric

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread
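
Put together, Eric's check plus Evgeniy's correction would look roughly like
the sketch below. This is an illustration, not a patch: socket_file_ops is
static to net/socket.c, so real code would need a helper or an export there,
and the function name is made up.

	/*
	 * Verify that the file behind the fd really is a socket before
	 * trusting private_data: a racing close() can make the fd number
	 * point at an arbitrary other file.  For a socket, private_data
	 * is the struct socket set up by sock_map_fd().
	 */
	static struct sock *kevent_file_to_sock(struct file *file)
	{
		struct socket *sock;

		if (file->f_op != &socket_file_ops)
			return NULL;

		sock = file->private_data;
		return sock->sk;
	}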

* Re: [take3 1/4] kevent: Core files.
  2006-08-03  9:46                           ` [take3 1/4] kevent: Core files Evgeniy Polyakov
  2006-08-03  9:46                             ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
@ 2006-08-03 14:40                             ` Eric Dumazet
  2006-08-03 14:55                               ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Eric Dumazet @ 2006-08-03 14:40 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote:
> Core files.
>
> This patch includes core kevent files:
>  - userspace controlling
>  - kernelspace interfaces
>  - initialization
>  - notification state machines
>

> +static int kevent_user_wait(struct file *file, struct kevent_user *u,
> +		unsigned int min_nr, unsigned int max_nr, unsigned int timeout,
> +		void __user *buf)
> +{
>

> +	mutex_lock(&u->ctl_mutex);
> +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> +		if (copy_to_user(buf + num*sizeof(struct ukevent),
> +					&k->event, sizeof(struct ukevent))) {
> +			cerr = -EINVAL;
> +			break;
> +		}


It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of 
possibly a large amount of data) : A thread can sleep on a page fault and 
other threads cannot make progress.

Eric

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 1/4] kevent: Core files.
  2006-08-03 14:40                             ` [take3 1/4] kevent: Core files Eric Dumazet
@ 2006-08-03 14:55                               ` Evgeniy Polyakov
  2006-08-03 15:11                                 ` Eric Dumazet
  2006-08-03 21:37                                 ` David Miller
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03 14:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet (dada1@cosmosbay.com) wrote:
> > +	mutex_lock(&u->ctl_mutex);
> > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > +		if (copy_to_user(buf + num*sizeof(struct ukevent),
> > +					&k->event, sizeof(struct ukevent))) {
> > +			cerr = -EINVAL;
> > +			break;
> > +		}
> 
> 
> It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of 
> possibly a large amount of data) : A thread can sleep on a page fault and 
> other threads cannot make progress.

I would not call that wrong - the system prevents some threads from removing
kevents that are counted as scheduled for transfer to userspace, i.e. when a
dequeuing thread has been woken up and has seen some events, it is possible
that by the time it dequeues them, part of them will already have been removed
by another thread, so I prevent this.

> Eric

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 1/4] kevent: Core files.
  2006-08-03 14:55                               ` Evgeniy Polyakov
@ 2006-08-03 15:11                                 ` Eric Dumazet
  2006-08-03 15:21                                   ` Evgeniy Polyakov
  2006-08-03 21:37                                 ` David Miller
  1 sibling, 1 reply; 160+ messages in thread
From: Eric Dumazet @ 2006-08-03 15:11 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thursday 03 August 2006 16:55, Evgeniy Polyakov wrote:
> On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet (dada1@cosmosbay.com) 
wrote:
> > > +	mutex_lock(&u->ctl_mutex);
> > > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > > +		if (copy_to_user(buf + num*sizeof(struct ukevent),
> > > +					&k->event, sizeof(struct ukevent))) {
> > > +			cerr = -EINVAL;
> > > +			break;
> > > +		}
> >
> > It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of
> > possibly a large amount of data) : A thread can sleep on a page fault and
> > other threads cannot make progress.
>
> I would not call that wrong - the system prevents some threads from removing
> kevents that are counted as scheduled for transfer to userspace, i.e. when a
> dequeuing thread has been woken up and has seen some events, it is possible
> that by the time it dequeues them, part of them will already have been removed
> by another thread, so I prevent this.

Hum, "wrong" was maybe not the right word... but kqueue_dequeue_ready() uses a
spinlock (ready_lock) to protect the ready_list. One particular struct kevent is
given to one thread, one at a time.

If you look at fs/eventpoll.c, you can see how careful ep_send_events() is to
let multiple threads transfer different items to user memory at the same time.

In a model where several threads are servicing events collected by a single
point (epoll, or kevent), it is important not to block all of the threads
because of a single thread waiting for a swapin (triggered by copy_to_user())

Eric

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 1/4] kevent: Core files.
  2006-08-03 15:11                                 ` Eric Dumazet
@ 2006-08-03 15:21                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03 15:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 03, 2006 at 05:11:58PM +0200, Eric Dumazet (dada1@cosmosbay.com) wrote:
> On Thursday 03 August 2006 16:55, Evgeniy Polyakov wrote:
> > On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet (dada1@cosmosbay.com) 
> wrote:
> > > > +	mutex_lock(&u->ctl_mutex);
> > > > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > > > +		if (copy_to_user(buf + num*sizeof(struct ukevent),
> > > > +					&k->event, sizeof(struct ukevent))) {
> > > > +			cerr = -EINVAL;
> > > > +			break;
> > > > +		}
> > >
> > > It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of
> > > possibly a large amount of data) : A thread can sleep on a page fault and
> > > other threads cannot make progress.
> >
> > I would not call that wrong - the system prevents some threads from removing
> > kevents that are counted as scheduled for transfer to userspace, i.e. when a
> > dequeuing thread has been woken up and has seen some events, it is possible
> > that by the time it dequeues them, part of them will already have been removed
> > by another thread, so I prevent this.
> 
> Hum, "wrong" was maybe not the right word... but kqueue_dequeue_ready() uses a
> spinlock (ready_lock) to protect the ready_list. One particular struct kevent is
> given to one thread, one at a time.

I mean that the wait_event logic will see that the requested number of
events is present, but when it actually starts to get them, it is possible
that there will be no events left at all.

> If you look at fs/eventpoll.c, you can see how careful ep_send_events() is to
> let multiple threads transfer different items to user memory at the same time.

It is done with the same logic under the ep->sem semaphore, which is
held for del and read operations.
Or do you mean to have a rw semaphore instead of a mutex here?

> In a model where several threads are servicing events collected by a single
> point (epoll, or kevent), it is important not to block all of the threads
> because of a single thread waiting for a swapin (triggered by copy_to_user())
 
> Eric

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 2/4] kevent: AIO, aio_sendfile() implementation.
  2006-08-03  9:46                             ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
  2006-08-03  9:46                               ` [take3 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-03 17:04                               ` Badari Pulavarty
  2006-08-03 17:13                                 ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Badari Pulavarty @ 2006-08-03 17:04 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

Evgeniy Polyakov wrote:
> AIO, aio_sendfile() implementation.
>
> This patch includes asynchronous propagation of file's data into VFS
> cache and aio_sendfile() implementation.
> Network aio_sendfile() works lazily - it asynchronously populates pages
> into the VFS cache (which can be used for various tricks with adaptive
> readahead) and then uses usual ->sendfile() callback.
>
> ...
> --- /dev/null
> +++ b/kernel/kevent/kevent_aio.c
> @@ -0,0 +1,584 @@
> +/*
> + * 	kevent_aio.c
> + * 
>   
Since this is *almost* the same as the mpage.c code, I am wondering if it's
possible to make common generic/helper routines in mpage.c and use them here?

Thanks,
Badari


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 2/4] kevent: AIO, aio_sendfile() implementation.
  2006-08-03 17:04                               ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Badari Pulavarty
@ 2006-08-03 17:13                                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-03 17:13 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 03, 2006 at 10:04:36AM -0700, Badari Pulavarty (pbadari@us.ibm.com) wrote:
> Evgeniy Polyakov wrote:
> >AIO, aio_sendfile() implementation.
> >
> >This patch includes asynchronous propagation of file's data into VFS
> >cache and aio_sendfile() implementation.
> >Network aio_sendfile() works lazily - it asynchronously populates pages
> >into the VFS cache (which can be used for various tricks with adaptive
> >readahead) and then uses usual ->sendfile() callback.
> >
> >...
> >--- /dev/null
> >+++ b/kernel/kevent/kevent_aio.c
> >@@ -0,0 +1,584 @@
> >+/*
> >+ * 	kevent_aio.c
> >+ * 
> >  
> Since this is *almost* the same as the mpage.c code, I am wondering if it's
> possible to make common generic/helper routines in mpage.c and use them here?

Yes, as I mentioned in a mail to Christoph, I did it just to keep kevent
as separate as possible (that is why I introduced the ->get_block() based
approach). It can safely be moved into the mpage code and used from a
cleaner callback like ->readpage().
Since it was decided to postpone this AIO code for a while, I'm not
updating it (I just make sure that it compiles with new changes), since
the overall design of the AIO changes (if any) is not 100% completed.

> Thanks,
> Badari

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take3 1/4] kevent: Core files.
  2006-08-03 14:55                               ` Evgeniy Polyakov
  2006-08-03 15:11                                 ` Eric Dumazet
@ 2006-08-03 21:37                                 ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: David Miller @ 2006-08-03 21:37 UTC (permalink / raw)
  To: johnpol; +Cc: dada1, linux-kernel, drepper, netdev, zach.brown

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Thu, 3 Aug 2006 18:55:57 +0400

> I would not call that wrong - the system prevents some threads from removing
> kevents that are counted as scheduled for transfer to userspace, i.e. when a
> dequeuing thread has been woken up and has seen some events, it is possible
> that by the time it dequeues them, part of them will already have been removed
> by another thread, so I prevent this.

Queue is all that matters to be synchronized, so it seems
better to have a mutex on the queue rather than a global
one.  That way, user can only hurt himself.

^ permalink raw reply	[flat|nested] 160+ messages in thread
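
The direction both reviewers point at can be sketched as follows: drain ready
events into a small on-stack batch under the queue's own lock (which
kqueue_dequeue_ready() already takes internally), then copy to userspace with
no lock held. Names mirror the patch, but this is an illustration of the idea
only, not the eventual kevent code; a real version would also have to decide
what to do with events already dequeued when copy_to_user() fails.

	static int kevent_copy_ready(struct kevent_user *u,
				     struct ukevent __user *buf, unsigned int max_nr)
	{
		struct ukevent batch[16];
		unsigned int num = 0, n;
		struct kevent *k;

		while (num < max_nr) {
			for (n = 0; n < ARRAY_SIZE(batch) && num + n < max_nr; ++n) {
				k = kqueue_dequeue_ready(u); /* takes ready_lock */
				if (!k)
					break;
				batch[n] = k->event;
			}
			if (!n)
				break;
			/* no mutex held here: a page fault blocks only this thread */
			if (copy_to_user(buf + num, batch, n * sizeof(struct ukevent)))
				return -EFAULT;
			num += n;
			if (n < ARRAY_SIZE(batch))
				break;	/* ready list drained */
		}
		return num;
	}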

* [take4 0/4] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (4 preceding siblings ...)
  2006-08-03  9:45                         ` [take3 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-05 13:02                         ` Evgeniy Polyakov
  2006-08-05 13:02                           ` [take4 1/4] kevent: Core files Evgeniy Polyakov
  2006-08-09  8:02                         ` [take6 0/3] kevent: Generic event handling mechanism Evgeniy Polyakov
                                           ` (4 subsequent siblings)
  10 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-05 13:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Generic event handling mechanism.

I send this patchset for comments and review. It still contains the AIO and
aio_sendfile() implementation on top of the get_block() abstraction, which it
was decided to postpone for a while (it is simpler right now to generate the
patchset as a whole; when kevent is ready for merge, I will generate the
patchset without the AIO stuff).
It does not contain the mapped buffer implementation, since its design is not
100% completed; I will present that implementation in the third patchset.

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * silenced lockdep warnings - all storage locks are initialized in the same function, so lockdep
	had to be taught to differentiate between the various cases
 * remove a kevent from its storage if it is marked as broken after the callback
 * fixed a typo in the mmaped buffer implementation which would result in a wrong index calculation

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use an array of callbacks for each type instead of initializing callbacks in each kevent
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added an optimisation aimed at the case when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removal, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added a file descriptor check that returns an error if the provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 160+ messages in thread
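
For completeness, a hypothetical userspace wrapper matching the
kevent_get_events() signature listed above. The syscall number and the
struct ukevent layout must both come from the patched kernel headers (the
structure can stay opaque here, since it is only passed through).

	/* build with: cc -D__NR_kevent_get_events=<nr from patched headers> ... */
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_kevent_get_events
	#error "define __NR_kevent_get_events to match the patched kernel"
	#endif

	struct ukevent;	/* layout comes from the patched <linux/kevent.h> */

	static long kevent_get_events(int fd, unsigned int min_nr,
				      unsigned int max_nr, unsigned int timeout,
				      struct ukevent *buf, unsigned flags)
	{
		return syscall(__NR_kevent_get_events, fd, min_nr, max_nr,
			       timeout, buf, flags);
	}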

* [take4 2/4] kevent: AIO, aio_sendfile() implementation.
  2006-08-05 13:02                           ` [take4 1/4] kevent: Core files Evgeniy Polyakov
@ 2006-08-05 13:02                             ` Evgeniy Polyakov
  2006-08-05 13:02                               ` [take4 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
  2006-08-05 17:57                             ` [take4 1/4] kevent: Core files Greg KH
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-05 13:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


AIO, aio_sendfile() implementation.

This patch includes asynchronous propagation of file's data into VFS
cache and aio_sendfile() implementation.
Network aio_sendfile() works lazily - it asynchronously populates pages
into the VFS cache (which can be used for various tricks with adaptive
readahead) and then uses usual ->sendfile() callback.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/fs/bio.c b/fs/bio.c
index 6a0b9ad..a3ee530 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -119,7 +119,7 @@ void bio_free(struct bio *bio, struct bi
 /*
  * default destructor for a bio allocated with bio_alloc_bioset()
  */
-static void bio_fs_destructor(struct bio *bio)
+void bio_fs_destructor(struct bio *bio)
 {
 	bio_free(bio, fs_bio_set);
 }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fb4d322..9316551 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -685,6 +685,7 @@ ext2_writepages(struct address_space *ma
 }
 
 const struct address_space_operations ext2_aops = {
+	.get_block		= ext2_get_block,
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c5ee9f0..d9210d4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1699,6 +1699,7 @@ static int ext3_journalled_set_page_dirt
 }
 
 static const struct address_space_operations ext3_ordered_aops = {
+	.get_block	= ext3_get_block,
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
diff --git a/fs/file_table.c b/fs/file_table.c
index 0131ba0..7f5b35f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -112,6 +112,10 @@ struct file *get_empty_filp(void)
 	if (security_file_alloc(f))
 		goto fail_sec;
 
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_init(f, &f->st);
+	kevent_poll_reinit(f);
+#endif
 	tsk = current;
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_set(&f->f_count, 1);
@@ -159,6 +163,9 @@ void fastcall __fput(struct file *file)
 	might_sleep();
 
 	fsnotify_close(file);
+#ifdef CONFIG_KEVENT_POLL
+	kevent_storage_fini(&file->st);
+#endif
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index 0bf9f04..fdbd0ba 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@ #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -165,12 +166,18 @@ #endif
 		}
 		memset(&inode->u, 0, sizeof(inode->u));
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_INODE || defined CONFIG_KEVENT_SOCKET
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 12dfdcf..f8dca72 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3001,6 +3001,7 @@ int reiserfs_setattr(struct dentry *dent
 }
 
 const struct address_space_operations reiserfs_address_space_operations = {
+	.get_block = reiserfs_get_block,
 	.writepage = reiserfs_writepage,
 	.readpage = reiserfs_readpage,
 	.readpages = reiserfs_readpages,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2561020..65eb438 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -240,6 +240,9 @@ #include <linux/mutex.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_KEVENT
+#include <linux/kevent_storage.h>
+#endif
 
 struct hd_geometry;
 struct iovec;
@@ -352,6 +355,8 @@ struct address_space;
 struct writeback_control;
 
 struct address_space_operations {
+	int  (*get_block)(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create);
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
 	void (*sync_page)(struct page *);
@@ -546,6 +551,10 @@ #ifdef CONFIG_INOTIFY
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#ifdef CONFIG_KEVENT_INODE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -698,6 +707,9 @@ #ifdef CONFIG_EPOLL
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index cc5dec7..0acc8db 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@ #ifdef __KERNEL__
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/audit.h>
 
 /*
@@ -79,6 +80,7 @@ static inline void fsnotify_nameremove(s
 		isdir = IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
 	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	kevent_inode_notify_parent(dentry, KEVENT_INODE_REMOVE);
 }
 
 /*
@@ -88,6 +90,7 @@ static inline void fsnotify_inoderemove(
 {
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
+	kevent_inode_remove(inode);
 }
 
 /*
@@ -96,6 +99,7 @@ static inline void fsnotify_inoderemove(
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
@@ -107,6 +111,7 @@ static inline void fsnotify_create(struc
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
+	kevent_inode_notify(inode, KEVENT_INODE_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
 				  dentry->d_name.name, dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
diff --git a/kernel/kevent/kevent_aio.c b/kernel/kevent/kevent_aio.c
new file mode 100644
index 0000000..332629b
--- /dev/null
+++ b/kernel/kevent/kevent_aio.c
@@ -0,0 +1,584 @@
+/*
+ * 	kevent_aio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+
+#define KEVENT_AIO_DEBUG
+
+#ifdef KEVENT_AIO_DEBUG
+#define dprintk(f, a...) printk(f, ##a)
+#else
+#define dprintk(f, a...) do {} while (0)
+#endif
+
+struct kevent_aio_private
+{
+	int			pg_num;
+	size_t			size;
+	loff_t			offset;
+	loff_t			processed;
+	atomic_t		bio_page_num;
+	struct completion	bio_complete;
+	struct file		*file, *sock;
+	struct work_struct	work;
+};
+
+static int kevent_aio_dequeue(struct kevent *k);
+static int kevent_aio_enqueue(struct kevent *k);
+static int kevent_aio_callback(struct kevent *k);
+
+extern void bio_fs_destructor(struct bio *bio);
+
+static void kevent_aio_bio_destructor(struct bio *bio)
+{
+	struct kevent *k = bio->bi_private;
+	struct kevent_aio_private *priv = k->priv;
+
+	dprintk("%s: bio=%p, num=%u, k=%p, inode=%p.\n", __func__, bio, bio->bi_vcnt, k, k->st->origin);
+	schedule_work(&priv->work);
+	bio_fs_destructor(bio);
+}
+
+static void kevent_aio_bio_put(struct kevent *k)
+{
+	struct kevent_aio_private *priv = k->priv;
+	
+	if (atomic_dec_and_test(&priv->bio_page_num))
+		complete(&priv->bio_complete);
+}
+
+static int kevent_mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct kevent *k = bio->bi_private;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_page(page);
+		kevent_aio_bio_put(k);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+static inline struct bio *kevent_mpage_bio_submit(int rw, struct bio *bio)
+{
+	if (bio) {
+		bio->bi_end_io = kevent_mpage_end_io_read;
+		dprintk("%s: bio=%p, num=%u.\n", __func__, bio, bio->bi_vcnt);
+		submit_bio(READ, bio);
+	}
+	return NULL;
+}
+
+static struct bio *kevent_mpage_readpage(struct kevent *k, struct bio *bio,
+		struct page *page, unsigned nr_pages, get_block_t get_block, 
+		loff_t *offset, sector_t *last_block_in_bio)
+{
+	struct inode *inode = k->st->origin;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	struct block_device *bdev = NULL;
+	unsigned first_hole = blocks_per_page;
+	unsigned page_block;
+	sector_t blocks[MAX_BUF_PER_PAGE];
+	struct buffer_head bh;
+	int fully_mapped = 1, length;
+
+	block_in_file = (*offset + blocksize - 1) >> blkbits;
+	last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
+
+	bh.b_page = page;
+	for (page_block = 0; page_block < blocks_per_page; page_block++, block_in_file++) {
+		bh.b_state = 0;
+		if (block_in_file < last_block) {
+			if (get_block(inode, block_in_file, &bh, 0))
+				goto confused;
+		}
+
+		if (!buffer_mapped(&bh)) {
+			fully_mapped = 0;
+			if (first_hole == blocks_per_page)
+				first_hole = page_block;
+			continue;
+		}
+
+		/* some filesystems will copy data into the page during
+		 * the get_block call, in which case we don't want to
+		 * read it again.  map_buffer_to_page copies the data
+		 * we just collected from get_block into the page's buffers
+		 * so readpage doesn't have to repeat the get_block call
+		 */
+		if (buffer_uptodate(&bh)) {
+			BUG();
+			//map_buffer_to_page(page, &bh, page_block);
+			goto confused;
+		}
+	
+		if (first_hole != blocks_per_page)
+			goto confused;		/* hole -> non-hole */
+
+		/* Contiguous blocks? */
+		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
+			goto confused;
+		blocks[page_block] = bh.b_blocknr;
+		bdev = bh.b_bdev;
+	}
+
+	if (!bdev)
+		goto confused;
+
+	if (first_hole != blocks_per_page) {
+		char *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + (first_hole << blkbits), 0,
+				PAGE_CACHE_SIZE - (first_hole << blkbits));
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (first_hole == 0) {
+			SetPageUptodate(page);
+			goto out;
+		}
+	} else if (fully_mapped) {
+		SetPageMappedToDisk(page);
+	}
+	
+	/*
+	 * This page will go to BIO.  Do we need to send this BIO off first?
+	 */
+	if (bio && (*last_block_in_bio != blocks[0] - 1))
+		bio = kevent_mpage_bio_submit(READ, bio);
+
+alloc_new:
+	if (bio == NULL) {
+		nr_pages = min_t(unsigned, nr_pages, bio_get_nr_vecs(bdev));
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		if (bio == NULL)
+			goto confused;
+
+		bio->bi_destructor = kevent_aio_bio_destructor;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = blocks[0] << (blkbits - 9);
+		bio->bi_private = k;
+	}
+
+	length = first_hole << blkbits;
+	if (bio_add_page(bio, page, length, 0) < length) {
+		bio = kevent_mpage_bio_submit(READ, bio);
+		dprintk("%s: Failed to add a page: nr_pages=%d, length=%d, page=%p.\n", 
+				__func__, nr_pages, length, page);
+		goto alloc_new;
+	}
+	
+	dprintk("%s: bio=%p, b=%d, m=%d, u=%d, nr_pages=%d, offset=%Lu, "
+			"size=%Lu. page_block=%u, page=%p.\n", 
+			__func__, bio, buffer_boundary(&bh), buffer_mapped(&bh), 
+			buffer_uptodate(&bh), nr_pages, *offset, i_size_read(inode), 
+			page_block, page);
+	
+	*offset = *offset + length;
+
+	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
+		bio = kevent_mpage_bio_submit(READ, bio);
+	else
+		*last_block_in_bio = blocks[blocks_per_page - 1];
+
+out:
+	return bio;
+
+confused:
+	dprintk("%s: confused. bio=%p, nr_pages=%d.\n", __func__, bio, nr_pages);
+	if (bio)
+		bio = kevent_mpage_bio_submit(READ, bio);
+	kevent_aio_bio_put(k);
+	SetPageUptodate(page);
+
+	if (nr_pages == 1) {
+		struct kevent_aio_private *priv = k->priv;
+
+		wait_for_completion(&priv->bio_complete);
+		kevent_storage_ready(k->st, NULL, KEVENT_AIO_BIO);
+		init_completion(&priv->bio_complete);
+		complete(&priv->bio_complete);
+	}
+	goto out;
+}
+
+static int kevent_aio_alloc_cached_page(struct kevent *k, struct page **cached_page)
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct address_space *mapping = priv->file->f_mapping;
+	struct page *page;
+	int err = 0;
+	pgoff_t index = priv->offset >> PAGE_CACHE_SHIFT;
+
+	page = page_cache_alloc_cold(mapping);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	if (err) {
+		if (err == -EEXIST)
+			err = 0;
+		page_cache_release(page);
+		goto out;
+	}
+
+	dprintk("%s: page=%p, offset=%Lu, processed=%Lu, index=%lu, size=%zu.\n",
+			__func__, page, priv->offset, priv->processed, index, priv->size);
+
+	*cached_page = page;
+
+out:
+	return err;
+}
+
+static int kevent_mpage_readpages(struct kevent *k, int first,
+		int (* get_block)(struct inode *inode, sector_t iblock,	
+			struct buffer_head *bh_result, int create))
+{
+	struct bio *bio = NULL;
+	struct kevent_aio_private *priv = k->priv;
+	sector_t last_block_in_bio = 0;
+	int i, err = 0;
+
+	atomic_set(&priv->bio_page_num, priv->pg_num);
+
+	for (i=first; i<priv->pg_num; ++i) {
+		struct page *page = NULL;
+		
+		err = kevent_aio_alloc_cached_page(k, &page);
+		if (err)
+			break;
+
+		/*
+		 * If there is no error and page is NULL, it means
+		 * that someone else added the page to the VFS cache.
+		 * We do not process such a page, since whoever added
+		 * it must read its data from disk.
+		 */
+		if (!page)
+			continue;
+
+		bio = kevent_mpage_readpage(k, bio, page, priv->pg_num - i, 
+				get_block, &priv->offset, &last_block_in_bio);
+	}
+
+	if (bio)
+		bio = kevent_mpage_bio_submit(READ, bio);
+
+	return err;
+}
+
+static ssize_t kevent_aio_vfs_read_actor(struct kevent *k, struct page *kpage, size_t len)
+{
+	struct kevent_aio_private *priv = k->priv;
+	ssize_t ret;
+	
+	ret = priv->sock->f_op->sendpage(priv->sock, kpage, 0, len, &priv->sock->f_pos, 1);
+
+	dprintk("%s: k=%p, page=%p, len=%zu, ret=%zd.\n", 
+			__func__, k, kpage, len, ret);
+
+	return ret;
+}
+
+static int kevent_aio_vfs_read(struct kevent *k, 
+		ssize_t (*actor)(struct kevent *, struct page *, size_t))
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct address_space *mapping;
+	size_t isize;
+	ssize_t actor_size;
+	int i;
+
+	mapping = priv->file->f_mapping;
+	isize = i_size_read(priv->file->f_dentry->d_inode);
+	
+	dprintk("%s: start: size_left=%zd, offset=%Lu, processed=%Lu, isize=%zu, pg_num=%d.\n", 
+			__func__, priv->size, priv->offset, priv->processed, isize, priv->pg_num);
+
+	for (i=0; i<priv->pg_num && priv->size; ++i) {
+		struct page *page;
+		size_t nr = PAGE_CACHE_SIZE;
+
+		cond_resched();
+		page = find_get_page(mapping, priv->processed >> PAGE_CACHE_SHIFT);
+		if (unlikely(page == NULL))
+			break;
+		if (!PageUptodate(page)) {
+			dprintk("%s: %2d: page=%p, processed=%Lu, size=%zu not uptodate.\n", 
+					__func__, i, page, priv->processed, priv->size);
+			page_cache_release(page);
+			break;
+		}
+
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		mark_page_accessed(page);
+
+		if (nr + priv->processed > isize)
+			nr = isize - priv->processed;
+		if (nr > priv->size)
+			nr = priv->size;
+
+		actor_size = actor(k, page, nr);
+		if (actor_size < 0) {
+			page_cache_release(page);
+			break;
+		}
+
+		page_cache_release(page);
+
+		priv->processed += actor_size;
+		priv->size -= actor_size;
+	}
+
+	if (!priv->size)
+		i = priv->pg_num;
+
+	if (i != priv->pg_num)
+		priv->offset = priv->processed;
+
+	dprintk("%s: end: next=%d, num=%d, left=%zu, offset=%Lu, procesed=%Lu, ret=%d.\n", 
+			__func__, i, priv->pg_num, 
+			priv->size, priv->offset, priv->processed, i);
+
+	return i;
+}
+
+static int kevent_aio_callback(struct kevent *k)
+{
+	return 1;
+}
+
+static void kevent_aio_work(void *data)
+{
+	struct kevent *k = data;
+	struct kevent_aio_private *priv = k->priv;
+	struct inode *inode = k->st->origin;
+	struct address_space *mapping = priv->file->f_mapping;
+	int err, ready = 0, num;
+
+	dprintk("%s: k=%p, priv=%p, inode=%p.\n", __func__, k, priv, inode);
+
+	init_completion(&priv->bio_complete);
+	
+	num = ready = kevent_aio_vfs_read(k, &kevent_aio_vfs_read_actor);
+	if (ready > 0 && ready != priv->pg_num)
+		ready = 0;
+
+	dprintk("%s: k=%p, ready=%d, size=%zd.\n", __func__, k, ready, priv->size);
+
+	if (!ready) {
+		err = kevent_mpage_readpages(k, num, mapping->a_ops->get_block);
+		if (err) {
+			dprintk("%s: kevent_mpage_readpages failed: err=%d, k=%p, size=%zd.\n",
+					__func__, err, k, priv->size);
+			kevent_break(k);
+			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+		}
+	} else {
+		dprintk("%s: next k=%p, size=%zd.\n", __func__, k, priv->size);
+
+		if (priv->size)
+			schedule_work(&priv->work);
+		else {
+			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+		}
+
+		complete(&priv->bio_complete);
+	}
+}
+
+static int kevent_aio_enqueue(struct kevent *k)
+{
+	int err;
+	struct file *file, *sock;
+	struct inode *inode;
+	struct kevent_aio_private *priv;
+	struct address_space *mapping;
+	int fd = k->event.id.raw[0];
+	int num = k->event.id.raw[1];
+	int s = k->event.ret_data[0];
+	size_t size;
+
+	err = -ENODEV;
+	file = fget(fd);
+	if (!file)
+		goto err_out_exit;
+	
+	sock = fget(s);
+	if (!sock)
+		goto err_out_fput_file;
+	
+	mapping = file->f_mapping;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode || !mapping->a_ops->get_block)
+		goto err_out_fput;
+	if (!sock->f_dentry || !sock->f_dentry->d_inode)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	size = i_size_read(inode);
+	
+	num = (size > num << PAGE_SHIFT) ? num : (size >> PAGE_SHIFT);
+
+	err = -ENOMEM;
+	priv = kzalloc(sizeof(struct kevent_aio_private), GFP_KERNEL);
+	if (!priv)
+		goto err_out_iput;
+
+	priv->pg_num = num;
+	priv->size = size;
+	priv->offset = 0;
+	priv->file = file;
+	priv->sock = sock;
+	INIT_WORK(&priv->work, kevent_aio_work, k);
+	k->priv = priv;
+
+	dprintk("%s: read: k=%p, priv=%p, inode=%p, num=%u, size=%zu, off=%Lu.\n", 
+			__func__, k, priv, inode, priv->pg_num, priv->size, priv->offset);
+	
+	init_completion(&priv->bio_complete);
+	kevent_storage_enqueue(&inode->st, k);
+	schedule_work(&priv->work);
+	
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(sock);
+err_out_fput_file:
+	fput(file);
+err_out_exit:
+
+	return err;
+}
+
+static int kevent_aio_dequeue(struct kevent *k)
+{
+	struct kevent_aio_private *priv = k->priv;
+	struct inode *inode = k->st->origin;
+	struct file *file = priv->file;
+	struct file *sock = priv->sock;
+
+	kevent_storage_dequeue(k->st, k);
+	flush_scheduled_work();
+	wait_for_completion(&priv->bio_complete);
+
+	kfree(k->priv);
+	k->priv = NULL;
+	iput(inode);
+	fput(file);
+	fput(sock);
+
+	return 0;
+}
+
+asmlinkage long sys_aio_sendfile(int ctl_fd, int fd, int s, 
+		size_t size, unsigned flags)
+{
+	struct ukevent ukread, uksend;
+	struct kevent_user *u;
+	struct file *file;
+	int err;
+	int num = (flags & 7) ? (flags & 7) : 8;
+
+	memset(&ukread, 0, sizeof(struct ukevent));
+	memset(&uksend, 0, sizeof(struct ukevent));
+
+	ukread.type = KEVENT_AIO;
+	ukread.event = KEVENT_AIO_BIO;
+
+	ukread.id.raw[0] = fd;
+	ukread.id.raw[1] = num;
+	ukread.ret_data[0] = s;
+
+	dprintk("%s: fd=%d, s=%d, num=%d.\n", __func__, fd, s, num);
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	err = kevent_user_add_ukevent(&ukread, u);
+	if (err < 0)
+		goto err_out_fput;
+
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int __init kevent_init_aio(void)
+{
+	struct kevent_callbacks *ac = &kevent_registered_callbacks[KEVENT_AIO];
+
+	ac->enqueue = &kevent_aio_enqueue;
+	ac->dequeue = &kevent_aio_dequeue;
+	ac->callback = &kevent_aio_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_aio);
diff --git a/kernel/kevent/kevent_inode.c b/kernel/kevent/kevent_inode.c
new file mode 100644
index 0000000..00e59f3
--- /dev/null
+++ b/kernel/kevent/kevent_inode.c
@@ -0,0 +1,114 @@
+/*
+ * 	kevent_inode.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/fs.h>
+
+static int kevent_inode_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+	
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	fput(file);
+	return 0;
+
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_inode_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+static int kevent_inode_callback(struct kevent *k)
+{
+	return 1;
+}
+
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+	struct dentry *parent;
+	struct inode *inode;
+	
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+
+	dget(parent);
+	spin_unlock(&dentry->d_lock);
+	kevent_inode_notify(inode, KEVENT_INODE_REMOVE);
+	dput(parent);
+}
+	
+void kevent_inode_remove(struct inode *inode)
+{
+	kevent_storage_fini(&inode->st);
+}
+	
+void kevent_inode_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_inode(void)
+{
+	struct kevent_callbacks *ic = &kevent_registered_callbacks[KEVENT_INODE];
+
+	ic->enqueue = &kevent_inode_enqueue;
+	ic->dequeue = &kevent_inode_dequeue;
+	ic->callback = &kevent_inode_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_inode);


^ permalink raw reply related	[flat|nested] 160+ messages in thread
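
Since the control flow of kevent_aio_work() above takes a few reads to
untangle, here is a condensed restatement (not actual kernel code, and the
ready-count bookkeeping is elided): each pass first streams every page that
is already uptodate in the cache to the socket, then queues bio reads for the
rest; bio completion re-arms the work item until the request is drained.

	static void kevent_aio_pass(struct kevent *k)
	{
		struct kevent_aio_private *priv = k->priv;
		int sent;

		/* phase 1: ->sendpage() each cached, uptodate page in the window */
		sent = kevent_aio_vfs_read(k, &kevent_aio_vfs_read_actor);

		if (sent != priv->pg_num) {
			/* phase 2: populate missing pages asynchronously; the bio
			 * destructor reschedules this work when the reads finish */
			kevent_mpage_readpages(k, sent,
					priv->file->f_mapping->a_ops->get_block);
		} else if (priv->size) {
			schedule_work(&priv->work);	/* window done, file not yet */
		} else {
			kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
		}
	}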

* [take4 3/4] kevent: Network AIO, socket notifications.
  2006-08-05 13:02                             ` [take4 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
@ 2006-08-05 13:02                               ` Evgeniy Polyakov
  2006-08-05 13:02                                 ` [take4 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-05 13:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Network AIO, socket notifications.

This patchset includes socket notifications and network asynchronous IO.
Network AIO is based on kevent and works as a usual kevent storage on top
of an inode.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 5755d57..9300678 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC		31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index b467026..fc2b49d 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC             31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4307e76..9267873 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1283,6 +1283,8 @@ extern struct sk_buff *skb_recv_datagram
 					 int noblock, int *err);
 extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
 				     struct poll_table_struct *wait);
+extern int	       skb_copy_datagram(const struct sk_buff *from, 
+					 int offset, void *dst, int size);
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
 					       int offset, struct iovec *to,
 					       int size);
diff --git a/include/net/sock.h b/include/net/sock.h
index 324b3ea..c43a153 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@ #include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -391,6 +392,8 @@ enum sock_flags {
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_ASYNC,
+	SOCK_ASYNC_INUSE,
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -450,6 +453,21 @@ static inline int sk_stream_memory_free(
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +495,7 @@ static inline void sk_add_backlog(struct
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -548,6 +567,12 @@ struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk, 
 						struct sk_buff *skb);
+	
+	int			(*async_recv) (struct sock *sk, 
+						void *dst, size_t size);
+	int			(*async_send) (struct sock *sk, 
+						struct page **pages, unsigned int poffset, 
+						size_t size);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
@@ -679,21 +704,6 @@ static inline struct kiocb *siocb_to_kio
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0720bdd..5a1899b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -364,6 +364,8 @@ extern int			compat_tcp_setsockopt(struc
 					int level, int optname,
 					char __user *optval, int optlen);
 extern void			tcp_set_keepalive(struct sock *sk, int val);
+extern int			tcp_async_recv(struct sock *sk, void *dst, size_t size);
+extern int			tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t size);
 extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg,
 					    size_t len, int nonblock, 
@@ -857,6 +859,7 @@ static inline int tcp_prequeue(struct so
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
new file mode 100644
index 0000000..98c357f
--- /dev/null
+++ b/kernel/kevent/kevent_naio.c
@@ -0,0 +1,244 @@
+/*
+ * 	kevent_naio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+static int kevent_naio_enqueue(struct kevent *k);
+static int kevent_naio_dequeue(struct kevent *k);
+static int kevent_naio_callback(struct kevent *k);
+
+static int kevent_naio_setup_aio(int ctl_fd, int s, void __user *buf, 
+		size_t size, u32 event)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int err;
+	struct ukevent uk;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	memset(&uk, 0, sizeof(struct ukevent));
+	uk.type = KEVENT_NAIO;
+	uk.ptr = buf;
+	uk.req_flags = KEVENT_REQ_ONESHOT;
+	uk.event = event;
+	uk.id.raw[0] = s;
+	uk.id.raw[1] = size;
+
+	err = kevent_user_add_ukevent(&uk, u);
+
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_RECV);
+}
+
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_SEND);
+}
+
+static int kevent_naio_enqueue(struct kevent *k)
+{
+	int err, i;
+	struct page **page;
+	void *addr;
+	unsigned int size = k->event.id.raw[1];
+	int num;
+	struct file *file;
+	struct sock *sk = NULL;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+	if (file->f_op != &socket_file_ops)
+		goto err_out_fput;
+
+	sk = SOCKET_I(file->f_dentry->d_inode)->sk;
+
+	err = -ESOCKTNOSUPPORT;
+	if (!sk || !sk->sk_prot->async_recv || !sk->sk_prot->async_send || 
+		!sock_flag(sk, SOCK_ASYNC))
+		goto err_out_fput;
+	
+	addr = k->event.ptr;
+	/* Number of pages spanned by [addr, addr + size). */
+	num = (offset_in_page(addr) + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	err = -ENOMEM;
+	page = kmalloc(sizeof(struct page *) * num, GFP_KERNEL);
+	if (!page)
+		goto err_out_fput;
+
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, (unsigned long)addr, 
+			num, 1, 0, page, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err <= 0)
+		goto err_out_free;
+	num = err;
+
+	k->event.ret_data[0] = num;
+	k->event.ret_data[1] = offset_in_page(k->event.ptr);
+	k->priv = page;
+
+	sk->sk_allocation = GFP_ATOMIC;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	err = kevent_socket_enqueue(k);
+	spin_unlock_bh(&sk->sk_lock.slock);
+	if (err)
+		goto err_out_put_pages;
+
+	fput(file);
+
+	return err;
+
+err_out_put_pages:
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+err_out_free:
+	kfree(page);
+err_out_fput:
+	fput(file);
+
+	return err;
+}
+
+static int kevent_naio_dequeue(struct kevent *k)
+{
+	int err, i, num;
+	struct page **page = k->priv;
+
+	num = k->event.ret_data[0];
+
+	err = kevent_socket_dequeue(k);
+
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+
+	kfree(k->priv);
+	k->priv = NULL;
+
+	return err;
+}
+
+static int kevent_naio_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	unsigned int size = k->event.id.raw[1];
+	unsigned int off = k->event.ret_data[1];
+	struct page **pages = k->priv, *page;
+	int ready = 0, num = off/PAGE_SIZE, err = 0, send = 0;
+	void *ptr, *optr;
+	unsigned int len;
+
+	if (!sock_flag(sk, SOCK_ASYNC))
+		return -1;
+
+	if (k->event.event & KEVENT_SOCKET_SEND)
+		send = 1;
+	else if (!(k->event.event & KEVENT_SOCKET_RECV))
+		return -EINVAL;
+
+	/*
+	 * sk_prot->async_*() can return either number of bytes processed,
+	 * or negative error value, or zero if socket is closed.
+	 */
+
+	if (!send) {
+		page = pages[num];
+
+		optr = ptr = kmap_atomic(page, KM_IRQ0);
+		if (!ptr)
+			return -ENOMEM;
+
+		ptr += off % PAGE_SIZE;
+		len = min_t(unsigned int, PAGE_SIZE - (ptr - optr), size);
+
+		err = sk->sk_prot->async_recv(sk, ptr, len);
+
+		kunmap_atomic(optr, KM_IRQ0);
+	} else {
+		len = size;
+		err = sk->sk_prot->async_send(sk, pages, off, size);
+	}
+
+	if (err > 0) {
+		num++;
+		size -= err;
+		off += err;
+	}
+
+	k->event.ret_data[1] = off;
+	k->event.id.raw[1] = size;
+
+	if (err == 0 || (err < 0 && err != -EAGAIN))
+		ready = -1;
+
+	if (!size)
+		ready = 1;
+#if 0
+	printk("%s: sk=%p, k=%p, size=%4u, off=%4u, err=%3d, ready=%1d.\n",
+			__func__, sk, k, size, off, err, ready);
+#endif
+
+	return ready;
+}
+
+static int __init kevent_init_naio(void)
+{
+	struct kevent_callbacks *nc = &kevent_registered_callbacks[KEVENT_NAIO];
+
+	nc->enqueue = &kevent_naio_enqueue;
+	nc->dequeue = &kevent_naio_dequeue;
+	nc->callback = &kevent_naio_callback;
+	return 0;
+}
+late_initcall(kevent_init_naio);
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..59f50ba
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,149 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	int rmem;
+	
+	if (k->event.event & KEVENT_SOCKET_RECV) {
+		int ret = 0;
+		
+		if ((rmem = atomic_read(&sk->sk_rmem_alloc)) > 0 || 
+				!skb_queue_empty(&sk->sk_receive_queue))
+			ret = 1;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			ret = 1;
+		if (ret)
+			return ret;
+	}
+	if ((k->event.event & KEVENT_SOCKET_ACCEPT) && 
+		(!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) || 
+		 	reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue))) {
+		k->event.ret_data[1] = reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct file *file;
+	struct inode *inode;
+	int err;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_dentry || !file->f_dentry->d_inode)
+		goto err_out_fput;
+	
+	if (file->f_op != &socket_file_ops)
+		goto err_out_fput;
+
+	inode = igrab(file->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	err = k->callbacks.callback(k);
+	if (err)
+		goto err_out_dequeue;
+
+	fput(file);
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket && !test_and_set_bit(SOCK_ASYNC_INUSE, &sk->sk_flags)) {
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+		sock_reset_flag(sk, SOCK_ASYNC_INUSE);
+	}
+}
+
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks *sc = &kevent_registered_callbacks[KEVENT_SOCKET];
+
+	sc->enqueue = &kevent_socket_enqueue;
+	sc->dequeue = &kevent_socket_dequeue;
+	sc->callback = &kevent_socket_callback;
+	return 0;
+}
+late_initcall(kevent_init_socket);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index aecddcc..493245b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -236,6 +236,61 @@ void skb_kill_datagram(struct sock *sk, 
 EXPORT_SYMBOL(skb_kill_datagram);
 
 /**
+ *	skb_copy_datagram - Copy a datagram.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: pointer to copy to
+ *	@len: amount of data to copy from the buffer
+ */
+int skb_copy_datagram(const struct sk_buff *skb, int offset,
+			    void *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset, o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy(to, p, copy);
+				kunmap(page);
+			}
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+	return -EFAULT;
+}
+
+/**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from
@@ -530,6 +584,7 @@ unsigned int datagram_poll(struct file *
 
 EXPORT_SYMBOL(datagram_poll);
 EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram);
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 EXPORT_SYMBOL(skb_free_datagram);
 EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/sock.c b/net/core/sock.c
index 51fcfbc..138ce90 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -617,6 +617,16 @@ #endif
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+#ifdef CONFIG_KEVENT_SOCKET
+		case SO_ASYNC_SOCK:
+			spin_lock_bh(&sk->sk_lock.slock);
+			if (valbool)
+				sock_set_flag(sk, SOCK_ASYNC);
+			else
+				sock_reset_flag(sk, SOCK_ASYNC);
+			spin_unlock_bh(&sk->sk_lock.slock);
+			break;
+#endif
 
 		case SO_PASSSEC:
 			if (valbool)
@@ -1406,6 +1416,7 @@ static void sock_def_wakeup(struct sock 
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1415,6 +1426,7 @@ static void sock_def_error_report(struct
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1424,6 +1436,7 @@ static void sock_def_readable(struct soc
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1443,6 +1456,7 @@ static void sock_def_write_space(struct 
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1493,6 +1507,8 @@ #endif
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1559,8 +1575,10 @@ void fastcall release_sock(struct sock *
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f6a2d92..e878a41 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -206,6 +206,7 @@
  *					lingertime == 0 (RFC 793 ABORT Call)
  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
  *					csum_and_copy_from_user() if possible.
+ *	Evgeniy Polyakov	:	Network asynchronous IO.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -1085,6 +1086,275 @@ int tcp_read_sock(struct sock *sk, read_
 }
 
 /*
+ * Must be called with locked sock.
+ */
+int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+	int err = -EAGAIN;
+	ssize_t copied;
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, 1);
+	size_goal = tp->xmit_size_goal;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || sock_flag(sk, SOCK_DONE) ||
+			(sk->sk_state == TCP_CLOSE) || (atomic_read(&sk->sk_refcnt) == 1))
+		goto do_error;
+
+	while (len > 0) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		int size = min_t(size_t, len, PAGE_SIZE - offset);
+
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_pskb(sk, 0, 0,
+						   sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, tp, skb);
+			copy = size_goal;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (!sk_stream_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+		
+		if (can_coalesce) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk->sk_forward_alloc -= copy;
+		skb->ip_summed = CHECKSUM_HW;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(len -= copy))
+			goto out;
+
+		if (skb->len < mss_now)
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == sk->sk_send_head)
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		if (copied)
+			tcp_push(sk, tp, 0, mss_now, TCP_NAGLE_PUSH);
+
+		err = -EAGAIN;
+		goto do_error;
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, 0, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, 0, err);
+}
+
+/*
+ * Must be called with locked sock.
+ */
+int tcp_async_recv(struct sock *sk, void *dst, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	int copied_early = 0;
+
+	TCP_CHECK_TIMER(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	seq = &tp->copied_seq;
+
+	target = sock_rcvlowat(sk, 0, len);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		do {
+			if (!skb)
+				break;
+
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+				printk(KERN_INFO "async_recv bug: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+				break;
+			}
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (skb->h.th->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (skb->h.th->fin)
+				goto found_fin_ok;
+			skb = skb->next;
+		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+		if (copied)
+			break;
+
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+
+		if (sk->sk_state == TCP_CLOSE) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+				break;
+			}
+			break;
+		}
+
+		copied = -EAGAIN;
+		break;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+		err = skb_copy_datagram(skb, offset, dst, used);
+		if (err) {
+			/* Exception. Bailout! */
+			if (!copied)
+				copied = -EFAULT;
+			break;
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+		dst += used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+		if (used + offset < skb->len)
+			continue;
+
+		if (skb->h.th->fin)
+			goto found_fin_ok;
+		sk_eat_skb(sk, skb, copied_early);
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		sk_eat_skb(sk, skb, copied_early);
+		break;
+	} while (len > 0);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
+	return copied;
+
+out:
+	TCP_CHECK_TIMER(sk);
+	return err;
+}
+
+/*
  *	This routine copies from a sock struct into the user buffer.
  *
  *	Technical note: in 2.3 we work on _locked_ socket, so that
@@ -2342,6 +2638,8 @@ EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_async_recv);
+EXPORT_SYMBOL(tcp_async_send);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 738dad9..f70d045 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3112,6 +3112,7 @@ static void tcp_ofo_queue(struct sock *s
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
@@ -3955,7 +3956,8 @@ int tcp_rcv_established(struct sock *sk,
 			int copied_early = 0;
 
 			if (tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len) {
+			    len - tcp_header_len <= tp->ucopy.len &&
+			    !sock_async(sk)) {
 #ifdef CONFIG_NET_DMA
 				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
 					copied_early = 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6f39e8..ae4f23c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@ #include <linux/cache.h>
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -868,6 +869,7 @@ #endif
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
@@ -1108,24 +1110,30 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock_nested(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock_nested(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-		struct tcp_sock *tp = tcp_sk(sk);
-		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
-		if (tp->ucopy.dma_chan)
-			ret = tcp_v4_do_rcv(sk, skb);
-		else
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+				tp->ucopy.dma_chan = get_softnet_dma();
+			if (tp->ucopy.dma_chan)
+				ret = tcp_v4_do_rcv(sk, skb);
+			else
 #endif
-		{
-			if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			{
+				if (!tcp_prequeue(sk, skb))
+				ret = tcp_v4_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 
@@ -1849,6 +1857,8 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= tcp_v4_hash,
 	.unhash			= tcp_unhash,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 923989d..a5d3ac8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1230,22 +1230,28 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v6_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-                struct tcp_sock *tp = tcp_sk(sk);
-                if (tp->ucopy.dma_chan)
-                        ret = tcp_v6_do_rcv(sk, skb);
-                else
-#endif
-		{
-			if (!tcp_prequeue(sk, skb))
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (tp->ucopy.dma_chan)
 				ret = tcp_v6_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			else
+#endif
+			{
+				if (!tcp_prequeue(sk, skb))
+					ret = tcp_v6_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 	return ret ? -1 : 0;
@@ -1596,6 +1602,8 @@ struct proto tcpv6_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.hash			= tcp_v6_hash,
 	.unhash			= tcp_unhash,


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take4 4/4] kevent: poll/select() notifications. Timer notifications.
  2006-08-05 13:02                               ` [take4 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-05 13:02                                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-05 13:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similarly to epoll and has the same issues (the
callback is invoked not from the caller's internal state machine, but
through a process wakeup).

Timer notifications can be used for fine-grained per-process time
management, since interval timers are very inconvenient to use and
limited in number.
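
For illustration only (not part of the patch), below is a minimal
userspace sketch of arming a timer kevent through the interfaces in
this series. The ukevent layout and the command/type values are
mirrored from include/linux/kevent.h in this take; the syscall numbers
are the i386 ones added here. That KEVENT_CTL_INIT returns the control
descriptor, and that the timeout argument can simply be made large to
block, are assumptions of this sketch.

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Mirrored from include/linux/kevent.h in this take. */
#define KEVENT_TIMER		2
#define KEVENT_TIMER_FIRED	0x1
#define KEVENT_CTL_ADD		0
#define KEVENT_CTL_INIT		3

struct kevent_id { unsigned int raw[2]; };
struct ukevent {
	struct kevent_id id;
	unsigned int type, event, req_flags, ret_flags, ret_data[2];
	union { unsigned int user[2]; void *ptr; };
};

/* i386 syscall numbers from this take. */
#define SYS_kevent_get_events	320
#define SYS_kevent_ctl		321

int main(void)
{
	struct ukevent uk;
	/* Assumed: KEVENT_CTL_INIT creates and returns the control fd. */
	int fd = syscall(SYS_kevent_ctl, 0, KEVENT_CTL_INIT, 0, NULL);

	if (fd < 0)
		return 1;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;
	uk.id.raw[0] = 1000;	/* period in msecs, see kevent_timer_enqueue() */

	if (syscall(SYS_kevent_ctl, fd, KEVENT_CTL_ADD, 1, &uk) < 0)
		return 1;

	/* Wait for at least one (and at most one) ready event. */
	if (syscall(SYS_kevent_get_events, fd, 1, 1, ~0U, &uk, 0) < 0)
		return 1;

	printf("timer fired, jiffies snapshot: %u\n", uk.ret_data[0]);
	return 0;
}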

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..f175edd
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,119 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&st->lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_timer);


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take4 1/4] kevent: Core files.
  2006-08-05 13:02                         ` [take4 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-05 13:02                           ` Evgeniy Polyakov
  2006-08-05 13:02                             ` [take4 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
  2006-08-05 17:57                             ` [take4 1/4] kevent: Core files Greg KH
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-05 13:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

It might also include parts from other subsystems (like network-related
syscalls), so it is possible that it will not compile without the other
patches applied.
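
Purely to illustrate the userspace contract these files imply (this is
not part of the patch), the sketch below registers a ready-for-accept
notification on a listening socket and harvests it. The ukevent layout
and constants are mirrored from include/linux/kevent.h below, and the
syscall numbers are the i386 ones from this take; that KEVENT_CTL_INIT
yields the control descriptor and that kevent_get_events() returns the
number of collected events are assumptions of the sketch.

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Mirrored from include/linux/kevent.h in this take. */
#define KEVENT_SOCKET		0
#define KEVENT_SOCKET_ACCEPT	0x2
#define KEVENT_CTL_ADD		0
#define KEVENT_CTL_INIT		3
#define KEVENT_RET_BROKEN	0x1

struct kevent_id { unsigned int raw[2]; };
struct ukevent {
	struct kevent_id id;
	unsigned int type, event, req_flags, ret_flags, ret_data[2];
	union { unsigned int user[2]; void *ptr; };
};

int main(void)
{
	struct ukevent uk;
	int s = socket(AF_INET, SOCK_STREAM, 0);
	/* Assumed: KEVENT_CTL_INIT creates and returns the control fd. */
	int fd = syscall(321 /* __NR_kevent_ctl, i386 */, 0, KEVENT_CTL_INIT, 0, NULL);

	if (s < 0 || fd < 0 || listen(s, 16) < 0)
		return 1;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_SOCKET;
	uk.event = KEVENT_SOCKET_ACCEPT;
	uk.id.raw[0] = s;	/* kevent_socket_enqueue() does fget(id.raw[0]) */

	if (syscall(321, fd, KEVENT_CTL_ADD, 1, &uk) < 0)
		return 1;

	/* Collect one ready event; ret_flags carries the completion status. */
	if (syscall(320 /* __NR_kevent_get_events, i386 */, fd, 1, 1, ~0U, &uk, 0) > 0 &&
	    !(uk.ret_flags & KEVENT_RET_BROKEN))
		accept(s, NULL, NULL);

	return 0;
}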

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..0af988a 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,7 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_aio_recv
+	.long sys_aio_send
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..e157ad4 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,8 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_aio_recv
+	.quad sys_aio_send
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..a76e50d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,14 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_aio_recv		318
+#define __NR_aio_send		319
+#define __NR_kevent_get_events	320
+#define __NR_kevent_ctl		321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 322
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..9a0b581 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,18 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_aio_recv		280
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send		281
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_kevent_get_events	282
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		283
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..c32f3bd
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,296 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must exactly match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time a new event has been caught. */
+/* @enqueue is called each time a new event is queued. */
+/* @dequeue is called each time an event is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of
+							 * wait_queue_t containers here, one for each
+							 * poll_wait() call made from ->poll(). */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+
+extern struct file_operations socket_file_ops;
+
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_user_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_user_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_user_stat_increase_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_user_stat_increase_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_user_stat_increase_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_user_stat_print(u)		({ (void) u;})
+#define kevent_user_stat_init(u)		({ (void) u;})
+#define kevent_user_stat_increase_im(u)		({ (void) u;})
+#define kevent_user_stat_increase_wait(u)	({ (void) u;})
+#define kevent_user_stat_increase_total(u)	({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..bd891f0
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,12 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	unsigned int		qlen;			/* Number of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..143f3b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,9 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..88b35af
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,57 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the event queue mechanism. It can be used as a
+	  replacement for poll()/select(), AIO callback invocations, advanced
+	  timer notifications and other kernel object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default N
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents which are ready
+	  immediately at insertion time and number of kevents which were removed through
+	  readiness completion. It will be printed each time control kevent descriptor
+	  is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  socket operations, like new packet arrival and ready-for-accept
+	  conditions.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables kevent based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO operations.
+	  Only AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..e63a8fd
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,238 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if the event is ready immediately,
+ * a negative value in case of error, and zero if the event has been queued.
+ * The ->enqueue() callback must increase the origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Removes the event from the appropriate queue.
+ * The ->dequeue() callback must decrease the origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before the event is added into some origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used, since kevent_enqueue() would
+ * fail to add it into the origin's queue and would set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->kevent_entry.next = LIST_POISON1;
+	k->storage_entry.prev = LIST_POISON2;
+	k->ready_entry.next = LIST_POISON1;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from the ->enqueue() callback after the reference counter of the
+ * given origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	st->qlen++;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease the origin's reference counter in any way;
+ * it must be called before that counter is dropped, so the storage
+ * itself is still valid. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->storage_entry.prev != LIST_POISON2) {
+		list_del_rcu(&k->storage_entry);
+		st->qlen--;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int err, rem = 0;
+	unsigned long flags;
+
+	err = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (err > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (err < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!err)
+		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (err) {
+		if ((rem || err < 0) && k->storage_entry.prev != LIST_POISON2) {
+			list_del_rcu(&k->storage_entry);
+			k->st->qlen--;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (k->ready_entry.next == LIST_POISON1) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			ready_callback(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	st->qlen = 0;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+static int __init kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, 0, NULL, NULL);
+	if (!kevent_cache)
+		panic("kevent: Unable to create a cache.\n");
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..452c246
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,880 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <asm/io.h>
+
+static struct class *kevent_user_class;
+static char kevent_name[] = "kevent";
+static int kevent_user_major;
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef, mnt);	
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	unsigned int *idx;
+	
+	idx = (unsigned int *)u->pring[0];
+	idx[0] = num;
+}
+
+/*
+ * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * Take that into account if you want to change size of struct ukevent.
+ */
+#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
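+
+/*
+ * Layout sketch, assuming 4096-byte pages and 40-byte ukevents
+ * (so KEVENTS_ON_PAGE == 102):
+ *
+ *	page 0: [u32 index][ukevent   0] ... [ukevent 101]
+ *	page 1: [ukevent 102] ... [ukevent 203]
+ *
+ * i.e. ukevent N lives on page N/KEVENTS_ON_PAGE at slot N%KEVENTS_ON_PAGE.
+ */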
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int *idx_ptr, idx, pidx, off;
+	struct ukevent *ukev;
+	
+	idx_ptr = (unsigned int *)k->user->pring[0];
+	idx = idx_ptr[0];
+
+	pidx = idx/KEVENTS_ON_PAGE;
+	off = idx%KEVENTS_ON_PAGE;
+
+	if (pidx == 0)
+		ukev = (struct ukevent *)(k->user->pring[pidx] + sizeof(unsigned int));
+	else
+		ukev = (struct ukevent *)(k->user->pring[pidx]);
+
+	memcpy(&ukev[off], &k->event, sizeof(struct ukevent));
+
+	idx++;
+	if (idx >= KEVENT_MAX_EVENTS)
+		idx = 0;
+
+	idx_ptr[0] = idx;
+}
+
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	for (i=0; i<pnum; ++i) {
+		u->pring[i] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[i])
+			break;
+	}
+
+	if (i != pnum) {
+		pnum = i;
+		goto err_out_free;
+	}
+
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+	
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	u->ready_num = 0;
+	kevent_user_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	u->kevent_num = 0;
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->max_ready_num = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_user_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start, psize;
+	int pnum = size/PAGE_SIZE, i;
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	psize = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE);
+
+	if (size + vma->vm_pgoff*PAGE_SIZE != psize)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	for (i=0; i<pnum; ++i) {
+		if (remap_pfn_range(vma, start,
+					virt_to_phys((void *)u->pring[i+vma->vm_pgoff]) >> PAGE_SHIFT,
+					PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+		start += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->ready_entry.next != LIST_POISON1) {
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove the kevent from the user's list of all events,
+ * dequeue it from its storage and decrease the user's reference counter,
+ * since this kevent no longer exists. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k;
+	int found = 0;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			found = 1;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return (found)?k:NULL;
+}
+
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * No new entries can be added to or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EINVAL;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EINVAL;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_user_stat_increase_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 0);
+	} 
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate a kevent for each one
+ * and add them to the appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * If something goes wrong, all events will be dequeued and
+ * a negative error will be returned.
+ * On success the number of immediately finished events is returned;
+ * those finished events (struct ukevent) are copied back to the
+ * beginning of the user buffer. The caller must check the ret_flags
+ * field of each returned ukevent to determine whether it fired or failed.
+ */
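+/*
+ * For example (sketch): adding three events of which one completes
+ * immediately returns 1, and that event's ukevent, with KEVENT_RET_DONE
+ * set in ret_flags, is copied back to the start of the user buffer.
+ */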
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -ENFILE;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_user_stat_increase_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EINVAL;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_user_stat_increase_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EINVAL;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events
+ * are ready; if the timeout is zero, it waits no more than 1 second for at least
+ * one event to become ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int cerr = 0, num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (timeout)
+			wait_event_interruptible_timeout(u->wait, 
+				u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+		else
+			wait_event_interruptible_timeout(u->wait, 
+					u->ready_num > 0, msecs_to_jiffies(1000));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_user_stat_increase_wait(u);
+	}
+
+	return (cerr)?cerr:num;
+}
+
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+static int __devinit kevent_user_init(void)
+{
+	struct class_device *dev;
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
+	if (kevent_user_major < 0) {
+		printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", 
+				kevent_name, kevent_user_major);
+		return -ENODEV;
+	}
+
+	kevent_user_class = class_create(THIS_MODULE, "kevent");
+	if (IS_ERR(kevent_user_class)) {
+		printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_user_class));
+		err = PTR_ERR(kevent_user_class);
+		goto err_out_unregister;
+	}
+
+	dev = class_device_create(kevent_user_class, NULL, 
+			MKDEV(kevent_user_major, 0), NULL, kevent_name);
+	if (IS_ERR(dev)) {
+		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
+				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
+		err = PTR_ERR(dev);
+		goto err_out_class_destroy;
+	}
+
+	printk("KEVENT subsystem: chardev helper: major=%d.\n", kevent_user_major);
+
+	return 0;
+
+err_out_class_destroy:
+	class_destroy(kevent_user_class);
+err_out_unregister:
+	unregister_chrdev(kevent_user_major, kevent_name);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	class_device_destroy(kevent_user_class, MKDEV(kevent_user_major, 0));
+	class_destroy(kevent_user_class);
+	unregister_chrdev(kevent_user_major, kevent_name);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8843cca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,11 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);
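
As a rough illustration of the intended userspace flow (a sketch only:
kevent_ctl() and kevent_get_events() stand for syscall(2) stubs built
from the new __NR_kevent_* numbers, and the timer usage is illustrative):

	struct ukevent uk;
	int fd, err;

	/* KEVENT_CTL_INIT ignores the descriptor argument and returns
	 * a new kevent control file descriptor. */
	fd = kevent_ctl(-1, KEVENT_CTL_INIT, 0, NULL);

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;
	uk.req_flags = KEVENT_REQ_ONESHOT;

	/* Returns the number of events which completed immediately; such
	 * events are copied back with KEVENT_RET_DONE set in ret_flags. */
	err = kevent_ctl(fd, KEVENT_CTL_ADD, 1, &uk);

	/* Wait for one ready event; a zero timeout in blocking mode
	 * means waiting up to one second. */
	err = kevent_get_events(fd, 1, 1, 0, &uk, 0);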


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* Re: [take4 1/4] kevent: Core files.
  2006-08-05 13:02                           ` [take4 1/4] kevent: Core files Evgeniy Polyakov
  2006-08-05 13:02                             ` [take4 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
@ 2006-08-05 17:57                             ` Greg KH
  2006-08-05 18:10                               ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Greg KH @ 2006-08-05 17:57 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Sat, Aug 05, 2006 at 05:02:38PM +0400, Evgeniy Polyakov wrote:
> +static int __devinit kevent_user_init(void)
> +{
> +	struct class_device *dev;
> +	int err = 0;
> +	
> +	err = register_filesystem(&kevent_fs_type);
> +	if (err)
> +		panic("%s: failed to register filesystem: err=%d.\n",
> +			       kevent_name, err);
> +
> +	kevent_mnt = kern_mount(&kevent_fs_type);
> +	if (IS_ERR(kevent_mnt))
> +		panic("%s: failed to mount silesystem: err=%ld.\n", 
> +				kevent_name, PTR_ERR(kevent_mnt));
> +	
> +	kevent_user_major = register_chrdev(0, kevent_name, &kevent_user_fops);
> +	if (kevent_user_major < 0) {
> +		printk(KERN_ERR "Failed to register \"%s\" char device: err=%d.\n", 
> +				kevent_name, kevent_user_major);
> +		return -ENODEV;
> +	}
> +
> +	kevent_user_class = class_create(THIS_MODULE, "kevent");
> +	if (IS_ERR(kevent_user_class)) {
> +		printk(KERN_ERR "Failed to register \"%s\" class: err=%ld.\n", 
> +				kevent_name, PTR_ERR(kevent_user_class));
> +		err = PTR_ERR(kevent_user_class);
> +		goto err_out_unregister;
> +	}
> +
> +	dev = class_device_create(kevent_user_class, NULL, 
> +			MKDEV(kevent_user_major, 0), NULL, kevent_name);
> +	if (IS_ERR(dev)) {
> +		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
> +				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
> +		err = PTR_ERR(dev);
> +		goto err_out_class_destroy;
> +	}

As you are only using 1 minor number in this code, why not just use a
miscdevice instead?  It saves a bit of overhead and makes the code a
tiny bit smaller :)
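
Something like the following (untested sketch, reusing your
kevent_user_fops) should be all that is needed:

	static struct miscdevice kevent_miscdev = {
		.minor = MISC_DYNAMIC_MINOR,
		.name = kevent_name,
		.fops = &kevent_user_fops,
	};

	err = misc_register(&kevent_miscdev);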

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take4 1/4] kevent: Core files.
  2006-08-05 17:57                             ` [take4 1/4] kevent: Core files Greg KH
@ 2006-08-05 18:10                               ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-05 18:10 UTC (permalink / raw)
  To: Greg KH; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Sat, Aug 05, 2006 at 10:57:02AM -0700, GregKH (greg@kroah.com) wrote:
> > +	dev = class_device_create(kevent_user_class, NULL, 
> > +			MKDEV(kevent_user_major, 0), NULL, kevent_name);
> > +	if (IS_ERR(dev)) {
> > +		printk(KERN_ERR "Failed to create %d.%d class device in \"%s\" class: err=%ld.\n", 
> > +				kevent_user_major, 0, kevent_name, PTR_ERR(dev));
> > +		err = PTR_ERR(dev);
> > +		goto err_out_class_destroy;
> > +	}
> 
> As you are only using 1 minor number in this code, why not just use a
> miscdevice instead?  It saves a bit of overhead and makes the code a
> tiny bit smaller :)

No problem. I will move it to miscdevice instead of full chardev.

> thanks,
> 
> greg k-h

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 0/3] kevent: Generic event handling mechanism.
  2006-08-09  8:02                         ` [take6 0/3] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-09  7:58                           ` David Miller
  2006-08-09  8:07                             ` Evgeniy Polyakov
  2006-08-09  8:02                           ` [take6 1/3] kevent: Core files Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-09  7:58 UTC (permalink / raw)
  To: johnpol; +Cc: linux-kernel, drepper, netdev, zach.brown

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Wed, 9 Aug 2006 12:02:39 +0400

Evgeniy, it's things like the following that make it very draining
mentally to review your work.

>  * removed AIO stuff from patchset

You didn't really do this, you leave the aio_* syscalls and stubs in
there, and you also left things like tcp_async_send() in there.

All the foo_naio_*() stuff is still in there to.

Please remove all of the async business we've asked you to.

Thanks.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take6 0/3] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (5 preceding siblings ...)
  2006-08-05 13:02                         ` [take4 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-09  8:02                         ` Evgeniy Polyakov
  2006-08-09  7:58                           ` David Miller
  2006-08-09  8:02                           ` [take6 1/3] kevent: Core files Evgeniy Polyakov
  2006-08-11  8:40                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
                                           ` (3 subsequent siblings)
  10 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09  8:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Generic event handling mechanism.

Changes from 'take5' patchset:
 * removed compilation warnings about unused variables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so lockdep was
	taught to differentiate between the various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in the mmaped buffer implementation which would end up in a wrong index calculation 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet, see the sketch below)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.
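
Since there is no userspace for the mapped ring buffer yet, it is expected
to be consumed roughly as follows (a sketch; ring_size must match the
kernel's ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + 4, PAGE_SIZE),
and events past the first page follow at page granularity, see
kevent_user_ring_add_event()):

	unsigned int *ring;
	struct ukevent *ev;
	unsigned int idx;

	ring = mmap(NULL, ring_size, PROT_READ, MAP_SHARED, fd, 0);

	idx = ring[0];				/* 4-byte index at the start of page 0 */
	ev = (struct ukevent *)(ring + 1);	/* events follow it on the first page */
	/* ev[idx - 1] is the most recently committed event on the first page */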

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take6 1/3] kevent: Core files.
  2006-08-09  8:02                         ` [take6 0/3] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-08-09  7:58                           ` David Miller
@ 2006-08-09  8:02                           ` Evgeniy Polyakov
  2006-08-09  8:02                             ` [take6 3/3] kevent: Network AIO, socket notifications Evgeniy Polyakov
                                               ` (2 more replies)
  1 sibling, 3 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09  8:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

It might also include parts from other subsystems (like network related
syscalls), so it is possible that it will not compile without the other
patches applied.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..0af988a 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,7 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_aio_recv
+	.long sys_aio_send
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..e157ad4 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,8 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_aio_recv
+	.quad sys_aio_send
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..a76e50d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,14 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_aio_recv		318
+#define __NR_aio_send		319
+#define __NR_kevent_get_events	320
+#define __NR_kevent_ctl		321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 322
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..9a0b581 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,18 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_aio_recv		280
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send		281
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_kevent_get_events	282
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		283
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..b4342f0
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,296 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must be exactly the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time a new event has been caught. */
+/* @enqueue is called each time a new event is queued. */
+/* @dequeue is called each time an event is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages. 
+							 * poll()/select storage has a list of wait_queue_t containers 
+							 * for each ->poll() { poll_wait()' } here.
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_user_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_user_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_user_stat_increase_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_user_stat_increase_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_user_stat_increase_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_user_stat_print(u)		({ (void) u;})
+#define kevent_user_stat_init(u)		({ (void) u;})
+#define kevent_user_stat_increase_im(u)		({ (void) u;})
+#define kevent_user_stat_increase_wait(u)	({ (void) u;})
+#define kevent_user_stat_increase_total(u)	({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..bd891f0
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,12 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	unsigned int		qlen;			/* Number of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..143f3b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,9 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, size_t size, unsigned flags);
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..88b35af
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,50 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the event queue mechanism.
+	  It can be used as a replacement for poll()/select(), AIO callback invocations,
+	  advanced timer notifications and other kernel object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default N
+	help
+	  This option turns kevent_user statistics collection on.
+	  The data includes the total number of kevents, the number of kevents
+	  which are ready immediately at insertion time and the number of kevents
+	  which were removed through readiness completion. It is printed each time
+	  a control kevent descriptor is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  socket operations, like new packet arrival, readiness to accept
+	  connections and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,6 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..e63a8fd
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,238 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event to the appropriate origin's queue.
+ * Returns a positive value if the event is ready immediately,
+ * a negative value in case of error and zero if the event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before the event is added to any origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used, since kevent_enqueue() would
+ * fail to add this kevent to the origin's queue and would set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->kevent_entry.next = LIST_POISON1;
+	k->storage_entry.prev = LIST_POISON2;
+	k->ready_entry.next = LIST_POISON1;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -E2BIG;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	st->qlen++;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease the origin's reference counter in any way;
+ * it must be called before that counter is dropped, so the storage
+ * itself is still valid. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->storage_entry.prev != LIST_POISON2) {
+		list_del_rcu(&k->storage_entry);
+		st->qlen--;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int err, rem = 0;
+	unsigned long flags;
+
+	err = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (err > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (err < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!err)
+		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (err) {
+		if ((rem || err < 0) && k->storage_entry.prev != LIST_POISON2) {
+			list_del_rcu(&k->storage_entry);
+			k->st->qlen--;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (k->ready_entry.next == LIST_POISON1) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			ready_callback(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	st->qlen = 0;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+static int __init kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, 0, NULL, NULL);
+	if (!kevent_cache)
+		panic("kevent: Unable to create a cache.\n");
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..7b6374b
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,857 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static char kevent_name[] = "kevent";
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice kevent_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = kevent_name,
+	.fops = &kevent_user_fops,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* An arbitrary magic number for this pseudo filesystem's superblock. */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef, mnt);	
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	unsigned int *idx;
+	
+	idx = (unsigned int *)u->pring[0];
+	idx[0] = num;
+}
+
+/*
+ * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * Take that into account if you want to change the size of struct ukevent.
+ */
+#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int *idx_ptr, idx, pidx, off;
+	struct ukevent *ukev;
+	
+	idx_ptr = (unsigned int *)k->user->pring[0];
+	idx = idx_ptr[0];
+
+	pidx = idx/KEVENTS_ON_PAGE;
+	off = idx%KEVENTS_ON_PAGE;
+
+	if (pidx == 0)
+		ukev = (struct ukevent *)(k->user->pring[pidx] + sizeof(unsigned int));
+	else
+		ukev = (struct ukevent *)(k->user->pring[pidx]);
+
+	memcpy(&ukev[off], &k->event, sizeof(struct ukevent));
+
+	idx++;
+	if (idx >= KEVENT_MAX_EVENTS)
+		idx = 0;
+
+	idx_ptr[0] = idx;
+}
+
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	for (i=0; i<pnum; ++i) {
+		u->pring[i] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[i])
+			break;
+	}
+
+	if (i != pnum) {
+		pnum = i;
+		goto err_out_free;
+	}
+
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+	
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	u->ready_num = 0;
+	kevent_user_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	u->kevent_num = 0;
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->max_ready_num = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_user_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start, psize;
+	int pnum = size/PAGE_SIZE, i;
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	psize = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE);
+
+	if (size + vma->vm_pgoff*PAGE_SIZE != psize)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	for (i=0; i<pnum; ++i) {
+		if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[i+vma->vm_pgoff])>>PAGE_SHIFT,
+					PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+		start += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->ready_entry.next != LIST_POISON1) {
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove the kevent from the user's list of all events,
+ * dequeue it from its storage and drop the user's reference counter,
+ * since this kevent no longer exists. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k;
+	int found = 0;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			found = 1;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return (found)?k:NULL;
+}
+
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * No new entry can be added or removed from any list at this point.
+ * It is not permitted to call ->ioctl() and ->release() in parallel.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EINVAL;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EINVAL;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EINVAL;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_user_stat_increase_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 0);
+	} 
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate a kevent for each one
+ * and add them into the appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * If something goes wrong, all events will be dequeued and
+ * a negative error will be returned.
+ * On success the number of finished events is returned, and an array of
+ * finished events (struct ukevent) will be placed behind the
+ * kevent_user_control structure. The user must run through that array and
+ * check the ret_flags field of each ukevent to see if it fired or failed.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -ENFILE;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_user_stat_increase_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EINVAL;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_user_stat_increase_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EINVAL;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int cerr = 0, num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		wait_event_interruptible_timeout(u->wait, 
+			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent))) {
+			cerr = -EINVAL;
+			break;
+		}
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_user_stat_increase_wait(u);
+	}
+
+	return (cerr)?cerr:num;
+}
+
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+static int __init kevent_user_init(void)
+{
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	err = misc_register(&kevent_miscdev);
+	if (err) {
+		printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err);
+		goto err_out_exit;
+	}
+
+	printk("KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_exit:
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+
+	return err;
+}
+
+static void __exit kevent_user_fini(void)
+{
+	misc_deregister(&kevent_miscdev);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8843cca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,11 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_aio_recv);
+cond_syscall(sys_aio_send);
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);



* [take6 2/3] kevent: poll/select() notifications. Timer notifications.
  2006-08-09  8:02                             ` [take6 3/3] kevent: Network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-09  8:02                               ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09  8:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similarly to epoll and has the same issues (the
callback is invoked not from the internal state machine of the caller,
but through a process wakeup).
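
For illustration, userspace registration of a pollable descriptor would
look roughly like the sketch below. It is not part of the patch: it
assumes the struct ukevent layout and the KEVENT_* constants from this
patchset's linux/kevent.h, a kevent queue descriptor kevent_fd obtained
via KEVENT_CTL_INIT, and a __NR_kevent_ctl number as added by the
patchset's syscall table changes; error handling is omitted.

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/kevent.h>	/* struct ukevent, KEVENT_* (this patchset) */

	/* Watch @fd for the poll events in @events (e.g. POLLIN). */
	static int kevent_add_poll(int kevent_fd, int fd, unsigned int events)
	{
		struct ukevent uk;

		memset(&uk, 0, sizeof(uk));
		uk.type = KEVENT_POLL;
		uk.id.raw[0] = fd;	/* kevent_poll_enqueue() does fget(id.raw[0]) */
		uk.event = events;
		uk.req_flags = KEVENT_REQ_ONESHOT;

		/*
		 * Returns the number of events completed or broken immediately
		 * (copied back into the array), or a negative error.
		 */
		return syscall(__NR_kevent_ctl, kevent_fd, KEVENT_CTL_ADD, 1, &uk);
	}

Ready events are then fetched with kevent_get_events() or read from the
mmap'ed ring set up in kevent_user.c.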

Timer notifications can be used for fine-grained per-process time
management, since interval timers are inconvenient to use and are
limited in number.
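
A periodic timer would be requested the same way: kevent_timer_enqueue()
below takes the period in milliseconds from id.raw[0] and re-arms the
timer on every expiration. A sketch under the same assumptions as above:

	struct ukevent uk;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.id.raw[0] = 250;		/* period in msecs, see kevent_timer_enqueue() */
	uk.event = KEVENT_MASK_ALL;	/* kevent_timer_func() signals KEVENT_MASK_ALL */
	syscall(__NR_kevent_ctl, kevent_fd, KEVENT_CTL_ADD, 1, &uk);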

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..f175edd
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,119 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&st->lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_timer);



* [take6 3/3] kevent: Network AIO, socket notifications.
  2006-08-09  8:02                           ` [take6 1/3] kevent: Core files Evgeniy Polyakov
@ 2006-08-09  8:02                             ` Evgeniy Polyakov
  2006-08-09  8:02                               ` [take6 2/3] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  2006-08-09 17:47                             ` [take6 1/3] kevent: Core files Stephen Hemminger
  2006-08-09 22:21                             ` Andrew Morton
  2 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09  8:02 UTC (permalink / raw)
  To: lkml; +Cc: David Miller, Ulrich Drepper, Evgeniy Polyakov, netdev, Zach Brown


Network AIO, socket notifications.

This patchset includes socket notifications and network asynchronous IO.
Network AIO is based on kevent and works as a usual kevent storage on
top of an inode.
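
For illustration, queueing an asynchronous receive from userspace would
look roughly like the sketch below. It is not part of the patch and
assumes the SO_ASYNC_SOCK option and the __NR_aio_recv number added by
this patchset, plus a kevent queue descriptor kevent_fd; error handling
is omitted.

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <sys/socket.h>

	/* @s is a connected TCP socket, @kevent_fd a kevent queue descriptor. */
	static int queue_async_recv(int kevent_fd, int s, void *buf, size_t size)
	{
		int one = 1;

		/* The socket must be switched into asynchronous mode first. */
		if (setsockopt(s, SOL_SOCKET, SO_ASYNC_SOCK, &one, sizeof(one)))
			return -1;

		/*
		 * Queues a one-shot KEVENT_NAIO request; 0 means queued (the
		 * completion arrives via kevent_fd), a positive value means
		 * the request completed immediately.
		 */
		return syscall(__NR_aio_recv, kevent_fd, s, buf, size, 0);
	}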

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 5755d57..9300678 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC		31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index b467026..fc2b49d 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -50,4 +50,6 @@ #define SO_ACCEPTCONN		30
 #define SO_PEERSEC             31
 #define SO_PASSSEC		34
 
+#define SO_ASYNC_SOCK		35
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4307e76..9267873 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1283,6 +1283,8 @@ extern struct sk_buff *skb_recv_datagram
 					 int noblock, int *err);
 extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
 				     struct poll_table_struct *wait);
+extern int	       skb_copy_datagram(const struct sk_buff *from, 
+					 int offset, void *dst, int size);
 extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
 					       int offset, struct iovec *to,
 					       int size);
diff --git a/include/net/sock.h b/include/net/sock.h
index 324b3ea..c43a153 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@ #include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -391,6 +392,8 @@ enum sock_flags {
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_ASYNC,
+	SOCK_ASYNC_INUSE,
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -450,6 +453,21 @@ static inline int sk_stream_memory_free(
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +495,7 @@ static inline void sk_add_backlog(struct
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -548,6 +567,12 @@ struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk, 
 						struct sk_buff *skb);
+	
+	int			(*async_recv) (struct sock *sk, 
+						void *dst, size_t size);
+	int			(*async_send) (struct sock *sk, 
+						struct page **pages, unsigned int poffset, 
+						size_t size);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
@@ -679,21 +704,6 @@ static inline struct kiocb *siocb_to_kio
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0720bdd..5a1899b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -364,6 +364,8 @@ extern int			compat_tcp_setsockopt(struc
 					int level, int optname,
 					char __user *optval, int optlen);
 extern void			tcp_set_keepalive(struct sock *sk, int val);
+extern int			tcp_async_recv(struct sock *sk, void *dst, size_t size);
+extern int			tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t size);
 extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg,
 					    size_t len, int nonblock, 
@@ -857,6 +859,7 @@ static inline int tcp_prequeue(struct so
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_naio.c b/kernel/kevent/kevent_naio.c
new file mode 100644
index 0000000..1b6122a
--- /dev/null
+++ b/kernel/kevent/kevent_naio.c
@@ -0,0 +1,237 @@
+/*
+ * 	kevent_naio.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+static int kevent_naio_enqueue(struct kevent *k);
+static int kevent_naio_dequeue(struct kevent *k);
+static int kevent_naio_callback(struct kevent *k);
+
+static int kevent_naio_setup_aio(int ctl_fd, int s, void __user *buf, 
+		size_t size, u32 event)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int err;
+	struct ukevent uk;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	u = file->private_data;
+	if (!u) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	memset(&uk, 0, sizeof(struct ukevent));
+	uk.type = KEVENT_NAIO;
+	uk.ptr = buf;
+	uk.req_flags = KEVENT_REQ_ONESHOT;
+	uk.event = event;
+	uk.id.raw[0] = s;
+	uk.id.raw[1] = size;
+
+	err = kevent_user_add_ukevent(&uk, u);
+
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+asmlinkage long sys_aio_recv(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_RECV);
+}
+
+asmlinkage long sys_aio_send(int ctl_fd, int s, void __user *buf, 
+		size_t size, unsigned flags)
+{
+	return kevent_naio_setup_aio(ctl_fd, s, buf, size, KEVENT_SOCKET_SEND);
+}
+
+static int kevent_naio_enqueue(struct kevent *k)
+{
+	int err = -ENODEV, i;
+	struct page **page;
+	void *addr;
+	unsigned int size = k->event.id.raw[1];
+	int num = size/PAGE_SIZE;
+	struct socket *sock;
+	struct sock *sk = NULL;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		return -ENODEV;
+
+	sk = sock->sk;
+
+	err = -ESOCKTNOSUPPORT;
+	if (!sk || !sk->sk_prot->async_recv || !sk->sk_prot->async_send || 
+		!sock_flag(sk, SOCK_ASYNC))
+		goto err_out_fput;
+	
+	addr = k->event.ptr;
+	if (((unsigned long)addr & PAGE_MASK) != (unsigned long)addr)
+		num++;
+
+	page = kmalloc(sizeof(struct page *) * num, GFP_KERNEL);
+	if (!page)
+		goto err_out_fput;
+
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, (unsigned long)addr, 
+			num, 1, 0, page, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err <= 0)
+		goto err_out_free;
+	num = err;
+
+	k->event.ret_data[0] = num;
+	k->event.ret_data[1] = offset_in_page(k->event.ptr);
+	k->priv = page;
+
+	sk->sk_allocation = GFP_ATOMIC;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	err = kevent_socket_enqueue(k);
+	spin_unlock_bh(&sk->sk_lock.slock);
+	if (err)
+		goto err_out_put_pages;
+
+	sockfd_put(sock);
+
+	return err;
+
+err_out_put_pages:
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+err_out_free:
+	kfree(page);
+err_out_fput:
+	sockfd_put(sock);
+
+	return err;
+}
+
+static int kevent_naio_dequeue(struct kevent *k)
+{
+	int err, i, num;
+	struct page **page = k->priv;
+
+	num = k->event.ret_data[0];
+
+	err = kevent_socket_dequeue(k);
+
+	for (i=0; i<num; ++i)
+		page_cache_release(page[i]);
+
+	kfree(k->priv);
+	k->priv = NULL;
+
+	return err;
+}
+
+static int kevent_naio_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	unsigned int size = k->event.id.raw[1];
+	unsigned int off = k->event.ret_data[1];
+	struct page **pages = k->priv, *page;
+	int ready = 0, num = off/PAGE_SIZE, err = 0, send = 0;
+	void *ptr, *optr;
+	unsigned int len;
+
+	if (!sock_flag(sk, SOCK_ASYNC))
+		return -1;
+
+	if (k->event.event & KEVENT_SOCKET_SEND)
+		send = 1;
+	else if (!(k->event.event & KEVENT_SOCKET_RECV))
+		return -EINVAL;
+
+	/*
+	 * sk_prot->async_*() can return either the number of bytes processed,
+	 * a negative error value, or zero if the socket is closed.
+	 */
+
+	if (!send) {
+		page = pages[num];
+
+		optr = ptr = kmap_atomic(page, KM_IRQ0);
+		if (!ptr)
+			return -ENOMEM;
+
+		ptr += off % PAGE_SIZE;
+		len = min_t(unsigned int, PAGE_SIZE - (ptr - optr), size);
+
+		err = sk->sk_prot->async_recv(sk, ptr, len);
+
+		kunmap_atomic(optr, KM_IRQ0);
+	} else {
+		len = size;
+		err = sk->sk_prot->async_send(sk, pages, off, size);
+	}
+
+	if (err > 0) {
+		num++;
+		size -= err;
+		off += err;
+	}
+
+	k->event.ret_data[1] = off;
+	k->event.id.raw[1] = size;
+
+	if (err == 0 || (err < 0 && err != -EAGAIN))
+		ready = -1;
+
+	if (!size)
+		ready = 1;
+#if 0
+	printk("%s: sk=%p, k=%p, size=%4u, off=%4u, err=%3d, ready=%1d.\n",
+			__func__, sk, k, size, off, err, ready);
+#endif
+
+	return ready;
+}
+
+static int __init kevent_init_naio(void)
+{
+	struct kevent_callbacks *nc = &kevent_registered_callbacks[KEVENT_NAIO];
+
+	nc->enqueue = &kevent_naio_enqueue;
+	nc->dequeue = &kevent_naio_dequeue;
+	nc->callback = &kevent_naio_callback;
+	return 0;
+}
+late_initcall(kevent_init_naio);
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..3c4a9ad
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,144 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct sock *sk = SOCKET_I(inode)->sk;
+	int rmem;
+	
+	if (k->event.event & KEVENT_SOCKET_RECV) {
+		int ret = 0;
+		
+		if ((rmem = atomic_read(&sk->sk_rmem_alloc)) > 0 || 
+				!skb_queue_empty(&sk->sk_receive_queue))
+			ret = 1;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			ret = 1;
+		if (ret)
+			return ret;
+	}
+	if ((k->event.event & KEVENT_SOCKET_ACCEPT) && 
+		(!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) || 
+		 	reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue))) {
+		k->event.ret_data[1] = reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct inode *inode;
+	struct socket *sock;
+	int err = -ENODEV;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		goto err_out_exit;
+
+	inode = igrab(SOCK_INODE(sock));
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	err = k->callbacks.callback(k);
+	if (err)
+		goto err_out_dequeue;
+
+	sockfd_put(sock);
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	sockfd_put(sock);
+err_out_exit:
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket && !test_and_set_bit(SOCK_ASYNC_INUSE, &sk->sk_flags)) {
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+		sock_reset_flag(sk, SOCK_ASYNC_INUSE);
+	}
+}
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+#endif
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks *sc = &kevent_registered_callbacks[KEVENT_SOCKET];
+
+	sc->enqueue = &kevent_socket_enqueue;
+	sc->dequeue = &kevent_socket_dequeue;
+	sc->callback = &kevent_socket_callback;
+	return 0;
+}
+late_initcall(kevent_init_socket);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index aecddcc..493245b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -236,6 +236,61 @@ void skb_kill_datagram(struct sock *sk, 
 EXPORT_SYMBOL(skb_kill_datagram);
 
 /**
+ *	skb_copy_datagram - Copy a datagram.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: pointer to copy to
+ *	@len: amount of data to copy from the buffer to @to
+ */
+int skb_copy_datagram(const struct sk_buff *skb, int offset,
+			    void *to, int len)
+{
+	int i, fraglen, end = 0;
+	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
+next_skb:
+	fraglen = skb_headlen(skb);
+	i = -1;
+
+	while (1) {
+		int start = end;
+
+		if ((end += fraglen) > offset) {
+			int copy = end - offset, o = offset - start;
+
+			if (copy > len)
+				copy = len;
+			if (i == -1)
+				memcpy(to, skb->data + o, copy);
+			else {
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+				struct page *page = frag->page;
+				void *p = kmap(page) + frag->page_offset + o;
+				memcpy(to, p, copy);
+				kunmap(page);
+			}
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		if (++i >= skb_shinfo(skb)->nr_frags)
+			break;
+		fraglen = skb_shinfo(skb)->frags[i].size;
+	}
+	if (next) {
+		skb = next;
+		BUG_ON(skb_shinfo(skb)->frag_list);
+		next = skb->next;
+		goto next_skb;
+	}
+	return -EFAULT;
+}
+
+/**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from
@@ -530,6 +584,7 @@ unsigned int datagram_poll(struct file *
 
 EXPORT_SYMBOL(datagram_poll);
 EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram);
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 EXPORT_SYMBOL(skb_free_datagram);
 EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/sock.c b/net/core/sock.c
index 51fcfbc..138ce90 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -617,6 +617,16 @@ #endif
 			spin_unlock_bh(&sk->sk_lock.slock);
 			ret = -ENONET;
 			break;
+#ifdef CONFIG_KEVENT_SOCKET
+		case SO_ASYNC_SOCK:
+			spin_lock_bh(&sk->sk_lock.slock);
+			if (valbool)
+				sock_set_flag(sk, SOCK_ASYNC);
+			else
+				sock_reset_flag(sk, SOCK_ASYNC);
+			spin_unlock_bh(&sk->sk_lock.slock);
+			break;
+#endif
 
 		case SO_PASSSEC:
 			if (valbool)
@@ -1406,6 +1416,7 @@ static void sock_def_wakeup(struct sock 
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1415,6 +1426,7 @@ static void sock_def_error_report(struct
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1424,6 +1436,7 @@ static void sock_def_readable(struct soc
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1443,6 +1456,7 @@ static void sock_def_write_space(struct 
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1493,6 +1507,8 @@ #endif
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1559,8 +1575,10 @@ void fastcall release_sock(struct sock *
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f6a2d92..e878a41 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -206,6 +206,7 @@
  *					lingertime == 0 (RFC 793 ABORT Call)
  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
  *					csum_and_copy_from_user() if possible.
+ *	Evgeniy Polyakov	:	Network asynchronous IO.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -1085,6 +1086,282 @@ int tcp_read_sock(struct sock *sk, read_
 }
 
 /*
+ * Must be called with locked sock.
+ */
+int tcp_async_send(struct sock *sk, struct page **pages, unsigned int poffset, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+	int err = -EAGAIN;
+	ssize_t copied;
+
+	/* The connection must already be established: async send cannot wait. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, 1);
+	size_goal = tp->xmit_size_goal;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || sock_flag(sk, SOCK_DONE) ||
+			(sk->sk_state == TCP_CLOSE) || (atomic_read(&sk->sk_refcnt) == 1))
+		goto do_error;
+
+	while (len > 0) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		int size = min_t(size_t, len, PAGE_SIZE - offset);
+
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_pskb(sk, 0, 0,
+						   sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, tp, skb);
+			copy = size_goal;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (!sk_stream_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+		
+		if (can_coalesce) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk->sk_forward_alloc -= copy;
+		skb->ip_summed = CHECKSUM_HW;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(len -= copy))
+			goto out;
+
+		if (skb->len < mss_now)
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == sk->sk_send_head)
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		if (copied)
+			tcp_push(sk, tp, 0, mss_now, TCP_NAGLE_PUSH);
+
+		err = -EAGAIN;
+		goto do_error;
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, 0, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, 0, err);
+}
+
+/*
+ * Must be called with locked sock.
+ */
+int tcp_async_recv(struct sock *sk, void *dst, size_t len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	int copied_early = 0;
+
+	TCP_CHECK_TIMER(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	seq = &tp->copied_seq;
+
+	target = sock_rcvlowat(sk, 0, len);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		do {
+			if (!skb)
+				break;
+
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+				printk(KERN_INFO "async_recv bug: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+				break;
+			}
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (skb->h.th->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (skb->h.th->fin)
+				goto found_fin_ok;
+			skb = skb->next;
+		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+		if (copied)
+			break;
+
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+
+		if (sk->sk_state == TCP_CLOSE) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+				break;
+			}
+			break;
+		}
+
+		copied = -EAGAIN;
+		break;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+		/*
+		 * The CONFIG_NET_DMA early-copy path of tcp_recvmsg() does not
+		 * apply here: there is no user iovec to hand to the DMA engine,
+		 * only a kernel mapping of the pinned pages, so copy synchronously.
+		 */
+		{
+			err = skb_copy_datagram(skb, offset, dst, used);
+			if (err) {
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+		dst += used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+		if (used + offset < skb->len)
+			continue;
+
+		if (skb->h.th->fin)
+			goto found_fin_ok;
+		sk_eat_skb(sk, skb, copied_early);
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		sk_eat_skb(sk, skb, copied_early);
+		break;
+	} while (len > 0);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
+	return copied;
+
+out:
+	TCP_CHECK_TIMER(sk);
+	return err;
+}
+
+/*
  *	This routine copies from a sock struct into the user buffer.
  *
  *	Technical note: in 2.3 we work on _locked_ socket, so that
@@ -2342,6 +2638,8 @@ EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_async_recv);
+EXPORT_SYMBOL(tcp_async_send);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 738dad9..f70d045 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3112,6 +3112,7 @@ static void tcp_ofo_queue(struct sock *s
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
@@ -3955,7 +3956,8 @@ int tcp_rcv_established(struct sock *sk,
 			int copied_early = 0;
 
 			if (tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len) {
+			    len - tcp_header_len <= tp->ucopy.len &&
+			    !sock_async(sk)) {
 #ifdef CONFIG_NET_DMA
 				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
 					copied_early = 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6f39e8..ae4f23c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@ #include <linux/cache.h>
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -868,6 +869,7 @@ #endif
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
@@ -1108,24 +1110,30 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock_nested(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock_nested(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-		struct tcp_sock *tp = tcp_sk(sk);
-		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
-		if (tp->ucopy.dma_chan)
-			ret = tcp_v4_do_rcv(sk, skb);
-		else
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+				tp->ucopy.dma_chan = get_softnet_dma();
+			if (tp->ucopy.dma_chan)
+				ret = tcp_v4_do_rcv(sk, skb);
+			else
 #endif
-		{
-			if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			{
+				if (!tcp_prequeue(sk, skb))
+				ret = tcp_v4_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 
@@ -1849,6 +1857,8 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= tcp_v4_hash,
 	.unhash			= tcp_unhash,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 923989d..a5d3ac8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1230,22 +1230,28 @@ process:
 
 	skb->dev = NULL;
 
-	bh_lock_sock(sk);
 	ret = 0;
-	if (!sock_owned_by_user(sk)) {
+	if (sock_async(sk)) {
+		spin_lock_bh(&sk->sk_lock.slock);
+		ret = tcp_v4_do_rcv(sk, skb);
+		spin_unlock_bh(&sk->sk_lock.slock);
+	} else {
+		bh_lock_sock(sk);
+		if (!sock_owned_by_user(sk)) {
 #ifdef CONFIG_NET_DMA
-                struct tcp_sock *tp = tcp_sk(sk);
-                if (tp->ucopy.dma_chan)
-                        ret = tcp_v6_do_rcv(sk, skb);
-                else
-#endif
-		{
-			if (!tcp_prequeue(sk, skb))
+			struct tcp_sock *tp = tcp_sk(sk);
+			if (tp->ucopy.dma_chan)
 				ret = tcp_v6_do_rcv(sk, skb);
-		}
-	} else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+			else
+#endif
+			{
+				if (!tcp_prequeue(sk, skb))
+					ret = tcp_v6_do_rcv(sk, skb);
+			}
+		} else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
+	}
 
 	sock_put(sk);
 	return ret ? -1 : 0;
@@ -1596,6 +1602,8 @@ struct proto tcpv6_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
+	.async_recv		= tcp_async_recv,
+	.async_send		= tcp_async_send,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.hash			= tcp_v6_hash,
 	.unhash			= tcp_unhash,


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* Re: [take6 0/3] kevent: Generic event handling mechanism.
  2006-08-09  7:58                           ` David Miller
@ 2006-08-09  8:07                             ` Evgeniy Polyakov
  2006-08-09  8:20                               ` David Miller
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09  8:07 UTC (permalink / raw)
  To: David Miller; +Cc: linux-kernel, drepper, netdev, zach.brown

On Wed, Aug 09, 2006 at 12:58:56AM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Wed, 9 Aug 2006 12:02:39 +0400
> 
> Evgeniy, it's things like the following that make it very draining
> mentally to review your work.
> 
> >  * removed AIO stuff from patchset
> 
> You didn't really do this, you leave the aio_* syscalls and stubs in
> there, and you also left things like tcp_async_send() in there.

By AIO I meant VFS AIO, not the network stuff - exactly that part was
frowned upon in reviews.

> All the foo_naio_*() stuff is still in there to.
> 
> Please remove all of async business we've asked you to.

So you want to review only the kevent core at first, and postpone
network AIO and the rest of the implementation until the core is correct.
Should I remove poll/timer notifications too?

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 0/3] kevent: Generic event handling mechanism.
  2006-08-09  8:07                             ` Evgeniy Polyakov
@ 2006-08-09  8:20                               ` David Miller
  2006-08-09  8:24                                 ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-09  8:20 UTC (permalink / raw)
  To: johnpol; +Cc: linux-kernel, drepper, netdev, zach.brown

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Wed, 9 Aug 2006 12:07:57 +0400

> So you want to review only the kevent core at first, and postpone
> network AIO and the rest of the implementation until the core is correct.

That's the idea

> Should I remove poll/timer notifications too?

That can stay

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 0/3] kevent: Generic event handling mechanism.
  2006-08-09  8:20                               ` David Miller
@ 2006-08-09  8:24                                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09  8:24 UTC (permalink / raw)
  To: David Miller; +Cc: linux-kernel, drepper, netdev, zach.brown

On Wed, Aug 09, 2006 at 01:20:45AM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Wed, 9 Aug 2006 12:07:57 +0400
> 
> > So you want to review only the kevent core at first, and postpone
> > network AIO and the rest of the implementation until the core is correct.
> 
> That's the idea
> 
> > Should I remove poll/timer notifications too?
> 
> That can stay


Ok, I will regenerate the latest patchset completely without AIO stuff
(both network and VFS) and resend it soon.
Thank you.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-09  8:02                           ` [take6 1/3] kevent: Core files Evgeniy Polyakov
  2006-08-09  8:02                             ` [take6 3/3] kevent: Network AIO, socket notifications Evgeniy Polyakov
@ 2006-08-09 17:47                             ` Stephen Hemminger
  2006-08-09 19:17                               ` Evgeniy Polyakov
  2006-08-10  0:04                               ` David Miller
  2006-08-09 22:21                             ` Andrew Morton
  2 siblings, 2 replies; 160+ messages in thread
From: Stephen Hemminger @ 2006-08-09 17:47 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Wed, 9 Aug 2006 12:02:40 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> 
> Core files.
> 
> This patch includes core kevent files:
>  - userspace controlling
>  - kernelspace interfaces
>  - initialization
>  - notification state machines
> 
> It might also include parts from other subsystems (like network related
> syscalls), so it is possible that it will not compile without other
> patches applied.
> 
> Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> 
> 
> +#ifdef CONFIG_KEVENT_USER_STAT
> +static inline void kevent_user_stat_init(struct kevent_user *u)
> +{
> +	u->wait_num = u->im_num = u->total = 0;
> +}
> +static inline void kevent_user_stat_print(struct kevent_user *u)
> +{
> +	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
> +			__func__, u, u->wait_num, u->im_num, u->total);
> +}
> +static inline void kevent_user_stat_increase_im(struct kevent_user *u)
> +{
> +	u->im_num++;
> +}
> +static inline void kevent_user_stat_increase_wait(struct kevent_user *u)
> +{
> +	u->wait_num++;
> +}
> +static inline void kevent_user_stat_increase_total(struct kevent_user *u)
> +{
> +	u->total++;
> +}
>

static wrapper_functions_with_excessively_long_names(struct i_really_hate *this)
{
	suck();
}

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-09 17:47                             ` [take6 1/3] kevent: Core files Stephen Hemminger
@ 2006-08-09 19:17                               ` Evgeniy Polyakov
  2006-08-10  0:04                               ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-09 19:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Wed, Aug 09, 2006 at 10:47:38AM -0700, Stephen Hemminger (shemminger@osdl.org) wrote:
> > +static inline void kevent_user_stat_increase_total(struct kevent_user *u)
> > +{
> > +	u->total++;
> > +}
> >
> 
> static wrapper_functions_with_excessively_long_names(struct i_really_hate *this)
> {
> 	suck();
> }

Understood...

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-09  8:02                           ` [take6 1/3] kevent: Core files Evgeniy Polyakov
  2006-08-09  8:02                             ` [take6 3/3] kevent: Network AIO, socket notifications Evgeniy Polyakov
  2006-08-09 17:47                             ` [take6 1/3] kevent: Core files Stephen Hemminger
@ 2006-08-09 22:21                             ` Andrew Morton
  2006-08-10  6:14                               ` Evgeniy Polyakov
  2006-08-10 12:12                               ` [take7 0/1] kevent: generic event handling mechanism Evgeniy Polyakov
  2 siblings, 2 replies; 160+ messages in thread
From: Andrew Morton @ 2006-08-09 22:21 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Wed, 9 Aug 2006 12:02:40 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> 
> Core files.
> 
> This patch includes core kevent files:
>  - userspace controlling
>  - kernelspace interfaces
>  - initialization
>  - notification state machines
> 
> It might also include parts from other subsystems (like network related
> syscalls), so it is possible that it will not compile without other
> patches applied.

Summary:

- has serious bugs which indicate that much better testing is needed.

- All -EFOO return values need to be reviewed for appropriateness

- needs much better commenting before I can do more than a local-level review.


> --- /dev/null
> +++ b/include/linux/kevent.h
> ...
>
> +/*
> + * Poll events.
> + */
> +#define	KEVENT_POLL_POLLIN	0x0001
> +#define	KEVENT_POLL_POLLPRI	0x0002
> +#define	KEVENT_POLL_POLLOUT	0x0004
> +#define	KEVENT_POLL_POLLERR	0x0008
> +#define	KEVENT_POLL_POLLHUP	0x0010
> +#define	KEVENT_POLL_POLLNVAL	0x0020
> +
> +#define	KEVENT_POLL_POLLRDNORM	0x0040
> +#define	KEVENT_POLL_POLLRDBAND	0x0080
> +#define	KEVENT_POLL_POLLWRNORM	0x0100
> +#define	KEVENT_POLL_POLLWRBAND	0x0200
> +#define	KEVENT_POLL_POLLMSG	0x0400
> +#define	KEVENT_POLL_POLLREMOVE	0x1000

0x0800 got lost.

> +struct ukevent
> +{
> +	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
> +	__u32			type;			/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
> +	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
> +	__u32			req_flags;		/* Per-event request flags */
> +	__u32			ret_flags;		/* Per-event return flags */
> +	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
> +	union {
> +		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
> +		void		*ptr;
> +	};
> +};

What is this union for?

`ptr' needs a __user tag, does it not?

`ptr' will be 64-bit in-kernel and 64-bit for 64-bit userspace, but 32-bit
for 32-bit userspace.  I guess that's why user[] is there.

On big-endian machines, this pointer will appear to be word-swapped as far
as a 64-bit kernel is concerned.  Or something.

IOW: What's going on here??

> +#ifdef CONFIG_KEVENT_INODE
> +void kevent_inode_notify(struct inode *inode, u32 event);
> +void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
> +void kevent_inode_remove(struct inode *inode);
> +#else
> +static inline void kevent_inode_notify(struct inode *inode, u32 event)
> +{
> +}
> +static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
> +{
> +}
> +static inline void kevent_inode_remove(struct inode *inode)
> +{
> +}
> +#endif /* CONFIG_KEVENT_INODE */
> +#ifdef CONFIG_KEVENT_SOCKET
> +#ifdef CONFIG_LOCKDEP
> +void kevent_socket_reinit(struct socket *sock);
> +void kevent_sk_reinit(struct sock *sk);
> +#else
> +static inline void kevent_socket_reinit(struct socket *sock)
> +{
> +}
> +static inline void kevent_sk_reinit(struct sock *sk)
> +{
> +}
> +#endif
> +void kevent_socket_notify(struct sock *sock, u32 event);
> +int kevent_socket_dequeue(struct kevent *k);
> +int kevent_socket_enqueue(struct kevent *k);
> +#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)

Is this header the correct place to be implementing sock_async()?

> --- /dev/null
> +++ b/kernel/kevent/Kconfig
> @@ -0,0 +1,50 @@
> +config KEVENT
> +	bool "Kernel event notification mechanism"
> +	help
> +	  This option enables event queue mechanism.
> +	  It can be used as replacement for poll()/select(), AIO callback invocations,
> +	  advanced timer notifications and other kernel object status changes.

Please squeeze all the help text into 80 columns.  Or at least check that
it looks OK in menuconfig in an 80-col xterm.

> --- /dev/null
> +++ b/kernel/kevent/kevent.c
> @@ -0,0 +1,238 @@
> +/*
> + * 	kevent.c
> + * 
> + * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + * 
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/mempool.h>
> +#include <linux/sched.h>
> +#include <linux/wait.h>
> +#include <linux/kevent.h>
> +
> +kmem_cache_t *kevent_cache;
> +
> +/*
> + * Attempts to add an event into appropriate origin's queue.
> + * Returns positive value if this event is ready immediately,
> + * negative value in case of error and zero if event has been queued.
> + * ->enqueue() callback must increase origin's reference counter.
> + */
> +int kevent_enqueue(struct kevent *k)
> +{
> +	if (k->event.type >= KEVENT_MAX)
> +		return -E2BIG;

E2BIG is "Argument list too long".  EINVAL is appropriate here.

> +	if (!k->callbacks.enqueue) {
> +		kevent_break(k);
> +		return -EINVAL;
> +	}
> +	
> +	return k->callbacks.enqueue(k);
> +}
> +
> +/*
> + * Remove event from the appropriate queue.
> + * ->dequeue() callback must decrease origin's reference counter.
> + */
> +int kevent_dequeue(struct kevent *k)
> +{
> +	if (k->event.type >= KEVENT_MAX)
> +		return -E2BIG;
> +	
> +	if (!k->callbacks.dequeue) {
> +		kevent_break(k);
> +		return -EINVAL;
> +	}
> +
> +	return k->callbacks.dequeue(k);
> +}
> +
> +int kevent_break(struct kevent *k)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&k->ulock, flags);
> +	k->event.ret_flags |= KEVENT_RET_BROKEN;
> +	spin_unlock_irqrestore(&k->ulock, flags);
> +	return 0;
> +}
> +
> +struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
> +
> +/*
> + * Must be called before event is going to be added into some origin's queue.
> + * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
> + * If failed, kevent should not be used or kevent_enqueue() will fail to add
> + * this kevent into origin's queue with setting
> + * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
> + */
> +int kevent_init(struct kevent *k)
> +{
> +	spin_lock_init(&k->ulock);
> +	k->kevent_entry.next = LIST_POISON1;
> +	k->storage_entry.prev = LIST_POISON2;
> +	k->ready_entry.next = LIST_POISON1;

Nope ;)

> +	if (k->event.type >= KEVENT_MAX)
> +		return -E2BIG;
> +
> +	k->callbacks = kevent_registered_callbacks[k->event.type];
> +	if (!k->callbacks.callback) {
> +		kevent_break(k);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Called from ->enqueue() callback when reference counter for given
> + * origin (socket, inode...) has been increased.
> + */
> +int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
> +{
> +	unsigned long flags;
> +
> +	k->st = st;
> +	spin_lock_irqsave(&st->lock, flags);
> +	list_add_tail_rcu(&k->storage_entry, &st->list);
> +	st->qlen++;
> +	spin_unlock_irqrestore(&st->lock, flags);
> +	return 0;
> +}

Is the _rcu variant needed here?

> +/*
> + * Dequeue kevent from origin's queue. 
> + * It does not decrease origin's reference counter in any way 
> + * and must be called before it, so storage itself must be valid.
> + * It is called from ->dequeue() callback.
> + */
> +void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&st->lock, flags);
> +	if (k->storage_entry.prev != LIST_POISON2) {

Nope, as discussed earlier.

> +		list_del_rcu(&k->storage_entry);
> +		st->qlen--;
> +	}
> +	spin_unlock_irqrestore(&st->lock, flags);
> +}
> +
> +static void __kevent_requeue(struct kevent *k, u32 event)
> +{
> +	int err, rem = 0;
> +	unsigned long flags;
> +
> +	err = k->callbacks.callback(k);
> +
> +	spin_lock_irqsave(&k->ulock, flags);
> +	if (err > 0) {
> +		k->event.ret_flags |= KEVENT_RET_DONE;
> +	} else if (err < 0) {
> +		k->event.ret_flags |= KEVENT_RET_BROKEN;
> +		k->event.ret_flags |= KEVENT_RET_DONE;
> +	}
> +	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
> +	if (!err)
> +		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
> +	spin_unlock_irqrestore(&k->ulock, flags);

Local variable `err' no longer actually indicates an error, does it?

If not, a differently-named local would be appropriate here.

> +	if (err) {
> +		if ((rem || err < 0) && k->storage_entry.prev != LIST_POISON2) {
> +			list_del_rcu(&k->storage_entry);
> +			k->st->qlen--;

->qlen was previously modified under spinlock.  Here it is not.

> +		}
> +		
> +		spin_lock_irqsave(&k->user->ready_lock, flags);
> +		if (k->ready_entry.next == LIST_POISON1) {
> +			kevent_user_ring_add_event(k);
> +			list_add_tail(&k->ready_entry, &k->user->ready_list);
> +			k->user->ready_num++;
> +		}
> +		spin_unlock_irqrestore(&k->user->ready_lock, flags);
> +		wake_up(&k->user->wait);
> +	}
> +}
> +
> +void kevent_requeue(struct kevent *k)
> +{
> +	unsigned long flags;
> +	
> +	spin_lock_irqsave(&k->st->lock, flags);
> +	__kevent_requeue(k, 0);
> +	spin_unlock_irqrestore(&k->st->lock, flags);
> +}
> +
> +/*
> + * Called each time some activity in origin (socket, inode...) is noticed.
> + */
> +void kevent_storage_ready(struct kevent_storage *st, 
> +		kevent_callback_t ready_callback, u32 event)
> +{
> +	struct kevent *k;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(k, &st->list, storage_entry) {
> +		if (ready_callback)
> +			ready_callback(k);

For readability reasons I prefer the old-style

		(*ready_callback)(k);

so the reader knows not to go off hunting for the function "ready_callback".
Minor point.

So the kevent_callback_t handlers are not allowed to sleep.

> +		if (event & k->event.event)
> +			__kevent_requeue(k, event);
> +	}

Under what circumstances will `event' be zero??

> +	rcu_read_unlock();
> +}
> +
> +int kevent_storage_init(void *origin, struct kevent_storage *st)
> +{
> +	spin_lock_init(&st->lock);
> +	st->origin = origin;
> +	st->qlen = 0;
> +	INIT_LIST_HEAD(&st->list);
> +	return 0;
> +}
> +
> +void kevent_storage_fini(struct kevent_storage *st)
> +{
> +	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
> +}
> +
> +static int __init kevent_sys_init(void)
> +{
> +	int i;
> +
> +	kevent_cache = kmem_cache_create("kevent_cache", 
> +			sizeof(struct kevent), 0, 0, NULL, NULL);
> +	if (!kevent_cache)
> +		panic("kevent: Unable to create a cache.\n");

Can use SLAB_PANIC (a silly thing I added to avoid code duplication).
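
With SLAB_PANIC the allocation would look roughly like this (a sketch;
the flag makes kmem_cache_create() panic on failure, so the explicit
check and panic() call go away):

	kevent_cache = kmem_cache_create("kevent_cache",
			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);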

> +	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
> +		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
> +
> +		c->callback = c->enqueue = c->dequeue = NULL;
> +	}

This zeroing is redundant.

> +	return 0;
> +}
> +
> +late_initcall(kevent_sys_init);

Why is it late_initcall?  (A comment is needed)

> diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
> new file mode 100644
> index 0000000..7b6374b
> --- /dev/null
> +++ b/kernel/kevent/kevent_user.c
> @@ -0,0 +1,857 @@
> +/*
> + * 	kevent_user.c
> + * 
> + * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + * 
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/mount.h>
> +#include <linux/device.h>
> +#include <linux/poll.h>
> +#include <linux/kevent.h>
> +#include <linux/jhash.h>
> +#include <linux/miscdevice.h>
> +#include <asm/io.h>
> +
> +static char kevent_name[] = "kevent";
> +
> +static int kevent_user_open(struct inode *, struct file *);
> +static int kevent_user_release(struct inode *, struct file *);
> +static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
> +static int kevnet_user_mmap(struct file *, struct vm_area_struct *);
> +
> +static struct file_operations kevent_user_fops = {
> +	.mmap		= kevnet_user_mmap,
> +	.open		= kevent_user_open,
> +	.release	= kevent_user_release,
> +	.poll		= kevent_user_poll,
> +	.owner		= THIS_MODULE,
> +};
> +
> +static struct miscdevice kevent_miscdev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = kevent_name,
> +	.fops = &kevent_user_fops,
> +};
> +
> +static int kevent_get_sb(struct file_system_type *fs_type, 
> +		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
> +{
> +	/* So original magic... */
> +	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef, mnt);	
> +}

That doesn't look like a well-chosen magic number...

> +static struct file_system_type kevent_fs_type = {
> +	.name		= kevent_name,
> +	.get_sb		= kevent_get_sb,
> +	.kill_sb	= kill_anon_super,
> +};
> +
> +static struct vfsmount *kevent_mnt;
> +
> +static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
> +{
> +	struct kevent_user *u = file->private_data;
> +	unsigned int mask;
> +	
> +	poll_wait(file, &u->wait, wait);
> +	mask = 0;
> +
> +	if (u->ready_num)
> +		mask |= POLLIN | POLLRDNORM;
> +
> +	return mask;
> +}
> +
> +static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
> +{
> +	unsigned int *idx;
> +	
> +	idx = (unsigned int *)u->pring[0];

This is a bit ugly.


> +	idx[0] = num;
> +}
> +
> +/*
> + * Note that kevents does not exactly fill the page (each ukevent is 40 bytes),
> + * so we reuse 4 bytes at the begining of the first page to store index.
> + * Take that into account if you want to change size of struct ukevent.
> + */
> +#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))

How about doing

	struct ukevent_ring {
		unsigned int index;
		struct ukevent event[0];
	};

and removing all those nasty typecasting and offsetting games?

In fact you can even do

	struct ukevent_ring {
		struct ukevent event[(PAGE_SIZE - sizeof(unsigned int)) /
				sizeof(struct ukevent)];
		unsigned int index;
	};

if you're careful ;)
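
With a named ring struct like the first variant above, the pointer
games in kevent_user_ring_set() would reduce to something like this
sketch:

	struct ukevent_ring *ring = (struct ukevent_ring *)u->pring[0];

	ring->index = num;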

> +/*
> + * Called under kevent_user->ready_lock, so updates are always protected.
> + */
> +void kevent_user_ring_add_event(struct kevent *k)
> +{
> +	unsigned int *idx_ptr, idx, pidx, off;
> +	struct ukevent *ukev;
> +	
> +	idx_ptr = (unsigned int *)k->user->pring[0];
> +	idx = idx_ptr[0];
> +
> +	pidx = idx/KEVENTS_ON_PAGE;
> +	off = idx%KEVENTS_ON_PAGE;
> +
> +	if (pidx == 0)
> +		ukev = (struct ukevent *)(k->user->pring[pidx] + sizeof(unsigned int));
> +	else
> +		ukev = (struct ukevent *)(k->user->pring[pidx]);

Such as these.

> +	memcpy(&ukev[off], &k->event, sizeof(struct ukevent));
> +
> +	idx++;
> +	if (idx >= KEVENT_MAX_EVENTS)
> +		idx = 0;
> +
> +	idx_ptr[0] = idx;
> +}
> +
> +static int kevent_user_ring_init(struct kevent_user *u)
> +{
> +	int i, pnum;
> +
> +	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;

And these.

> +	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
> +	if (!u->pring)
> +		return -ENOMEM;
> +
> +	for (i=0; i<pnum; ++i) {
> +		u->pring[i] = __get_free_page(GFP_KERNEL);
> +		if (!u->pring)

bug: this is testing the wrong thing.

> +			break;
> +	}
> +
> +	if (i != pnum) {
> +		pnum = i;
> +		goto err_out_free;
> +	}

Move this logic into the `if (!u->pring)' logic, above.

> +	kevent_user_ring_set(u, 0);
> +
> +	return 0;
> +
> +err_out_free:
> +	for (i=0; i<pnum; ++i)
> +		free_page(u->pring[i]);
> +
> +	kfree(u->pring);
> +
> +	return -ENOMEM;
> +}
> +
> +static void kevent_user_ring_fini(struct kevent_user *u)
> +{
> +	int i, pnum;
> +
> +	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
> +	
> +	for (i=0; i<pnum; ++i)
> +		free_page(u->pring[i]);
> +
> +	kfree(u->pring);
> +}
> +
> +static struct kevent_user *kevent_user_alloc(void)
> +{
> +	struct kevent_user *u;
> +	int i;
> +
> +	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
> +	if (!u)
> +		return NULL;
> +
> +	INIT_LIST_HEAD(&u->ready_list);
> +	spin_lock_init(&u->ready_lock);
> +	u->ready_num = 0;
> +	kevent_user_stat_init(u);
> +	spin_lock_init(&u->kevent_lock);
> +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
> +		INIT_LIST_HEAD(&u->kevent_list[i]);
> +	u->kevent_num = 0;
> +	
> +	mutex_init(&u->ctl_mutex);
> +	init_waitqueue_head(&u->wait);
> +	u->max_ready_num = 0;

The above zeroes out a bunch of known-to-already-be-zero things.

> +static int kevnet_user_mmap(struct file *file, struct vm_area_struct *vma)

The function name is mistyped.

This code doesn't have many comments, does it?  What are we mapping here,
and why would an application want to map it?

And what are the portability implications?  Does userspace need to know the
64-bitness of its kernel to be able to work out the alignment of things? 
If so, what happens if a later/different compiler aligns things
differently?

> +{
> +	size_t size = vma->vm_end - vma->vm_start, psize;
> +	int pnum = size/PAGE_SIZE, i;
> +	unsigned long start = vma->vm_start;
> +	struct kevent_user *u = file->private_data;
> +
> +	psize = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE);
> +
> +	if (size + vma->vm_pgoff*PAGE_SIZE != psize)
> +		return -EINVAL;
> +
> +	if (vma->vm_flags & VM_WRITE)
> +		return -EPERM;
> +
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +	for (i=0; i<pnum; ++i) {
> +		if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[i+vma->vm_pgoff]), PAGE_SIZE,
> +					vma->vm_page_prot))
> +			return -EAGAIN;
> +		start += PAGE_SIZE;
> +	}
> +
> +	return 0;
> +}

Is EAGAIN an appropriate return value?

If this function had a decent comment we could ask Hugh to review it.

> +#if 0
> +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> +{
> +	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
> +	
> +	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
> +	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
> +
> +	return h;
> +}
> +#else
> +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> +{
> +	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
> +}
> +#endif
> +
> +static void kevent_free_rcu(struct rcu_head *rcu)
> +{
> +	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
> +	kmem_cache_free(kevent_cache, kevent);
> +}
> +
> +static void kevent_finish_user_complete(struct kevent *k, int deq)
> +{
> +	struct kevent_user *u = k->user;
> +	unsigned long flags;
> +
> +	if (deq)
> +		kevent_dequeue(k);
> +
> +	spin_lock_irqsave(&u->ready_lock, flags);
> +	if (k->ready_entry.next != LIST_POISON1) {
> +		list_del(&k->ready_entry);

list_del_rcu()?

> +		u->ready_num--;
> +	}
> +	spin_unlock_irqrestore(&u->ready_lock, flags);
> +
> +	kevent_user_put(u);
> +	call_rcu(&k->rcu_head, kevent_free_rcu);
> +}
> +
> +static void __kevent_finish_user(struct kevent *k, int deq)
> +{
> +	struct kevent_user *u = k->user;
> +
> +	list_del(&k->kevent_entry);
> +	u->kevent_num--;
> +	kevent_finish_user_complete(k, deq);
> +}

No locking needed?

It's hard to review uncommented code.  And the review is less useful if the
reviewer cannot determine what the developer was attempting to do.

> +/*
> + * Remove kevent from user's list of all events, 
> + * dequeue it from storage and decrease user's reference counter,
> + * since this kevent does not exist anymore. That is why it is freed here.
> + */

That's nice.

> +static void kevent_finish_user(struct kevent *k, int deq)
> +{
> +	struct kevent_user *u = k->user;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&u->kevent_lock, flags);
> +	list_del(&k->kevent_entry);

list_del_rcu()?

> +	u->kevent_num--;
> +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> +	kevent_finish_user_complete(k, deq);
> +}
> +
> +/*
> + * Dequeue one entry from user's ready queue.
> + */
> +
> +static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
> +{
> +	unsigned long flags;
> +	struct kevent *k = NULL;
> +
> +	spin_lock_irqsave(&u->ready_lock, flags);
> +	if (u->ready_num && !list_empty(&u->ready_list)) {
> +		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
> +		list_del(&k->ready_entry);
> +		u->ready_num--;
> +	}
> +	spin_unlock_irqrestore(&u->ready_lock, flags);
> +
> +	return k;
> +}
> +
> +static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
> +		struct kevent_user *u)
> +{
> +	struct kevent *k;
> +	int found = 0;
> +	
> +	list_for_each_entry(k, head, kevent_entry) {
> +		spin_lock(&k->ulock);
> +		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
> +				k->event.id.raw[0] == uk->id.raw[0] && 
> +				k->event.id.raw[1] == uk->id.raw[1]) {
> +			found = 1;
> +			spin_unlock(&k->ulock);
> +			break;
> +		}
> +		spin_unlock(&k->ulock);
> +	}
> +
> +	return (found)?k:NULL;
> +}

Remove `found', do

	struct kevent *ret = NULL;

	...
		ret = k;
		break;
	...
	return ret;
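
Fleshed out, the loop might read as follows (a sketch with that change
applied):

	static struct kevent *__kevent_search(struct list_head *head,
			struct ukevent *uk, struct kevent_user *u)
	{
		struct kevent *k, *ret = NULL;

		list_for_each_entry(k, head, kevent_entry) {
			spin_lock(&k->ulock);
			if (k->event.user[0] == uk->user[0] &&
					k->event.user[1] == uk->user[1] &&
					k->event.id.raw[0] == uk->id.raw[0] &&
					k->event.id.raw[1] == uk->id.raw[1]) {
				spin_unlock(&k->ulock);
				ret = k;
				break;
			}
			spin_unlock(&k->ulock);
		}

		return ret;
	}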


> +static int kevent_modify(struct ukevent *uk, struct kevent_user *u)

<wonders what this function does>

> +{
> +	struct kevent *k;
> +	unsigned int hash = kevent_user_hash(uk);
> +	int err = -ENODEV;
> +	unsigned long flags;
> +	
> +	spin_lock_irqsave(&u->kevent_lock, flags);
> +	k = __kevent_search(&u->kevent_list[hash], uk, u);
> +	if (k) {
> +		spin_lock(&k->ulock);
> +		k->event.event = uk->event;
> +		k->event.req_flags = uk->req_flags;
> +		k->event.ret_flags = 0;
> +		spin_unlock(&k->ulock);
> +		kevent_requeue(k);
> +		err = 0;
> +	}
> +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> +	
> +	return err;
> +}

ENODEV: "No such device".  Doesn't sound appropriate.

> +static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
> +{
> +	int err = -ENODEV;
> +	struct kevent *k;
> +	unsigned int hash = kevent_user_hash(uk);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&u->kevent_lock, flags);
> +	k = __kevent_search(&u->kevent_list[hash], uk, u);
> +	if (k) {
> +		__kevent_finish_user(k, 1);
> +		err = 0;
> +	}
> +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> +
> +	return err;
> +}
> +
> +/*
> + * No new entry can be added or removed from any list at this point.
> + * It is not permitted to call ->ioctl() and ->release() in parallel.
> + */
> +static int kevent_user_release(struct inode *inode, struct file *file)
> +{
> +	struct kevent_user *u = file->private_data;
> +	struct kevent *k, *n;
> +	int i;
> +
> +	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {

ARRAY_SIZE

> +		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
> +			kevent_finish_user(k, 1);
> +	}
> +
> +	kevent_user_put(u);
> +	file->private_data = NULL;
> +
> +	return 0;
> +}
> +
> +static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
> +{
> +	struct ukevent *ukev;
> +
> +	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
> +	if (!ukev)
> +		return NULL;
> +
> +	if (copy_from_user(arg, ukev, sizeof(struct ukevent) * num)) {
> +		kfree(ukev);
> +		return NULL;
> +	}
> +
> +	return ukev;
> +}

The copy_from_user() args are reversed.

This is serious breakage and raises concerns about the amount of testing
which has been performed.

AFAICT there is no bounds checking on `num', so the user can force a
deliberate multiplication overflow and cause havoc here.
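
A corrected sketch addressing both points (the bound on num is an
addition here, reusing KEVENT_MAX_EVENTS as an arbitrary sanity limit):

	static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
	{
		struct ukevent *ukev;

		/* Bound num before multiplying, so the size cannot overflow. */
		if (!num || num > KEVENT_MAX_EVENTS)
			return NULL;

		ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
		if (!ukev)
			return NULL;

		/* Destination first, source second. */
		if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
			kfree(ukev);
			return NULL;
		}

		return ukev;
	}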

> +static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
> +{
> +	int err = 0, i;
> +	struct ukevent uk;
> +
> +	mutex_lock(&u->ctl_mutex);
> +	
> +	if (num > KEVENT_MIN_BUFFS_ALLOC) {
> +		struct ukevent *ukev;
> +
> +		ukev = kevent_get_user(num, arg);
> +		if (ukev) {
> +			for (i=0; i<num; ++i) {
> +				if (kevent_modify(&ukev[i], u))
> +					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
> +				ukev[i].ret_flags |= KEVENT_RET_DONE;
> +			}
> +			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
> +				err = -EINVAL;

EFAULT

> +			kfree(ukev);
> +			goto out;
> +		}
> +	}
> +
> +	for (i=0; i<num; ++i) {
> +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> +			err = -EINVAL;

EFAULT

> +			break;
> +		}
> +
> +		if (kevent_modify(&uk, u))
> +			uk.ret_flags |= KEVENT_RET_BROKEN;
> +		uk.ret_flags |= KEVENT_RET_DONE;
> +
> +		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
> +			err = -EINVAL;

EFAULT.

> +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> +			err = -EINVAL;

EFAULT (all over the place).

> +static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
> +{
> +	unsigned long flags;
> +	unsigned int hash = kevent_user_hash(&k->event);
> +
> +	spin_lock_irqsave(&u->kevent_lock, flags);
> +	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
> +	u->kevent_num++;
> +	kevent_user_get(u);
> +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> +}

kevent_user_get() can be moved outside the lock?
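
If the reference count taken by kevent_user_get() is atomic, it
probably can - something like this sketch:

	kevent_user_get(u);	/* atomic refcount, no lock required */
	spin_lock_irqsave(&u->kevent_lock, flags);
	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
	u->kevent_num++;
	spin_unlock_irqrestore(&u->kevent_lock, flags);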

> +/*
> + * Copy all ukevents from userspace, allocate kevent for each one 
> + * and add them into appropriate kevent_storages, 
> + * e.g. sockets, inodes and so on...
> + * If something goes wrong, all events will be dequeued and 
> + * negative error will be returned. 
> + * On success number of finished events is returned and 
> + * Array of finished events (struct ukevent) will be placed behind 
> + * kevent_user_control structure. User must run through that array and check 
> + * ret_flags field of each ukevent structure to determine if it is fired or failed event.
> + */
> +static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
> +{
> +	int err, cerr = 0, knum = 0, rnum = 0, i;
> +	void __user *orig = arg;
> +	struct ukevent uk;
> +
> +	mutex_lock(&u->ctl_mutex);
> +
> +	err = -ENFILE;
> +	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)

Can a malicious user force an arithmetic overflow here?
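
An overflow-safe form of the check might look like this sketch:

	if (num >= KEVENT_MAX_EVENTS ||
			u->kevent_num >= KEVENT_MAX_EVENTS - num)
		goto out_remove;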

> +		goto out_remove;
> +
> +	if (num > KEVENT_MIN_BUFFS_ALLOC) {
> +		struct ukevent *ukev;
> +
> +		ukev = kevent_get_user(num, arg);
> +		if (ukev) {
> +			for (i=0; i<num; ++i) {
> +				err = kevent_user_add_ukevent(&ukev[i], u);
> +				if (err) {
> +					kevent_user_stat_increase_im(u);
> +					if (i != rnum)
> +						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
> +					rnum++;

What's happening here?  The games with `rnum' and comparing it with `i'??

Perhaps these are not the best-chosen identifiers.

> +/*
> + * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
> + * In blocking mode it waits until timeout or if at least @min_nr events are ready.
> + */
> +static int kevent_user_wait(struct file *file, struct kevent_user *u, 
> +		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
> +		void __user *buf)
> +{
> +	struct kevent *k;
> +	int cerr = 0, num = 0;
> +
> +	if (!(file->f_flags & O_NONBLOCK)) {
> +		wait_event_interruptible_timeout(u->wait, 
> +			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
> +	}
> +	
> +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> +		if (copy_to_user(buf + num*sizeof(struct ukevent), 
> +					&k->event, sizeof(struct ukevent))) {
> +			cerr = -EINVAL;
> +			break;
> +		}
> +
> +		/*
> +		 * If it is one-shot kevent, it has been removed already from
> +		 * origin's queue, so we can easily free it here.
> +		 */
> +		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
> +			kevent_finish_user(k, 1);
> +		++num;
> +		kevent_user_stat_increase_wait(u);
> +	}
> +
> +	return (cerr)?cerr:num;
> +}

So if this returns an error, the user doesn't know how many events were
actually completed?  That doesn't seem good.
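
One conventional fix (a sketch): report the events already delivered
and only surface the error when nothing was copied out at all, e.g.

	return num ? num : cerr;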

> +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)

At some point Michael will want to be writing the manpages for things like
this.  He'll start out by reading the comment block, poor guy.

> +{
> +	int err = -EINVAL;
> +	struct file *file;
> +
> +	if (cmd == KEVENT_CTL_INIT)
> +		return kevent_ctl_init();
> +
> +	file = fget(fd);
> +	if (!file)
> +		return -ENODEV;
> +
> +	if (file->f_op != &kevent_user_fops)
> +		goto out_fput;
> +
> +	err = kevent_ctl_process(file, cmd, num, arg);
> +
> +out_fput:
> +	fput(file);
> +	return err;
> +}


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-09 17:47                             ` [take6 1/3] kevent: Core files Stephen Hemminger
  2006-08-09 19:17                               ` Evgeniy Polyakov
@ 2006-08-10  0:04                               ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: David Miller @ 2006-08-10  0:04 UTC (permalink / raw)
  To: shemminger; +Cc: johnpol, linux-kernel, drepper, netdev, zach.brown

From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 9 Aug 2006 10:47:38 -0700

> static wrapper_functions_with_excessively_long_names(struct i_really_hate *this)
> {
> 	suck();
> }

Yes, typing 50 characters just to bump a counter - it's beyond
ridiculous.

Go hack on the X server if you like long-winded function names
that do trivial operations :-)

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-09 22:21                             ` Andrew Morton
@ 2006-08-10  6:14                               ` Evgeniy Polyakov
  2006-08-10  6:42                                 ` David Miller
  2006-08-10  7:18                                 ` Andrew Morton
  2006-08-10 12:12                               ` [take7 0/1] kevent: generic event handling mechanism Evgeniy Polyakov
  1 sibling, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10  6:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Wed, Aug 09, 2006 at 03:21:27PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> On Wed, 9 Aug 2006 12:02:40 +0400
> Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> 
> > 
> > Core files.
> > 
> > This patch includes core kevent files:
> >  - userspace controlling
> >  - kernelspace interfaces
> >  - initialization
> >  - notification state machines
> > 
> > It might also include parts from other subsystems (like network related
> > syscalls), so it is possible that it will not compile without other
> > patches applied.
> 
> Summary:
> 
> - has serious bugs which indicate that much better testing is needed.
> 
> - All -EFOO return values need to be reviewed for appropriateness
> 
> - needs much better commenting before I can do more than a local-level review.
> 
> 
> > --- /dev/null
> > +++ b/include/linux/kevent.h
> > ...
> >
> > +/*
> > + * Poll events.
> > + */
> > +#define	KEVENT_POLL_POLLIN	0x0001
> > +#define	KEVENT_POLL_POLLPRI	0x0002
> > +#define	KEVENT_POLL_POLLOUT	0x0004
> > +#define	KEVENT_POLL_POLLERR	0x0008
> > +#define	KEVENT_POLL_POLLHUP	0x0010
> > +#define	KEVENT_POLL_POLLNVAL	0x0020
> > +
> > +#define	KEVENT_POLL_POLLRDNORM	0x0040
> > +#define	KEVENT_POLL_POLLRDBAND	0x0080
> > +#define	KEVENT_POLL_POLLWRNORM	0x0100
> > +#define	KEVENT_POLL_POLLWRBAND	0x0200
> > +#define	KEVENT_POLL_POLLMSG	0x0400
> > +#define	KEVENT_POLL_POLLREMOVE	0x1000
> 
> 0x0800 got lost.

I will use the usual poll definitions.

> > +struct ukevent
> > +{
> > +	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
> > +	__u32			type;			/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
> > +	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
> > +	__u32			req_flags;		/* Per-event request flags */
> > +	__u32			ret_flags;		/* Per-event return flags */
> > +	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
> > +	union {
> > +		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
> > +		void		*ptr;
> > +	};
> > +};
> 
> What is this union for?
> 
> `ptr' needs a __user tag, does it not?

No, it is never touched by the kernel.

> `ptr' will be 64-bit in-kernel and 64-bit for 64-bit userspace, but 32-bit
> for 32-bit userspace.  I guess that's why user[] is there.

Exactly.

> On big-endian machines, this pointer will appear to be word-swapped as far
> as a 64-bit kernel is concerned.  Or something.
> 
> IOW: What's going on here??

It is user data - I put a union there just to simplify userspace, so it
should not require any typecasting.
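
E.g., roughly, on the userspace side (a sketch; my_state and
get_state() are made-up application names):

	struct my_state *s = get_state();
	struct ukevent uk;

	memset(&uk, 0, sizeof(uk));
	uk.ptr = s;		/* no cast, no manual packing */

	/* ... and when the kernel hands the event back: */
	s = uk.ptr;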

> > +#ifdef CONFIG_KEVENT_INODE
> > +void kevent_inode_notify(struct inode *inode, u32 event);
> > +void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
> > +void kevent_inode_remove(struct inode *inode);
> > +#else
> > +static inline void kevent_inode_notify(struct inode *inode, u32 event)
> > +{
> > +}
> > +static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
> > +{
> > +}
> > +static inline void kevent_inode_remove(struct inode *inode)
> > +{
> > +}
> > +#endif /* CONFIG_KEVENT_INODE */
> > +#ifdef CONFIG_KEVENT_SOCKET
> > +#ifdef CONFIG_LOCKDEP
> > +void kevent_socket_reinit(struct socket *sock);
> > +void kevent_sk_reinit(struct sock *sk);
> > +#else
> > +static inline void kevent_socket_reinit(struct socket *sock)
> > +{
> > +}
> > +static inline void kevent_sk_reinit(struct sock *sk)
> > +{
> > +}
> > +#endif
> > +void kevent_socket_notify(struct sock *sock, u32 event);
> > +int kevent_socket_dequeue(struct kevent *k);
> > +int kevent_socket_enqueue(struct kevent *k);
> > +#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
> 
> Is this header the correct place to be implementing sock_async()?

I decided to keep kevent as separate as possible, so I put a lot there.
When people decide that it is ok, then it can be moved into the
appropriate network header file - this way it is much easier to review.

> > --- /dev/null
> > +++ b/kernel/kevent/Kconfig
> > @@ -0,0 +1,50 @@
> > +config KEVENT
> > +	bool "Kernel event notification mechanism"
> > +	help
> > +	  This option enables event queue mechanism.
> > +	  It can be used as replacement for poll()/select(), AIO callback invocations,
> > +	  advanced timer notifications and other kernel object status changes.
> 
> Please squeeze all the help text into 80 columns.  Or at least check that
> it looks OK in menuconfig in an 80-col xterm.

Ok.

> > --- /dev/null
> > +++ b/kernel/kevent/kevent.c
> > @@ -0,0 +1,238 @@
> > +/*
> > + * 	kevent.c
> > + * 
> > + * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> > + * All rights reserved.
> > + * 
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> > + */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/types.h>
> > +#include <linux/list.h>
> > +#include <linux/slab.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/mempool.h>
> > +#include <linux/sched.h>
> > +#include <linux/wait.h>
> > +#include <linux/kevent.h>
> > +
> > +kmem_cache_t *kevent_cache;
> > +
> > +/*
> > + * Attempts to add an event into appropriate origin's queue.
> > + * Returns positive value if this event is ready immediately,
> > + * negative value in case of error and zero if event has been queued.
> > + * ->enqueue() callback must increase origin's reference counter.
> > + */
> > +int kevent_enqueue(struct kevent *k)
> > +{
> > +	if (k->event.type >= KEVENT_MAX)
> > +		return -E2BIG;
> 
> E2BIG is "Argument list too long".  EINVAL is appropriate here.

No problem.

> > +	if (!k->callbacks.enqueue) {
> > +		kevent_break(k);
> > +		return -EINVAL;
> > +	}
> > +	
> > +	return k->callbacks.enqueue(k);
> > +}
> > +
> > +/*
> > + * Remove event from the appropriate queue.
> > + * ->dequeue() callback must decrease origin's reference counter.
> > + */
> > +int kevent_dequeue(struct kevent *k)
> > +{
> > +	if (k->event.type >= KEVENT_MAX)
> > +		return -E2BIG;
> > +	
> > +	if (!k->callbacks.dequeue) {
> > +		kevent_break(k);
> > +		return -EINVAL;
> > +	}
> > +
> > +	return k->callbacks.dequeue(k);
> > +}
> > +
> > +int kevent_break(struct kevent *k)
> > +{
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&k->ulock, flags);
> > +	k->event.ret_flags |= KEVENT_RET_BROKEN;
> > +	spin_unlock_irqrestore(&k->ulock, flags);
> > +	return 0;
> > +}
> > +
> > +struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
> > +
> > +/*
> > + * Must be called before event is going to be added into some origin's queue.
> > + * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
> > + * If failed, kevent should not be used or kevent_enqueue() will fail to add
> > + * this kevent into origin's queue with setting
> > + * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
> > + */
> > +int kevent_init(struct kevent *k)
> > +{
> > +	spin_lock_init(&k->ulock);
> > +	k->kevent_entry.next = LIST_POISON1;
> > +	k->storage_entry.prev = LIST_POISON2;
> > +	k->ready_entry.next = LIST_POISON1;
> 
> Nope ;)

I use pointer checks to determine whether an entry is in the list or
not - why is that frowned upon here?
Please do not tell me that the poisoning checks cost a lot of cpu
cycles to fetch new cachelines and so on - everything in that entry is
already in the cache, since the entry was added/deleted/accessed
through the list walk macros.

> > +	if (k->event.type >= KEVENT_MAX)
> > +		return -E2BIG;
> > +
> > +	k->callbacks = kevent_registered_callbacks[k->event.type];
> > +	if (!k->callbacks.callback) {
> > +		kevent_break(k);
> > +		return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Called from ->enqueue() callback when reference counter for given
> > + * origin (socket, inode...) has been increased.
> > + */
> > +int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
> > +{
> > +	unsigned long flags;
> > +
> > +	k->st = st;
> > +	spin_lock_irqsave(&st->lock, flags);
> > +	list_add_tail_rcu(&k->storage_entry, &st->list);
> > +	st->qlen++;
> > +	spin_unlock_irqrestore(&st->lock, flags);
> > +	return 0;
> > +}
> 
> Is the _rcu variant needed here?

Yes, the storage list is protected by RCU.
st->lock is used to remove races between several "writers".
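
Spelled out, this is the classic pattern (a sketch; process() is a
placeholder and must not sleep):

	/* writer side */
	spin_lock_irqsave(&st->lock, flags);
	list_add_tail_rcu(&k->storage_entry, &st->list);
	spin_unlock_irqrestore(&st->lock, flags);

	/* reader side */
	rcu_read_lock();
	list_for_each_entry_rcu(k, &st->list, storage_entry)
		process(k);
	rcu_read_unlock();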

> > +/*
> > + * Dequeue kevent from origin's queue. 
> > + * It does not decrease origin's reference counter in any way 
> > + * and must be called before it, so storage itself must be valid.
> > + * It is called from ->dequeue() callback.
> > + */
> > +void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
> > +{
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&st->lock, flags);
> > +	if (k->storage_entry.prev != LIST_POISON2) {
> 
> Nope, as discussed earlier.

Sorry, but I do not agree that the list poisoning checks cost anything,
and I explained why. It may be wrong from some architectural point of
view, but I could create macros and place them into list.h which do
just the same.

The problem is how to determine whether an entry is in the list or not,
independently of the kevent code.
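
The stock list.h idiom for that, without peeking at poison values, is
list_del_init() plus list_empty() on the entry itself - e.g. the
dequeue above would become this sketch (with the caveat that the RCU
readers here complicate immediate re-initialization):

	spin_lock_irqsave(&st->lock, flags);
	if (!list_empty(&k->storage_entry)) {
		list_del_init(&k->storage_entry);
		st->qlen--;
	}
	spin_unlock_irqrestore(&st->lock, flags);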

> > +		list_del_rcu(&k->storage_entry);
> > +		st->qlen--;
> > +	}
> > +	spin_unlock_irqrestore(&st->lock, flags);
> > +}
> > +
> > +static void __kevent_requeue(struct kevent *k, u32 event)
> > +{
> > +	int err, rem = 0;
> > +	unsigned long flags;
> > +
> > +	err = k->callbacks.callback(k);
> > +
> > +	spin_lock_irqsave(&k->ulock, flags);
> > +	if (err > 0) {
> > +		k->event.ret_flags |= KEVENT_RET_DONE;
> > +	} else if (err < 0) {
> > +		k->event.ret_flags |= KEVENT_RET_BROKEN;
> > +		k->event.ret_flags |= KEVENT_RET_DONE;
> > +	}
> > +	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
> > +	if (!err)
> > +		err = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
> > +	spin_unlock_irqrestore(&k->ulock, flags);
> 
> Local variable `err' no longer actually indicates an error, does it?
> 
> If not, a differently-named local would be appropriate here.

Ok, I will rename it.

> > +	if (err) {
> > +		if ((rem || err < 0) && k->storage_entry.prev != LIST_POISON2) {
> > +			list_del_rcu(&k->storage_entry);
> > +			k->st->qlen--;
> 
> ->qlen was previously modified under spinlock.  Here it is not.

It is a tricky part - this code path cannot be reentered (by design of
all the storages which are used with kevents), and different CPUs
cannot modify the same list concurrently due to the RCU rules - i.e. it
is perfectly ok if a different CPU sees an old value of the queue length.

> > +		}
> > +		
> > +		spin_lock_irqsave(&k->user->ready_lock, flags);
> > +		if (k->ready_entry.next == LIST_POISON1) {
> > +			kevent_user_ring_add_event(k);
> > +			list_add_tail(&k->ready_entry, &k->user->ready_list);
> > +			k->user->ready_num++;
> > +		}
> > +		spin_unlock_irqrestore(&k->user->ready_lock, flags);
> > +		wake_up(&k->user->wait);
> > +	}
> > +}
> > +
> > +void kevent_requeue(struct kevent *k)
> > +{
> > +	unsigned long flags;
> > +	
> > +	spin_lock_irqsave(&k->st->lock, flags);
> > +	__kevent_requeue(k, 0);
> > +	spin_unlock_irqrestore(&k->st->lock, flags);
> > +}
> > +
> > +/*
> > + * Called each time some activity in origin (socket, inode...) is noticed.
> > + */
> > +void kevent_storage_ready(struct kevent_storage *st, 
> > +		kevent_callback_t ready_callback, u32 event)
> > +{
> > +	struct kevent *k;
> > +
> > +	rcu_read_lock();
> > +	list_for_each_entry_rcu(k, &st->list, storage_entry) {
> > +		if (ready_callback)
> > +			ready_callback(k);
> 
> For readability reasons I prefer the old-style
> 
> 		(*ready_callback)(k);
> 
> so the reader knows not to go off hunting for the function "ready_callback".
> Minor point.
> 
> So the kevent_callback_t handlers are not allowed to sleep.

No, since they are called from the internals of the origins' state
machines - either softirq context (network) or hard irq context (block
layer).

> > +		if (event & k->event.event)
> > +			__kevent_requeue(k, event);
> > +	}
> 
> Under what circumstances will `event' be zero??

It is a bitwise AND - requeuing happens only when the requested event
matches at least one produced event.

> > +	rcu_read_unlock();
> > +}
> > +
> > +int kevent_storage_init(void *origin, struct kevent_storage *st)
> > +{
> > +	spin_lock_init(&st->lock);
> > +	st->origin = origin;
> > +	st->qlen = 0;
> > +	INIT_LIST_HEAD(&st->list);
> > +	return 0;
> > +}
> > +
> > +void kevent_storage_fini(struct kevent_storage *st)
> > +{
> > +	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
> > +}
> > +
> > +static int __init kevent_sys_init(void)
> > +{
> > +	int i;
> > +
> > +	kevent_cache = kmem_cache_create("kevent_cache", 
> > +			sizeof(struct kevent), 0, 0, NULL, NULL);
> > +	if (!kevent_cache)
> > +		panic("kevent: Unable to create a cache.\n");
> 
> Can use SLAB_PANIC (a silly thing I added to avoid code duplication).

Ok.

> > +	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
> > +		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
> > +
> > +		c->callback = c->enqueue = c->dequeue = NULL;
> > +	}
> 
> This zeroing is redundant.

It is not static - are you sure it will be zeroed?

> > +	return 0;
> > +}
> > +
> > +late_initcall(kevent_sys_init);
> 
> Why is it late_initcall?  (A comment is needed)

Why not?
It can be any initcall or __init.

> > diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
> > new file mode 100644
> > index 0000000..7b6374b
> > --- /dev/null
> > +++ b/kernel/kevent/kevent_user.c
> > @@ -0,0 +1,857 @@
> > +/*
> > + * 	kevent_user.c
> > + * 
> > + * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> > + * All rights reserved.
> > + * 
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> > + */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/types.h>
> > +#include <linux/list.h>
> > +#include <linux/slab.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/fs.h>
> > +#include <linux/file.h>
> > +#include <linux/mount.h>
> > +#include <linux/device.h>
> > +#include <linux/poll.h>
> > +#include <linux/kevent.h>
> > +#include <linux/jhash.h>
> > +#include <linux/miscdevice.h>
> > +#include <asm/io.h>
> > +
> > +static char kevent_name[] = "kevent";
> > +
> > +static int kevent_user_open(struct inode *, struct file *);
> > +static int kevent_user_release(struct inode *, struct file *);
> > +static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
> > +static int kevnet_user_mmap(struct file *, struct vm_area_struct *);
> > +
> > +static struct file_operations kevent_user_fops = {
> > +	.mmap		= kevnet_user_mmap,
> > +	.open		= kevent_user_open,
> > +	.release	= kevent_user_release,
> > +	.poll		= kevent_user_poll,
> > +	.owner		= THIS_MODULE,
> > +};
> > +
> > +static struct miscdevice kevent_miscdev = {
> > +	.minor = MISC_DYNAMIC_MINOR,
> > +	.name = kevent_name,
> > +	.fops = &kevent_user_fops,
> > +};
> > +
> > +static int kevent_get_sb(struct file_system_type *fs_type, 
> > +		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
> > +{
> > +	/* So original magic... */
> > +	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xabcdef, mnt);	
> > +}
> 
> That doesn't look like a well-chosen magic number...
> 
> > +static struct file_system_type kevent_fs_type = {
> > +	.name		= kevent_name,
> > +	.get_sb		= kevent_get_sb,
> > +	.kill_sb	= kill_anon_super,
> > +};
> > +
> > +static struct vfsmount *kevent_mnt;
> > +
> > +static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
> > +{
> > +	struct kevent_user *u = file->private_data;
> > +	unsigned int mask;
> > +	
> > +	poll_wait(file, &u->wait, wait);
> > +	mask = 0;
> > +
> > +	if (u->ready_num)
> > +		mask |= POLLIN | POLLRDNORM;
> > +
> > +	return mask;
> > +}
> > +
> > +static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
> > +{
> > +	unsigned int *idx;
> > +	
> > +	idx = (unsigned int *)u->pring[0];
> 
> This is a bit ugly.

I specifically use the first 4 bytes of the first page to store the index
there, since it must be accessed from both userspace and kernelspace.
 
> > +	idx[0] = num;
> > +}
> > +
> > +/*
> > + * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
> > + * so we reuse 4 bytes at the beginning of the first page to store the index.
> > + * Take that into account if you want to change size of struct ukevent.
> > + */
> > +#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
> 
> How about doing
> 
> 	struct ukevent_ring {
> 		unsigned int index;
> 		struct ukevent[0];
> 	}
> 
> and removing all those nasty typecasting and offsetting games?
> 
> In fact you can even do
> 
> 	struct ukevent_ring {
> 		struct ukevent[(PAGE_SIZE - sizeof(unsigned int)) /
> 				sizeof(struct ukevent)];
> 		unsigned int index;
> 	};
> 
> if you're careful ;)

The ring takes more than one page, so it would have to be
struct ukevent_ring_0 and struct ukevent_ring_other.
Is that really needed?
Not a big problem - if you think it is worth it, I will do it.
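If you do think it is worth it, the layout would be something like
(untested):

	struct ukevent_ring_0 {
		unsigned int	index;		/* ready event index */
		struct ukevent	event[(PAGE_SIZE - sizeof(unsigned int)) /
						sizeof(struct ukevent)];
	};

	struct ukevent_ring_other {
		struct ukevent	event[PAGE_SIZE / sizeof(struct ukevent)];
	};

so the compiler does the offset arithmetic.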

> > +/*
> > + * Called under kevent_user->ready_lock, so updates are always protected.
> > + */
> > +void kevent_user_ring_add_event(struct kevent *k)
> > +{
> > +	unsigned int *idx_ptr, idx, pidx, off;
> > +	struct ukevent *ukev;
> > +	
> > +	idx_ptr = (unsigned int *)k->user->pring[0];
> > +	idx = idx_ptr[0];
> > +
> > +	pidx = idx/KEVENTS_ON_PAGE;
> > +	off = idx%KEVENTS_ON_PAGE;
> > +
> > +	if (pidx == 0)
> > +		ukev = (struct ukevent *)(k->user->pring[pidx] + sizeof(unsigned int));
> > +	else
> > +		ukev = (struct ukevent *)(k->user->pring[pidx]);
> 
> Such as these.
> 
> > +	memcpy(&ukev[off], &k->event, sizeof(struct ukevent));
> > +
> > +	idx++;
> > +	if (idx >= KEVENT_MAX_EVENTS)
> > +		idx = 0;
> > +
> > +	idx_ptr[0] = idx;
> > +}
> > +
> > +static int kevent_user_ring_init(struct kevent_user *u)
> > +{
> > +	int i, pnum;
> > +
> > +	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
> 
> And these.
> 
> > +	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
> > +	if (!u->pring)
> > +		return -ENOMEM;
> > +
> > +	for (i=0; i<pnum; ++i) {
> > +		u->pring[i] = __get_free_page(GFP_KERNEL);
> > +		if (!u->pring)
> 
> bug: this is testing the wrong thing.

How come?
__get_free_page() can return 0 if page was not allocated.

> > +			break;
> > +	}
> > +
> > +	if (i != pnum) {
> > +		pnum = i;
> > +		goto err_out_free;
> > +	}
> 
> Move this logic into the `if (!u->pring)' logic, above.

Ok.
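I.e. (untested):

	for (i=0; i<pnum; ++i) {
		u->pring[i] = __get_free_page(GFP_KERNEL);
		if (!u->pring[i]) {
			pnum = i;
			goto err_out_free;
		}
	}

which also fixes the wrong test you noted above.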

> > +	kevent_user_ring_set(u, 0);
> > +
> > +	return 0;
> > +
> > +err_out_free:
> > +	for (i=0; i<pnum; ++i)
> > +		free_page(u->pring[i]);
> > +
> > +	kfree(u->pring);
> > +
> > +	return -ENOMEM;
> > +}
> > +
> > +static void kevent_user_ring_fini(struct kevent_user *u)
> > +{
> > +	int i, pnum;
> > +
> > +	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
> > +	
> > +	for (i=0; i<pnum; ++i)
> > +		free_page(u->pring[i]);
> > +
> > +	kfree(u->pring);
> > +}
> > +
> > +static struct kevent_user *kevent_user_alloc(void)
> > +{
> > +	struct kevent_user *u;
> > +	int i;
> > +
> > +	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
> > +	if (!u)
> > +		return NULL;
> > +
> > +	INIT_LIST_HEAD(&u->ready_list);
> > +	spin_lock_init(&u->ready_lock);
> > +	u->ready_num = 0;
> > +	kevent_user_stat_init(u);
> > +	spin_lock_init(&u->kevent_lock);
> > +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
> > +		INIT_LIST_HEAD(&u->kevent_list[i]);
> > +	u->kevent_num = 0;
> > +	
> > +	mutex_init(&u->ctl_mutex);
> > +	init_waitqueue_head(&u->wait);
> > +	u->max_ready_num = 0;
> 
> The above zeroes out a bunch of known-to-already-be-zero things.

Ok, I will remove redundant settings.

> > +static int kevnet_user_mmap(struct file *file, struct vm_area_struct *vma)
> 
> The function name is mistyped.
> 
> This code doesn't have many comments, does it?  What are we mapping here,
> and why would an application want to map it?

That code awaits comments from the people who requested it.
It is a ring of the ready events, which can be read by userspace instead
of calling the syscall, so the syscall just becomes "wait until there is
space" or something like that.

> And what are the portability implications?  Does userspace need to know the
> 64-bitness of its kernel to be able to work out the alignment of things? 
> If so, what happens if a later/different compiler aligns things
> differently?

There are no alignment issues - I use 32bit values everywhere.

> > +{
> > +	size_t size = vma->vm_end - vma->vm_start, psize;
> > +	int pnum = size/PAGE_SIZE, i;
> > +	unsigned long start = vma->vm_start;
> > +	struct kevent_user *u = file->private_data;
> > +
> > +	psize = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE);
> > +
> > +	if (size + vma->vm_pgoff*PAGE_SIZE != psize)
> > +		return -EINVAL;
> > +
> > +	if (vma->vm_flags & VM_WRITE)
> > +		return -EPERM;
> > +
> > +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > +
> > +	for (i=0; i<pnum; ++i) {
> > +		if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[i+vma->vm_pgoff]), PAGE_SIZE,
> > +					vma->vm_page_prot))
> > +			return -EAGAIN;
> > +		start += PAGE_SIZE;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> Is EAGAIN an appropriate return value?
> 
> If this function had a decent comment we could ask Hugh to review it.

It is a trivial ->mmap() implementation - the ring buffer, which consists
of several pages (allocated through __get_free_page()), is mapped into
userspace.
vm_pgoff is the offset inside that ring.

> > +#if 0
> > +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> > +{
> > +	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
> > +	
> > +	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
> > +	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
> > +
> > +	return h;
> > +}
> > +#else
> > +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> > +{
> > +	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
> > +}
> > +#endif
> > +
> > +static void kevent_free_rcu(struct rcu_head *rcu)
> > +{
> > +	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
> > +	kmem_cache_free(kevent_cache, kevent);
> > +}
> > +
> > +static void kevent_finish_user_complete(struct kevent *k, int deq)
> > +{
> > +	struct kevent_user *u = k->user;
> > +	unsigned long flags;
> > +
> > +	if (deq)
> > +		kevent_dequeue(k);
> > +
> > +	spin_lock_irqsave(&u->ready_lock, flags);
> > +	if (k->ready_entry.next != LIST_POISON1) {
> > +		list_del(&k->ready_entry);
> 
> list_del_rcu()?

No, ready list does not need RCU protection.

> > +		u->ready_num--;
> > +	}
> > +	spin_unlock_irqrestore(&u->ready_lock, flags);
> > +
> > +	kevent_user_put(u);
> > +	call_rcu(&k->rcu_head, kevent_free_rcu);
> > +}
> > +
> > +static void __kevent_finish_user(struct kevent *k, int deq)
> > +{
> > +	struct kevent_user *u = k->user;
> > +
> > +	list_del(&k->kevent_entry);
> > +	u->kevent_num--;
> > +	kevent_finish_user_complete(k, deq);
> > +}
> 
> No locking needed?

It is a special function which is called without the lock; the function
without the __ prefix holds the appropriate lock.

> It's hard to review uncommented code.  And the review is less useful if the
> reviewer cannot determine what the developer was attempting to do.

The comment is 5 lines below, where that function is called wrapped in
the appropriate lock.

> > +/*
> > + * Remove kevent from user's list of all events, 
> > + * dequeue it from storage and decrease user's reference counter,
> > + * since this kevent does not exist anymore. That is why it is freed here.
> > + */
> 
> That's nice.

Here it is.

> > +static void kevent_finish_user(struct kevent *k, int deq)
> > +{
> > +	struct kevent_user *u = k->user;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&u->kevent_lock, flags);
> > +	list_del(&k->kevent_entry);
> 
> list_del_rcu()?

No, this list does not require RCU protection, only storage_list
(storage_entry) requires that.

> > +	u->kevent_num--;
> > +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> > +	kevent_finish_user_complete(k, deq);
> > +}
> > +
> > +/*
> > + * Dequeue one entry from user's ready queue.
> > + */
> > +
> > +static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
> > +{
> > +	unsigned long flags;
> > +	struct kevent *k = NULL;
> > +
> > +	spin_lock_irqsave(&u->ready_lock, flags);
> > +	if (u->ready_num && !list_empty(&u->ready_list)) {
> > +		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
> > +		list_del(&k->ready_entry);
> > +		u->ready_num--;
> > +	}
> > +	spin_unlock_irqrestore(&u->ready_lock, flags);
> > +
> > +	return k;
> > +}
> > +
> > +static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
> > +		struct kevent_user *u)
> > +{
> > +	struct kevent *k;
> > +	int found = 0;
> > +	
> > +	list_for_each_entry(k, head, kevent_entry) {
> > +		spin_lock(&k->ulock);
> > +		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
> > +				k->event.id.raw[0] == uk->id.raw[0] && 
> > +				k->event.id.raw[1] == uk->id.raw[1]) {
> > +			found = 1;
> > +			spin_unlock(&k->ulock);
> > +			break;
> > +		}
> > +		spin_unlock(&k->ulock);
> > +	}
> > +
> > +	return (found)?k:NULL;
> > +}
> 
> Remove `found', do
> 
> 	struct kevent *ret = NULL;
> 
> 	...
> 		ret = k;
> 		break;
> 	...
> 	return ret;

Ok.
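So it becomes (untested):

	static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk,
			struct kevent_user *u)
	{
		struct kevent *k, *ret = NULL;

		list_for_each_entry(k, head, kevent_entry) {
			spin_lock(&k->ulock);
			if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
					k->event.id.raw[0] == uk->id.raw[0] &&
					k->event.id.raw[1] == uk->id.raw[1]) {
				ret = k;
				spin_unlock(&k->ulock);
				break;
			}
			spin_unlock(&k->ulock);
		}

		return ret;
	}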
 
> > +static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
> 
> <wonders what this function does>

Let me guess... It modifies kevent? :)
I will add comments.
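Something along these lines, say:

/*
 * Search the user's hash for the kevent matching @uk (by user data and
 * id), update its requested event mask and flags under k->ulock and
 * requeue it so the new mask is checked against the origin's state.
 * Returns 0 on success, negative error if no such kevent exists.
 */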

> > +{
> > +	struct kevent *k;
> > +	unsigned int hash = kevent_user_hash(uk);
> > +	int err = -ENODEV;
> > +	unsigned long flags;
> > +	
> > +	spin_lock_irqsave(&u->kevent_lock, flags);
> > +	k = __kevent_search(&u->kevent_list[hash], uk, u);
> > +	if (k) {
> > +		spin_lock(&k->ulock);
> > +		k->event.event = uk->event;
> > +		k->event.req_flags = uk->req_flags;
> > +		k->event.ret_flags = 0;
> > +		spin_unlock(&k->ulock);
> > +		kevent_requeue(k);
> > +		err = 0;
> > +	}
> > +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> > +	
> > +	return err;
> > +}
> 
> ENODEV: "No such device".  Doesn't sound appropriate.

ENOKEVENT? I expect ENODEV to mean "there is no requested thing".

> > +static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
> > +{
> > +	int err = -ENODEV;
> > +	struct kevent *k;
> > +	unsigned int hash = kevent_user_hash(uk);
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&u->kevent_lock, flags);
> > +	k = __kevent_search(&u->kevent_list[hash], uk, u);
> > +	if (k) {
> > +		__kevent_finish_user(k, 1);
> > +		err = 0;
> > +	}
> > +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> > +
> > +	return err;
> > +}
> > +
> > +/*
> > + * No new entry can be added or removed from any list at this point.
> > + * It is not permitted to call ->ioctl() and ->release() in parallel.
> > + */
> > +static int kevent_user_release(struct inode *inode, struct file *file)
> > +{
> > +	struct kevent_user *u = file->private_data;
> > +	struct kevent *k, *n;
> > +	int i;
> > +
> > +	for (i=0; i<KEVENT_HASH_MASK+1; ++i) {
> 
> ARRAY_SIZE

Ok.

> > +		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
> > +			kevent_finish_user(k, 1);
> > +	}
> > +
> > +	kevent_user_put(u);
> > +	file->private_data = NULL;
> > +
> > +	return 0;
> > +}
> > +
> > +static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
> > +{
> > +	struct ukevent *ukev;
> > +
> > +	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
> > +	if (!ukev)
> > +		return NULL;
> > +
> > +	if (copy_from_user(arg, ukev, sizeof(struct ukevent) * num)) {
> > +		kfree(ukev);
> > +		return NULL;
> > +	}
> > +
> > +	return ukev;
> > +}
> 
> The copy_fom_user() args are reversed.
> 
> This is serious breakage and raises concerns about the amount of testing
> which has been performed.

It is a typo in the new code, which was added by request in this thread.

> AFAICT there is no bounds checking on `num', so the user can force a
> deliberate multiplication overflow and cause havoc here.

It is checked when the kevent is added into the ring; if a kevent was not
added and is going to be removed or modified, it is simply dropped with an
appropriate return code.
It should also be checked against u->kevent_num here.
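So, something like this (untested sketch):

	static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
	{
		struct ukevent *ukev;

		/* Bound the multiplication below. */
		if (num > KEVENT_MAX_EVENTS)
			return NULL;

		ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
		if (!ukev)
			return NULL;

		/* Note the fixed argument order here. */
		if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
			kfree(ukev);
			return NULL;
		}

		return ukev;
	}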

> > +static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
> > +{
> > +	int err = 0, i;
> > +	struct ukevent uk;
> > +
> > +	mutex_lock(&u->ctl_mutex);
> > +	
> > +	if (num > KEVENT_MIN_BUFFS_ALLOC) {
> > +		struct ukevent *ukev;
> > +
> > +		ukev = kevent_get_user(num, arg);
> > +		if (ukev) {
> > +			for (i=0; i<num; ++i) {
> > +				if (kevent_modify(&ukev[i], u))
> > +					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
> > +				ukev[i].ret_flags |= KEVENT_RET_DONE;
> > +			}
> > +			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
> > +				err = -EINVAL;
> 
> EFAULT

Ok.

> > +			kfree(ukev);
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	for (i=0; i<num; ++i) {
> > +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> > +			err = -EINVAL;
> 
> EFAULT
> 
> > +			break;
> > +		}
> > +
> > +		if (kevent_modify(&uk, u))
> > +			uk.ret_flags |= KEVENT_RET_BROKEN;
> > +		uk.ret_flags |= KEVENT_RET_DONE;
> > +
> > +		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
> > +			err = -EINVAL;
> 
> EFAULT.
> 
> > +		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
> > +			err = -EINVAL;
> 
> EFAULT (all over the place).

Ok, I will return EFAULT when copy*user fails.

> > +static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
> > +{
> > +	unsigned long flags;
> > +	unsigned int hash = kevent_user_hash(&k->event);
> > +
> > +	spin_lock_irqsave(&u->kevent_lock, flags);
> > +	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
> > +	u->kevent_num++;
> > +	kevent_user_get(u);
> > +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> > +}
> 
> kevent_user_get() can be moved outside the lock?

Yes.
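I.e.:

	kevent_user_get(u);
	spin_lock_irqsave(&u->kevent_lock, flags);
	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
	u->kevent_num++;
	spin_unlock_irqrestore(&u->kevent_lock, flags);

since taking the reference does not need the list lock.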

> > +/*
> > + * Copy all ukevents from userspace, allocate kevent for each one 
> > + * and add them into appropriate kevent_storages, 
> > + * e.g. sockets, inodes and so on...
> > + * If something goes wrong, all events will be dequeued and 
> > + * a negative error will be returned. 
> > + * On success the number of finished events is returned, and an 
> > + * array of finished events (struct ukevent) will be placed behind the 
> > + * kevent_user_control structure. The user must run through that array and check 
> > + * the ret_flags field of each ukevent structure to determine if it is a fired or failed event.
> > + */
> > +static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
> > +{
> > +	int err, cerr = 0, knum = 0, rnum = 0, i;
> > +	void __user *orig = arg;
> > +	struct ukevent uk;
> > +
> > +	mutex_lock(&u->ctl_mutex);
> > +
> > +	err = -ENFILE;
> > +	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
> 
> Can a malicious user force an arithmetic overflow here?

All numbers here are unsigned and are compared against 4096.
So, answer is no.

> > +		goto out_remove;
> > +
> > +	if (num > KEVENT_MIN_BUFFS_ALLOC) {
> > +		struct ukevent *ukev;
> > +
> > +		ukev = kevent_get_user(num, arg);
> > +		if (ukev) {
> > +			for (i=0; i<num; ++i) {
> > +				err = kevent_user_add_ukevent(&ukev[i], u);
> > +				if (err) {
> > +					kevent_user_stat_increase_im(u);
> > +					if (i != rnum)
> > +						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
> > +					rnum++;
> 
> What's happening here?  The games with `rnum' and comparing it with `i'??
> 
> Perhaps these are not the best-chosen identifiers..

When a kevent is ready immediately, it is copied back into the same
buffer at the next ready position (rnum is the number of ready events so
far). The kevent at that position was not ready immediately (otherwise it
would have been copied and rnum increased), so it has already been copied
into the queue and its slot can be overwritten here.

> > +/*
> > + * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
> > + * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
> > + */
> > +static int kevent_user_wait(struct file *file, struct kevent_user *u, 
> > +		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
> > +		void __user *buf)
> > +{
> > +	struct kevent *k;
> > +	int cerr = 0, num = 0;
> > +
> > +	if (!(file->f_flags & O_NONBLOCK)) {
> > +		wait_event_interruptible_timeout(u->wait, 
> > +			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
> > +	}
> > +	
> > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > +		if (copy_to_user(buf + num*sizeof(struct ukevent), 
> > +					&k->event, sizeof(struct ukevent))) {
> > +			cerr = -EINVAL;
> > +			break;
> > +		}
> > +
> > +		/*
> > +		 * If it is one-shot kevent, it has been removed already from
> > +		 * origin's queue, so we can easily free it here.
> > +		 */
> > +		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
> > +			kevent_finish_user(k, 1);
> > +		++num;
> > +		kevent_user_stat_increase_wait(u);
> > +	}
> > +
> > +	return (cerr)?cerr:num;
> > +}
> 
> So if this returns an error, the user doesn't know how many events were
> actually completed?   That doesn't seem good.

What is the alternative?
read() works the same way - either an error or the number of bytes read.

> > +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
> 
> At some point Michael will want to be writing the manpages for things like
> this.  He'll start out by reading the comment block, poor guy.

I will add comments.

> > +{
> > +	int err = -EINVAL;
> > +	struct file *file;
> > +
> > +	if (cmd == KEVENT_CTL_INIT)
> > +		return kevent_ctl_init();
> > +
> > +	file = fget(fd);
> > +	if (!file)
> > +		return -ENODEV;
> > +
> > +	if (file->f_op != &kevent_user_fops)
> > +		goto out_fput;
> > +
> > +	err = kevent_ctl_process(file, cmd, num, arg);
> > +
> > +out_fput:
> > +	fput(file);
> > +	return err;
> > +}

So let me quote your first words about kevent:

> Summary:
> 
> - has serious bugs which indicate that much better testing is needed.
> 
> - All -EFOO return values need to be reviewed for appropriateness
> 
> - needs much better commenting before I can do more than a local-level review.

As far as I can see there are no serious bugs except the absence of two
checks and a typo in argument order, which obviously will be fixed.
All -EFOO values will be changed according to the comments, and better
comments will be added.

Thank you for review, Andrew.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  6:14                               ` Evgeniy Polyakov
@ 2006-08-10  6:42                                 ` David Miller
  2006-08-10  6:48                                   ` Evgeniy Polyakov
  2006-08-10  7:18                                 ` Andrew Morton
  1 sibling, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-10  6:42 UTC (permalink / raw)
  To: johnpol; +Cc: akpm, linux-kernel, drepper, netdev, zach.brown

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Thu, 10 Aug 2006 10:14:33 +0400

> On Wed, Aug 09, 2006 at 03:21:27PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > On big-endian machines, this pointer will appear to be word-swapped as far
> > as a 64-bit kernel is concerned.  Or something.
> > 
> > IOW: What's going on here??
> 
> It is user data - I put a union there just to simplify userspace, so it
> should not require any typecasting.

And this is consistent with the similar mechanism we use for
netlink socket dumping, so that we don't have compat layer
crap just because we provide a place for the user to store
his pointer or whatever there.

> > > +	k->kevent_entry.next = LIST_POISON1;
> > > +	k->storage_entry.prev = LIST_POISON2;
> > > +	k->ready_entry.next = LIST_POISON1;
> > 
> > Nope ;)
> 
> I use pointer checks to determine whether an entry is in the list or
> not; why is that frowned upon here?

As Andrew mentioned in another posting, these poison macros
are likely to simply go away some day, so you should not use
them.

If you want pointer encoded tags you use internally, define your own.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  6:42                                 ` David Miller
@ 2006-08-10  6:48                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10  6:48 UTC (permalink / raw)
  To: David Miller; +Cc: akpm, linux-kernel, drepper, netdev, zach.brown

On Wed, Aug 09, 2006 at 11:42:35PM -0700, David Miller (davem@davemloft.net) wrote:
> > > > +	k->kevent_entry.next = LIST_POISON1;
> > > > +	k->storage_entry.prev = LIST_POISON2;
> > > > +	k->ready_entry.next = LIST_POISON1;
> > > 
> > > Nope ;)
> > 
> > I use pointer checks to determine whether an entry is in the list or
> > not; why is that frowned upon here?
> 
> As Andrew mentioned in another posting, these poison macros
> are likely to simply go away some day, so you should not use
> them.

They have existed for ages and suddenly they can go away?..
 
> If you want pointer encoded tags you use internally, define your own.

I think if I add code like this
list_del(&k->entry);
k->entry.prev = KEVENT_POISON1;
k->entry.next = KEVENT_POISON2;

I will be advised to give myself a lobotomy.

I have enough space in flags in each kevent, so I will use some bits there.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  6:14                               ` Evgeniy Polyakov
  2006-08-10  6:42                                 ` David Miller
@ 2006-08-10  7:18                                 ` Andrew Morton
  2006-08-10  7:50                                   ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-10  7:18 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, 10 Aug 2006 10:14:33 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> > > +	union {
> > > +		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
> > > +		void		*ptr;
> > > +	};
> > > +};
> > 
> > What is this union for?
> > 
> > `ptr' needs a __user tag, does it not?
> 
No, it is never touched by the kernel.

hrm, if you say so.

> > > +/*
> > > + * Must be called before event is going to be added into some origin's queue.
> > > + * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
> > > + * If failed, kevent should not be used or kevent_enqueue() will fail to add
> > > + * this kevent into origin's queue with setting
> > > + * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
> > > + */
> > > +int kevent_init(struct kevent *k)
> > > +{
> > > +	spin_lock_init(&k->ulock);
> > > +	k->kevent_entry.next = LIST_POISON1;
> > > +	k->storage_entry.prev = LIST_POISON2;
> > > +	k->ready_entry.next = LIST_POISON1;
> > 
> > Nope ;)
> 
> I use pointer checks to determine whether an entry is in the list or
> not; why is that frowned upon here?
> Please do not talk about poisoning which takes a lot of cpu cycles to
> fetch new cachelines and so on - everything in that entry is in the
> cache, since the entry was added/deleted/accessed through the list walk
> macro.

"poisoning which takes a lot of cpu cycles".  So there ;)

I assure you, that poisoning code might disappear at any time.

If you want to be able to determine whether a list_head has been detached
you can detach it with list_del_init() and then use list_empty() on it.
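That is, roughly:

	INIT_LIST_HEAD(&k->ready_entry);	/* starts out detached */
	...
	list_del_init(&k->ready_entry);		/* detach, stays self-linked */
	...
	if (!list_empty(&k->ready_entry))	/* it is on some list */

with no poison values involved.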

> > > +}
> > > +
> > > +late_initcall(kevent_sys_init);
> > 
> > Why is it late_initcall?  (A comment is needed)
> 
> Why not?

Why?

There must have been some reason for having made this a late_initcall() and
that reason is 100% concealed from the reader of this code.

IOW, it needs a comment.

> > > +static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
> > > +{
> > > +	unsigned int *idx;
> > > +	
> > > +	idx = (unsigned int *)u->pring[0];
> > 
> > This is a bit ugly.
> 
> I specifically use the first 4 bytes of the first page to store the index
> there, since it must be accessed from both userspace and kernelspace.

Sure, but the C language is the preferred way in which we communicate and
calculate pointer offsets.

> > > +	idx[0] = num;
> > > +}
> > > +
> > > +/*
> > > + * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
> > > + * so we reuse 4 bytes at the beginning of the first page to store the index.
> > > + * Take that into account if you want to change size of struct ukevent.
> > > + */
> > > +#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
> > 
> > How about doing
> > 
> > 	struct ukevent_ring {
> > 		unsigned int index;
> > 		struct ukevent[0];
> > 	}
> > 
> > and removing all those nasty typecasting and offsetting games?
> > 
> > In fact you can even do
> > 
> > 	struct ukevent_ring {
> > 		struct ukevent[(PAGE_SIZE - sizeof(unsigned int)) /
> > 				sizeof(struct ukevent)];
> > 		unsigned int index;
> > 	};
> > 
> > if you're careful ;)
> 
> The ring takes more than one page, so it would have to be
> struct ukevent_ring_0 and struct ukevent_ring_other.
> Is that really needed?
> Not a big problem - if you think it is worth it, I will do it.

Well, I've given a couple of prototype-style suggestions.  Please take a
look, see if all this open-coded offsetting magic can be done by the
compiler in some reliable and readable fashion.  It might not work out, but
I suspect it will.

> > > +	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
> > > +	if (!u->pring)
> > > +		return -ENOMEM;
> > > +
> > > +	for (i=0; i<pnum; ++i) {
> > > +		u->pring[i] = __get_free_page(GFP_KERNEL);
> > > +		if (!u->pring)
> > 
> > bug: this is testing the wrong thing.
> 
> How come?

Take a closer look ;)

> __get_free_page() can return 0 if page was not allocated.

And that 0 is copied to u->pring[0], not to u->pring.

> > The function name is mistyped.

Did you miss an "OK"?  It needs s/kevnet_user_mmap/kevent_user_mmap/g

> > This code doesn't have many comments, does it?  What are we mapping here,
> > and why would an application want to map it?
> 
> That code awaits comments from the people who requested it.
> It is a ring of the ready events, which can be read by userspace instead
> of calling the syscall, so the syscall just becomes "wait until there is
> space" or something like that.

hm.  Well, please fully comment code prior to sending it out for review.  I
do go on about this, but trust me, it makes the review much more effective.

Afaict this mmap function gives a user a free way of getting pinned memory. 
What is the upper bound on the amount of memory which a user can thus
obtain?

> > > +static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
> > 
> > <wonders what this function does>
> 
> Let me guess... It modifies kevent? :)
> I will add comments.
> 
> > > +{
> > > +	struct kevent *k;
> > > +	unsigned int hash = kevent_user_hash(uk);
> > > +	int err = -ENODEV;
> > > +	unsigned long flags;
> > > +	
> > > +	spin_lock_irqsave(&u->kevent_lock, flags);
> > > +	k = __kevent_search(&u->kevent_list[hash], uk, u);
> > > +	if (k) {
> > > +		spin_lock(&k->ulock);
> > > +		k->event.event = uk->event;
> > > +		k->event.req_flags = uk->req_flags;
> > > +		k->event.ret_flags = 0;
> > > +		spin_unlock(&k->ulock);
> > > +		kevent_requeue(k);
> > > +		err = 0;
> > > +	}
> > > +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> > > +	
> > > +	return err;
> > > +}
> > 
> > ENODEV: "No such device".  Doesn't sound appropriate.
> 
> ENOKEVENT? I expect ENODEV to mean "there is no requested thing".

yes, it is hard to map standard errnos onto new and complex non-standard
features.

I don't have a good answer to this, sorry.

Perhaps we should do

#define EPER_SYSCALL_BASE 0x10000

and then each syscall is free to implement new, syscall-specific errnos
starting at this base.  But that might be a stupid idea - I don't know. 
I'm sure the implementor of strerror() would think so ;)

> > 
> > EFAULT (all over the place).
> 
> Ok, I will return EFAULT when copy*user fails.

If that makes sense, fine.  Sometimes it makes sense to return the number
of bytes transferred up to the point of the fault.  Please have a careful
think and decide which behaviour is best in each of these cases.

> > > +	int err, cerr = 0, knum = 0, rnum = 0, i;
> > > +	void __user *orig = arg;
> > > +	struct ukevent uk;
> > > +
> > > +	mutex_lock(&u->ctl_mutex);
> > > +
> > > +	err = -ENFILE;
> > > +	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
> > 
> > Can a malicious user force an arithmetic overflow here?
> 
> All numbers here are unsigned and are compared against 4096.

Are they?  I only see a comparison of a _sum_ against KEVENT_MAX_EVENTS. 
So if the user passes 0x0800,0xfffffff0, for example?

> So, answer is no.
> 
> > > +		goto out_remove;
> > > +
> > > +	if (num > KEVENT_MIN_BUFFS_ALLOC) {
> > > +		struct ukevent *ukev;
> > > +
> > > +		ukev = kevent_get_user(num, arg);
> > > +		if (ukev) {
> > > +			for (i=0; i<num; ++i) {
> > > +				err = kevent_user_add_ukevent(&ukev[i], u);
> > > +				if (err) {
> > > +					kevent_user_stat_increase_im(u);
> > > +					if (i != rnum)
> > > +						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
> > > +					rnum++;
> > 
> > What's happening here?  The games with `rnum' and comparing it with `i'??
> > 
> > Perhaps these are not the best-chosen identifiers..
> 
> When a kevent is ready immediately, it is copied back into the same
> buffer at the next ready position (rnum is the number of ready events so
> far). The kevent at that position was not ready immediately (otherwise it
> would have been copied and rnum increased), so it has already been copied
> into the queue and its slot can be overwritten here.

If you say so ;)

Please bear in mind that Michael Kerrisk <mtk-manpages@gmx.net> will want
to be writing manpages for all this stuff.

And I must say that Michael repeatedly and correctly dragged me across the
coals for something as simple and stupid as sys_sync_file_range().  Based
on that experience, I wouldn't consider a new syscall like this to be
settled until Michael has fully understood it.  And I suspect he doesn't
fully understand it until he has fully documented it.

> > > +/*
> > > + * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
> > > + * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
> > > + */
> > > +static int kevent_user_wait(struct file *file, struct kevent_user *u, 
> > > +		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
> > > +		void __user *buf)
> > > +{
> > > +	struct kevent *k;
> > > +	int cerr = 0, num = 0;
> > > +
> > > +	if (!(file->f_flags & O_NONBLOCK)) {
> > > +		wait_event_interruptible_timeout(u->wait, 
> > > +			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
> > > +	}
> > > +	
> > > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > > +		if (copy_to_user(buf + num*sizeof(struct ukevent), 
> > > +					&k->event, sizeof(struct ukevent))) {
> > > +			cerr = -EINVAL;
> > > +			break;
> > > +		}
> > > +
> > > +		/*
> > > +		 * If it is one-shot kevent, it has been removed already from
> > > +		 * origin's queue, so we can easily free it here.
> > > +		 */
> > > +		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
> > > +			kevent_finish_user(k, 1);
> > > +		++num;
> > > +		kevent_user_stat_increase_wait(u);
> > > +	}
> > > +
> > > +	return (cerr)?cerr:num;
> > > +}
> > 
> > So if this returns an error, the user doesn't know how many events were
> > actually completed?   That doesn't seem good.
> 
> What is the alternative?
> read() works the same way - either an error or the number of bytes read.

No.  If read() hits an IO error or EFAULT partway through, read() will
return the number-of-bytes-transferred.  read() will only return a -ve
errno if it transferred zero bytes.  This way, there is no lost
information.

However kevent_user_wait() will return a -ve errno even if it has reaped
some events.  That's lost information and this might make it hard for a
robust userspace client to implement error recovery?

> So let me quote your first words about kevent:
> 
> > Summary:
> > 
> > - has serious bugs which indicate that much better testing is needed.
> > 
> > - All -EFOO return values need to be reviewed for appropriateness
> > 
> > - needs much better commenting before I can do more than a local-level review.
> 
> As far as I can see there are no serious bugs except the absence of two
> checks and a typo in argument order, which obviously will be fixed.

Thus far I have found at least two bugs in this patchset which provide at
least a local DoS and possibly a privilege escalation (aka a roothole) to
local users.  We hit a similar one in the epoll() implementation a while
back.

This is serious stuff.  So experience tells us to be fanatical in the
checking of incoming syscall args.  Check all the arguments to death for
correct values.  Look out for overflows in additions and multiplications. 
Look out for opportunities for excessive resource consumption. 
Exhaustively test your new syscalls with many combinations of values when
the kernel is in various states.

> All -EFOO values will be changed according to the comments, and better
> comments will be added.

Thanks.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  7:18                                 ` Andrew Morton
@ 2006-08-10  7:50                                   ` Evgeniy Polyakov
  2006-08-10  8:02                                     ` Andrew Morton
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10  7:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 10, 2006 at 12:18:44AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > > +	spin_lock_init(&k->ulock);
> > > > +	k->kevent_entry.next = LIST_POISON1;
> > > > +	k->storage_entry.prev = LIST_POISON2;
> > > > +	k->ready_entry.next = LIST_POISON1;
> > > 
> > > Nope ;)
> > 
> > I use pointer checks to determine whether an entry is in the list or
> > not; why is that frowned upon here?
> > Please do not talk about poisoning which takes a lot of cpu cycles to
> > fetch new cachelines and so on - everything in that entry is in the
> > cache, since the entry was added/deleted/accessed through the list walk
> > macro.
> 
> "poisoning which takes a lot of cpu cycles".  So there ;)
> 
> I assure you, that poisoning code might disappear at any time.
> 
> If you want to be able to determine whether a list_head has been detached
> you can detach it with list_del_init() and then use list_empty() on it.

I can't due to RCU rules.

> > > > +}
> > > > +
> > > > +late_initcall(kevent_sys_init);
> > > 
> > > Why is it late_initcall?  (A comment is needed)
> > 
> > Why not?
> 
> Why?
> 
> There must have been some reason for having made this a late_initcall() and
> that reason is 100% concealed from the reader of this code.

kevent must be initialized before use, and that must happen before
userspace starts, so I use late_initcall(); as I said, it can be anything
else which is called before userspace starts.
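E.g. something like this above it:

/*
 * Kevent caches and callback tables only have to be ready before
 * userspace starts; nothing in the kernel uses them earlier, so any
 * initcall level which runs before init is acceptable.
 */
late_initcall(kevent_sys_init);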

> IOW, it needs a comment.

Sure.
I'm working right now on fixing all the issues mentioned in this thread,
and comments are not the least of them.

> > > > +static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
> > > > +{
> > > > +	unsigned int *idx;
> > > > +	
> > > > +	idx = (unsigned int *)u->pring[0];
> > > 
> > > This is a bit ugly.
> > 
> > I specifically use the first 4 bytes of the first page to store the index
> > there, since it must be accessed from both userspace and kernelspace.
> 
> Sure, but the C language is the preferred way in which we communicate and
> calculate pointer offsets.
> 
> > > > +	idx[0] = num;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
> > > > + * so we reuse 4 bytes at the beginning of the first page to store the index.
> > > > + * Take that into account if you want to change size of struct ukevent.
> > > > + */
> > > > +#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
> > > 
> > > How about doing
> > > 
> > > 	struct ukevent_ring {
> > > 		unsigned int index;
> > > 		struct ukevent[0];
> > > 	}
> > > 
> > > and removing all those nasty typecasting and offsetting games?
> > > 
> > > In fact you can even do
> > > 
> > > 	struct ukevent_ring {
> > > 		struct ukevent[(PAGE_SIZE - sizeof(unsigned int)) /
> > > 				sizeof(struct ukevent)];
> > > 		unsigned int index;
> > > 	};
> > > 
> > > if you're careful ;)
> > 
> > The ring takes more than one page, so it would have to be
> > struct ukevent_ring_0 and struct ukevent_ring_other.
> > Is that really needed?
> > Not a big problem - if you think it is worth it, I will do it.
> 
> Well, I've given a couple of prototype-style suggestions.  Please take a
> look, see if all this open-coded offsetting magic can be done by the
> compiler in some reliable and readable fashion.  It might not work out, but
> I suspect it will.

I think I will use a structure with an index on each page, since kevents
do not exactly fit a page, and it can be some kind of (later) optimisation
to use per-page counters instead of a global one.
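Roughly like this (the name is made up):

	struct kevent_ring_page {
		unsigned int	index;	/* per-page counter of ready events */
		struct ukevent	event[(PAGE_SIZE - sizeof(unsigned int)) /
						sizeof(struct ukevent)];
	};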

> > > > +	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
> > > > +	if (!u->pring)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	for (i=0; i<pnum; ++i) {
> > > > +		u->pring[i] = __get_free_page(GFP_KERNEL);
> > > > +		if (!u->pring)
> > > 
> > > bug: this is testing the wrong thing.
> > 
> > How come?
> 
> Take a closer look ;)

[i] My fault :)

> > __get_free_page() can return 0 if page was not allocated.
> 
> And that 0 is copied to u->pring[0], not to u->pring.
> 
> > > The function name is mistyped.
> 
> Did you miss an "OK"?  It needs s/kevnet_user_mmap/kevent_user_mmap/g

It is already fixed :)

> > > This code doesn't have many comments, does it?  What are we mapping here,
> > > and why would an application want to map it?
> > 
> > That code awaits comments from the people who requested it.
> > It is a ring of the ready events, which can be read by userspace instead
> > of calling the syscall, so the syscall just becomes "wait until there is
> > space" or something like that.
> 
> hm.  Well, please fully comment code prior to sending it out for review.  I
> do go on about this, but trust me, it makes the review much more effective.
> 
> Afaict this mmap function gives a user a free way of getting pinned memory. 
> What is the upper bound on the amount of memory which a user can thus
> obtain?

it is limited by the maximum queue length, which is 4k entries right now,
so the maximum number of pages here is 4k*40/page_size, i.e. about 40
pages on x86.

> > > > +static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
> > > 
> > > <wonders what this function does>
> > 
> > Let me guess... It modifies kevent? :)
> > I will add comments.
> > 
> > > > +{
> > > > +	struct kevent *k;
> > > > +	unsigned int hash = kevent_user_hash(uk);
> > > > +	int err = -ENODEV;
> > > > +	unsigned long flags;
> > > > +	
> > > > +	spin_lock_irqsave(&u->kevent_lock, flags);
> > > > +	k = __kevent_search(&u->kevent_list[hash], uk, u);
> > > > +	if (k) {
> > > > +		spin_lock(&k->ulock);
> > > > +		k->event.event = uk->event;
> > > > +		k->event.req_flags = uk->req_flags;
> > > > +		k->event.ret_flags = 0;
> > > > +		spin_unlock(&k->ulock);
> > > > +		kevent_requeue(k);
> > > > +		err = 0;
> > > > +	}
> > > > +	spin_unlock_irqrestore(&u->kevent_lock, flags);
> > > > +	
> > > > +	return err;
> > > > +}
> > > 
> > > ENODEV: "No such device".  Doesn't sound appropriate.
> > 
> > ENOKEVENT? I expect ENODEV to mean "there is no requested thing".
> 
> yes, it is hard to map standard errnos onto new and complex non-standard
> features.
> 
> I don't have a good answer to this, sorry.
> 
> Perhaps we should do
> 
> #define EPER_SYSCALL_BASE 0x10000
> 
> and then each syscall is free to implement new, syscall-specific errnos
> starting at this base.  But that might be a stupid idea - I don't know. 
> I'm sure the implementor of strerror() would think so ;)

There are some issues with errno and rules for kernel-only errno
holes... Let's just return EINVAL here.

> > > EFAULT (all over the place).
> > 
> > Ok, I will return EFAULT when copy*user fails.
> 
> If that makes sense, fine.  Sometimes it makes sense to return the number
> of bytes transferred up to the point of the fault.  Please have a careful
> think and decide which behaviour is best in each of these cases.

No, it is much better to show that things are broken.
Half of an event transferred will not help the user, since his pointer is
the last data in the structure :)

> > > > +	int err, cerr = 0, knum = 0, rnum = 0, i;
> > > > +	void __user *orig = arg;
> > > > +	struct ukevent uk;
> > > > +
> > > > +	mutex_lock(&u->ctl_mutex);
> > > > +
> > > > +	err = -ENFILE;
> > > > +	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
> > > 
> > > Can a malicious user force an arithmetic overflow here?
> > 
> > All numbers here are unsigned and are compared against 4096.
> 
> Are they?  I only see a comparison of a _sum_ against KEVENT_MAX_EVENTS. 
> So if the user passes 0x0800,0xfffffff0, for example?

I've already added a check for that user value; you are correct that the
sum can overflow.
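Something like this should be enough (untested):

	if (num > KEVENT_MAX_EVENTS ||
			u->kevent_num + num >= KEVENT_MAX_EVENTS)
		goto out_remove;

since once num itself is bounded the sum cannot wrap.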

> > So, answer is no.
> > 
> > > > +		goto out_remove;
> > > > +
> > > > +	if (num > KEVENT_MIN_BUFFS_ALLOC) {
> > > > +		struct ukevent *ukev;
> > > > +
> > > > +		ukev = kevent_get_user(num, arg);
> > > > +		if (ukev) {
> > > > +			for (i=0; i<num; ++i) {
> > > > +				err = kevent_user_add_ukevent(&ukev[i], u);
> > > > +				if (err) {
> > > > +					kevent_user_stat_increase_im(u);
> > > > +					if (i != rnum)
> > > > +						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
> > > > +					rnum++;
> > > 
> > > What's happening here?  The games with `rnum' and comparing it with `i'??
> > > 
> > > Perhaps these are not the best-chosen identifiers..
> > 
> > When a kevent is ready immediately, it is copied back into the same
> > buffer at the next ready position (rnum is the number of ready events so
> > far). The kevent at that position was not ready immediately (otherwise it
> > would have been copied and rnum increased), so it has already been copied
> > into the queue and its slot can be overwritten here.
> 
> If you say so ;)
> 
> Please bear in mind that Michael Kerrisk <mtk-manpages@gmx.net> will want
> to be writing manpages for all this stuff.
> 
> And I must say that Michael repeatedly and correctly dragged me across the
> coals for something as simple and stupid as sys_sync_file_range().  Based
> on that experience, I wouldn't consider a new syscall like this to be
> settled until Michael has fully understood it.  And I suspect he doesn't
> fully understand it until he has fully documented it.

Yep, writing comments is never the cool part...

> > > > +/*
> > > > + * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
> > > > + * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
> > > > + */
> > > > +static int kevent_user_wait(struct file *file, struct kevent_user *u, 
> > > > +		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
> > > > +		void __user *buf)
> > > > +{
> > > > +	struct kevent *k;
> > > > +	int cerr = 0, num = 0;
> > > > +
> > > > +	if (!(file->f_flags & O_NONBLOCK)) {
> > > > +		wait_event_interruptible_timeout(u->wait, 
> > > > +			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
> > > > +	}
> > > > +	
> > > > +	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
> > > > +		if (copy_to_user(buf + num*sizeof(struct ukevent), 
> > > > +					&k->event, sizeof(struct ukevent))) {
> > > > +			cerr = -EINVAL;
> > > > +			break;
> > > > +		}
> > > > +
> > > > +		/*
> > > > +		 * If it is one-shot kevent, it has been removed already from
> > > > +		 * origin's queue, so we can easily free it here.
> > > > +		 */
> > > > +		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
> > > > +			kevent_finish_user(k, 1);
> > > > +		++num;
> > > > +		kevent_user_stat_increase_wait(u);
> > > > +	}
> > > > +
> > > > +	return (cerr)?cerr:num;
> > > > +}
> > > 
> > > So if this returns an error, the user doesn't know how many events were
> > > actually completed?   That doesn't seem good.
> > 
> > What is the alternative?
> > read() works the same way - either an error or the number of bytes read.
> 
> No.  If read() hits an IO error or EFAULT partway through, read() will
> return the number-of-bytes-transferred.  read() will only return a -ve
> errno if it transferred zero bytes.  This way, there is no lost
> information.
> 
> However kevent_user_wait() will return a -ve errno even if it has reaped
> some events.  That's lost information and this might make it hard for a
> robust userspace client to implement error recovery?

I have no strong opinion on what value must be returned here.
I have always thought that if an error happened, it must be indicated.
But it is perfectly ok to return the number of correctly read kevents,
and userspace can compare that number with the requested number.
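E.g. the return could become (untested):

	return (num == 0) ? cerr : num;

so an error is only reported when nothing was copied at all, like read().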

> > So let me quote your first words about kevent:
> > 
> > > Summary:
> > > 
> > > - has serious bugs which indicate that much better testing is needed.
> > > 
> > > - All -EFOO return values need to be reviewed for appropriateness
> > > 
> > > - needs much better commenting before I can do more than a local-level review.
> > 
> > As far as I can see there are no serious bugs except the absence of two
> > checks and a typo in argument order, which obviously will be fixed.
> 
> Thus far I have found at least two bugs in this patchset which provide at
> least a local DoS and possibly a privilege escalation (aka a roothole) to
> local users.  We hit a similar one in the epoll() implementation a while
> back.

It is already fixed.

> This is serious stuff.  So experience tells us to be fanatical in the
> checking of incoming syscall args.  Check all the arguments to death for
> correct values.  Look out for overflows in additions and multiplications. 
> Look out for opportunities for excessive resource consumption. 
> Exhaustively test your new syscalls with many combinations of values when
> the kernel is in various states.

Agreed, such security-related issues must be reviewed and tested as much
as possible.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  7:50                                   ` Evgeniy Polyakov
@ 2006-08-10  8:02                                     ` Andrew Morton
  2006-08-10  8:22                                       ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-10  8:02 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, 10 Aug 2006 11:50:47 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> > Afaict this mmap function gives a user a free way of getting pinned memory. 
> > What is the upper bound on the amount of memory which a user can thus
> > obtain?
> 
> it is limited by the maximum queue length, which is 4k entries right now,
> so the maximum number of pages here is 4k*40/page_size, i.e. about 40
> pages on x86.

Is that per user or per fd?  If the latter that is, with the usual
RLIMIT_NOFILE, 160MBytes.  2GB with 64k pagesize.  Problem ;)


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  8:02                                     ` Andrew Morton
@ 2006-08-10  8:22                                       ` Evgeniy Polyakov
  2006-08-11  0:56                                         ` Andrew Morton
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10  8:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 10, 2006 at 01:02:54AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > Afaict this mmap function gives a user a free way of getting pinned memory. 
> > > What is the upper bound on the amount of memory which a user can thus
> > > obtain?
> > 
> > it is limited by the maximum queue length, which is 4k entries right now,
> > so the maximum number of pages here is 4k*40/page_size, i.e. about 40
> > pages on x86.
> 
> Is that per user or per fd?  If the latter that is, with the usual
> RLIMIT_NOFILE, 160MBytes.  2GB with 64k pagesize.  Problem ;)

Per kevent fd.
I have some ideas about a better mmap ring implementation, which would
dynamically grow its buffer when events are added and reuse the same
space for subsequent events, but there are some unresolved nitpicks.
Let's not look there in the next releases (no merge of course) until a
better solution is ready. I will change that area when other things are
ready.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take7 0/1] kevent: generic event handling mechanism.
  2006-08-09 22:21                             ` Andrew Morton
  2006-08-10  6:14                               ` Evgeniy Polyakov
@ 2006-08-10 12:12                               ` Evgeniy Polyakov
  2006-08-10 12:16                                 ` [take7 1/1] kevent: core files and timer/poll notifications Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10 12:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, netdev, Zach Brown

Hello.

Generic event handling mechanism.

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning to detect whether an entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes
 * fully removed AIO stuff from patchset

Changes from 'take5' patchset:
 * removed compilation warnings about unused variables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so lockdep was taught 
	to differentiate between the various cases
 * remove kevent from storage if it is marked as broken after callback
 * fixed a typo in the mmapped buffer implementation which would end up in wrong index calculation 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use the kevent_user_ctl structure; instead provide the needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take7 1/1] kevent: core files and timer/poll notifications.
  2006-08-10 12:12                               ` [take7 0/1] kevent: generic event handling mechanism Evgeniy Polyakov
@ 2006-08-10 12:16                                 ` Evgeniy Polyakov
  2006-08-10 12:22                                   ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10 12:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, netdev, Zach Brown

This patch includes core kevent files:
- userspace controlling
- kernelspace interfaces
- initialization
- notification state machines
- timer and poll/select notifications

With this patchset the request rate has reached 2500 requests/sec, while
with epoll/kqueue and similar techniques it is about 1600-1800 requests
per second on my test hardware and a trivial web server.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..091ff42 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,5 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..b2af4a8 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,6 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..c9dde13 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,12 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_kevent_get_events	318
+#define __NR_kevent_ctl		319
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 320
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..61363e0 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,14 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..d3ff0cd
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,302 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must exactly match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. KEVENT_SOCKET_ACCEPT, KEVENT_INODE_CREATE, KEVENT_TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time a new event has been caught. */
+/* @enqueue is called each time a new event is queued. */
+/* @dequeue is called each time an event is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	u32			flags;
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages. 
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one for each poll_wait() call made from ->poll().
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
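
As an illustrative aside (not part of the patch), here is a hedged sketch of
how an origin is expected to embed and drive this structure, mirroring the
pattern the timer and poll notifications below use; struct my_origin and its
helpers are hypothetical:

	struct my_origin {
		struct kevent_storage	st;	/* queued kevents for this origin */
		/* ... origin-specific state ... */
	};

	static int my_origin_create(struct my_origin *o)
	{
		/* Tie the storage back to its origin and init the queue. */
		return kevent_storage_init(o, &o->st);
	}

	static void my_origin_activity(struct my_origin *o, u32 event)
	{
		/* Fire every queued kevent whose event mask matches. */
		kevent_storage_ready(&o->st, NULL, event);
	}

	static void my_origin_destroy(struct my_origin *o)
	{
		/* Break all queued kevents so the origin can go away. */
		kevent_storage_fini(&o->st);
	}
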
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..8609910 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,7 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..31ea7b2
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,59 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the kernel event queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO
+	  callback invocations, advanced timer notifications and other
+	  kernel object status change notifications.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default n
+	help
+	  This option turns on kevent_user statistics collection.
+	  The statistics include the total number of kevents, the number of
+	  kevents that were ready immediately at insertion time and the
+	  number of kevents that were removed through readiness completion.
+	  They are printed each time a control kevent descriptor is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications of socket operations through the
+	  KEVENT subsystem, such as new packet arrival, readiness to
+	  accept connections and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications of inode operations through the
+	  KEVENT subsystem, such as file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for
+	  poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO
+	  operations. AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..03430c9
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,251 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used, since kevent_enqueue() would
+ * refuse to add it to the origin's queue and would set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue. 
+ * It does not decrease the origin's reference counter in any way;
+ * it must be called before that counter is dropped, so the storage
+ * itself is still valid. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret, rem = 0;
+	unsigned long flags;
+
+	ret = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (ret < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!ret)
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && k->flags & KEVENT_STORAGE) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Check if kevent is ready (by invoking its callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			(*ready_callback)(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, which removes them from the storage,
+ * so the storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point
+ * (a socket, for example, has already been removed from the file table).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+static int __init kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
+
+late_initcall(kevent_sys_init);
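
As a hedged aside, this is roughly how a new notification type would plug
into the core above (KEVENT_FOO and the kevent_foo_* handlers are
hypothetical; the timer and poll files below do exactly this for their
slots):

	static int __init kevent_foo_init(void)
	{
		struct kevent_callbacks *fc = &kevent_registered_callbacks[KEVENT_FOO];

		fc->enqueue  = &kevent_foo_enqueue;	/* attach kevent to origin's storage */
		fc->dequeue  = &kevent_foo_dequeue;	/* detach it, drop origin reference */
		fc->callback = &kevent_foo_callback;	/* >0 ready, 0 not ready, <0 broken */

		return 0;
	}
	late_initcall(kevent_foo_init);
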
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..f175edd
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,119 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&st->lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_timer);
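
Note that the timer callback above re-arms itself with mod_timer(), so a
timer kevent is periodic by default; KEVENT_REQ_ONESHOT makes it a single
delivery. A hedged userspace fill-in for illustration:

	struct ukevent uk;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;
	uk.id.raw[0] = 500;			/* period in msecs */
	uk.req_flags = KEVENT_REQ_ONESHOT;	/* dequeued after first delivery */
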
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..7d699aa
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,959 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static char kevent_name[] = "kevent";
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice kevent_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = kevent_name,
+	.fops = &kevent_user_fops,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xbcdbcdbcdul, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+/*
+ * kevent descriptors are pollable; return POLLIN and POLLRDNORM
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	unsigned int *idx;
+	
+	idx = (unsigned int *)u->pring[0];
+	idx[0] = num;
+}
+
+/*
+ * Note that kevents do not exactly fill the page (each ukevent is 40 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * Take that into account if you want to change the size of struct ukevent.
+ */
+#define KEVENTS_ON_PAGE (PAGE_SIZE/sizeof(struct ukevent))
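+
+/*
+ * Worked example (a hedged aside assuming 4096-byte pages and the 40-byte
+ * struct ukevent above): KEVENTS_ON_PAGE is 4096/40 = 102, so the
+ * 4096-entry ring spans ALIGN(4096*40 + 4, PAGE_SIZE)/PAGE_SIZE = 41
+ * pages, the first of which loses 4 bytes to the index word.
+ */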
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int *idx_ptr, idx, pidx, off;
+	struct ukevent *ukev;
+	
+	idx_ptr = (unsigned int *)k->user->pring[0];
+	idx = idx_ptr[0];
+
+	pidx = idx/KEVENTS_ON_PAGE;
+	off = idx%KEVENTS_ON_PAGE;
+
+	if (pidx == 0)
+		ukev = (struct ukevent *)(k->user->pring[pidx] + sizeof(unsigned int));
+	else
+		ukev = (struct ukevent *)(k->user->pring[pidx]);
+
+	memcpy(&ukev[off], &k->event, sizeof(struct ukevent));
+
+	idx++;
+	if (idx >= KEVENT_MAX_EVENTS)
+		idx = 0;
+
+	idx_ptr[0] = idx;
+}
+
+/*
+ * Initialize mmap ring buffer.
+ * It stores ready kevents, so userspace can fetch them directly instead
+ * of using a syscall. Essentially the syscall becomes just a waiting point.
+ */
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	for (i=0; i<pnum; ++i) {
+		u->pring[i] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[i]) {
+			pnum = i;
+			goto err_out_free;
+		}
+	}
+
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+	
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+
+/*
+ * Allocate new kevent userspace control entry.
+ */
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+
+/*
+ * Kevent userspace control block reference counting.
+ * It is set to 1 at creation time; when the corresponding kevent file
+ * descriptor is closed, the reference counter is decreased.
+ * When the counter hits zero, the block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+/*
+ * Mmap implementation for ring buffer, which is created as array
+ * of pages, so vm_pgoff is an offset (in pages, not in bytes) of
+ * the first page to be mapped.
+ */
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start, psize;
+	int pnum = size/PAGE_SIZE, i;
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	psize = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct ukevent) + sizeof(unsigned int), PAGE_SIZE);
+
+	if (size + vma->vm_pgoff*PAGE_SIZE != psize)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	for (i=0; i<pnum; ++i) {
+		if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[i+vma->vm_pgoff]), PAGE_SIZE,
+					vma->vm_page_prot))
+			return -EFAULT;
+		start += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * The entry is freed in the RCU callback; it has been dequeued from
+ * all lists at this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Complete kevent removal - dequeue the kevent from the storage list
+ * if requested, remove it from the ready list, drop the userspace
+ * control block reference counter and schedule kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY) {
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect 
+ * kevent->kevent_entry removal.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove the kevent from the user's list of all events,
+ * dequeue it from its storage and decrease the user's reference counter,
+ * since this kevent no longer exists. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+/*
+ * Search a kevent inside hash bucket for given ukevent.
+ */
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			ret = k;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches the userspace control block from the file descriptor
+ * and decreases its reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+/*
+ * Read all ukevents from userspace and modify the matching kevents.
+ * If the provided number of ukevents is above the threshold, it is faster
+ * to allocate room for all of them and copy them in one shot than to copy
+ * and process them one by one.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read all ukevents from userspace and remove the matching kevents.
+ * If the provided number of ukevents is above the threshold, it is faster
+ * to allocate room for all of them and copy them in one shot than to copy
+ * and process them one by one.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * its reference counter.
+ */
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	k->flags |= KEVENT_USER;
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 0);
+	} 
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one 
+ * and add them into appropriate kevent_storages, 
+ * e.g. sockets, inodes and so on...
+ * Ready events replace the ones provided by the user, and the number
+ * of ready events is returned.
+ * The user must check the ret_flags field of each ukevent structure
+ * to determine whether it is a fired or a failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_stat_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_stat_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		wait_event_interruptible_timeout(u->wait, 
+			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent)))
+			break;
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+/*
+ * Userspace control block creation and initialization.
+ */
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u || num > KEVENT_MAX_EVENTS)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - timeout in milliseconds to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - unused for now (will be used for the mmap implementation).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create kevent cache and register
+ * filesystem to get control file descriptors from.
+ */
+static int __devinit kevent_user_init(void)
+{
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	err = misc_register(&kevent_miscdev);
+	if (err) {
+		printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err);
+		goto err_out_exit;
+	}
+
+	printk("KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_exit:
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	misc_deregister(&kevent_miscdev);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8d3769b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);

-- 
	Evgeniy Polyakov

^ permalink raw reply related	[flat|nested] 160+ messages in thread

* Re: [take7 1/1] kevent: core files and timer/poll notifications.
  2006-08-10 12:16                                 ` [take7 1/1] kevent: core files and timer/poll notifications Evgeniy Polyakov
@ 2006-08-10 12:22                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-10 12:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, netdev, Zach Brown

On Thu, Aug 10, 2006 at 04:16:38PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> With this patchset the request rate reaches 2500 requests/sec, while with
> epoll/kqueue and similar techniques it is about 1600-1800 requests per
> second on my test hardware and a trivial web server.

Nope, that is an old record from the archives... The current one is 2600+.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-10  8:22                                       ` Evgeniy Polyakov
@ 2006-08-11  0:56                                         ` Andrew Morton
  2006-08-11  6:15                                           ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-11  0:56 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, 10 Aug 2006 12:22:35 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> On Thu, Aug 10, 2006 at 01:02:54AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > > Afaict this mmap function gives a user a free way of getting pinned memory. 
> > > > What is the upper bound on the amount of memory which a user can thus
> > > > obtain?
> > > 
> > > it is limited by the maximum queue length, which is 4k entries right now, so
> > > the maximum number of pages here is 4k*40/page_size, i.e. about 40 pages on
> > > x86.
> > 
> > Is that per user or per fd?  If the latter that is, with the usual
> > RLIMIT_NOFILE, 160MBytes.  2GB with 64k pagesize.  Problem ;)
> 
> Per kevent fd.
> I have some ideas about a better mmap ring implementation, which would
> dynamically grow its buffer when events are added and reuse the same
> place for the next events, but there are some nitpicks unresolved yet.
> Let's leave that out of the next releases (no merge of course), until a
> better solution is ready. I will change that area when other things are ready.

This is not a problem with the mmap interface per-se.  If the proposed
event code permits each user to pin 160MB of kernel memory then that would
be a serious problem.
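
For concreteness, the arithmetic behind those figures (assuming the take7
constants KEVENT_MAX_EVENTS = 4096, a 40-byte struct ukevent and the usual
RLIMIT_NOFILE of 1024):

	ring per kevent fd  = 4096 events * 40 bytes = 160 KB
	                      (41 pinned 4 KB pages once the 4-byte index is added)
	worst case per user = 1024 fds * 160 KB = 160 MB of pinned kernel memory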



^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  0:56                                         ` Andrew Morton
@ 2006-08-11  6:15                                           ` Evgeniy Polyakov
  2006-08-11  6:23                                             ` Andrew Morton
  2006-08-11  6:25                                             ` Ulrich Drepper
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  6:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 10, 2006 at 05:56:39PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > Per kevent fd.
> > I have some ideas about better mmap ring implementation, which would
> > dinamically grow it's buffer when events are added and reuse the same
> > place for next events, but there are some nitpics unresolved yet.
> > Let's not see there in next releases (no merge of course), until better 
> > solution is ready. I will change that area when other things are ready.
> 
> This is not a problem with the mmap interface per-se.  If the proposed
> event code permits each user to pin 160MB of kernel memory then that would
> be a serious problem.

The main disadvantage is that all memory is allocated at the start even
if it will not be used later. I think dynamic growth is the appropriate
solution, since the user will have that memory in use anyway once kevents
are allocated; just part of them will be allocated from possibly
mmapped memory.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:15                                           ` Evgeniy Polyakov
@ 2006-08-11  6:23                                             ` Andrew Morton
  2006-08-11  6:30                                               ` Evgeniy Polyakov
  2006-08-11  6:25                                             ` Ulrich Drepper
  1 sibling, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-11  6:23 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Fri, 11 Aug 2006 10:15:35 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> On Thu, Aug 10, 2006 at 05:56:39PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > Per kevent fd.
> > > I have some ideas about a better mmap ring implementation, which would
> > > dynamically grow its buffer when events are added and reuse the same
> > > space for subsequent events, but there are some unresolved nitpicks.
> > > Let's not go there in the next releases (no merge, of course) until a
> > > better solution is ready. I will change that area when other things are ready.
> > 
> > This is not a problem with the mmap interface per se.  If the proposed
> > event code permits each user to pin 160MB of kernel memory then that would
> > be a serious problem.
> 
> The main disadvantage is that all memory is allocated at the start even
> if it is never used later. I think dynamic growth is the appropriate
> solution: the user will have that memory in use anyway, since kevents
> are allocated; just part of them will be allocated from possibly
> mmapped memory.

But the worst-case remains the same, doesn't it?  160MB of pinned kernel
memory per user?


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:15                                           ` Evgeniy Polyakov
  2006-08-11  6:23                                             ` Andrew Morton
@ 2006-08-11  6:25                                             ` Ulrich Drepper
  2006-08-11  6:33                                               ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Ulrich Drepper @ 2006-08-11  6:25 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Andrew Morton, lkml, David Miller, netdev, Zach Brown

Evgeniy Polyakov wrote:
> The main disadvantage is that all memory is allocated at the start even
> if it is never used later. I think dynamic growth is the appropriate
> solution: the user will have that memory in use anyway, since kevents
> are allocated,

If you _allocate_ memory at startup you're doing something wrong.  All
you should do is allocate address space.  Memory should be allocated
when it is needed.

Growing a memory region is always hard because it means you cannot keep
any addresses around and always have to reload a base pointer.  That's
not ideal.

Especially on 64-bit machines address space really is no limitation
anymore.  So, allocate as much as needed, allocate memory when it's
needed, and don't resize.
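
A minimal userspace sketch of this pattern - hypothetical, assuming a
kevent fd whose mmap() implementation faults pages in on demand
(MAX_RING_BYTES and kevent_fd are placeholders):

	#include <sys/mman.h>
	#include <err.h>

	/* Map the maximum ring size up front.  Only address space is
	 * consumed here; the kernel allocates real pages lazily as the
	 * process touches them. */
	void *ring = mmap(NULL, MAX_RING_BYTES, PROT_READ, MAP_SHARED,
			  kevent_fd, 0);
	if (ring == MAP_FAILED)
		err(1, "mmap");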

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:23                                             ` Andrew Morton
@ 2006-08-11  6:30                                               ` Evgeniy Polyakov
  2006-08-11  7:04                                                 ` Andrew Morton
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  6:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Thu, Aug 10, 2006 at 11:23:40PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> On Fri, 11 Aug 2006 10:15:35 +0400
> Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> 
> > On Thu, Aug 10, 2006 at 05:56:39PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > > Per kevent fd.
> > > > I have some ideas about a better mmap ring implementation, which would
> > > > dynamically grow its buffer when events are added and reuse the same
> > > > space for subsequent events, but there are some unresolved nitpicks.
> > > > Let's not go there in the next releases (no merge, of course) until a
> > > > better solution is ready. I will change that area when other things are ready.
> > > 
> > > This is not a problem with the mmap interface per se.  If the proposed
> > > event code permits each user to pin 160MB of kernel memory then that would
> > > be a serious problem.
> > 
> > The main disadvantage is that all memory is allocated at the start even
> > if it is never used later. I think dynamic growth is the appropriate
> > solution: the user will have that memory in use anyway, since kevents
> > are allocated; just part of them will be allocated from possibly
> > mmapped memory.
> 
> But the worst-case remains the same, doesn't it?  160MB of pinned kernel
> memory per user?

Yes. And now I think dynamic growth is not a good solution, since the user
cannot know when he must call mmap() again to get additional pages
(although I have some hacks to "dynamically" replace previously mmapped
pages with new ones).

This area can be decreased down to 70MB by reducing the amount of
information placed into the buffer (only the user's data and flags) without
additional hints.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:25                                             ` Ulrich Drepper
@ 2006-08-11  6:33                                               ` Evgeniy Polyakov
  2006-08-11  6:38                                                 ` David Miller
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  6:33 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Andrew Morton, lkml, David Miller, netdev, Zach Brown

On Thu, Aug 10, 2006 at 11:25:05PM -0700, Ulrich Drepper (drepper@redhat.com) wrote:
> Evgeniy Polyakov wrote:
> > The main disadvantage is that all memory is allocated at the start even
> > if it is never used later. I think dynamic growth is the appropriate
> > solution: the user will have that memory in use anyway, since kevents
> > are allocated,
> 
> If you _allocate_ memory at startup you're doing something wrong.  All
> you should do is allocate address space.  Memory should be allocated
> when it is needed.
> 
> Growing a memory region is always hard because it means you cannot keep
> any addresses around and always have to reload a base pointer.  That's
> not ideal.
>
> Especially on 64-bit machines address space really is no limitation
> anymore.  So, allocate as much as needed, allocate memory when it's
> needed, and don't resize.

That requires mmap hacks to substitute pages at run time without
notifying the user. I do not think it is a good solution, since on x86 it
requires a full TLB flush (at least when I did it, there were no exported
methods to flush individual addresses).

> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
> 



-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:33                                               ` Evgeniy Polyakov
@ 2006-08-11  6:38                                                 ` David Miller
  2006-08-11  6:55                                                   ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-11  6:38 UTC (permalink / raw)
  To: johnpol; +Cc: drepper, akpm, linux-kernel, netdev, zach.brown

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Fri, 11 Aug 2006 10:33:53 +0400

> That requires mmap hacks to substitute pages at run time without
> notifying the user. I do not think it is a good solution, since on x86 it
> requires a full TLB flush (at least when I did it, there were no exported
> methods to flush individual addresses).

You just need to provide a do_no_page method; the VM layer will
take care of the page-level flushing or whatever else might be
needed.
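
Roughly the shape such a handler takes on this kernel (a sketch only;
u->pring and max_pages are assumptions about the surrounding driver
state, not the exact kevent code):

	static struct page *kevent_user_nopage(struct vm_area_struct *vma,
			unsigned long addr, int *type)
	{
		struct kevent_user *u = vma->vm_file->private_data;
		unsigned long off = (addr - vma->vm_start) / PAGE_SIZE;

		if (off >= max_pages)
			return NOPAGE_SIGBUS;
		if (!u->pring[off]) {
			/* Allocate lazily on the first fault of this page. */
			u->pring[off] = __get_free_page(GFP_KERNEL);
			if (!u->pring[off])
				return NOPAGE_SIGBUS;
		}
		if (type)
			*type = VM_FAULT_MINOR;
		return virt_to_page(u->pring[off]);
	}

The VM wires the returned page into the user's page tables and does any
per-page flushing itself.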


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:38                                                 ` David Miller
@ 2006-08-11  6:55                                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  6:55 UTC (permalink / raw)
  To: David Miller; +Cc: drepper, akpm, linux-kernel, netdev, zach.brown

On Thu, Aug 10, 2006 at 11:38:26PM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Fri, 11 Aug 2006 10:33:53 +0400
> 
> > That requires mmap hacks to substitute pages at run time without
> > notifying the user. I do not think it is a good solution, since on x86 it
> > requires a full TLB flush (at least when I did it, there were no exported
> > methods to flush individual addresses).
> 
> You just need to provide a do_no_page method; the VM layer will
> take care of the page-level flushing or whatever else might be
> needed.

Yes, it is the simplest way to extend the mapping (though not to replace
pages which are already mapped), but such hacks are not needed for kevent,
which only expects to extend the mapping when the number of ready kevents
increases.

So I will create such an implementation and will place a reduced amount of
info into those pages.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  6:30                                               ` Evgeniy Polyakov
@ 2006-08-11  7:04                                                 ` Andrew Morton
  2006-08-11  7:27                                                   ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-11  7:04 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Fri, 11 Aug 2006 10:30:21 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> On Thu, Aug 10, 2006 at 11:23:40PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > On Fri, 11 Aug 2006 10:15:35 +0400
> > Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> > 
> > > On Thu, Aug 10, 2006 at 05:56:39PM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > > > Per kevent fd.
> > > > > I have some ideas about a better mmap ring implementation, which would
> > > > > dynamically grow its buffer when events are added and reuse the same
> > > > > space for subsequent events, but there are some unresolved nitpicks.
> > > > > Let's not go there in the next releases (no merge, of course) until a
> > > > > better solution is ready. I will change that area when other things are ready.
> > > > 
> > > > This is not a problem with the mmap interface per se.  If the proposed
> > > > event code permits each user to pin 160MB of kernel memory then that would
> > > > be a serious problem.
> > > 
> > > The main disadvantage is that all memory is allocated at the start even
> > > if it is never used later. I think dynamic growth is the appropriate
> > > solution: the user will have that memory in use anyway, since kevents
> > > are allocated; just part of them will be allocated from possibly
> > > mmapped memory.
> > 
> > But the worst-case remains the same, doesn't it?  160MB of pinned kernel
> > memory per user?
> 
> Yes. And now I think dynamic growth is not a good solution, since the user
> cannot know when he must call mmap() again to get additional pages
> (although I have some hacks to "dynamically" replace previously mmapped
> pages with new ones).
> 
> This area can be decreased down to 70MB by reducing the amount of
> information placed into the buffer (only the user's data and flags) without
> additional hints.
> 

70MB is still very bad, naturally.

There are other ways in which users can do this sort of thing - passing
fds across sockets and allocating zillions of pagetables come to mind.  But
we don't want to add more.

Possible options:

- Add a new rlimit for the number of kevent fd's

- Add a new rlimit for the amount of kevent memory

- Add a new rlimit for the total amount of pinned kernel memory.  First
  user is kevent.

- Account a kevent fd as being worth 100 regular fds, so the naughty user
  hits EMFILE early (ug).

A new rlimit is attractive, and rlimits are easy to add.  The problem is,
userspace support is hard (I think).  AFAIK a standard Linux system doesn't
have global and per-user rlimit config files which are parsed and acted upon
at login; that would make rlimits more useful.
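
As a sketch of what the setup side might look like with a hypothetical
RLIMIT_KEVENT resource (no such resource exists in the tree today):

	#include <sys/resource.h>
	#include <stdio.h>

	/* Hypothetical per-user cap on pinned kevent ring memory, in bytes. */
	struct rlimit rl = { .rlim_cur = 1 << 20, .rlim_max = 1 << 20 };

	if (setrlimit(RLIMIT_KEVENT, &rl) < 0)	/* RLIMIT_KEVENT: hypothetical */
		perror("setrlimit");

The kernel side would then charge each ring page against the owner's
limit and fail the allocation once the limit is exceeded.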

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take6 1/3] kevent: Core files.
  2006-08-11  7:04                                                 ` Andrew Morton
@ 2006-08-11  7:27                                                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  7:27 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Fri, Aug 11, 2006 at 12:04:54AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > This area can be decreased down to 70MB by reducing the amount of
> > information placed into the buffer (only the user's data and flags) without
> > additional hints.
> > 
> 
> 70MB is still very bad, naturally.

Actually I do not think that 4k events is a good choice - I expect people
will scale it to tens of thousands at least, so we definitely do not want
to allow the user to create way too many kevent fds.

> There are other ways in which users can do this sort of thing - passing
> fds across sockets and allocating zillions of pagetables come to mind.  But
> we don't want to add more.
> 
> Possible options:
> 
> - Add a new rlimit for the number of kevent fd's
> 
> - Add a new rlimit for the amount of kevent memory
> 
> - Add a new rlimit for the total amount of pinned kernel memory.  First
>   user is kevent.

I think this rlimit and the first one are the best choices.

> - Account a kevent fd as being worth 100 regular fds, so the naughty user
>   hits EMFILE early (ug).
> 
> A new rlimit is attractive, and rlimits are easy to add.  The problem is,
> userspace support is hard (I think).  AFAIK a standard Linux system doesn't
> have global and per-user rlimit config files which are parsed and acted upon
> at login; that would make rlimits more useful.

For now it is possible to use the stack-size rlimit, for example.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take8 0/2] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (6 preceding siblings ...)
  2006-08-09  8:02                         ` [take6 0/3] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-11  8:40                         ` Evgeniy Polyakov
  2006-08-11  8:40                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-14  6:20                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
                                           ` (2 subsequent siblings)
  10 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  8:40 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Generic event handling mechanism.

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use the nopage() method to dynamically substitute pages
	- allocate a new page for events only when a newly added kevent requires it
	- do not use ugly index dereferencing, use a structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning to detect whether an entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused variables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * silenced lockdep warnings - all storage locks are initialized in the same function, so lockdep was taught
	to differentiate between the various cases
 * remove kevent from storage if it is marked as broken after the callback
 * fixed a typo in the mmapped buffer implementation which resulted in a wrong index calculation

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added an optimisation aimed at the case when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removal, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return an error if the provided descriptor does not match
	the kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
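
For testing, the syscalls can be driven without glibc support via raw
syscall(2) wrappers - a sketch, using the i386 numbers added by this
patchset:

	#include <sys/syscall.h>
	#include <unistd.h>

	#define __NR_kevent_get_events	318
	#define __NR_kevent_ctl		319

	static long kevent_ctl(int fd, unsigned cmd, unsigned num, void *buf)
	{
		return syscall(__NR_kevent_ctl, fd, cmd, num, buf);
	}

	static long kevent_get_events(int fd, unsigned min, unsigned max,
			unsigned timeout, void *buf, unsigned flags)
	{
		return syscall(__NR_kevent_get_events, fd, min, max,
				timeout, buf, flags);
	}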



^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take8 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-11  8:40                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
@ 2006-08-11  8:40                             ` Evgeniy Polyakov
  2006-08-11 15:45                               ` Andrew Morton
  2006-08-13  0:51                             ` [take8 1/2] kevent: Core files Jeff Carr
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  8:40 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similarly to epoll and has the same issues (the callback
is invoked not from the internal state machine of the caller, but through
a process wakeup).

Timer notifications can be used for fine-grained per-process time
management, since interval timers are very inconvenient to use
and are limited in number.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
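+/*
+ * poll_table callback: ->poll() invokes this for every waitqueue it
+ * wants the caller to sleep on; instead of sleeping, we hook our own
+ * wait-queue callback into each of them.
+ */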
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..f175edd
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,119 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
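+	/* The kevent_storage was co-allocated directly after the timer above. */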
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&st->lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_timer);
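
A usage sketch for the timer part, assuming the raw syscall wrappers
from the cover mail and an already-opened kevent fd:

	struct ukevent uk;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;
	uk.id.raw[0] = 1000;	/* timeout in msecs, see kevent_timer_enqueue() */
	uk.req_flags = KEVENT_REQ_ONESHOT;	/* fire once, then auto-dequeue */

	if (kevent_ctl(kevent_fd, KEVENT_CTL_ADD, 1, &uk) < 0)
		perror("kevent_ctl");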


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take8 1/2] kevent: Core files.
  2006-08-11  8:40                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-11  8:40                           ` Evgeniy Polyakov
  2006-08-11  8:40                             ` [take8 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  2006-08-13  0:51                             ` [take8 1/2] kevent: Core files Jeff Carr
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-11  8:40 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace control
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..091ff42 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,5 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..b2af4a8 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,6 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..c9dde13 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,12 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_kevent_get_events	318
+#define __NR_kevent_ctl		319
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 320
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..61363e0 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,14 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..64ef706
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,309 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must exactly match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+struct mukevent
+{
+	struct kevent_id	id;
+	__u32			ret_flags;
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	u32			flags;
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages. 
+							 * The poll()/select() storage keeps a list of wait_queue_t 
+							 * containers here, one for each poll_wait() call made from ->poll().
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned int		pages_in_use;
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..8609910 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,7 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..31ea7b2
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,59 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the event queue mechanism.
+	  It can be used as a replacement for poll()/select(), AIO callback 
+	  invocations, advanced timer notifications and notifications of 
+	  other kernel object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default N
+	help
+	  This option turns kevent_user statistics collection on.
+	  The statistics include the total number of kevents, the number of 
+	  kevents which are ready immediately at insertion time and the number 
+	  of kevents which were removed through readiness completion. 
+	  They will be printed each time a control kevent descriptor is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of 
+	  socket operations, like new-packet-received conditions, 
+	  ready-for-accept conditions and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of 
+	  inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select() 
+	  notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO operations.
+	  AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..03430c9
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,251 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if this event is ready immediately,
+ * a negative value in case of error and zero if the event has been queued.
+ * The ->enqueue() callback must increase the origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before an event is added into some origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used, or kevent_enqueue() will fail
+ * to add this kevent into the origin's queue, setting the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way;
+ * it must be called before that decrease, so the storage itself is still valid.
+ * It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret, rem = 0;
+	unsigned long flags;
+
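+	/* Callback convention: > 0 - ready, < 0 - broken/error, 0 - not ready yet. */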
+	ret = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (ret < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!ret)
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Check if the kevent is ready (by invoking its callback) and requeue/remove
+ * it if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			(*ready_callback)(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, which will remove them from the storage,
+ * so the storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point.
+ * (The socket is removed from the file table at this point, for example.)
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+static int __init kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..237151c
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1004 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static char kevent_name[] = "kevent";
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice kevent_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = kevent_name,
+	.fops = &kevent_user_fops,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xbcdbcdbcdul, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+/*
+ * kevents are pollable; return POLLIN and POLLRDNORM 
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * Note that mukevents do not exactly fill the page (each mukevent is 12 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * Take that into account if you want to change the size of struct mukevent.
+ */
+#define KEVENTS_ON_PAGE ((PAGE_SIZE-sizeof(unsigned int))/sizeof(struct mukevent))
+struct kevent_mring
+{
+	unsigned int		index;
+	struct mukevent		event[KEVENTS_ON_PAGE];
+};
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index = num;
+}
+
+static inline void kevent_user_ring_inc(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index++;
+}
+
+static int kevent_user_ring_grow(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+	unsigned int idx;
+
+	ring = (struct kevent_mring *)u->pring[0];
+
+	idx = (ring->index + 1) / KEVENTS_ON_PAGE;
+	if (idx >= u->pages_in_use) {
+		u->pring[idx] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[idx])
+			return -ENOMEM;
+		u->pages_in_use++;
+	}
+	return 0;
+}
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int pidx, off;
+	struct kevent_mring *ring, *copy_ring;
+
+	ring = (struct kevent_mring *)k->user->pring[0];
+	
+	pidx = ring->index / KEVENTS_ON_PAGE;
+	off = ring->index % KEVENTS_ON_PAGE;
+
+	copy_ring = (struct kevent_mring *)k->user->pring[pidx];
+
+	copy_ring->event[off].id.raw[0] = k->event.id.raw[0];
+	copy_ring->event[off].id.raw[1] = k->event.id.raw[1];
+	copy_ring->event[off].ret_flags = k->event.ret_flags;
+
+	if (++ring->index >= KEVENT_MAX_EVENTS)
+		ring->index = 0;
+}
+
+/*
+ * Initialize mmap ring buffer.
+ * It will store ready kevents, so userspace can get them directly instead
+ * of using a syscall. Essentially the syscall becomes just a waiting point.
+ */
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	u->pring = kzalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	u->pring[0] = __get_free_page(GFP_KERNEL);
+	if (!u->pring[0])
+		goto err_out_free;
+
+	u->pages_in_use = 1;
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+	
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+
+/*
+ * Allocate new kevent userspace control entry.
+ */
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+
+/*
+ * Kevent userspace control block reference counting.
+ * The counter is set to 1 at creation time and decreased when the
+ * corresponding kevent file descriptor is closed.
+ * When the counter hits zero, the block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
+{
+	struct kevent_user *u = vma->vm_file->private_data;
+	unsigned long off = (addr - vma->vm_start)/PAGE_SIZE;
+	unsigned int pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	if (off >= pnum)
+		goto err_out_sigbus;
+
+	/* Allocate the page lazily on first fault, unless it already
+	 * exists (kevent_user_ring_grow() may have allocated it). */
+	if (!u->pring[off]) {
+		u->pring[off] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[off])
+			goto err_out_sigbus;
+	}
+
+	return virt_to_page(u->pring[off]);
+
+err_out_sigbus:
+	return NOPAGE_SIGBUS;
+}
+
+static struct vm_operations_struct kevent_user_vm_ops = {
+	.nopage = &kevent_user_nopage,
+};
+
+/*
+ * Mmap implementation for ring buffer, which is created as array
+ * of pages, so vm_pgoff is an offset (in pages, not in bytes) of
+ * the first page to be mapped.
+ */
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_ops = &kevent_user_vm_ops;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_file = file;
+
+	if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[0]) >> PAGE_SHIFT,
+				PAGE_SIZE, vma->vm_page_prot))
+		return -EFAULT;
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free the entry in an RCU callback; it has been dequeued from all lists
+ * at this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY) {
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect removal
+ * of kevent->kevent_entry.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from the user's list of all events,
+ * dequeue it from its storage and drop the user's reference counter,
+ * since this kevent no longer exists. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+/*
+ * Search a kevent inside hash bucket for given ukevent.
+ */
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			ret = k;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detach the userspace control block from the file descriptor
+ * and decrease its reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If the provided number of ukevents is above the threshold, it is faster
+ * to allocate room for them and copy them in one shot instead of copying
+ * one-by-one and then processing them.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If the provided number of ukevents is above the threshold, it is faster
+ * to allocate room for them and copy them in one shot instead of copying
+ * one-by-one and then processing them.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * its reference counter.
+ */
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	k->flags |= KEVENT_USER;
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	if (kevent_user_ring_grow(u)) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 0);
+	} else {
+		kevent_user_ring_inc(u);
+	}
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one 
+ * and add them into appropriate kevent_storages, 
+ * e.g. sockets, inodes and so on...
+ * Ready events will replace the ones provided by the user, and the number
+ * of ready events is returned.
+ * The user must check the ret_flags field of each ukevent structure
+ * to determine whether it is a fired or a failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_stat_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_stat_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		wait_event_interruptible_timeout(u->wait, 
+			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent)))
+			break;
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+/*
+ * Userspace control block creation and initialization.
+ */
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u || num > KEVENT_MAX_EVENTS)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - timeout in milliseconds to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - unused for now (will be used for mmap implementation).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create kevent cache and register
+ * filesystem to get control file descriptors from.
+ */
+static int __devinit kevent_user_init(void)
+{
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	err = misc_register(&kevent_miscdev);
+	if (err) {
+		printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err);
+		goto err_out_exit;
+	}
+
+	printk("KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_exit:
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	misc_deregister(&kevent_miscdev);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8d3769b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);
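
For illustration, a minimal userspace sketch of the resulting API: create
a queue with KEVENT_CTL_INIT, register one event and wait for it to fire.
This snippet is not part of the patch; it assumes both syscalls are wired
up, and that the timer event type (KEVENT_TIMER/KEVENT_TIMER_FIRED) from
the companion notification patch is available.

	#include <sys/syscall.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/kevent.h>

	int main(void)
	{
		struct ukevent uk, ready;
		int kev_fd, n;

		/* Obtain a kevent control file descriptor. */
		kev_fd = syscall(__NR_kevent_ctl, 0, KEVENT_CTL_INIT, 0, NULL);
		if (kev_fd < 0)
			return 1;

		/* Register a one-shot timer event firing after 1000 msecs. */
		memset(&uk, 0, sizeof(uk));
		uk.type = KEVENT_TIMER;
		uk.event = KEVENT_TIMER_FIRED;
		uk.id.raw[0] = 1000;
		uk.req_flags = KEVENT_REQ_ONESHOT;
		if (syscall(__NR_kevent_ctl, kev_fd, KEVENT_CTL_ADD, 1, &uk) < 0)
			return 1;

		/* Wait up to 2000 msecs for at least one ready event. */
		n = syscall(__NR_kevent_get_events, kev_fd, 1, 1, 2000, &ready, 0);

		return n == 1 ? 0 : 1;
	}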



* Re: [take8 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-11  8:40                             ` [take8 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-11 15:45                               ` Andrew Morton
  2006-08-12  8:18                                 ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-11 15:45 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Fri, 11 Aug 2006 12:40:10 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> 
> poll/select() notifications. Timer notifications.
> 
> This patch includes generic poll/select and timer notifications.
> 
> kevent_poll works similarly to epoll and has the same issues (the callback
> is invoked not from the internal state machine of the caller, but through
> process wakeup).
> 
> Timer notifications can be used for fine grained per-process time 
> management, since interval timers are very inconvenient to use, 
> and they are limited.
> 
> ...
>
> +static struct lock_class_key kevent_poll_key;
> +
> +void kevent_poll_reinit(struct file *file)
> +{
> +	lockdep_set_class(&file->st.lock, &kevent_poll_key);
> +}

Why is this necessary?

> +#include <linux/kernel.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/timer.h>
> +#include <linux/jiffies.h>
> +#include <linux/kevent.h>
> +
> +static void kevent_timer_func(unsigned long data)
> +{
> +	struct kevent *k = (struct kevent *)data;
> +	struct timer_list *t = k->st->origin;
> +
> +	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
> +	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
> +}
> +
> +static struct lock_class_key kevent_timer_key;
> +
> +static int kevent_timer_enqueue(struct kevent *k)
> +{
> +	struct timer_list *t;
> +	struct kevent_storage *st;
> +	int err;
> +
> +	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
> +			GFP_KERNEL);
> +	if (!t)
> +		return -ENOMEM;
> +
> +	init_timer(t);
> +	t->function = kevent_timer_func;
> +	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
> +	t->data = (unsigned long)k;

setup_timer().
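
i.e., roughly (a sketch of the equivalent form):

	setup_timer(t, kevent_timer_func, (unsigned long)k);
	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);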

> +	st = (struct kevent_storage *)(t+1);

It would be cleaner to create

	struct <something> {
		struct timer_list timer;
		struct kevent_storage storage;
	};

> +	err = kevent_storage_init(t, st);
> +	if (err)
> +		goto err_out_free;
> +	lockdep_set_class(&st->lock, &kevent_timer_key);

Why is this necessary?

> +	
> +	kevent_storage_dequeue(st, k);
> +	
> +	kfree(t);
> +
> +	return 0;
> +}
> +
> +static int kevent_timer_callback(struct kevent *k)
> +{
> +	struct kevent_storage *st = k->st;
> +	struct timer_list *t = st->origin;
> +
> +	if (!t)
> +		return -ENODEV;
> +	
> +	k->event.ret_data[0] = (__u32)jiffies;

What does this do?

Does it expose jiffies to userspace?

It truncates jiffies on 64-bit machines.

> +late_initcall(kevent_init_timer);

module_init() would be more typical.  If there was a reason for using
late_initcall(), that reason should be commented.



* Re: [take8 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-11 15:45                               ` Andrew Morton
@ 2006-08-12  8:18                                 ` Evgeniy Polyakov
  2006-08-12  8:38                                   ` Andrew Morton
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-12  8:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Fri, Aug 11, 2006 at 08:45:31AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > +static struct lock_class_key kevent_poll_key;
> > +
> > +void kevent_poll_reinit(struct file *file)
> > +{
> > +	lockdep_set_class(&file->st.lock, &kevent_poll_key);
> > +}
> 
> Why is this necessary?

Locks for all storages are initialized in the same function, so lockdep thinks
they are the same lock. When one of them is later held in process
context and another in BH or IRQ context, lockdep screams, so I reinitialize
the locks after spin_lock_init().
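
The pattern is roughly this (a sketch):

	static struct lock_class_key kevent_timer_key;

	/* spin_lock_init() in kevent_storage_init() puts every storage lock
	 * into one lockdep class; give this storage its own class so that
	 * process context vs. BH/IRQ usage is not reported as a deadlock. */
	lockdep_set_class(&st->lock, &kevent_timer_key);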

> > +#include <linux/kernel.h>
> > +#include <linux/types.h>
> > +#include <linux/list.h>
> > +#include <linux/slab.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/timer.h>
> > +#include <linux/jiffies.h>
> > +#include <linux/kevent.h>
> > +
> > +static void kevent_timer_func(unsigned long data)
> > +{
> > +	struct kevent *k = (struct kevent *)data;
> > +	struct timer_list *t = k->st->origin;
> > +
> > +	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
> > +	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
> > +}
> > +
> > +static struct lock_class_key kevent_timer_key;
> > +
> > +static int kevent_timer_enqueue(struct kevent *k)
> > +{
> > +	struct timer_list *t;
> > +	struct kevent_storage *st;
> > +	int err;
> > +
> > +	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
> > +			GFP_KERNEL);
> > +	if (!t)
> > +		return -ENOMEM;
> > +
> > +	init_timer(t);
> > +	t->function = kevent_timer_func;
> > +	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
> > +	t->data = (unsigned long)k;
> 
> setup_timer().

I know about its existence now...

> > +	st = (struct kevent_storage *)(t+1);
> 
> It would be cleaner to create
> 
> 	struct <something> {
> 		struct timer_list timer;
> 		struct kevent_storage storage;
> 	};
> 
> > +	err = kevent_storage_init(t, st);
> > +	if (err)
> > +		goto err_out_free;
> > +	lockdep_set_class(&st->lock, &kevent_timer_key);
> 
> Why is this necessary?

As I said above, kevent_storage_init() initializes the locks for all known
storages (inode, socket, file and so on); when those locks are later
taken from different contexts (obviously a timer callback can not use the
same lock class as, for example, a socket one), lockdep screams.

> > +	
> > +	kevent_storage_dequeue(st, k);
> > +	
> > +	kfree(t);
> > +
> > +	return 0;
> > +}
> > +
> > +static int kevent_timer_callback(struct kevent *k)
> > +{
> > +	struct kevent_storage *st = k->st;
> > +	struct timer_list *t = st->origin;
> > +
> > +	if (!t)
> > +		return -ENODEV;
> > +	
> > +	k->event.ret_data[0] = (__u32)jiffies;
> 
> What does this do?
> 
> Does it expose jiffies to userspace?
> 
> It truncates jiffies on 64-bit machines.

It is a hint of when the timer was stopped.

> > +late_initcall(kevent_init_timer);
> 
> module_init() would be more typical.  If there was a reason for using
> late_initcall(), that reason should be commented.

No, there are no reasons to use late_initcall() in any kevent
initialization function, I do not use module_init() since kevent can not
be modular. It can be replaced with pure __init function.
Should it?

-- 
	Evgeniy Polyakov


* Re: [take8 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-12  8:18                                 ` Evgeniy Polyakov
@ 2006-08-12  8:38                                   ` Andrew Morton
  2006-08-12  8:55                                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Andrew Morton @ 2006-08-12  8:38 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Sat, 12 Aug 2006 12:18:35 +0400
Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> On Fri, Aug 11, 2006 at 08:45:31AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > +static struct lock_class_key kevent_poll_key;
> > > +
> > > +void kevent_poll_reinit(struct file *file)
> > > +{
> > > +	lockdep_set_class(&file->st.lock, &kevent_poll_key);
> > > +}
> > 
> > Why is this necessary?
> 
> > Locks for all storages are initialized in the same function, so lockdep thinks
> > they are the same lock. When one of them is later held in process
> > context and another in BH or IRQ context, lockdep screams, so I reinitialize
> > the locks after spin_lock_init().

So why not simply run spin_lock_init() in the kevent_storage_init() caller?

Does kevent_poll_reinit() have any callers?

> > > +	st = (struct kevent_storage *)(t+1);
> > 
> > It would be cleaner to create
> > 
> > 	struct <something> {
> > 		struct timer_list timer;
> > 		struct kevent_storage storage;
> > 	};

You missed this?

> 
> > > +	
> > > +	kevent_storage_dequeue(st, k);
> > > +	
> > > +	kfree(t);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int kevent_timer_callback(struct kevent *k)
> > > +{
> > > +	struct kevent_storage *st = k->st;
> > > +	struct timer_list *t = st->origin;
> > > +
> > > +	if (!t)
> > > +		return -ENODEV;
> > > +	
> > > +	k->event.ret_data[0] = (__u32)jiffies;
> > 
> > What does this do?
> > 
> > Does it expose jiffies to userspace?
> > 
> > It truncates jiffies on 64-bit machines.
> 
> It is a hint when timer was stopped.

What does that mean?  What is it for?

Does it expose jiffies to userspace?

It truncates jiffies on 64-bit machines.

Please respond to all review comments and questions.

> > > +late_initcall(kevent_init_timer);
> > 
> > module_init() would be more typical.  If there was a reason for using
> > late_initcall(), that reason should be commented.
> 
> No, there are no reasons to use late_initcall() in any kevent
> initialization function, I do not use module_init() since kevent can not
> be modular. It can be replaced with pure __init function.
> Should it?

We use module_init() for non-modular modules all the time.  Try doing
grep module_init */*.c



* Re: [take8 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-12  8:38                                   ` Andrew Morton
@ 2006-08-12  8:55                                     ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-12  8:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, David Miller, Ulrich Drepper, netdev, Zach Brown

On Sat, Aug 12, 2006 at 01:38:35AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> On Sat, 12 Aug 2006 12:18:35 +0400
> Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> 
> > On Fri, Aug 11, 2006 at 08:45:31AM -0700, Andrew Morton (akpm@osdl.org) wrote:
> > > > +static struct lock_class_key kevent_poll_key;
> > > > +
> > > > +void kevent_poll_reinit(struct file *file)
> > > > +{
> > > > +	lockdep_set_class(&file->st.lock, &kevent_poll_key);
> > > > +}
> > > 
> > > Why is this necessary?
> > 
> > Locks for all storages are initialized in the same function, so lockdep thinks
> > they are the same lock. When one of them is later held in process
> > context and another in BH or IRQ context, lockdep screams, so I reinitialize
> > the locks after spin_lock_init().
> 
> So why not simply run spin_lock_init() in the kevent_storage_init() caller?

I separated storage initialization into a special function; although it is
quite simple, it avoids a lot of duplicated steps in
each origin (inode, socket, file, AIO, network AIO, poll, timer and so
on). If some members are later changed, added or removed, there will
be no need to change them all, only that one function.
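
Roughly, every origin then boils down to one call into something like
this (a sketch of the idea, not the exact code):

	int kevent_storage_init(void *origin, struct kevent_storage *st)
	{
		spin_lock_init(&st->lock);
		INIT_LIST_HEAD(&st->list);
		st->origin = origin;
		return 0;
	}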

> Does kevent_poll_reinit() have any callers?

Only poll initialization.

> > > > +	st = (struct kevent_storage *)(t+1);
> > > 
> > > It would be cleaner to create
> > > 
> > > 	struct <something> {
> > > 		struct timer_list timer;
> > > 		struct kevent_storage storage;
> > > 	};
> 
> You missed this?

No problem, I will create such a structure instead of the pointer-based
scheme.
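
Something like this (the name is hypothetical):

	struct kevent_timer
	{
		struct timer_list	timer;
		struct kevent_storage	storage;
	};

	struct kevent_timer *kt = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
	if (!kt)
		return -ENOMEM;
	setup_timer(&kt->timer, kevent_timer_func, (unsigned long)k);
	err = kevent_storage_init(&kt->timer, &kt->storage);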
 
> > > > +	
> > > > +	kevent_storage_dequeue(st, k);
> > > > +	
> > > > +	kfree(t);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int kevent_timer_callback(struct kevent *k)
> > > > +{
> > > > +	struct kevent_storage *st = k->st;
> > > > +	struct timer_list *t = st->origin;
> > > > +
> > > > +	if (!t)
> > > > +		return -ENODEV;
> > > > +	
> > > > +	k->event.ret_data[0] = (__u32)jiffies;
> > > 
> > > What does this do?
> > > 
> > > Does it expose jiffies to userspace?
> > > 
> > > It truncates jiffies on 64-bit machines.
> > 
> > It is a hint when timer was stopped.
> 
> What does that mean?  What is it for?

My test code prints it to stdout and I can show a nice graph of how
precise the timer is.

> Does it expose jiffies to userspace?

It is a timestamp of the fired condition (measured in jiffies); I can not
say whether it exposes jiffies to userspace or not.
Using that field is not a requirement; it can contain zero or any
other number, I just placed jiffies there.

> It truncates jiffies on 64-bit machines.

For the purpose of an event timestamp it is more than enough.

> Please respond to all review comments and questions.

Sorry if something got lost.

> > > > +late_initcall(kevent_init_timer);
> > > 
> > > module_init() would be more typical.  If there was a reason for using
> > > late_initcall(), that reason should be commented.
> > 
> > No, there are no reasons to use late_initcall() in any kevent
> > initialization function, I do not use module_init() since kevent can not
> > be modular. It can be replaced with pure __init function.
> > Should it?
> 
> We use module_init() for non-modular modules all the time.  Try doing
> grep module_init */*.c

Ok, I will use it instead of late_initcall().

-- 
	Evgeniy Polyakov


* Re: [take8 1/2] kevent: Core files.
  2006-08-11  8:40                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-11  8:40                             ` [take8 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-13  0:51                             ` Jeff Carr
  2006-08-13  9:04                               ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Jeff Carr @ 2006-08-13  0:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On 08/11/06 01:40, Evgeniy Polyakov wrote:

> +/*
> + * Inode events.
> + */
> +#define	KEVENT_INODE_CREATE	0x1
> +#define	KEVENT_INODE_REMOVE	0x2

It would be useful to have gnome/kde notification when hard drives start
failing. There was some talk in the past about how to implement that
with kobjects. Perhaps you could add for this purpose:

#define	KEVENT_BLOCK_CREATE	0x1
#define	KEVENT_BLOCK_REMOVE	0x2
#define	KEVENT_BLOCK_ERROR	0x4

AFAICT:
The conversation concluded this is the best way to handle I/O errors:

--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -108,6 +108,8 @@ static void buffer_io_error(struct buffe
 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr);
+
+	kevent_block_error(&bh->b_bdev->bd_disk->kobj);
 }

 /*
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -252,8 +252,11 @@ static void finished_one_bio(struct dio
 				transferred = dio->i_size - offset;

 			/* check for error in completion path */
-			if (dio->io_error)
+			if (dio->io_error) {
 				transferred = dio->io_error;
+				kevent_block_error(
+				&dio->bio->bi_bdev->bd_disk->kobj);
+			}

 			dio_complete(dio, offset, transferred);



* Re: [take8 1/2] kevent: Core files.
  2006-08-13  0:51                             ` [take8 1/2] kevent: Core files Jeff Carr
@ 2006-08-13  9:04                               ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-13  9:04 UTC (permalink / raw)
  To: Jeff Carr
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Sat, Aug 12, 2006 at 05:51:41PM -0700, Jeff Carr (basilarchia@gmail.com) wrote:
> On 08/11/06 01:40, Evgeniy Polyakov wrote:
> 
> > +/*
> > + * Inode events.
> > + */
> > +#define	KEVENT_INODE_CREATE	0x1
> > +#define	KEVENT_INODE_REMOVE	0x2
> 
> It would be useful to have gnome/kde notification when hard drives start
> failing. There was some talk in the past about how to implement that
> with kobjects. Perhaps you could add for this purpose:
> 
> #define	KEVENT_BLOCK_CREATE	0x1
> #define	KEVENT_BLOCK_REMOVE	0x2
> #define	KEVENT_BLOCK_ERROR	0x4

Do we want it to be on top of kobject (like in your patch), or based on
gendisk? I think the latter is better, since a kobject represents almost
any kind of device, so we would create noticeable overhead for those which
do not know about kevents.
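
E.g. something along these lines (a sketch only; it assumes gendisk grows
its own kevent_storage, and the member name is hypothetical):

	void kevent_block_error(struct gendisk *disk)
	{
		kevent_storage_ready(&disk->kevent_st, NULL, KEVENT_BLOCK_ERROR);
	}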

> AFAICT:
> The conversation concluded this is the best way to handle ioerrors:
> 
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -108,6 +108,8 @@ static void buffer_io_error(struct buffe
>  	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
>  			bdevname(bh->b_bdev, b),
>  			(unsigned long long)bh->b_blocknr);
> +
> +	kevent_block_error(&bh->b_bdev->bd_disk->kobj);
>  }
> 
>  /*
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -252,8 +252,11 @@ static void finished_one_bio(struct dio
>  				transferred = dio->i_size - offset;
> 
>  			/* check for error in completion path */
> -			if (dio->io_error)
> +			if (dio->io_error) {
>  				transferred = dio->io_error;
> +				kevent_block_error(
> +				&dio->bio->bi_bdev->bd_disk->kobj);
> +			}
> 
>  			dio_complete(dio, offset, transferred);

-- 
	Evgeniy Polyakov


* [take8 0/2] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (7 preceding siblings ...)
  2006-08-11  8:40                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-14  6:20                         ` Evgeniy Polyakov
  2006-08-14  6:20                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-14  6:21                         ` [take9 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-08-16 12:34                         ` [take10 " Evgeniy Polyakov
  10 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-14  6:20 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Generic event handling mechanism.

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitute pages
	- allocate new page for events only when newly added kevent requires it
	- do not use ugly index dereferencing, use structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning for detecting whether an entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused variables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so lockdep was taught 
	to differentiate between the various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in mmaped buffer implementation which would end up in wrong index calculation 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>




* [take8 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-14  6:20                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
@ 2006-08-14  6:20                             ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-14  6:20 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similarly to epoll and has the same issues (the callback
is invoked not from the internal state machine of the caller, but through
process wakeup).

Timer notifications can be used for fine grained per-process time 
management, since interval timers are very inconvenient to use, 
and they are limited.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..f175edd
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,119 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	struct timer_list *t;
+	struct kevent_storage *st;
+	int err;
+
+	t = kmalloc(sizeof(struct timer_list) + sizeof(struct kevent_storage), 
+			GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	init_timer(t);
+	t->function = kevent_timer_func;
+	t->expires = jiffies + msecs_to_jiffies(k->event.id.raw[0]);
+	t->data = (unsigned long)k;
+
+	st = (struct kevent_storage *)(t+1);
+	err = kevent_storage_init(t, st);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&st->lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(st, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	add_timer(t);
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(st);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+
+	del_timer_sync(t);
+	
+	kevent_storage_dequeue(st, k);
+	
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct timer_list *t = st->origin;
+
+	if (!t)
+		return -ENODEV;
+	
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+late_initcall(kevent_init_timer);



* [take8 1/2] kevent: Core files.
  2006-08-14  6:20                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-14  6:20                           ` Evgeniy Polyakov
  2006-08-14  6:20                             ` [take8 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-14  6:20 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..091ff42 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,5 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..b2af4a8 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,6 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..c9dde13 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,12 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_kevent_get_events	318
+#define __NR_kevent_ctl		319
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 320
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..61363e0 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,14 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..64ef706
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,309 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. KEVENT_SOCKET_ACCEPT, KEVENT_INODE_CREATE, KEVENT_TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+struct mukevent
+{
+	struct kevent_id	id;
+	__u32			ret_flags;
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/*
+ * @callback is called each time a new event has been caught.
+ * @enqueue is called each time a new event is queued.
+ * @dequeue is called each time an event is dequeued.
+ */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	u32			flags;
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps here a list of wait_queue_t
+							 * containers, one for each poll_wait() call made from ->poll().
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned int		pages_in_use;
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..8609910 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,7 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..31ea7b2
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,59 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the kevent queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO
+	  callback invocation, advanced timer notifications and other
+	  kernel object status change notifications.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default n
+	help
+	  This option turns kevent_user statistics collection on.
+	  Collected data includes the total number of kevents, the number
+	  of kevents which are ready immediately at insertion time, and the
+	  number of kevents which were removed through readiness completion.
+	  The statistics are printed each time a kevent control descriptor
+	  is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem
+	  for socket operations, like new packet reception, readiness
+	  to accept and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem
+	  for inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for
+	  poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables kevent based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO
+	  operations. Only AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..03430c9
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,251 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if this event is ready immediately,
+ * a negative value in case of error, and zero if the event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before the event is added into any origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used, since kevent_enqueue() would
+ * fail to add it into the origin's queue and would set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from the ->enqueue() callback after the reference counter of the
+ * given origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way,
+ * and it must be called before the counter is dropped, so the storage
+ * itself is still valid. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret, rem = 0;
+	unsigned long flags;
+
+	ret = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (ret < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!ret)
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Check if kevent is ready (by invoking its callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			(*ready_callback)(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, which removes them from the storage,
+ * so the storage origin (inode, socket and so on) can be safely removed.
+ * No new entries may be added into the storage at this point
+ * (a socket, for example, has already been removed from the file table).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+static int __init kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
+
+late_initcall(kevent_sys_init);
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..237151c
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1004 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static char kevent_name[] = "kevent";
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice kevent_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = kevent_name,
+	.fops = &kevent_user_fops,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xbcdbcdbcdul, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM 
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * Note that mukevents do not exactly fill the page (each mukevent is 12 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * Take that into account if you want to change the size of struct mukevent.
+ */
+#define KEVENTS_ON_PAGE ((PAGE_SIZE-sizeof(unsigned int))/sizeof(struct mukevent))
+struct kevent_mring
+{
+	unsigned int		index;
+	struct mukevent		event[KEVENTS_ON_PAGE];
+};
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index = num;
+}
+
+static inline void kevent_user_ring_inc(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index++;
+}
+
+static int kevent_user_ring_grow(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+	unsigned int idx;
+
+	ring = (struct kevent_mring *)u->pring[0];
+
+	idx = (ring->index + 1) / KEVENTS_ON_PAGE;
+	if (idx >= u->pages_in_use) {
+		u->pring[idx] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[idx])
+			return -ENOMEM;
+		u->pages_in_use++;
+	}
+	return 0;
+}
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int pidx, off;
+	struct kevent_mring *ring, *copy_ring;
+
+	ring = (struct kevent_mring *)k->user->pring[0];
+	
+	pidx = ring->index/KEVENTS_ON_PAGE;
+	off = ring->index%KEVENTS_ON_PAGE;
+
+	copy_ring = (struct kevent_mring *)k->user->pring[pidx];
+
+	copy_ring->event[off].id.raw[0] = k->event.id.raw[0];
+	copy_ring->event[off].id.raw[1] = k->event.id.raw[1];
+	copy_ring->event[off].ret_flags = k->event.ret_flags;
+
+	if (++ring->index >= KEVENT_MAX_EVENTS)
+		ring->index = 0;
+}
+
+/*
+ * Initialize the mmap ring buffer.
+ * It stores ready kevents, so userspace can fetch them directly instead
+ * of using a syscall. Essentially the syscall becomes just a waiting point.
+ */
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	/* Zeroed so that pages faulted in lazily by ->nopage() can be detected. */
+	u->pring = kzalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	u->pring[0] = __get_free_page(GFP_KERNEL);
+	if (!u->pring[0])
+		goto err_out_free;
+
+	u->pages_in_use = 1;
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i, pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+	
+	for (i=0; i<pnum; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+
+/*
+ * Allocate new kevent userspace control entry.
+ */
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+
+/*
+ * Kevent userspace control block reference counting.
+ * The counter is set to 1 at creation time and decreased when the
+ * corresponding kevent file descriptor is closed.
+ * When the counter hits zero, the block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
+{
+	struct kevent_user *u = vma->vm_file->private_data;
+	unsigned long off = (addr - vma->vm_start)/PAGE_SIZE;
+	unsigned int pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	if (off >= pnum)
+		goto err_out_sigbus;
+
+	if (!u->pring[off])
+		u->pring[off] = __get_free_page(GFP_KERNEL);
+	if (!u->pring[off])
+		goto err_out_sigbus;
+
+	return virt_to_page(u->pring[off]);
+
+err_out_sigbus:
+	return NOPAGE_SIGBUS;
+}
+
+static struct vm_operations_struct kevent_user_vm_ops = {
+	.nopage = &kevent_user_nopage,
+};
+
+/*
+ * Mmap implementation for ring buffer, which is created as array
+ * of pages, so vm_pgoff is an offset (in pages, not in bytes) of
+ * the first page to be mapped.
+ */
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_ops = &kevent_user_vm_ops;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_file = file;
+
+	if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[0]), PAGE_SIZE,
+				vma->vm_page_prot))
+		return -EFAULT;
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * RCU protects the storage list (kevent->storage_entry).
+ * The entry is freed in the RCU callback; it has been dequeued from
+ * all lists by that point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY) {
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect 
+ * kevent->kevent_entry removal.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from the user's list of all events,
+ * dequeue it from its storage and decrease the user's reference counter,
+ * since this kevent no longer exists anywhere. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+/*
+ * Search a kevent inside hash bucket for given ukevent.
+ */
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			ret = k;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches the userspace control block from the file descriptor
+ * and decreases its reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+/*
+ * Read all ukevents from userspace and modify the matching kevents.
+ * If the provided number of ukevents is above a threshold, it is faster
+ * to allocate room for all of them and copy them in one shot than to
+ * copy and process them one by one.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read all ukevents from userspace and remove the matching kevents.
+ * If the provided number of ukevents is above a threshold, it is faster
+ * to allocate room for all of them and copy them in one shot than to
+ * copy and process them one by one.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into the userspace control block and increase
+ * its reference counter.
+ */
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	k->flags |= KEVENT_USER;
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	if (kevent_user_ring_grow(u)) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		if (err < 0)
+			uk->ret_flags |= KEVENT_RET_BROKEN;
+		uk->ret_flags |= KEVENT_RET_DONE;
+		kevent_finish_user(k, 0);
+	} else {
+		kevent_user_ring_inc(u);
+	}
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate a kevent for each one
+ * and add them into the appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Events which are ready immediately overwrite the ones provided by
+ * the user, and the number of such ready events is returned.
+ * The user must check the ret_flags field of each returned ukevent
+ * structure to determine whether it is a fired or a failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_stat_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_stat_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		wait_event_interruptible_timeout(u->wait, 
+			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent)))
+			break;
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+/*
+ * Userspace control block creation and initialization.
+ */
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u || num > KEVENT_MAX_EVENTS)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - timeout in milliseconds to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - unused for now (will be used for the mmap implementation).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create kevent cache and register
+ * filesystem to get control file descriptors from.
+ */
+static int __devinit kevent_user_init(void)
+{
+	int err = 0;
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount filesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	err = misc_register(&kevent_miscdev);
+	if (err) {
+		printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err);
+		goto err_out_exit;
+	}
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_exit:
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	misc_deregister(&kevent_miscdev);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8d3769b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);



* [take9 0/2] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (8 preceding siblings ...)
  2006-08-14  6:20                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-14  6:21                         ` Evgeniy Polyakov
  2006-08-14  6:21                           ` [take9 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-16 13:26                           ` [take9 0/2] kevent: Generic event handling mechanism Christoph Hellwig
  2006-08-16 12:34                         ` [take10 " Evgeniy Polyakov
  10 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-14  6:21 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Generic event handling mechanism.

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitute pages
	- allocate a new page for events only when a newly added kevent requires it
	- do not use ugly index dereferencing, use a structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning to detect whether an entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused variables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so lockdep
	had to be taught to differentiate between the various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in the mmapped buffer implementation which would have resulted in a wrong index calculation 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use the kevent_user_ctl structure; instead provide the needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
	(see the usage sketch after this list)
 - use the old syscall kevent_ctl for creation/removal, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.
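
To make the interface above concrete, here is a minimal userspace sketch
(not part of the patch). kevent_ctl() and kevent_get_events() are assumed
to be thin syscall(2) wrappers around the new syscall numbers; struct
ukevent and the KEVENT_* constants come from the patch itself, and error
handling is elided:

	struct ukevent uks[16];
	int kfd, i, num;

	/* Create a kevent queue descriptor; the fd argument is ignored
	 * for KEVENT_CTL_INIT. */
	kfd = kevent_ctl(0, KEVENT_CTL_INIT, 0, NULL);

	/* ... queue requests with kevent_ctl(kfd, KEVENT_CTL_ADD, n, uks);
	 * a positive return value is the number of immediately completed
	 * (ready or broken) events written back into the buffer ... */

	/* Wait up to 1000 msec for at least 1 and at most 16 ready events. */
	num = kevent_get_events(kfd, 1, 16, 1000, uks, 0);
	for (i = 0; i < num; ++i) {
		if (uks[i].ret_flags & KEVENT_RET_BROKEN)
			continue;	/* broken event: remove or re-add it */
		/* process uks[i] */
	}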

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>




* [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-14  6:21                           ` [take9 1/2] kevent: Core files Evgeniy Polyakov
@ 2006-08-14  6:21                             ` Evgeniy Polyakov
  2006-08-16 13:30                               ` Christoph Hellwig
  2006-08-16 13:45                             ` [take9 1/2] kevent: Core files Christoph Hellwig
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-14  6:21 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similarly to epoll and has the same issues (the callback
is invoked not from the caller's internal state machine, but through a
process wakeup).
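
As an illustration, a hypothetical userspace request for such a poll-style
notification could look as follows; kevent_ctl() is an assumed syscall
wrapper and kfd an already created kevent descriptor, while struct ukevent
and the constants are taken from the core patch:

	struct ukevent uk;

	memset(&uk, 0, sizeof(struct ukevent));
	uk.id.raw[0] = fd;			/* descriptor to watch */
	uk.type = KEVENT_POLL;
	uk.event = KEVENT_POLL_POLLIN;
	uk.req_flags = KEVENT_REQ_ONESHOT;	/* dequeue after first wakeup */

	/* A positive return value means uk completed immediately and was
	 * written back; check uk.ret_flags for KEVENT_RET_BROKEN. */
	err = kevent_ctl(kfd, KEVENT_CTL_ADD, 1, &uk);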

Timer notifications can be used for fine-grained per-process time
management, since interval timers are very inconvenient to use
and are limited in number.
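
For example, a periodic timer request would presumably be expressed like
this (a sketch under the same assumptions as above; the period encoding in
id.raw[0] follows kevent_timer_enqueue(), which interprets it as
milliseconds):

	struct ukevent uk;

	memset(&uk, 0, sizeof(struct ukevent));
	uk.id.raw[0] = 100;		/* period in milliseconds */
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;

	kevent_ctl(kfd, KEVENT_CTL_ADD, 1, &uk);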

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..fe39b4e
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,108 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct timer_list	ktimer;
+	struct kevent_storage	ktimer_storage;
+};
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	setup_timer(&t->ktimer, &kevent_timer_func, (unsigned long)k);
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	mod_timer(&t->ktimer, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	del_timer_sync(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+module_init(kevent_init_timer);



* [take9 1/2] kevent: Core files.
  2006-08-14  6:21                         ` [take9 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-14  6:21                           ` Evgeniy Polyakov
  2006-08-14  6:21                             ` [take9 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  2006-08-16 13:45                             ` [take9 1/2] kevent: Core files Christoph Hellwig
  2006-08-16 13:26                           ` [take9 0/2] kevent: Generic event handling mechanism Christoph Hellwig
  1 sibling, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-14  6:21 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace control
 - kernelspace interfaces
 - initialization
 - notification state machines
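
For illustration, a minimal userspace sketch of the intended usage follows
(not part of the patch). The wrapper functions are hypothetical and simply
forward to the syscall numbers added below; KEVENT_TIMER and
KEVENT_TIMER_FIRED come from the companion notification patch.

/*
 * Hypothetical userspace sketch: create a kevent queue, add a periodic
 * timer event and wait for it to fire.  Assumes the __NR_kevent_*
 * numbers from this patch are visible via <sys/syscall.h>.
 */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <linux/types.h>
#include <linux/kevent.h>

static long ukev_ctl(int fd, unsigned int cmd, unsigned int num, void *arg)
{
	return syscall(__NR_kevent_ctl, fd, cmd, num, arg);
}

static long ukev_get(int fd, unsigned int min_nr, unsigned int max_nr,
		unsigned int timeout_msec, void *buf)
{
	return syscall(__NR_kevent_get_events, fd, min_nr, max_nr,
			timeout_msec, buf, 0);
}

int main(void)
{
	struct ukevent uk;
	long err;
	/* KEVENT_CTL_INIT ignores the descriptor argument and returns
	 * a new kevent control file descriptor. */
	int fd = ukev_ctl(-1, KEVENT_CTL_INIT, 0, NULL);

	if (fd < 0)
		return 1;

	memset(&uk, 0, sizeof(uk));
	uk.type = KEVENT_TIMER;
	uk.event = KEVENT_TIMER_FIRED;
	uk.id.raw[0] = 1000;	/* timer period in milliseconds */

	/* KEVENT_CTL_ADD returns the number of immediately ready events
	 * (copied back into the buffer) or a negative error. */
	err = ukev_ctl(fd, KEVENT_CTL_ADD, 1, &uk);
	if (err < 0)
		return 1;

	/* Block for up to two seconds waiting for one ready event. */
	err = ukev_get(fd, 1, 1, 2000, &uk);
	return err == 1 ? 0 : 1;
}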

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..091ff42 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,5 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..b2af4a8 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,6 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..c9dde13 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,12 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_kevent_get_events	318
+#define __NR_kevent_ctl		319
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 320
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..61363e0 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,14 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..03eeeea
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,310 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. KEVENT_SOCKET_ACCEPT, KEVENT_INODE_CREATE, KEVENT_TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+struct mukevent
+{
+	struct kevent_id	id;
+	__u32			ret_flags;
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is invoked each time a new event arrives. */
+/* @enqueue is invoked each time a kevent is queued. */
+/* @dequeue is invoked each time a kevent is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	u32			flags;
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one for each poll_wait() call made
+							 * from ->poll().
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned int		pages_in_use;
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_sys_init(void);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects the kevent queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..8609910 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,7 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..31ea7b2
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,59 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the event queue mechanism.
+	  It can be used as a replacement for poll()/select(), for AIO
+	  callback invocations, for advanced timer notifications and for
+	  other kernel object status change notifications.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	default n
+	help
+	  This option turns kevent_user statistics collection on.
+	  The data includes the total number of kevents, the number of
+	  kevents which were ready immediately at insertion time and the
+	  number of kevents which were delivered after waiting.
+	  It is printed each time a kevent control descriptor is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem
+	  for socket operations, such as new data having arrived or a
+	  connection being ready to accept.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem
+	  for inode operations, such as file creation and removal.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the KEVENT subsystem to be used for
+	  poll()/select() notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the KEVENT subsystem to be used for AIO
+	  operations. Only AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..3814464
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,249 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event to the appropriate origin's queue.
+ * Returns a positive value if the event is ready immediately,
+ * a negative value on error and zero if the event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before the event is added to any origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If it fails, the kevent must not be used, since kevent_enqueue() will
+ * refuse to add it to the origin's queue and will set the
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue a kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way,
+ * so it must be called while the storage is still valid, i.e. before
+ * the counter is dropped. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret, rem = 0;
+	unsigned long flags;
+
+	ret = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (ret < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!ret)
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Check if kevent is ready (by invoking its callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			(*ready_callback)(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, which removes them from the storage,
+ * so the storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added to the storage at this point
+ * (the socket has already been removed from the file table, for example).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+int kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..f6d1ff6
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1007 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static char kevent_name[] = "kevent";
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice kevent_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = kevent_name,
+	.fops = &kevent_user_fops,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xbcdbcdul, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM 
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * Note that mukevents do not exactly fill a page (struct mukevent is 12 bytes),
+ * so the first 4 bytes of the first page are reserved to store the ring index.
+ * Take that into account if you want to change the size of struct mukevent.
+ */
+#define KEVENTS_ON_PAGE ((PAGE_SIZE-sizeof(unsigned int))/sizeof(struct mukevent))
+struct kevent_mring
+{
+	unsigned int		index;
+	struct mukevent		event[KEVENTS_ON_PAGE];
+};
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index = num;
+}
+
+static inline void kevent_user_ring_inc(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index++;
+}
+
+static int kevent_user_ring_grow(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+	unsigned int idx;
+
+	ring = (struct kevent_mring *)u->pring[0];
+
+	idx = (ring->index + 1) / KEVENTS_ON_PAGE;
+	if (idx >= u->pages_in_use) {
+		u->pring[idx] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[idx])
+			return -ENOMEM;
+		u->pages_in_use++;
+	}
+	return 0;
+}
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int pidx, off;
+	struct kevent_mring *ring, *copy_ring;
+
+	ring = (struct kevent_mring *)k->user->pring[0];
+	
+	pidx = ring->index/KEVENTS_ON_PAGE;
+	off = ring->index%KEVENTS_ON_PAGE;
+
+	copy_ring = (struct kevent_mring *)k->user->pring[pidx];
+
+	copy_ring->event[off].id.raw[0] = k->event.id.raw[0];
+	copy_ring->event[off].id.raw[1] = k->event.id.raw[1];
+	copy_ring->event[off].ret_flags = k->event.ret_flags;
+
+	if (++ring->index >= KEVENT_MAX_EVENTS)
+		ring->index = 0;
+}
+
+/*
+ * Initialize mmap ring buffer.
+ * It stores ready kevents, so userspace can fetch them directly instead
+ * of through the syscall. Essentially the syscall becomes just a waiting point.
+ */
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	/* Zeroed so that pages not yet allocated are detectable in ->nopage(). */
+	u->pring = kzalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	u->pring[0] = __get_free_page(GFP_KERNEL);
+	if (!u->pring[0])
+		goto err_out_free;
+
+	u->pages_in_use = 1;
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i;
+	
+	for (i=0; i<u->pages_in_use; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+
+/*
+ * Allocate new kevent userspace control entry.
+ */
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+
+/*
+ * Kevent userspace control block reference counting.
+ * It is set to 1 at creation time; when the corresponding kevent file
+ * descriptor is closed, the reference counter is decreased.
+ * When the counter hits zero, the block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
+{
+	struct kevent_user *u = vma->vm_file->private_data;
+	unsigned long off = (addr - vma->vm_start)/PAGE_SIZE;
+	unsigned int pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	if (off >= pnum)
+		goto err_out_sigbus;
+
+	/* Allocate the page lazily; do not clobber a page that
+	 * kevent_user_ring_grow() has already installed. */
+	if (!u->pring[off]) {
+		u->pring[off] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[off])
+			goto err_out_sigbus;
+	}
+
+	return virt_to_page(u->pring[off]);
+
+err_out_sigbus:
+	return NOPAGE_SIGBUS;
+}
+
+static struct vm_operations_struct kevent_user_vm_ops = {
+	.nopage = &kevent_user_nopage,
+};
+
+/*
+ * Mmap implementation for ring buffer, which is created as array
+ * of pages, so vm_pgoff is an offset (in pages, not in bytes) of
+ * the first page to be mapped.
+ */
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_ops = &kevent_user_vm_ops;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_file = file;
+
+	if (remap_pfn_range(vma, start,
+				virt_to_phys((void *)u->pring[0]) >> PAGE_SHIFT,
+				PAGE_SIZE, vma->vm_page_prot))
+		return -EFAULT;
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * The entry is freed in the RCU callback; it has been dequeued from all
+ * lists by that point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Complete kevent removal - it dequeues the kevent from the storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY) {
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect 
+ * removal of kevent->kevent_entry.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from user's list of all events, 
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+/*
+ * Search a kevent inside hash bucket for given ukevent.
+ */
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			ret = k;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches the userspace control block from the file descriptor
+ * and decreases its reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If the provided number of ukevents exceeds the threshold, it is faster
+ * to allocate room for all of them and copy them in one shot than to
+ * copy and process them one by one.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If the provided number of ukevents exceeds the threshold, it is faster
+ * to allocate room for all of them and copy them in one shot than to
+ * copy and process them one by one.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue a kevent into the userspace control block and increase
+ * its reference counter.
+ */
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	k->flags |= KEVENT_USER;
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+/*
+ * Add a kevent on behalf of both kernel and userspace users.
+ * This function allocates and queues the kevent; it returns a negative
+ * value on error, a positive value if the kevent is ready immediately
+ * and zero if the kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	if (kevent_user_ring_grow(u)) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+	} else {
+		kevent_user_ring_inc(u);
+	}
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	}
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate a kevent for each one
+ * and add them to the appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Ready events replace the ones provided by the user, and the number
+ * of ready events is returned.
+ * The user must check the ret_flags field of each ukevent structure
+ * to determine whether it is a fired or a failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_stat_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_stat_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		wait_event_interruptible_timeout(u->wait, 
+			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent)))
+			break;
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+/*
+ * Userspace control block creation and initialization.
+ */
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u || num > KEVENT_MAX_EVENTS)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - timeout in milliseconds to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - unused for now (will be used for the mmap implementation).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create kevent cache and register
+ * filesystem to get control file descriptors from.
+ */
+static int __devinit kevent_user_init(void)
+{
+	int err = 0;
+
+	err = kevent_sys_init();
+	if (err)
+		panic("%s: failed to initialize kevent: err=%d.\n", err);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount silesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	err = misc_register(&kevent_miscdev);
+	if (err) {
+		printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err);
+		goto err_out_exit;
+	}
+
+	printk("KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_exit:
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	misc_deregister(&kevent_miscdev);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8d3769b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);


^ permalink raw reply related	[flat|nested] 160+ messages in thread

* [take10 0/2] kevent: Generic event handling mechanism.
  2006-07-31 10:33                       ` Evgeniy Polyakov
                                           ` (9 preceding siblings ...)
  2006-08-14  6:21                         ` [take9 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-08-16 12:34                         ` Evgeniy Polyakov
  2006-08-16 12:34                           ` [take10 1/2] kevent: Core files Evgeniy Polyakov
  10 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 12:34 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Generic event handling mechanism.

Changes from 'take9' patchset:
 * fixed ->nopage method

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked;
	a userspace reading sketch follows the changelog below)
	- use nopage() method to dynamically substitute pages
	- allocate a new page for events only when a newly added kevent requires it
	- do not use ugly index dereferencing, use a structure instead
	- reduced amount of data in the ring (id and flags),
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning to detect whether an entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused variables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * silenced lockdep warnings - all storage locks are initialized in the same function, so lockdep had
	to be taught to differentiate between the various cases
 * remove kevent from storage if it is marked as broken after the callback
 * fixed a typo in the mmapped buffer implementation which would end up in a wrong index calculation

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutexes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.
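
As a side note to the 'take7' mmap items above, here is a rough sketch of
how userspace could read the mapped ring. This is an assumption-laden
sketch based on struct kevent_mring in kevent_user.c (a 4-byte running
index followed by packed struct mukevent slots on each page); the page
size constant and the mring_slot() helper are hypothetical, not an
interface this patchset defines.

#include <sys/mman.h>
#include <linux/types.h>
#include <linux/kevent.h>

#define RING_PAGE_SIZE	4096UL	/* assumed x86 page size */
#define KEVENTS_ON_PAGE \
	((RING_PAGE_SIZE - sizeof(unsigned int)) / sizeof(struct mukevent))

/* Mirrors struct kevent_mring from kevent_user.c: a 4-byte write index
 * (meaningful on the first page only) followed by mukevent slots. */
struct kevent_mring {
	unsigned int	index;
	struct mukevent	event[KEVENTS_ON_PAGE];
};

/*
 * Slot i lives on page i / KEVENTS_ON_PAGE, entry i % KEVENTS_ON_PAGE.
 * @base is the start of a PROT_READ, MAP_SHARED mapping of the kevent
 * control descriptor; the kernel side rejects writable mappings.
 */
static struct mukevent *mring_slot(void *base, unsigned int i)
{
	struct kevent_mring *p = (struct kevent_mring *)
		((char *)base + (i / KEVENTS_ON_PAGE) * RING_PAGE_SIZE);

	return &p->event[i % KEVENTS_ON_PAGE];
}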

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 160+ messages in thread

* [take10 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-16 12:34                           ` [take10 1/2] kevent: Core files Evgeniy Polyakov
@ 2006-08-16 12:34                             ` Evgeniy Polyakov
  2006-08-16 12:37                             ` [take10 1/2] kevent: Core files Mika Penttilä
  2006-08-18  9:35                             ` Joe Jin
  2 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 12:34 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


poll/select() notifications. Timer notifications.

This patch includes generic poll/select and timer notifications.

kevent_poll works similarly to epoll and has the same issues (the callback
is invoked not from the caller's internal state machine, but through
a process wakeup).

Timer notifications can be used for fine-grained per-process time
management, since interval timers are inconvenient to use and limited.
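
For illustration, a hedged userspace fragment for the poll path follows.
It assumes a kevent control descriptor kev_fd and a kevent_ctl() wrapper
as sketched in the core files posting, and watches the read end pfd[0]
of a pipe(2); all of those names are hypothetical.

/* Hypothetical fragment, not part of the patch. */
struct ukevent uk;

memset(&uk, 0, sizeof(uk));
uk.type = KEVENT_POLL;
uk.event = KEVENT_POLL_POLLIN;
uk.id.raw[0] = pfd[0];			/* descriptor to watch; it is
					 * fget()'ed by kevent_poll_enqueue() */
uk.req_flags = KEVENT_REQ_ONESHOT;	/* dequeue after first readiness */

/* A negative return is a hard error; per-event failures are reported
 * back in uk.ret_flags as KEVENT_RET_BROKEN | KEVENT_RET_DONE. */
if (kevent_ctl(kev_fd, KEVENT_CTL_ADD, 1, &uk) < 0)
	return -1;

A timer event is set up the same way, with type KEVENT_TIMER and the
period in milliseconds in id.raw[0], as kevent_timer_enqueue() below shows.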

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..8a4f863
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,220 @@
+/*
+ * 	kevent_poll.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait, 
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont = 
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+	struct file *file = k->st->origin;
+	u32 revents;
+
+	revents = file->f_op->poll(file, NULL);
+
+	kevent_storage_ready(k->st, NULL, revents);
+
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k = 
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+		
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err, ready = 0;
+	unsigned int revents;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -ENODEV;
+
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (revents & k->event.event) {
+		ready = 1;
+		kevent_poll_dequeue(k);
+	}
+	
+	return ready;
+
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+	
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+	
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	unsigned int revents = file->f_op->poll(file, NULL);
+	return (revents & k->event.event);
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+	
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+	
+	pc->enqueue = &kevent_poll_enqueue;
+	pc->dequeue = &kevent_poll_dequeue;
+	pc->callback = &kevent_poll_callback;
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);
diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..fe39b4e
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,108 @@
+/*
+ * 	kevent_timer.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct timer_list	ktimer;
+	struct kevent_storage	ktimer_storage;
+};
+
+static void kevent_timer_func(unsigned long data)
+{
+	struct kevent *k = (struct kevent *)data;
+	struct timer_list *t = k->st->origin;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	setup_timer(&t->ktimer, &kevent_timer_func, (unsigned long)k);
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+	
+	mod_timer(&t->ktimer, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
+
+	return 0;
+
+err_out_st_fini:	
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	del_timer_sync(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = (__u32)jiffies;
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
+
+	tc->enqueue = &kevent_timer_enqueue;
+	tc->dequeue = &kevent_timer_dequeue;
+	tc->callback = &kevent_timer_callback;
+
+	return 0;
+}
+module_init(kevent_init_timer);
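
For illustration, a hypothetical userspace sketch of how the poll and
timer notifications above could be armed. The ukevent layout and the
KEVENT_* constants come from the core kevent patch; the kevent_ctl()
wrapper, kfd and sock_fd are assumptions, and the exact calling
convention differs between takes of this patchset:

	/*
	 * Hypothetical sketch, not part of the patch: arm one poll kevent
	 * and one periodic timer kevent.  kevent_ctl() is an assumed
	 * userspace wrapper around sys_kevent_ctl(), kfd an already
	 * created kevent control descriptor, sock_fd a connected socket.
	 */
	static int add_example_kevents(int kfd, int sock_fd)
	{
		struct ukevent uk[2];

		memset(uk, 0, sizeof(uk));

		uk[0].type = KEVENT_POLL;		/* file descriptor readiness */
		uk[0].event = KEVENT_POLL_POLLIN;
		uk[0].id.raw[0] = sock_fd;		/* kevent_poll_enqueue() does fget(id.raw[0]) */

		uk[1].type = KEVENT_TIMER;		/* periodic timer */
		uk[1].event = KEVENT_TIMER_FIRED;
		uk[1].id.raw[0] = 1000;			/* period in msecs, used by kevent_timer_enqueue() */

		return kevent_ctl(kfd, KEVENT_CTL_ADD, 2, uk);
	}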



* [take10 1/2] kevent: Core files.
  2006-08-16 12:34                         ` [take10 " Evgeniy Polyakov
@ 2006-08-16 12:34                           ` Evgeniy Polyakov
  2006-08-16 12:34                             ` [take10 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
                                               ` (2 more replies)
  0 siblings, 3 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 12:34 UTC (permalink / raw)
  To: lkml
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown


Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..091ff42 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,5 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..b2af4a8 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,6 @@ #endif
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..c9dde13 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,12 @@ #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_kevent_get_events	318
+#define __NR_kevent_ctl		319
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 320
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..61363e0 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,14 @@ #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..03eeeea
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,310 @@
+/*
+ * 	kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT	0x1		/* Process this event only once and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN	0x1		/* Kevent is broken. */
+#define KEVENT_RET_DONE		0x2		/* Kevent processing was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define	KEVENT_MAX		6
+
+/*
+ * Per-type event sets.
+ * The number of per-type event sets must match the number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+#define KEVENT_MASK_ALL		0xffffffff	/* Mask of all possible event values. */
+#define KEVENT_MASK_EMPTY	0x0		/* Empty mask of ready events. */
+
+struct kevent_id
+{
+	__u32		raw[2];
+};
+
+struct ukevent
+{
+	struct kevent_id	id;			/* Id of this request, e.g. socket number, file descriptor and so on... */
+	__u32			type;			/* Event type, e.g. KEVENT_SOCKET, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			event;			/* Event itself, e.g. KEVENT_SOCKET_ACCEPT, KEVENT_INODE_CREATE, KEVENT_TIMER_FIRED... */
+	__u32			req_flags;		/* Per-event request flags */
+	__u32			ret_flags;		/* Per-event return flags */
+	__u32			ret_data[2];		/* Event return data. Event originator fills it with anything it likes. */
+	union {
+		__u32		user[2];		/* User's data. It is not used, just copied to/from user. */
+		void		*ptr;
+	};
+};
+
+struct mukevent
+{
+	struct kevent_id	id;
+	__u32			ret_flags;
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_INIT		3
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/kevent_storage.h>
+
+#define KEVENT_MAX_EVENTS	4096
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct inode;
+struct dentry;
+struct sock;
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time a new event is caught. */
+/* @enqueue is called each time a new event is queued. */
+/* @dequeue is called each time an event is dequeued. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	struct rcu_head		rcu_head;		/* Used for kevent freeing.*/
+	struct ukevent		event;
+	spinlock_t		ulock;			/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+
+	struct list_head	kevent_entry;		/* Entry of user's queue. */
+	struct list_head	storage_entry;		/* Entry of origin's queue. */
+	struct list_head	ready_entry;		/* Entry of user's ready. */
+
+	u32			flags;
+
+	struct kevent_user	*user;			/* User who requested this kevent. */
+	struct kevent_storage	*st;			/* Kevent container. */
+
+	struct kevent_callbacks	callbacks;
+
+	void			*priv;			/* Private data for different storages.
+							 * The poll()/select() storage keeps a list of wait_queue_t
+							 * containers here, one for each poll_wait() call made by ->poll().
+							 */
+};
+
+extern struct kevent_callbacks kevent_registered_callbacks[];
+
+#define KEVENT_HASH_MASK	0xff
+
+struct kevent_user
+{
+	struct list_head	kevent_list[KEVENT_HASH_MASK+1];
+	spinlock_t		kevent_lock;
+	unsigned int		kevent_num;		/* Number of queued kevents. */
+
+	struct list_head	ready_list;		/* List of ready kevents. */
+	unsigned int		ready_num;		/* Number of ready kevents. */
+	spinlock_t 		ready_lock;		/* Protects all manipulations with ready queue. */
+
+	unsigned int		max_ready_num;		/* Requested number of kevents. */
+
+	struct mutex		ctl_mutex;		/* Protects against simultaneous kevent_user control manipulations. */
+	wait_queue_head_t	wait;			/* Wait until some events are ready. */
+
+	atomic_t		refcnt;			/* Reference counter, increased for each new kevent. */
+	
+	unsigned int		pages_in_use;
+	unsigned long		*pring;			/* Array of pages forming mapped ring buffer */
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num;
+	unsigned long		total;
+#endif
+};
+
+extern kmem_cache_t *kevent_cache;
+int kevent_sys_init(void);
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+void kevent_user_ring_add_event(struct kevent *k);
+
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_INODE
+void kevent_inode_notify(struct inode *inode, u32 event);
+void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
+void kevent_inode_remove(struct inode *inode);
+#else
+static inline void kevent_inode_notify(struct inode *inode, u32 event)
+{
+}
+static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
+{
+}
+static inline void kevent_inode_remove(struct inode *inode)
+{
+}
+#endif /* CONFIG_KEVENT_INODE */
+#ifdef CONFIG_KEVENT_SOCKET
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", 
+			__func__, u, u->wait_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c..8609910 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -597,4 +597,7 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, 
+		unsigned int timeout, void __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, void __user *buf);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a099fc6..c550fcc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -218,6 +218,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..31ea7b2
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,59 @@
+config KEVENT
+	bool "Kernel event notification mechanism"
+	help
+	  This option enables the event queue mechanism.
+	  It can be used as a replacement for poll()/select(), AIO callback
+	  invocations, advanced timer notifications and notifications of
+	  other kernel object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistics"
+	depends on KEVENT
+	default n
+	help
+	  This option turns kevent_user statistics collection on.
+	  Collected data includes the total number of kevents, the number
+	  of kevents which were ready immediately at insertion time and the
+	  number of kevents which became ready after waiting.
+	  The statistics are printed each time a control kevent descriptor
+	  is closed.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  socket operations, like new data arrival conditions, ready-for-accept
+	  conditions and so on.
+	
+config KEVENT_INODE
+	bool "Kernel event notifications for inodes"
+	depends on KEVENT
+	help
+	  This option enables notifications through the KEVENT subsystem of
+	  inode operations, like file creation, removal and so on.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	help
+	  This option allows timers to be used through the KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for poll()/select()
+	  notifications.
+
+config KEVENT_NAIO
+	bool "Network asynchronous IO"
+	depends on KEVENT && KEVENT_SOCKET
+	help
+	  This option enables the kevent-based network asynchronous IO subsystem.
+
+config KEVENT_AIO
+	bool "Asynchronous IO"
+	depends on KEVENT
+	help
+	  This option allows the kevent subsystem to be used for AIO operations.
+	  Only AIO read is currently supported.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..d1ef9ba
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_INODE) += kevent_inode.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_NAIO) += kevent_naio.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..3814464
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,249 @@
+/*
+ * 	kevent.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+kmem_cache_t *kevent_cache;
+
+/*
+ * Attempts to add an event into the appropriate origin's queue.
+ * Returns a positive value if the event is ready immediately,
+ * a negative value in case of error and zero if the event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	if (!k->callbacks.enqueue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+	
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+	
+	if (!k->callbacks.dequeue) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return 0;
+}
+
+struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
+
+/*
+ * Must be called before the event is added into any origin's queue.
+ * Initializes the ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * On failure the kevent must not be used; kevent_enqueue() would fail
+ * to add it to the origin's queue, setting the KEVENT_RET_BROKEN flag
+ * in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (k->event.type >= KEVENT_MAX)
+		return -EINVAL;
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (!k->callbacks.callback) {
+		kevent_break(k);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from the origin's queue.
+ * It does not decrease the origin's reference counter in any way;
+ * it must be called before that counter is dropped, so the storage
+ * itself is still valid. It is called from the ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static void __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret, rem = 0;
+	unsigned long flags;
+
+	ret = k->callbacks.callback(k);
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0) {
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	} else if (ret < 0) {
+		k->event.ret_flags |= KEVENT_RET_BROKEN;
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	}
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	if (!ret)
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+		
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			kevent_user_ring_add_event(k);
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Check if kevent is ready (by invoking its callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st, 
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (ready_callback)
+			(*ready_callback)(k);
+
+		if (event & k->event.event)
+			__kevent_requeue(k, event);
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, which removes them from the storage,
+ * so the storage origin (inode, socket and so on) can be safely removed.
+ * No new entries may be added to the storage at this point
+ * (a socket, for example, has already been removed from the file table).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
+
+int kevent_sys_init(void)
+{
+	int i;
+
+	kevent_cache = kmem_cache_create("kevent_cache", 
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+
+	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
+		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
+
+		c->callback = c->enqueue = c->dequeue = NULL;
+	}
+	
+	return 0;
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..799ed8a
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1002 @@
+/*
+ * 	kevent_user.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static char kevent_name[] = "kevent";
+
+static int kevent_user_open(struct inode *, struct file *);
+static int kevent_user_release(struct inode *, struct file *);
+static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
+static int kevent_user_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations kevent_user_fops = {
+	.mmap		= kevent_user_mmap,
+	.open		= kevent_user_open,
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice kevent_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = kevent_name,
+	.fops = &kevent_user_fops,
+};
+
+static int kevent_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, kevent_name, NULL, 0xbcdbcdul, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= kevent_name,
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *kevent_mnt;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM 
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+	
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * Note that mukevents do not exactly fill a page (each mukevent is 12 bytes),
+ * so we reuse 4 bytes at the beginning of the first page to store the index.
+ * With 4096-byte pages this gives KEVENTS_ON_PAGE = (4096 - 4) / 12 = 341.
+ * Take that into account if you want to change the size of struct mukevent.
+ */
+#define KEVENTS_ON_PAGE ((PAGE_SIZE-sizeof(unsigned int))/sizeof(struct mukevent))
+struct kevent_mring
+{
+	unsigned int		index;
+	struct mukevent		event[KEVENTS_ON_PAGE];
+};
+
+static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index = num;
+}
+
+static inline void kevent_user_ring_inc(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+
+	ring = (struct kevent_mring *)u->pring[0];
+	ring->index++;
+}
+
+static int kevent_user_ring_grow(struct kevent_user *u)
+{
+	struct kevent_mring *ring;
+	unsigned int idx;
+
+	ring = (struct kevent_mring *)u->pring[0];
+
+	idx = (ring->index + 1) / KEVENTS_ON_PAGE;
+	if (idx >= u->pages_in_use) {
+		u->pring[idx] = __get_free_page(GFP_KERNEL);
+		if (!u->pring[idx])
+			return -ENOMEM;
+		u->pages_in_use++;
+	}
+	return 0;
+}
+
+/*
+ * Called under kevent_user->ready_lock, so updates are always protected.
+ */
+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int pidx, off;
+	struct kevent_mring *ring, *copy_ring;
+
+	ring = (struct kevent_mring *)k->user->pring[0];
+	
+	pidx = ring->index/KEVENTS_ON_PAGE;
+	off = ring->index%KEVENTS_ON_PAGE;
+
+	copy_ring = (struct kevent_mring *)k->user->pring[pidx];
+
+	copy_ring->event[off].id.raw[0] = k->event.id.raw[0];
+	copy_ring->event[off].id.raw[1] = k->event.id.raw[1];
+	copy_ring->event[off].ret_flags = k->event.ret_flags;
+
+	if (++ring->index >= KEVENT_MAX_EVENTS)
+		ring->index = 0;
+}
+
+/*
+ * Initialize mmap ring buffer.
+ * It stores ready kevents, so userspace can get them directly instead
+ * of using a syscall. Essentially the syscall becomes just a waiting point.
+ */
+static int kevent_user_ring_init(struct kevent_user *u)
+{
+	int pnum;
+
+	pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
+
+	u->pring = kmalloc(pnum * sizeof(unsigned long), GFP_KERNEL);
+	if (!u->pring)
+		return -ENOMEM;
+
+	u->pring[0] = __get_free_page(GFP_KERNEL);
+	if (!u->pring[0])
+		goto err_out_free;
+
+	u->pages_in_use = 1;
+	kevent_user_ring_set(u, 0);
+
+	return 0;
+
+err_out_free:
+	kfree(u->pring);
+
+	return -ENOMEM;
+}
+
+static void kevent_user_ring_fini(struct kevent_user *u)
+{
+	int i;
+	
+	for (i=0; i<u->pages_in_use; ++i)
+		free_page(u->pring[i]);
+
+	kfree(u->pring);
+}
+
+
+/*
+ * Allocate new kevent userspace control entry.
+ */
+static struct kevent_user *kevent_user_alloc(void)
+{
+	struct kevent_user *u;
+	int i;
+
+	u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
+		INIT_LIST_HEAD(&u->kevent_list[i]);
+	
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+
+	atomic_set(&u->refcnt, 1);
+
+	if (kevent_user_ring_init(u)) {
+		kfree(u);
+		u = NULL;
+	}
+
+	return u;
+}
+
+static int kevent_user_open(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = kevent_user_alloc();
+	
+	if (!u)
+		return -ENOMEM;
+
+	file->private_data = u;
+	
+	return 0;
+}
+
+
+/*
+ * Kevent userspace control block reference counting.
+ * The counter is set to 1 at creation time and decreased when the
+ * corresponding kevent file descriptor is closed.
+ * When the counter hits zero the block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		kevent_user_ring_fini(u);
+		kfree(u);
+	}
+}
+
+static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
+{
+	struct kevent_user *u = vma->vm_file->private_data;
+	unsigned long off = (addr - vma->vm_start)/PAGE_SIZE;
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	if (off >= u->pages_in_use)
+		goto err_out_sigbus;
+
+	return virt_to_page(u->pring[off]);
+
+err_out_sigbus:
+	return NOPAGE_SIGBUS;
+}
+
+static struct vm_operations_struct kevent_user_vm_ops = {
+	.nopage = &kevent_user_nopage,
+};
+
+/*
+ * Mmap implementation for ring buffer, which is created as array
+ * of pages, so vm_pgoff is an offset (in pages, not in bytes) of
+ * the first page to be mapped.
+ */
+static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long start = vma->vm_start;
+	struct kevent_user *u = file->private_data;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_ops = &kevent_user_vm_ops;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_file = file;
+
+	if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[0]) >> PAGE_SHIFT, PAGE_SIZE,
+				vma->vm_page_prot))
+		return -EFAULT;
+
+	return 0;
+}
+
+#if 0
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
+	
+	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
+	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
+
+	return h;
+}
+#else
+static inline unsigned int kevent_user_hash(struct ukevent *uk)
+{
+	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
+}
+#endif
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free entry in RCU callback, it is dequeued from all lists at 
+ * this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	if (deq)
+		kevent_dequeue(k);
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY) {
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	kevent_user_put(u);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect 
+ * kevent->kevent_entry removing.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from the user's list of all events,
+ * dequeue it from its storage and decrease the user's reference counter,
+ * since this kevent no longer exists. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_del(&k->kevent_entry);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kqueue_dequeue_ready(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (u->ready_num && !list_empty(&u->ready_list)) {
+		k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+		list_del(&k->ready_entry);
+		k->flags &= ~KEVENT_READY;
+		u->ready_num--;
+	}
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+
+	return k;
+}
+
+/*
+ * Search a kevent inside hash bucket for given ukevent.
+ */
+static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, 
+		struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	
+	list_for_each_entry(k, head, kevent_entry) {
+		spin_lock(&k->ulock);
+		if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] &&
+				k->event.id.raw[0] == uk->id.raw[0] && 
+				k->event.id.raw[1] == uk->id.raw[1]) {
+			ret = k;
+			spin_unlock(&k->ulock);
+			break;
+		}
+		spin_unlock(&k->ulock);
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	int err = -ENODEV;
+	unsigned long flags;
+	
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned int hash = kevent_user_hash(uk);
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&u->kevent_list[hash], uk, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detach the userspace control block from the file descriptor
+ * and decrease its reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k, *n;
+	int i;
+
+	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i) {
+		list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry)
+			kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If the provided number of ukevents exceeds the threshold, it is faster
+ * to allocate room for all of them and copy them in one shot instead of
+ * copying and processing them one by one.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If the provided number of ukevents exceeds the threshold, it is faster
+ * to allocate room for all of them and copy them in one shot instead of
+ * copying and processing them one by one.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+	
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+	
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * its reference counter.
+ */
+static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k)
+{
+	unsigned long flags;
+	unsigned int hash = kevent_user_hash(&k->event);
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	list_add_tail(&k->kevent_entry, &u->kevent_list[hash]);
+	k->flags |= KEVENT_USER;
+	u->kevent_num++;
+	kevent_user_get(u);
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues a kevent; it returns a negative value
+ * on error, a positive value if the kevent is ready immediately and zero
+ * if the kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	if (kevent_user_ring_grow(u)) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	kevent_user_enqueue(u, k);
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+	} else {
+		kevent_user_ring_inc(u);
+	}
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	}
+	return err;
+}
+
+/*
+ * Copy all ukevents from userspace, allocate a kevent for each one
+ * and add them into the appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Events that are ready (or broken) immediately replace the ones
+ * provided by the user, and the number of such events is returned.
+ * The user must check the ret_flags field of each returned ukevent
+ * to determine whether it fired or failed.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, knum = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (u->kevent_num + num >= KEVENT_MAX_EVENTS)
+		goto out_remove;
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i=0; i<num; ++i) {
+				err = kevent_user_add_ukevent(&ukev[i], u);
+				if (err) {
+					kevent_stat_im(u);
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				} else
+					knum++;
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i=0; i<num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_user_add_ukevent(&uk, u);
+		if (err) {
+			kevent_stat_im(u);
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		} else
+			knum++;
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until the timeout expires or at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u, 
+		unsigned int min_nr, unsigned int max_nr, unsigned int timeout, 
+		void __user *buf)
+{
+	struct kevent *k;
+	int num = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		wait_event_interruptible_timeout(u->wait, 
+			u->ready_num >= min_nr, msecs_to_jiffies(timeout));
+	}
+	
+	while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent), 
+					&k->event, sizeof(struct ukevent)))
+			break;
+
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+			kevent_finish_user(k, 1);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+/*
+ * Userspace control block creation and initialization.
+ */
+static int kevent_ctl_init(void)
+{
+	struct kevent_user *u;
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	u = kevent_user_alloc();
+	if (unlikely(!u)) {
+		ret = -ENOMEM;
+		goto out_put_file;
+	}
+
+	file->f_op = &kevent_user_fops;
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dget(kevent_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = u;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_file:
+	put_filp(file);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	if (!u || num > KEVENT_MAX_EVENTS)
+		return -EINVAL;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - timeout in milliseconds to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - unused for now (will be used for the mmap implementation).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		unsigned int timeout, void __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	if (cmd == KEVENT_CTL_INIT)
+		return kevent_ctl_init();
+
+	file = fget(fd);
+	if (!file)
+		return -ENODEV;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create kevent cache and register
+ * filesystem to get control file descriptors from.
+ */
+static int __devinit kevent_user_init(void)
+{
+	int err = 0;
+
+	err = kevent_sys_init();
+	if (err)
+		panic("%s: failed to initialize kevent: err=%d.\n", kevent_name, err);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		panic("%s: failed to register filesystem: err=%d.\n",
+			       kevent_name, err);
+
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	if (IS_ERR(kevent_mnt))
+		panic("%s: failed to mount filesystem: err=%ld.\n", 
+				kevent_name, PTR_ERR(kevent_mnt));
+	
+	err = misc_register(&kevent_miscdev);
+	if (err) {
+		printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err);
+		goto err_out_exit;
+	}
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_exit:
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+
+	return err;
+}
+
+static void __devexit kevent_user_fini(void)
+{
+	misc_deregister(&kevent_miscdev);
+	mntput(kevent_mnt);
+	unregister_filesystem(&kevent_fs_type);
+}
+
+module_init(kevent_user_init);
+module_exit(kevent_user_fini);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bec..8d3769b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,9 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);
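
For reference, a hypothetical userspace sketch of the take10 flow added
above. The syscall numbers are the i386 ones from this patch, struct
ukevent and the KEVENT_* constants are assumed to be visible through
include/linux/kevent.h, the timer type requires the 2/2 patch on top,
and error handling is omitted; this is an illustration, not a tested
program:

	#include <sys/syscall.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/kevent.h>	/* struct ukevent, KEVENT_* constants (assumed exported) */

	#define __NR_kevent_get_events	318	/* i386 numbers from this patch */
	#define __NR_kevent_ctl		319

	int main(void)
	{
		struct ukevent uk, ready[8];
		int kfd, i, n;

		/* KEVENT_CTL_INIT ignores the descriptor argument and returns a new fd. */
		kfd = syscall(__NR_kevent_ctl, 0, KEVENT_CTL_INIT, 0, NULL);

		memset(&uk, 0, sizeof(uk));
		uk.type = KEVENT_TIMER;
		uk.event = KEVENT_TIMER_FIRED;
		uk.id.raw[0] = 500;	/* timer period in msecs */
		syscall(__NR_kevent_ctl, kfd, KEVENT_CTL_ADD, 1, &uk);

		/* Wait up to 2000 msecs for between 1 and 8 ready events. */
		n = syscall(__NR_kevent_get_events, kfd, 1, 8, 2000, ready, 0);
		for (i = 0; i < n; ++i) {
			if (ready[i].ret_flags & KEVENT_RET_BROKEN)
				continue;	/* error code is in ready[i].ret_data[1] */
			/* ... handle the fired event ... */
		}
		return 0;
	}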



* Re: [take10 1/2] kevent: Core files.
  2006-08-16 12:34                           ` [take10 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-16 12:34                             ` [take10 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-16 12:37                             ` Mika Penttilä
  2006-08-16 12:44                               ` Evgeniy Polyakov
  2006-08-18  9:35                             ` Joe Jin
  2 siblings, 1 reply; 160+ messages in thread
From: Mika Penttilä @ 2006-08-16 12:37 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: lkml

+void kevent_user_ring_add_event(struct kevent *k)
+{
+	unsigned int pidx, off;
+	struct kevent_mring *ring, *copy_ring;
+
+	ring = (struct kevent_mring *)k->user->pring[0];
+	
+	pidx = ring->index/KEVENTS_ON_PAGE;
+	off = ring->index%KEVENTS_ON_PAGE;
+
+	copy_ring = (struct kevent_mring *)k->user->pring[pidx];
+
+	copy_ring->event[off].id.raw[0] = k->event.id.raw[0];
+	copy_ring->event[off].id.raw[1] = k->event.id.raw[1];
+	copy_ring->event[off].ret_flags = k->event.ret_flags;
+
+	if (++ring->index >= KEVENT_MAX_EVENTS)
+		ring->index = 0;
+}

Can you assume that the page at pidx is already allocated and why?

--Mika




* Re: [take10 1/2] kevent: Core files.
  2006-08-16 12:37                             ` [take10 1/2] kevent: Core files Mika Penttilä
@ 2006-08-16 12:44                               ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 12:44 UTC (permalink / raw)
  To: Mika Penttilä; +Cc: lkml

On Wed, Aug 16, 2006 at 03:37:30PM +0300, Mika Penttilä (mika.penttila@kolumbus.fi) wrote:
> +void kevent_user_ring_add_event(struct kevent *k)
> +{
> +	unsigned int pidx, off;
> +	struct kevent_mring *ring, *copy_ring;
> +
> +	ring = (struct kevent_mring *)k->user->pring[0];
> +	
> +	pidx = ring->index/KEVENTS_ON_PAGE;
> +	off = ring->index%KEVENTS_ON_PAGE;
> +
> +	copy_ring = (struct kevent_mring *)k->user->pring[pidx];
> +
> +	copy_ring->event[off].id.raw[0] = k->event.id.raw[0];
> +	copy_ring->event[off].id.raw[1] = k->event.id.raw[1];
> +	copy_ring->event[off].ret_flags = k->event.ret_flags;
> +
> +	if (++ring->index >= KEVENT_MAX_EVENTS)
> +		ring->index = 0;
> +}
> 
> Can you assume that the page at pidx is already allocated and why?

It is checked and allocated if needed in kevent_user_ring_grow(), which
is called for each new kevent.

> --Mika
> 

-- 
	Evgeniy Polyakov
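
As a rough illustration of that layout, a hypothetical userspace
consumer of the first mmapped ring page might look like the sketch
below. The structures mirror kevent_user.c, pages past the first one
(reachable through the nopage handler) are ignored, and none of this
is part of the posted patches:

	#include <sys/mman.h>

	/* Userspace mirrors of struct mukevent/kevent_mring from kevent_user.c. */
	struct mukevent_copy { unsigned int id_raw[2]; unsigned int ret_flags; };
	#define KEVENTS_ON_PAGE ((4096 - sizeof(unsigned int)) / sizeof(struct mukevent_copy))

	struct kevent_mring_copy {
		unsigned int		index;	/* next slot the kernel will fill */
		struct mukevent_copy	event[KEVENTS_ON_PAGE];
	};

	/* Map the first ring page once, read-only, kfd being an open kevent fd:
	 *	ring = mmap(NULL, 4096, PROT_READ, MAP_SHARED, kfd, 0);
	 */
	static unsigned int ring_drain(struct kevent_mring_copy *ring, unsigned int tail)
	{
		/* Consume entries between our private tail and the kernel's index. */
		while (tail != ring->index && tail < KEVENTS_ON_PAGE) {
			struct mukevent_copy *m = &ring->event[tail++];

			/* ... process m->id_raw[] and m->ret_flags ... */
			(void)m;
		}
		return tail;	/* caller stores this as its new tail */
	}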


* Re: [take9 0/2] kevent: Generic event handling mechanism.
  2006-08-14  6:21                         ` [take9 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-08-14  6:21                           ` [take9 1/2] kevent: Core files Evgeniy Polyakov
@ 2006-08-16 13:26                           ` Christoph Hellwig
  2006-08-16 13:38                             ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-16 13:26 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Mon, Aug 14, 2006 at 10:21:36AM +0400, Evgeniy Polyakov wrote:
> 
> Generic event handling mechanism.

Hi, I've just started looking into this, so some comments here first
on the submission process:

 - could you send new revisions of the patches in a new thread so one can
   easily find them?
 - the patch split is not very nice: your first patch adds Makefile and
   Kconfig entries for files that are only in the second patch, or not
   actually submitted at all; that's a big no-no.



* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-14  6:21                             ` [take9 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-16 13:30                               ` Christoph Hellwig
  2006-08-16 13:40                                 ` Evgeniy Polyakov
  2006-08-22 14:35                                 ` Davide Libenzi
  0 siblings, 2 replies; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-16 13:30 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev,
	Zach Brown, tglx

On Mon, Aug 14, 2006 at 10:21:36AM +0400, Evgeniy Polyakov wrote:
> 
> poll/select() notifications. Timer notifications.
> 
> This patch includes generic poll/select and timer notifications.
> 
> kevent_poll works similarly to epoll and has the same issues (callback
> is invoked not from the internal state machine of the caller, but
> through a process wakeup).

I'm not a big fan of duplicating code over and over.  kevent is a candidate
for a generic event delivery mechanism, which is a _very_ good thing.  But
starting that system by duplicating existing functionality is not very nice.

What speaks against a patch that replaces the epoll core with something that
builds on kevent while still supporting the epoll interface as a compatibility
shim?

> Timer notifications can be used for fine grained per-process time 
> management, since interval timers are very inconvenient to use, 
> and they are limited.

I have similar reservations about this one.  Having timers as part of a
generic events system is very nice, but having so much duplicated functionality
is not.  Cc'ed Thomas on behalf of the Timer cabal to see if there's a point
in integrating this into a larger framework of timer code.


Also it would be nice if you could submit each of the notifications as a patch
on its own.

> diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
> new file mode 100644
> index 0000000..8a4f863
> --- /dev/null
> +++ b/kernel/kevent/kevent_poll.c
> @@ -0,0 +1,220 @@
> +/*
> + * 	kevent_poll.c
> + * 
> + * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + * 
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/timer.h>
> +#include <linux/file.h>
> +#include <linux/kevent.h>
> +#include <linux/poll.h>
> +#include <linux/fs.h>
> +
> +static kmem_cache_t *kevent_poll_container_cache;
> +static kmem_cache_t *kevent_poll_priv_cache;
> +
> +struct kevent_poll_ctl
> +{
> +	struct poll_table_struct 	pt;
> +	struct kevent			*k;
> +};
> +
> +struct kevent_poll_wait_container
> +{
> +	struct list_head		container_entry;
> +	wait_queue_head_t		*whead;
> +	wait_queue_t			wait;
> +	struct kevent			*k;
> +};
> +
> +struct kevent_poll_private
> +{
> +	struct list_head		container_list;
> +	spinlock_t			container_lock;
> +};
> +
> +static int kevent_poll_enqueue(struct kevent *k);
> +static int kevent_poll_dequeue(struct kevent *k);
> +static int kevent_poll_callback(struct kevent *k);
> +
> +static int kevent_poll_wait_callback(wait_queue_t *wait, 
> +		unsigned mode, int sync, void *key)
> +{
> +	struct kevent_poll_wait_container *cont = 
> +		container_of(wait, struct kevent_poll_wait_container, wait);
> +	struct kevent *k = cont->k;
> +	struct file *file = k->st->origin;
> +	u32 revents;
> +
> +	revents = file->f_op->poll(file, NULL);
> +
> +	kevent_storage_ready(k->st, NULL, revents);
> +
> +	return 0;
> +}
> +
> +static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead, 
> +		struct poll_table_struct *poll_table)
> +{
> +	struct kevent *k = 
> +		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
> +	struct kevent_poll_private *priv = k->priv;
> +	struct kevent_poll_wait_container *cont;
> +	unsigned long flags;
> +
> +	cont = kmem_cache_alloc(kevent_poll_container_cache, SLAB_KERNEL);
> +	if (!cont) {
> +		kevent_break(k);
> +		return;
> +	}
> +		
> +	cont->k = k;
> +	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
> +	cont->whead = whead;
> +
> +	spin_lock_irqsave(&priv->container_lock, flags);
> +	list_add_tail(&cont->container_entry, &priv->container_list);
> +	spin_unlock_irqrestore(&priv->container_lock, flags);
> +
> +	add_wait_queue(whead, &cont->wait);
> +}
> +
> +static int kevent_poll_enqueue(struct kevent *k)
> +{
> +	struct file *file;
> +	int err, ready = 0;
> +	unsigned int revents;
> +	struct kevent_poll_ctl ctl;
> +	struct kevent_poll_private *priv;
> +
> +	file = fget(k->event.id.raw[0]);
> +	if (!file)
> +		return -ENODEV;
> +
> +	err = -EINVAL;
> +	if (!file->f_op || !file->f_op->poll)
> +		goto err_out_fput;
> +
> +	err = -ENOMEM;
> +	priv = kmem_cache_alloc(kevent_poll_priv_cache, SLAB_KERNEL);
> +	if (!priv)
> +		goto err_out_fput;
> +
> +	spin_lock_init(&priv->container_lock);
> +	INIT_LIST_HEAD(&priv->container_list);
> +
> +	k->priv = priv;
> +
> +	ctl.k = k;
> +	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
> +
> +	err = kevent_storage_enqueue(&file->st, k);
> +	if (err)
> +		goto err_out_free;
> +
> +	revents = file->f_op->poll(file, &ctl.pt);
> +	if (revents & k->event.event) {
> +		ready = 1;
> +		kevent_poll_dequeue(k);
> +	}
> +	
> +	return ready;
> +
> +err_out_free:
> +	kmem_cache_free(kevent_poll_priv_cache, priv);
> +err_out_fput:
> +	fput(file);
> +	return err;
> +}
> +
> +static int kevent_poll_dequeue(struct kevent *k)
> +{
> +	struct file *file = k->st->origin;
> +	struct kevent_poll_private *priv = k->priv;
> +	struct kevent_poll_wait_container *w, *n;
> +	unsigned long flags;
> +
> +	kevent_storage_dequeue(k->st, k);
> +
> +	spin_lock_irqsave(&priv->container_lock, flags);
> +	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
> +		list_del(&w->container_entry);
> +		remove_wait_queue(w->whead, &w->wait);
> +		kmem_cache_free(kevent_poll_container_cache, w);
> +	}
> +	spin_unlock_irqrestore(&priv->container_lock, flags);
> +	
> +	kmem_cache_free(kevent_poll_priv_cache, priv);
> +	k->priv = NULL;
> +	
> +	fput(file);
> +
> +	return 0;
> +}
> +
> +static int kevent_poll_callback(struct kevent *k)
> +{
> +	struct file *file = k->st->origin;
> +	unsigned int revents = file->f_op->poll(file, NULL);
> +	return (revents & k->event.event);
> +}
> +
> +static int __init kevent_poll_sys_init(void)
> +{
> +	struct kevent_callbacks *pc = &kevent_registered_callbacks[KEVENT_POLL];
> +
> +	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache", 
> +			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
> +	if (!kevent_poll_container_cache) {
> +		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
> +		return -ENOMEM;
> +	}
> +	
> +	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache", 
> +			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
> +	if (!kevent_poll_priv_cache) {
> +		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
> +		kmem_cache_destroy(kevent_poll_container_cache);
> +		kevent_poll_container_cache = NULL;
> +		return -ENOMEM;
> +	}
> +	
> +	pc->enqueue = &kevent_poll_enqueue;
> +	pc->dequeue = &kevent_poll_dequeue;
> +	pc->callback = &kevent_poll_callback;
> +
> +	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
> +	return 0;
> +}
> +
> +static struct lock_class_key kevent_poll_key;
> +
> +void kevent_poll_reinit(struct file *file)
> +{
> +	lockdep_set_class(&file->st.lock, &kevent_poll_key);
> +}
> +
> +static void __exit kevent_poll_sys_fini(void)
> +{
> +	kmem_cache_destroy(kevent_poll_priv_cache);
> +	kmem_cache_destroy(kevent_poll_container_cache);
> +}
> +
> +module_init(kevent_poll_sys_init);
> +module_exit(kevent_poll_sys_fini);
> diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
> new file mode 100644
> index 0000000..fe39b4e
> --- /dev/null
> +++ b/kernel/kevent/kevent_timer.c
> @@ -0,0 +1,108 @@
> +/*
> + * 	kevent_timer.c
> + * 
> + * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + * 
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/timer.h>
> +#include <linux/jiffies.h>
> +#include <linux/kevent.h>
> +
> +struct kevent_timer
> +{
> +	struct timer_list	ktimer;
> +	struct kevent_storage	ktimer_storage;
> +};
> +
> +static void kevent_timer_func(unsigned long data)
> +{
> +	struct kevent *k = (struct kevent *)data;
> +	struct timer_list *t = k->st->origin;
> +
> +	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
> +	mod_timer(t, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
> +}
> +
> +static struct lock_class_key kevent_timer_key;
> +
> +static int kevent_timer_enqueue(struct kevent *k)
> +{
> +	int err;
> +	struct kevent_timer *t;
> +
> +	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
> +	if (!t)
> +		return -ENOMEM;
> +
> +	setup_timer(&t->ktimer, &kevent_timer_func, (unsigned long)k);
> +
> +	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
> +	if (err)
> +		goto err_out_free;
> +	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
> +
> +	err = kevent_storage_enqueue(&t->ktimer_storage, k);
> +	if (err)
> +		goto err_out_st_fini;
> +	
> +	mod_timer(&t->ktimer, jiffies + msecs_to_jiffies(k->event.id.raw[0]));
> +
> +	return 0;
> +
> +err_out_st_fini:	
> +	kevent_storage_fini(&t->ktimer_storage);
> +err_out_free:
> +	kfree(t);
> +
> +	return err;
> +}
> +
> +static int kevent_timer_dequeue(struct kevent *k)
> +{
> +	struct kevent_storage *st = k->st;
> +	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
> +
> +	del_timer_sync(&t->ktimer);
> +	kevent_storage_dequeue(st, k);
> +	kfree(t);
> +
> +	return 0;
> +}
> +
> +static int kevent_timer_callback(struct kevent *k)
> +{
> +	k->event.ret_data[0] = (__u32)jiffies;
> +	return 1;
> +}
> +
> +static int __init kevent_init_timer(void)
> +{
> +	struct kevent_callbacks *tc = &kevent_registered_callbacks[KEVENT_TIMER];
> +
> +	tc->enqueue = &kevent_timer_enqueue;
> +	tc->dequeue = &kevent_timer_dequeue;
> +	tc->callback = &kevent_timer_callback;
> +
> +	return 0;
> +}
> +module_init(kevent_init_timer);
> 
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 0/2] kevent: Generic event handling mechanism.
  2006-08-16 13:26                           ` [take9 0/2] kevent: Generic event handling mechanism Christoph Hellwig
@ 2006-08-16 13:38                             ` Evgeniy Polyakov
  2006-08-16 18:10                               ` Zach Brown
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 13:38 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Wed, Aug 16, 2006 at 02:26:31PM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> On Mon, Aug 14, 2006 at 10:21:36AM +0400, Evgeniy Polyakov wrote:
> > 
> > Generic event handling mechanism.
> 
> Hi, I've just started looking into this, so some comments here first
> on the submission process:
> 
>  - could you send new revisions of the patches in a new thread so one can
>    easily find them?

Ok.

>  - the patch split is not very nice: your first patch adds Makefile and
>    Kconfig entries for files that only appear in the second patch or are
>    not actually submitted at all; that's a big no-no.

It is done by scripts using a list of files generated by git-diff, but I
can reformat them into this layout:
core files
poll/select
timer
any other
main Kconfig/Makefile

Kevent's Makefile still contains entries for files added later; is
that a big problem right now?
I can split the patches manually, but it would be much better to do that
once a decision about inclusion is made; until the review and feature
addition process is complete I would rather generate the patches as is...

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-16 13:30                               ` Christoph Hellwig
@ 2006-08-16 13:40                                 ` Evgeniy Polyakov
  2006-08-18 10:41                                   ` Christoph Hellwig
  2006-08-22 14:35                                 ` Davide Libenzi
  1 sibling, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 13:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev,
	Zach Brown, tglx

On Wed, Aug 16, 2006 at 02:30:14PM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> On Mon, Aug 14, 2006 at 10:21:36AM +0400, Evgeniy Polyakov wrote:
> > 
> > poll/select() notifications. Timer notifications.
> > 
> > This patch includes generic poll/select and timer notifications.
> > 
> > kevent_poll works similarly to epoll and has the same issues (callback
> > is invoked not from the internal state machine of the caller, but
> > through a process wakeup).
> 
> I'm not a big fan of duplicating code over and over.  kevent is a candidate
> for a generic event delivery mechanism, which is a _very_ good thing.  But
> starting that system by duplicating existing functionality is not very nice.
> 
> What speaks against a patch that replaces the epoll core with something that
> builds on kevent while still supporting the epoll interface as a compatibility
> shim?

There is no problem from my side, but epoll and kevent_poll differ in
some aspects, so it may be better not to replace them for a while.

> > Timer notifications can be used for fine grained per-process time 
> > management, since interval timers are very inconvenient to use, 
> > and they are limited.
> 
> I have similar reservations about this one.  Having timers as part of a
> generic events system is very nice, but having so much duplicated functionality
> is not.  Cc'ed Thomas on behalf of the Timer cabal to see if there's a point
> in integrating this into a larger framework of timer code.
> 
> 
> Also it would be nice if you could submit each of the notifications as a patch
> on its own.

Ok.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-14  6:21                           ` [take9 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-14  6:21                             ` [take9 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
@ 2006-08-16 13:45                             ` Christoph Hellwig
  2006-08-16 13:56                               ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-16 13:45 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

> diff --git a/include/linux/kevent.h b/include/linux/kevent.h
> new file mode 100644
> index 0000000..03eeeea
> --- /dev/null
> +++ b/include/linux/kevent.h
> @@ -0,0 +1,310 @@
> +/*
> + * 	kevent.h

Please don't put filenames in the top of file block comments.  They're
redundant and, as history shows, out of date far too often.

> +#ifdef __KERNEL__

Please split the user/kernel ABI and kernel implementation details into
two different headers.  That way we don't have to run unifdef as part of
the user headers generation process, and it's much clearer which bits are
kernel implementation details and which are the public ABI.

> +#define KEVENT_READY		0x1
> +#define KEVENT_STORAGE		0x2
> +#define KEVENT_USER		0x4

Please use enums here.

> +	void			*priv;			/* Private data for different storages. 
> +							 * poll()/select storage has a list of wait_queue_t containers 
> +							 * for each ->poll() { poll_wait()' } here.
> +							 */

Please try to avoid spilling over the 80 chars limit.  In this case it's
easy, just put the comment before the field being documented.

> +extern struct kevent_callbacks kevent_registered_callbacks[];

Having global arrays is not very nice.  Any chance this could be hidden
behind proper accessor functions?

> +#ifdef CONFIG_KEVENT_INODE
> +void kevent_inode_notify(struct inode *inode, u32 event);
> +void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
> +void kevent_inode_remove(struct inode *inode);
> +#else
> +static inline void kevent_inode_notify(struct inode *inode, u32 event)
> +{
> +}
> +static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
> +{
> +}
> +static inline void kevent_inode_remove(struct inode *inode)
> +{
> +}
> +#endif /* CONFIG_KEVENT_INODE */

The code implementing these prototypes doesn't exist.

> +#ifdef CONFIG_KEVENT_SOCKET
> +#ifdef CONFIG_LOCKDEP
> +void kevent_socket_reinit(struct socket *sock);
> +void kevent_sk_reinit(struct sock *sk);
> +#else
> +static inline void kevent_socket_reinit(struct socket *sock)
> +{
> +}
> +static inline void kevent_sk_reinit(struct sock *sk)
> +{
> +}
> +#endif

Ditto.  Please clean the header of all this dead code.

> +int kevent_storage_init(void *origin, struct kevent_storage *st)
> +{
> +	spin_lock_init(&st->lock);
> +	st->origin = origin;
> +	INIT_LIST_HEAD(&st->list);
> +	return 0;
> +}

Why does this need a return value?

> +int kevent_sys_init(void)
> +{
> +	int i;
> +
> +	kevent_cache = kmem_cache_create("kevent_cache", 
> +			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
> +
> +	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
> +		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
> +
> +		c->callback = c->enqueue = c->dequeue = NULL;
> +	}
> +	
> +	return 0;
> +}

Please make this an initcall in this file and make sure it's linked before
kevent_users.c


> +static int kevent_user_open(struct inode *, struct file *);
> +static int kevent_user_release(struct inode *, struct file *);
> +static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
> +static int kevent_user_mmap(struct file *, struct vm_area_struct *);

Could you reorder the file so these forward-declaring prototypes aren't
needed?

> +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)

	for (i = 0; i < ARRAY_SIZE(u->kevent_list); i++)

> +static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
> +{
> +	struct kevent_user *u = vma->vm_file->private_data;
> +	unsigned long off = (addr - vma->vm_start)/PAGE_SIZE;
> +	unsigned int pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
> +
> +	if (type)
> +		*type = VM_FAULT_MINOR;
> +
> +	if (off >= pnum)
> +		goto err_out_sigbus;
> +
> +	u->pring[off] = __get_free_page(GFP_KERNEL);

So we have a pagefault handler that allocates pages.

> +static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	unsigned long start = vma->vm_start;
> +	struct kevent_user *u = file->private_data;
> +
> +	if (vma->vm_flags & VM_WRITE)
> +		return -EPERM;
> +
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +	vma->vm_ops = &kevent_user_vm_ops;
> +	vma->vm_flags |= VM_RESERVED;
> +	vma->vm_file = file;
> +
> +	if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[0]), PAGE_SIZE,
> +				vma->vm_page_prot))
> +		return -EFAULT;

but you always map the first page.  This model sounds odd and rather confusing.
Do we really need to avoid the cost of the pagefault just for the special
first page?

If so please at least use vm_insert_page() instead of remap_pfn_range().
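
A rough sketch of the suggested direction, assuming u->pring[0] comes
from __get_free_page() as in the quoted code (this is illustrative only,
not the actual patchset code):

static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct kevent_user *u = file->private_data;

	if (vma->vm_flags & VM_WRITE)
		return -EPERM;

	vma->vm_ops = &kevent_user_vm_ops;
	vma->vm_flags |= VM_RESERVED;

	/* vm_insert_page() takes a struct page and does the reference
	 * counting that remap_pfn_range() bypasses. */
	return vm_insert_page(vma, vma->vm_start,
			virt_to_page((void *)u->pring[0]));
}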

> +#if 0
> +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> +{
> +	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
> +	
> +	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
> +	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
> +
> +	return h;
> +}
> +#else
> +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> +{
> +	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
> +}
> +#endif

Please remove that #if 0 code.

> +static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
> +{
> +	int err;
> +	struct kevent_user *u = file->private_data;
> +
> +	if (!u || num > KEVENT_MAX_EVENTS)
> +		return -EINVAL;
> +
> +	switch (cmd) {
> +	case KEVENT_CTL_ADD:
> +		err = kevent_user_ctl_add(u, num, arg);
> +		break;
> +	case KEVENT_CTL_REMOVE:
> +		err = kevent_user_ctl_remove(u, num, arg);
> +		break;
> +	case KEVENT_CTL_MODIFY:
> +		err = kevent_user_ctl_modify(u, num, arg);
> +		break;
> +	default:
> +		err = -EINVAL;
> +		break;

We were rather against this kind of odd multiplexer in the past.  For
these three we at least have a common type being passed down so there's
no compat handling problem, but I'm still not very happy with it...

> +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
> +{
> +	int err = -EINVAL;
> +	struct file *file;
> +
> +	if (cmd == KEVENT_CTL_INIT)
> +		return kevent_ctl_init();

This one on the other hand is plain wrong. At least it should be a separate
syscall.  But looking at the code I don't quite understand why you need
a syscall at all, why can't kevent be implemented as a cloning chardevice
(one where every open allocates a new structure and stores it into
file->private_data)?
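
For reference, a minimal sketch of the cloning-chardevice model described
above, using the standard misc-device API.  All names here are made up
for illustration and are not from any kevent patchset:

#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/slab.h>

static int kevent_dev_open(struct inode *inode, struct file *file)
{
	struct kevent_user *u;

	/* every open() gets its own context, so the device "clones" */
	u = kzalloc(sizeof(*u), GFP_KERNEL);
	if (!u)
		return -ENOMEM;
	file->private_data = u;
	return 0;
}

static int kevent_dev_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

static const struct file_operations kevent_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= kevent_dev_open,
	.release	= kevent_dev_release,
};

static struct miscdevice kevent_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "kevent",
	.fops	= &kevent_dev_fops,
};

static int __init kevent_dev_init(void)
{
	return misc_register(&kevent_dev);
}
module_init(kevent_dev_init);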


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-16 13:45                             ` [take9 1/2] kevent: Core files Christoph Hellwig
@ 2006-08-16 13:56                               ` Evgeniy Polyakov
  2006-08-16 18:08                                 ` Zach Brown
  2006-08-18 10:46                                 ` Christoph Hellwig
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 13:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Wed, Aug 16, 2006 at 02:45:50PM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> > diff --git a/include/linux/kevent.h b/include/linux/kevent.h
> > new file mode 100644
> > index 0000000..03eeeea
> > --- /dev/null
> > +++ b/include/linux/kevent.h
> > @@ -0,0 +1,310 @@
> > +/*
> > + * 	kevent.h
> 
> Please don't put filenames in the top of file block comments.  They're
> redundant and, as history shows, out of date far too often.

Ok.

> > +#ifdef __KERNEL__
> 
> Please split the user/kernel ABI and kernel implementation details into
> two different headers.  That way we don't have to run unifdef as part of
> the user headers generation process, and it's much clearer which bits are
> kernel implementation details and which are the public ABI.

ok.

> > +#define KEVENT_READY		0x1
> > +#define KEVENT_STORAGE		0x2
> > +#define KEVENT_USER		0x4
> 
> Please use enums here.

I used them, but I was advised to use defines in some previous releases :)

> > +	void			*priv;			/* Private data for different storages. 
> > +							 * poll()/select storage has a list of wait_queue_t containers 
> > +							 * for each ->poll() { poll_wait()' } here.
> > +							 */
> 
> Please try to avoid spilling over the 80 chars limit.  In this case it's
> easy, just put the comment before the field being documented.

Ok.

> > +extern struct kevent_callbacks kevent_registered_callbacks[];
> 
> Having global arrays is not very nice.  Any chance this could be hidden
> behind proper accessor functions?

Ok.

> > +#ifdef CONFIG_KEVENT_INODE
> > +void kevent_inode_notify(struct inode *inode, u32 event);
> > +void kevent_inode_notify_parent(struct dentry *dentry, u32 event);
> > +void kevent_inode_remove(struct inode *inode);
> > +#else
> > +static inline void kevent_inode_notify(struct inode *inode, u32 event)
> > +{
> > +}
> > +static inline void kevent_inode_notify_parent(struct dentry *dentry, u32 event)
> > +{
> > +}
> > +static inline void kevent_inode_remove(struct inode *inode)
> > +{
> > +}
> > +#endif /* CONFIG_KEVENT_INODE */
> 
> The code implementing these prototypes doesn't exist.

It exists; it was suggested by you not to include it into the patchset
right now, so this file has not yet been updated to drop the AIO stuff.

> > +#ifdef CONFIG_KEVENT_SOCKET
> > +#ifdef CONFIG_LOCKDEP
> > +void kevent_socket_reinit(struct socket *sock);
> > +void kevent_sk_reinit(struct sock *sk);
> > +#else
> > +static inline void kevent_socket_reinit(struct socket *sock)
> > +{
> > +}
> > +static inline void kevent_sk_reinit(struct sock *sk)
> > +{
> > +}
> > +#endif
> 
> Ditto.  Please clean the header of all this dead code.
> 
> > +int kevent_storage_init(void *origin, struct kevent_storage *st)
> > +{
> > +	spin_lock_init(&st->lock);
> > +	st->origin = origin;
> > +	INIT_LIST_HEAD(&st->list);
> > +	return 0;
> > +}
> 
> Why does this need a return value?

Initialization in general can fail; this one cannot, but I prefer to
keep a fallback in reserve.

> > +int kevent_sys_init(void)
> > +{
> > +	int i;
> > +
> > +	kevent_cache = kmem_cache_create("kevent_cache", 
> > +			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
> > +
> > +	for (i=0; i<ARRAY_SIZE(kevent_registered_callbacks); ++i) {
> > +		struct kevent_callbacks *c = &kevent_registered_callbacks[i];
> > +
> > +		c->callback = c->enqueue = c->dequeue = NULL;
> > +	}
> > +	
> > +	return 0;
> > +}
> 
> Please make this an initcall in this file and make sure it's linked before
> kevent_users.c

Ok.

> > +static int kevent_user_open(struct inode *, struct file *);
> > +static int kevent_user_release(struct inode *, struct file *);
> > +static unsigned int kevent_user_poll(struct file *, struct poll_table_struct *);
> > +static int kevent_user_mmap(struct file *, struct vm_area_struct *);
> 
> Could you reorder the file so these forward-declaring prototypes aren't
> needed?

I prefer structures to be placed at the beginning of the file, which
requires forward declarations.
But there is no strong feeling about that, so I will put them at the end.

> > +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
> 
> 	for (i = 0; i < ARRAY_SIZE(u->kevent_list); i++)

Ugh, no. It reduces readability due to the excessive number of spaces.

> > +static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
> > +{
> > +	struct kevent_user *u = vma->vm_file->private_data;
> > +	unsigned long off = (addr - vma->vm_start)/PAGE_SIZE;
> > +	unsigned int pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
> > +
> > +	if (type)
> > +		*type = VM_FAULT_MINOR;
> > +
> > +	if (off >= pnum)
> > +		goto err_out_sigbus;
> > +
> > +	u->pring[off] = __get_free_page(GFP_KERNEL);
> 
> So we have a pagefault handler that allocates pages.

It is fixed in the take10 patchset, which is why that patchset was released.

> > +static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	unsigned long start = vma->vm_start;
> > +	struct kevent_user *u = file->private_data;
> > +
> > +	if (vma->vm_flags & VM_WRITE)
> > +		return -EPERM;
> > +
> > +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > +	vma->vm_ops = &kevent_user_vm_ops;
> > +	vma->vm_flags |= VM_RESERVED;
> > +	vma->vm_file = file;
> > +
> > +	if (remap_pfn_range(vma, start, virt_to_phys((void *)u->pring[0]), PAGE_SIZE,
> > +				vma->vm_page_prot))
> > +		return -EFAULT;
> 
> but you always map the first page.  This model sounds odd and rather confusing.
> Do we really need to avoid the cost of the pagefault just for the special
> first page?

->nopage() is fixed in the take10 patchset to return the next page.
The number of pages can grow over time up to a limit (which is quite high,
and I was advised not to allocate them all at startup).

> If so please at least use vm_insert_page() instead of remap_pfn_range().
 
Ok.
 
> > +#if 0
> > +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> > +{
> > +	unsigned int h = (uk->user[0] ^ uk->user[1]) ^ (uk->id.raw[0] ^ uk->id.raw[1]);
> > +	
> > +	h = (((h >> 16) & 0xffff) ^ (h & 0xffff)) & 0xffff;
> > +	h = (((h >> 8) & 0xff) ^ (h & 0xff)) & KEVENT_HASH_MASK;
> > +
> > +	return h;
> > +}
> > +#else
> > +static inline unsigned int kevent_user_hash(struct ukevent *uk)
> > +{
> > +	return jhash_1word(uk->id.raw[0], 0) & KEVENT_HASH_MASK;
> > +}
> > +#endif
> 
> Please remove that #if 0 code.

Ok.

> > +static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
> > +{
> > +	int err;
> > +	struct kevent_user *u = file->private_data;
> > +
> > +	if (!u || num > KEVENT_MAX_EVENTS)
> > +		return -EINVAL;
> > +
> > +	switch (cmd) {
> > +	case KEVENT_CTL_ADD:
> > +		err = kevent_user_ctl_add(u, num, arg);
> > +		break;
> > +	case KEVENT_CTL_REMOVE:
> > +		err = kevent_user_ctl_remove(u, num, arg);
> > +		break;
> > +	case KEVENT_CTL_MODIFY:
> > +		err = kevent_user_ctl_modify(u, num, arg);
> > +		break;
> > +	default:
> > +		err = -EINVAL;
> > +		break;
> 
> We were rather against this kind of odd multiplexer in the past.  For
> these three we at least have a common type being passed down so there's
> no compat handling problem, but I'm still not very happy with it...

I use one syscall for add/remove/modify, so it requires a multiplexer.

> > +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
> > +{
> > +	int err = -EINVAL;
> > +	struct file *file;
> > +
> > +	if (cmd == KEVENT_CTL_INIT)
> > +		return kevent_ctl_init();
> 
> This one on the other hand is plain wrong. At least it should be a separate
> syscall.  But looking at the code I don't quite understand why you need
> a syscall at all, why can't kevent be implemented as a cloning chardevice
> (one where every open allocates a new structure and stores it into
> file->private_data)?

That requires a separate syscall.

I created a char device in the first releases and was forced to not use it
at all.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-16 13:56                               ` Evgeniy Polyakov
@ 2006-08-16 18:08                                 ` Zach Brown
  2006-08-16 19:24                                   ` Evgeniy Polyakov
  2006-08-16 19:45                                   ` David Miller
  2006-08-18 10:46                                 ` Christoph Hellwig
  1 sibling, 2 replies; 160+ messages in thread
From: Zach Brown @ 2006-08-16 18:08 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev


>>> +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
>> 	for (i = 0; i < ARRAY_SIZE(u->kevent_list); i++)
> 
> Ugh, no. It reduces readability due to the excessive number of spaces.

Ihavetoverystronglydisagree.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 0/2] kevent: Generic event handling mechanism.
  2006-08-16 13:38                             ` Evgeniy Polyakov
@ 2006-08-16 18:10                               ` Zach Brown
  0 siblings, 0 replies; 160+ messages in thread
From: Zach Brown @ 2006-08-16 18:10 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev


> It is done by scripts using a list of files generated by git-diff, but I
> can reformat them into this layout:

Perhaps you should think about maintaining them as an explicit series of
patches, say with quilt or mq, instead of as one repository that you try
and cut up into separate patches.

- z

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-16 18:08                                 ` Zach Brown
@ 2006-08-16 19:24                                   ` Evgeniy Polyakov
  2006-08-16 19:45                                   ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 19:24 UTC (permalink / raw)
  To: Zach Brown
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev

On Wed, Aug 16, 2006 at 11:08:41AM -0700, Zach Brown (zach.brown@oracle.com) wrote:
> 
> >>> +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
> >> 	for (i = 0; i < ARRAY_SIZE(u->kevent_list); i++)
> > 
> > Ugh, no. It reduces readability due to the excessive number of spaces.
> 
> Ihavetoverystronglydisagree.

W e l l , i f y o u i n s i s t a n d a b s o l u t e l y s u r e.

> - z

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-16 18:08                                 ` Zach Brown
  2006-08-16 19:24                                   ` Evgeniy Polyakov
@ 2006-08-16 19:45                                   ` David Miller
  2006-08-16 20:06                                     ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: David Miller @ 2006-08-16 19:45 UTC (permalink / raw)
  To: zach.brown; +Cc: johnpol, hch, linux-kernel, drepper, akpm, netdev

From: Zach Brown <zach.brown@oracle.com>
Date: Wed, 16 Aug 2006 11:08:41 -0700

> >>> +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
> >> 	for (i = 0; i < ARRAY_SIZE(u->kevent_list); i++)
> > 
> > Ugh, no. It reduces readability due to the excessive number of spaces.
> 
> Ihavetoverystronglydisagree.

Metoo. :-)

Spaces help humans parse out the syntactic structure of
multi-token expressions.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-16 19:45                                   ` David Miller
@ 2006-08-16 20:06                                     ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-16 20:06 UTC (permalink / raw)
  To: David Miller; +Cc: zach.brown, hch, linux-kernel, drepper, akpm, netdev

On Wed, Aug 16, 2006 at 12:45:54PM -0700, David Miller (davem@davemloft.net) wrote:
> > >>> +	for (i=0; i<ARRAY_SIZE(u->kevent_list); ++i)
> > >> 	for (i = 0; i < ARRAY_SIZE(u->kevent_list); i++)
> > > 
> > > Ugh, no. It reduces readability due to the excessive number of spaces.
> > 
> > Ihavetoverystronglydisagree.
> 
> Metoo. :-)
> 
> Spaces help humans parse out the syntactic structure of
> multi-token expressions.

There is an anecdote:
in the near future, when a world crisis has killed the economy,
Russian scientists find a way to build a time machine, so they
fetch some dictator from the past and ask him how to improve the situation.
He quickly answers that all that is needed is to shoot the opposition,
kill off freedom and repaint the Kremlin and Red Square blue.
People wonder: "such tragic steps, so much blood, but why Red Square?"
The dictator answers: well, if there are no objections to the other issues,
we will not change Red Square.

I'm telling it just because I would like to know if there are any issues
which must be fixed in tomorrow's patchset besides those mentioned in
previous e-mails... :)

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take10 1/2] kevent: Core files.
  2006-08-16 12:34                           ` [take10 1/2] kevent: Core files Evgeniy Polyakov
  2006-08-16 12:34                             ` [take10 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
  2006-08-16 12:37                             ` [take10 1/2] kevent: Core files Mika Penttilä
@ 2006-08-18  9:35                             ` Joe Jin
  2006-08-18 10:10                               ` Evgeniy Polyakov
  2 siblings, 1 reply; 160+ messages in thread
From: Joe Jin @ 2006-08-18  9:35 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

> +static int __devinit kevent_user_init(void)
> +{
> +       int err = 0;
> +
> +       err = kevent_sys_init();
> +       if (err)
> +               panic("%s: failed to initialize kevent: err=%d.\n", err);

Shouldn't it be:

                    panic("%s: failed to initialize kevent: err=%d\n",
                          kevent_name, err);

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take10 1/2] kevent: Core files.
  2006-08-18  9:35                             ` Joe Jin
@ 2006-08-18 10:10                               ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-18 10:10 UTC (permalink / raw)
  To: Joe Jin
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Fri, Aug 18, 2006 at 05:35:45PM +0800, Joe Jin (lkmaillist@gmail.com) wrote:
> >+static int __devinit kevent_user_init(void)
> >+{
> >+       int err = 0;
> >+
> >+       err = kevent_sys_init();
> >+       if (err)
> >+               panic("%s: failed to initialize kevent: err=%d.\n", err);
> 
> Here should be?
>                    panic("%s: failed to initialize kevent: err=%d\n",
> kevent_name, err);

The whole function does not exist in the latest patchset anymore.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-16 13:40                                 ` Evgeniy Polyakov
@ 2006-08-18 10:41                                   ` Christoph Hellwig
  2006-08-18 10:59                                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-18 10:41 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, tglx

On Wed, Aug 16, 2006 at 05:40:32PM +0400, Evgeniy Polyakov wrote:
> > What speaks against a patch that replaces the epoll core with something that
> > builds on kevent while still supporting the epoll interface as a compatibility
> > shim?
> 
> There is no problem from my side, but epoll and kevent_poll differ in
> some aspects, so it may be better not to replace them for a while.

Please explain the differences and why they are important.  We really
shouldn't keep on adding code without being able to replace older bits.
If there's a really good reason we can keep things separate, but

  "epoll and kevent_poll differ in some aspects"

is not one :)


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-16 13:56                               ` Evgeniy Polyakov
  2006-08-16 18:08                                 ` Zach Brown
@ 2006-08-18 10:46                                 ` Christoph Hellwig
  2006-08-18 11:23                                   ` Evgeniy Polyakov
  1 sibling, 1 reply; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-18 10:46 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown

> > > +#define KEVENT_READY		0x1
> > > +#define KEVENT_STORAGE		0x2
> > > +#define KEVENT_USER		0x4
> > 
> > Please use enums here.
> 
> I used them, but I was advised to use defines in some previous releases :)

defines make some sense for userspace-visible ABIs because then people
can test for features with ifdef.  It doesn't make any sense for constants
that are used purely in-kernel.  For those enums make more sense because
you can, for example, look at the symbolic names with a debugger.

> > We were rather against this kind of odd multiplexer in the past.  For
> > these three we at least have a common type being passed down so there's
> > no compat handling problem, but I'm still not very happy with it...
> 
> I use one syscall for add/remove/modify, so it requires a multiplexer.

I noticed that you do it, but it's not exactly considered a nice design.

> > > +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
> > > +{
> > > +	int err = -EINVAL;
> > > +	struct file *file;
> > > +
> > > +	if (cmd == KEVENT_CTL_INIT)
> > > +		return kevent_ctl_init();
> > 
> > This one on the other hand is plain wrong. At least it should be a separate
> > syscall.  But looking at the code I don't quite understand why you need
> > a syscall at all, why can't kevent be implemented as a cloning chardevice
> > (one where every open allocates a new structure and stores it into
> > file->private_data)?
> 
> That requires a separate syscall.

Yes, it requires a separate syscall.

> I created a char device in the first releases and was forced to not use it
> at all.

Do you have a reference to it?  In this case a char device makes a lot of
sense because you get a filedescriptor and have operations only defined on
it.  In fact given that you have a multiplexer anyway there's really no
point in adding a syscall for that as well, you could rather use the existing
and debugged ioctl() multiplexer.  Sure, it's still not what we consider
nice, but better than adding even more odd multiplexer syscalls.
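
As a sketch, the existing handlers could be routed through the standard
ioctl multiplexer roughly like this (the _IOW numbers and the
one-event-per-call convention are assumptions made up for illustration;
kevent_user_ctl_add() and friends are from the quoted patches):

#include <linux/ioctl.h>

#define KEVENT_IOC_ADD		_IOW('k', 0, struct ukevent)
#define KEVENT_IOC_REMOVE	_IOW('k', 1, struct ukevent)
#define KEVENT_IOC_MODIFY	_IOW('k', 2, struct ukevent)

/* hooked up via .unlocked_ioctl in the device's file_operations */
static long kevent_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct kevent_user *u = file->private_data;
	void __user *ptr = (void __user *)arg;

	switch (cmd) {
	case KEVENT_IOC_ADD:
		return kevent_user_ctl_add(u, 1, ptr);
	case KEVENT_IOC_REMOVE:
		return kevent_user_ctl_remove(u, 1, ptr);
	case KEVENT_IOC_MODIFY:
		return kevent_user_ctl_modify(u, 1, ptr);
	default:
		return -ENOTTY;
	}
}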

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-18 10:41                                   ` Christoph Hellwig
@ 2006-08-18 10:59                                     ` Evgeniy Polyakov
  2006-08-21 11:01                                       ` Christoph Hellwig
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-18 10:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev,
	Zach Brown, tglx

On Fri, Aug 18, 2006 at 11:41:20AM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> On Wed, Aug 16, 2006 at 05:40:32PM +0400, Evgeniy Polyakov wrote:
> > > What speaks against a patch that replaces the epoll core with something that
> > > builds on kevent while still supporting the epoll interface as a compatibility
> > > shim?
> > 
> > There is no problem from my side, but epoll and kevent_poll differ in
> > some aspects, so it may be better not to replace them for a while.
> 
> Please explain the differences and why they are important.  We really
> shouldn't keep on adding code without being able to replace older bits.
> If there's a really good reason we can keep things separate, but
> 
>   "epoll and kevent_poll differ in some aspects"
> 
> is not one :)

kevent_poll uses a hash table (actually it is the kevent core that uses
the table), its locking is simpler, and part of it is hidden in the
kevent core.  Actually kevent_poll is just a container allocator for the
poll wait queue.  So epoll does not differ from kevent + kevent_poll,
except for the hash/tree choice and the locking, which is based on locks
for paths that kevent shares with ones that can be called from irq/bh
context.  And since kevent_poll can be deselected while epoll is always
there (unless the embedded config is turned on), I recommend having them
both.
Or always turn kevent on :)

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-18 10:46                                 ` Christoph Hellwig
@ 2006-08-18 11:23                                   ` Evgeniy Polyakov
  2006-08-21 10:56                                     ` Christoph Hellwig
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-18 11:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Fri, Aug 18, 2006 at 11:46:07AM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> > > > +#define KEVENT_READY		0x1
> > > > +#define KEVENT_STORAGE		0x2
> > > > +#define KEVENT_USER		0x4
> > > 
> > > Please use enums here.
> > 
> > I used them, but I was advised to use defines in some previous releases :)
> 
> defines make some sense for userspace-visible ABIs because then people
> can test for features with ifdef.  It doesn't make any sense for constants
> that are used purely in-kernel.  For those enums make more sense because
> you can, for example, look at the symbolic names with a debugger.

Enums are only useful when the value increases by one with each new
member.

> > > We were rather against this kind of odd multiplexer in the past.  For
> > > these three we at least have a common type being passed down so there's
> > > no compat handling problem, but I'm still not very happy with it...
> > 
> > I use one syscall for add/remove/modify, so it requires a multiplexer.
> 
> I noticed that you do it, but it's not exactly considered a nice design.

There will be either several syscalls or a multiplexer...
I prefer to have one syscall and a lot of multiplexers inside.

> > > > +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg)
> > > > +{
> > > > +	int err = -EINVAL;
> > > > +	struct file *file;
> > > > +
> > > > +	if (cmd == KEVENT_CTL_INIT)
> > > > +		return kevent_ctl_init();
> > > 
> > > This one on the other hand is plain wrong. At least it should be a separate
> > > syscall.  But looking at the code I don't quite understand why you need
> > > a syscall at all, why can't kevent be implemented as a cloning chardevice
> > > (one where every open allocates a new structure and stores it into
> > > file->private_data)?
> > 
> > That requires a separate syscall.
> 
> Yes, it requires a separate syscall.
> 
> > I created a char device in the first releases and was forced to not use it
> > at all.
> 
> Do you have a reference to it?  In this case a char device makes a lot of
> sense because you get a filedescriptor and have operations only defined on
> it.  In fact given that you have a multiplexer anyway there's really no
> point in adding a syscall for that as well, you could rather use the existing
> and debugged ioctl() multiplexer.  Sure, it's still not what we consider
> nice, but better than adding even more odd multiplexer syscalls.

Somewhere in February.
Here is a link to the initial announce, which used ioctl and a raw char
device and enums for all constants.

http://marc.theaimsgroup.com/?l=linux-netdev&m=113949344414464&w=2

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-18 11:23                                   ` Evgeniy Polyakov
@ 2006-08-21 10:56                                     ` Christoph Hellwig
  2006-08-21 11:13                                       ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-21 10:56 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown

On Fri, Aug 18, 2006 at 03:23:36PM +0400, Evgeniy Polyakov wrote:
> > defines make some sense for userspace-visible ABIs because then people
> > can test for features with ifdef.  It doesn't make any sense for constants
> > that are used purely in-kernel.  For those enums make more sense because
> > you can, for example, look at the symbolic names with a debugger.
> 
> Enums are only useful when the value increases by one with each new
> member.

No, they are not.  Please search the lkml archives, this came up multiple
times.

> There will be either several syscalls or a multiplexer...
> I prefer to have one syscall and a lot of multiplexers inside.

That makes it hard for everyone who has to untangle the mess later.
Please at least try to follow existing design principles.

> > > I created a char device in the first releases and was forced to not use it
> > > at all.
> > 
> > Do you have a reference to it?  In this case a char device makes a lot of
> > sense because you get a filedescriptor and have operations only defined on
> > it.  In fact given that you have a multiplexer anyway there's really no
> > point in adding a syscall for that as well, you could rather use the existing
> > and debugged ioctl() multiplexer.  Sure, it's still not what we consider
> > nice, but better than adding even more odd multiplexer syscalls.
> 
> > Somewhere in February.
> > Here is a link to the initial announce, which used ioctl and a raw char
> > device and enums for all constants.
> 
> http://marc.theaimsgroup.com/?l=linux-netdev&m=113949344414464&w=2

That thread only shows your patch but no comments on it.  Do you have
a URL for the complaint about this design?  And please include the author
of it in the cc list of your reply so we can settle the arguments.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-18 10:59                                     ` Evgeniy Polyakov
@ 2006-08-21 11:01                                       ` Christoph Hellwig
  2006-08-21 11:26                                         ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Christoph Hellwig @ 2006-08-21 11:01 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev,
	Zach Brown, tglx

On Fri, Aug 18, 2006 at 02:59:34PM +0400, Evgeniy Polyakov wrote:
> > If there's a really good reason we can keep things separate, but
> > 
> >   "epoll and kevent_poll differ in some aspects"
> > 
> > is not one :)
> 
> > kevent_poll uses a hash table (actually it is the kevent core that uses
> > the table), its locking is simpler, and part of it is hidden in the
> > kevent core.  Actually kevent_poll is just a container allocator for the
> > poll wait queue.  So epoll does not differ from kevent + kevent_poll,
> > except for the hash/tree choice and the locking, which is based on locks
> > for paths that kevent shares with ones that can be called from irq/bh
> > context.  And since kevent_poll can be deselected while epoll is always
> > there (unless the embedded config is turned on), I recommend having them
> > both.
> > Or always turn kevent on :)

You mention a lot of implementation details that absolutely shouldn't
matter to the userspace interface.

I might not have explained enough what the point behind all this is, so
I'll try to explain it again:

 - the fate of aio, inotify, epoll, etc shows we badly need a generic
   event mechanism that unifies the event-based interfaces of the various
   subsystems.  Only having a single mechanism allows things like unified
   event loops and gives application programmers the chance to learn that
   one interface for real and get it right.
 - kevent looks like the right way to do this.  but to show it can really
   achieve this it needs to show it can do the things the existing event
   systems can do at least as well.  reimplementing their user interfaces
   on top of kevent is the best (or maybe only) way to show that.
   epoll is probably the easiest of the ones we have, so I'd suggest starting
   with it.  inotify will be a lot harder, but we'll need that as well.
   the kevent inode hooks you had in your earlier patches will never ever
   get in.

Was this clear enough?

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-21 10:56                                     ` Christoph Hellwig
@ 2006-08-21 11:13                                       ` Evgeniy Polyakov
  2006-08-21 12:53                                         ` Bernd Petrovitsch
  0 siblings, 1 reply; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-21 11:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown

On Mon, Aug 21, 2006 at 11:56:37AM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> On Fri, Aug 18, 2006 at 03:23:36PM +0400, Evgeniy Polyakov wrote:
> > > defines make some sense for userspace-visible ABIs because then people
> > > can test for features with ifdef.  It doesn't make any sense for constants
> > > that are used purely in-kernel.  For those enums make more sense because
> > > you can, for example, look at the symbolic names with a debugger.
> > 
> > Enums are only useful when the value increases by one with each new
> > member.
> 
> No, they are not.  Please search the lkml archives, this came up multiple
> times.

Enums, when OR'ed and AND'ed, can in theory produce a value outside the
enum set.
And what is the difference between
#define A 1
#define B 2
#define C 4
and
enum {
 A = 1,
 B = 2,
 C = 4,
}
?

> > There will be either several syscalls or a multiplexer...
> > I prefer to have one syscall and a lot of multiplexers inside.
> 
> That makes it hard for everyone who has to untangle the mess later.
> Please at least try to follow existing design principles.

I added as few syscalls as possible - a control one and a read one.
The former can initialize/add/remove/modify kevents, the latter reads
ready events.

> > > > I created a char device in the first releases and was forced to not use it
> > > > at all.
> > > 
> > > Do you have a reference to it?  In this case a char device makes a lot of
> > > sense because you get a filedescriptor and have operations only defined on
> > > it.  In fact given that you have a multiplexer anyway there's really no
> > > point in adding a syscall for that as well, you could rather use the existing
> > > and debugged ioctl() multiplexer.  Sure, it's still not what we consider
> > > nice, but better than adding even more odd multiplexer syscalls.
> > 
> > Somewhere in February.
> > Here is a link to the initial announce, which used ioctl and a raw char
> > device and enums for all constants.
> > 
> > http://marc.theaimsgroup.com/?l=linux-netdev&m=113949344414464&w=2
> 
> That thread only shows your patch but no comments on it.  Do you have
> a URL for the complaint about this design?  And please include the author
> of it in the cc list of your reply so we can settle the arguments.

Enums were changed to defines after David Miller's suggestion, since the
whole network stack and epoll use defines extensively, and kevent was
built for them in the first place.

I do not remember who suggested not to use a char device, but I saw that
inotify specifically mentioned ioctl/syscall issues; there is an old
discussion about an event handling mechanism somewhere in this thread:
http://uwsg.iu.edu/hypermail/linux/kernel/0010.3/0002.html

It looks a bit illogical to have epoll/poll syscalls but to read data
from a char device and its ioctls for network socket or timer
notifications.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-21 11:01                                       ` Christoph Hellwig
@ 2006-08-21 11:26                                         ` Evgeniy Polyakov
  0 siblings, 0 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-21 11:26 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lkml, David Miller, Ulrich Drepper, Andrew Morton, netdev,
	Zach Brown, tglx

On Mon, Aug 21, 2006 at 12:01:04PM +0100, Christoph Hellwig (hch@infradead.org) wrote:
> On Fri, Aug 18, 2006 at 02:59:34PM +0400, Evgeniy Polyakov wrote:
> > > If there's a really good reason we can keep things separate, but
> > > 
> > >   "epoll and kevent_poll differ in some aspects"
> > > 
> > > is not one :)
> > 
> > kevent_poll uses a hash table (actually it is the kevent core that uses
> > the table), its locking is simpler, and part of it is hidden in the
> > kevent core.  Actually kevent_poll is just a container allocator for the
> > poll wait queue.  So epoll does not differ from kevent + kevent_poll,
> > except for the hash/tree choice and the locking, which is based on locks
> > for paths that kevent shares with ones that can be called from irq/bh
> > context.  And since kevent_poll can be deselected while epoll is always
> > there (unless the embedded config is turned on), I recommend having them
> > both.
> > Or always turn kevent on :)
> 
> You mention a lot of implementation details that absolutely shouldn't
> matter to the userspace interface.
> 
> I might not have explained enough what the point behind all this is, so
> I'll try to explain it again:
> 
>  - the fate of aio, inotify, epoll, etc. shows we badly need a generic
>    event mechanism that unifies the event-based interfaces of the various
>    subsystems.  Only having a single mechanism allows things like unified
>    event loops and gives application programmers the chance to learn that
>    one interface for real and get it right.
>  - kevent looks like the right way to do this.  But to show it can really
>    achieve this, it needs to show it can do the things the existing event
>    systems can do at least as well.  Reimplementing their user interfaces
>    on top of kevent is the best (or maybe only) way to show that.
>    epoll is probably the easiest of the ones we have, so I'd suggest
>    starting with it.  inotify will be a lot harder, but we'll need that
>    as well.  The kevent inode hooks you had in your earlier patches will
>    never ever get in.
> 
> Was this clear enough?

Sure, but if I said that, it would sound like an advertisement :)
Some inotify notifications (inode create/remove) are already implemented
in the (dropped) FS notification patchset.
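
(Purely illustrative, prompted by the shim idea quoted above: one shape
such an epoll-compatibility layer could take.  Every identifier below
except the epoll types is hypothetical; nothing here is from the actual
patches:)

----  snip  ----
#include <sys/epoll.h>

/* Hypothetical generic-event type and wait primitive. */
struct generic_event {
	unsigned int ready_mask;
	unsigned long long user_cookie;
};
int generic_event_wait(int fd, struct generic_event *evs,
		       int max, int timeout);

/* epoll_wait()-style compatibility shim built on the generic core. */
int epoll_wait_shim(int epfd, struct epoll_event *events,
		    int maxevents, int timeout)
{
	struct generic_event gev[64];
	int i, n;

	if (maxevents > 64)
		maxevents = 64;

	n = generic_event_wait(epfd, gev, maxevents, timeout);

	/* Translate generic events back into the epoll ABI. */
	for (i = 0; i < n; i++) {
		events[i].events   = gev[i].ready_mask;
		events[i].data.u64 = gev[i].user_cookie;
	}
	return n;
}
----  snip  ----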

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-21 11:13                                       ` Evgeniy Polyakov
@ 2006-08-21 12:53                                         ` Bernd Petrovitsch
  2006-08-21 13:01                                           ` Evgeniy Polyakov
  0 siblings, 1 reply; 160+ messages in thread
From: Bernd Petrovitsch @ 2006-08-21 12:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown

On Mon, 2006-08-21 at 15:13 +0400, Evgeniy Polyakov wrote:
[...]
> And what is the difference between

As others already pointed out in this thread:

> #define A 1
> #define B 2
> #define C 4

These are not seen by the C compiler.

> and
> enum {
>  A = 1,
>  B = 2,
>  C = 4,
> }
> ?

These are known by the C compiler and thus usable/viewable in a
debugger.

	Bernd
-- 
Firmix Software GmbH                   http://www.firmix.at/
mobil: +43 664 4416156                 fax: +43 1 7890849-55
          Embedded Linux Development and Services


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-21 12:53                                         ` Bernd Petrovitsch
@ 2006-08-21 13:01                                           ` Evgeniy Polyakov
  2006-08-21 13:49                                             ` Bernd Petrovitsch
  2006-08-21 19:09                                             ` David Miller
  0 siblings, 2 replies; 160+ messages in thread
From: Evgeniy Polyakov @ 2006-08-21 13:01 UTC (permalink / raw)
  To: Bernd Petrovitsch
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown

On Mon, Aug 21, 2006 at 02:53:25PM +0200, Bernd Petrovitsch (bernd@firmix.at) wrote:
> On Mon, 2006-08-21 at 15:13 +0400, Evgeniy Polyakov wrote:
> [...]
> > And what is the difference between
> 
> As others already pointed out in this thread:
> 
> > #define A 1
> > #define B 2
> > #define C 4
> 
> These are not seen by the C compiler.
> 
> > and
> > enum {
> >  A = 1,
> >  B = 2,
> >  C = 4,
> > }
> > ?
> 
> These are known by the C compiler and thus usable/viewable in a
> debugger.

:) And I pointed out quite a few other issues with enums vs. defines.
As for this one: no one wants to inspect enums in a debugger anyway.

And, ugh:

(gdb) list
1       enum {
2               A = 1,
3               B = 2,
4       };
5
6       int main()
7       {
8               printf("%x\n", A | B);
9       }
(gdb) bre 8
Breakpoint 1 at 0x4004ac: file ./test.c, line 8.
(gdb) r
Starting program: /tmp/test 

Breakpoint 1, main () at ./test.c:8
8               printf("%x\n", A | B);
(gdb) p A
No symbol "A" in current context.
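
(Whether gdb can resolve the enumerator depends on debug info: built
without -g it cannot, as shown above; built with -g it usually can:)

----  snip  ----
$ gcc -g ./test.c -o /tmp/test    # build *with* debug info, then retry p A
----  snip  ----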


Actually I completely do not care about defines or enums; it is a really
silly dispute.  I just do not want to rewrite a bunch of code _again_ and
then _again_ when someone decides that defines are better.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-21 13:01                                           ` Evgeniy Polyakov
@ 2006-08-21 13:49                                             ` Bernd Petrovitsch
  2006-08-21 19:09                                             ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: Bernd Petrovitsch @ 2006-08-21 13:49 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Christoph Hellwig, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown

On Mon, 2006-08-21 at 17:01 +0400, Evgeniy Polyakov wrote:
[ #define vs enum { } ]
> And, ugh:
> 
> (gdb) list
> 1       enum {
> 2               A = 1,
> 3               B = 2,
> 4       };
> 5
> 6       int main()
> 7       {
> 8               printf("%x\n", A | B);
> 9       }
> (gdb) bre 8
> Breakpoint 1 at 0x4004ac: file ./test.c, line 8.
> (gdb) r
> Starting program: /tmp/test 
> 
> Breakpoint 1, main () at ./test.c:8
> 8               printf("%x\n", A | B);
> (gdb) p A
> No symbol "A" in current context.

Oops, I stand corrected.

> Actually I completely do not care about defines or enums; it is a really
> silly dispute.  I just do not want to rewrite a bunch of code _again_ and
> then _again_ when someone decides that defines are better.

ACK. Personally I also do not care that much - as long as it doesn't
change with the phase of the moon.
And we probably do not want
----  snip  ----
#ifdef CONFIG_I_LOVE_ENUMS
enum {
	A = 1,
	B = 2,
	C = 4,
};
#else
#define A 1
#define B 2
#define C 4
#endif
----  snip  ----
either.

	Bernd, shutting now up on this thread
-- 
Firmix Software GmbH                   http://www.firmix.at/
mobil: +43 664 4416156                 fax: +43 1 7890849-55
          Embedded Linux Development and Services


^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 1/2] kevent: Core files.
  2006-08-21 13:01                                           ` Evgeniy Polyakov
  2006-08-21 13:49                                             ` Bernd Petrovitsch
@ 2006-08-21 19:09                                             ` David Miller
  1 sibling, 0 replies; 160+ messages in thread
From: David Miller @ 2006-08-21 19:09 UTC (permalink / raw)
  To: johnpol; +Cc: bernd, hch, linux-kernel, drepper, akpm, netdev, zach.brown

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Mon, 21 Aug 2006 17:01:21 +0400

> Actually I completely do not care about defines or enums; it is a really
> silly dispute.  I just do not want to rewrite a bunch of code _again_ and
> then _again_ when someone decides that defines are better.

I totally agree.

What in the world is wrong with you people arguing over stuff like
this?

If the goal is to discourage Evgeniy and his work, you might just
get your wish if you keep up with this silly coding style and
enumeration crap!

I can't even stand to read these kevent threads any longer; the
sanity in them has long gone out the window.

^ permalink raw reply	[flat|nested] 160+ messages in thread

* Re: [take9 2/2] kevent: poll/select() notifications. Timer notifications.
  2006-08-16 13:30                               ` Christoph Hellwig
  2006-08-16 13:40                                 ` Evgeniy Polyakov
@ 2006-08-22 14:35                                 ` Davide Libenzi
  1 sibling, 0 replies; 160+ messages in thread
From: Davide Libenzi @ 2006-08-22 14:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Evgeniy Polyakov, lkml, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, tglx

On Wed, 16 Aug 2006, Christoph Hellwig wrote:

> On Mon, Aug 14, 2006 at 10:21:36AM +0400, Evgeniy Polyakov wrote:
>>
>> poll/select() notifications. Timer notifications.
>>
>> This patch includes generic poll/select and timer notifications.
>>
>> kevent_poll works similarly to epoll and has the same issues (the
>> callback is invoked not from the caller's internal state machine, but
>> through a process wakeup).
>
> I'm not a big fan of duplicating code over and over.  kevent is a candidate
> for a generic event delivery mechanism, which is a _very_ good thing.  But
> starting that system by duplicating existing functionality is not very nice.
>
> What speaks against a patch that replaces the epoll core with something
> that builds on kevent while still supporting the epoll interface as a
> compatibility shim?

Sorry, I'm catching up with a huge post-vacation backlog, so I haven't had
the time to look at the source code.  But if kevent performance is the same
or better, and the external epoll interface is fully supported, then I think
the shim layer idea is a good one.  Provided the shim is smaller than
eventpoll.c :)



- Davide



^ permalink raw reply	[flat|nested] 160+ messages in thread

end of thread

Thread overview: 160+ messages
2006-07-09 13:24 [RFC 1/4] kevent: core files Evgeniy Polyakov
2006-07-09 14:59 ` Pekka Enberg
2006-07-09 15:08   ` Evgeniy Polyakov
2006-07-25  6:17 ` David Miller
2006-07-25  6:26   ` Evgeniy Polyakov
2006-07-27 19:18   ` Zach Brown
2006-07-27 20:06     ` Evgeniy Polyakov
2006-07-27 21:32       ` Zach Brown
2006-07-28  5:23         ` Evgeniy Polyakov
2006-07-28 18:33           ` Zach Brown
2006-07-28 18:44             ` Evgeniy Polyakov
2006-07-28 19:10               ` Zach Brown
2006-07-29  3:38                 ` Ulrich Drepper
2006-07-29  4:32                   ` Nicholas Miell
2006-07-29 15:48                     ` Evgeniy Polyakov
2006-07-29 20:54                       ` Nicholas Miell
2006-07-30  8:08                     ` Ulrich Drepper
2006-07-29 15:44                   ` Evgeniy Polyakov
2006-07-29 16:18                     ` Ulrich Drepper
2006-07-31 10:33                       ` Evgeniy Polyakov
2006-07-31 10:35                         ` Herbert Xu
2006-07-31 10:50                           ` Evgeniy Polyakov
2006-07-31 10:57                             ` David Miller
2006-07-31 10:59                               ` Herbert Xu
2006-08-01  7:53                                 ` Ulrich Drepper
2006-08-01  7:58                                   ` David Miller
2006-07-31 19:41                         ` Evgeniy Polyakov
2006-07-31 22:00                           ` David Miller
2006-07-31 22:16                             ` Brent Cook
2006-07-31 22:20                               ` David Miller
2006-08-01  6:24                             ` Evgeniy Polyakov
2006-07-31 22:46                         ` Zach Brown
2006-08-01  9:34                         ` [take2 0/4] kevent: introduction Evgeniy Polyakov
2006-08-01  9:34                           ` [take2 1/4] kevent: core files Evgeniy Polyakov
2006-08-01  9:34                             ` [take2 2/4] kevent: network AIO, socket notifications Evgeniy Polyakov
2006-08-01  9:34                               ` [take2 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-01  9:34                                 ` [take2 3/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
2006-08-01 13:46                             ` [take2 1/4] kevent: core files James Morris
2006-08-01 13:55                               ` Evgeniy Polyakov
2006-08-01 14:27                                 ` James Morris
2006-08-01 14:34                                   ` Evgeniy Polyakov
2006-08-01 23:56                             ` Zach Brown
2006-08-02  0:01                               ` David Miller
2006-08-02  6:43                                 ` Evgeniy Polyakov
2006-08-02  6:39                               ` Evgeniy Polyakov
2006-08-02  7:25                                 ` David Miller
2006-08-02  7:46                                   ` Evgeniy Polyakov
2006-08-03  9:45                         ` [take3 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-08-03  9:40                           ` Evgeniy Polyakov
2006-08-03  9:46                           ` [take3 1/4] kevent: Core files Evgeniy Polyakov
2006-08-03  9:46                             ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
2006-08-03  9:46                               ` [take3 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
2006-08-03  9:46                                 ` [take3 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-03  9:43                                   ` Eric Dumazet
2006-08-03  9:48                                     ` Evgeniy Polyakov
2006-08-03  9:54                                 ` [take3 3/4] kevent: Network AIO, socket notifications Eric Dumazet
2006-08-03 10:13                                   ` Evgeniy Polyakov
2006-08-03 17:04                               ` [take3 2/4] kevent: AIO, aio_sendfile() implementation Badari Pulavarty
2006-08-03 17:13                                 ` Evgeniy Polyakov
2006-08-03 14:40                             ` [take3 1/4] kevent: Core files Eric Dumazet
2006-08-03 14:55                               ` Evgeniy Polyakov
2006-08-03 15:11                                 ` Eric Dumazet
2006-08-03 15:21                                   ` Evgeniy Polyakov
2006-08-03 21:37                                 ` David Miller
2006-08-05 13:02                         ` [take4 0/4] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-08-05 13:02                           ` [take4 1/4] kevent: Core files Evgeniy Polyakov
2006-08-05 13:02                             ` [take4 2/4] kevent: AIO, aio_sendfile() implementation Evgeniy Polyakov
2006-08-05 13:02                               ` [take4 3/4] kevent: Network AIO, socket notifications Evgeniy Polyakov
2006-08-05 13:02                                 ` [take4 4/4] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-05 17:57                             ` [take4 1/4] kevent: Core files Greg KH
2006-08-05 18:10                               ` Evgeniy Polyakov
2006-08-09  8:02                         ` [take6 0/3] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-08-09  7:58                           ` David Miller
2006-08-09  8:07                             ` Evgeniy Polyakov
2006-08-09  8:20                               ` David Miller
2006-08-09  8:24                                 ` Evgeniy Polyakov
2006-08-09  8:02                           ` [take6 1/3] kevent: Core files Evgeniy Polyakov
2006-08-09  8:02                             ` [take6 3/3] kevent: Network AIO, socket notifications Evgeniy Polyakov
2006-08-09  8:02                               ` [take6 2/3] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-09 17:47                             ` [take6 1/3] kevent: Core files Stephen Hemminger
2006-08-09 19:17                               ` Evgeniy Polyakov
2006-08-10  0:04                               ` David Miller
2006-08-09 22:21                             ` Andrew Morton
2006-08-10  6:14                               ` Evgeniy Polyakov
2006-08-10  6:42                                 ` David Miller
2006-08-10  6:48                                   ` Evgeniy Polyakov
2006-08-10  7:18                                 ` Andrew Morton
2006-08-10  7:50                                   ` Evgeniy Polyakov
2006-08-10  8:02                                     ` Andrew Morton
2006-08-10  8:22                                       ` Evgeniy Polyakov
2006-08-11  0:56                                         ` Andrew Morton
2006-08-11  6:15                                           ` Evgeniy Polyakov
2006-08-11  6:23                                             ` Andrew Morton
2006-08-11  6:30                                               ` Evgeniy Polyakov
2006-08-11  7:04                                                 ` Andrew Morton
2006-08-11  7:27                                                   ` Evgeniy Polyakov
2006-08-11  6:25                                             ` Ulrich Drepper
2006-08-11  6:33                                               ` Evgeniy Polyakov
2006-08-11  6:38                                                 ` David Miller
2006-08-11  6:55                                                   ` Evgeniy Polyakov
2006-08-10 12:12                               ` [take7 0/1] kevent: generic event handling mechanism Evgeniy Polyakov
2006-08-10 12:16                                 ` [take7 1/1] kevent: core files and timer/poll notifications Evgeniy Polyakov
2006-08-10 12:22                                   ` Evgeniy Polyakov
2006-08-11  8:40                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-08-11  8:40                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
2006-08-11  8:40                             ` [take8 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-11 15:45                               ` Andrew Morton
2006-08-12  8:18                                 ` Evgeniy Polyakov
2006-08-12  8:38                                   ` Andrew Morton
2006-08-12  8:55                                     ` Evgeniy Polyakov
2006-08-13  0:51                             ` [take8 1/2] kevent: Core files Jeff Carr
2006-08-13  9:04                               ` Evgeniy Polyakov
2006-08-14  6:20                         ` [take8 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-08-14  6:20                           ` [take8 1/2] kevent: Core files Evgeniy Polyakov
2006-08-14  6:20                             ` [take8 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-14  6:21                         ` [take9 0/2] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-08-14  6:21                           ` [take9 1/2] kevent: Core files Evgeniy Polyakov
2006-08-14  6:21                             ` [take9 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-16 13:30                               ` Christoph Hellwig
2006-08-16 13:40                                 ` Evgeniy Polyakov
2006-08-18 10:41                                   ` Christoph Hellwig
2006-08-18 10:59                                     ` Evgeniy Polyakov
2006-08-21 11:01                                       ` Christoph Hellwig
2006-08-21 11:26                                         ` Evgeniy Polyakov
2006-08-22 14:35                                 ` Davide Libenzi
2006-08-16 13:45                             ` [take9 1/2] kevent: Core files Christoph Hellwig
2006-08-16 13:56                               ` Evgeniy Polyakov
2006-08-16 18:08                                 ` Zach Brown
2006-08-16 19:24                                   ` Evgeniy Polyakov
2006-08-16 19:45                                   ` David Miller
2006-08-16 20:06                                     ` Evgeniy Polyakov
2006-08-18 10:46                                 ` Christoph Hellwig
2006-08-18 11:23                                   ` Evgeniy Polyakov
2006-08-21 10:56                                     ` Christoph Hellwig
2006-08-21 11:13                                       ` Evgeniy Polyakov
2006-08-21 12:53                                         ` Bernd Petrovitsch
2006-08-21 13:01                                           ` Evgeniy Polyakov
2006-08-21 13:49                                             ` Bernd Petrovitsch
2006-08-21 19:09                                             ` David Miller
2006-08-16 13:26                           ` [take9 0/2] kevent: Generic event handling mechanism Christoph Hellwig
2006-08-16 13:38                             ` Evgeniy Polyakov
2006-08-16 18:10                               ` Zach Brown
2006-08-16 12:34                         ` [take10 " Evgeniy Polyakov
2006-08-16 12:34                           ` [take10 1/2] kevent: Core files Evgeniy Polyakov
2006-08-16 12:34                             ` [take10 2/2] kevent: poll/select() notifications. Timer notifications Evgeniy Polyakov
2006-08-16 12:37                             ` [take10 1/2] kevent: Core files Mika Penttilä
2006-08-16 12:44                               ` Evgeniy Polyakov
2006-08-18  9:35                             ` Joe Jin
2006-08-18 10:10                               ` Evgeniy Polyakov
2006-08-01  1:05           ` [RFC 1/4] kevent: core files David Miller
2006-07-27 20:58     ` Benjamin LaHaise
2006-07-27 21:44       ` Zach Brown
2006-07-27 22:02         ` Benjamin LaHaise
2006-07-28  5:39           ` Evgeniy Polyakov
2006-07-28 19:01           ` Zach Brown
2006-07-28 19:24             ` Evgeniy Polyakov
2006-07-28 19:34               ` Zach Brown
2006-07-28 19:37                 ` Zach Brown
2006-08-01  1:02     ` David Miller
2006-08-01 17:02       ` Zach Brown
