linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [2.6.16 PATCH] Filessytem Events Reporter V2
@ 2006-04-05 13:21 Yi Yang
  2006-04-05 17:12 ` Carlos Silva
  2006-04-07  6:24 ` Evgeniy Polyakov
  0 siblings, 2 replies; 14+ messages in thread
From: Yi Yang @ 2006-04-05 13:21 UTC (permalink / raw)
  To: LKML; +Cc: Andrew Morton, Evgeniy Polyakov, Matt Helsley

Compared with Filesystem Events Reporter v1, the following changes are done:
  - Use workqueue/keventd instead of kfseventd. 
  - fsevent_mask can be set by sysctl and proc
    interface.
  - Add missed spinlock
  - Ensure fsevent sequence number is uniqe

This patch implements a filsystem events report facitily, Filesystem Events
Reporter, the user can monitor filesystem activities via it, currently, it
 can monitor access, attribute change, open, create, modify, delete,
 move and close of any file or directory as well as mount/umount.

Every filesystem event will include tgid, uid and gid of the process
 which triggered this event, process name, file or directory name 
operated by it.

Filesystem Events Reporter is never a duplicate of inotify, inotify
 just concerns change on file or directory, Beagle uses it to watch
 file changes in order to regenerate index for it, inotify can't tell
 us who did that change and what is its process name, but filesystem
 events reporter can do these, moreover inotify's overhead is greater
 than filesystem events reporter, inotify needs compare inode with 
watched file or directories list to decide whether it should generate an
 inotify_event, some locks also increase overhead, filesystem event 
connector hasn't these overhead, it just generates a fsevent and send.

To be important, filesystem event reporter doesn't add any new system 
call, the user space application can make use of it by netlink socket, 
but inotify added several system calls, many events mechanism in kernel
 have used netlink as communication way with user space, for example, 
KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
 to unify events interface to netlink, the user space application can use 
it very easy.

Every application using fsevent can set its own fsevent filter list 
without effect in other applications using fsevent, there are three
filter lists, they are pid filter list, uid filetr list and gid 
filter list, respectively, moreover, there is a fsevent mask used to
control those fsevents which fail to match  three filter lists, an 
application using fsevent can listen those fsevents it want to 
monitor and ignore those fsevents it doesn't interest in by set 
series of filters, there is a fsevent mask used to take effects on
all the applications using fsevent, it can be set by sysctl and proc
interface.

 fs/Kconfig               |   10 
 fs/Makefile              |    1 
 fs/fsevent.c             |  598 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/namespace.c           |   12 
 include/linux/fsevent.h  |  131 ++++++++++
 include/linux/fsnotify.h |   37 ++
 include/linux/netlink.h  |    1 
 7 files changed, 790 insertions(+)

Signed-off-by: Yi Yang <yang.y.yi@gmail.com>

--- a/include/linux/netlink.h.orig	2006-03-31 22:58:50.000000000 +0800
+++ b/include/linux/netlink.h	2006-04-05 20:34:23.000000000 +0800
@@ -21,6 +21,7 @@
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
 #define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
 #define NETLINK_GENERIC		16
+#define NETLINK_FSEVENT		17	/* File system events to userspace */
 
 #define MAX_LINKS 32		
 
--- a/include/linux/fsnotify.h.orig	2006-01-03 11:21:10.000000000 +0800
+++ b/include/linux/fsnotify.h	2006-04-03 22:12:52.000000000 +0800
@@ -15,6 +15,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/fsevent.h>
 
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -45,6 +46,8 @@ static inline void fsnotify_move(struct 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
 	}
+	raise_fsevent_move(old_dir, old_name, new_dir, new_name,
+			   FSEVENT_MOVE | (isdir?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -56,6 +59,8 @@ static inline void fsnotify_nameremove(s
 		isdir = IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
 	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	raise_fsevent(dentry,
+		      FSEVENT_DELETE | (isdir?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -74,6 +79,7 @@ static inline void fsnotify_create(struc
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+	raise_fsevent_create(inode, name, FSEVENT_CREATE);
 }
 
 /*
@@ -83,6 +89,8 @@ static inline void fsnotify_mkdir(struct
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+	raise_fsevent_create(inode, name,
+			     FSEVENT_CREATE | FSEVENT_ISDIR);
 }
 
 /*
@@ -99,6 +107,8 @@ static inline void fsnotify_access(struc
 	dnotify_parent(dentry, DN_ACCESS);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_ACCESS |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -115,6 +125,8 @@ static inline void fsnotify_modify(struc
 	dnotify_parent(dentry, DN_MODIFY);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_MODIFY |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -130,6 +142,9 @@ static inline void fsnotify_open(struct 
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);	
+	raise_fsevent(dentry, FSEVENT_OPEN |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
+
 }
 
 /*
@@ -148,6 +163,8 @@ static inline void fsnotify_close(struct
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_CLOSE |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -163,6 +180,8 @@ static inline void fsnotify_xattr(struct
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_MODIFY_ATTRIB |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -213,6 +232,24 @@ static inline void fsnotify_change(struc
 		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
 						  dentry->d_name.name);
 	}
+
+#ifdef CONFIG_FS_EVENTS
+	{
+	u32 fsevent_mask = 0;
+	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
+		fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
+	if ((ia_valid & ATTR_ATIME) && (ia_valid & ATTR_MTIME))
+		fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
+	else if (ia_valid & ATTR_ATIME)
+		fsevent_mask |= FSEVENT_ACCESS;
+	else if (ia_valid & ATTR_MTIME)
+		fsevent_mask |= FSEVENT_MODIFY;
+	if (ia_valid & ATTR_SIZE)
+		fsevent_mask |= FSEVENT_MODIFY;
+	if (fsevent_mask)
+		raise_fsevent(dentry, fsevent_mask);
+	}
+#endif /* CONFIG_FS_EVENTS */
 }
 
 #ifdef CONFIG_INOTIFY	/* inotify helpers */
--- a/fs/namespace.c.orig	2006-03-25 23:13:42.000000000 +0800
+++ b/fs/namespace.c	2006-04-03 22:12:52.000000000 +0800
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
+#include <linux/fsevent.h>
 #include "pnode.h"
 
 extern int __init init_rootfs(void);
@@ -612,6 +613,13 @@ asmlinkage long sys_umount(char __user *
 		goto dput_and_out;
 
 	retval = do_umount(nd.mnt, flags);
+
+	if (retval == 0) {
+		char * tmp = getname(name);
+		raise_fsevent_umount(tmp);
+		putname(tmp);
+	}
+
 dput_and_out:
 	path_release_on_umount(&nd);
 out:
@@ -1459,6 +1467,10 @@ asmlinkage long sys_mount(char __user * 
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
 	unlock_kernel();
+
+	if (retval == 0)
+		raise_fsevent_mount((char *)dev_page, dir_page);
+
 	free_page(data_page);
 
 out3:
--- a/fs/Kconfig.orig	2006-03-31 21:23:20.000000000 +0800
+++ b/fs/Kconfig	2006-04-03 22:12:52.000000000 +0800
@@ -405,6 +405,16 @@ config INOTIFY
 
 	  If unsure, say Y.
 
+config FS_EVENTS
+	tristate "Report filesystem events to userspace"
+	---help---
+	  Provide a facility that reports filesystem events to userspace. The
+	  reported event include access, write, utime, chmod, chown, chgrp,
+	  close, open, create, rename, unlink, mkdir, rmdir, mount, umount.
+
+	  The user can set filesystem events filter to filter its events, so
+	  that he just get those events he concerns.
+
 config QUOTA
 	bool "Quota support"
 	help
--- a/fs/Makefile.orig	2006-03-31 21:23:33.000000000 +0800
+++ b/fs/Makefile	2006-04-03 22:12:52.000000000 +0800
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioprio.o pnode.o drop_caches.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_FS_EVENTS)		+= fsevent.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
--- /dev/null	2003-01-30 18:24:37.000000000 +0800
+++ b/include/linux/fsevent.h	2006-04-04 23:18:01.000000000 +0800
@@ -0,0 +1,131 @@
+/*
+ * fsevent.h - filesystem events connector
+ *
+ * Copyright (C) 2006 Yi Yang <yang.y.yi@gmail.com>
+ * Based on cn_proc.h by Matt Helsley, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LINUX_FSEVENT_H
+#define LINUX_FSEVENT_H
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/netlink.h>
+
+enum  fsevent_type {
+	FSEVENT_ACCESS = 	0x00000001,	/* File was accessed */
+	FSEVENT_MODIFY = 	0x00000002,	/* File was modified */
+	FSEVENT_MODIFY_ATTRIB = 0x00000004,	/* Metadata changed */
+	FSEVENT_CLOSE = 	0x00000008,	/* File was closed */
+	FSEVENT_OPEN = 		0x00000010,	/* File was opened */
+	FSEVENT_MOVE = 		0x00000020,	/* File was moved */
+	FSEVENT_CREATE = 	0x00000040,	/* File was created */
+	FSEVENT_DELETE =	0x00000080,	/* File was deleted */
+	FSEVENT_MOUNT =		0x00000100,	/* File system is mounted */
+	FSEVENT_UMOUNT =	0x00000200,	/* File system is umounted */
+
+	/* The following definitions are command types for fsevent filter
+	 * or acknowlege types of the corresponding commands
+	 */
+	FSEVENT_FILTER_ALL = 	0x08000000,	/* For all events */
+	FSEVENT_FILTER_PID = 	0x10000000,	/* For some process ID */
+	FSEVENT_FILTER_UID = 	0x20000000,	/* For some user ID */
+	FSEVENT_FILTER_GID =	0x40000000,	/* For some group ID */
+
+	FSEVENT_ISDIR = 	0x80000000	/* It is set for a dir */
+};
+
+#define FSEVENT_MASK 0x800003ff
+
+typedef unsigned long fsevent_mask_t;
+
+enum filter_control {
+	FSEVENT_FILTER_LISTEN = 1,		/* Listen fsevents mask defines*/
+	FSEVENT_FILTER_IGNORE ,		/* Ignore fsevents mask defines*/
+	FSEVENT_FILTER_REMOVE,			/* Remove a given filter */
+};	
+
+struct fsevent_filter {
+	/* filter type, it just is one of them
+	 * FSEVENT_FILTER_ALL
+	 * FSEVENT_FILTER_PID
+	 * FSEVENT_FILTER_UID
+	 * FSEVENT_FILTER_GID
+	 */
+	enum fsevent_type type;	/* filter type */
+
+	/* mask of file system events the user listen or ignore
+	 * if the user need to ignore all the events of some pid
+	 * , gid or uid, he(she) must set mask to FSEVENT_MASK.
+	 */ 
+	fsevent_mask_t mask;
+	union {
+		pid_t pid;
+		uid_t uid;
+		gid_t gid;
+	} id;
+
+	enum filter_control control;
+};
+
+struct fsevent {
+	__u32 type;
+	__u32 cpu;
+	struct timespec timestamp;
+	pid_t pid;
+	uid_t uid;
+	gid_t gid;
+	int err;
+	__u32 len;
+	__u32 pname_len;
+	__u32 fname_len;
+	__u32 new_fname_len;
+	char name[0];
+};
+
+#define FSEVENT_FILTER_MSGSIZE \
+	(sizeof(struct fsevent_filter) + sizeof(struct nlmsghdr))
+
+#ifdef __KERNEL__
+#ifdef CONFIG_FS_EVENTS
+extern void raise_fsevent(struct dentry * dentryp, u32 mask);
+extern void raise_fsevent_move(struct inode * olddir, const char * oldname, 
+		struct inode * newdir, const char * newname, u32 mask);
+extern void raise_fsevent_create(struct inode * inode, 
+		const char * name, u32 mask);
+extern void raise_fsevent_mount(const char * devname, const char * mountpoint);
+extern void raise_fsevent_umount(const char * mountpoint);
+#else
+static void raise_fsevent(struct dentry * dentryp,  u32 mask)
+{}
+
+static void raise_fsevent_move(struct inode * olddir, const char * oldname, 
+		struct inode * newdir, const char * newname, u32 mask)
+{}
+
+static void raise_fsevent_create(struct inode * inode, 
+		const char * name, u32 mask)
+{}
+
+static void raise_fsevent_mount(const char * devname, const char * mountpoint)
+{}
+
+static void raise_fsevent_umount(const char * mountpoint)
+{}
+#endif	/* CONFIG_FS_EVENTS */
+#endif	/* __KERNEL__ */
+#endif	/* LINUX_FSEVENT_H */
--- /dev/null	2003-01-30 18:24:37.000000000 +0800
+++ b/fs/fsevent.c	2006-04-05 20:52:53.000000000 +0800
@@ -0,0 +1,598 @@
+/*
+ * 	fsevent.c
+ * 
+ * 2006 Copyright (c) Yi Yang <yang.y.yi@gmail.com>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/moduleparam.h>
+#include <linux/fsevent.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/sysctl.h>
+
+#define FSEVENT_MASK_CTL_NAME -2
+
+static DEFINE_PER_CPU(struct sk_buff_head, fsevent_send_queue);
+static DEFINE_PER_CPU(struct work_struct, fsevent_work);
+
+typedef struct pid_filter {
+	pid_t pid;
+	u32 mask;
+	struct list_head list;
+} pid_filter;
+
+typedef struct uid_filter {
+	uid_t uid;
+	u32 mask;
+	struct list_head list;
+} uid_filter;
+
+typedef struct gid_filter {
+	gid_t gid;
+	u32 mask;
+	struct list_head list;
+} gid_filter;
+
+typedef struct fsevent_listener {
+	pid_t pid;
+	struct list_head pid_filter_list_head;
+	struct list_head uid_filter_list_head;
+	struct list_head gid_filter_list_head;
+	u32 mask;
+	struct list_head list;
+} listener;
+
+	
+/* The netlink socket. */
+static struct sock * fsevent_sock = NULL;
+static LIST_HEAD(listener_list_head);
+static DEFINE_SPINLOCK(listener_list_lock);
+
+static atomic_t fsevent_count = ATOMIC_INIT(0);
+static int fsevent_burst_limit = 100;
+static int fsevent_ratelimit = 5 * HZ;
+static unsigned long last = 0;
+static int fsevent_sum = 0;
+static u32 fsevents_mask = FSEVENT_MASK;
+static atomic_t fsevent_listener_num = ATOMIC_INIT(0);
+
+static inline void get_seq(__u32 *ts, int *cpu)
+{
+	*ts = atomic_inc_return(&fsevent_count);
+	*cpu = smp_processor_id();
+}
+
+static void append_string(char **dest, const char *src, size_t len)
+{
+	strncpy(*dest, src, len);
+	(*dest)[len] = '\0';
+	*dest += len + 1;
+}
+
+static inline int filter_fsevent(u32 filter_mask, u32 event_mask)
+{
+	event_mask &= FSEVENT_MASK;
+	event_mask &= filter_mask;
+	if (event_mask == 0) {
+		return -1;
+	}
+	return 0;
+}
+
+static int filter_fsevent_all(u32 * mask)
+{
+	int ret = 0;
+
+	(*mask) &= FSEVENT_MASK;
+
+	if ((((*mask) & FSEVENT_ISDIR) == FSEVENT_ISDIR)
+		 && ((fsevents_mask & FSEVENT_ISDIR) == 0)) {
+		ret = -1;
+		goto out;
+	}
+
+	(*mask) &= fsevents_mask;
+	if ((*mask) == 0) {
+		ret = -5;
+	}
+
+out:
+	return ret;
+}
+
+static void fsevent_send(struct sk_buff * skb)
+{
+	struct sk_buff_head * head = &get_cpu_var(fsevent_send_queue);
+	skb_queue_tail(head, skb);
+	schedule_work(&per_cpu(fsevent_work, smp_processor_id()));
+	put_cpu_var(fsevent_send_queue);
+}
+
+int __raise_fsevent(const char * oldname, const char * newname, u32 mask)
+{
+	struct fsevent *event;
+	int namelen = 0;
+	char * nameptr = NULL;
+	unsigned int size;
+	struct nlmsghdr * nlhdr;
+	struct sk_buff * skb = NULL;
+
+	if (filter_fsevent_all(&mask) != 0)
+		return -1;
+
+	if (atomic_read(&fsevent_listener_num) <= 0)
+		return -1;
+
+	if (jiffies - last <= fsevent_ratelimit) {
+		if (fsevent_sum > fsevent_burst_limit)
+			return -2;
+		fsevent_sum++;
+	} else {
+		last = jiffies;
+		fsevent_sum = 0;
+	}
+
+	namelen = strlen(current->comm) + strlen(oldname) + 2;
+	if (newname)
+		namelen += strlen(newname) + 1;
+
+	size = NLMSG_SPACE(sizeof(struct fsevent) + namelen);
+	                                                                                                                                       
+	skb = alloc_skb(size, GFP_KERNEL);
+	if (!skb)
+	        return -ENOMEM;
+	                                                                                                                                       
+	nlhdr = NLMSG_PUT(skb, 0, 0, NLMSG_DONE, size - sizeof(*nlhdr));
+	event = NLMSG_DATA(nlhdr);
+
+	get_seq(&(nlhdr->nlmsg_seq), &event->cpu);
+	ktime_get_ts(&event->timestamp);
+	event->type = mask;
+	event->pid = current->tgid;
+	event->uid = current->uid;
+	event->gid = current->gid;
+	nameptr = event->name;
+	event->pname_len = strlen(current->comm);
+	append_string(&nameptr, current->comm, event->pname_len);
+	event->fname_len = strlen(oldname);
+	append_string(&nameptr, oldname, event->fname_len);
+	event->len = event->pname_len +  event->fname_len + 2;
+	event->new_fname_len = 0;
+	if (newname) {
+		event->new_fname_len = strlen(newname);
+		append_string(&nameptr, newname, event->new_fname_len);
+		event->len += event->new_fname_len + 1;
+	}
+	fsevent_send(skb);
+	return 0;
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+void raise_fsevent(struct dentry * dentryp, u32 mask)
+{
+	if (dentryp->d_inode && (MAJOR(dentryp->d_inode->i_rdev) == 4))
+		return;
+	__raise_fsevent(dentryp->d_name.name, NULL, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent);
+
+void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
+{
+	__raise_fsevent(name, NULL, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent_create);
+
+void raise_fsevent_move(struct inode * olddir, const char * oldname, 
+		struct inode * newdir, const char * newname, u32 mask)
+{
+	__raise_fsevent(oldname, newname, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent_move);
+
+void raise_fsevent_mount(const char * devname, const char * mountpoint)
+{
+	__raise_fsevent(devname, mountpoint, FSEVENT_MOUNT);
+}
+
+void raise_fsevent_umount(const char * mountpoint)
+{
+	__raise_fsevent(mountpoint, NULL, FSEVENT_UMOUNT);
+}
+
+static int fsevent_ack(enum fsevent_type type, pid_t pid, u32 seq)
+{
+	struct fsevent *event;
+	unsigned int size;
+	struct sk_buff * skb = NULL;
+	struct nlmsghdr * nlhdr = NULL;
+
+	size = NLMSG_SPACE(sizeof(struct fsevent));
+	                                                                                                                                       
+	skb = alloc_skb(size, GFP_KERNEL);
+	if (!skb)
+	        return -ENOMEM;
+	                                                                                                                                       
+	nlhdr = NLMSG_PUT(skb, 0, seq, NLMSG_DONE, size - sizeof(*nlhdr));
+	event = NLMSG_DATA(nlhdr);
+
+	ktime_get_ts(&event->timestamp);
+	event->cpu = -1;
+	event->type = type; 
+	event->pid = 0;
+	event->uid = 0;
+	event->gid = 0;
+	event->len = 0;
+	event->pname_len = 0;
+	event->fname_len = 0;
+	event->new_fname_len = 0;
+	event->err = 0;
+	                                                                                                                                       
+	NETLINK_CB(skb).dst_group = 0;
+	NETLINK_CB(skb).dst_pid = pid;
+	NETLINK_CB(skb).pid = 0;
+
+	return (netlink_unicast(fsevent_sock, skb, pid, MSG_DONTWAIT));
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static void set_fsevent_mask(u32 * to_mask, u32 from_mask, int mode)
+{
+	if (mode == FSEVENT_FILTER_IGNORE)
+		(*to_mask) &= ~(from_mask);
+	else if (mode == FSEVENT_FILTER_LISTEN)
+		(*to_mask) |= from_mask;
+}
+
+#define DEFINE_FILTER_FIND_FUNC(type, key) 				\
+	type * find_##type(struct list_head * head, key##_t id)		\
+	{								\
+		int alloc_flag = 1;					\
+		type * entry = NULL;					\
+									\
+		list_for_each_entry(entry, head, list) {		\
+			if (entry->key == id) {				\
+				alloc_flag = 0;				\
+				break;					\
+			}						\
+		}							\
+									\
+		if (alloc_flag == 1) {					\
+			entry  = (type *)kmalloc(sizeof(type), GFP_ATOMIC); \
+			if (entry == NULL) 				\
+				return NULL;				\
+			memset(entry, 0, sizeof(type));			\
+			entry->key = id;				\
+			list_add_tail(&(entry->list), head);		\
+		}							\
+		return entry;						\
+	}								\
+
+DEFINE_FILTER_FIND_FUNC(pid_filter, pid)
+
+DEFINE_FILTER_FIND_FUNC(uid_filter, uid)
+
+DEFINE_FILTER_FIND_FUNC(gid_filter, gid)
+		
+DEFINE_FILTER_FIND_FUNC(listener, pid)
+
+static void set_fsevent_filter(struct fsevent_filter * filter, pid_t pid)
+{
+	enum fsevent_type type;
+	u32 mask = 0;
+	int control = 0;
+	listener * listenerp = NULL;
+	pid_filter * pfilter = NULL;
+	uid_filter * ufilter = NULL;
+	gid_filter * gfilter = NULL;
+
+
+	mask = filter->mask;
+	control = filter->control;
+	type = filter->type;
+	mask &= FSEVENT_MASK;
+	if (mask == 0)
+		goto out;
+
+	spin_lock(&listener_list_lock);
+	listenerp = find_listener(&listener_list_head, pid);
+	if (unlikely(listenerp == NULL)) {
+		spin_unlock(&listener_list_lock);
+		return;
+	}
+
+	if (!(listenerp->pid_filter_list_head.next)) {
+		INIT_LIST_HEAD(&(listenerp->pid_filter_list_head));
+		INIT_LIST_HEAD(&(listenerp->uid_filter_list_head));
+		INIT_LIST_HEAD(&(listenerp->gid_filter_list_head));
+	}
+
+	if ((type & FSEVENT_FILTER_ALL) == FSEVENT_FILTER_ALL) {
+		if (control == FSEVENT_FILTER_REMOVE) {
+			atomic_dec(&fsevent_listener_num);
+			list_del(&(listenerp->list));
+			kfree(listenerp);
+		} else
+			set_fsevent_mask(&(listenerp->mask), mask, control);
+	} else if ((type & FSEVENT_FILTER_PID) == FSEVENT_FILTER_PID) {
+		pfilter = find_pid_filter(&(listenerp->pid_filter_list_head),
+				filter->id.pid);
+		if (unlikely(pfilter == NULL))
+			return;
+
+		if (control == FSEVENT_FILTER_REMOVE) {
+			list_del(&(pfilter->list));
+			kfree(pfilter);
+		} else
+			set_fsevent_mask(&(pfilter->mask), mask, control);
+	} else if ((type & FSEVENT_FILTER_UID) == FSEVENT_FILTER_UID) {
+		ufilter = find_uid_filter(&(listenerp->uid_filter_list_head),
+						filter->id.uid);
+		if (unlikely(ufilter == NULL))
+			return;
+
+		if (control == FSEVENT_FILTER_REMOVE) {
+			list_del(&(ufilter->list));
+			kfree(ufilter);
+		} else
+			set_fsevent_mask(&(ufilter->mask), mask, control);
+	} else if ((type & FSEVENT_FILTER_GID) == FSEVENT_FILTER_GID) {
+		gfilter = find_gid_filter(&(listenerp->gid_filter_list_head),
+						filter->id.gid);
+		if (unlikely(gfilter == NULL))
+			return;
+
+		if (control == FSEVENT_FILTER_REMOVE) {
+			list_del(&(gfilter->list));
+			kfree(gfilter);
+		} else
+			set_fsevent_mask(&(gfilter->mask), mask, control);
+	}
+	spin_unlock(&listener_list_lock);
+
+out:
+	fsevent_ack(type, pid, 0);
+}
+
+static listener * find_fsevent_listener(pid_t pid)
+{
+	listener * listenerp = NULL;
+	spin_lock(&listener_list_lock);
+	list_for_each_entry(listenerp, &listener_list_head, list) {
+		if (listenerp->pid == pid) {
+			spin_unlock(&listener_list_lock);
+			return listenerp;
+		}
+	}
+	spin_unlock(&listener_list_lock);
+	return NULL;
+}
+
+static void cleanup_dead_listener(listener * x)
+{
+	pid_filter * p = NULL, * pq = NULL;
+	uid_filter * u = NULL, * uq = NULL;
+	gid_filter * g = NULL, * gq = NULL;
+
+	if (p == NULL)
+		return;
+
+	list_del(&(x->list));
+
+	list_for_each_entry_safe(p, pq, &(x->pid_filter_list_head), list) {
+		list_del(&(p->list));
+		kfree(p);
+	}
+
+	list_for_each_entry_safe(u, uq, &(x->uid_filter_list_head), list) {
+		list_del(&(u->list));
+		kfree(u);
+	}
+
+	list_for_each_entry_safe(g, gq, &(x->gid_filter_list_head), list) {
+		list_del(&(g->list));
+		kfree(g);
+	}
+	
+	kfree(x);
+}
+
+static void fsevent_recv(struct sock *sk, int len)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlhdr = NULL;
+	struct fsevent_filter * filter = NULL;
+	pid_t pid;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		skb_get(skb);
+		if (skb->len >= FSEVENT_FILTER_MSGSIZE) {
+			nlhdr = (struct nlmsghdr *)skb->data;
+			filter = NLMSG_DATA(nlhdr);
+			pid = NETLINK_CREDS(skb)->pid;
+			if (find_fsevent_listener(pid) == NULL)
+				atomic_inc(&fsevent_listener_num);
+			set_fsevent_filter(filter, pid);
+		}
+		kfree_skb(skb);
+	}
+}
+
+#define DEFINE_FILTER_MATCH_FUNC(filtertype, key) 			\
+	static int match_##filtertype(listener * p,			\
+				struct fsevent * event,			\
+				struct sk_buff * skb)			\
+	{								\
+		int ret = 0;						\
+		filtertype * xfilter = NULL;				\
+		struct sk_buff * skb2 = NULL;				\
+		struct list_head *  head = &(p->key##_filter_list_head);  \
+		list_for_each_entry(xfilter, head, list) {		\
+			if (xfilter->key != event->key)			\
+				continue;				\
+			ret = filter_fsevent(xfilter->mask, event->type); \
+			if ( ret != 0)					\
+				return -1;				\
+			skb2 = skb_clone(skb, GFP_KERNEL);		\
+       			if (skb2 == NULL)				\
+				return -ENOMEM;				\
+			NETLINK_CB(skb2).dst_group = 0;			\
+			NETLINK_CB(skb2).dst_pid = p->pid;		\
+			NETLINK_CB(skb2).pid = 0;			\
+			return (netlink_unicast(fsevent_sock, skb2,	\
+					p->pid, MSG_DONTWAIT));		\
+		}							\
+		return -ENODEV;						\
+	}								\
+
+DEFINE_FILTER_MATCH_FUNC(pid_filter, pid)
+
+DEFINE_FILTER_MATCH_FUNC(uid_filter, uid)
+
+DEFINE_FILTER_MATCH_FUNC(gid_filter, gid)
+
+#define MATCH_XID(key, listenerp, event, skb) 			\
+	ret = match_##key##_filter(listenerp, event, skb); 	\
+	if (ret == 0) {					 	\
+		kfree_skb(skb);				 	\
+	        continue;				 	\
+	}						 	\
+	do {} while (0)					 	\
+
+static int fsevent_send_to_process(struct sk_buff * skb)
+{
+	listener * p  = NULL, * q = NULL;
+	struct fsevent * event = NULL;
+	struct sk_buff * skb2 = NULL;
+	int ret = 0;
+
+	event = (struct fsevent *)(skb->data + sizeof(struct nlmsghdr));
+	spin_lock(&listener_list_lock);
+	list_for_each_entry_safe(p, q, &listener_list_head, list) {
+		MATCH_XID(pid, p, event, skb);
+		MATCH_XID(uid, p, event, skb);
+		MATCH_XID(gid, p, event, skb);
+
+		if (filter_fsevent(p->mask, event->type) == 0) {
+			 skb2 = skb_clone(skb, GFP_KERNEL);
+	                 if (skb2 == NULL)
+	                 	return -ENOMEM;
+	                 NETLINK_CB(skb2).dst_group = 0;
+	                 NETLINK_CB(skb2).dst_pid = p->pid;
+	                 NETLINK_CB(skb2).pid = 0;
+	                 ret = netlink_unicast(fsevent_sock, skb2,
+	                                p->pid, MSG_DONTWAIT);
+			if (ret == -ECONNREFUSED) {
+				atomic_dec(&fsevent_listener_num);
+				cleanup_dead_listener(p);
+			}
+		}
+	}
+	spin_unlock(&listener_list_lock);
+	return ret;
+}
+
+static void fsevent_commit(void * unused)
+{
+	struct sk_buff * skb = NULL;
+		
+	while((skb = skb_dequeue(&get_cpu_var(fsevent_send_queue)))
+		!= NULL) {
+		fsevent_send_to_process(skb);
+		put_cpu_var(fsevent_send_queue);
+	}
+}
+
+static struct ctl_table fsevent_mask_sysctl[] = {
+	{
+		.ctl_name = FSEVENT_MASK_CTL_NAME,
+		.procname = "fsevent_mask",
+		.data = &fsevents_mask,
+		.maxlen = sizeof(u32),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table fs_root_sysctl[] = {
+	{
+		.ctl_name = CTL_FS,
+		.procname = "fs",
+		.mode = 0555,
+		.child = fsevent_mask_sysctl,
+	},
+	{ .ctl_name = 0 }
+};
+
+static int __init fsevent_init(void)
+{
+	int cpu;
+	struct sk_buff_head * listptr;
+	struct work_struct * workptr;
+
+	fsevent_sock = netlink_kernel_create(NETLINK_FSEVENT, 0,
+					 fsevent_recv, THIS_MODULE);
+	if (!fsevent_sock)
+		return -EIO;
+	for_each_cpu(cpu) {
+		listptr = &per_cpu(fsevent_send_queue, cpu);
+		skb_queue_head_init(listptr);
+		workptr = &per_cpu(fsevent_work, cpu);
+		INIT_WORK(workptr, fsevent_commit, NULL);
+	}
+
+	if (register_sysctl_table(fs_root_sysctl, 0) == NULL)
+                return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit fsevent_exit(void)
+{
+	listener * p = NULL, * q = NULL;
+	sock_release(fsevent_sock->sk_socket);
+	spin_lock(&listener_list_lock);
+	list_for_each_entry_safe(p, q, &listener_list_head, list) {
+		cleanup_dead_listener(p);
+	}
+	spin_unlock(&listener_list_lock);
+}
+
+module_init(fsevent_init);
+module_exit(fsevent_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yi Yang <yang.y.yi@gmail.com>");
+MODULE_DESCRIPTION("File System Events Reporter");



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-05 13:21 [2.6.16 PATCH] Filessytem Events Reporter V2 Yi Yang
@ 2006-04-05 17:12 ` Carlos Silva
  2006-04-05 23:45   ` Yi Yang
  2006-04-07  6:24 ` Evgeniy Polyakov
  1 sibling, 1 reply; 14+ messages in thread
From: Carlos Silva @ 2006-04-05 17:12 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Evgeniy Polyakov, Matt Helsley

[-- Attachment #1: Type: text/plain, Size: 460 bytes --]

On Wed, 2006-04-05 at 21:21 +0800, Yi Yang wrote:
> <snip>
> +static void cleanup_dead_listener(listener * x)
> +{
> +	pid_filter * p = NULL, * pq = NULL;
> +	uid_filter * u = NULL, * uq = NULL;
> +	gid_filter * g = NULL, * gq = NULL;
> +
> +	if (p == NULL)
> +		return;
> <snip>

I think you ment "if (x == NULL)" here.  Otherwise, the condition will always be true.
btw, I'm not reviewing your code, just stumbled across this while reading it.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 200 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-05 17:12 ` Carlos Silva
@ 2006-04-05 23:45   ` Yi Yang
  0 siblings, 0 replies; 14+ messages in thread
From: Yi Yang @ 2006-04-05 23:45 UTC (permalink / raw)
  To: Carlos Silva; +Cc: LKML, Andrew Morton, Evgeniy Polyakov, Matt Helsley

Carlos Silva wrote:
> On Wed, 2006-04-05 at 21:21 +0800, Yi Yang wrote:
>   
>> <snip>
>> +static void cleanup_dead_listener(listener * x)
>> +{
>> +	pid_filter * p = NULL, * pq = NULL;
>> +	uid_filter * u = NULL, * uq = NULL;
>> +	gid_filter * g = NULL, * gq = NULL;
>> +
>> +	if (p == NULL)
>> +		return;
>> <snip>
>>     
>
> I think you ment "if (x == NULL)" here.  Otherwise, the condition will always be true.
> btw, I'm not reviewing your code, just stumbled across this while reading it.
>   
Yes, it is my mistake, thank you.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-05 13:21 [2.6.16 PATCH] Filessytem Events Reporter V2 Yi Yang
  2006-04-05 17:12 ` Carlos Silva
@ 2006-04-07  6:24 ` Evgeniy Polyakov
  2006-04-07  8:13   ` Yi Yang
  1 sibling, 1 reply; 14+ messages in thread
From: Evgeniy Polyakov @ 2006-04-07  6:24 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Matt Helsley

On Wed, Apr 05, 2006 at 09:21:26PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
> Compared with Filesystem Events Reporter v1, the following changes are done:
>   - Use workqueue/keventd instead of kfseventd. 
>   - fsevent_mask can be set by sysctl and proc
>     interface.
>   - Add missed spinlock
>   - Ensure fsevent sequence number is uniqe
> 
> This patch implements a filsystem events report facitily, Filesystem Events
> Reporter, the user can monitor filesystem activities via it, currently, it
>  can monitor access, attribute change, open, create, modify, delete,
>  move and close of any file or directory as well as mount/umount.

Comments below.

...
> +
> +static int fsevent_ack(enum fsevent_type type, pid_t pid, u32 seq)
> +{
> +	struct fsevent *event;
> +	unsigned int size;
> +	struct sk_buff * skb = NULL;
> +	struct nlmsghdr * nlhdr = NULL;
> +
> +	size = NLMSG_SPACE(sizeof(struct fsevent));
> +	                                                                                                                                       
> +	skb = alloc_skb(size, GFP_KERNEL);
> +	if (!skb)
> +	        return -ENOMEM;
> +	                                                                                                                                       
> +	nlhdr = NLMSG_PUT(skb, 0, seq, NLMSG_DONE, size - sizeof(*nlhdr));
> +	event = NLMSG_DATA(nlhdr);
> +
> +	ktime_get_ts(&event->timestamp);
> +	event->cpu = -1;
> +	event->type = type; 
> +	event->pid = 0;
> +	event->uid = 0;
> +	event->gid = 0;
> +	event->len = 0;
> +	event->pname_len = 0;
> +	event->fname_len = 0;
> +	event->new_fname_len = 0;
> +	event->err = 0;
> +	                                                                                                                                       
> +	NETLINK_CB(skb).dst_group = 0;
> +	NETLINK_CB(skb).dst_pid = pid;
> +	NETLINK_CB(skb).pid = 0;
> +
> +	return (netlink_unicast(fsevent_sock, skb, pid, MSG_DONTWAIT));

netlink_unicast() uses boolean value but ont MSG_* flags for nonblocking, 
so this should be netlink_unicast(fsevent_sock, skb, pid, 0);

> +nlmsg_failure:
> +	kfree_skb(skb);
> +	return -1;
> +}

...

> +static void fsevent_recv(struct sock *sk, int len)
> +{
> +	struct sk_buff *skb = NULL;
> +	struct nlmsghdr *nlhdr = NULL;
> +	struct fsevent_filter * filter = NULL;
> +	pid_t pid;
> +
> +	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
> +		skb_get(skb);
> +		if (skb->len >= FSEVENT_FILTER_MSGSIZE) {
> +			nlhdr = (struct nlmsghdr *)skb->data;
> +			filter = NLMSG_DATA(nlhdr);
> +			pid = NETLINK_CREDS(skb)->pid;
> +			if (find_fsevent_listener(pid) == NULL)
> +				atomic_inc(&fsevent_listener_num);
> +			set_fsevent_filter(filter, pid);

What is the logic behind this steps?
If there are no listeners you increment it's number no matter if it will
or not be added in set_fsevent_filter().

> +		}
> +		kfree_skb(skb);
> +	}
> +}
> +
> +#define DEFINE_FILTER_MATCH_FUNC(filtertype, key) 			\
> +	static int match_##filtertype(listener * p,			\
> +				struct fsevent * event,			\
> +				struct sk_buff * skb)			\
> +	{								\
> +		int ret = 0;						\
> +		filtertype * xfilter = NULL;				\
> +		struct sk_buff * skb2 = NULL;				\
> +		struct list_head *  head = &(p->key##_filter_list_head);  \
> +		list_for_each_entry(xfilter, head, list) {		\
> +			if (xfilter->key != event->key)			\
> +				continue;				\
> +			ret = filter_fsevent(xfilter->mask, event->type); \
> +			if ( ret != 0)					\
> +				return -1;				\
> +			skb2 = skb_clone(skb, GFP_KERNEL);		\
> +       			if (skb2 == NULL)				\
> +				return -ENOMEM;				\
> +			NETLINK_CB(skb2).dst_group = 0;			\
> +			NETLINK_CB(skb2).dst_pid = p->pid;		\
> +			NETLINK_CB(skb2).pid = 0;			\
> +			return (netlink_unicast(fsevent_sock, skb2,	\
> +					p->pid, MSG_DONTWAIT));		\

The same issue about nonblocking sending.

> +		}							\
> +		return -ENODEV;						\
> +	}								\
> +
> +DEFINE_FILTER_MATCH_FUNC(pid_filter, pid)
> +
> +DEFINE_FILTER_MATCH_FUNC(uid_filter, uid)
> +
> +DEFINE_FILTER_MATCH_FUNC(gid_filter, gid)
> +
> +#define MATCH_XID(key, listenerp, event, skb) 			\
> +	ret = match_##key##_filter(listenerp, event, skb); 	\
> +	if (ret == 0) {					 	\
> +		kfree_skb(skb);				 	\
> +	        continue;				 	\
> +	}						 	\
> +	do {} while (0)					 	\
> +
> +static int fsevent_send_to_process(struct sk_buff * skb)
> +{
> +	listener * p  = NULL, * q = NULL;
> +	struct fsevent * event = NULL;
> +	struct sk_buff * skb2 = NULL;
> +	int ret = 0;
> +
> +	event = (struct fsevent *)(skb->data + sizeof(struct nlmsghdr));
> +	spin_lock(&listener_list_lock);
> +	list_for_each_entry_safe(p, q, &listener_list_head, list) {
> +		MATCH_XID(pid, p, event, skb);
> +		MATCH_XID(uid, p, event, skb);
> +		MATCH_XID(gid, p, event, skb);
> +
> +		if (filter_fsevent(p->mask, event->type) == 0) {
> +			 skb2 = skb_clone(skb, GFP_KERNEL);
> +	                 if (skb2 == NULL)
> +	                 	return -ENOMEM;
> +	                 NETLINK_CB(skb2).dst_group = 0;
> +	                 NETLINK_CB(skb2).dst_pid = p->pid;
> +	                 NETLINK_CB(skb2).pid = 0;
> +	                 ret = netlink_unicast(fsevent_sock, skb2,
> +	                                p->pid, MSG_DONTWAIT);
> +			if (ret == -ECONNREFUSED) {
> +				atomic_dec(&fsevent_listener_num);
> +				cleanup_dead_listener(p);
> +			}
> +		}
> +	}
> +	spin_unlock(&listener_list_lock);
> +	return ret;
> +}
> +
> +static void fsevent_commit(void * unused)
> +{
> +	struct sk_buff * skb = NULL;
> +		
> +	while((skb = skb_dequeue(&get_cpu_var(fsevent_send_queue)))
> +		!= NULL) {
> +		fsevent_send_to_process(skb);
> +		put_cpu_var(fsevent_send_queue);
> +	}
> +}

Really strange mix of per-cpu variables for optimized performance and
global spin locking.
Consider using RCU for list of listeners.

You use unicast delivery for netlink messages. 
According to my investigation [1], it's performance is better only when
there is only one listener (or maybe two in some cases), but then it is
noticebly slower than broadcasting.

1. http://marc.theaimsgroup.com/?l=linux-netdev&m=114424884216006&w=2

> +static struct ctl_table fsevent_mask_sysctl[] = {
> +	{
> +		.ctl_name = FSEVENT_MASK_CTL_NAME,
> +		.procname = "fsevent_mask",
> +		.data = &fsevents_mask,
> +		.maxlen = sizeof(u32),
> +		.mode = 0644,
> +		.proc_handler = &proc_dointvec,
> +	},
> +	{ .ctl_name = 0 }
> +};
> +
> +static struct ctl_table fs_root_sysctl[] = {
> +	{
> +		.ctl_name = CTL_FS,
> +		.procname = "fs",
> +		.mode = 0555,
> +		.child = fsevent_mask_sysctl,
> +	},
> +	{ .ctl_name = 0 }
> +};
> +
> +static int __init fsevent_init(void)
> +{
> +	int cpu;
> +	struct sk_buff_head * listptr;
> +	struct work_struct * workptr;
> +
> +	fsevent_sock = netlink_kernel_create(NETLINK_FSEVENT, 0,
> +					 fsevent_recv, THIS_MODULE);
> +	if (!fsevent_sock)
> +		return -EIO;
> +	for_each_cpu(cpu) {
> +		listptr = &per_cpu(fsevent_send_queue, cpu);
> +		skb_queue_head_init(listptr);
> +		workptr = &per_cpu(fsevent_work, cpu);
> +		INIT_WORK(workptr, fsevent_commit, NULL);
> +	}

Btw, you need some rebalancing of the per-cpu queues, probably in
keventd, since CPUs can go offline and your messages will stuck foreve
there.

> +	if (register_sysctl_table(fs_root_sysctl, 0) == NULL)
> +                return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void __exit fsevent_exit(void)
> +{
> +	listener * p = NULL, * q = NULL;
> +	sock_release(fsevent_sock->sk_socket);
> +	spin_lock(&listener_list_lock);
> +	list_for_each_entry_safe(p, q, &listener_list_head, list) {
> +		cleanup_dead_listener(p);
> +	}
> +	spin_unlock(&listener_list_lock);

Broken. Your work can be pending on this stage, but you already removed
and freed resources.

> +}
> +
> +module_init(fsevent_init);
> +module_exit(fsevent_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Yi Yang <yang.y.yi@gmail.com>");
> +MODULE_DESCRIPTION("File System Events Reporter");
> 

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07  6:24 ` Evgeniy Polyakov
@ 2006-04-07  8:13   ` Yi Yang
  2006-04-07  9:47     ` Evgeniy Polyakov
  0 siblings, 1 reply; 14+ messages in thread
From: Yi Yang @ 2006-04-07  8:13 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: LKML, Andrew Morton, Matt Helsley

Evgeniy Polyakov wrote:
> On Wed, Apr 05, 2006 at 09:21:26PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
>   
>> Compared with Filesystem Events Reporter v1, the following changes are done:
>>   - Use workqueue/keventd instead of kfseventd. 
>>   - fsevent_mask can be set by sysctl and proc
>>     interface.
>>   - Add missed spinlock
>>   - Ensure fsevent sequence number is uniqe
>>
>> This patch implements a filsystem events report facitily, Filesystem Events
>> Reporter, the user can monitor filesystem activities via it, currently, it
>>  can monitor access, attribute change, open, create, modify, delete,
>>  move and close of any file or directory as well as mount/umount.
>>     
>
> Comments below.
>
> ...
>   
>> +
>> +static int fsevent_ack(enum fsevent_type type, pid_t pid, u32 seq)
>> +{
>> +	struct fsevent *event;
>> +	unsigned int size;
>> +	struct sk_buff * skb = NULL;
>> +	struct nlmsghdr * nlhdr = NULL;
>> +
>> +	size = NLMSG_SPACE(sizeof(struct fsevent));
>> +	                                                                                                                                       
>> +	skb = alloc_skb(size, GFP_KERNEL);
>> +	if (!skb)
>> +	        return -ENOMEM;
>> +	                                                                                                                                       
>> +	nlhdr = NLMSG_PUT(skb, 0, seq, NLMSG_DONE, size - sizeof(*nlhdr));
>> +	event = NLMSG_DATA(nlhdr);
>> +
>> +	ktime_get_ts(&event->timestamp);
>> +	event->cpu = -1;
>> +	event->type = type; 
>> +	event->pid = 0;
>> +	event->uid = 0;
>> +	event->gid = 0;
>> +	event->len = 0;
>> +	event->pname_len = 0;
>> +	event->fname_len = 0;
>> +	event->new_fname_len = 0;
>> +	event->err = 0;
>> +	                                                                                                                                       
>> +	NETLINK_CB(skb).dst_group = 0;
>> +	NETLINK_CB(skb).dst_pid = pid;
>> +	NETLINK_CB(skb).pid = 0;
>> +
>> +	return (netlink_unicast(fsevent_sock, skb, pid, MSG_DONTWAIT));
>>     
>
> netlink_unicast() uses boolean value but ont MSG_* flags for nonblocking, 
> so this should be netlink_unicast(fsevent_sock, skb, pid, 0);
>   
a example invocation in file net/netlink/af_netlink.c:
 netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
so, it hasn't any problem.
>   
>> +nlmsg_failure:
>> +	kfree_skb(skb);
>> +	return -1;
>> +}
>>     
>
> ...
>
>   
>> +static void fsevent_recv(struct sock *sk, int len)
>> +{
>> +	struct sk_buff *skb = NULL;
>> +	struct nlmsghdr *nlhdr = NULL;
>> +	struct fsevent_filter * filter = NULL;
>> +	pid_t pid;
>> +
>> +	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
>> +		skb_get(skb);
>> +		if (skb->len >= FSEVENT_FILTER_MSGSIZE) {
>> +			nlhdr = (struct nlmsghdr *)skb->data;
>> +			filter = NLMSG_DATA(nlhdr);
>> +			pid = NETLINK_CREDS(skb)->pid;
>> +			if (find_fsevent_listener(pid) == NULL)
>> +				atomic_inc(&fsevent_listener_num);
>> +			set_fsevent_filter(filter, pid);
>>     
>
> What is the logic behind this steps?
> If there are no listeners you increment it's number no matter if it will
> or not be added in set_fsevent_filter().
>   
fsevent_recv is used to receive listener's commands, a listener must 
send commands in order to get fsevents it
interests, so this is the best point to increment number of listeners. 
set_fsevent_filter will add listener to listener
list, so it is OK.
>   
>> +		}
>> +		kfree_skb(skb);
>> +	}
>> +}
>> +
>> +#define DEFINE_FILTER_MATCH_FUNC(filtertype, key) 			\
>> +	static int match_##filtertype(listener * p,			\
>> +				struct fsevent * event,			\
>> +				struct sk_buff * skb)			\
>> +	{								\
>> +		int ret = 0;						\
>> +		filtertype * xfilter = NULL;				\
>> +		struct sk_buff * skb2 = NULL;				\
>> +		struct list_head *  head = &(p->key##_filter_list_head);  \
>> +		list_for_each_entry(xfilter, head, list) {		\
>> +			if (xfilter->key != event->key)			\
>> +				continue;				\
>> +			ret = filter_fsevent(xfilter->mask, event->type); \
>> +			if ( ret != 0)					\
>> +				return -1;				\
>> +			skb2 = skb_clone(skb, GFP_KERNEL);		\
>> +       			if (skb2 == NULL)				\
>> +				return -ENOMEM;				\
>> +			NETLINK_CB(skb2).dst_group = 0;			\
>> +			NETLINK_CB(skb2).dst_pid = p->pid;		\
>> +			NETLINK_CB(skb2).pid = 0;			\
>> +			return (netlink_unicast(fsevent_sock, skb2,	\
>> +					p->pid, MSG_DONTWAIT));		\
>>     
>
> The same issue about nonblocking sending.
>
>   
>> +		}							\
>> +		return -ENODEV;						\
>> +	}								\
>> +
>> +DEFINE_FILTER_MATCH_FUNC(pid_filter, pid)
>> +
>> +DEFINE_FILTER_MATCH_FUNC(uid_filter, uid)
>> +
>> +DEFINE_FILTER_MATCH_FUNC(gid_filter, gid)
>> +
>> +#define MATCH_XID(key, listenerp, event, skb) 			\
>> +	ret = match_##key##_filter(listenerp, event, skb); 	\
>> +	if (ret == 0) {					 	\
>> +		kfree_skb(skb);				 	\
>> +	        continue;				 	\
>> +	}						 	\
>> +	do {} while (0)					 	\
>> +
>> +static int fsevent_send_to_process(struct sk_buff * skb)
>> +{
>> +	listener * p  = NULL, * q = NULL;
>> +	struct fsevent * event = NULL;
>> +	struct sk_buff * skb2 = NULL;
>> +	int ret = 0;
>> +
>> +	event = (struct fsevent *)(skb->data + sizeof(struct nlmsghdr));
>> +	spin_lock(&listener_list_lock);
>> +	list_for_each_entry_safe(p, q, &listener_list_head, list) {
>> +		MATCH_XID(pid, p, event, skb);
>> +		MATCH_XID(uid, p, event, skb);
>> +		MATCH_XID(gid, p, event, skb);
>> +
>> +		if (filter_fsevent(p->mask, event->type) == 0) {
>> +			 skb2 = skb_clone(skb, GFP_KERNEL);
>> +	                 if (skb2 == NULL)
>> +	                 	return -ENOMEM;
>> +	                 NETLINK_CB(skb2).dst_group = 0;
>> +	                 NETLINK_CB(skb2).dst_pid = p->pid;
>> +	                 NETLINK_CB(skb2).pid = 0;
>> +	                 ret = netlink_unicast(fsevent_sock, skb2,
>> +	                                p->pid, MSG_DONTWAIT);
>> +			if (ret == -ECONNREFUSED) {
>> +				atomic_dec(&fsevent_listener_num);
>> +				cleanup_dead_listener(p);
>> +			}
>> +		}
>> +	}
>> +	spin_unlock(&listener_list_lock);
>> +	return ret;
>> +}
>> +
>> +static void fsevent_commit(void * unused)
>> +{
>> +	struct sk_buff * skb = NULL;
>> +		
>> +	while((skb = skb_dequeue(&get_cpu_var(fsevent_send_queue)))
>> +		!= NULL) {
>> +		fsevent_send_to_process(skb);
>> +		put_cpu_var(fsevent_send_queue);
>> +	}
>> +}
>>     
>
> Really strange mix of per-cpu variables for optimized performance and
> global spin locking.
> Consider using RCU for list of listeners.
>   
per cpu queue is used to avoid raise_fsevent to contend spinlock, but 
listener_list_lock just is used
 to synchronize the operations of userspace applications(listener) on 
listener list, it just protect listener
 list.

Of course, your advice is good, RCU will be better, I'm considering 
substitute spinlock with RCU,
maybe list*_rcu  functions can help me.
> You use unicast delivery for netlink messages. 
> According to my investigation [1], it's performance is better only when
> there is only one listener (or maybe two in some cases), but then it is
> noticebly slower than broadcasting.
>
> 1. http://marc.theaimsgroup.com/?l=linux-netdev&m=114424884216006&w=2
>   
Because fsevent has to deliver different events to different listeners, 
so I must use netlink_unicast,
in fact, netlink_broadcast also must send skb to every member of the 
group, so in my opinion,
they haven't  big difference.

Can you explain why there is such a big difference between 
netlink_unicast and netlink_broadcast?
>   
>> +static struct ctl_table fsevent_mask_sysctl[] = {
>> +	{
>> +		.ctl_name = FSEVENT_MASK_CTL_NAME,
>> +		.procname = "fsevent_mask",
>> +		.data = &fsevents_mask,
>> +		.maxlen = sizeof(u32),
>> +		.mode = 0644,
>> +		.proc_handler = &proc_dointvec,
>> +	},
>> +	{ .ctl_name = 0 }
>> +};
>> +
>> +static struct ctl_table fs_root_sysctl[] = {
>> +	{
>> +		.ctl_name = CTL_FS,
>> +		.procname = "fs",
>> +		.mode = 0555,
>> +		.child = fsevent_mask_sysctl,
>> +	},
>> +	{ .ctl_name = 0 }
>> +};
>> +
>> +static int __init fsevent_init(void)
>> +{
>> +	int cpu;
>> +	struct sk_buff_head * listptr;
>> +	struct work_struct * workptr;
>> +
>> +	fsevent_sock = netlink_kernel_create(NETLINK_FSEVENT, 0,
>> +					 fsevent_recv, THIS_MODULE);
>> +	if (!fsevent_sock)
>> +		return -EIO;
>> +	for_each_cpu(cpu) {
>> +		listptr = &per_cpu(fsevent_send_queue, cpu);
>> +		skb_queue_head_init(listptr);
>> +		workptr = &per_cpu(fsevent_work, cpu);
>> +		INIT_WORK(workptr, fsevent_commit, NULL);
>> +	}
>>     
>
> Btw, you need some rebalancing of the per-cpu queues, probably in
> keventd, since CPUs can go offline and your messages will stuck foreve
> there.
>   
Does keventd not do it? if so, keventd should be modified.
>   
>> +	if (register_sysctl_table(fs_root_sysctl, 0) == NULL)
>> +                return -ENOMEM;
>> +
>> +	return 0;
>> +}
>> +
>> +static void __exit fsevent_exit(void)
>> +{
>> +	listener * p = NULL, * q = NULL;
>> +	sock_release(fsevent_sock->sk_socket);
>> +	spin_lock(&listener_list_lock);
>> +	list_for_each_entry_safe(p, q, &listener_list_head, list) {
>> +		cleanup_dead_listener(p);
>> +	}
>> +	spin_unlock(&listener_list_lock);
>>     
>
> Broken. Your work can be pending on this stage, but you already removed
> and freed resources.
>   
Sorry, I don't consider it, I will add it, thank you very much.
>   
>> +}
>> +
>> +module_init(fsevent_init);
>> +module_exit(fsevent_exit);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Yi Yang <yang.y.yi@gmail.com>");
>> +MODULE_DESCRIPTION("File System Events Reporter");
>>
>>     
>
>   


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07  8:13   ` Yi Yang
@ 2006-04-07  9:47     ` Evgeniy Polyakov
  2006-04-07 10:03       ` Yi Yang
  0 siblings, 1 reply; 14+ messages in thread
From: Evgeniy Polyakov @ 2006-04-07  9:47 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Matt Helsley

On Fri, Apr 07, 2006 at 04:13:45PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
> >>+
> >>+	return (netlink_unicast(fsevent_sock, skb, pid, MSG_DONTWAIT));
> >>    
> >
> >netlink_unicast() uses boolean value but ont MSG_* flags for nonblocking, 
> >so this should be netlink_unicast(fsevent_sock, skb, pid, 0);
> >  
> a example invocation in file net/netlink/af_netlink.c:
> netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
> so, it hasn't any problem.

Well...

static inline long sock_sndtimeo(const struct sock *sk, int noblock)
{
	return noblock ? 0 : sk->sk_sndtimeo;
}

int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
{
	struct sock *sk;
	int err;
	long timeo;

	skb = netlink_trim(skb, gfp_any());

	timeo = sock_sndtimeo(ssk, nonblock);

I mean that it is boolean value, MSG_PEEK will produce the same result.
But it is a matter of coding style probably.

> >>+nlmsg_failure:
> >>+	kfree_skb(skb);
> >>+	return -1;
> >>+}
> >>    
> >
> >...
> >
> >  
> >>+static void fsevent_recv(struct sock *sk, int len)
> >>+{
> >>+	struct sk_buff *skb = NULL;
> >>+	struct nlmsghdr *nlhdr = NULL;
> >>+	struct fsevent_filter * filter = NULL;
> >>+	pid_t pid;
> >>+
> >>+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
> >>+		skb_get(skb);
> >>+		if (skb->len >= FSEVENT_FILTER_MSGSIZE) {
> >>+			nlhdr = (struct nlmsghdr *)skb->data;
> >>+			filter = NLMSG_DATA(nlhdr);
> >>+			pid = NETLINK_CREDS(skb)->pid;
> >>+			if (find_fsevent_listener(pid) == NULL)
> >>+				atomic_inc(&fsevent_listener_num);
> >>+			set_fsevent_filter(filter, pid);
> >>    
> >
> >What is the logic behind this steps?
> >If there are no listeners you increment it's number no matter if it will
> >or not be added in set_fsevent_filter().
> >  
> fsevent_recv is used to receive listener's commands, a listener must 
> send commands in order to get fsevents it
> interests, so this is the best point to increment number of listeners. 
> set_fsevent_filter will add listener to listener
> list, so it is OK.

And what if set_fsevent_filter() fails?

> >>+		}
> >>+		kfree_skb(skb);
> >>+	}
> >>+}
> >>+
> >>+#define DEFINE_FILTER_MATCH_FUNC(filtertype, key) 			\
> >>+	static int match_##filtertype(listener * p,			\
> >>+				struct fsevent * event,			\
> >>+				struct sk_buff * skb)			\
> >>+	{								\
> >>+		int ret = 0;						\
> >>+		filtertype * xfilter = NULL;				\
> >>+		struct sk_buff * skb2 = NULL;				\
> >>+		struct list_head *  head = &(p->key##_filter_list_head);  \
> >>+		list_for_each_entry(xfilter, head, list) {		\
> >>+			if (xfilter->key != event->key)			\
> >>+				continue;				\
> >>+			ret = filter_fsevent(xfilter->mask, event->type); \
> >>+			if ( ret != 0)					\
> >>+				return -1;				\
> >>+			skb2 = skb_clone(skb, GFP_KERNEL);		\
> >>+       			if (skb2 == NULL)			 \
> >>+				return -ENOMEM;				\
> >>+			NETLINK_CB(skb2).dst_group = 0;			\
> >>+			NETLINK_CB(skb2).dst_pid = p->pid;		\
> >>+			NETLINK_CB(skb2).pid = 0;			\
> >>+			return (netlink_unicast(fsevent_sock, skb2,	\
> >>+					p->pid, MSG_DONTWAIT));		\
> >>    
> >
> >The same issue about nonblocking sending.
> >
> >  
> >>+		}							\
> >>+		return -ENODEV;						\
> >>+	}								\
> >>+
> >>+DEFINE_FILTER_MATCH_FUNC(pid_filter, pid)
> >>+
> >>+DEFINE_FILTER_MATCH_FUNC(uid_filter, uid)
> >>+
> >>+DEFINE_FILTER_MATCH_FUNC(gid_filter, gid)
> >>+
> >>+#define MATCH_XID(key, listenerp, event, skb) 			\
> >>+	ret = match_##key##_filter(listenerp, event, skb); 	\
> >>+	if (ret == 0) {					 	\
> >>+		kfree_skb(skb);				 	\
> >>+	        continue;				 	\
> >>+	}						 	\
> >>+	do {} while (0)					 	\
> >>+
> >>+static int fsevent_send_to_process(struct sk_buff * skb)
> >>+{
> >>+	listener * p  = NULL, * q = NULL;
> >>+	struct fsevent * event = NULL;
> >>+	struct sk_buff * skb2 = NULL;
> >>+	int ret = 0;
> >>+
> >>+	event = (struct fsevent *)(skb->data + sizeof(struct nlmsghdr));
> >>+	spin_lock(&listener_list_lock);
> >>+	list_for_each_entry_safe(p, q, &listener_list_head, list) {
> >>+		MATCH_XID(pid, p, event, skb);
> >>+		MATCH_XID(uid, p, event, skb);
> >>+		MATCH_XID(gid, p, event, skb);
> >>+
> >>+		if (filter_fsevent(p->mask, event->type) == 0) {
> >>+			 skb2 = skb_clone(skb, GFP_KERNEL);
> >>+	                 if (skb2 == NULL)
> >>+	                 	return -ENOMEM;
> >>+	                 NETLINK_CB(skb2).dst_group = 0;
> >>+	                 NETLINK_CB(skb2).dst_pid = p->pid;
> >>+	                 NETLINK_CB(skb2).pid = 0;
> >>+	                 ret = netlink_unicast(fsevent_sock, skb2,
> >>+	                                p->pid, MSG_DONTWAIT);
> >>+			if (ret == -ECONNREFUSED) {
> >>+				atomic_dec(&fsevent_listener_num);
> >>+				cleanup_dead_listener(p);
> >>+			}
> >>+		}
> >>+	}
> >>+	spin_unlock(&listener_list_lock);
> >>+	return ret;
> >>+}
> >>+
> >>+static void fsevent_commit(void * unused)
> >>+{
> >>+	struct sk_buff * skb = NULL;
> >>+		
> >>+	while((skb = skb_dequeue(&get_cpu_var(fsevent_send_queue)))
> >>+		!= NULL) {
> >>+		fsevent_send_to_process(skb);
> >>+		put_cpu_var(fsevent_send_queue);
> >>+	}
> >>+}
> >>    
> >
> >Really strange mix of per-cpu variables for optimized performance and
> >global spin locking.
> >Consider using RCU for list of listeners.
> >  
> per cpu queue is used to avoid raise_fsevent to contend spinlock, but 
> listener_list_lock just is used
> to synchronize the operations of userspace applications(listener) on 
> listener list, it just protect listener
> list.
> 
> Of course, your advice is good, RCU will be better, I'm considering 
> substitute spinlock with RCU,
> maybe list*_rcu  functions can help me.

You get global lock in each processor when traverse the list
&listener_list_lock.

And you call GFP_KERNEL allocation under that lock, which is wrong.

If all your code is called from process context (it looks so), you
could mutexes.

> >You use unicast delivery for netlink messages. 
> >According to my investigation [1], it's performance is better only when
> >there is only one listener (or maybe two in some cases), but then it is
> >noticebly slower than broadcasting.
> >
> >1. http://marc.theaimsgroup.com/?l=linux-netdev&m=114424884216006&w=2
> >  
> Because fsevent has to deliver different events to different listeners, 
> so I must use netlink_unicast,
> in fact, netlink_broadcast also must send skb to every member of the 
> group, so in my opinion,
> they haven't  big difference.

And what if there are several listeners for the same type of events?

> Can you explain why there is such a big difference between 
> netlink_unicast and netlink_broadcast?

Netlink broadcast clones skbs, while unicasting requires the whole new
one.

> >Btw, you need some rebalancing of the per-cpu queues, probably in
> >keventd, since CPUs can go offline and your messages will stuck foreve
> >there.
> >  
> Does keventd not do it? if so, keventd should be modified.

How does keventd know about your own structures?
You have an per-cpu object, but your keventd function gets object 
from running cpu, not from any other cpus.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07  9:47     ` Evgeniy Polyakov
@ 2006-04-07 10:03       ` Yi Yang
  2006-04-07 10:26         ` Evgeniy Polyakov
  0 siblings, 1 reply; 14+ messages in thread
From: Yi Yang @ 2006-04-07 10:03 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: LKML, Andrew Morton, Matt Helsley

Evgeniy Polyakov wrote:
> On Fri, Apr 07, 2006 at 04:13:45PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
>   
>>>> +
>>>> +	return (netlink_unicast(fsevent_sock, skb, pid, MSG_DONTWAIT));
>>>>    
>>>>         
>>> netlink_unicast() uses boolean value but ont MSG_* flags for nonblocking, 
>>> so this should be netlink_unicast(fsevent_sock, skb, pid, 0);
>>>  
>>>       
>> a example invocation in file net/netlink/af_netlink.c:
>> netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
>> so, it hasn't any problem.
>>     
>
> Well...
>
> static inline long sock_sndtimeo(const struct sock *sk, int noblock)
> {
> 	return noblock ? 0 : sk->sk_sndtimeo;
> }
>
> int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
> {
> 	struct sock *sk;
> 	int err;
> 	long timeo;
>
> 	skb = netlink_trim(skb, gfp_any());
>
> 	timeo = sock_sndtimeo(ssk, nonblock);
>
> I mean that it is boolean value, MSG_PEEK will produce the same result.
> But it is a matter of coding style probably.
>
>   
>>>> +nlmsg_failure:
>>>> +	kfree_skb(skb);
>>>> +	return -1;
>>>> +}
>>>>    
>>>>         
>>> ...
>>>
>>>  
>>>       
>>>> +static void fsevent_recv(struct sock *sk, int len)
>>>> +{
>>>> +	struct sk_buff *skb = NULL;
>>>> +	struct nlmsghdr *nlhdr = NULL;
>>>> +	struct fsevent_filter * filter = NULL;
>>>> +	pid_t pid;
>>>> +
>>>> +	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
>>>> +		skb_get(skb);
>>>> +		if (skb->len >= FSEVENT_FILTER_MSGSIZE) {
>>>> +			nlhdr = (struct nlmsghdr *)skb->data;
>>>> +			filter = NLMSG_DATA(nlhdr);
>>>> +			pid = NETLINK_CREDS(skb)->pid;
>>>> +			if (find_fsevent_listener(pid) == NULL)
>>>> +				atomic_inc(&fsevent_listener_num);
>>>> +			set_fsevent_filter(filter, pid);
>>>>    
>>>>         
>>> What is the logic behind this steps?
>>> If there are no listeners you increment it's number no matter if it will
>>> or not be added in set_fsevent_filter().
>>>  
>>>       
>> fsevent_recv is used to receive listener's commands, a listener must 
>> send commands in order to get fsevents it
>> interests, so this is the best point to increment number of listeners. 
>> set_fsevent_filter will add listener to listener
>> list, so it is OK.
>>     
>
> And what if set_fsevent_filter() fails?
>   
I didn't consider this case, thanks, I will do with it.
>   
>>>> +		}
>>>> +		kfree_skb(skb);
>>>> +	}
>>>> +}
>>>> +
>>>> +#define DEFINE_FILTER_MATCH_FUNC(filtertype, key) 			\
>>>> +	static int match_##filtertype(listener * p,			\
>>>> +				struct fsevent * event,			\
>>>> +				struct sk_buff * skb)			\
>>>> +	{								\
>>>> +		int ret = 0;						\
>>>> +		filtertype * xfilter = NULL;				\
>>>> +		struct sk_buff * skb2 = NULL;				\
>>>> +		struct list_head *  head = &(p->key##_filter_list_head);  \
>>>> +		list_for_each_entry(xfilter, head, list) {		\
>>>> +			if (xfilter->key != event->key)			\
>>>> +				continue;				\
>>>> +			ret = filter_fsevent(xfilter->mask, event->type); \
>>>> +			if ( ret != 0)					\
>>>> +				return -1;				\
>>>> +			skb2 = skb_clone(skb, GFP_KERNEL);		\
>>>> +       			if (skb2 == NULL)			 \
>>>> +				return -ENOMEM;				\
>>>> +			NETLINK_CB(skb2).dst_group = 0;			\
>>>> +			NETLINK_CB(skb2).dst_pid = p->pid;		\
>>>> +			NETLINK_CB(skb2).pid = 0;			\
>>>> +			return (netlink_unicast(fsevent_sock, skb2,	\
>>>> +					p->pid, MSG_DONTWAIT));		\
>>>>    
>>>>         
>>> The same issue about nonblocking sending.
>>>
>>>  
>>>       
>>>> +		}							\
>>>> +		return -ENODEV;						\
>>>> +	}								\
>>>> +
>>>> +DEFINE_FILTER_MATCH_FUNC(pid_filter, pid)
>>>> +
>>>> +DEFINE_FILTER_MATCH_FUNC(uid_filter, uid)
>>>> +
>>>> +DEFINE_FILTER_MATCH_FUNC(gid_filter, gid)
>>>> +
>>>> +#define MATCH_XID(key, listenerp, event, skb) 			\
>>>> +	ret = match_##key##_filter(listenerp, event, skb); 	\
>>>> +	if (ret == 0) {					 	\
>>>> +		kfree_skb(skb);				 	\
>>>> +	        continue;				 	\
>>>> +	}						 	\
>>>> +	do {} while (0)					 	\
>>>> +
>>>> +static int fsevent_send_to_process(struct sk_buff * skb)
>>>> +{
>>>> +	listener * p  = NULL, * q = NULL;
>>>> +	struct fsevent * event = NULL;
>>>> +	struct sk_buff * skb2 = NULL;
>>>> +	int ret = 0;
>>>> +
>>>> +	event = (struct fsevent *)(skb->data + sizeof(struct nlmsghdr));
>>>> +	spin_lock(&listener_list_lock);
>>>> +	list_for_each_entry_safe(p, q, &listener_list_head, list) {
>>>> +		MATCH_XID(pid, p, event, skb);
>>>> +		MATCH_XID(uid, p, event, skb);
>>>> +		MATCH_XID(gid, p, event, skb);
>>>> +
>>>> +		if (filter_fsevent(p->mask, event->type) == 0) {
>>>> +			 skb2 = skb_clone(skb, GFP_KERNEL);
>>>> +	                 if (skb2 == NULL)
>>>> +	                 	return -ENOMEM;
>>>> +	                 NETLINK_CB(skb2).dst_group = 0;
>>>> +	                 NETLINK_CB(skb2).dst_pid = p->pid;
>>>> +	                 NETLINK_CB(skb2).pid = 0;
>>>> +	                 ret = netlink_unicast(fsevent_sock, skb2,
>>>> +	                                p->pid, MSG_DONTWAIT);
>>>> +			if (ret == -ECONNREFUSED) {
>>>> +				atomic_dec(&fsevent_listener_num);
>>>> +				cleanup_dead_listener(p);
>>>> +			}
>>>> +		}
>>>> +	}
>>>> +	spin_unlock(&listener_list_lock);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static void fsevent_commit(void * unused)
>>>> +{
>>>> +	struct sk_buff * skb = NULL;
>>>> +		
>>>> +	while((skb = skb_dequeue(&get_cpu_var(fsevent_send_queue)))
>>>> +		!= NULL) {
>>>> +		fsevent_send_to_process(skb);
>>>> +		put_cpu_var(fsevent_send_queue);
>>>> +	}
>>>> +}
>>>>    
>>>>         
>>> Really strange mix of per-cpu variables for optimized performance and
>>> global spin locking.
>>> Consider using RCU for list of listeners.
>>>  
>>>       
>> per cpu queue is used to avoid raise_fsevent to contend spinlock, but 
>> listener_list_lock just is used
>> to synchronize the operations of userspace applications(listener) on 
>> listener list, it just protect listener
>> list.
>>
>> Of course, your advice is good, RCU will be better, I'm considering 
>> substitute spinlock with RCU,
>> maybe list*_rcu  functions can help me.
>>     
>
> You get global lock in each processor when traverse the list
> &listener_list_lock.
>
> And you call GFP_KERNEL allocation under that lock, which is wrong.
>
> If all your code is called from process context (it looks so), you
> could mutexes.
>   
Yes, mutex should be the best choice.
>   
>>> You use unicast delivery for netlink messages. 
>>> According to my investigation [1], it's performance is better only when
>>> there is only one listener (or maybe two in some cases), but then it is
>>> noticebly slower than broadcasting.
>>>
>>> 1. http://marc.theaimsgroup.com/?l=linux-netdev&m=114424884216006&w=2
>>>  
>>>       
>> Because fsevent has to deliver different events to different listeners, 
>> so I must use netlink_unicast,
>> in fact, netlink_broadcast also must send skb to every member of the 
>> group, so in my opinion,
>> they haven't  big difference.
>>     
>
> And what if there are several listeners for the same type of events?
>
>   
>> Can you explain why there is such a big difference between 
>> netlink_unicast and netlink_broadcast?
>>     
>
> Netlink broadcast clones skbs, while unicasting requires the whole new
> one.
>   
No, I also use clone to send skb, so they should have the same overhead.
>   
>>> Btw, you need some rebalancing of the per-cpu queues, probably in
>>> keventd, since CPUs can go offline and your messages will stuck foreve
>>> there.
>>>  
>>>       
>> Does keventd not do it? if so, keventd should be modified.
>>     
>
> How does keventd know about your own structures?
> You have an per-cpu object, but your keventd function gets object 
> from running cpu, not from any other cpus.
>
>   


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07 10:03       ` Yi Yang
@ 2006-04-07 10:26         ` Evgeniy Polyakov
  2006-04-07 15:14           ` Yi Yang
  0 siblings, 1 reply; 14+ messages in thread
From: Evgeniy Polyakov @ 2006-04-07 10:26 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Matt Helsley

On Fri, Apr 07, 2006 at 06:03:04PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
> >>Can you explain why there is such a big difference between 
> >>netlink_unicast and netlink_broadcast?
> >>    
> >
> >Netlink broadcast clones skbs, while unicasting requires the whole new
> >one.
> >  
> No, I also use clone to send skb, so they should have the same overhead.

I missed that.
After rereading fsevent_send_to_process() I do not see how original skb
is freed though.

> >>>Btw, you need some rebalancing of the per-cpu queues, probably in
> >>>keventd, since CPUs can go offline and your messages will stuck foreve
> >>>there.
> >>> 
> >>>      
> >>Does keventd not do it? if so, keventd should be modified.
> >>    
> >
> >How does keventd know about your own structures?
> >You have an per-cpu object, but your keventd function gets object 
> >from running cpu, not from any other cpus.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07 10:26         ` Evgeniy Polyakov
@ 2006-04-07 15:14           ` Yi Yang
  2006-04-07 19:47             ` Evgeniy Polyakov
  0 siblings, 1 reply; 14+ messages in thread
From: Yi Yang @ 2006-04-07 15:14 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: LKML, Andrew Morton, Matt Helsley

Evgeniy Polyakov 写道:
> On Fri, Apr 07, 2006 at 06:03:04PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
>   
>>>> Can you explain why there is such a big difference between 
>>>> netlink_unicast and netlink_broadcast?
>>>>    
>>>>         
>>> Netlink broadcast clones skbs, while unicasting requires the whole new
>>> one.
>>>  
>>>       
>> No, I also use clone to send skb, so they should have the same overhead.
>>     
>
> I missed that.
> After rereading fsevent_send_to_process() I do not see how original skb
> is freed though.
>   
I'm considering how to free it, because cloned skbs share data with 
original skb, so this case is special,
I try to clarify the logic of kfree_skb.
>   
>>>>> Btw, you need some rebalancing of the per-cpu queues, probably in
>>>>> keventd, since CPUs can go offline and your messages will stuck foreve
>>>>> there.
>>>>>
>>>>>      
>>>>>           
>>>> Does keventd not do it? if so, keventd should be modified.
>>>>    
>>>>         
>>> How does keventd know about your own structures?
>>> You have an per-cpu object, but your keventd function gets object 
>>>       
>> >from running cpu, not from any other cpus.
>>     
>
>   


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07 15:14           ` Yi Yang
@ 2006-04-07 19:47             ` Evgeniy Polyakov
  2006-04-07 20:27               ` Evgeniy Polyakov
  0 siblings, 1 reply; 14+ messages in thread
From: Evgeniy Polyakov @ 2006-04-07 19:47 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Matt Helsley

On Fri, Apr 07, 2006 at 11:14:27PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
> Evgeniy Polyakov ??????:
> >On Fri, Apr 07, 2006 at 06:03:04PM +0800, Yi Yang (yang.y.yi@gmail.com) 
> >wrote:
> >  
> >>>>Can you explain why there is such a big difference between 
> >>>>netlink_unicast and netlink_broadcast?
> >>>>   
> >>>>        
> >>>Netlink broadcast clones skbs, while unicasting requires the whole new
> >>>one.
> >>> 
> >>>      
> >>No, I also use clone to send skb, so they should have the same overhead.
> >>    
> >
> >I missed that.
> >After rereading fsevent_send_to_process() I do not see how original skb
> >is freed though.
> >  
> I'm considering how to free it, because cloned skbs share data with 
> original skb, so this case is special,
> I try to clarify the logic of kfree_skb.

Just call kfree_skb() after fsevent_send_to_process() or at the very
end of this function. If unicast delivering fails you also need to free cloned skb.

> >  
> >>>>>Btw, you need some rebalancing of the per-cpu queues, probably in
> >>>>>keventd, since CPUs can go offline and your messages will stuck foreve
> >>>>>there.
> >>>>>
> >>>>>     
> >>>>>          
> >>>>Does keventd not do it? if so, keventd should be modified.
> >>>>   
> >>>>        
> >>>How does keventd know about your own structures?
> >>>You have an per-cpu object, but your keventd function gets object 
> >>>      
> >>>from running cpu, not from any other cpus.
> >>    
> >
> >  

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-07 19:47             ` Evgeniy Polyakov
@ 2006-04-07 20:27               ` Evgeniy Polyakov
  0 siblings, 0 replies; 14+ messages in thread
From: Evgeniy Polyakov @ 2006-04-07 20:27 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Matt Helsley

On Fri, Apr 07, 2006 at 11:47:16PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> On Fri, Apr 07, 2006 at 11:14:27PM +0800, Yi Yang (yang.y.yi@gmail.com) wrote:
> > Evgeniy Polyakov ??????:
> > >On Fri, Apr 07, 2006 at 06:03:04PM +0800, Yi Yang (yang.y.yi@gmail.com) 
> > >wrote:
> > >  
> > >>>>Can you explain why there is such a big difference between 
> > >>>>netlink_unicast and netlink_broadcast?
> > >>>>   
> > >>>>        
> > >>>Netlink broadcast clones skbs, while unicasting requires the whole new
> > >>>one.
> > >>> 
> > >>>      
> > >>No, I also use clone to send skb, so they should have the same overhead.
> > >>    
> > >
> > >I missed that.
> > >After rereading fsevent_send_to_process() I do not see how original skb
> > >is freed though.
> > >  
> > I'm considering how to free it, because cloned skbs share data with 
> > original skb, so this case is special,
> > I try to clarify the logic of kfree_skb.
> 
> Just call kfree_skb() after fsevent_send_to_process() or at the very
> end of this function. If unicast delivering fails you also need to free cloned skb.

For clarification: I mean if any error happens before netlink_unicast()
call, you need to free skb. If netlink_unicast() is called, it will take
care of skb in any case.

> > >  
> > >>>>>Btw, you need some rebalancing of the per-cpu queues, probably in
> > >>>>>keventd, since CPUs can go offline and your messages will stuck foreve
> > >>>>>there.
> > >>>>>
> > >>>>>     
> > >>>>>          
> > >>>>Does keventd not do it? if so, keventd should be modified.
> > >>>>   
> > >>>>        
> > >>>How does keventd know about your own structures?
> > >>>You have an per-cpu object, but your keventd function gets object 
> > >>>      
> > >>>from running cpu, not from any other cpus.
> > >>    
> > >
> > >  
> 
> -- 
> 	Evgeniy Polyakov

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [2.6.16 PATCH] Filessytem Events Reporter V2
@ 2006-04-06 14:34 Michael Guo
  0 siblings, 0 replies; 14+ messages in thread
From: Michael Guo @ 2006-04-06 14:34 UTC (permalink / raw)
  To: Yi Yang; +Cc: LKML, Andrew Morton, Evgeniy Polyakov, Matt Helsley

Ok, when kernel developers throw "everything" to libraries and let them encapsulate "everything", it sounds like old story: hardware engineers hope software engineers to do more things, however, software engineers think it reservedly. And for application engineers, it is also unfair because they have to follow new features added by libraries even worse have to update their programs from time to time. Who knows library programmers never change their own interface in order to keep to trace new kernel due to similar functionalities but different interfaces. 

               Kernel size UP -> Library size UP -> Application size UP (chain reaction)
            
I don't know if the time when users complain that their systems are running slowly and slowly and memory is becoming scarce even if fastest CPU (whatever single or multiple cores) and tons of memory is coming very soon. By the way, it is pleasure to discuss the problem with you


-----Original Message-----
From: Yi Yang [mailto:yang.y.yi@gmail.com]
Sent: Wednesday, April 05, 2006 7:44 PM
To: Michael Guo
Cc: LKML; Andrew Morton; Evgeniy Polyakov; Matt Helsley
Subject: Re: [2.6.16 PATCH] Filessytem Events Reporter V2


Michael Guo wrote:
> Hi,
>   Now, kernel is growing bigger and bigger continuously and performance is becoming slower. So, if possible, please consider to add a 
> common and simple interface which is scalable and flexible to satisfy real requirement of users instead of telling users use this or that like Microsoft. In a word, simple makes application programmers happy!
>
>
> Guo
>   
Your requirement should be done by a userspace library, in fact, most of
functions provided by kernel are exported
to the final application in this way

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [2.6.16 PATCH] Filessytem Events Reporter V2
  2006-04-05 14:13 Michael Guo
@ 2006-04-05 23:44 ` Yi Yang
  0 siblings, 0 replies; 14+ messages in thread
From: Yi Yang @ 2006-04-05 23:44 UTC (permalink / raw)
  To: Michael Guo; +Cc: LKML, Andrew Morton, Evgeniy Polyakov, Matt Helsley

Michael Guo wrote:
> Hi,
>   Now, kernel is growing bigger and bigger continuously and performance is becoming slower. So, if possible, please consider to add a 
> common and simple interface which is scalable and flexible to satisfy real requirement of users instead of telling users use this or that like Microsoft. In a word, simple makes application programmers happy!
>
>
> Guo
>   
Your requirement should be done by a userspace library, in fact, most of
functions provided by kernel are exported
to the final application in this way

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [2.6.16 PATCH] Filessytem Events Reporter V2
@ 2006-04-05 14:13 Michael Guo
  2006-04-05 23:44 ` Yi Yang
  0 siblings, 1 reply; 14+ messages in thread
From: Michael Guo @ 2006-04-05 14:13 UTC (permalink / raw)
  To: Yi Yang, LKML; +Cc: Andrew Morton, Evgeniy Polyakov, Matt Helsley

Hi,
  Now, kernel is growing bigger and bigger continuously and performance is becoming slower. So, if possible, please consider to add a 
common and simple interface which is scalable and flexible to satisfy real requirement of users instead of telling users use this or that like Microsoft. In a word, simple makes application programmers happy!


Guo
 

-----Original Message-----
From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-owner@vger.kernel.org]On Behalf Of Yi Yang
Sent: Wednesday, April 05, 2006 9:21 AM
To: LKML
Cc: Andrew Morton; Evgeniy Polyakov; Matt Helsley
Subject: [2.6.16 PATCH] Filessytem Events Reporter V2


Compared with Filesystem Events Reporter v1, the following changes are done:
  - Use workqueue/keventd instead of kfseventd. 
  - fsevent_mask can be set by sysctl and proc
    interface.
  - Add missed spinlock
  - Ensure fsevent sequence number is uniqe

This patch implements a filsystem events report facitily, Filesystem Events
Reporter, the user can monitor filesystem activities via it, currently, it
 can monitor access, attribute change, open, create, modify, delete,
 move and close of any file or directory as well as mount/umount.

Every filesystem event will include tgid, uid and gid of the process
 which triggered this event, process name, file or directory name 
operated by it.

Filesystem Events Reporter is never a duplicate of inotify, inotify
 just concerns change on file or directory, Beagle uses it to watch
 file changes in order to regenerate index for it, inotify can't tell
 us who did that change and what is its process name, but filesystem
 events reporter can do these, moreover inotify's overhead is greater
 than filesystem events reporter, inotify needs compare inode with 
watched file or directories list to decide whether it should generate an
 inotify_event, some locks also increase overhead, filesystem event 
connector hasn't these overhead, it just generates a fsevent and send.

To be important, filesystem event reporter doesn't add any new system 
call, the user space application can make use of it by netlink socket, 
but inotify added several system calls, many events mechanism in kernel
 have used netlink as communication way with user space, for example, 
KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
 to unify events interface to netlink, the user space application can use 
it very easy.

Every application using fsevent can set its own fsevent filter list 
without effect in other applications using fsevent, there are three
filter lists, they are pid filter list, uid filetr list and gid 
filter list, respectively, moreover, there is a fsevent mask used to
control those fsevents which fail to match  three filter lists, an 
application using fsevent can listen those fsevents it want to 
monitor and ignore those fsevents it doesn't interest in by set 
series of filters, there is a fsevent mask used to take effects on
all the applications using fsevent, it can be set by sysctl and proc
interface.

 fs/Kconfig               |   10 
 fs/Makefile              |    1 
 fs/fsevent.c             |  598 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/namespace.c           |   12 
 include/linux/fsevent.h  |  131 ++++++++++
 include/linux/fsnotify.h |   37 ++
 include/linux/netlink.h  |    1 
 7 files changed, 790 insertions(+)

Signed-off-by: Yi Yang <yang.y.yi@gmail.com>

--- a/include/linux/netlink.h.orig	2006-03-31 22:58:50.000000000 +0800
+++ b/include/linux/netlink.h	2006-04-05 20:34:23.000000000 +0800
@@ -21,6 +21,7 @@
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
 #define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
 #define NETLINK_GENERIC		16
+#define NETLINK_FSEVENT		17	/* File system events to userspace */
 
 #define MAX_LINKS 32		
 
--- a/include/linux/fsnotify.h.orig	2006-01-03 11:21:10.000000000 +0800
+++ b/include/linux/fsnotify.h	2006-04-03 22:12:52.000000000 +0800
@@ -15,6 +15,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/fsevent.h>
 
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -45,6 +46,8 @@ static inline void fsnotify_move(struct 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
 	}
+	raise_fsevent_move(old_dir, old_name, new_dir, new_name,
+			   FSEVENT_MOVE | (isdir?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -56,6 +59,8 @@ static inline void fsnotify_nameremove(s
 		isdir = IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
 	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	raise_fsevent(dentry,
+		      FSEVENT_DELETE | (isdir?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -74,6 +79,7 @@ static inline void fsnotify_create(struc
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+	raise_fsevent_create(inode, name, FSEVENT_CREATE);
 }
 
 /*
@@ -83,6 +89,8 @@ static inline void fsnotify_mkdir(struct
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+	raise_fsevent_create(inode, name,
+			     FSEVENT_CREATE | FSEVENT_ISDIR);
 }
 
 /*
@@ -99,6 +107,8 @@ static inline void fsnotify_access(struc
 	dnotify_parent(dentry, DN_ACCESS);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_ACCESS |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -115,6 +125,8 @@ static inline void fsnotify_modify(struc
 	dnotify_parent(dentry, DN_MODIFY);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_MODIFY |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -130,6 +142,9 @@ static inline void fsnotify_open(struct 
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);	
+	raise_fsevent(dentry, FSEVENT_OPEN |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
+
 }
 
 /*
@@ -148,6 +163,8 @@ static inline void fsnotify_close(struct
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_CLOSE |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -163,6 +180,8 @@ static inline void fsnotify_xattr(struct
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL);
+	raise_fsevent(dentry, FSEVENT_MODIFY_ATTRIB |
+				((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
 }
 
 /*
@@ -213,6 +232,24 @@ static inline void fsnotify_change(struc
 		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
 						  dentry->d_name.name);
 	}
+
+#ifdef CONFIG_FS_EVENTS
+	{
+	u32 fsevent_mask = 0;
+	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
+		fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
+	if ((ia_valid & ATTR_ATIME) && (ia_valid & ATTR_MTIME))
+		fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
+	else if (ia_valid & ATTR_ATIME)
+		fsevent_mask |= FSEVENT_ACCESS;
+	else if (ia_valid & ATTR_MTIME)
+		fsevent_mask |= FSEVENT_MODIFY;
+	if (ia_valid & ATTR_SIZE)
+		fsevent_mask |= FSEVENT_MODIFY;
+	if (fsevent_mask)
+		raise_fsevent(dentry, fsevent_mask);
+	}
+#endif /* CONFIG_FS_EVENTS */
 }
 
 #ifdef CONFIG_INOTIFY	/* inotify helpers */
--- a/fs/namespace.c.orig	2006-03-25 23:13:42.000000000 +0800
+++ b/fs/namespace.c	2006-04-03 22:12:52.000000000 +0800
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
+#include <linux/fsevent.h>
 #include "pnode.h"
 
 extern int __init init_rootfs(void);
@@ -612,6 +613,13 @@ asmlinkage long sys_umount(char __user *
 		goto dput_and_out;
 
 	retval = do_umount(nd.mnt, flags);
+
+	if (retval == 0) {
+		char * tmp = getname(name);
+		raise_fsevent_umount(tmp);
+		putname(tmp);
+	}
+
 dput_and_out:
 	path_release_on_umount(&nd);
 out:
@@ -1459,6 +1467,10 @@ asmlinkage long sys_mount(char __user * 
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
 	unlock_kernel();
+
+	if (retval == 0)
+		raise_fsevent_mount((char *)dev_page, dir_page);
+
 	free_page(data_page);
 
 out3:
--- a/fs/Kconfig.orig	2006-03-31 21:23:20.000000000 +0800
+++ b/fs/Kconfig	2006-04-03 22:12:52.000000000 +0800
@@ -405,6 +405,16 @@ config INOTIFY
 
 	  If unsure, say Y.
 
+config FS_EVENTS
+	tristate "Report filesystem events to userspace"
+	---help---
+	  Provide a facility that reports filesystem events to userspace. The
+	  reported event include access, write, utime, chmod, chown, chgrp,
+	  close, open, create, rename, unlink, mkdir, rmdir, mount, umount.
+
+	  The user can set filesystem events filter to filter its events, so
+	  that he just get those events he concerns.
+
 config QUOTA
 	bool "Quota support"
 	help
--- a/fs/Makefile.orig	2006-03-31 21:23:33.000000000 +0800
+++ b/fs/Makefile	2006-04-03 22:12:52.000000000 +0800
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioprio.o pnode.o drop_caches.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_FS_EVENTS)		+= fsevent.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
--- /dev/null	2003-01-30 18:24:37.000000000 +0800
+++ b/include/linux/fsevent.h	2006-04-04 23:18:01.000000000 +0800
@@ -0,0 +1,131 @@
+/*
+ * fsevent.h - filesystem events connector
+ *
+ * Copyright (C) 2006 Yi Yang <yang.y.yi@gmail.com>
+ * Based on cn_proc.h by Matt Helsley, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LINUX_FSEVENT_H
+#define LINUX_FSEVENT_H
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/netlink.h>
+
+enum  fsevent_type {
+	FSEVENT_ACCESS = 	0x00000001,	/* File was accessed */
+	FSEVENT_MODIFY = 	0x00000002,	/* File was modified */
+	FSEVENT_MODIFY_ATTRIB = 0x00000004,	/* Metadata changed */
+	FSEVENT_CLOSE = 	0x00000008,	/* File was closed */
+	FSEVENT_OPEN = 		0x00000010,	/* File was opened */
+	FSEVENT_MOVE = 		0x00000020,	/* File was moved */
+	FSEVENT_CREATE = 	0x00000040,	/* File was created */
+	FSEVENT_DELETE =	0x00000080,	/* File was deleted */
+	FSEVENT_MOUNT =		0x00000100,	/* File system is mounted */
+	FSEVENT_UMOUNT =	0x00000200,	/* File system is umounted */
+
+	/* The following definitions are command types for fsevent filter
+	 * or acknowlege types of the corresponding commands
+	 */
+	FSEVENT_FILTER_ALL = 	0x08000000,	/* For all events */
+	FSEVENT_FILTER_PID = 	0x10000000,	/* For some process ID */
+	FSEVENT_FILTER_UID = 	0x20000000,	/* For some user ID */
+	FSEVENT_FILTER_GID =	0x40000000,	/* For some group ID */
+
+	FSEVENT_ISDIR = 	0x80000000	/* It is set for a dir */
+};
+
+#define FSEVENT_MASK 0x800003ff
+
+typedef unsigned long fsevent_mask_t;
+
+enum filter_control {
+	FSEVENT_FILTER_LISTEN = 1,		/* Listen fsevents mask defines*/
+	FSEVENT_FILTER_IGNORE ,		/* Ignore fsevents mask defines*/
+	FSEVENT_FILTER_REMOVE,			/* Remove a given filter */
+};	
+
+struct fsevent_filter {
+	/* filter type, it just is one of them
+	 * FSEVENT_FILTER_ALL
+	 * FSEVENT_FILTER_PID
+	 * FSEVENT_FILTER_UID
+	 * FSEVENT_FILTER_GID
+	 */
+	enum fsevent_type type;	/* filter type */
+
+	/* mask of file system events the user listen or ignore
+	 * if the user need to ignore all the events of some pid
+	 * , gid or uid, he(she) must set mask to FSEVENT_MASK.
+	 */ 
+	fsevent_mask_t mask;
+	union {
+		pid_t pid;
+		uid_t uid;
+		gid_t gid;
+	} id;
+
+	enum filter_control control;
+};
+
+struct fsevent {
+	__u32 type;
+	__u32 cpu;
+	struct timespec timestamp;
+	pid_t pid;
+	uid_t uid;
+	gid_t gid;
+	int err;
+	__u32 len;
+	__u32 pname_len;
+	__u32 fname_len;
+	__u32 new_fname_len;
+	char name[0];
+};
+
+#define FSEVENT_FILTER_MSGSIZE \
+	(sizeof(struct fsevent_filter) + sizeof(struct nlmsghdr))
+
+#ifdef __KERNEL__
+#ifdef CONFIG_FS_EVENTS
+extern void raise_fsevent(struct dentry * dentryp, u32 mask);
+extern void raise_fsevent_move(struct inode * olddir, const char * oldname, 
+		struct inode * newdir, const char * newname, u32 mask);
+extern void raise_fsevent_create(struct inode * inode, 
+		const char * name, u32 mask);
+extern void raise_fsevent_mount(const char * devname, const char * mountpoint);
+extern void raise_fsevent_umount(const char * mountpoint);
+#else
+static void raise_fsevent(struct dentry * dentryp,  u32 mask)
+{}
+
+static void raise_fsevent_move(struct inode * olddir, const char * oldname, 
+		struct inode * newdir, const char * newname, u32 mask)
+{}
+
+static void raise_fsevent_create(struct inode * inode, 
+		const char * name, u32 mask)
+{}
+
+static void raise_fsevent_mount(const char * devname, const char * mountpoint)
+{}
+
+static void raise_fsevent_umount(const char * mountpoint)
+{}
+#endif	/* CONFIG_FS_EVENTS */
+#endif	/* __KERNEL__ */
+#endif	/* LINUX_FSEVENT_H */
--- /dev/null	2003-01-30 18:24:37.000000000 +0800
+++ b/fs/fsevent.c	2006-04-05 20:52:53.000000000 +0800
@@ -0,0 +1,598 @@
+/*
+ * 	fsevent.c
+ * 
+ * 2006 Copyright (c) Yi Yang <yang.y.yi@gmail.com>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/moduleparam.h>
+#include <linux/fsevent.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/sysctl.h>
+
+#define FSEVENT_MASK_CTL_NAME -2
+
+static DEFINE_PER_CPU(struct sk_buff_head, fsevent_send_queue);
+static DEFINE_PER_CPU(struct work_struct, fsevent_work);
+
+typedef struct pid_filter {
+	pid_t pid;
+	u32 mask;
+	struct list_head list;
+} pid_filter;
+
+typedef struct uid_filter {
+	uid_t uid;
+	u32 mask;
+	struct list_head list;
+} uid_filter;
+
+typedef struct gid_filter {
+	gid_t gid;
+	u32 mask;
+	struct list_head list;
+} gid_filter;
+
+typedef struct fsevent_listener {
+	pid_t pid;
+	struct list_head pid_filter_list_head;
+	struct list_head uid_filter_list_head;
+	struct list_head gid_filter_list_head;
+	u32 mask;
+	struct list_head list;
+} listener;
+
+	
+/* The netlink socket. */
+static struct sock * fsevent_sock = NULL;
+static LIST_HEAD(listener_list_head);
+static DEFINE_SPINLOCK(listener_list_lock);
+
+static atomic_t fsevent_count = ATOMIC_INIT(0);
+static int fsevent_burst_limit = 100;
+static int fsevent_ratelimit = 5 * HZ;
+static unsigned long last = 0;
+static int fsevent_sum = 0;
+static u32 fsevents_mask = FSEVENT_MASK;
+static atomic_t fsevent_listener_num = ATOMIC_INIT(0);
+
+static inline void get_seq(__u32 *ts, int *cpu)
+{
+	*ts = atomic_inc_return(&fsevent_count);
+	*cpu = smp_processor_id();
+}
+
+static void append_string(char **dest, const char *src, size_t len)
+{
+	strncpy(*dest, src, len);
+	(*dest)[len] = '\0';
+	*dest += len + 1;
+}
+
+static inline int filter_fsevent(u32 filter_mask, u32 event_mask)
+{
+	event_mask &= FSEVENT_MASK;
+	event_mask &= filter_mask;
+	if (event_mask == 0) {
+		return -1;
+	}
+	return 0;
+}
+
+static int filter_fsevent_all(u32 * mask)
+{
+	int ret = 0;
+
+	(*mask) &= FSEVENT_MASK;
+
+	if ((((*mask) & FSEVENT_ISDIR) == FSEVENT_ISDIR)
+		 && ((fsevents_mask & FSEVENT_ISDIR) == 0)) {
+		ret = -1;
+		goto out;
+	}
+
+	(*mask) &= fsevents_mask;
+	if ((*mask) == 0) {
+		ret = -5;
+	}
+
+out:
+	return ret;
+}
+
+static void fsevent_send(struct sk_buff * skb)
+{
+	struct sk_buff_head * head = &get_cpu_var(fsevent_send_queue);
+	skb_queue_tail(head, skb);
+	schedule_work(&per_cpu(fsevent_work, smp_processor_id()));
+	put_cpu_var(fsevent_send_queue);
+}
+
+int __raise_fsevent(const char * oldname, const char * newname, u32 mask)
+{
+	struct fsevent *event;
+	int namelen = 0;
+	char * nameptr = NULL;
+	unsigned int size;
+	struct nlmsghdr * nlhdr;
+	struct sk_buff * skb = NULL;
+
+	if (filter_fsevent_all(&mask) != 0)
+		return -1;
+
+	if (atomic_read(&fsevent_listener_num) <= 0)
+		return -1;
+
+	if (jiffies - last <= fsevent_ratelimit) {
+		if (fsevent_sum > fsevent_burst_limit)
+			return -2;
+		fsevent_sum++;
+	} else {
+		last = jiffies;
+		fsevent_sum = 0;
+	}
+
+	namelen = strlen(current->comm) + strlen(oldname) + 2;
+	if (newname)
+		namelen += strlen(newname) + 1;
+
+	size = NLMSG_SPACE(sizeof(struct fsevent) + namelen);
+	                                                                                                                                       
+	skb = alloc_skb(size, GFP_KERNEL);
+	if (!skb)
+	        return -ENOMEM;
+	                                                                                                                                       
+	nlhdr = NLMSG_PUT(skb, 0, 0, NLMSG_DONE, size - sizeof(*nlhdr));
+	event = NLMSG_DATA(nlhdr);
+
+	get_seq(&(nlhdr->nlmsg_seq), &event->cpu);
+	ktime_get_ts(&event->timestamp);
+	event->type = mask;
+	event->pid = current->tgid;
+	event->uid = current->uid;
+	event->gid = current->gid;
+	nameptr = event->name;
+	event->pname_len = strlen(current->comm);
+	append_string(&nameptr, current->comm, event->pname_len);
+	event->fname_len = strlen(oldname);
+	append_string(&nameptr, oldname, event->fname_len);
+	event->len = event->pname_len +  event->fname_len + 2;
+	event->new_fname_len = 0;
+	if (newname) {
+		event->new_fname_len = strlen(newname);
+		append_string(&nameptr, newname, event->new_fname_len);
+		event->len += event->new_fname_len + 1;
+	}
+	fsevent_send(skb);
+	return 0;
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+void raise_fsevent(struct dentry * dentryp, u32 mask)
+{
+	if (dentryp->d_inode && (MAJOR(dentryp->d_inode->i_rdev) == 4))
+		return;
+	__raise_fsevent(dentryp->d_name.name, NULL, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent);
+
+void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
+{
+	__raise_fsevent(name, NULL, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent_create);
+
+void raise_fsevent_move(struct inode * olddir, const char * oldname, 
+		struct inode * newdir, const char * newname, u32 mask)
+{
+	__raise_fsevent(oldname, newname, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent_move);
+
+void raise_fsevent_mount(const char * devname, const char * mountpoint)
+{
+	__raise_fsevent(devname, mountpoint, FSEVENT_MOUNT);
+}
+
+void raise_fsevent_umount(const char * mountpoint)
+{
+	__raise_fsevent(mountpoint, NULL, FSEVENT_UMOUNT);
+}
+
+static int fsevent_ack(enum fsevent_type type, pid_t pid, u32 seq)
+{
+	struct fsevent *event;
+	unsigned int size;
+	struct sk_buff * skb = NULL;
+	struct nlmsghdr * nlhdr = NULL;
+
+	size = NLMSG_SPACE(sizeof(struct fsevent));
+	                                                                                                                                       
+	skb = alloc_skb(size, GFP_KERNEL);
+	if (!skb)
+	        return -ENOMEM;
+	                                                                                                                                       
+	nlhdr = NLMSG_PUT(skb, 0, seq, NLMSG_DONE, size - sizeof(*nlhdr));
+	event = NLMSG_DATA(nlhdr);
+
+	ktime_get_ts(&event->timestamp);
+	event->cpu = -1;
+	event->type = type; 
+	event->pid = 0;
+	event->uid = 0;
+	event->gid = 0;
+	event->len = 0;
+	event->pname_len = 0;
+	event->fname_len = 0;
+	event->new_fname_len = 0;
+	event->err = 0;
+	                                                                                                                                       
+	NETLINK_CB(skb).dst_group = 0;
+	NETLINK_CB(skb).dst_pid = pid;
+	NETLINK_CB(skb).pid = 0;
+
+	return (netlink_unicast(fsevent_sock, skb, pid, MSG_DONTWAIT));
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static void set_fsevent_mask(u32 * to_mask, u32 from_mask, int mode)
+{
+	if (mode == FSEVENT_FILTER_IGNORE)
+		(*to_mask) &= ~(from_mask);
+	else if (mode == FSEVENT_FILTER_LISTEN)
+		(*to_mask) |= from_mask;
+}
+
+#define DEFINE_FILTER_FIND_FUNC(type, key) 				\
+	type * find_##type(struct list_head * head, key##_t id)		\
+	{								\
+		int alloc_flag = 1;					\
+		type * entry = NULL;					\
+									\
+		list_for_each_entry(entry, head, list) {		\
+			if (entry->key == id) {				\
+				alloc_flag = 0;				\
+				break;					\
+			}						\
+		}							\
+									\
+		if (alloc_flag == 1) {					\
+			entry  = (type *)kmalloc(sizeof(type), GFP_ATOMIC); \
+			if (entry == NULL) 				\
+				return NULL;				\
+			memset(entry, 0, sizeof(type));			\
+			entry->key = id;				\
+			list_add_tail(&(entry->list), head);		\
+		}							\
+		return entry;						\
+	}								\
+
+DEFINE_FILTER_FIND_FUNC(pid_filter, pid)
+
+DEFINE_FILTER_FIND_FUNC(uid_filter, uid)
+
+DEFINE_FILTER_FIND_FUNC(gid_filter, gid)
+		
+DEFINE_FILTER_FIND_FUNC(listener, pid)
+
+static void set_fsevent_filter(struct fsevent_filter * filter, pid_t pid)
+{
+	enum fsevent_type type;
+	u32 mask = 0;
+	int control = 0;
+	listener * listenerp = NULL;
+	pid_filter * pfilter = NULL;
+	uid_filter * ufilter = NULL;
+	gid_filter * gfilter = NULL;
+
+
+	mask = filter->mask;
+	control = filter->control;
+	type = filter->type;
+	mask &= FSEVENT_MASK;
+	if (mask == 0)
+		goto out;
+
+	spin_lock(&listener_list_lock);
+	listenerp = find_listener(&listener_list_head, pid);
+	if (unlikely(listenerp == NULL)) {
+		spin_unlock(&listener_list_lock);
+		return;
+	}
+
+	if (!(listenerp->pid_filter_list_head.next)) {
+		INIT_LIST_HEAD(&(listenerp->pid_filter_list_head));
+		INIT_LIST_HEAD(&(listenerp->uid_filter_list_head));
+		INIT_LIST_HEAD(&(listenerp->gid_filter_list_head));
+	}
+
+	if ((type & FSEVENT_FILTER_ALL) == FSEVENT_FILTER_ALL) {
+		if (control == FSEVENT_FILTER_REMOVE) {
+			atomic_dec(&fsevent_listener_num);
+			list_del(&(listenerp->list));
+			kfree(listenerp);
+		} else
+			set_fsevent_mask(&(listenerp->mask), mask, control);
+	} else if ((type & FSEVENT_FILTER_PID) == FSEVENT_FILTER_PID) {
+		pfilter = find_pid_filter(&(listenerp->pid_filter_list_head),
+				filter->id.pid);
+		if (unlikely(pfilter == NULL))
+			return;
+
+		if (control == FSEVENT_FILTER_REMOVE) {
+			list_del(&(pfilter->list));
+			kfree(pfilter);
+		} else
+			set_fsevent_mask(&(pfilter->mask), mask, control);
+	} else if ((type & FSEVENT_FILTER_UID) == FSEVENT_FILTER_UID) {
+		ufilter = find_uid_filter(&(listenerp->uid_filter_list_head),
+						filter->id.uid);
+		if (unlikely(ufilter == NULL))
+			return;
+
+		if (control == FSEVENT_FILTER_REMOVE) {
+			list_del(&(ufilter->list));
+			kfree(ufilter);
+		} else
+			set_fsevent_mask(&(ufilter->mask), mask, control);
+	} else if ((type & FSEVENT_FILTER_GID) == FSEVENT_FILTER_GID) {
+		gfilter = find_gid_filter(&(listenerp->gid_filter_list_head),
+						filter->id.gid);
+		if (unlikely(gfilter == NULL))
+			return;
+
+		if (control == FSEVENT_FILTER_REMOVE) {
+			list_del(&(gfilter->list));
+			kfree(gfilter);
+		} else
+			set_fsevent_mask(&(gfilter->mask), mask, control);
+	}
+	spin_unlock(&listener_list_lock);
+
+out:
+	fsevent_ack(type, pid, 0);
+}
+
+static listener * find_fsevent_listener(pid_t pid)
+{
+	listener * listenerp = NULL;
+	spin_lock(&listener_list_lock);
+	list_for_each_entry(listenerp, &listener_list_head, list) {
+		if (listenerp->pid == pid) {
+			spin_unlock(&listener_list_lock);
+			return listenerp;
+		}
+	}
+	spin_unlock(&listener_list_lock);
+	return NULL;
+}
+
+static void cleanup_dead_listener(listener * x)
+{
+	pid_filter * p = NULL, * pq = NULL;
+	uid_filter * u = NULL, * uq = NULL;
+	gid_filter * g = NULL, * gq = NULL;
+
+	if (p == NULL)
+		return;
+
+	list_del(&(x->list));
+
+	list_for_each_entry_safe(p, pq, &(x->pid_filter_list_head), list) {
+		list_del(&(p->list));
+		kfree(p);
+	}
+
+	list_for_each_entry_safe(u, uq, &(x->uid_filter_list_head), list) {
+		list_del(&(u->list));
+		kfree(u);
+	}
+
+	list_for_each_entry_safe(g, gq, &(x->gid_filter_list_head), list) {
+		list_del(&(g->list));
+		kfree(g);
+	}
+	
+	kfree(x);
+}
+
+static void fsevent_recv(struct sock *sk, int len)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlhdr = NULL;
+	struct fsevent_filter * filter = NULL;
+	pid_t pid;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		skb_get(skb);
+		if (skb->len >= FSEVENT_FILTER_MSGSIZE) {
+			nlhdr = (struct nlmsghdr *)skb->data;
+			filter = NLMSG_DATA(nlhdr);
+			pid = NETLINK_CREDS(skb)->pid;
+			if (find_fsevent_listener(pid) == NULL)
+				atomic_inc(&fsevent_listener_num);
+			set_fsevent_filter(filter, pid);
+		}
+		kfree_skb(skb);
+	}
+}
+
+#define DEFINE_FILTER_MATCH_FUNC(filtertype, key) 			\
+	static int match_##filtertype(listener * p,			\
+				struct fsevent * event,			\
+				struct sk_buff * skb)			\
+	{								\
+		int ret = 0;						\
+		filtertype * xfilter = NULL;				\
+		struct sk_buff * skb2 = NULL;				\
+		struct list_head *  head = &(p->key##_filter_list_head);  \
+		list_for_each_entry(xfilter, head, list) {		\
+			if (xfilter->key != event->key)			\
+				continue;				\
+			ret = filter_fsevent(xfilter->mask, event->type); \
+			if ( ret != 0)					\
+				return -1;				\
+			skb2 = skb_clone(skb, GFP_KERNEL);		\
+       			if (skb2 == NULL)				\
+				return -ENOMEM;				\
+			NETLINK_CB(skb2).dst_group = 0;			\
+			NETLINK_CB(skb2).dst_pid = p->pid;		\
+			NETLINK_CB(skb2).pid = 0;			\
+			return (netlink_unicast(fsevent_sock, skb2,	\
+					p->pid, MSG_DONTWAIT));		\
+		}							\
+		return -ENODEV;						\
+	}								\
+
+DEFINE_FILTER_MATCH_FUNC(pid_filter, pid)
+
+DEFINE_FILTER_MATCH_FUNC(uid_filter, uid)
+
+DEFINE_FILTER_MATCH_FUNC(gid_filter, gid)
+
+#define MATCH_XID(key, listenerp, event, skb) 			\
+	ret = match_##key##_filter(listenerp, event, skb); 	\
+	if (ret == 0) {					 	\
+		kfree_skb(skb);				 	\
+	        continue;				 	\
+	}						 	\
+	do {} while (0)					 	\
+
+static int fsevent_send_to_process(struct sk_buff * skb)
+{
+	listener * p  = NULL, * q = NULL;
+	struct fsevent * event = NULL;
+	struct sk_buff * skb2 = NULL;
+	int ret = 0;
+
+	event = (struct fsevent *)(skb->data + sizeof(struct nlmsghdr));
+	spin_lock(&listener_list_lock);
+	list_for_each_entry_safe(p, q, &listener_list_head, list) {
+		MATCH_XID(pid, p, event, skb);
+		MATCH_XID(uid, p, event, skb);
+		MATCH_XID(gid, p, event, skb);
+
+		if (filter_fsevent(p->mask, event->type) == 0) {
+			 skb2 = skb_clone(skb, GFP_KERNEL);
+	                 if (skb2 == NULL)
+	                 	return -ENOMEM;
+	                 NETLINK_CB(skb2).dst_group = 0;
+	                 NETLINK_CB(skb2).dst_pid = p->pid;
+	                 NETLINK_CB(skb2).pid = 0;
+	                 ret = netlink_unicast(fsevent_sock, skb2,
+	                                p->pid, MSG_DONTWAIT);
+			if (ret == -ECONNREFUSED) {
+				atomic_dec(&fsevent_listener_num);
+				cleanup_dead_listener(p);
+			}
+		}
+	}
+	spin_unlock(&listener_list_lock);
+	return ret;
+}
+
+static void fsevent_commit(void * unused)
+{
+	struct sk_buff * skb = NULL;
+		
+	while((skb = skb_dequeue(&get_cpu_var(fsevent_send_queue)))
+		!= NULL) {
+		fsevent_send_to_process(skb);
+		put_cpu_var(fsevent_send_queue);
+	}
+}
+
+static struct ctl_table fsevent_mask_sysctl[] = {
+	{
+		.ctl_name = FSEVENT_MASK_CTL_NAME,
+		.procname = "fsevent_mask",
+		.data = &fsevents_mask,
+		.maxlen = sizeof(u32),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table fs_root_sysctl[] = {
+	{
+		.ctl_name = CTL_FS,
+		.procname = "fs",
+		.mode = 0555,
+		.child = fsevent_mask_sysctl,
+	},
+	{ .ctl_name = 0 }
+};
+
+static int __init fsevent_init(void)
+{
+	int cpu;
+	struct sk_buff_head * listptr;
+	struct work_struct * workptr;
+
+	fsevent_sock = netlink_kernel_create(NETLINK_FSEVENT, 0,
+					 fsevent_recv, THIS_MODULE);
+	if (!fsevent_sock)
+		return -EIO;
+	for_each_cpu(cpu) {
+		listptr = &per_cpu(fsevent_send_queue, cpu);
+		skb_queue_head_init(listptr);
+		workptr = &per_cpu(fsevent_work, cpu);
+		INIT_WORK(workptr, fsevent_commit, NULL);
+	}
+
+	if (register_sysctl_table(fs_root_sysctl, 0) == NULL)
+                return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit fsevent_exit(void)
+{
+	listener * p = NULL, * q = NULL;
+	sock_release(fsevent_sock->sk_socket);
+	spin_lock(&listener_list_lock);
+	list_for_each_entry_safe(p, q, &listener_list_head, list) {
+		cleanup_dead_listener(p);
+	}
+	spin_unlock(&listener_list_lock);
+}
+
+module_init(fsevent_init);
+module_exit(fsevent_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yi Yang <yang.y.yi@gmail.com>");
+MODULE_DESCRIPTION("File System Events Reporter");


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2006-04-07 20:27 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-04-05 13:21 [2.6.16 PATCH] Filessytem Events Reporter V2 Yi Yang
2006-04-05 17:12 ` Carlos Silva
2006-04-05 23:45   ` Yi Yang
2006-04-07  6:24 ` Evgeniy Polyakov
2006-04-07  8:13   ` Yi Yang
2006-04-07  9:47     ` Evgeniy Polyakov
2006-04-07 10:03       ` Yi Yang
2006-04-07 10:26         ` Evgeniy Polyakov
2006-04-07 15:14           ` Yi Yang
2006-04-07 19:47             ` Evgeniy Polyakov
2006-04-07 20:27               ` Evgeniy Polyakov
2006-04-05 14:13 Michael Guo
2006-04-05 23:44 ` Yi Yang
2006-04-06 14:34 Michael Guo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).