All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sysctl: add support for poll()
@ 2011-06-01 12:14 Lucas De Marchi
  2011-06-02  2:51 ` Lucas De Marchi
  0 siblings, 1 reply; 30+ messages in thread
From: Lucas De Marchi @ 2011-06-01 12:14 UTC (permalink / raw)
  To: linux-kernel; +Cc: kay.sievers, Lucas De Marchi

Adding support for poll() in sysctl fs allows userspace to receive
notifications when an entry in sysctl changes. This way it's possible to
know when hostname/domainname is changed due to the respective syscall
has been called or its file under /proc/sys has been written to.

Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
---
 fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h  |    8 ++++++++
 include/linux/utsname.h |   16 ++++++++++++++++
 kernel/sys.c            |    2 ++
 kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
 5 files changed, 102 insertions(+), 0 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f50133c..2e5d3ec 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
  */
 #include <linux/init.h>
 #include <linux/sysctl.h>
+#include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/namei.h>
@@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
 	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
 }
 
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	if (table->poll) {
+		unsigned long event = atomic_read(&table->poll->event);
+
+		filp->private_data = (void *)event;
+	}
+
+	return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	unsigned long event = (unsigned long)filp->private_data;
+	unsigned int ret = POLLIN | POLLRDNORM;
+
+	if (!table->proc_handler)
+		goto out;
+
+	if (!table->poll)
+		goto out;
+
+	poll_wait(filp, &table->poll->wait, wait);
+
+	if (event != atomic_read(&table->poll->event)) {
+		filp->private_data = (void *)(unsigned long)atomic_read(
+						&table->poll->event);
+		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+	}
+
+out:
+	return ret;
+}
 
 static int proc_sys_fill_cache(struct file *filp, void *dirent,
 				filldir_t filldir,
@@ -367,6 +405,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 }
 
 static const struct file_operations proc_sys_file_operations = {
+	.open		= proc_sys_open,
+	.poll		= proc_sys_poll,
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
 	.llseek		= default_llseek,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..96c89ba 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -25,6 +25,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/wait.h>
 
 struct completion;
 
@@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
  * cover common cases.
  */
 
+/* Support for userspace poll() to watch for changes */
+struct ctl_table_poll {
+	atomic_t event;
+	wait_queue_head_t wait;
+};
+
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
@@ -1021,6 +1028,7 @@ struct ctl_table
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
+	struct ctl_table_poll *poll;
 	void *extra1;
 	void *extra2;
 };
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 4e5b021..c714ed7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,6 +37,14 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+enum uts_proc {
+	UTS_PROC_OSTYPE,
+	UTS_PROC_OSRELEASE,
+	UTS_PROC_VERSION,
+	UTS_PROC_HOSTNAME,
+	UTS_PROC_DOMAINNAME,
+};
+
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
@@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
 }
 #endif
 
+#ifdef CONFIG_PROC_SYSCTL
+extern void uts_proc_notify(enum uts_proc proc);
+#else
+static inline void uts_proc_notify(enum uts_proc proc)
+{
+}
+#endif
+
 static inline struct new_utsname *utsname(void)
 {
 	return &current->nsproxy->uts_ns->name;
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b2..ada9cd7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1211,6 +1211,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_HOSTNAME);
 	up_write(&uts_sem);
 	return errno;
 }
@@ -1261,6 +1262,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_DOMAINNAME);
 	up_write(&uts_sem);
 	return errno;
 }
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e..e96b766 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -13,6 +13,7 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
+#include <linux/wait.h>
 
 static void *get_uts(ctl_table *table, int write)
 {
@@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
 	uts_table.data = get_uts(table, write);
 	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
+
+	if (write) {
+		atomic_inc(&table->poll->event);
+		wake_up_interruptible(&table->poll->wait);
+	}
+
 	return r;
 }
 #else
 #define proc_do_uts_string NULL
 #endif
 
+static struct ctl_table_poll hostname_poll = {
+	.event		= ATOMIC_INIT(0),
+	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
+};
+
+static struct ctl_table_poll domainname_poll = {
+	.event		= ATOMIC_INIT(0),
+	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
+};
+
 static struct ctl_table uts_kern_table[] = {
 	{
 		.procname	= "ostype",
@@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &hostname_poll,
 	},
 	{
 		.procname	= "domainname",
@@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &domainname_poll,
 	},
 	{}
 };
@@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
 	{}
 };
 
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+	struct ctl_table *table = &uts_kern_table[proc];
+
+	if (!table->poll)
+		return;
+
+	atomic_inc(&table->poll->event);
+	wake_up_interruptible(&table->poll->wait);
+}
+#endif
+
 static int __init utsname_sysctl_init(void)
 {
 	register_sysctl_table(uts_root_table);
-- 
1.7.5.2


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-01 12:14 [PATCH] sysctl: add support for poll() Lucas De Marchi
@ 2011-06-02  2:51 ` Lucas De Marchi
  2011-06-02  3:31   ` Eric W. Biederman
  0 siblings, 1 reply; 30+ messages in thread
From: Lucas De Marchi @ 2011-06-02  2:51 UTC (permalink / raw)
  To: linux-kernel
  Cc: kay.sievers, Lucas De Marchi, Nick Piggin, Al Viro,
	Christoph Hellwig, Eric W. Biederman, Stephen Rothwell,
	Andrew Morton, David Howells, Serge E. Hallyn, Daniel Lezcano,
	Jiri Slaby, Greg Kroah-Hartman, James Morris

CC'ing people as suggested by get_maintainer.pl

On Wed, Jun 1, 2011 at 9:14 AM, Lucas De Marchi
<lucas.demarchi@profusion.mobi> wrote:
> Adding support for poll() in sysctl fs allows userspace to receive
> notifications when an entry in sysctl changes. This way it's possible to
> know when hostname/domainname is changed due to the respective syscall
> has been called or its file under /proc/sys has been written to.
>
> Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
> ---
>  fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
>  include/linux/sysctl.h  |    8 ++++++++
>  include/linux/utsname.h |   16 ++++++++++++++++
>  kernel/sys.c            |    2 ++
>  kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
>  5 files changed, 102 insertions(+), 0 deletions(-)
>
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index f50133c..2e5d3ec 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -3,6 +3,7 @@
>  */
>  #include <linux/init.h>
>  #include <linux/sysctl.h>
> +#include <linux/poll.h>
>  #include <linux/proc_fs.h>
>  #include <linux/security.h>
>  #include <linux/namei.h>
> @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
>        return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
>  }
>
> +static int proc_sys_open(struct inode *inode, struct file *filp)
> +{
> +       struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> +
> +       if (table->poll) {
> +               unsigned long event = atomic_read(&table->poll->event);
> +
> +               filp->private_data = (void *)event;
> +       }
> +
> +       return 0;
> +}
> +
> +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
> +{
> +       struct inode *inode = filp->f_path.dentry->d_inode;
> +       struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> +       unsigned long event = (unsigned long)filp->private_data;
> +       unsigned int ret = POLLIN | POLLRDNORM;
> +
> +       if (!table->proc_handler)
> +               goto out;
> +
> +       if (!table->poll)
> +               goto out;
> +
> +       poll_wait(filp, &table->poll->wait, wait);
> +
> +       if (event != atomic_read(&table->poll->event)) {
> +               filp->private_data = (void *)(unsigned long)atomic_read(
> +                                               &table->poll->event);
> +               ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
> +       }
> +
> +out:
> +       return ret;
> +}
>
>  static int proc_sys_fill_cache(struct file *filp, void *dirent,
>                                filldir_t filldir,
> @@ -367,6 +405,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>  }
>
>  static const struct file_operations proc_sys_file_operations = {
> +       .open           = proc_sys_open,
> +       .poll           = proc_sys_poll,
>        .read           = proc_sys_read,
>        .write          = proc_sys_write,
>        .llseek         = default_llseek,
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 11684d9..96c89ba 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -25,6 +25,7 @@
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/compiler.h>
> +#include <linux/wait.h>
>
>  struct completion;
>
> @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
>  * cover common cases.
>  */
>
> +/* Support for userspace poll() to watch for changes */
> +struct ctl_table_poll {
> +       atomic_t event;
> +       wait_queue_head_t wait;
> +};
> +
>  /* A sysctl table is an array of struct ctl_table: */
>  struct ctl_table
>  {
> @@ -1021,6 +1028,7 @@ struct ctl_table
>        struct ctl_table *child;
>        struct ctl_table *parent;       /* Automatically set */
>        proc_handler *proc_handler;     /* Callback for text formatting */
> +       struct ctl_table_poll *poll;
>        void *extra1;
>        void *extra2;
>  };
> diff --git a/include/linux/utsname.h b/include/linux/utsname.h
> index 4e5b021..c714ed7 100644
> --- a/include/linux/utsname.h
> +++ b/include/linux/utsname.h
> @@ -37,6 +37,14 @@ struct new_utsname {
>  #include <linux/nsproxy.h>
>  #include <linux/err.h>
>
> +enum uts_proc {
> +       UTS_PROC_OSTYPE,
> +       UTS_PROC_OSRELEASE,
> +       UTS_PROC_VERSION,
> +       UTS_PROC_HOSTNAME,
> +       UTS_PROC_DOMAINNAME,
> +};
> +
>  struct user_namespace;
>  extern struct user_namespace init_user_ns;
>
> @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
>  }
>  #endif
>
> +#ifdef CONFIG_PROC_SYSCTL
> +extern void uts_proc_notify(enum uts_proc proc);
> +#else
> +static inline void uts_proc_notify(enum uts_proc proc)
> +{
> +}
> +#endif
> +
>  static inline struct new_utsname *utsname(void)
>  {
>        return &current->nsproxy->uts_ns->name;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index e4128b2..ada9cd7 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1211,6 +1211,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
>                memset(u->nodename + len, 0, sizeof(u->nodename) - len);
>                errno = 0;
>        }
> +       uts_proc_notify(UTS_PROC_HOSTNAME);
>        up_write(&uts_sem);
>        return errno;
>  }
> @@ -1261,6 +1262,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
>                memset(u->domainname + len, 0, sizeof(u->domainname) - len);
>                errno = 0;
>        }
> +       uts_proc_notify(UTS_PROC_DOMAINNAME);
>        up_write(&uts_sem);
>        return errno;
>  }
> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
> index a2cd77e..e96b766 100644
> --- a/kernel/utsname_sysctl.c
> +++ b/kernel/utsname_sysctl.c
> @@ -13,6 +13,7 @@
>  #include <linux/uts.h>
>  #include <linux/utsname.h>
>  #include <linux/sysctl.h>
> +#include <linux/wait.h>
>
>  static void *get_uts(ctl_table *table, int write)
>  {
> @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
>        uts_table.data = get_uts(table, write);
>        r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
>        put_uts(table, write, uts_table.data);
> +
> +       if (write) {
> +               atomic_inc(&table->poll->event);
> +               wake_up_interruptible(&table->poll->wait);
> +       }
> +
>        return r;
>  }
>  #else
>  #define proc_do_uts_string NULL
>  #endif
>
> +static struct ctl_table_poll hostname_poll = {
> +       .event          = ATOMIC_INIT(0),
> +       .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
> +};
> +
> +static struct ctl_table_poll domainname_poll = {
> +       .event          = ATOMIC_INIT(0),
> +       .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
> +};
> +
>  static struct ctl_table uts_kern_table[] = {
>        {
>                .procname       = "ostype",
> @@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
>                .maxlen         = sizeof(init_uts_ns.name.nodename),
>                .mode           = 0644,
>                .proc_handler   = proc_do_uts_string,
> +               .poll           = &hostname_poll,
>        },
>        {
>                .procname       = "domainname",
> @@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
>                .maxlen         = sizeof(init_uts_ns.name.domainname),
>                .mode           = 0644,
>                .proc_handler   = proc_do_uts_string,
> +               .poll           = &domainname_poll,
>        },
>        {}
>  };
> @@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
>        {}
>  };
>
> +#ifdef CONFIG_PROC_SYSCTL
> +/*
> + * Notify userspace about a change in a certain entry of uts_kern_table,
> + * identified by the parameter proc.
> + */
> +void uts_proc_notify(enum uts_proc proc)
> +{
> +       struct ctl_table *table = &uts_kern_table[proc];
> +
> +       if (!table->poll)
> +               return;
> +
> +       atomic_inc(&table->poll->event);
> +       wake_up_interruptible(&table->poll->wait);
> +}
> +#endif
> +
>  static int __init utsname_sysctl_init(void)
>  {
>        register_sysctl_table(uts_root_table);
> --
> 1.7.5.2
>
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02  2:51 ` Lucas De Marchi
@ 2011-06-02  3:31   ` Eric W. Biederman
  2011-06-02 12:06     ` Kay Sievers
  0 siblings, 1 reply; 30+ messages in thread
From: Eric W. Biederman @ 2011-06-02  3:31 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: linux-kernel, kay.sievers, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:

> CC'ing people as suggested by get_maintainer.pl
>
> On Wed, Jun 1, 2011 at 9:14 AM, Lucas De Marchi
> <lucas.demarchi@profusion.mobi> wrote:
>> Adding support for poll() in sysctl fs allows userspace to receive
>> notifications when an entry in sysctl changes. This way it's possible to
>> know when hostname/domainname is changed due to the respective syscall
>> has been called or its file under /proc/sys has been written to.

Why do you want to do this?  What advantage does this bring to
userspace?

That feels like a pretty big special case.

Eric


>> Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
>> ---
>>  fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
>>  include/linux/sysctl.h  |    8 ++++++++
>>  include/linux/utsname.h |   16 ++++++++++++++++
>>  kernel/sys.c            |    2 ++
>>  kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
>>  5 files changed, 102 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
>> index f50133c..2e5d3ec 100644
>> --- a/fs/proc/proc_sysctl.c
>> +++ b/fs/proc/proc_sysctl.c
>> @@ -3,6 +3,7 @@
>>  */
>>  #include <linux/init.h>
>>  #include <linux/sysctl.h>
>> +#include <linux/poll.h>
>>  #include <linux/proc_fs.h>
>>  #include <linux/security.h>
>>  #include <linux/namei.h>
>> @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
>>        return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
>>  }
>>
>> +static int proc_sys_open(struct inode *inode, struct file *filp)
>> +{
>> +       struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>> +
>> +       if (table->poll) {
>> +               unsigned long event = atomic_read(&table->poll->event);
>> +
>> +               filp->private_data = (void *)event;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
>> +{
>> +       struct inode *inode = filp->f_path.dentry->d_inode;
>> +       struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>> +       unsigned long event = (unsigned long)filp->private_data;
>> +       unsigned int ret = POLLIN | POLLRDNORM;
>> +
>> +       if (!table->proc_handler)
>> +               goto out;
>> +
>> +       if (!table->poll)
>> +               goto out;
>> +
>> +       poll_wait(filp, &table->poll->wait, wait);
>> +
>> +       if (event != atomic_read(&table->poll->event)) {
>> +               filp->private_data = (void *)(unsigned long)atomic_read(
>> +                                               &table->poll->event);
>> +               ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
>> +       }
>> +
>> +out:
>> +       return ret;
>> +}
>>
>>  static int proc_sys_fill_cache(struct file *filp, void *dirent,
>>                                filldir_t filldir,
>> @@ -367,6 +405,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>>  }
>>
>>  static const struct file_operations proc_sys_file_operations = {
>> +       .open           = proc_sys_open,
>> +       .poll           = proc_sys_poll,
>>        .read           = proc_sys_read,
>>        .write          = proc_sys_write,
>>        .llseek         = default_llseek,
>> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
>> index 11684d9..96c89ba 100644
>> --- a/include/linux/sysctl.h
>> +++ b/include/linux/sysctl.h
>> @@ -25,6 +25,7 @@
>>  #include <linux/kernel.h>
>>  #include <linux/types.h>
>>  #include <linux/compiler.h>
>> +#include <linux/wait.h>
>>
>>  struct completion;
>>
>> @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
>>  * cover common cases.
>>  */
>>
>> +/* Support for userspace poll() to watch for changes */
>> +struct ctl_table_poll {
>> +       atomic_t event;
>> +       wait_queue_head_t wait;
>> +};
>> +
>>  /* A sysctl table is an array of struct ctl_table: */
>>  struct ctl_table
>>  {
>> @@ -1021,6 +1028,7 @@ struct ctl_table
>>        struct ctl_table *child;
>>        struct ctl_table *parent;       /* Automatically set */
>>        proc_handler *proc_handler;     /* Callback for text formatting */
>> +       struct ctl_table_poll *poll;
>>        void *extra1;
>>        void *extra2;
>>  };
>> diff --git a/include/linux/utsname.h b/include/linux/utsname.h
>> index 4e5b021..c714ed7 100644
>> --- a/include/linux/utsname.h
>> +++ b/include/linux/utsname.h
>> @@ -37,6 +37,14 @@ struct new_utsname {
>>  #include <linux/nsproxy.h>
>>  #include <linux/err.h>
>>
>> +enum uts_proc {
>> +       UTS_PROC_OSTYPE,
>> +       UTS_PROC_OSRELEASE,
>> +       UTS_PROC_VERSION,
>> +       UTS_PROC_HOSTNAME,
>> +       UTS_PROC_DOMAINNAME,
>> +};
>> +
>>  struct user_namespace;
>>  extern struct user_namespace init_user_ns;
>>
>> @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
>>  }
>>  #endif
>>
>> +#ifdef CONFIG_PROC_SYSCTL
>> +extern void uts_proc_notify(enum uts_proc proc);
>> +#else
>> +static inline void uts_proc_notify(enum uts_proc proc)
>> +{
>> +}
>> +#endif
>> +
>>  static inline struct new_utsname *utsname(void)
>>  {
>>        return &current->nsproxy->uts_ns->name;
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index e4128b2..ada9cd7 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -1211,6 +1211,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
>>                memset(u->nodename + len, 0, sizeof(u->nodename) - len);
>>                errno = 0;
>>        }
>> +       uts_proc_notify(UTS_PROC_HOSTNAME);
>>        up_write(&uts_sem);
>>        return errno;
>>  }
>> @@ -1261,6 +1262,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
>>                memset(u->domainname + len, 0, sizeof(u->domainname) - len);
>>                errno = 0;
>>        }
>> +       uts_proc_notify(UTS_PROC_DOMAINNAME);
>>        up_write(&uts_sem);
>>        return errno;
>>  }
>> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
>> index a2cd77e..e96b766 100644
>> --- a/kernel/utsname_sysctl.c
>> +++ b/kernel/utsname_sysctl.c
>> @@ -13,6 +13,7 @@
>>  #include <linux/uts.h>
>>  #include <linux/utsname.h>
>>  #include <linux/sysctl.h>
>> +#include <linux/wait.h>
>>
>>  static void *get_uts(ctl_table *table, int write)
>>  {
>> @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
>>        uts_table.data = get_uts(table, write);
>>        r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
>>        put_uts(table, write, uts_table.data);
>> +
>> +       if (write) {
>> +               atomic_inc(&table->poll->event);
>> +               wake_up_interruptible(&table->poll->wait);
>> +       }
>> +
>>        return r;
>>  }
>>  #else
>>  #define proc_do_uts_string NULL
>>  #endif
>>
>> +static struct ctl_table_poll hostname_poll = {
>> +       .event          = ATOMIC_INIT(0),
>> +       .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
>> +};
>> +
>> +static struct ctl_table_poll domainname_poll = {
>> +       .event          = ATOMIC_INIT(0),
>> +       .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
>> +};
>> +
>>  static struct ctl_table uts_kern_table[] = {
>>        {
>>                .procname       = "ostype",
>> @@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
>>                .maxlen         = sizeof(init_uts_ns.name.nodename),
>>                .mode           = 0644,
>>                .proc_handler   = proc_do_uts_string,
>> +               .poll           = &hostname_poll,
>>        },
>>        {
>>                .procname       = "domainname",
>> @@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
>>                .maxlen         = sizeof(init_uts_ns.name.domainname),
>>                .mode           = 0644,
>>                .proc_handler   = proc_do_uts_string,
>> +               .poll           = &domainname_poll,
>>        },
>>        {}
>>  };
>> @@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
>>        {}
>>  };
>>
>> +#ifdef CONFIG_PROC_SYSCTL
>> +/*
>> + * Notify userspace about a change in a certain entry of uts_kern_table,
>> + * identified by the parameter proc.
>> + */
>> +void uts_proc_notify(enum uts_proc proc)
>> +{
>> +       struct ctl_table *table = &uts_kern_table[proc];
>> +
>> +       if (!table->poll)
>> +               return;
>> +
>> +       atomic_inc(&table->poll->event);
>> +       wake_up_interruptible(&table->poll->wait);
>> +}
>> +#endif
>> +
>>  static int __init utsname_sysctl_init(void)
>>  {
>>        register_sysctl_table(uts_root_table);
>> --
>> 1.7.5.2
>>
>>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02  3:31   ` Eric W. Biederman
@ 2011-06-02 12:06     ` Kay Sievers
  2011-06-02 12:43       ` Alan Cox
  0 siblings, 1 reply; 30+ messages in thread
From: Kay Sievers @ 2011-06-02 12:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Lucas De Marchi, linux-kernel, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Thu, Jun 2, 2011 at 05:31, Eric W. Biederman <ebiederm@xmission.com> wrote:
> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>
>> CC'ing people as suggested by get_maintainer.pl
>>
>> On Wed, Jun 1, 2011 at 9:14 AM, Lucas De Marchi
>> <lucas.demarchi@profusion.mobi> wrote:
>>> Adding support for poll() in sysctl fs allows userspace to receive
>>> notifications when an entry in sysctl changes. This way it's possible to
>>> know when hostname/domainname is changed due to the respective syscall
>>> has been called or its file under /proc/sys has been written to.
>
> Why do you want to do this?  What advantage does this bring to
> userspace?

Host names are dynamic, can change during system runtime by dhcp or
similar setups, or just get changed by the user.

System management tools/services needs to track these changes to
propagate the actual host name into the running services. It's the
same like we dynamically track /proc/mounts, /proc/swaps, or any other
stuff that can change underneath us, and forces other things to adapt
to the changing environment.

> That feels like a pretty big special case.

The alternative is to have a process constantly polling and reading
the file, which is nothing we even want to think about in 2011.

It's just another special case to bring us out of the UNIX stone age
of doing things. :)

Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 12:06     ` Kay Sievers
@ 2011-06-02 12:43       ` Alan Cox
  2011-06-02 13:01         ` Kay Sievers
                           ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Alan Cox @ 2011-06-02 12:43 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Eric W. Biederman, Lucas De Marchi, linux-kernel, Nick Piggin,
	Al Viro, Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

> Host names are dynamic, can change during system runtime by dhcp or
> similar setups, or just get changed by the user.

I don't actually see what this has to do with utsname. uname historically
defined nodename as "name within an implementation-defined communications
network" and actually tended to be the UUCP name. Modern SuS says "`the
name of the node of the communications network to which this node is
attached, if any"

The latter unfortunately makes no sense anyway and is a fine example of
standards body cluelessness as name mapping on IP networks is not one
name per host, and also because the standard doesn't require the fields
in the struct are long enough to hold a DNS name!

(Indeed in its usual head up backside manner its technically valid to
define

	char nodename[1];

and have only \0 as a valid reply)

Realistically all the naming policy has to be in userspace, and that
means that all the userspace bits have to agree on what they use utsname
for (or indeed if they are wise not use nodename in the first place) so
they can handle name management outside of kernel.

In the DHCP case for example a node which is acquiring a DHCP address on
wireless and on ethernet at the same time (very typical) has to make sure
all its DHCP and other address management policies agree rather than have
the two have a kernel food fight over the name to use.

So this all belongs in conman, network manager and other similar
systems and I'd suggest they try and deal with the meaningful IP world
questions of "what list of names do I posess for purpose X on network Y"
and "is this one of my names on ..."

> The alternative is to have a process constantly polling and reading
> the file, which is nothing we even want to think about in 2011.

Or to manage it properly.

> It's just another special case to bring us out of the UNIX stone age
> of doing things. :)

Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
short name' policy. Forget utsname node names, they are a historical
quirk of UUCP and friends and on many OS platforms will be limited to 15
chars !

As to poll in general I can see some of the other proc files being
more relevant, eg for process monitoring tools being able to poll
in /proc/<pid> and some of the proc/sys and sysctl data that does change
meaningfully. Utsname however is not one of those things.


Alan

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 12:43       ` Alan Cox
@ 2011-06-02 13:01         ` Kay Sievers
  2011-06-02 13:02         ` Lucas De Marchi
  2011-06-02 14:16         ` Eric W. Biederman
  2 siblings, 0 replies; 30+ messages in thread
From: Kay Sievers @ 2011-06-02 13:01 UTC (permalink / raw)
  To: Alan Cox
  Cc: Eric W. Biederman, Lucas De Marchi, linux-kernel, Nick Piggin,
	Al Viro, Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Thu, Jun 2, 2011 at 14:43, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> So this all belongs in conman, network manager and other similar
> systems and I'd suggest they try and deal with the meaningful IP world
> questions of "what list of names do I posess for purpose X on network Y"
> and "is this one of my names on ..."

Higher level interfaces already live in the system services to set and
get the names in various ways. But we also need to support changes by
the old and simple tools. Calling hostname(1) must work just like
everything else. It changes state in the kernel, we need to know it
and adapt to it.

>> The alternative is to have a process constantly polling and reading
>> the file, which is nothing we even want to think about in 2011.
>
> Or to manage it properly.

That's just a dream. We do it properly, but we have no control over
everything. You've been yourself in that position, and have seen that
some stuff will 'never' change, we better just support it instead of
hoping it will not be used. Especially when it's that damn simple and
clean.

>> It's just another special case to bring us out of the UNIX stone age
>> of doing things. :)
>
> Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
> short name' policy. Forget utsname node names, they are a historical
> quirk of UUCP and friends and on many OS platforms will be limited to 15
> chars !

That's not a problem. We support pretty names in the higher stack just
fine, but there is a tone of stuff that depends on gethostname(),
sethostname() and we have to support that.

> As to poll in general I can see some of the other proc files being
> more relevant, eg for process monitoring tools being able to poll
> in /proc/<pid> and some of the proc/sys and sysctl data that does change
> meaningfully.

Yeah, being able to watch a pid with such a simple interface would be
great to have.

> Utsname however is not one of those things.

I think it is.

As always, what are the alternatives you would propose, to watch
sethostname() changes to the system. And 'it should not be used', or
'it should be centrally intercepted' does not count, the reality will
go on without such rules.

Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 12:43       ` Alan Cox
  2011-06-02 13:01         ` Kay Sievers
@ 2011-06-02 13:02         ` Lucas De Marchi
  2011-06-02 13:12           ` Alan Cox
  2011-06-02 13:56           ` Eric W. Biederman
  2011-06-02 14:16         ` Eric W. Biederman
  2 siblings, 2 replies; 30+ messages in thread
From: Lucas De Marchi @ 2011-06-02 13:02 UTC (permalink / raw)
  To: Alan Cox
  Cc: Kay Sievers, Eric W. Biederman, linux-kernel, Nick Piggin,
	Al Viro, Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Thu, Jun 2, 2011 at 9:43 AM, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
>> The alternative is to have a process constantly polling and reading
>> the file, which is nothing we even want to think about in 2011.
>
> Or to manage it properly.

What if the user decides do invoke sethostname syscall "by hand"?
Hostname would change beneath any other process that is trying to
manage it properly. What this patch does is to notify that process
that something happened.


>> It's just another special case to bring us out of the UNIX stone age
>> of doing things. :)
>
> Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
> short name' policy. Forget utsname node names, they are a historical
> quirk of UUCP and friends and on many OS platforms will be limited to 15
> chars !
>
> As to poll in general I can see some of the other proc files being
> more relevant, eg for process monitoring tools being able to poll
> in /proc/<pid> and some of the proc/sys and sysctl data that does change
> meaningfully. Utsname however is not one of those things.
>

With this patch in, if anyone wants to manage a file under /proc/sys
there's really a small amount of code to write. He only has to define
the new poll struct for that file.


Lucas De Marchi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 13:02         ` Lucas De Marchi
@ 2011-06-02 13:12           ` Alan Cox
  2011-06-02 13:24             ` Kay Sievers
  2011-06-02 13:56           ` Eric W. Biederman
  1 sibling, 1 reply; 30+ messages in thread
From: Alan Cox @ 2011-06-02 13:12 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Kay Sievers, Eric W. Biederman, linux-kernel, Nick Piggin,
	Al Viro, Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

> > Or to manage it properly.
> 
> What if the user decides do invoke sethostname syscall "by hand"?
> Hostname would change beneath any other process that is trying to
> manage it properly. What this patch does is to notify that process
> that something happened.

That is a stupid argument. Shall we extend it to its logical idiotic end
and ask

"What if the user decides to recompile their kernel without sysfs poll
support ?"

You have to be root to run sethostname, at which point you are
realistically at the command line, a superuser and you know what you are
doing (eg using sethostname for non IP network naming, or cluster id, or
other stuff).

> With this patch in, if anyone wants to manage a file under /proc/sys
> there's really a small amount of code to write. He only has to define
> the new poll struct for that file.

Sure - and there is an 8 byte cost per sysctl node (of which we have
rather a lot), and we really need to tackle sysfs not sysctl anyway.

I'm not averse to pollable sysfs/sysctl nodes at all although the memory
hit on sysfs is going to be tricky to manage and need clever code.

I just think the utsname is a completely misguided example and whoever is
trying to do it doesn't actually understand the limits of utsname.

Alan


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 13:12           ` Alan Cox
@ 2011-06-02 13:24             ` Kay Sievers
  0 siblings, 0 replies; 30+ messages in thread
From: Kay Sievers @ 2011-06-02 13:24 UTC (permalink / raw)
  To: Alan Cox
  Cc: Lucas De Marchi, Eric W. Biederman, linux-kernel, Nick Piggin,
	Al Viro, Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Thu, Jun 2, 2011 at 15:12, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
>> > Or to manage it properly.
>>
>> What if the user decides do invoke sethostname syscall "by hand"?
>> Hostname would change beneath any other process that is trying to
>> manage it properly. What this patch does is to notify that process
>> that something happened.
>
> That is a stupid argument. Shall we extend it to its logical idiotic end
> and ask
>
> "What if the user decides to recompile their kernel without sysfs poll
> support ?"

Alan please! This is not something we haven't thought through.

> You have to be root to run sethostname, at which point you are
> realistically at the command line, a superuser and you know what you are
> doing (eg using sethostname for non IP network naming, or cluster id, or
> other stuff).

Please stay to the actual problem this patch tries to resolve.

>> With this patch in, if anyone wants to manage a file under /proc/sys
>> there's really a small amount of code to write. He only has to define
>> the new poll struct for that file.
>
> Sure - and there is an 8 byte cost per sysctl node (of which we have
> rather a lot), and we really need to tackle sysfs not sysctl anyway.

It is. And we will very likely need poll() for other things in
/proc/sys too. It's the cost of providing functionality we just need
today.

I could understand arguing about things like: void *extra1; void
*extra2; in that very same structure, but not about something that
can't really be solved otherwise.

> I'm not averse to pollable sysfs/sysctl nodes at all although the memory
> hit on sysfs is going to be tricky to manage and need clever code.

Yeah, but not related to the problem this patch tries to solve.

> I just think the utsname is a completely misguided example and whoever is
> trying to do it doesn't actually understand the limits of utsname.

We are not talking about limits of a certain infrastructure. It is
used, it will not go away, we need to support it.

This is about propagating in-kernel state changes to userspace. Please
open a different conversation for everything else.

Thanks,
Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 13:02         ` Lucas De Marchi
  2011-06-02 13:12           ` Alan Cox
@ 2011-06-02 13:56           ` Eric W. Biederman
  2011-06-02 17:32             ` Kay Sievers
  1 sibling, 1 reply; 30+ messages in thread
From: Eric W. Biederman @ 2011-06-02 13:56 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Alan Cox, Kay Sievers, linux-kernel, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:

> On Thu, Jun 2, 2011 at 9:43 AM, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
>>> The alternative is to have a process constantly polling and reading
>>> the file, which is nothing we even want to think about in 2011.
>>
>> Or to manage it properly.
>
> What if the user decides do invoke sethostname syscall "by hand"?
> Hostname would change beneath any other process that is trying to
> manage it properly. What this patch does is to notify that process
> that something happened.
>
>
>>> It's just another special case to bring us out of the UNIX stone age
>>> of doing things. :)
>>
>> Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
>> short name' policy. Forget utsname node names, they are a historical
>> quirk of UUCP and friends and on many OS platforms will be limited to 15
>> chars !
>>
>> As to poll in general I can see some of the other proc files being
>> more relevant, eg for process monitoring tools being able to poll
>> in /proc/<pid> and some of the proc/sys and sysctl data that does change
>> meaningfully. Utsname however is not one of those things.
>>
>
> With this patch in, if anyone wants to manage a file under /proc/sys
> there's really a small amount of code to write. He only has to define
> the new poll struct for that file.

The support currently appears cumbersome to add, and it adds what appear
to be unnecessary wake ups (say when the hostname in another uts
namespace changes).

There is no explanation at all of why you care about the nis domainname.

Since there does not appear to be a specific problem that this problem
is being aimed at, since the code just looks like extra maintenance and
since the code needed to support this appears to be unnecessarily
cumbersome I am going to nack the patch for now.

Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

If the goal here is just to fix the general case then we probably
want to get inotify going on proc files instead of poll.  Either
that or we want pollable files to appear as something besides files
so that it is clear to their users that poll will work.

Eric

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 12:43       ` Alan Cox
  2011-06-02 13:01         ` Kay Sievers
  2011-06-02 13:02         ` Lucas De Marchi
@ 2011-06-02 14:16         ` Eric W. Biederman
  2 siblings, 0 replies; 30+ messages in thread
From: Eric W. Biederman @ 2011-06-02 14:16 UTC (permalink / raw)
  To: Alan Cox
  Cc: Kay Sievers, Lucas De Marchi, linux-kernel, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

Alan Cox <alan@lxorguk.ukuu.org.uk> writes:

>> Host names are dynamic, can change during system runtime by dhcp or
>> similar setups, or just get changed by the user.
>
> I don't actually see what this has to do with utsname. uname historically
> defined nodename as "name within an implementation-defined communications
> network" and actually tended to be the UUCP name. Modern SuS says "`the
> name of the node of the communications network to which this node is
> attached, if any"

>
> The latter unfortunately makes no sense anyway and is a fine example of
> standards body cluelessness as name mapping on IP networks is not one
> name per host, and also because the standard doesn't require the fields
> in the struct are long enough to hold a DNS name!
>
> (Indeed in its usual head up backside manner its technically valid to
> define
>
> 	char nodename[1];
>
> and have only \0 as a valid reply)

However we have conveniently defined sethostname and gethostname to use
the same state in the kernel, as uname.  I believe at least one of these
interfaces that map to the same storage in linux has a usable size
guaranteed by all of the implementations.

Eric



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 13:56           ` Eric W. Biederman
@ 2011-06-02 17:32             ` Kay Sievers
  2011-06-08 22:17               ` Andrew Morton
  2011-06-12 15:34               ` Eric W. Biederman
  0 siblings, 2 replies; 30+ messages in thread
From: Kay Sievers @ 2011-06-02 17:32 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Lucas De Marchi, Alan Cox, linux-kernel, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Thu, Jun 2, 2011 at 15:56, Eric W. Biederman <ebiederm@xmission.com> wrote:
> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:

>> With this patch in, if anyone wants to manage a file under /proc/sys
>> there's really a small amount of code to write. He only has to define
>> the new poll struct for that file.
>
> The support currently appears cumbersome to add, and it adds what appear
> to be unnecessary wake ups (say when the hostname in another uts
> namespace changes).

Yeah, *possibly* waking up once a day compared to *unconditionally*
waking up every second in every namespace and check if something has
changed. If that *possibly* should be optimized, which I think isn't
necessary at all, I guess the logic could be moved down to the
namespaced data.

> There is no explanation at all of why you care about the nis domainname.

Just the same reason as the hostname, we need to know when the
kernel's internal state changes. regardless who did it and why, system
services need to follow it.

> Since there does not appear to be a specific problem that this problem
> is being aimed at, since the code just looks like extra maintenance and
> since the code needed to support this appears to be unnecessarily
> cumbersome I am going to nack the patch for now.
>
> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

Please provide solid technical reasons, why we can't to the same we do
everywhere else since years, or provide working alternatives. Until
then:

Acked-By: Kay Sievers <kay.sievers@vrfy.org>

> If the goal here is just to fix the general case then we probably
> want to get inotify going on proc files instead of poll.  Either
> that or we want pollable files to appear as something besides files
> so that it is clear to their users that poll will work.

I think poll() is the natural interface if you care about the data in
a file. It's the same an single fd you care, and not some iniotify
watch, fd, pathname to register, and filter for whatever comes out
there.

We already use exactly the same semantics for quite some years for
/proc/mounts, /proc/swaps, /proc/mdstat, ... and all over /sys. The
patch just provides the missing pieces that /proc provides, but
/proc/sys is missing.

I don't disagree, it might be nice to have generic inotify support on
/proc for this or other problems. But unless you want to work on it
now, and it would solve these problems, I don't see how we can get the
functionality we need, and that seems to solved with this patch.
Besides the fact that I think poll() is the simple and better
solution.

Thanks,
Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 17:32             ` Kay Sievers
@ 2011-06-08 22:17               ` Andrew Morton
  2011-06-09 13:16                 ` Kay Sievers
  2011-06-12 15:34               ` Eric W. Biederman
  1 sibling, 1 reply; 30+ messages in thread
From: Andrew Morton @ 2011-06-08 22:17 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Eric W. Biederman, Lucas De Marchi, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Thu, 2 Jun 2011 19:32:49 +0200
Kay Sievers <kay.sievers@vrfy.org> wrote:

> > Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
> 
> Acked-By: Kay Sievers <kay.sievers@vrfy.org>

The patch itself doesn't look too bad to me...

We already have several pollable procfs files, such as
fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
think that any work in this area should end up with those custom
make-procfs-pollable hacks being identified and removed.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-08 22:17               ` Andrew Morton
@ 2011-06-09 13:16                 ` Kay Sievers
  2011-06-13 16:05                   ` Kay Sievers
  0 siblings, 1 reply; 30+ messages in thread
From: Kay Sievers @ 2011-06-09 13:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Eric W. Biederman, Lucas De Marchi, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris, neilb

On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Thu, 2 Jun 2011 19:32:49 +0200
> Kay Sievers <kay.sievers@vrfy.org> wrote:
>
>> > Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>
>> Acked-By: Kay Sievers <kay.sievers@vrfy.org>
>
> The patch itself doesn't look too bad to me...
>
> We already have several pollable procfs files, such as
> fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
> think that any work in this area should end up with those custom
> make-procfs-pollable hacks being identified and removed.

For these files we can probably move the event counter into the
seq_file structure, and get rid of the dance to kmalloc it and assign
it to seq_file->private. That might simplify the logic a bit.

[Adding Neil, to get his opinion of moving 'event' so seq_file and get
rid of the malloc dance]

The wait_queue and the change counter needs to be in the same context
as the watched data, so we can probably not really help with generic
proc infrastructure here.

This patch is sysctl infrastructure, which is kind of a subclass of
proc like seq_file is. I have no good idea how to share anything
between them. Unlike the three current users of seq_file, the sysctl
stuff already moved the needed things to the common sysctl code.

If moving the individual event counter to seq_file makes sense, we can
give it a shot, but it don't think it really affects the sysctl case.
Maybe someone has good idea how to unify them, I currently don't have.

Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-02 17:32             ` Kay Sievers
  2011-06-08 22:17               ` Andrew Morton
@ 2011-06-12 15:34               ` Eric W. Biederman
  2011-06-13 14:28                 ` Kay Sievers
  1 sibling, 1 reply; 30+ messages in thread
From: Eric W. Biederman @ 2011-06-12 15:34 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Lucas De Marchi, Alan Cox, linux-kernel, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

Kay Sievers <kay.sievers@vrfy.org> writes:

> On Thu, Jun 2, 2011 at 15:56, Eric W. Biederman <ebiederm@xmission.com> wrote:
>> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>
>>> With this patch in, if anyone wants to manage a file under /proc/sys
>>> there's really a small amount of code to write. He only has to define
>>> the new poll struct for that file.
>>
>> The support currently appears cumbersome to add, and it adds what appear
>> to be unnecessary wake ups (say when the hostname in another uts
>> namespace changes).
>
> Yeah, *possibly* waking up once a day compared to *unconditionally*
> waking up every second in every namespace and check if something has
> changed. If that *possibly* should be optimized, which I think isn't
> necessary at all, I guess the logic could be moved down to the
> namespaced data.

Unless you happen to be on a system, where someone has decided that it
has a daemon that likes creating a new set of namespaces per connection
to reduce the effect of code bugs.  At which point you could be talking
much more than a wake up per second.

>> There is no explanation at all of why you care about the nis domainname.
>
> Just the same reason as the hostname, we need to know when the
> kernel's internal state changes. regardless who did it and why, system
> services need to follow it.

So the problem is that you have a system where userspace can't get it's
act together?

>> Since there does not appear to be a specific problem that this problem
>> is being aimed at, since the code just looks like extra maintenance and
>> since the code needed to support this appears to be unnecessarily
>> cumbersome I am going to nack the patch for now.
>>
>> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>
> Please provide solid technical reasons, why we can't to the same we do
> everywhere else since years, or provide working alternatives. Until
> then:
>
> Acked-By: Kay Sievers <kay.sievers@vrfy.org>

Looking a little closer the patch is broken.

It changes the return of poll for every file under /proc/sys reporting
no file under /proc/sys is writable unless it implements the new poll
method.

Also with having the wait_queue owned by the calling code either I am
mistake or this breaks hotplug type situations.  How do you get things
off of your wait queue when you remove a file from /proc/sys.  This
infrastructure as written looks like a setup for use after free
problems.

> I think poll() is the natural interface if you care about the data in
> a file. It's the same an single fd you care, and not some iniotify
> watch, fd, pathname to register, and filter for whatever comes out
> there.

Sort of.  poll is really designed for socket and pipe data.

The problem with poll is there is no POLLUPDATED.

> We already use exactly the same semantics for quite some years for
> /proc/mounts, /proc/swaps, /proc/mdstat, ... and all over /sys. The
> patch just provides the missing pieces that /proc provides, but
> /proc/sys is missing.

Good point.

There is still the problem that the infrastructure code for the proc
sysctl files are in much worse shape than the sysfs files.

> I don't disagree, it might be nice to have generic inotify support on
> /proc for this or other problems. But unless you want to work on it
> now, and it would solve these problems, I don't see how we can get the
> functionality we need, and that seems to solved with this patch.
> Besides the fact that I think poll() is the simple and better
> solution.

Which is a question that has never been answered.  What functionality
do you need?  To me this all looks like a bad science experiment.

Eric

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-12 15:34               ` Eric W. Biederman
@ 2011-06-13 14:28                 ` Kay Sievers
  0 siblings, 0 replies; 30+ messages in thread
From: Kay Sievers @ 2011-06-13 14:28 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Lucas De Marchi, Alan Cox, linux-kernel, Nick Piggin, Al Viro,
	Christoph Hellwig, Stephen Rothwell, Andrew Morton,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris

On Sun, Jun 12, 2011 at 17:34, Eric W. Biederman <ebiederm@xmission.com> wrote:
> Kay Sievers <kay.sievers@vrfy.org> writes:
>> On Thu, Jun 2, 2011 at 15:56, Eric W. Biederman <ebiederm@xmission.com> wrote:
>>> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>>
>>>> With this patch in, if anyone wants to manage a file under /proc/sys
>>>> there's really a small amount of code to write. He only has to define
>>>> the new poll struct for that file.
>>>
>>> The support currently appears cumbersome to add, and it adds what appear
>>> to be unnecessary wake ups (say when the hostname in another uts
>>> namespace changes).
>>
>> Yeah, *possibly* waking up once a day compared to *unconditionally*
>> waking up every second in every namespace and check if something has
>> changed. If that *possibly* should be optimized, which I think isn't
>> necessary at all, I guess the logic could be moved down to the
>> namespaced data.
>
> Unless you happen to be on a system, where someone has decided that it
> has a daemon that likes creating a new set of namespaces per connection
> to reduce the effect of code bugs.  At which point you could be talking
> much more than a wake up per second.

What do you mean? We talk about changing the hostname, right? How
often do we expect that to happen?

You mean every network connection mounts a proc filesystem and runs a
service that polls hostname files there?

>>> There is no explanation at all of why you care about the nis domainname.
>>
>> Just the same reason as the hostname, we need to know when the
>> kernel's internal state changes. regardless who did it and why, system
>> services need to follow it.
>
> So the problem is that you have a system where userspace can't get it's
> act together?

There will never be a layer between the kernel and userspace that is
centrally managed. The kernel keeps the authoritative data, so we need
to know when _this_ data changes, not when some imaginary middle man
might manage it.

It will just never happen in reality, we stopped long ago being so
naive. And 'getting the act together' today is 'watching what people
did' to the kernel data, not tell them how to do it, or not to do it.

We can change some parts sometimes, and recommend some sanity, but
surely nobody will patch all the 15 dhcp clients to work with some
strange higher-level hostname interface here. People use the syscall
to set it, and there is nothing wrong with it, we just need to know
it.

>>> Since there does not appear to be a specific problem that this problem
>>> is being aimed at, since the code just looks like extra maintenance and
>>> since the code needed to support this appears to be unnecessarily
>>> cumbersome I am going to nack the patch for now.
>>>
>>> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>
>> Please provide solid technical reasons, why we can't to the same we do
>> everywhere else since years, or provide working alternatives. Until
>> then:
>>
>> Acked-By: Kay Sievers <kay.sievers@vrfy.org>
>
> Looking a little closer the patch is broken.
>
> It changes the return of poll for every file under /proc/sys reporting
> no file under /proc/sys is writable unless it implements the new poll
> method.

Poll is undefined for regular files, or proc/sys files which don't
implement it. What do you mean with writable? How does not providing
poll() affect write()?

> Also with having the wait_queue owned by the calling code either I am
> mistake or this breaks hotplug type situations.  How do you get things
> off of your wait queue when you remove a file from /proc/sys.
>
> This
> infrastructure as written looks like a setup for use after free
> problems.

The wait queue is in static non-allocated code. What do you mean?

>> I think poll() is the natural interface if you care about the data in
>> a file. It's the same an single fd you care, and not some iniotify
>> watch, fd, pathname to register, and filter for whatever comes out
>> there.
>
> Sort of.  poll is really designed for socket and pipe data.
>
> The problem with poll is there is no POLLUPDATED.

Right, hence we use POLLERR|POLLPRI, to indicate something has
changed. The case is kind of special because poll does not tell 'there
is more to read' but 'something happend, re-read it again'.

>> We already use exactly the same semantics for quite some years for
>> /proc/mounts, /proc/swaps, /proc/mdstat, ... and all over /sys. The
>> patch just provides the missing pieces that /proc provides, but
>> /proc/sys is missing.
>
> Good point.
>
> There is still the problem that the infrastructure code for the proc
> sysctl files are in much worse shape than the sysfs files.
>
>> I don't disagree, it might be nice to have generic inotify support on
>> /proc for this or other problems. But unless you want to work on it
>> now, and it would solve these problems, I don't see how we can get the
>> functionality we need, and that seems to solved with this patch.
>> Besides the fact that I think poll() is the simple and better
>> solution.
>
> Which is a question that has never been answered.

Sure, any other easy-to-use and easy-to-get-working interface you have
in mind? We can certainly use something else if it can provide us with
the updates we are looking for.

> What functionality
> do you need?  To me this all looks like a bad science experiment.

We need a simple interface to detect changes to the hostname data in
the kernel. It does not matter if the proc file is written or
sethostname() or hostname(1) is used. Any source of such change needs
to be propagated to running system services, to adapt to the new
hostname.

Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-09 13:16                 ` Kay Sievers
@ 2011-06-13 16:05                   ` Kay Sievers
  2011-06-14  3:53                     ` Lucas De Marchi
                                       ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Kay Sievers @ 2011-06-13 16:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Eric W. Biederman, Lucas De Marchi, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris, neilb

On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:

> > We already have several pollable procfs files, such as
> > fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
> > think that any work in this area should end up with those custom
> > make-procfs-pollable hacks being identified and removed.
> 
> For these files we can probably move the event counter into the
> seq_file structure, and get rid of the dance to kmalloc it and assign
> it to seq_file->private. That might simplify the logic a bit.
> 
> [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> rid of the malloc dance]

I guess, we could do something like this, which looks quite a bit
simpler by moving the poll event counter into the dynamically allocated
seq_file structure itself, instead of having private structures
allocated on top to just carry the counter (patch is just
compile-tested).

Thanks,
Kay

---
 drivers/md/md.c               |   26 ++++++++------------------
 fs/namespace.c                |    4 ++--
 fs/proc/base.c                |    2 +-
 include/linux/mnt_namespace.h |    1 -
 include/linux/seq_file.h      |    1 +
 mm/swapfile.c                 |   29 ++++++++---------------------
 6 files changed, 20 insertions(+), 43 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aa640a8..b073d36 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6377,16 +6377,11 @@ static void md_seq_stop(struct seq_file *seq, void *v)
 		mddev_put(mddev);
 }
 
-struct mdstat_info {
-	int event;
-};
-
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
 	sector_t sectors;
 	mdk_rdev_t *rdev;
-	struct mdstat_info *mi = seq->private;
 	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
@@ -6398,7 +6393,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 
 		spin_unlock(&pers_lock);
 		seq_printf(seq, "\n");
-		mi->event = atomic_read(&md_event_count);
+		seq->poll_event = atomic_read(&md_event_count);
 		return 0;
 	}
 	if (v == (void*)2) {
@@ -6510,26 +6505,21 @@ static const struct seq_operations md_seq_ops = {
 
 static int md_seq_open(struct inode *inode, struct file *file)
 {
+	struct seq_file *seq;
 	int error;
-	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
-	if (mi == NULL)
-		return -ENOMEM;
 
 	error = seq_open(file, &md_seq_ops);
 	if (error)
-		kfree(mi);
-	else {
-		struct seq_file *p = file->private_data;
-		p->private = mi;
-		mi->event = atomic_read(&md_event_count);
-	}
+		return error;
+
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&md_event_count);
 	return error;
 }
 
 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
 {
-	struct seq_file *m = filp->private_data;
-	struct mdstat_info *mi = m->private;
+	struct seq_file *seq = filp->private_data;
 	int mask;
 
 	poll_wait(filp, &md_event_waiters, wait);
@@ -6537,7 +6527,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
 	/* always allow read */
 	mask = POLLIN | POLLRDNORM;
 
-	if (mi->event != atomic_read(&md_event_count))
+	if (seq->poll_event != atomic_read(&md_event_count))
 		mask |= POLLERR | POLLPRI;
 	return mask;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index fe59bd1..cda50fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -934,8 +934,8 @@ int mnt_had_events(struct proc_mounts *p)
 	int res = 0;
 
 	br_read_lock(vfsmount_lock);
-	if (p->event != ns->event) {
-		p->event = ns->event;
+	if (p->m.poll_event != ns->event) {
+		p->m.poll_event = ns->event;
 		res = 1;
 	}
 	br_read_unlock(vfsmount_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 14def99..16d33a3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -673,7 +673,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	p->m.private = p;
 	p->ns = ns;
 	p->root = root;
-	p->event = ns->event;
+	p->m.poll_event = ns->event;
 
 	return 0;
 
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 0b89efc..2930485 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -18,7 +18,6 @@ struct proc_mounts {
 	struct seq_file m; /* must be the first element */
 	struct mnt_namespace *ns;
 	struct path root;
-	int event;
 };
 
 struct fs_struct;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 03c0232..be720cd 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -23,6 +23,7 @@ struct seq_file {
 	u64 version;
 	struct mutex lock;
 	const struct seq_operations *op;
+	int poll_event;
 	void *private;
 };
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29..5161d7d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
 }
 
 #ifdef CONFIG_PROC_FS
-struct proc_swaps {
-	struct seq_file seq;
-	int event;
-};
-
 static unsigned swaps_poll(struct file *file, poll_table *wait)
 {
-	struct proc_swaps *s = file->private_data;
+	struct seq_file *seq = file->private_data;
 
 	poll_wait(file, &proc_poll_wait, wait);
 
-	if (s->event != atomic_read(&proc_poll_event)) {
-		s->event = atomic_read(&proc_poll_event);
+	if (seq->poll_event != atomic_read(&proc_poll_event)) {
+		seq->poll_event = atomic_read(&proc_poll_event);
 		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
 	}
 
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
 
 static int swaps_open(struct inode *inode, struct file *file)
 {
-	struct proc_swaps *s;
+	struct seq_file *seq;
 	int ret;
 
-	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	file->private_data = s;
-
 	ret = seq_open(file, &swaps_op);
-	if (ret) {
-		kfree(s);
+	if (ret)
 		return ret;
-	}
 
-	s->seq.private = s;
-	s->event = atomic_read(&proc_poll_event);
-	return ret;
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&proc_poll_event);
+	return 0;
 }
 
 static const struct file_operations proc_swaps_operations = {




^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-13 16:05                   ` Kay Sievers
@ 2011-06-14  3:53                     ` Lucas De Marchi
  2011-06-14  4:17                     ` NeilBrown
  2011-07-26  2:17                     ` Lucas De Marchi
  2 siblings, 0 replies; 30+ messages in thread
From: Lucas De Marchi @ 2011-06-14  3:53 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Andrew Morton, Eric W. Biederman, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris, neilb

* Kay Sievers <kay.sievers@vrfy.org> [2011-06-13 18:05:51 +0200]:

> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> > On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > > We already have several pollable procfs files, such as
> > > fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
> > > think that any work in this area should end up with those custom
> > > make-procfs-pollable hacks being identified and removed.
> > 
> > For these files we can probably move the event counter into the
> > seq_file structure, and get rid of the dance to kmalloc it and assign
> > it to seq_file->private. That might simplify the logic a bit.
> > 
> > [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> > rid of the malloc dance]
> 
> I guess, we could do something like this, which looks quite a bit
> simpler by moving the poll event counter into the dynamically allocated
> seq_file structure itself, instead of having private structures
> allocated on top to just carry the counter (patch is just
> compile-tested).
> 
> Thanks,
> Kay
> 
> ---
>  drivers/md/md.c               |   26 ++++++++------------------
>  fs/namespace.c                |    4 ++--
>  fs/proc/base.c                |    2 +-
>  include/linux/mnt_namespace.h |    1 -
>  include/linux/seq_file.h      |    1 +
>  mm/swapfile.c                 |   29 ++++++++---------------------
>  6 files changed, 20 insertions(+), 43 deletions(-)

I've successfully tested this on top of 3.0-rc3 for /proc/mounts and
/proc/swaps. I've booted with systemd (that depends on mounts and swaps
being pollable) and created a small test using epoll.


Lucas De Marchi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-13 16:05                   ` Kay Sievers
  2011-06-14  3:53                     ` Lucas De Marchi
@ 2011-06-14  4:17                     ` NeilBrown
  2011-07-26  2:17                     ` Lucas De Marchi
  2 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2011-06-14  4:17 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Andrew Morton, Eric W. Biederman, Lucas De Marchi, Alan Cox,
	linux-kernel, Nick Piggin, Al Viro, Christoph Hellwig,
	Stephen Rothwell, David Howells, Serge E. Hallyn, Daniel Lezcano,
	Jiri Slaby, Greg Kroah-Hartman, James Morris

On Mon, 13 Jun 2011 18:05:51 +0200 Kay Sievers <kay.sievers@vrfy.org> wrote:

> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> > On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > > We already have several pollable procfs files, such as
> > > fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
> > > think that any work in this area should end up with those custom
> > > make-procfs-pollable hacks being identified and removed.
> > 
> > For these files we can probably move the event counter into the
> > seq_file structure, and get rid of the dance to kmalloc it and assign
> > it to seq_file->private. That might simplify the logic a bit.
> > 
> > [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> > rid of the malloc dance]
> 
> I guess, we could do something like this, which looks quite a bit
> simpler by moving the poll event counter into the dynamically allocated
> seq_file structure itself, instead of having private structures
> allocated on top to just carry the counter (patch is just
> compile-tested).
> 
> Thanks,
> Kay

Acked-by: NeilBrown <neilb@suse.de>

Looks like a nice improvement - thanks.

It is a pity that files don't all respond the same way to select/poll though.

Some set POLLIN|POLLRDNORM always, some set it only when there is a new event.

I have found that the former is safer.  There are some frameworks that always
use select/poll before reading, and get confused when they cannot read from
some /proc file.

And uniformity is good anyway.

NeilBrown


> 
> ---
>  drivers/md/md.c               |   26 ++++++++------------------
>  fs/namespace.c                |    4 ++--
>  fs/proc/base.c                |    2 +-
>  include/linux/mnt_namespace.h |    1 -
>  include/linux/seq_file.h      |    1 +
>  mm/swapfile.c                 |   29 ++++++++---------------------
>  6 files changed, 20 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index aa640a8..b073d36 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6377,16 +6377,11 @@ static void md_seq_stop(struct seq_file *seq, void *v)
>  		mddev_put(mddev);
>  }
>  
> -struct mdstat_info {
> -	int event;
> -};
> -
>  static int md_seq_show(struct seq_file *seq, void *v)
>  {
>  	mddev_t *mddev = v;
>  	sector_t sectors;
>  	mdk_rdev_t *rdev;
> -	struct mdstat_info *mi = seq->private;
>  	struct bitmap *bitmap;
>  
>  	if (v == (void*)1) {
> @@ -6398,7 +6393,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
>  
>  		spin_unlock(&pers_lock);
>  		seq_printf(seq, "\n");
> -		mi->event = atomic_read(&md_event_count);
> +		seq->poll_event = atomic_read(&md_event_count);
>  		return 0;
>  	}
>  	if (v == (void*)2) {
> @@ -6510,26 +6505,21 @@ static const struct seq_operations md_seq_ops = {
>  
>  static int md_seq_open(struct inode *inode, struct file *file)
>  {
> +	struct seq_file *seq;
>  	int error;
> -	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
> -	if (mi == NULL)
> -		return -ENOMEM;
>  
>  	error = seq_open(file, &md_seq_ops);
>  	if (error)
> -		kfree(mi);
> -	else {
> -		struct seq_file *p = file->private_data;
> -		p->private = mi;
> -		mi->event = atomic_read(&md_event_count);
> -	}
> +		return error;
> +
> +	seq = file->private_data;
> +	seq->poll_event = atomic_read(&md_event_count);
>  	return error;
>  }
>  
>  static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
>  {
> -	struct seq_file *m = filp->private_data;
> -	struct mdstat_info *mi = m->private;
> +	struct seq_file *seq = filp->private_data;
>  	int mask;
>  
>  	poll_wait(filp, &md_event_waiters, wait);
> @@ -6537,7 +6527,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
>  	/* always allow read */
>  	mask = POLLIN | POLLRDNORM;
>  
> -	if (mi->event != atomic_read(&md_event_count))
> +	if (seq->poll_event != atomic_read(&md_event_count))
>  		mask |= POLLERR | POLLPRI;
>  	return mask;
>  }
> diff --git a/fs/namespace.c b/fs/namespace.c
> index fe59bd1..cda50fe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -934,8 +934,8 @@ int mnt_had_events(struct proc_mounts *p)
>  	int res = 0;
>  
>  	br_read_lock(vfsmount_lock);
> -	if (p->event != ns->event) {
> -		p->event = ns->event;
> +	if (p->m.poll_event != ns->event) {
> +		p->m.poll_event = ns->event;
>  		res = 1;
>  	}
>  	br_read_unlock(vfsmount_lock);
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 14def99..16d33a3 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -673,7 +673,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
>  	p->m.private = p;
>  	p->ns = ns;
>  	p->root = root;
> -	p->event = ns->event;
> +	p->m.poll_event = ns->event;
>  
>  	return 0;
>  
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index 0b89efc..2930485 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -18,7 +18,6 @@ struct proc_mounts {
>  	struct seq_file m; /* must be the first element */
>  	struct mnt_namespace *ns;
>  	struct path root;
> -	int event;
>  };
>  
>  struct fs_struct;
> diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
> index 03c0232..be720cd 100644
> --- a/include/linux/seq_file.h
> +++ b/include/linux/seq_file.h
> @@ -23,6 +23,7 @@ struct seq_file {
>  	u64 version;
>  	struct mutex lock;
>  	const struct seq_operations *op;
> +	int poll_event;
>  	void *private;
>  };
>  
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index d537d29..5161d7d 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1681,19 +1681,14 @@ out:
>  }
>  
>  #ifdef CONFIG_PROC_FS
> -struct proc_swaps {
> -	struct seq_file seq;
> -	int event;
> -};
> -
>  static unsigned swaps_poll(struct file *file, poll_table *wait)
>  {
> -	struct proc_swaps *s = file->private_data;
> +	struct seq_file *seq = file->private_data;
>  
>  	poll_wait(file, &proc_poll_wait, wait);
>  
> -	if (s->event != atomic_read(&proc_poll_event)) {
> -		s->event = atomic_read(&proc_poll_event);
> +	if (seq->poll_event != atomic_read(&proc_poll_event)) {
> +		seq->poll_event = atomic_read(&proc_poll_event);
>  		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
>  	}
>  
> @@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
>  
>  static int swaps_open(struct inode *inode, struct file *file)
>  {
> -	struct proc_swaps *s;
> +	struct seq_file *seq;
>  	int ret;
>  
> -	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
> -	if (!s)
> -		return -ENOMEM;
> -
> -	file->private_data = s;
> -
>  	ret = seq_open(file, &swaps_op);
> -	if (ret) {
> -		kfree(s);
> +	if (ret)
>  		return ret;
> -	}
>  
> -	s->seq.private = s;
> -	s->event = atomic_read(&proc_poll_event);
> -	return ret;
> +	seq = file->private_data;
> +	seq->poll_event = atomic_read(&proc_poll_event);
> +	return 0;
>  }
>  
>  static const struct file_operations proc_swaps_operations = {
> 
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-06-13 16:05                   ` Kay Sievers
  2011-06-14  3:53                     ` Lucas De Marchi
  2011-06-14  4:17                     ` NeilBrown
@ 2011-07-26  2:17                     ` Lucas De Marchi
  2011-08-02 22:53                       ` Kay Sievers
  2 siblings, 1 reply; 30+ messages in thread
From: Lucas De Marchi @ 2011-07-26  2:17 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Andrew Morton, Eric W. Biederman, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris, neilb

On Mon, Jun 13, 2011 at 1:05 PM, Kay Sievers <kay.sievers@vrfy.org> wrote:
> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
>> On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:
>
>> > We already have several pollable procfs files, such as
>> > fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
>> > think that any work in this area should end up with those custom
>> > make-procfs-pollable hacks being identified and removed.
>>
>> For these files we can probably move the event counter into the
>> seq_file structure, and get rid of the dance to kmalloc it and assign
>> it to seq_file->private. That might simplify the logic a bit.
>>
>> [Adding Neil, to get his opinion of moving 'event' so seq_file and get
>> rid of the malloc dance]
>
> I guess, we could do something like this, which looks quite a bit
> simpler by moving the poll event counter into the dynamically allocated
> seq_file structure itself, instead of having private structures
> allocated on top to just carry the counter (patch is just
> compile-tested).

Now that this cleanup made its way, could we look again to to the
pollable sysctl implementation?


thanks
Lucas De Marchi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-07-26  2:17                     ` Lucas De Marchi
@ 2011-08-02 22:53                       ` Kay Sievers
  2011-08-02 23:16                         ` Greg KH
  0 siblings, 1 reply; 30+ messages in thread
From: Kay Sievers @ 2011-08-02 22:53 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Andrew Morton, Eric W. Biederman, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	Greg Kroah-Hartman, James Morris, neilb

On Tue, Jul 26, 2011 at 04:17, Lucas De Marchi
<lucas.demarchi@profusion.mobi> wrote:
> On Mon, Jun 13, 2011 at 1:05 PM, Kay Sievers <kay.sievers@vrfy.org> wrote:
>> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
>>> On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:
>>
>>> > We already have several pollable procfs files, such as
>>> > fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
>>> > think that any work in this area should end up with those custom
>>> > make-procfs-pollable hacks being identified and removed.
>>>
>>> For these files we can probably move the event counter into the
>>> seq_file structure, and get rid of the dance to kmalloc it and assign
>>> it to seq_file->private. That might simplify the logic a bit.
>>>
>>> [Adding Neil, to get his opinion of moving 'event' so seq_file and get
>>> rid of the malloc dance]
>>
>> I guess, we could do something like this, which looks quite a bit
>> simpler by moving the poll event counter into the dynamically allocated
>> seq_file structure itself, instead of having private structures
>> allocated on top to just carry the counter (patch is just
>> compile-tested).
>
> Now that this cleanup made its way, could we look again to to the
> pollable sysctl implementation?

Can we please get support for poll() for /proc/sys files merged? Just
like we do that for /proc already.

We like to have notifications triggered by the kernel for selected
files in /proc/sys.

I don't really see any technical reasons to hold this back.

Thanks,
Kay

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-02 22:53                       ` Kay Sievers
@ 2011-08-02 23:16                         ` Greg KH
  2011-08-03  1:12                           ` Lucas De Marchi
  0 siblings, 1 reply; 30+ messages in thread
From: Greg KH @ 2011-08-02 23:16 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Lucas De Marchi, Andrew Morton, Eric W. Biederman, Alan Cox,
	linux-kernel, Nick Piggin, Al Viro, Christoph Hellwig,
	Stephen Rothwell, David Howells, Serge E. Hallyn, Daniel Lezcano,
	Jiri Slaby, James Morris, neilb

On Wed, Aug 03, 2011 at 12:53:33AM +0200, Kay Sievers wrote:
> On Tue, Jul 26, 2011 at 04:17, Lucas De Marchi
> <lucas.demarchi@profusion.mobi> wrote:
> > On Mon, Jun 13, 2011 at 1:05 PM, Kay Sievers <kay.sievers@vrfy.org> wrote:
> >> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> >>> On Thu, Jun 9, 2011 at 00:17, Andrew Morton <akpm@linux-foundation.org> wrote:
> >>
> >>> > We already have several pollable procfs files, such as
> >>> > fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
> >>> > think that any work in this area should end up with those custom
> >>> > make-procfs-pollable hacks being identified and removed.
> >>>
> >>> For these files we can probably move the event counter into the
> >>> seq_file structure, and get rid of the dance to kmalloc it and assign
> >>> it to seq_file->private. That might simplify the logic a bit.
> >>>
> >>> [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> >>> rid of the malloc dance]
> >>
> >> I guess, we could do something like this, which looks quite a bit
> >> simpler by moving the poll event counter into the dynamically allocated
> >> seq_file structure itself, instead of having private structures
> >> allocated on top to just carry the counter (patch is just
> >> compile-tested).
> >
> > Now that this cleanup made its way, could we look again to to the
> > pollable sysctl implementation?
> 
> Can we please get support for poll() for /proc/sys files merged? Just
> like we do that for /proc already.
> 
> We like to have notifications triggered by the kernel for selected
> files in /proc/sys.
> 
> I don't really see any technical reasons to hold this back.

Care to resend the patch so we can queue it up for 3.2?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-02 23:16                         ` Greg KH
@ 2011-08-03  1:12                           ` Lucas De Marchi
  2011-08-03  9:40                             ` Eric W. Biederman
  0 siblings, 1 reply; 30+ messages in thread
From: Lucas De Marchi @ 2011-08-03  1:12 UTC (permalink / raw)
  To: Greg KH
  Cc: Kay Sievers, Andrew Morton, Eric W. Biederman, Alan Cox,
	linux-kernel, Nick Piggin, Al Viro, Christoph Hellwig,
	Stephen Rothwell, David Howells, Serge E. Hallyn, Daniel Lezcano,
	Jiri Slaby, James Morris, neilb

>From dae691ba16a6c6c50ca05e2cd69bdc808f2c4f89 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Date: Wed, 1 Jun 2011 09:14:36 -0300
Subject: [PATCH] sysctl: add support for poll()
Cc: linux-kernel@vger.kernel.org

Adding support for poll() in sysctl fs allows userspace to receive
notifications when an entry in sysctl changes. This way it's possible to
know when hostname/domainname is changed due to the respective syscall
has been called or its file under /proc/sys has been written to.

Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Acked-By: Kay Sievers <kay.sievers@vrfy.org>
---
 fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h  |    8 ++++++++
 include/linux/utsname.h |   16 ++++++++++++++++
 kernel/sys.c            |    2 ++
 kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
 5 files changed, 102 insertions(+), 0 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1a77dbe..cf4bc7c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
  */
 #include <linux/init.h>
 #include <linux/sysctl.h>
+#include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/namei.h>
@@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
 	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
 }
 
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	if (table->poll) {
+		unsigned long event = atomic_read(&table->poll->event);
+
+		filp->private_data = (void *)event;
+	}
+
+	return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	unsigned long event = (unsigned long)filp->private_data;
+	unsigned int ret = POLLIN | POLLRDNORM;
+
+	if (!table->proc_handler)
+		goto out;
+
+	if (!table->poll)
+		goto out;
+
+	poll_wait(filp, &table->poll->wait, wait);
+
+	if (event != atomic_read(&table->poll->event)) {
+		filp->private_data = (void *)(unsigned long)atomic_read(
+						&table->poll->event);
+		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+	}
+
+out:
+	return ret;
+}
 
 static int proc_sys_fill_cache(struct file *filp, void *dirent,
 				filldir_t filldir,
@@ -364,6 +402,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 }
 
 static const struct file_operations proc_sys_file_operations = {
+	.open		= proc_sys_open,
+	.poll		= proc_sys_poll,
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
 	.llseek		= default_llseek,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..96c89ba 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -25,6 +25,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/wait.h>
 
 struct completion;
 
@@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
  * cover common cases.
  */
 
+/* Support for userspace poll() to watch for changes */
+struct ctl_table_poll {
+	atomic_t event;
+	wait_queue_head_t wait;
+};
+
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
@@ -1021,6 +1028,7 @@ struct ctl_table
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
+	struct ctl_table_poll *poll;
 	void *extra1;
 	void *extra2;
 };
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 4e5b021..c714ed7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,6 +37,14 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+enum uts_proc {
+	UTS_PROC_OSTYPE,
+	UTS_PROC_OSRELEASE,
+	UTS_PROC_VERSION,
+	UTS_PROC_HOSTNAME,
+	UTS_PROC_DOMAINNAME,
+};
+
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
@@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
 }
 #endif
 
+#ifdef CONFIG_PROC_SYSCTL
+extern void uts_proc_notify(enum uts_proc proc);
+#else
+static inline void uts_proc_notify(enum uts_proc proc)
+{
+}
+#endif
+
 static inline struct new_utsname *utsname(void)
 {
 	return &current->nsproxy->uts_ns->name;
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba3..2347043 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1241,6 +1241,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_HOSTNAME);
 	up_write(&uts_sem);
 	return errno;
 }
@@ -1291,6 +1292,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_DOMAINNAME);
 	up_write(&uts_sem);
 	return errno;
 }
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e..e96b766 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -13,6 +13,7 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
+#include <linux/wait.h>
 
 static void *get_uts(ctl_table *table, int write)
 {
@@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
 	uts_table.data = get_uts(table, write);
 	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
+
+	if (write) {
+		atomic_inc(&table->poll->event);
+		wake_up_interruptible(&table->poll->wait);
+	}
+
 	return r;
 }
 #else
 #define proc_do_uts_string NULL
 #endif
 
+static struct ctl_table_poll hostname_poll = {
+	.event		= ATOMIC_INIT(0),
+	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
+};
+
+static struct ctl_table_poll domainname_poll = {
+	.event		= ATOMIC_INIT(0),
+	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
+};
+
 static struct ctl_table uts_kern_table[] = {
 	{
 		.procname	= "ostype",
@@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &hostname_poll,
 	},
 	{
 		.procname	= "domainname",
@@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &domainname_poll,
 	},
 	{}
 };
@@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
 	{}
 };
 
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+	struct ctl_table *table = &uts_kern_table[proc];
+
+	if (!table->poll)
+		return;
+
+	atomic_inc(&table->poll->event);
+	wake_up_interruptible(&table->poll->wait);
+}
+#endif
+
 static int __init utsname_sysctl_init(void)
 {
 	register_sysctl_table(uts_root_table);
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-03  1:12                           ` Lucas De Marchi
@ 2011-08-03  9:40                             ` Eric W. Biederman
  2011-08-03 13:17                               ` Lucas De Marchi
  0 siblings, 1 reply; 30+ messages in thread
From: Eric W. Biederman @ 2011-08-03  9:40 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Greg KH, Kay Sievers, Andrew Morton, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	James Morris, neilb

Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:

>>From dae691ba16a6c6c50ca05e2cd69bdc808f2c4f89 Mon Sep 17 00:00:00 2001
> From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
> Date: Wed, 1 Jun 2011 09:14:36 -0300
> Subject: [PATCH] sysctl: add support for poll()
> Cc: linux-kernel@vger.kernel.org
>
> Adding support for poll() in sysctl fs allows userspace to receive
> notifications when an entry in sysctl changes. This way it's possible to
> know when hostname/domainname is changed due to the respective syscall
> has been called or its file under /proc/sys has been written to.

The patch breaks the return code of poll, is ugly, and it increases the
size of struct ctl_table.

I'm pretty certain I mentioned the return code breakage before.

In principle this is reasonable with the POLLERR | POLLPRI idiom but
I'm not a fan of this code.

Eric



> Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
> Acked-By: Kay Sievers <kay.sievers@vrfy.org>
> ---
>  fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
>  include/linux/sysctl.h  |    8 ++++++++
>  include/linux/utsname.h |   16 ++++++++++++++++
>  kernel/sys.c            |    2 ++
>  kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
>  5 files changed, 102 insertions(+), 0 deletions(-)
>
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index 1a77dbe..cf4bc7c 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -3,6 +3,7 @@
>   */
>  #include <linux/init.h>
>  #include <linux/sysctl.h>
> +#include <linux/poll.h>
>  #include <linux/proc_fs.h>
>  #include <linux/security.h>
>  #include <linux/namei.h>
> @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
>  	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
>  }
>  
> +static int proc_sys_open(struct inode *inode, struct file *filp)
> +{
> +	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> +
> +	if (table->poll) {
> +		unsigned long event = atomic_read(&table->poll->event);
> +
> +		filp->private_data = (void *)event;

It would be nice if there was something that abstracted
filp->private_data accesses so they were type safe, and
clear what you were doing.

> +	}
> +
> +	return 0;
> +}
> +
> +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
> +{
> +	struct inode *inode = filp->f_path.dentry->d_inode;
> +	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> +	unsigned long event = (unsigned long)filp->private_data;
> +	unsigned int ret = POLLIN | POLLRDNORM;

The default return value here must be DEFAULT_POLLMASK otherwise you
change the result of poll for every single proc file and in general
return the wrong values because most sysctls are writable.

> +	if (!table->proc_handler)
> +		goto out;
> +
> +	if (!table->poll)
> +		goto out;
> +
> +	poll_wait(filp, &table->poll->wait, wait);
> +
> +	if (event != atomic_read(&table->poll->event)) {
> +		filp->private_data = (void *)(unsigned long)atomic_read(
> +						&table->poll->event);
> +		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;

Why are you updating the event here instead of when the file is read?
Presumably we want until the file is read and people see the new value.

> +	}
> +
> +out:
> +	return ret;
> +}
>  
>  static int proc_sys_fill_cache(struct file *filp, void *dirent,
>  				filldir_t filldir,
> @@ -364,6 +402,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>  }
>  
>  static const struct file_operations proc_sys_file_operations = {
> +	.open		= proc_sys_open,
> +	.poll		= proc_sys_poll,
>  	.read		= proc_sys_read,
>  	.write		= proc_sys_write,
>  	.llseek		= default_llseek,
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 11684d9..96c89ba 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -25,6 +25,7 @@
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/compiler.h>
> +#include <linux/wait.h>
>  
>  struct completion;
>  
> @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
>   * cover common cases.
>   */
>  
> +/* Support for userspace poll() to watch for changes */
> +struct ctl_table_poll {
> +	atomic_t event;
> +	wait_queue_head_t wait;
> +};

It is a moderately big stretch to think that everything that wants to
poll a value under /proc/sys will want to use struct ctl_table_poll.

>  /* A sysctl table is an array of struct ctl_table: */
>  struct ctl_table 
>  {
> @@ -1021,6 +1028,7 @@ struct ctl_table
>  	struct ctl_table *child;
>  	struct ctl_table *parent;	/* Automatically set */
>  	proc_handler *proc_handler;	/* Callback for text formatting */
> +	struct ctl_table_poll *poll;
>  	void *extra1;
>  	void *extra2;
>  };
> diff --git a/include/linux/utsname.h b/include/linux/utsname.h
> index 4e5b021..c714ed7 100644
> --- a/include/linux/utsname.h
> +++ b/include/linux/utsname.h
> @@ -37,6 +37,14 @@ struct new_utsname {
>  #include <linux/nsproxy.h>
>  #include <linux/err.h>
>  
> +enum uts_proc {
> +	UTS_PROC_OSTYPE,
> +	UTS_PROC_OSRELEASE,
> +	UTS_PROC_VERSION,
> +	UTS_PROC_HOSTNAME,
> +	UTS_PROC_DOMAINNAME,
> +};
> +
>  struct user_namespace;
>  extern struct user_namespace init_user_ns;
>  
> @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
>  }
>  #endif
>  
> +#ifdef CONFIG_PROC_SYSCTL
> +extern void uts_proc_notify(enum uts_proc proc);
> +#else
> +static inline void uts_proc_notify(enum uts_proc proc)
> +{
> +}
> +#endif
> +
>  static inline struct new_utsname *utsname(void)
>  {
>  	return &current->nsproxy->uts_ns->name;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index a101ba3..2347043 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1241,6 +1241,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
>  		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
>  		errno = 0;
>  	}
> +	uts_proc_notify(UTS_PROC_HOSTNAME);
>  	up_write(&uts_sem);
>  	return errno;
>  }
> @@ -1291,6 +1292,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
>  		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
>  		errno = 0;
>  	}
> +	uts_proc_notify(UTS_PROC_DOMAINNAME);
>  	up_write(&uts_sem);
>  	return errno;
>  }
> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
> index a2cd77e..e96b766 100644
> --- a/kernel/utsname_sysctl.c
> +++ b/kernel/utsname_sysctl.c
> @@ -13,6 +13,7 @@
>  #include <linux/uts.h>
>  #include <linux/utsname.h>
>  #include <linux/sysctl.h>
> +#include <linux/wait.h>
>  
>  static void *get_uts(ctl_table *table, int write)
>  {
> @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
>  	uts_table.data = get_uts(table, write);
>  	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
>  	put_uts(table, write, uts_table.data);
> +
> +	if (write) {
> +		atomic_inc(&table->poll->event);
> +		wake_up_interruptible(&table->poll->wait);

Duplicating code again?  Why are you not calling your generic notify?

> +	}
> +
>  	return r;
>  }
>  #else
>  #define proc_do_uts_string NULL
>  #endif
>  

> +static struct ctl_table_poll hostname_poll = {
> +	.event		= ATOMIC_INIT(0),
> +	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
> +};
> +
> +static struct ctl_table_poll domainname_poll = {
> +	.event		= ATOMIC_INIT(0),
> +	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
> +};

No generic initialize for this generic structure?  Instead every user
must repeat the initialization?

>  static struct ctl_table uts_kern_table[] = {
>  	{
>  		.procname	= "ostype",
> @@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
>  		.maxlen		= sizeof(init_uts_ns.name.nodename),
>  		.mode		= 0644,
>  		.proc_handler	= proc_do_uts_string,
> +		.poll		= &hostname_poll,
>  	},
>  	{
>  		.procname	= "domainname",
> @@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
>  		.maxlen		= sizeof(init_uts_ns.name.domainname),
>  		.mode		= 0644,
>  		.proc_handler	= proc_do_uts_string,
> +		.poll		= &domainname_poll,
>  	},
>  	{}
>  };
> @@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
>  	{}
>  };
>  
> +#ifdef CONFIG_PROC_SYSCTL
> +/*
> + * Notify userspace about a change in a certain entry of uts_kern_table,
> + * identified by the parameter proc.
> + */
> +void uts_proc_notify(enum uts_proc proc)
> +{
> +	struct ctl_table *table = &uts_kern_table[proc];
> +
> +	if (!table->poll)
> +		return;
> +
> +	atomic_inc(&table->poll->event);
> +	wake_up_interruptible(&table->poll->wait);

Since you have a generic structure why don't you have a generic function
that calls the atomic_inc and wake_up_interruptible.

> +}
> +#endif
> +
>  static int __init utsname_sysctl_init(void)
>  {
>  	register_sysctl_table(uts_root_table);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-03  9:40                             ` Eric W. Biederman
@ 2011-08-03 13:17                               ` Lucas De Marchi
  2011-08-03 18:08                                 ` Eric W. Biederman
  0 siblings, 1 reply; 30+ messages in thread
From: Lucas De Marchi @ 2011-08-03 13:17 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Greg KH, Kay Sievers, Andrew Morton, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	James Morris, neilb

Hi Eric,

On Wed, Aug 3, 2011 at 6:40 AM, Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>
> >>From dae691ba16a6c6c50ca05e2cd69bdc808f2c4f89 Mon Sep 17 00:00:00 2001
> > From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
> > Date: Wed, 1 Jun 2011 09:14:36 -0300
> > Subject: [PATCH] sysctl: add support for poll()
> > Cc: linux-kernel@vger.kernel.org
> >
> > Adding support for poll() in sysctl fs allows userspace to receive
> > notifications when an entry in sysctl changes. This way it's possible to
> > know when hostname/domainname is changed due to the respective syscall
> > has been called or its file under /proc/sys has been written to.
>
> The patch breaks the return code of poll, is ugly, and it increases the
> size of struct ctl_table.
>
> I'm pretty certain I mentioned the return code breakage before.

And Kay already answered this. I'm answering each one now.

> In principle this is reasonable with the POLLERR | POLLPRI idiom but
> I'm not a fan of this code.
>
> Eric
>
>
>
> > Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
> > Acked-By: Kay Sievers <kay.sievers@vrfy.org>
> > ---
> >  fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
> >  include/linux/sysctl.h  |    8 ++++++++
> >  include/linux/utsname.h |   16 ++++++++++++++++
> >  kernel/sys.c            |    2 ++
> >  kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
> >  5 files changed, 102 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> > index 1a77dbe..cf4bc7c 100644
> > --- a/fs/proc/proc_sysctl.c
> > +++ b/fs/proc/proc_sysctl.c
> > @@ -3,6 +3,7 @@
> >   */
> >  #include <linux/init.h>
> >  #include <linux/sysctl.h>
> > +#include <linux/poll.h>
> >  #include <linux/proc_fs.h>
> >  #include <linux/security.h>
> >  #include <linux/namei.h>
> > @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
> >       return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
> >  }
> >
> > +static int proc_sys_open(struct inode *inode, struct file *filp)
> > +{
> > +     struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> > +
> > +     if (table->poll) {
> > +             unsigned long event = atomic_read(&table->poll->event);
> > +
> > +             filp->private_data = (void *)event;
>
> It would be nice if there was something that abstracted
> filp->private_data accesses so they were type safe, and
> clear what you were doing.

private_data is used everywhere to save this private-per-inode data.
Here it's just the current number of events. It doesn't matter the
number, as long as when we wake up in the poll() we can compare these
numbers.

>
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
> > +{
> > +     struct inode *inode = filp->f_path.dentry->d_inode;
> > +     struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> > +     unsigned long event = (unsigned long)filp->private_data;
> > +     unsigned int ret = POLLIN | POLLRDNORM;
>
> The default return value here must be DEFAULT_POLLMASK otherwise you
> change the result of poll for every single proc file and in general

This is not true. This patch has nothing to do with proc files and
will not change their return values. However I did use the same return
values as used for proc files that support poll (see
fs/proc/base.c:mounts_poll()

> return the wrong values because most sysctls are writable.

For sysctl, there wasn't any poll support so how could I break
something that didn't exist?


>
> > +     if (!table->proc_handler)
> > +             goto out;
> > +
> > +     if (!table->poll)
> > +             goto out;
> > +
> > +     poll_wait(filp, &table->poll->wait, wait);
> > +
> > +     if (event != atomic_read(&table->poll->event)) {
> > +             filp->private_data = (void *)(unsigned long)atomic_read(
> > +                                             &table->poll->event);
> > +             ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
>
> Why are you updating the event here instead of when the file is read?
> Presumably we want until the file is read and people see the new value.

This is to indicate, when we are polling the file , that the file
should be read again. When the process wake up we compare the filp's
copy of the event number with the one in table. Since they were the
same when the file was opened, if they are different now it indicates
the the file was updated and should be read again.

There's no way to do this in read: we update table->poll->event
whenever the table entry is changed and when wakening up from
poll_wait() we compare if these values are the same.

Again, this is the same as done for other proc files that support
polling (see mm/swapfile.c:swaps_poll())

>
> > +     }
> > +
> > +out:
> > +     return ret;
> > +}
> >
> >  static int proc_sys_fill_cache(struct file *filp, void *dirent,
> >                               filldir_t filldir,
> > @@ -364,6 +402,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
> >  }
> >
> >  static const struct file_operations proc_sys_file_operations = {
> > +     .open           = proc_sys_open,
> > +     .poll           = proc_sys_poll,
> >       .read           = proc_sys_read,
> >       .write          = proc_sys_write,
> >       .llseek         = default_llseek,
> > diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> > index 11684d9..96c89ba 100644
> > --- a/include/linux/sysctl.h
> > +++ b/include/linux/sysctl.h
> > @@ -25,6 +25,7 @@
> >  #include <linux/kernel.h>
> >  #include <linux/types.h>
> >  #include <linux/compiler.h>
> > +#include <linux/wait.h>
> >
> >  struct completion;
> >
> > @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
> >   * cover common cases.
> >   */
> >
> > +/* Support for userspace poll() to watch for changes */
> > +struct ctl_table_poll {
> > +     atomic_t event;
> > +     wait_queue_head_t wait;
> > +};
>
> It is a moderately big stretch to think that everything that wants to
> poll a value under /proc/sys will want to use struct ctl_table_poll.

This is the way we know what processes are polling() this entry (hence
the wait queue). It's a well known idiom and again maps to the
existent pollable files in /proc


>
> >  /* A sysctl table is an array of struct ctl_table: */
> >  struct ctl_table
> >  {
> > @@ -1021,6 +1028,7 @@ struct ctl_table
> >       struct ctl_table *child;
> >       struct ctl_table *parent;       /* Automatically set */
> >       proc_handler *proc_handler;     /* Callback for text formatting */
> > +     struct ctl_table_poll *poll;
> >       void *extra1;
> >       void *extra2;
> >  };
> > diff --git a/include/linux/utsname.h b/include/linux/utsname.h
> > index 4e5b021..c714ed7 100644
> > --- a/include/linux/utsname.h
> > +++ b/include/linux/utsname.h
> > @@ -37,6 +37,14 @@ struct new_utsname {
> >  #include <linux/nsproxy.h>
> >  #include <linux/err.h>
> >
> > +enum uts_proc {
> > +     UTS_PROC_OSTYPE,
> > +     UTS_PROC_OSRELEASE,
> > +     UTS_PROC_VERSION,
> > +     UTS_PROC_HOSTNAME,
> > +     UTS_PROC_DOMAINNAME,
> > +};
> > +
> >  struct user_namespace;
> >  extern struct user_namespace init_user_ns;
> >
> > @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
> >  }
> >  #endif
> >
> > +#ifdef CONFIG_PROC_SYSCTL
> > +extern void uts_proc_notify(enum uts_proc proc);
> > +#else
> > +static inline void uts_proc_notify(enum uts_proc proc)
> > +{
> > +}
> > +#endif
> > +
> >  static inline struct new_utsname *utsname(void)
> >  {
> >       return &current->nsproxy->uts_ns->name;
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index a101ba3..2347043 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -1241,6 +1241,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
> >               memset(u->nodename + len, 0, sizeof(u->nodename) - len);
> >               errno = 0;
> >       }
> > +     uts_proc_notify(UTS_PROC_HOSTNAME);
> >       up_write(&uts_sem);
> >       return errno;
> >  }
> > @@ -1291,6 +1292,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
> >               memset(u->domainname + len, 0, sizeof(u->domainname) - len);
> >               errno = 0;
> >       }
> > +     uts_proc_notify(UTS_PROC_DOMAINNAME);
> >       up_write(&uts_sem);
> >       return errno;
> >  }
> > diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
> > index a2cd77e..e96b766 100644
> > --- a/kernel/utsname_sysctl.c
> > +++ b/kernel/utsname_sysctl.c
> > @@ -13,6 +13,7 @@
> >  #include <linux/uts.h>
> >  #include <linux/utsname.h>
> >  #include <linux/sysctl.h>
> > +#include <linux/wait.h>
> >
> >  static void *get_uts(ctl_table *table, int write)
> >  {
> > @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
> >       uts_table.data = get_uts(table, write);
> >       r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
> >       put_uts(table, write, uts_table.data);
> > +
> > +     if (write) {
> > +             atomic_inc(&table->poll->event);
> > +             wake_up_interruptible(&table->poll->wait);
>
> Duplicating code again?  Why are you not calling your generic notify?

Duplicating? The uts_proc_notify() was only created so when user calls
hostname/domainname syscalls we wake up who is polling the
correspondent sysctl entries.

Here it's for the generic case in which we write to /proc/sysctl.

We sure need to send a notification for the user when the entry is
changed, either via syscall or the fs entry.

>
> > +     }
> > +
> >       return r;
> >  }
> >  #else
> >  #define proc_do_uts_string NULL
> >  #endif
> >
>
> > +static struct ctl_table_poll hostname_poll = {
> > +     .event          = ATOMIC_INIT(0),
> > +     .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
> > +};
> > +
> > +static struct ctl_table_poll domainname_poll = {
> > +     .event          = ATOMIC_INIT(0),
> > +     .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
> > +};
>
> No generic initialize for this generic structure?  Instead every user
> must repeat the initialization?

The wait queues are different, and the ".poll" is set to this. I can't
see how we could generalize this. Maybe creating a macro
CTL_TABLE_POLL_DECLARE() or something along the way, but I'm not sure
it's worth doing.


> > @@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
> >       {}
> >  };
> >
> > +#ifdef CONFIG_PROC_SYSCTL
> > +/*
> > + * Notify userspace about a change in a certain entry of uts_kern_table,
> > + * identified by the parameter proc.
> > + */
> > +void uts_proc_notify(enum uts_proc proc)
> > +{
> > +     struct ctl_table *table = &uts_kern_table[proc];
> > +
> > +     if (!table->poll)
> > +             return;
> > +
> > +     atomic_inc(&table->poll->event);
> > +     wake_up_interruptible(&table->poll->wait);
>
> Since you have a generic structure why don't you have a generic function
> that calls the atomic_inc and wake_up_interruptible.

It can be done. Maybe as a followup?


regards,
Lucas De Marchi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-03 13:17                               ` Lucas De Marchi
@ 2011-08-03 18:08                                 ` Eric W. Biederman
  2011-08-03 18:45                                   ` Lucas De Marchi
  0 siblings, 1 reply; 30+ messages in thread
From: Eric W. Biederman @ 2011-08-03 18:08 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Greg KH, Kay Sievers, Andrew Morton, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	James Morris, neilb

Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:

> On Wed, Aug 3, 2011 at 6:40 AM, Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>>
>> > @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
>> >       return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
>> >  }
>> >
>> > +static int proc_sys_open(struct inode *inode, struct file *filp)
>> > +{
>> > +     struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>> > +
>> > +     if (table->poll) {
>> > +             unsigned long event = atomic_read(&table->poll->event);
>> > +
>> > +             filp->private_data = (void *)event;
>>
>> It would be nice if there was something that abstracted
>> filp->private_data accesses so they were type safe, and
>> clear what you were doing.
>
> private_data is used everywhere to save this private-per-inode data.
> Here it's just the current number of events. It doesn't matter the
> number, as long as when we wake up in the poll() we can compare these
> numbers.

It sure matters in code readablity if you have to cast the value
every time.  A pair of light wrappers around filp->private_data
that abstracts away the cast would be useful.

>> > +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
>> > +{
>> > +     struct inode *inode = filp->f_path.dentry->d_inode;
>> > +     struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>> > +     unsigned long event = (unsigned long)filp->private_data;
>> > +     unsigned int ret = POLLIN | POLLRDNORM;
>>
>> The default return value here must be DEFAULT_POLLMASK otherwise you
>> change the result of poll for every single proc file and in general
>
> This is not true. This patch has nothing to do with proc files and
> will not change their return values. However I did use the same return
> values as used for proc files that support poll (see
> fs/proc/base.c:mounts_poll()

Apologies I meant /proc/sys/ files and it is absolutely true that
you are changing the poll behavior of every sysctl with this change.

/proc/mounts doesn't return writable because it doesn't support
writes.  Most sysctl files support writes and thus should return
writeable.

>> return the wrong values because most sysctls are writable.
>
> For sysctl, there wasn't any poll support so how could I break
> something that didn't exist?

Not true.  There is the generic support of poll that happens
when poll is not implemented.  For a generic file that does
not implement specifically implement poll files return
both readable and writable.

Which ultimately is why we have to use POLLERR | POLLPRI
and not the more obvious values of readable or writable.

>> > +     if (!table->proc_handler)
>> > +             goto out;
>> > +
>> > +     if (!table->poll)
>> > +             goto out;
>> > +
>> > +     poll_wait(filp, &table->poll->wait, wait);
>> > +
>> > +     if (event != atomic_read(&table->poll->event)) {
>> > +             filp->private_data = (void *)(unsigned long)atomic_read(
>> > +                                             &table->poll->event);
>> > +             ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
>>
>> Why are you updating the event here instead of when the file is read?
>> Presumably we want until the file is read and people see the new value.
>
> This is to indicate, when we are polling the file , that the file
> should be read again. When the process wake up we compare the filp's
> copy of the event number with the one in table. Since they were the
> same when the file was opened, if they are different now it indicates
> the the file was updated and should be read again.
>
> There's no way to do this in read: we update table->poll->event
> whenever the table entry is changed and when wakening up from
> poll_wait() we compare if these values are the same.

What do you mean there is no way to do this in read?
That is how it is implemented in sysfs and it currently feels more
intuitive.  There might be a semantic reason for not doing it in
read that I am missing but it is certainly technically possible.

>> > @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
>> >   * cover common cases.
>> >   */
>> >
>> > +/* Support for userspace poll() to watch for changes */
>> > +struct ctl_table_poll {
>> > +     atomic_t event;
>> > +     wait_queue_head_t wait;
>> > +};
>>
>> It is a moderately big stretch to think that everything that wants to
>> poll a value under /proc/sys will want to use struct ctl_table_poll.
>
> This is the way we know what processes are polling() this entry (hence
> the wait queue). It's a well known idiom and again maps to the
> existent pollable files in /proc

No.  This is similar but this is not identical to what is done for every
file in proc.  I agree we need something to flag that we are pollable.

>> > @@ -1291,6 +1292,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
>> >               memset(u->domainname + len, 0, sizeof(u->domainname) - len);
>> >               errno = 0;
>> >       }
>> > +     uts_proc_notify(UTS_PROC_DOMAINNAME);
>> >       up_write(&uts_sem);
>> >       return errno;
>> >  }
>> > diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
>> > index a2cd77e..e96b766 100644
>> > --- a/kernel/utsname_sysctl.c
>> > +++ b/kernel/utsname_sysctl.c
>> > @@ -13,6 +13,7 @@
>> >  #include <linux/uts.h>
>> >  #include <linux/utsname.h>
>> >  #include <linux/sysctl.h>
>> > +#include <linux/wait.h>
>> >
>> >  static void *get_uts(ctl_table *table, int write)
>> >  {
>> > @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
>> >       uts_table.data = get_uts(table, write);
>> >       r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
>> >       put_uts(table, write, uts_table.data);
>> > +
>> > +     if (write) {
>> > +             atomic_inc(&table->poll->event);
>> > +             wake_up_interruptible(&table->poll->wait);
>>
>> Duplicating code again?  Why are you not calling your generic notify?
>
> Duplicating? The uts_proc_notify() was only created so when user calls
> hostname/domainname syscalls we wake up who is polling the
> correspondent sysctl entries.

> Here it's for the generic case in which we write to /proc/sysctl.

Except the routine you have changed isn't for the generic case it
is for a specific set of writes to domainname or hostname.

I agree we need code on the update the value through sysctl path.
However if you are going to go with a generic structure why not
implement a generic function to update that structure.  Especially since
you have two copies of this logic to update the notify logic already.

>> > +static struct ctl_table_poll hostname_poll = {
>> > +     .event          = ATOMIC_INIT(0),
>> > +     .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
>> > +};
>> > +
>> > +static struct ctl_table_poll domainname_poll = {
>> > +     .event          = ATOMIC_INIT(0),
>> > +     .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
>> > +};
>>
>> No generic initialize for this generic structure?  Instead every user
>> must repeat the initialization?
>
> The wait queues are different, and the ".poll" is set to this. I can't
> see how we could generalize this. Maybe creating a macro
> CTL_TABLE_POLL_DECLARE() or something along the way, but I'm not sure
> it's worth doing.

If we are going to have a generic structure it is worth doing,
to reduce maintenance when someone changes things around and for
code clarity.

That initializer is just a hair better than line noise.

You can't seem to make up your mind here if you are implementing
a generic facility that should work for all sysctl entries or
if you are implementing a one off for hostname and nisdomainname.

For a one off what you have is ugly but who cares (except that you touch
the generic code). For a generic facility you are missing common
helpers.

Eric

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-03 18:08                                 ` Eric W. Biederman
@ 2011-08-03 18:45                                   ` Lucas De Marchi
  2011-08-04 18:57                                     ` Lucas De Marchi
  0 siblings, 1 reply; 30+ messages in thread
From: Lucas De Marchi @ 2011-08-03 18:45 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Greg KH, Kay Sievers, Andrew Morton, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	James Morris, neilb

Hi Eric,

On Wed, Aug 3, 2011 at 3:08 PM, Eric W. Biederman <ebiederm@xmission.com> wrote:
> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>
>> On Wed, Aug 3, 2011 at 6:40 AM, Eric W. Biederman <ebiederm@xmission.com> wrote:
>>>
>>> Lucas De Marchi <lucas.demarchi@profusion.mobi> writes:
>>>
>>> > @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
>>> >       return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
>>> >  }
>>> >
>>> > +static int proc_sys_open(struct inode *inode, struct file *filp)
>>> > +{
>>> > +     struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>>> > +
>>> > +     if (table->poll) {
>>> > +             unsigned long event = atomic_read(&table->poll->event);
>>> > +
>>> > +             filp->private_data = (void *)event;
>>>
>>> It would be nice if there was something that abstracted
>>> filp->private_data accesses so they were type safe, and
>>> clear what you were doing.
>>
>> private_data is used everywhere to save this private-per-inode data.
>> Here it's just the current number of events. It doesn't matter the
>> number, as long as when we wake up in the poll() we can compare these
>> numbers.
>
> It sure matters in code readablity if you have to cast the value
> every time.  A pair of light wrappers around filp->private_data
> that abstracts away the cast would be useful.

Ok, I'll put a wrapper here.

>
>>> > +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
>>> > +{
>>> > +     struct inode *inode = filp->f_path.dentry->d_inode;
>>> > +     struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>>> > +     unsigned long event = (unsigned long)filp->private_data;
>>> > +     unsigned int ret = POLLIN | POLLRDNORM;
>>>
>>> The default return value here must be DEFAULT_POLLMASK otherwise you
>>> change the result of poll for every single proc file and in general
>>
>> This is not true. This patch has nothing to do with proc files and
>> will not change their return values. However I did use the same return
>> values as used for proc files that support poll (see
>> fs/proc/base.c:mounts_poll()
>
> Apologies I meant /proc/sys/ files and it is absolutely true that
> you are changing the poll behavior of every sysctl with this change.
>
> /proc/mounts doesn't return writable because it doesn't support
> writes.  Most sysctl files support writes and thus should return
> writeable.
>>> return the wrong values because most sysctls are writable.
>>
>> For sysctl, there wasn't any poll support so how could I break
>> something that didn't exist?
>
> Not true.  There is the generic support of poll that happens
> when poll is not implemented.  For a generic file that does
> not implement specifically implement poll files return
> both readable and writable.
>
> Which ultimately is why we have to use POLLERR | POLLPRI
> and not the more obvious values of readable or writable.


Ok.


>>> > +     if (!table->proc_handler)
>>> > +             goto out;
>>> > +
>>> > +     if (!table->poll)
>>> > +             goto out;
>>> > +
>>> > +     poll_wait(filp, &table->poll->wait, wait);
>>> > +
>>> > +     if (event != atomic_read(&table->poll->event)) {
>>> > +             filp->private_data = (void *)(unsigned long)atomic_read(
>>> > +                                             &table->poll->event);
>>> > +             ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
>>>
>>> Why are you updating the event here instead of when the file is read?
>>> Presumably we want until the file is read and people see the new value.
>>
>> This is to indicate, when we are polling the file , that the file
>> should be read again. When the process wake up we compare the filp's
>> copy of the event number with the one in table. Since they were the
>> same when the file was opened, if they are different now it indicates
>> the the file was updated and should be read again.
>>
>> There's no way to do this in read: we update table->poll->event
>> whenever the table entry is changed and when wakening up from
>> poll_wait() we compare if these values are the same.
>
> What do you mean there is no way to do this in read?
> That is how it is implemented in sysfs and it currently feels more
> intuitive.  There might be a semantic reason for not doing it in
> read that I am missing but it is certainly technically possible.

I thought you were saying that the *notification* should be done in
read. Now I understood that you are only saying that we should update
the value of `event' when the value is read.

Which one would be better for the following scenario?

1. users polls the file
2. wake up
3. don't read the file
4. poll again.

With my implementation he will be notified about the *new* changes
(the ones that happen after he decided to poll the file again).

In your case, he might get woken up because of the prior change. So
the user would always have to read the file after wakening  up.

[snip]
>>> > diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
>>> > index a2cd77e..e96b766 100644
>>> > --- a/kernel/utsname_sysctl.c
>>> > +++ b/kernel/utsname_sysctl.c
>>> > @@ -13,6 +13,7 @@
>>> >  #include <linux/uts.h>
>>> >  #include <linux/utsname.h>
>>> >  #include <linux/sysctl.h>
>>> > +#include <linux/wait.h>
>>> >
>>> >  static void *get_uts(ctl_table *table, int write)
>>> >  {
>>> > @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
>>> >       uts_table.data = get_uts(table, write);
>>> >       r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
>>> >       put_uts(table, write, uts_table.data);
>>> > +
>>> > +     if (write) {
>>> > +             atomic_inc(&table->poll->event);
>>> > +             wake_up_interruptible(&table->poll->wait);
>>>
>>> Duplicating code again?  Why are you not calling your generic notify?
>>
>> Duplicating? The uts_proc_notify() was only created so when user calls
>> hostname/domainname syscalls we wake up who is polling the
>> correspondent sysctl entries.
>
>> Here it's for the generic case in which we write to /proc/sysctl.
>
> Except the routine you have changed isn't for the generic case it
> is for a specific set of writes to domainname or hostname.
>
> I agree we need code on the update the value through sysctl path.
> However if you are going to go with a generic structure why not
> implement a generic function to update that structure.  Especially since
> you have two copies of this logic to update the notify logic already.

Ok.


thanks
Lucas De Marchi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH] sysctl: add support for poll()
  2011-08-03 18:45                                   ` Lucas De Marchi
@ 2011-08-04 18:57                                     ` Lucas De Marchi
  2011-08-23 17:57                                       ` Greg KH
  2011-08-26 21:06                                       ` Andrew Morton
  0 siblings, 2 replies; 30+ messages in thread
From: Lucas De Marchi @ 2011-08-04 18:57 UTC (permalink / raw)
  To: Greg KH, Kay Sievers, Andrew Morton, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	James Morris, neilb
  Cc: Lucas De Marchi

Adding support for poll() in sysctl fs allows userspace to receive
notifications of changes in sysctl entries. This adds a infrastructure
to allow files in sysctl fs to be pollable and implements it for
hostname and domainname.

Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
---
 fs/proc/proc_sysctl.c   |   45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h  |   22 ++++++++++++++++++++++
 include/linux/utsname.h |   16 ++++++++++++++++
 kernel/sys.c            |    2 ++
 kernel/utsname_sysctl.c |   23 +++++++++++++++++++++++
 5 files changed, 108 insertions(+), 0 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1a77dbe..ae8bcd3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
  */
 #include <linux/init.h>
 #include <linux/sysctl.h>
+#include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/namei.h>
@@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+void proc_sys_poll_notify(struct ctl_table_poll *poll)
+{
+	if (!poll)
+		return;
+
+	atomic_inc(&poll->event);
+	wake_up_interruptible(&poll->wait);
+}
+
 static struct inode *proc_sys_make_inode(struct super_block *sb,
 		struct ctl_table_header *head, struct ctl_table *table)
 {
@@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
 	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
 }
 
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	if (table->poll)
+		filp->private_data = proc_sys_poll_event(table->poll);
+
+	return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	unsigned long event = (unsigned long)filp->private_data;
+	unsigned int ret = DEFAULT_POLLMASK;
+
+	if (!table->proc_handler)
+		goto out;
+
+	if (!table->poll)
+		goto out;
+
+	poll_wait(filp, &table->poll->wait, wait);
+
+	if (event != atomic_read(&table->poll->event)) {
+		filp->private_data = proc_sys_poll_event(table->poll);
+		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+	}
+
+out:
+	return ret;
+}
 
 static int proc_sys_fill_cache(struct file *filp, void *dirent,
 				filldir_t filldir,
@@ -364,6 +407,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 }
 
 static const struct file_operations proc_sys_file_operations = {
+	.open		= proc_sys_open,
+	.poll		= proc_sys_poll,
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
 	.llseek		= default_llseek,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..b6820cc 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -931,6 +931,7 @@ enum
 #ifdef __KERNEL__
 #include <linux/list.h>
 #include <linux/rcupdate.h>
+#include <linux/wait.h>
 
 /* For the /proc/sys support */
 struct ctl_table;
@@ -1011,6 +1012,26 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
  * cover common cases.
  */
 
+/* Support for userspace poll() to watch for changes */
+struct ctl_table_poll {
+	atomic_t event;
+	wait_queue_head_t wait;
+};
+
+static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
+{
+	return (void *)(unsigned long)atomic_read(&poll->event);
+}
+
+void proc_sys_poll_notify(struct ctl_table_poll *poll);
+
+#define __CTL_TABLE_POLL_INITIALIZER(name) {				\
+	.event = ATOMIC_INIT(0),					\
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) }
+
+#define DECLARE_CTL_TABLE_POLL(name)					\
+	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
+
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
@@ -1021,6 +1042,7 @@ struct ctl_table
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
+	struct ctl_table_poll *poll;
 	void *extra1;
 	void *extra2;
 };
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 4e5b021..c714ed7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,6 +37,14 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+enum uts_proc {
+	UTS_PROC_OSTYPE,
+	UTS_PROC_OSRELEASE,
+	UTS_PROC_VERSION,
+	UTS_PROC_HOSTNAME,
+	UTS_PROC_DOMAINNAME,
+};
+
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
@@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
 }
 #endif
 
+#ifdef CONFIG_PROC_SYSCTL
+extern void uts_proc_notify(enum uts_proc proc);
+#else
+static inline void uts_proc_notify(enum uts_proc proc)
+{
+}
+#endif
+
 static inline struct new_utsname *utsname(void)
 {
 	return &current->nsproxy->uts_ns->name;
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba3..2347043 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1241,6 +1241,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_HOSTNAME);
 	up_write(&uts_sem);
 	return errno;
 }
@@ -1291,6 +1292,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_DOMAINNAME);
 	up_write(&uts_sem);
 	return errno;
 }
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e..21d4b79 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -13,6 +13,7 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
+#include <linux/wait.h>
 
 static void *get_uts(ctl_table *table, int write)
 {
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
 	uts_table.data = get_uts(table, write);
 	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
+
+	if (write)
+		proc_sys_poll_notify(table->poll);
+
 	return r;
 }
 #else
 #define proc_do_uts_string NULL
 #endif
 
+static DECLARE_CTL_TABLE_POLL(hostname_poll);
+static DECLARE_CTL_TABLE_POLL(domainname_poll);
+
 static struct ctl_table uts_kern_table[] = {
 	{
 		.procname	= "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &hostname_poll,
 	},
 	{
 		.procname	= "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &domainname_poll,
 	},
 	{}
 };
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
 	{}
 };
 
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+	struct ctl_table *table = &uts_kern_table[proc];
+
+	proc_sys_poll_notify(table->poll);
+}
+#endif
+
 static int __init utsname_sysctl_init(void)
 {
 	register_sysctl_table(uts_root_table);
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-04 18:57                                     ` Lucas De Marchi
@ 2011-08-23 17:57                                       ` Greg KH
  2011-08-26 21:06                                       ` Andrew Morton
  1 sibling, 0 replies; 30+ messages in thread
From: Greg KH @ 2011-08-23 17:57 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Greg KH, Kay Sievers, Andrew Morton, Alan Cox, linux-kernel,
	Nick Piggin, Al Viro, Christoph Hellwig, Stephen Rothwell,
	David Howells, Serge E. Hallyn, Daniel Lezcano, Jiri Slaby,
	James Morris, neilb

On Thu, Aug 04, 2011 at 03:57:18PM -0300, Lucas De Marchi wrote:
> Adding support for poll() in sysctl fs allows userspace to receive
> notifications of changes in sysctl entries. This adds a infrastructure
> to allow files in sysctl fs to be pollable and implements it for
> hostname and domainname.
> 
> Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
> ---
>  fs/proc/proc_sysctl.c   |   45 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/sysctl.h  |   22 ++++++++++++++++++++++
>  include/linux/utsname.h |   16 ++++++++++++++++
>  kernel/sys.c            |    2 ++
>  kernel/utsname_sysctl.c |   23 +++++++++++++++++++++++
>  5 files changed, 108 insertions(+), 0 deletions(-)

Who is going to take this through their tree?

I can through the driver-core tree if no one else objects.

Any objections?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] sysctl: add support for poll()
  2011-08-04 18:57                                     ` Lucas De Marchi
  2011-08-23 17:57                                       ` Greg KH
@ 2011-08-26 21:06                                       ` Andrew Morton
  1 sibling, 0 replies; 30+ messages in thread
From: Andrew Morton @ 2011-08-26 21:06 UTC (permalink / raw)
  To: Lucas De Marchi
  Cc: Greg KH, Kay Sievers, Alan Cox, linux-kernel, Nick Piggin,
	Al Viro, Christoph Hellwig, Stephen Rothwell, David Howells,
	Serge E. Hallyn, Daniel Lezcano, Jiri Slaby, James Morris, neilb

On Thu,  4 Aug 2011 15:57:18 -0300
Lucas De Marchi <lucas.demarchi@profusion.mobi> wrote:

> Adding support for poll() in sysctl fs allows userspace to receive
> notifications of changes in sysctl entries. This adds a infrastructure
> to allow files in sysctl fs to be pollable and implements it for
> hostname and domainname.
> 

Seems reasonable.  One slight nit:

> +#define __CTL_TABLE_POLL_INITIALIZER(name) {				\
> +	.event = ATOMIC_INIT(0),					\
> +	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) }
> +
> +#define DECLARE_CTL_TABLE_POLL(name)					\
> +	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)

This macro is used to define an instance of ctl_table_poll.  Hence it
should be called DEFINE_CTL_TABLE_POLL, not DECLARE_*.

declaration: "one of these exists"
definition: "here it is".

This matters a bit, because sometimes we'll have separate macros for
declaring and defining.


^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2011-08-26 21:07 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-01 12:14 [PATCH] sysctl: add support for poll() Lucas De Marchi
2011-06-02  2:51 ` Lucas De Marchi
2011-06-02  3:31   ` Eric W. Biederman
2011-06-02 12:06     ` Kay Sievers
2011-06-02 12:43       ` Alan Cox
2011-06-02 13:01         ` Kay Sievers
2011-06-02 13:02         ` Lucas De Marchi
2011-06-02 13:12           ` Alan Cox
2011-06-02 13:24             ` Kay Sievers
2011-06-02 13:56           ` Eric W. Biederman
2011-06-02 17:32             ` Kay Sievers
2011-06-08 22:17               ` Andrew Morton
2011-06-09 13:16                 ` Kay Sievers
2011-06-13 16:05                   ` Kay Sievers
2011-06-14  3:53                     ` Lucas De Marchi
2011-06-14  4:17                     ` NeilBrown
2011-07-26  2:17                     ` Lucas De Marchi
2011-08-02 22:53                       ` Kay Sievers
2011-08-02 23:16                         ` Greg KH
2011-08-03  1:12                           ` Lucas De Marchi
2011-08-03  9:40                             ` Eric W. Biederman
2011-08-03 13:17                               ` Lucas De Marchi
2011-08-03 18:08                                 ` Eric W. Biederman
2011-08-03 18:45                                   ` Lucas De Marchi
2011-08-04 18:57                                     ` Lucas De Marchi
2011-08-23 17:57                                       ` Greg KH
2011-08-26 21:06                                       ` Andrew Morton
2011-06-12 15:34               ` Eric W. Biederman
2011-06-13 14:28                 ` Kay Sievers
2011-06-02 14:16         ` Eric W. Biederman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.