linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 7/8] fdmap v2 - implement sys_socket2
@ 2007-06-06 22:30 Davide Libenzi
  2007-06-06 22:44 ` David Miller
                   ` (2 more replies)
  0 siblings, 3 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-06 22:30 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Linus Torvalds, Andrew Morton, Ulrich Drepper, Ingo Molnar, Eric Dumazet

This patch implement a new syscall sys_socket2(), that accepts an
extra "flags" parameter:

int socket2(int domain, int type, int protocol, int flags);

The flags parameter is used to pass extra flags to the kernel, and is
at the moment used to select the file descriptor allocations inside
the non-sequential area (O_NONSEQFD). The remaining parameters are
exactly the same as the ones of sys_socket().
The sys_accept() system call has been modified to return a file
descriptor inside the non-sequential area, if the listening fd is.
The sys_socketcall() system call has been also changed to support
a new SYS_SOCKET2 indentifier.



Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide



Index: linux-2.6.mod/fs/9p/trans_fd.c
===================================================================
--- linux-2.6.mod.orig/fs/9p/trans_fd.c	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/9p/trans_fd.c	2007-06-06 12:48:41.000000000 -0700
@@ -181,7 +181,7 @@
 	int fd, ret;
 
 	csocket->sk->sk_allocation = GFP_NOIO;
-	if ((fd = sock_map_fd(csocket)) < 0) {
+	if ((fd = sock_map_fd(csocket, 0)) < 0) {
 		eprintk(KERN_ERR, "v9fs_socket_open: failed to map fd\n");
 		ret = fd;
 	      release_csocket:
Index: linux-2.6.mod/include/linux/net.h
===================================================================
--- linux-2.6.mod.orig/include/linux/net.h	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/include/linux/net.h	2007-06-06 13:09:20.000000000 -0700
@@ -43,6 +43,7 @@
 #define SYS_GETSOCKOPT	15		/* sys_getsockopt(2)		*/
 #define SYS_SENDMSG	16		/* sys_sendmsg(2)		*/
 #define SYS_RECVMSG	17		/* sys_recvmsg(2)		*/
+#define SYS_SOCKET2	18		/* sys_socket2(2)		*/
 
 typedef enum {
 	SS_FREE = 0,			/* not allocated		*/
@@ -190,7 +191,7 @@
 				  size_t len);
 extern int	     sock_recvmsg(struct socket *sock, struct msghdr *msg,
 				  size_t size, int flags);
-extern int 	     sock_map_fd(struct socket *sock);
+extern int 	     sock_map_fd(struct socket *sock, int flags);
 extern struct socket *sockfd_lookup(int fd, int *err);
 #define		     sockfd_put(sock) fput(sock->file)
 extern int	     net_ratelimit(void);
Index: linux-2.6.mod/net/sctp/socket.c
===================================================================
--- linux-2.6.mod.orig/net/sctp/socket.c	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/net/sctp/socket.c	2007-06-06 12:48:41.000000000 -0700
@@ -3605,7 +3605,7 @@
 		goto out;
 
 	/* Map the socket to an unused fd that can be returned to the user.  */
-	retval = sock_map_fd(newsock);
+	retval = sock_map_fd(newsock, 0);
 	if (retval < 0) {
 		sock_release(newsock);
 		goto out;
Index: linux-2.6.mod/net/socket.c
===================================================================
--- linux-2.6.mod.orig/net/socket.c	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/net/socket.c	2007-06-06 13:14:08.000000000 -0700
@@ -344,11 +344,11 @@
  *	but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
 	int fd;
 
-	fd = get_unused_fd();
+	fd = allocate_fd(flags);
 	if (likely(fd >= 0)) {
 		struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
 	return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+int sock_map_fd(struct socket *sock, int flags)
 {
 	struct file *newfile;
-	int fd = sock_alloc_fd(&newfile);
+	int fd = sock_alloc_fd(&newfile, flags);
 
 	if (likely(fd >= 0)) {
 		int err = sock_attach_fd(sock, newfile);
@@ -1198,7 +1198,7 @@
 	return __sock_create(family, type, protocol, res, 1);
 }
 
-asmlinkage long sys_socket(int family, int type, int protocol)
+asmlinkage long sys_socket2(int family, int type, int protocol, int flags)
 {
 	int retval;
 	struct socket *sock;
@@ -1207,7 +1207,7 @@
 	if (retval < 0)
 		goto out;
 
-	retval = sock_map_fd(sock);
+	retval = sock_map_fd(sock, flags);
 	if (retval < 0)
 		goto out_release;
 
@@ -1220,6 +1220,11 @@
 	return retval;
 }
 
+asmlinkage long sys_socket(int family, int type, int protocol)
+{
+	return sys_socket2(family, type, protocol, 0);
+}
+
 /*
  *	Create a pair of connected sockets.
  */
@@ -1248,11 +1253,11 @@
 	if (err < 0)
 		goto out_release_both;
 
-	fd1 = sock_alloc_fd(&newfile1);
+	fd1 = sock_alloc_fd(&newfile1, 0);
 	if (unlikely(fd1 < 0))
 		goto out_release_both;
 
-	fd2 = sock_alloc_fd(&newfile2);
+	fd2 = sock_alloc_fd(&newfile2, 0);
 	if (unlikely(fd2 < 0)) {
 		put_filp(newfile1);
 		put_unused_fd(fd1);
@@ -1407,7 +1412,8 @@
 	 */
 	__module_get(newsock->ops->owner);
 
-	newfd = sock_alloc_fd(&newfile);
+	newfd = sock_alloc_fd(&newfile,
+	      fd > current->signal->rlim[RLIMIT_NOFILE].rlim_cur ? O_NONSEQFD: 0);
 	if (unlikely(newfd < 0)) {
 		err = newfd;
 		sock_release(newsock);
@@ -1983,10 +1989,11 @@
 
 /* Argument list sizes for sys_socketcall */
 #define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[18]={
+static const unsigned char nargs[]={
 	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
-	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)
+	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
+	AL(4)
 };
 
 #undef AL
@@ -2005,7 +2012,7 @@
 	unsigned long a0, a1;
 	int err;
 
-	if (call < 1 || call > SYS_RECVMSG)
+	if (call < 1 || call >= ARRAY_SIZE(nargs))
 		return -EINVAL;
 
 	/* copy_from_user should be SMP safe. */
@@ -2082,6 +2089,9 @@
 	case SYS_RECVMSG:
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
+	case SYS_SOCKET2:
+		err = sys_socket2(a0, a1, a[2], a[3]);
+		break;
 	default:
 		err = -EINVAL;
 		break;


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:30 [patch 7/8] fdmap v2 - implement sys_socket2 Davide Libenzi
@ 2007-06-06 22:44 ` David Miller
  2007-06-06 22:52   ` Davide Libenzi
  2007-06-06 22:57   ` Ulrich Drepper
  2007-06-06 22:59 ` Alan Cox
  2007-06-07  0:29 ` Arnd Bergmann
  2 siblings, 2 replies; 129+ messages in thread
From: David Miller @ 2007-06-06 22:44 UTC (permalink / raw)
  To: davidel; +Cc: linux-kernel, torvalds, akpm, drepper, mingo, dada1

From: Davide Libenzi <davidel@xmailserver.org>
Date: Wed, 06 Jun 2007 15:30:31 -0700

> This patch implement a new syscall sys_socket2(), that accepts an
> extra "flags" parameter:
> 
> int socket2(int domain, int type, int protocol, int flags);
> 
> The flags parameter is used to pass extra flags to the kernel, and is
> at the moment used to select the file descriptor allocations inside
> the non-sequential area (O_NONSEQFD). The remaining parameters are
> exactly the same as the ones of sys_socket().
> The sys_accept() system call has been modified to return a file
> descriptor inside the non-sequential area, if the listening fd is.
> The sys_socketcall() system call has been also changed to support
> a new SYS_SOCKET2 indentifier.
> 
> Signed-off-by: Davide Libenzi <davidel@xmailserver.org>

Since the valid range of "domain" values is quite small,
we could avoid the new system call by cribbing some of the
upper bits of the 'domain' argument.

Valid existing programs pass in valid 'domain' values and
thus will not set any of the new flags.

Just an idea.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:44 ` David Miller
@ 2007-06-06 22:52   ` Davide Libenzi
  2007-06-06 22:57     ` David Miller
  2007-06-06 22:57   ` Ulrich Drepper
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-06 22:52 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, dada1

On Wed, 6 Jun 2007, David Miller wrote:

> From: Davide Libenzi <davidel@xmailserver.org>
> Date: Wed, 06 Jun 2007 15:30:31 -0700
> 
> > This patch implement a new syscall sys_socket2(), that accepts an
> > extra "flags" parameter:
> > 
> > int socket2(int domain, int type, int protocol, int flags);
> > 
> > The flags parameter is used to pass extra flags to the kernel, and is
> > at the moment used to select the file descriptor allocations inside
> > the non-sequential area (O_NONSEQFD). The remaining parameters are
> > exactly the same as the ones of sys_socket().
> > The sys_accept() system call has been modified to return a file
> > descriptor inside the non-sequential area, if the listening fd is.
> > The sys_socketcall() system call has been also changed to support
> > a new SYS_SOCKET2 indentifier.
> > 
> > Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
> 
> Since the valid range of "domain" values is quite small,
> we could avoid the new system call by cribbing some of the
> upper bits of the 'domain' argument.
> 
> Valid existing programs pass in valid 'domain' values and
> thus will not set any of the new flags.
> 
> Just an idea.

I have no huge preferences. If not the slight one of using the same flags 
for open() and socket{2}() (O_NONSEQFD). If we overload socket() we may 
need to fight with existing O_* flags. OTOH the current free ones are 
pretty high in bits, so it may be OK too.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:44 ` David Miller
  2007-06-06 22:52   ` Davide Libenzi
@ 2007-06-06 22:57   ` Ulrich Drepper
  2007-06-06 23:02     ` David Miller
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-06 22:57 UTC (permalink / raw)
  To: David Miller; +Cc: davidel, linux-kernel, torvalds, akpm, mingo, dada1

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

David Miller wrote:
> Since the valid range of "domain" values is quite small,
> we could avoid the new system call by cribbing some of the
> upper bits of the 'domain' argument.
> 
> Valid existing programs pass in valid 'domain' values and
> thus will not set any of the new flags.

I can see several problems with that:

- - experimental implementers might choose domain values which definitely
won't collide with others

- - the flags parameter ideally allows using the same values used for
open's mode argument.  The lowest value I can see making sense is
O_NONBLOCK (04000).

- - how to recognize kernels without the support?  -EAFNOSUPPORT can also
with new kernels mean it's actually the domain which is wrong

- - there might be new flags we want to use over time

I would strongly argue that any change we're doing in this area at
userlevel would involve a new interface.  Programs also need new
definitions from headers files.  This means a recent enough glibc will
be needed in any case.  Unless programs use their own definitions in
which case they might as well use the syscall() function.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGZzvl2ijCOnn/RHQRAgV0AKDBDhqSQ/cs4qGYLKGL4dwzpFZ2zgCgl/qO
oFKnQ2eRuiziRu/N5vwWCeM=
=tttP
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:52   ` Davide Libenzi
@ 2007-06-06 22:57     ` David Miller
  0 siblings, 0 replies; 129+ messages in thread
From: David Miller @ 2007-06-06 22:57 UTC (permalink / raw)
  To: davidel; +Cc: linux-kernel, torvalds, akpm, drepper, mingo, dada1

From: Davide Libenzi <davidel@xmailserver.org>
Date: Wed, 6 Jun 2007 15:52:31 -0700 (PDT)

> I have no huge preferences. If not the slight one of using the same flags 
> for open() and socket{2}() (O_NONSEQFD). If we overload socket() we may 
> need to fight with existing O_* flags. OTOH the current free ones are 
> pretty high in bits, so it may be OK too.

I see.

To me this tends to give higher weight to just adding the
new socket2() system call.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:59 ` Alan Cox
@ 2007-06-06 22:58   ` Ulrich Drepper
  2007-06-06 23:04   ` Davide Libenzi
  2007-06-07 20:10   ` Linus Torvalds
  2 siblings, 0 replies; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-06 22:58 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar, Eric Dumazet

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> 	prctl(PR_SPARSEFD, 1);
> 
> to turn on sparse fd allocation for a process ?

Yes, there is.  Go back and read the archives.  It has been discussed in
depth.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGZzwk2ijCOnn/RHQRAtKGAKCTX5njQnYeyDn4XUGFAZ3Ojai+mwCeN/j0
jibBDSqQpXhR2CwIQNRAnXw=
=sJb0
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:30 [patch 7/8] fdmap v2 - implement sys_socket2 Davide Libenzi
  2007-06-06 22:44 ` David Miller
@ 2007-06-06 22:59 ` Alan Cox
  2007-06-06 22:58   ` Ulrich Drepper
                     ` (2 more replies)
  2007-06-07  0:29 ` Arnd Bergmann
  2 siblings, 3 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-06 22:59 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

> The sys_accept() system call has been modified to return a file
> descriptor inside the non-sequential area, if the listening fd is.
> The sys_socketcall() system call has been also changed to support
> a new SYS_SOCKET2 indentifier.

This still all seems really really ugly. Is there anything wrong with
throwing all these extra cases out and replacing the entire lot with

	prctl(PR_SPARSEFD, 1);

to turn on sparse fd allocation for a process ?

Anyone needing to deal with certain special fds will use dup2() anyway so
a task global switch seems to be cleaner and make the behaviour simply to
flip on, with no extra calls (and you need to submit man pages for them
all too), and also more importantly no new glibc stuff should be needed,
and a process can try to set sparsefd, fail and carry on so its more
portable and back portable.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:57   ` Ulrich Drepper
@ 2007-06-06 23:02     ` David Miller
  0 siblings, 0 replies; 129+ messages in thread
From: David Miller @ 2007-06-06 23:02 UTC (permalink / raw)
  To: drepper; +Cc: davidel, linux-kernel, torvalds, akpm, mingo, dada1

From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 06 Jun 2007 15:57:41 -0700

> I would strongly argue that any change we're doing in this area at
> userlevel would involve a new interface.  Programs also need new
> definitions from headers files.  This means a recent enough glibc will
> be needed in any case.  Unless programs use their own definitions in
> which case they might as well use the syscall() function.

To be honest, after reading Alan's response a few moments ago
I'm growing in favor of his suggestions and that all of these
new system calls perhaps really are overkill.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:59 ` Alan Cox
  2007-06-06 22:58   ` Ulrich Drepper
@ 2007-06-06 23:04   ` Davide Libenzi
  2007-06-06 23:08     ` David Miller
  2007-06-06 23:19     ` Alan Cox
  2007-06-07 20:10   ` Linus Torvalds
  2 siblings, 2 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-06 23:04 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Wed, 6 Jun 2007, Alan Cox wrote:

> > The sys_accept() system call has been modified to return a file
> > descriptor inside the non-sequential area, if the listening fd is.
> > The sys_socketcall() system call has been also changed to support
> > a new SYS_SOCKET2 indentifier.
> 
> This still all seems really really ugly. Is there anything wrong with
> throwing all these extra cases out and replacing the entire lot with
> 
> 	prctl(PR_SPARSEFD, 1);
> 
> to turn on sparse fd allocation for a process ?

There was a little discussion where I tried to whisper something similar, 
but Linus and Uli shot me :) - with good reasons IMO.
You may link to runtimes that are not non-sequentialfd aware, and will 
break them.



> Anyone needing to deal with certain special fds will use dup2() anyway so
> a task global switch seems to be cleaner and make the behaviour simply to
> flip on, with no extra calls (and you need to submit man pages for them
> all too), and also more importantly no new glibc stuff should be needed,
> and a process can try to set sparsefd, fail and carry on so its more
> portable and back portable.

Man pages! Damn, I forgot Michael Kerrisk is already waiting for the other 
stuff :(



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 23:04   ` Davide Libenzi
@ 2007-06-06 23:08     ` David Miller
  2007-06-06 23:19     ` Alan Cox
  1 sibling, 0 replies; 129+ messages in thread
From: David Miller @ 2007-06-06 23:08 UTC (permalink / raw)
  To: davidel; +Cc: alan, linux-kernel, torvalds, akpm, drepper, mingo, dada1

From: Davide Libenzi <davidel@xmailserver.org>
Date: Wed, 6 Jun 2007 16:04:40 -0700 (PDT)

> On Wed, 6 Jun 2007, Alan Cox wrote:
> 
> > > The sys_accept() system call has been modified to return a file
> > > descriptor inside the non-sequential area, if the listening fd is.
> > > The sys_socketcall() system call has been also changed to support
> > > a new SYS_SOCKET2 indentifier.
> > 
> > This still all seems really really ugly. Is there anything wrong with
> > throwing all these extra cases out and replacing the entire lot with
> > 
> > 	prctl(PR_SPARSEFD, 1);
> > 
> > to turn on sparse fd allocation for a process ?
> 
> There was a little discussion where I tried to whisper something similar, 
> but Linus and Uli shot me :) - with good reasons IMO.
> You may link to runtimes that are not non-sequentialfd aware, and will 
> break them.

Thanks for explaining this issue clearly instead of telling people
to "go read the archives" in a condescending manner like someone
else did.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 23:04   ` Davide Libenzi
  2007-06-06 23:08     ` David Miller
@ 2007-06-06 23:19     ` Alan Cox
  2007-06-06 23:22       ` Ulrich Drepper
  2007-06-06 23:29       ` Davide Libenzi
  1 sibling, 2 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-06 23:19 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

> > 	prctl(PR_SPARSEFD, 1);
> > 
> > to turn on sparse fd allocation for a process ?
> 
> There was a little discussion where I tried to whisper something similar, 
> but Linus and Uli shot me :) - with good reasons IMO.
> You may link to runtimes that are not non-sequentialfd aware, and will 
> break them.

Linking to the correct version of a libary and getting the library
versioning right is not rocket science and isn't a sane excuse. Its no
different to the stdio to large fd migration issues with many Unixen and
they all coped just fine.

Really all this new syscall hackery stuff is just too ugly to live. If
you use the prctl then yes we have a bit of library versioning to worry
about for the odd library that cares but thats a once over thing. The
crappy zillion extra syscalls we have to support for years and years just
to save a little bit of userspace work.

At its most moronic its no different to 32 and 64bit binary linking - and
the gnu tools manage to cope with stopping me linking a 32bit app to a
64bit lib and vice versa just fine, so I'm sure they can cope the same
way with sparse fd safe/non sparese fd safe libraries


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 23:19     ` Alan Cox
@ 2007-06-06 23:22       ` Ulrich Drepper
  2007-06-07 10:04         ` Alan Cox
  2007-06-06 23:29       ` Davide Libenzi
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-06 23:22 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar, Eric Dumazet

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> Linking to the correct version of a libary and getting the library
> versioning right is not rocket science and isn't a sane excuse. Its no
> different to the stdio to large fd migration issues with many Unixen and
> they all coped just fine.

This has nothing to do with linking and ABI.  The assumptions about
continuous allocation are part of the API.  It's required by POSIX and
provided by Unix since the early days.  There are entire code bases out
there which depend on this assumption.  Linking with code like this,
before or after the new version controlled symbol is introduced, will
break it.  Policies or stateful behavior, however yo want to call it, is
just plain wrong for this (and most other things).

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGZ0G92ijCOnn/RHQRAkB9AJ93ol7XV2GiCw+8wgbJ9uMBnHU6dQCgmmAp
9m+WEup3iPkEHH6HIHDa88I=
=Dhto
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 23:19     ` Alan Cox
  2007-06-06 23:22       ` Ulrich Drepper
@ 2007-06-06 23:29       ` Davide Libenzi
  2007-06-07 10:06         ` Alan Cox
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-06 23:29 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, 7 Jun 2007, Alan Cox wrote:

> > > 	prctl(PR_SPARSEFD, 1);
> > > 
> > > to turn on sparse fd allocation for a process ?
> > 
> > There was a little discussion where I tried to whisper something similar, 
> > but Linus and Uli shot me :) - with good reasons IMO.
> > You may link to runtimes that are not non-sequentialfd aware, and will 
> > break them.
> 
> Linking to the correct version of a libary and getting the library
> versioning right is not rocket science and isn't a sane excuse. Its no
> different to the stdio to large fd migration issues with many Unixen and
> they all coped just fine.

I don't think it's a matter of versioning. Many userspace libraries 
expects their fds to be compact (for many reasons - they use select, they 
use them to index 0-based arrays, etc...), and if the kernel suddendly 
starts returning values in the 1<<28 up arena, they sure won't be happy.
So I believe that the correct way is that the caller specifically selects 
the feature, leaving the legacy fd allocation as default.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:30 [patch 7/8] fdmap v2 - implement sys_socket2 Davide Libenzi
  2007-06-06 22:44 ` David Miller
  2007-06-06 22:59 ` Alan Cox
@ 2007-06-07  0:29 ` Arnd Bergmann
  2007-06-07  0:33   ` Davide Libenzi
  2 siblings, 1 reply; 129+ messages in thread
From: Arnd Bergmann @ 2007-06-07  0:29 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thursday 07 June 2007, Davide Libenzi wrote:
> The sys_socketcall() system call has been also changed to support
> a new SYS_SOCKET2 indentifier.

I thought the general agreement was that sys_socketcall is a bad
idea to start with. Is there any benefit in adding new calls to
it instead of using a new system call number for sys_socket2 on
all architectures?

	Arnd <><

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07  0:29 ` Arnd Bergmann
@ 2007-06-07  0:33   ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07  0:33 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, 7 Jun 2007, Arnd Bergmann wrote:

> On Thursday 07 June 2007, Davide Libenzi wrote:
> > The sys_socketcall() system call has been also changed to support
> > a new SYS_SOCKET2 indentifier.
> 
> I thought the general agreement was that sys_socketcall is a bad
> idea to start with. Is there any benefit in adding new calls to
> it instead of using a new system call number for sys_socket2 on
> all architectures?

Ohh, I didn't know it was flagged as "bad" ;) I actually had it that way, 
but then I noticed there was no __NR_socket, so I complied to the 
existing way of doing it.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 23:22       ` Ulrich Drepper
@ 2007-06-07 10:04         ` Alan Cox
  2007-06-07 11:59           ` Kyle Moffett
  2007-06-07 14:25           ` Ulrich Drepper
  0 siblings, 2 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-07 10:04 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Davide Libenzi, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar, Eric Dumazet

> This has nothing to do with linking and ABI.  The assumptions about

It may not to someone who has their head in linkers all day and very
defined ideas about linkers but joining to pieces of code together and
figuring out if they can be combined is "linking", regardless of the
whether-they-can-be-joined being down to symbol availability, instruction
set, memory layout or assumptions about behaviour.

> continuous allocation are part of the API.  It's required by POSIX and
> provided by Unix since the early days.  There are entire code bases out
> there which depend on this assumption.  Linking with code like this,
> before or after the new version controlled symbol is introduced, will
> break it. 

If your linker is doing its job then you won't be able to link them
together because they have incompatible assumptions. Not exactly rocket
science even if it is done a little differently to the usual symbol
compatibility tests. 

> Policies or stateful behavior, however yo want to call it, is
> just plain wrong for this (and most other things).

And the stuff you are trying to put into the kernel is better ? No, its a
bunch of ugly hacks caused by trying to solve the problem in the wrong
way and in the wrong place.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 23:29       ` Davide Libenzi
@ 2007-06-07 10:06         ` Alan Cox
  2007-06-07 10:45           ` Eric Dumazet
  2007-06-07 15:41           ` Davide Libenzi
  0 siblings, 2 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-07 10:06 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

> I don't think it's a matter of versioning. Many userspace libraries 
> expects their fds to be compact (for many reasons - they use select, they 
> use them to index 0-based arrays, etc...), and if the kernel suddendly 
> starts returning values in the 1<<28 up arena, they sure won't be happy.
> So I believe that the correct way is that the caller specifically selects 
> the feature, leaving the legacy fd allocation as default.

I don't understand the connection between this paragraph (with which I
agree) and the urge to add a ton of ugly syscall hacks. "Caller
specifically selects feature" - > prctl(). Libraries get unhappy ->
linker issue.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 10:06         ` Alan Cox
@ 2007-06-07 10:45           ` Eric Dumazet
  2007-06-07 11:27             ` Alan Cox
  2007-06-07 15:41           ` Davide Libenzi
  1 sibling, 1 reply; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 10:45 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ulrich Drepper, Ingo Molnar

On Thu, 7 Jun 2007 11:06:23 +0100
Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

> > I don't think it's a matter of versioning. Many userspace libraries 
> > expects their fds to be compact (for many reasons - they use select, they 
> > use them to index 0-based arrays, etc...), and if the kernel suddendly 
> > starts returning values in the 1<<28 up arena, they sure won't be happy.
> > So I believe that the correct way is that the caller specifically selects 
> > the feature, leaving the legacy fd allocation as default.
> 
> I don't understand the connection between this paragraph (with which I
> agree) and the urge to add a ton of ugly syscall hacks. "Caller
> specifically selects feature" - > prctl(). Libraries get unhappy ->
> linker issue.
> 

Alan, prctl() things are usually inherited at fork()/exec() time.

If you fork() from a new application , then exec an old one 
(eventually a statically linked program), we have a problem ?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 10:45           ` Eric Dumazet
@ 2007-06-07 11:27             ` Alan Cox
  0 siblings, 0 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-07 11:27 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Davide Libenzi, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ulrich Drepper, Ingo Molnar

> Alan, prctl() things are usually inherited at fork()/exec() time.

You want them inherited at fork time.

> If you fork() from a new application , then exec an old one 
> (eventually a statically linked program), we have a problem ?

Only if you decide not to reset the value. There are lots of things we do
inherit and lots we don't.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 10:04         ` Alan Cox
@ 2007-06-07 11:59           ` Kyle Moffett
  2007-06-07 13:12             ` Eric Dumazet
  2007-06-07 14:25           ` Ulrich Drepper
  1 sibling, 1 reply; 129+ messages in thread
From: Kyle Moffett @ 2007-06-07 11:59 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, Davide Libenzi, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar, Eric Dumazet

On Jun 07, 2007, at 06:04:32, Alan Cox wrote:
>> continuous allocation are part of the API.  It's required by POSIX  
>> and provided by Unix since the early days.  There are entire code  
>> bases out there which depend on this assumption.  Linking with  
>> code like this, before or after the new version controlled symbol  
>> is introduced, will break it.
>
> If your linker is doing its job then you won't be able to link them  
> together because they have incompatible assumptions. Not exactly  
> rocket science even if it is done a little differently to the usual  
> symbol compatibility tests.

No, there is a fundamental problem with "solving" this via linking.   
Many programs need the continuous FD allocation space, but then have  
tendancies to do things like:

   int i;
   for (i = 0; i < 1024; i++)
           if (i != comm_sock && i != server_sock)
                   close(i);

Yes I have seen many such programs, it seems to be a pretty standard  
idiom.

On the other hand, that makes it completely impossible for libraries  
to reliably use FDs behind the application's back; they might get  
closed on you at any time without warning.  One such library is  
glibc; it would really like to be able to use file-descriptors behind  
the back of the rest of userspace to implement certain functionality:
   http://www.mail-archive.com/linux-kernel@vger.kernel.org/ 
msg163522.html

Likewise there are a massive group of other libraries (especially  
user-interface and server related ones) that would really like to  
have support for creating file-descriptors without the top-level  
application closing them randomly (like several shells seem to).

Cheers,
Kyle Moffett


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 11:59           ` Kyle Moffett
@ 2007-06-07 13:12             ` Eric Dumazet
  2007-06-07 15:51               ` Davide Libenzi
  2007-06-07 19:49               ` Davide Libenzi
  0 siblings, 2 replies; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 13:12 UTC (permalink / raw)
  To: Kyle Moffett, Davide Libenzi
  Cc: Alan Cox, Ulrich Drepper, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 7 Jun 2007 07:59:47 -0400
Kyle Moffett <mrmacman_g4@mac.com> wrote:


> Likewise there are a massive group of other libraries (especially  
> user-interface and server related ones) that would really like to  
> have support for creating file-descriptors without the top-level  
> application closing them randomly (like several shells seem to).
> 

True, shells are sometimes quite strange.

For example, bash uses file descriptor 255 (FD_CLOEXEC)

When it forks a new process, child gets a file table with 256 slots.

At exec() time, 255 is closed but file table doesnt shrink.
(shrinking is done at fork() time only)

With fdmap, that means each process started by bash uses at least 
256 * sizeof(list_head) bytes, ie 4096 bytes on x86_64, even if only three 
file-descriptors are opened (0,1,2)

FD_CLOFORK should help here (BTW : current patch from Davide doesnt take this
into account and might need a change in fdmap_top_open_fd())

Davide, are you sure we want FIFO for non sequential allocations ?

This tends to use all the fmap slots, and not very cache friendly
if an app does a lot of [open(),...,close()] things. We already got a 
perf drop because of RCUification of file freeing (FIFO mode instead 
of LIFO given by kmalloc()/kfree())

If the idea behind this FIFO was security (ie not easy for an app to predict 
next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag, 
wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of 
fmap->slist, not head.


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 10:04         ` Alan Cox
  2007-06-07 11:59           ` Kyle Moffett
@ 2007-06-07 14:25           ` Ulrich Drepper
  2007-06-07 17:56             ` Eric Dumazet
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 14:25 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar, Eric Dumazet

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
>> continuous allocation are part of the API.  It's required by POSIX and
>> provided by Unix since the early days.  There are entire code bases out
>> there which depend on this assumption.  Linking with code like this,
>> before or after the new version controlled symbol is introduced, will
>> break it. 
> 
> If your linker is doing its job then you won't be able to link them
> together because they have incompatible assumptions. Not exactly rocket
> science even if it is done a little differently to the usual symbol
> compatibility tests. 

No.  You still don't appreciate the problem.

Assume I would change code so that newly compiled programs which use
open, socket, etc generate different references which cannot be
satisfied by old runtimes.  That and only that is something linkers can
see, linkers deal with symbols and references.

If I would do that and people would recompile their existing code this
*still* does not change the fact that the program behavior would be
changed and the code might break.  It can be something as simple as

   int fd = open(...);
   ...
   close(fd);
   ...
   close(STDIN_FILENO);
   open(...);

The last open is supposed to open a new file with descriptor 0 (i.e.,
STDIN_FILENO).  By using non-sequential descriptors this is not guaranteed.

This is nothing you can change with linker magic.  It is embedded in the
code.  It's behavior *guaranteed* by POSIX and Unix before it for
decades.  This isn't anything you can change.


> And the stuff you are trying to put into the kernel is better ? No, its a
> bunch of ugly hacks caused by trying to solve the problem in the wrong
> way and in the wrong place.

Go back 30 years and convince people to not guarantee first-match
allocation of descriptors.  Then you can stand up and demand purity.
But just as in many other cases, legacy demands its price.  We sometimes
have to pay the price for compatibility and if it isn't high it's worth
it.  I don't say the current proposed code is the answer but iff
Davide's unified code does not perform worse than the current code I
don't see the harm since, for instance, extending socket() is in any
case necessary.  I mentioned that close_on_exit must be set on open,
else leaks are risked.  This will come naturally with a flags parameter
which already takes O_NONSEQFD.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaBVb2ijCOnn/RHQRAnzGAJwLY3dDc9nl69yFPZfYUr8qbIeXygCfTqMd
u5Jofy3gFB5bWEHdnPtUhJY=
=Jz/g
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 10:06         ` Alan Cox
  2007-06-07 10:45           ` Eric Dumazet
@ 2007-06-07 15:41           ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 15:41 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, 7 Jun 2007, Alan Cox wrote:

> > I don't think it's a matter of versioning. Many userspace libraries 
> > expects their fds to be compact (for many reasons - they use select, they 
> > use them to index 0-based arrays, etc...), and if the kernel suddendly 
> > starts returning values in the 1<<28 up arena, they sure won't be happy.
> > So I believe that the correct way is that the caller specifically selects 
> > the feature, leaving the legacy fd allocation as default.
> 
> I don't understand the connection between this paragraph (with which I
> agree) and the urge to add a ton of ugly syscall hacks. "Caller
> specifically selects feature" - > prctl(). Libraries get unhappy ->
> linker issue.

I meant, caller specifically selects the feature, on a per-fd basis. If 
you select the task flag runtime, then all the allocated fds will be in 
the non-sequential area. Even fds allocated inside the library code, with 
libraries not expecting this. I fail to understand how a linker can nicely 
solve this. If my app is using, for its own reasons, fds in the 
non-sequential area, and library XYZ internally uses fds and does not like 
them in that area, I still want to link to that library. As long as I do 
not pass my fds to the library ("XYZ internally .."), this must still 
work. The contrary is also true. If my app is not non-sequential fd aware, 
and my library wants to use them in order to keep them alive from apps 
doing the for-each-fd-close loop, with a per-fd policy, you can still mix 
them together.
Or are you planning to have two sets of each userspace (userspace is not 
only glibc, there's other stuff too) libraries?



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 13:12             ` Eric Dumazet
@ 2007-06-07 15:51               ` Davide Libenzi
  2007-06-07 19:49               ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 15:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Kyle Moffett, Alan Cox, Ulrich Drepper,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Thu, 7 Jun 2007, Eric Dumazet wrote:

> On Thu, 7 Jun 2007 07:59:47 -0400
> Kyle Moffett <mrmacman_g4@mac.com> wrote:
> 
> 
> > Likewise there are a massive group of other libraries (especially  
> > user-interface and server related ones) that would really like to  
> > have support for creating file-descriptors without the top-level  
> > application closing them randomly (like several shells seem to).
> > 
> 
> True, shells are sometimes quite strange.
> 
> For example, bash uses file descriptor 255 (FD_CLOEXEC)
> 
> When it forks a new process, child gets a file table with 256 slots.
> 
> At exec() time, 255 is closed but file table doesnt shrink.
> (shrinking is done at fork() time only)
> 
> With fdmap, that means each process started by bash uses at least 
> 256 * sizeof(list_head) bytes, ie 4096 bytes on x86_64, even if only three 
> file-descriptors are opened (0,1,2)
> 
> FD_CLOFORK should help here (BTW : current patch from Davide doesnt take this
> into account and might need a change in fdmap_top_open_fd())

Yes, the CLOFORK flag is there, but it needs to be taken in account in 
fdmap_top_open_fd().


> Davide, are you sure we want FIFO for non sequential allocations ?
> 
> This tends to use all the fmap slots, and not very cache friendly
> if an app does a lot of [open(),...,close()] things. We already got a 
> perf drop because of RCUification of file freeing (FIFO mode instead 
> of LIFO given by kmalloc()/kfree())
> 
> If the idea behind this FIFO was security (ie not easy for an app to predict 
> next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag, 
> wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of 
> fmap->slist, not head.

That was the reason, yes. If we agree that the base randomization is 
enough, we can use a LIFO.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 14:25           ` Ulrich Drepper
@ 2007-06-07 17:56             ` Eric Dumazet
  2007-06-07 18:03               ` Davide Libenzi
  2007-06-07 18:26               ` Ulrich Drepper
  0 siblings, 2 replies; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 17:56 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 07 Jun 2007 07:25:31 -0700
Ulrich Drepper <drepper@redhat.com> wrote:

> I don't say the current proposed code is the answer but iff
> Davide's unified code does not perform worse than the current code I
> don't see the harm since, for instance, extending socket() is in any
> case necessary.  I mentioned that close_on_exit must be set on open,
> else leaks are risked.  This will come naturally with a flags parameter
> which already takes O_NONSEQFD.

Yes, and for completeness :

accept2(int fd, ...)

pipe2(int *fds, int oflags);

eventfd2(int count, int oflags);

signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);

timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...

We probably should name those with a better sufix than "2", it is ugly.


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 17:56             ` Eric Dumazet
@ 2007-06-07 18:03               ` Davide Libenzi
  2007-06-07 18:57                 ` Eric Dumazet
  2007-06-07 18:26               ` Ulrich Drepper
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 18:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ulrich Drepper, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 7 Jun 2007, Eric Dumazet wrote:

> accept2(int fd, ...)

I don't see any reason to not have it inherit the non-sequential 
characteristics of "fd".



> pipe2(int *fds, int oflags);

I think pipe+sys_nonseqfd should be OK for those.



> eventfd2(int count, int oflags);
> signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...

Those I think we're still in time to change the interface. No?
Even if not, those are not perf-critical, so I think that 
syscall+sys_nonseqfd is still fine.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 17:56             ` Eric Dumazet
  2007-06-07 18:03               ` Davide Libenzi
@ 2007-06-07 18:26               ` Ulrich Drepper
  2007-06-07 18:39                 ` Davide Libenzi
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 18:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Eric Dumazet wrote:
> eventfd2(int count, int oflags);
> 
> signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> 
> timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...

These aren't released yet, so, change them now before it's too late.

And to add to your list:

epoll_create().  Important if you think that's the interface people
should use.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaE3t2ijCOnn/RHQRAqPGAJ4rhSxvWLvAseBTCZIywIpQ7JCTJACfZSdR
fGkbIAX5l/1zISYQ6rLmIrk=
=Ge8V
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 18:26               ` Ulrich Drepper
@ 2007-06-07 18:39                 ` Davide Libenzi
  2007-06-07 18:56                   ` Ulrich Drepper
  2007-06-07 20:03                   ` Andrew Morton
  0 siblings, 2 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 18:39 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Eric Dumazet, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 7 Jun 2007, Ulrich Drepper wrote:

> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Eric Dumazet wrote:
> > eventfd2(int count, int oflags);
> > 
> > signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> > 
> > timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
> 
> These aren't released yet, so, change them now before it's too late.

Linus, Andrew, would it be OK to change?



> And to add to your list:
> 
> epoll_create().  Important if you think that's the interface people
> should use.

Can't we leave it as is, and use sys_nonseqfd() for those? So basically:

pipe(2)			-> Use sys_nonseqfd
epoll_create(2)		-> Use sys_nonseqfd
accept(2)		-> Inherit the "origin" fd characteristics
open(2)			-> Add O_NONSEQFD
socket(2)		-> Introduce socket2()



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 18:39                 ` Davide Libenzi
@ 2007-06-07 18:56                   ` Ulrich Drepper
  2007-06-07 19:12                     ` Davide Libenzi
  2007-06-07 20:03                   ` Andrew Morton
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 18:56 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Eric Dumazet, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Davide Libenzi wrote:
>> And to add to your list:
>>
>> epoll_create().  Important if you think that's the interface people
>> should use.
> 
> Can't we leave it as is, and use sys_nonseqfd() for those? So basically:

The problem is O_CLOEXEC.  They are all racy and if we add a variant
with flags then we might as well support O_NONSEQFD as well.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaFTO2ijCOnn/RHQRAm7uAJ9NsP6FegpO3cAnCe83eZ8awtkHawCgzKK4
2OjiNCFJhviQSMOaKHS6cTI=
=v9jU
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 18:03               ` Davide Libenzi
@ 2007-06-07 18:57                 ` Eric Dumazet
  0 siblings, 0 replies; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 18:57 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

Davide Libenzi a écrit :
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
> 
>> accept2(int fd, ...)
> 
> I don't see any reason to not have it inherit the non-sequential 
> characteristics of "fd".
> 
> 
> 
>> pipe2(int *fds, int oflags);
> 
> I think pipe+sys_nonseqfd should be OK for those.

yes, but O_CLOEXEC/O_CLOFORK ?

I was refering to Uli wanting to close races on multi-threading apps doing a 
fork() while a thread is doing :

fd = open()
<----  race here  if another thread does a fork() ---->
fcntl( CLOEXEC)

> 
> 
> 
>> eventfd2(int count, int oflags);
>> signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
>> timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
> 
> Those I think we're still in time to change the interface. No?
> Even if not, those are not perf-critical, so I think that 
> syscall+sys_nonseqfd is still fine.

Same point here, non a nonseqfd problem.



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 18:56                   ` Ulrich Drepper
@ 2007-06-07 19:12                     ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 19:12 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Eric Dumazet, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 7 Jun 2007, Ulrich Drepper wrote:

> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Davide Libenzi wrote:
> >> And to add to your list:
> >>
> >> epoll_create().  Important if you think that's the interface people
> >> should use.
> > 
> > Can't we leave it as is, and use sys_nonseqfd() for those? So basically:
> 
> The problem is O_CLOEXEC.  They are all racy and if we add a variant
> with flags then we might as well support O_NONSEQFD as well.

Ahh, I forgot the original O_CLOEXEC atomicity. Right.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 13:12             ` Eric Dumazet
  2007-06-07 15:51               ` Davide Libenzi
@ 2007-06-07 19:49               ` Davide Libenzi
  2007-06-07 20:02                 ` Ulrich Drepper
  2007-06-07 20:05                 ` Eric Dumazet
  1 sibling, 2 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 19:49 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Kyle Moffett, Alan Cox, Ulrich Drepper,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Thu, 7 Jun 2007, Eric Dumazet wrote:

> Davide, are you sure we want FIFO for non sequential allocations ?
> 
> This tends to use all the fmap slots, and not very cache friendly
> if an app does a lot of [open(),...,close()] things. We already got a 
> perf drop because of RCUification of file freeing (FIFO mode instead 
> of LIFO given by kmalloc()/kfree())
> 
> If the idea behind this FIFO was security (ie not easy for an app to predict 
> next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag, 
> wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of 
> fmap->slist, not head.

Uli, would it be OK to rely only on base randomization and use a LIFO 
instead? We have base randomization, plus LIFO does not mean strictly 
sequential like legacy allocator, just more compatc and cache friendly.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 19:49               ` Davide Libenzi
@ 2007-06-07 20:02                 ` Ulrich Drepper
  2007-06-07 20:05                 ` Eric Dumazet
  1 sibling, 0 replies; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 20:02 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Eric Dumazet, Kyle Moffett, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Davide Libenzi wrote:
> Uli, would it be OK to rely only on base randomization and use a LIFO 
> instead? We have base randomization, plus LIFO does not mean strictly 
> sequential like legacy allocator, just more compatc and cache friendly.

If FIFO is slowing things down it's certainly OK to you LIFO.  If there
is wiggle room (i.e., choose between two descriptors without additional
cost) then taking advantage of this would be of advantage.  A policy
which enforces that only the last closed descriptor is not reused
immediately might might be enough.  Maybe that's too specific for
people's taste.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaGRP2ijCOnn/RHQRAv2qAJ0WzyKvOPx01PviCp4L/mUmNaehtQCfdKF5
4Qc7Uj47zY8jdqUZf+Ht3gs=
=jRfN
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 18:39                 ` Davide Libenzi
  2007-06-07 18:56                   ` Ulrich Drepper
@ 2007-06-07 20:03                   ` Andrew Morton
  2007-06-08  2:55                     ` Ulrich Drepper
  1 sibling, 1 reply; 129+ messages in thread
From: Andrew Morton @ 2007-06-07 20:03 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Eric Dumazet, Alan Cox,
	Linux Kernel Mailing List, Linus Torvalds, Ingo Molnar

On Thu, 7 Jun 2007 11:39:39 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Thu, 7 Jun 2007, Ulrich Drepper wrote:
> 
> > -----BEGIN PGP SIGNED MESSAGE-----
> > Hash: SHA1
> > 
> > Eric Dumazet wrote:
> > > eventfd2(int count, int oflags);
> > > 
> > > signalfd2(int ufd, sigset_t __user *user_mask, size_t sizemask, int oflags);
> > > 
> > > timerfd2(int ufd, int clockid, int flags,const struct itimerspec __user *utmr, int oflags) ...
> > 
> > These aren't released yet, so, change them now before it's too late.
> 
> Linus, Andrew, would it be OK to change?

Absolutely.  Let's get them into their final form now, rather than letting
some intermediate interface escape out into a major kernel release.

<directs attention to
ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt/patches/fix-eventd-syscall>

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 19:49               ` Davide Libenzi
  2007-06-07 20:02                 ` Ulrich Drepper
@ 2007-06-07 20:05                 ` Eric Dumazet
  2007-06-07 20:18                   ` Ulrich Drepper
  2007-06-07 21:57                   ` Davide Libenzi
  1 sibling, 2 replies; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 20:05 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Kyle Moffett, Alan Cox, Ulrich Drepper,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

Davide Libenzi a écrit :
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
> 
>> Davide, are you sure we want FIFO for non sequential allocations ?
>>
>> This tends to use all the fmap slots, and not very cache friendly
>> if an app does a lot of [open(),...,close()] things. We already got a 
>> perf drop because of RCUification of file freeing (FIFO mode instead 
>> of LIFO given by kmalloc()/kfree())
>>
>> If the idea behind this FIFO was security (ie not easy for an app to predict 
>> next glibc file handle), we/glibc might use yet another FD_SECUREMODE flag, 
>> wich ORed with O_NONSEQFD would ask to fdmap_newfd() to take the tail of 
>> fmap->slist, not head.
> 
> Uli, would it be OK to rely only on base randomization and use a LIFO 
> instead? We have base randomization, plus LIFO does not mean strictly 
> sequential like legacy allocator, just more compatc and cache friendly.
> 

I am afraid randomization wont really work if /sbin/init or /bin/bash for 
example uses one (or more) unseq fd :
The 'random base' will be propagated at fork()/exec() time ?






^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-06 22:59 ` Alan Cox
  2007-06-06 22:58   ` Ulrich Drepper
  2007-06-06 23:04   ` Davide Libenzi
@ 2007-06-07 20:10   ` Linus Torvalds
  2007-06-07 20:47     ` Eric Dumazet
                       ` (5 more replies)
  2 siblings, 6 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-07 20:10 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet



On Wed, 6 Jun 2007, Alan Cox wrote:
> 
> This still all seems really really ugly.

I do agree that it's ugly. That many new system calls with new prototypes 
and new glibc support is just nasty.

So I don't think this is viable.

> Is there anything wrong with throwing all these extra cases out and 
> replacing the entire lot with
> 
> 	prctl(PR_SPARSEFD, 1);
> 
> to turn on sparse fd allocation for a process ?

Yes. We really don't want to set global state that affects any random 
library thing that runs after it.

HOWEVER.

I think we could introduce a *single* new system call, which does 
basically a "run the specified system call with the following flags".

The flags would literally be local to that *one* system call, and one of 
the flags could be the semantics for FD allocation.

[ There are a few other cases where such an indirect system call might be 
  interesting: temporarily unmasking a signal for just the duration of a 
  single system call is the reason for things like 'pselect()' and 
  'sigtimedwait()', and similarly the 'access()' system call is basically 
  a "temporarily run with my real UID, rather than the effective UID 
  thing, and quite frankly, it might be perfectly valid to want to do an 
  'open()' with that rule too, because "access()+open()" is racy! ]

So maybe the proper solution to this mess is *not* to add fifteen new 
system calls, but to add *one*, which takes a "flags" value to set certain 
things:

 - FD_NONSEQ: "allocate any new fd's nonsequentially"
 - FD_CLOEXEC: "allocate any new fd's as close-on-exec"

   Rationale: allow people to open any fd with the flags set a certain 
   way, regardless of the system call.

 - LOOKUP_REALUID/GID: "make the fsuid/fsgid temporarily be my _real_ 
   uid/gid for this single system call"

   Rationale: avoid the inevitable races that the fundamentally broken 
   "access()" system call has! 

 - LOOKUP_NOFOLLOW: "do not follow any symlink at the end of the path"
   LOOKUP_NOATIME: "don't update atime"

   Rationale: "open()" already has O_NOFOLLOW/O_NOATIME, and "stat()" has 
   "lstat()", but a lot of other path-handlign system calls cannot do the 
   same thing.

 - LOOKUP_NOSYMLINKS: "do not allow any symlink traversal at *all*"
   LOOKUP_NODOTDOT: "don't traverse a .. upwards"
   LOOKUP_NOMOUNT: "don't traverse a mount point"

   Rationale: for security-conscious things, quite often it's not the 
   _last_ symlink you want to avoid, it's any symlinks at all, and 
   sometimes it's things like guaranteeing that you stay in a certain 
   directory structure - which means not going outside with ".." or some 
   magic mount-point.

   People currently literally end up traversing things one path component 
   at a time, doing a "lstat()" on it, and checking. Even if 99% 
   of the time you probably don't actually ever hit the problem case. 
   (Eg Apache at some point used to do something like this if you asked 
   for security, I'm not sure if it still does).

 - signal mask for temporarily blocking/unblocking during a single system 
   call.

 - something else? The above are things that I know I _personally_ have 
   occasionally cursed not having had.

What do people think about that kind of approach? It has the advantage 
that it does *not* involve multiple kernel entries (just a single entry to 
a small wrapper that sets some process state temporarily), and that it 
doesn't have any sticky state that might confuse a library (or a signal 
handler: even if you end up doing "prctrl(ON) ; syscall(); prctrl(OFF)", a 
signal handler that happens in between the prctrl's would see unexpected 
behaviour).

It has the disadvantage that it would need some per-architecture setup to 
load the actual real arguments from memory: the system call would probably 
look something like

	syscall_indirect(unsigned long flags, sigset_t *, 
			 int syscall, unsigned long args[6]);

and the rule would be that it would just load the six system call 
registers from that "args[]" array. Always load the full six registers, to 
make it simpler and faster, and not having any confusion or ever needing 
any wrappers that depend on the number of system calls.

			Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:05                 ` Eric Dumazet
@ 2007-06-07 20:18                   ` Ulrich Drepper
  2007-06-07 21:44                     ` Davide Libenzi
  2007-06-07 21:57                   ` Davide Libenzi
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 20:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Davide Libenzi, Kyle Moffett, Alan Cox,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Eric Dumazet wrote:
> I am afraid randomization wont really work if /sbin/init or /bin/bash
> for example uses one (or more) unseq fd :
> The 'random base' will be propagated at fork()/exec() time ?

The base certainly should be reset o fork.  Yes, this might expand the
region in which descriptors are allocated due to inherited descriptors.
 But I consider this the application's problem and it usually is not
really an issue.  Apps have to explicitly request using the new
descriptors and they can use CLOEXEC (CLOFORK) correctly.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaGgQ2ijCOnn/RHQRAutyAKChp9KT9NVfUTD76GRhyY62GUTtaACglgxi
N/4+vmcUPEYtLmUTYKVjMvg=
=otvw
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:10   ` Linus Torvalds
@ 2007-06-07 20:47     ` Eric Dumazet
  2007-06-07 21:08       ` Linus Torvalds
  2007-06-07 20:59     ` Guillaume Chazarain
                       ` (4 subsequent siblings)
  5 siblings, 1 reply; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 20:47 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar

Linus Torvalds a écrit :
> 
> On Wed, 6 Jun 2007, Alan Cox wrote:
>> This still all seems really really ugly.
> 
> I do agree that it's ugly. That many new system calls with new prototypes 
> and new glibc support is just nasty.
> 
> So I don't think this is viable.
> 
>> Is there anything wrong with throwing all these extra cases out and 
>> replacing the entire lot with
>>
>> 	prctl(PR_SPARSEFD, 1);
>>
>> to turn on sparse fd allocation for a process ?
> 
> Yes. We really don't want to set global state that affects any random 
> library thing that runs after it.
> 
> HOWEVER.
> 
> I think we could introduce a *single* new system call, which does 
> basically a "run the specified system call with the following flags".
> 
> The flags would literally be local to that *one* system call, and one of 
> the flags could be the semantics for FD allocation.
> 
> [ There are a few other cases where such an indirect system call might be 
>   interesting: temporarily unmasking a signal for just the duration of a 
>   single system call is the reason for things like 'pselect()' and 
>   'sigtimedwait()', and similarly the 'access()' system call is basically 
>   a "temporarily run with my real UID, rather than the effective UID 
>   thing, and quite frankly, it might be perfectly valid to want to do an 
>   'open()' with that rule too, because "access()+open()" is racy! ]
> 
> So maybe the proper solution to this mess is *not* to add fifteen new 
> system calls, but to add *one*, which takes a "flags" value to set certain 
> things:
> 
>  - FD_NONSEQ: "allocate any new fd's nonsequentially"
>  - FD_CLOEXEC: "allocate any new fd's as close-on-exec"
> 
>    Rationale: allow people to open any fd with the flags set a certain 
>    way, regardless of the system call.
> 
>  - LOOKUP_REALUID/GID: "make the fsuid/fsgid temporarily be my _real_ 
>    uid/gid for this single system call"
> 
>    Rationale: avoid the inevitable races that the fundamentally broken 
>    "access()" system call has! 
> 
>  - LOOKUP_NOFOLLOW: "do not follow any symlink at the end of the path"
>    LOOKUP_NOATIME: "don't update atime"
> 
>    Rationale: "open()" already has O_NOFOLLOW/O_NOATIME, and "stat()" has 
>    "lstat()", but a lot of other path-handlign system calls cannot do the 
>    same thing.
> 
>  - LOOKUP_NOSYMLINKS: "do not allow any symlink traversal at *all*"
>    LOOKUP_NODOTDOT: "don't traverse a .. upwards"
>    LOOKUP_NOMOUNT: "don't traverse a mount point"
> 
>    Rationale: for security-conscious things, quite often it's not the 
>    _last_ symlink you want to avoid, it's any symlinks at all, and 
>    sometimes it's things like guaranteeing that you stay in a certain 
>    directory structure - which means not going outside with ".." or some 
>    magic mount-point.
> 
>    People currently literally end up traversing things one path component 
>    at a time, doing a "lstat()" on it, and checking. Even if 99% 
>    of the time you probably don't actually ever hit the problem case. 
>    (Eg Apache at some point used to do something like this if you asked 
>    for security, I'm not sure if it still does).
> 
>  - signal mask for temporarily blocking/unblocking during a single system 
>    call.
> 
>  - something else? The above are things that I know I _personally_ have 
>    occasionally cursed not having had.
> 
> What do people think about that kind of approach? It has the advantage 
> that it does *not* involve multiple kernel entries (just a single entry to 
> a small wrapper that sets some process state temporarily), and that it 
> doesn't have any sticky state that might confuse a library (or a signal 
> handler: even if you end up doing "prctrl(ON) ; syscall(); prctrl(OFF)", a 
> signal handler that happens in between the prctrl's would see unexpected 
> behaviour).
> 
> It has the disadvantage that it would need some per-architecture setup to 
> load the actual real arguments from memory: the system call would probably 
> look something like
> 
> 	syscall_indirect(unsigned long flags, sigset_t *, 
> 			 int syscall, unsigned long args[6]);
> 
> and the rule would be that it would just load the six system call 
> registers from that "args[]" array. Always load the full six registers, to 
> make it simpler and faster, and not having any confusion or ever needing 
> any wrappers that depend on the number of system calls.

This is a nice idea, but 32/64 compat code is going to hate it :)

syscall_indirect() would be writen in assembly for each arch, since there is 
no generic syscall table. Thats really a lot of work, especially if we want to 
mess with signal mask, umask ...



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:10   ` Linus Torvalds
  2007-06-07 20:47     ` Eric Dumazet
@ 2007-06-07 20:59     ` Guillaume Chazarain
  2007-06-07 21:06       ` Guillaume Chazarain
  2007-06-07 21:31     ` Ulrich Drepper
                       ` (3 subsequent siblings)
  5 siblings, 1 reply; 129+ messages in thread
From: Guillaume Chazarain @ 2007-06-07 20:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar, Eric Dumazet

2007/6/7, Linus Torvalds <torvalds@linux-foundation.org>:

>         syscall_indirect(unsigned long flags, sigset_t *,
>                          int syscall, unsigned long args[6]);

Would that also be the user interface or all the wrappers would
still be implemented by the glibc?

Also, it sounds like a per-thread prctl, fun with syslets though ... ;-)

-- 
Guillaume

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:59     ` Guillaume Chazarain
@ 2007-06-07 21:06       ` Guillaume Chazarain
  0 siblings, 0 replies; 129+ messages in thread
From: Guillaume Chazarain @ 2007-06-07 21:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar, Eric Dumazet

2007/6/7, Guillaume Chazarain <guichaz@yahoo.fr>:

> Also, it sounds like a per-thread prctl, fun with syslets though ... ;-)

Oops, even if the user takes care to reset the prctl after his code (the
library problem), there are still signals that can run with an unexpected
prctl. Sorry for the noise.

-- 
Guillaume

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:47     ` Eric Dumazet
@ 2007-06-07 21:08       ` Linus Torvalds
  2007-06-07 21:41         ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Linus Torvalds @ 2007-06-07 21:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar



On Thu, 7 Jun 2007, Eric Dumazet wrote:
> 
> This is a nice idea, but 32/64 compat code is going to hate it :)

It should be fairly simple for 32/64-bit compat code too.

The compat code should just call the compat system call 

> syscall_indirect() would be writen in assembly for each arch, since there is
> no generic syscall table. Thats really a lot of work, especially if we want to
> mess with signal mask, umask ...

No no no. That would be horribly idiotic.

The thing should be 99% generic code, ie we would have

	syscall_indirect(..)
	{
		long retval;

		.. set up signals/flags ..
		retval = arch_syscall_indirect(syscall, args);
		.. unsetup signals/flags ..
		return retval;
	}

	compat_syscall_indirect(..)
	{
		int retval;

		.. set up signals/flags ..
		retval = compat_arch_syscall_indirect(syscall, args);
		.. unsetup signals/flags ..
		return retval;
	}

and the *only* thing that each architecture would need to do is that 
(trivial) arch_syscall_indirect() thing (and the compat version, if 
applicable). And those literally should be generally pretty damn trivial.

The only _subtle_ issue is any system call that actually uses "pt_regs". 
So we'd have to disallow exec/fork/vfork/clone/. They take magic pt_regs 
pointers etc. On x86, for example, the following system calls should *not* 
be things you can do this with:

	asmlinkage int sys_fork(struct pt_regs regs)
	asmlinkage int sys_clone(struct pt_regs regs)
	asmlinkage int sys_vfork(struct pt_regs regs)
	asmlinkage int sys_execve(struct pt_regs regs)
	asmlinkage int sys_vm86old(struct pt_regs regs)
	asmlinkage int sys_vm86(struct pt_regs regs)
	asmlinkage long sys_iopl(unsigned long unused)

because those either take pt_regs, or make pt_regs out of their arguments 
(that "unused" is used to do:

	volatile struct pt_regs * regs = (struct pt_regs *) &unused;

so there is *some* care needed, but other than taking care of this, the 
implementation on x86 should really be totally trivial.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:10   ` Linus Torvalds
  2007-06-07 20:47     ` Eric Dumazet
  2007-06-07 20:59     ` Guillaume Chazarain
@ 2007-06-07 21:31     ` Ulrich Drepper
  2007-06-07 22:22     ` Davide Libenzi
                       ` (2 subsequent siblings)
  5 siblings, 0 replies; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 21:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar, Eric Dumazet

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Linus Torvalds wrote:
> 	syscall_indirect(unsigned long flags, sigset_t *, 
> 			 int syscall, unsigned long args[6]);

It's a kernel detail, so if this is how you want it, let it be.  I can
certainly live with this.  The only comment about the comments I would
have are

- - think hard about the additional things you want to set

- - make it a bit expendable so that we don't run out of bits.  Maybe
  we also have to pass some additional integer values.

At userlevel this is of course something we cannot expose.  Here we will
need new interfaces like acceptNG, socketNG, etc which themselves can
call this syscall.


This extension reminds me of something we've talked about several times
in the past (I know that I at least discussed this with Ingo).  You
basically implement little scriptlets.  In your call they are simply

    current->flags_arg = flags;
    current->sigmask_arg = sigmask;
    r = syscall(nr, ...);
    current->flags_arg = 0;
    current->sigmask_arg = NULL;
    return r;

This could be made more generic in that you can allow the script to do
more.  The threadlets etc are not too different from this.

If all this is unwanted then go with what you proposed.  Otherwise think
about a more generic approach.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaHkj2ijCOnn/RHQRAhWoAJ4loMzrYJQDCU4e6jdOfjL4LG/TsACguhUL
ldVvp0PWIazV2iAWraCc+IU=
=VGDR
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 21:08       ` Linus Torvalds
@ 2007-06-07 21:41         ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 21:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Alan Cox, Linux Kernel Mailing List, Andrew Morton,
	Ulrich Drepper, Ingo Molnar

On Thu, 7 Jun 2007, Linus Torvalds wrote:

> On Thu, 7 Jun 2007, Eric Dumazet wrote:
> > 
> > This is a nice idea, but 32/64 compat code is going to hate it :)
> 
> It should be fairly simple for 32/64-bit compat code too.
> 
> The compat code should just call the compat system call 
> 
> > syscall_indirect() would be writen in assembly for each arch, since there is
> > no generic syscall table. Thats really a lot of work, especially if we want to
> > mess with signal mask, umask ...
> 
> No no no. That would be horribly idiotic.
> 
> The thing should be 99% generic code, ie we would have
> 
> 	syscall_indirect(..)
> 	{
> 		long retval;
> 
> 		.. set up signals/flags ..
> 		retval = arch_syscall_indirect(syscall, args);
> 		.. unsetup signals/flags ..
> 		return retval;
> 	}
> 
> 	compat_syscall_indirect(..)
> 	{
> 		int retval;
> 
> 		.. set up signals/flags ..
> 		retval = compat_arch_syscall_indirect(syscall, args);
> 		.. unsetup signals/flags ..
> 		return retval;
> 	}
> 
> and the *only* thing that each architecture would need to do is that 
> (trivial) arch_syscall_indirect() thing (and the compat version, if 
> applicable). And those literally should be generally pretty damn trivial.

Ok, I like this better honestly. I really did not want to win the Oscar 
for the 2007 Syscall Spammer of the year :)



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:18                   ` Ulrich Drepper
@ 2007-06-07 21:44                     ` Davide Libenzi
  2007-06-07 22:03                       ` Ulrich Drepper
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 21:44 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Eric Dumazet, Kyle Moffett, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 7 Jun 2007, Ulrich Drepper wrote:

> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Eric Dumazet wrote:
> > I am afraid randomization wont really work if /sbin/init or /bin/bash
> > for example uses one (or more) unseq fd :
> > The 'random base' will be propagated at fork()/exec() time ?
> 
> The base certainly should be reset o fork.  Yes, this might expand the
> region in which descriptors are allocated due to inherited descriptors.
>  But I consider this the application's problem and it usually is not
> really an issue.  Apps have to explicitly request using the new
> descriptors and they can use CLOEXEC (CLOFORK) correctly.

This is kinda fishy. We'll end up with very large map with lots of 
unused space in them.
What we can sanily do, is re-random the base if no fds are in there (of 
course CLOFORK and CLOEXEC do not count).



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:05                 ` Eric Dumazet
  2007-06-07 20:18                   ` Ulrich Drepper
@ 2007-06-07 21:57                   ` Davide Libenzi
  2007-06-08  4:38                     ` Eric Dumazet
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 21:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Kyle Moffett, Alan Cox, Ulrich Drepper,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1464 bytes --]

On Thu, 7 Jun 2007, Eric Dumazet wrote:

> Davide Libenzi a écrit :
> > On Thu, 7 Jun 2007, Eric Dumazet wrote:
> > 
> > > Davide, are you sure we want FIFO for non sequential allocations ?
> > > 
> > > This tends to use all the fmap slots, and not very cache friendly
> > > if an app does a lot of [open(),...,close()] things. We already got a perf
> > > drop because of RCUification of file freeing (FIFO mode instead of LIFO
> > > given by kmalloc()/kfree())
> > > 
> > > If the idea behind this FIFO was security (ie not easy for an app to
> > > predict next glibc file handle), we/glibc might use yet another
> > > FD_SECUREMODE flag, wich ORed with O_NONSEQFD would ask to fdmap_newfd()
> > > to take the tail of fmap->slist, not head.
> > 
> > Uli, would it be OK to rely only on base randomization and use a LIFO
> > instead? We have base randomization, plus LIFO does not mean strictly
> > sequential like legacy allocator, just more compatc and cache friendly.
> > 
> 
> I am afraid randomization wont really work if /sbin/init or /bin/bash for
> example uses one (or more) unseq fd :
> The 'random base' will be propagated at fork()/exec() time ?

As I said to Uli, we can't move the base while fds are in there. We can 
re-randomize it when it's empty. This can also be done (it's a trivial and 
fast operation - just set fmap->base to a new value) even every time the 
fd count on the map touches zero.



- Davide


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 21:44                     ` Davide Libenzi
@ 2007-06-07 22:03                       ` Ulrich Drepper
  2007-06-07 22:40                         ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-07 22:03 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Eric Dumazet, Kyle Moffett, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Davide Libenzi wrote:
> What we can sanily do, is re-random the base if no fds are in there (of 
> course CLOFORK and CLOEXEC do not count).

With the last comment you mean "count after CLOFORK and CLOEXEC", right?
 So the re-basing would be done in two places: after fork and after execve?

That would be enough.  Programs should clean after themselves.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaICk2ijCOnn/RHQRAqq5AJwIDmKdsL71ecQVd6PoDI2oOL/KcwCguj2b
TFqDhpJEDqFx1ypdHITuywA=
=Cbfg
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:10   ` Linus Torvalds
                       ` (2 preceding siblings ...)
  2007-06-07 21:31     ` Ulrich Drepper
@ 2007-06-07 22:22     ` Davide Libenzi
  2007-06-07 23:42       ` Linus Torvalds
  2007-06-08  0:59     ` Matt Mackall
  2007-06-08 15:56     ` Jeff Dike
  5 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 22:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Linux Kernel Mailing List, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, 7 Jun 2007, Linus Torvalds wrote:

> It has the disadvantage that it would need some per-architecture setup to 
> load the actual real arguments from memory: the system call would probably 
> look something like
> 
> 	syscall_indirect(unsigned long flags, sigset_t *, 
> 			 int syscall, unsigned long args[6]);
> 
> and the rule would be that it would just load the six system call 
> registers from that "args[]" array. Always load the full six registers, to 
> make it simpler and faster, and not having any confusion or ever needing 
> any wrappers that depend on the number of system calls.

We'd still need sys_nonseqfd() though, to move/dup legacy fds into the 
non-sequential area.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 22:03                       ` Ulrich Drepper
@ 2007-06-07 22:40                         ` Davide Libenzi
  2007-06-08 12:07                           ` Theodore Tso
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 22:40 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Eric Dumazet, Kyle Moffett, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Thu, 7 Jun 2007, Ulrich Drepper wrote:

> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Davide Libenzi wrote:
> > What we can sanily do, is re-random the base if no fds are in there (of 
> > course CLOFORK and CLOEXEC do not count).
> 
> With the last comment you mean "count after CLOFORK and CLOEXEC", right?
>  So the re-basing would be done in two places: after fork and after execve?

Yes. Files with the CLOFORK and CLOEXEC flag do not count for fork and 
exec copies.
I was also planning on doing it in __put_unused_fd(), every time 
fmap->count goes to zero. But get_random_int() is not as cheap as I 
thought. If we use a cheaper (although less secure) function to mix pid & 
jiffies, we could do it even in there.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 22:22     ` Davide Libenzi
@ 2007-06-07 23:42       ` Linus Torvalds
  2007-06-08  0:04         ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Linus Torvalds @ 2007-06-07 23:42 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Alan Cox, Linux Kernel Mailing List, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet



On Thu, 7 Jun 2007, Davide Libenzi wrote:
> 
> We'd still need sys_nonseqfd() though, to move/dup legacy fds into the 
> non-sequential area.

Umm. No we don't. Because it's no more than 

	indirect_syscall(dup, FD_NONSEQ)

isn't it?

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 23:42       ` Linus Torvalds
@ 2007-06-08  0:04         ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08  0:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Linux Kernel Mailing List, Andrew Morton,
	Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, 7 Jun 2007, Linus Torvalds wrote:

> 
> 
> On Thu, 7 Jun 2007, Davide Libenzi wrote:
> > 
> > We'd still need sys_nonseqfd() though, to move/dup legacy fds into the 
> > non-sequential area.
> 
> Umm. No we don't. Because it's no more than 
> 
> 	indirect_syscall(dup, FD_NONSEQ)
> 
> isn't it?

Hmm, ok. It need some changes since sys_dup() and F_DUPFD uses common code 
at the moment, but it'd ok.
Basically, everything that calls get_unused_fd() can get the magic 
indirect_syscall() settings. I was just planning to localize the 
sequential/non-sequential behaviour just in there.
The sys_dup(), sys_dup2() and F_DUPFD have some custom code, although 
sys_dup() should really use get_unused_fd() in any way.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:10   ` Linus Torvalds
                       ` (3 preceding siblings ...)
  2007-06-07 22:22     ` Davide Libenzi
@ 2007-06-08  0:59     ` Matt Mackall
  2007-06-08  2:25       ` Linus Torvalds
  2007-06-08 15:56     ` Jeff Dike
  5 siblings, 1 reply; 129+ messages in thread
From: Matt Mackall @ 2007-06-08  0:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, Jun 07, 2007 at 01:10:23PM -0700, Linus Torvalds wrote:
> What do people think about that kind of approach? It has the advantage 
> that it does *not* involve multiple kernel entries (just a single entry to 
> a small wrapper that sets some process state temporarily), and that it 
> doesn't have any sticky state that might confuse a library (or a signal 
> handler: even if you end up doing "prctrl(ON) ; syscall(); prctrl(OFF)", a 
> signal handler that happens in between the prctrl's would see unexpected 
> behaviour).

First, how does this work in-kernel? Does it set a flag in the thread
struct that magically gets used in the actual syscall? Or do we pass
flags down to the sys_foo() function in some manner?

If the former, there is potential for interaction with asynchronous
code running on behalf of the thread, no? Especially if we ever adopt
one of the syslet schemes.

Second, I think we're likely to run out of flag bits really quickly as
this is a good dumping spot for patching up our many slightly
brain-damaged APIs (be they POSIX or Linux-specific).

Third, can I do sys_indirect(sys_indirect(foo, args), flags1),
flags2)?

Fourth, can we do sys_indirect(foo, args, flags | ASYNC) and get most
of the way to merging this with the syslet proposal?

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08  0:59     ` Matt Mackall
@ 2007-06-08  2:25       ` Linus Torvalds
  0 siblings, 0 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-08  2:25 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar, Eric Dumazet



On Thu, 7 Jun 2007, Matt Mackall wrote:
> 
> First, how does this work in-kernel? Does it set a flag in the thread
> struct that magically gets used in the actual syscall? Or do we pass
> flags down to the sys_foo() function in some manner?

Set a flag in the thread-struct.

In fact, that's how "access()" already works.

And yes, syslets would need to have their own thread-structs and/or 
save/restore the thing.

> Second, I think we're likely to run out of flag bits really quickly as
> this is a good dumping spot for patching up our many slightly
> brain-damaged APIs (be they POSIX or Linux-specific).

Well, I do suspect that we'd need to basically make the flags be 
per-system call. With "common features" (ie a system call that doesn't 
return a file descriptor would re-use the bit for "nonlinear-fd" for 
something else, while a system call that doesn't do path lookup would use 
all the LOOKUP_xyzzy bits for something else).

I agree that if we kept flags _totally_ separate, we'd run out of them 
really quickly. But I don't think we want to ever be in the situation 
where _one_ set of system calls would need that many flags. If we get 
there, we'd really be much better off with a new system call!

> Third, can I do sys_indirect(sys_indirect(foo, args), flags1), flags2)?

I'd say no.

> Fourth, can we do sys_indirect(foo, args, flags | ASYNC) and get most
> of the way to merging this with the syslet proposal?

I think that may well be a really good idea.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:03                   ` Andrew Morton
@ 2007-06-08  2:55                     ` Ulrich Drepper
  2007-06-08  5:16                       ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-08  2:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Davide Libenzi, Eric Dumazet, Alan Cox,
	Linux Kernel Mailing List, Linus Torvalds, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Andrew Morton wrote:
> Absolutely.  Let's get them into their final form now, rather than letting
> some intermediate interface escape out into a major kernel release.

Even if Linus' redirection is adopted, these interfaces should still be
fixed up.  No need to rely on hacks if we can still fix up the interfaces.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaMU42ijCOnn/RHQRAm0MAJ9pprTVUD/2YvDsL6CPL21WvZSkEACeN3r8
TT8a5FyF2rCct+8gBKyWF/s=
=fs4f
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 21:57                   ` Davide Libenzi
@ 2007-06-08  4:38                     ` Eric Dumazet
  2007-06-08  5:20                       ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Eric Dumazet @ 2007-06-08  4:38 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Kyle Moffett, Alan Cox, Ulrich Drepper,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

Davide Libenzi a écrit :
> On Thu, 7 Jun 2007, Eric Dumazet wrote:
>> I am afraid randomization wont really work if /sbin/init or /bin/bash for
>> example uses one (or more) unseq fd :
>> The 'random base' will be propagated at fork()/exec() time ?
> 
> As I said to Uli, we can't move the base while fds are in there. We can 
> re-randomize it when it's empty. This can also be done (it's a trivial and 
> fast operation - just set fmap->base to a new value) even every time the 
> fd count on the map touches zero.
> 

Hum, I think it would be better to free fmap if it's empty, instead of change 
fmap->base. (Only in fork() after removal of O_CLOFORK file handles, and in 
exec() after removal of O_CLOEXEC file handles)



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08  2:55                     ` Ulrich Drepper
@ 2007-06-08  5:16                       ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08  5:16 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Andrew Morton, Eric Dumazet, Alan Cox, Linux Kernel Mailing List,
	Linus Torvalds, Ingo Molnar

On Thu, 7 Jun 2007, Ulrich Drepper wrote:

> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Andrew Morton wrote:
> > Absolutely.  Let's get them into their final form now, rather than letting
> > some intermediate interface escape out into a major kernel release.
> 
> Even if Linus' redirection is adopted, these interfaces should still be
> fixed up.  No need to rely on hacks if we can still fix up the interfaces.

Indeed. I still think that adding O_NONSEQFD is a good idea, independently 
on the other interfaces.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08  4:38                     ` Eric Dumazet
@ 2007-06-08  5:20                       ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08  5:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Kyle Moffett, Alan Cox, Ulrich Drepper,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1127 bytes --]

On Fri, 8 Jun 2007, Eric Dumazet wrote:

> Davide Libenzi a écrit :
> > On Thu, 7 Jun 2007, Eric Dumazet wrote:
> > > I am afraid randomization wont really work if /sbin/init or /bin/bash for
> > > example uses one (or more) unseq fd :
> > > The 'random base' will be propagated at fork()/exec() time ?
> > 
> > As I said to Uli, we can't move the base while fds are in there. We can
> > re-randomize it when it's empty. This can also be done (it's a trivial and
> > fast operation - just set fmap->base to a new value) even every time the fd
> > count on the map touches zero.
> > 
> 
> Hum, I think it would be better to free fmap if it's empty, instead of change
> fmap->base. (Only in fork() after removal of O_CLOFORK file handles, and in
> exec() after removal of O_CLOEXEC file handles)

That can be done too. When it'll be re-created will be randomized anyway.
I'll probably be doing it everytime fmap->count touches zero in 
__put_unused_fd(), so even if the map is not empty at fork and/or exec 
time, the program have other chances of randomize in the middle of its 
lifetime.


- Davide


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 22:40                         ` Davide Libenzi
@ 2007-06-08 12:07                           ` Theodore Tso
  2007-06-08 13:01                             ` Alan Cox
                                               ` (2 more replies)
  0 siblings, 3 replies; 129+ messages in thread
From: Theodore Tso @ 2007-06-08 12:07 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Eric Dumazet, Kyle Moffett, Alan Cox,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Thu, Jun 07, 2007 at 03:40:14PM -0700, Davide Libenzi wrote:
> Yes. Files with the CLOFORK and CLOEXEC flag do not count for fork and 
> exec copies.
> I was also planning on doing it in __put_unused_fd(), every time 
> fmap->count goes to zero. But get_random_int() is not as cheap as I 
> thought. If we use a cheaper (although less secure) function to mix pid & 
> jiffies, we could do it even in there.

Um, how cheap do you need it?  get_random_int() is actually pretty
cheep, since it was designed to be usable by the networking stack for
sequence numbers for TCP packets; and it's not like sys_close() or
sys_open() is a majorly critical path, is it?  If the concern is
increasing the potential hold time, I suppose you could have the
exactly two callers of __put_unused_fd() (sys_close() and
put_unused_fd()) call get_random_int() before grabbing the
current->files->file_lock spinlock,

						- Ted

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 12:07                           ` Theodore Tso
@ 2007-06-08 13:01                             ` Alan Cox
  2007-06-08 18:11                               ` Davide Libenzi
  2007-06-08 18:07                             ` Davide Libenzi
  2007-06-08 18:35                             ` Linus Torvalds
  2 siblings, 1 reply; 129+ messages in thread
From: Alan Cox @ 2007-06-08 13:01 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Davide Libenzi, Ulrich Drepper, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

> Um, how cheap do you need it?  get_random_int() is actually pretty
> cheep, since it was designed to be usable by the networking stack for
> sequence numbers for TCP packets; and it's not like sys_close() or
> sys_open() is a majorly critical path, is it?  If the concern is

At this point wouldn't it be cheaper to allocate file handles using a
different algorithm than firing up the RNG - say like in the POSIX
fashion ... 

It seems every purpose of this fd stuff is now a failure when
compared with existing behaviours.


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 20:10   ` Linus Torvalds
                       ` (4 preceding siblings ...)
  2007-06-08  0:59     ` Matt Mackall
@ 2007-06-08 15:56     ` Jeff Dike
  5 siblings, 0 replies; 129+ messages in thread
From: Jeff Dike @ 2007-06-08 15:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton, Ulrich Drepper, Ingo Molnar, Eric Dumazet

On Thu, Jun 07, 2007 at 01:10:23PM -0700, Linus Torvalds wrote:
> HOWEVER.
> 
> I think we could introduce a *single* new system call, which does 
> basically a "run the specified system call with the following flags".

As long as we are considering indirecting system calls, how about
reviving your proposal to run a system call in a different address
space:
	http://www.ussg.iu.edu/hypermail/linux/kernel/0212.3/0502.html

UML could make good use of the ability to remotely manipulate address
spaces.  I never liked any of the proposed interfaces very much,
including mm_indirect.  However, if my tastebuds are defective, and
indirecting system calls is the way to solve a family of problems,
then we should look at fitting in mm_indirect.

> It has the disadvantage that it would need some per-architecture setup to 
> load the actual real arguments from memory: the system call would probably 
> look something like
> 
> 	syscall_indirect(unsigned long flags, sigset_t *, 
> 			 int syscall, unsigned long args[6]);

mm_indirect would need a file descriptor to the other address space,
which we would presumably get from a new get_mm() system call.

				Jeff

-- 
Work email - jdike at linux dot intel dot com

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 12:07                           ` Theodore Tso
  2007-06-08 13:01                             ` Alan Cox
@ 2007-06-08 18:07                             ` Davide Libenzi
  2007-06-08 18:35                             ` Linus Torvalds
  2 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 18:07 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Ulrich Drepper, Eric Dumazet, Kyle Moffett, Alan Cox,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Theodore Tso wrote:

> On Thu, Jun 07, 2007 at 03:40:14PM -0700, Davide Libenzi wrote:
> > Yes. Files with the CLOFORK and CLOEXEC flag do not count for fork and 
> > exec copies.
> > I was also planning on doing it in __put_unused_fd(), every time 
> > fmap->count goes to zero. But get_random_int() is not as cheap as I 
> > thought. If we use a cheaper (although less secure) function to mix pid & 
> > jiffies, we could do it even in there.
> 
> Um, how cheap do you need it?  get_random_int() is actually pretty
> cheep, since it was designed to be usable by the networking stack for
> sequence numbers for TCP packets; and it's not like sys_close() or
> sys_open() is a majorly critical path, is it?  If the concern is
> increasing the potential hold time, I suppose you could have the
> exactly two callers of __put_unused_fd() (sys_close() and
> put_unused_fd()) call get_random_int() before grabbing the
> current->files->file_lock spinlock,

I'm actually using get_random_int() in the slow path (fmap creation time). 
It does a few things get_random_int(), and one of those is an MD4 transform.
This does not need to be super secure (the Unix allocator has been 
exactly predicatble for years), so maybe a cheaper combination of the 
previous base (generated with get_random_int) together with jiffies and 
pid is enough. I really would not want to put something like an MD4 
transform in that path.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 13:01                             ` Alan Cox
@ 2007-06-08 18:11                               ` Davide Libenzi
  2007-06-08 18:26                                 ` Alan Cox
  2007-06-09  5:41                                 ` Paul Mackerras
  0 siblings, 2 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 18:11 UTC (permalink / raw)
  To: Alan Cox
  Cc: Theodore Tso, Ulrich Drepper, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Alan Cox wrote:

> > Um, how cheap do you need it?  get_random_int() is actually pretty
> > cheep, since it was designed to be usable by the networking stack for
> > sequence numbers for TCP packets; and it's not like sys_close() or
> > sys_open() is a majorly critical path, is it?  If the concern is
> 
> At this point wouldn't it be cheaper to allocate file handles using a
> different algorithm than firing up the RNG - say like in the POSIX
> fashion ... 

The only reason we use a floating base, is because Uli preferred to have 
non-exactly predictable fd allocations. There no reason of re-doing the 
same POSIX mistake all over again:

Errare humanum est, perseverare autem diabolicum


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:11                               ` Davide Libenzi
@ 2007-06-08 18:26                                 ` Alan Cox
  2007-06-08 18:43                                   ` Ulrich Drepper
  2007-06-08 19:22                                   ` Davide Libenzi
  2007-06-09  5:41                                 ` Paul Mackerras
  1 sibling, 2 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-08 18:26 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Theodore Tso, Ulrich Drepper, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

> The only reason we use a floating base, is because Uli preferred to have 
> non-exactly predictable fd allocations. There no reason of re-doing the 
> same POSIX mistake all over again:

Why does he want an unpredictable algorithm - to make debugging hell
because apps crash only for some patterns and reduce reproducability of
debugging of userspace ?

> Errare humanum est, perseverare autem diabolicum

Speaking latin didn't make the romans too wise either to look at their
history.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 12:07                           ` Theodore Tso
  2007-06-08 13:01                             ` Alan Cox
  2007-06-08 18:07                             ` Davide Libenzi
@ 2007-06-08 18:35                             ` Linus Torvalds
  2 siblings, 0 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-08 18:35 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Davide Libenzi, Ulrich Drepper, Eric Dumazet, Kyle Moffett,
	Alan Cox, Linux Kernel Mailing List, Andrew Morton, Ingo Molnar



On Fri, 8 Jun 2007, Theodore Tso wrote:
>
> ... and it's not like sys_close() or sys_open() is a majorly critical 
> path, is it?

open/close/stat/lstat are _the_ most important system calls, so yes, it's 
a majorly critical path. MUCH more so than opening a new TCP connection.

You _may_ open a few hundred TCP connections a second (yeah, yeah, don't 
tell me about unrealistic benchmarks that do more), but that's on a server 
with good bandwidth etc. open/closes easily happen tens of _thousands_ of 
times a second. We're talking sub-microsecond system calls.

Whether get_random_int() is noticeable or not, I dunno. But that path is a 
hell of a lot more performance-sensitive than pretty much anything else.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:26                                 ` Alan Cox
@ 2007-06-08 18:43                                   ` Ulrich Drepper
  2007-06-08 18:46                                     ` Al Viro
  2007-06-08 19:30                                     ` Alan Cox
  2007-06-08 19:22                                   ` Davide Libenzi
  1 sibling, 2 replies; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-08 18:43 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> Why does he want an unpredictable algorithm

To avoid exactly the kind of problem we have now in future: programs
relying on specific patterns.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD4DBQFGaaNR2ijCOnn/RHQRAtGQAJUWlqUTHuxU7TMi+0tiEldqQPi0AJwIVOAx
lG6JQ9o6YNYVtCFOLyZLsg==
=0Vp+
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:43                                   ` Ulrich Drepper
@ 2007-06-08 18:46                                     ` Al Viro
  2007-06-08 18:56                                       ` Ulrich Drepper
  2007-06-08 19:30                                     ` Alan Cox
  1 sibling, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-08 18:46 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Alan Cox, Davide Libenzi, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

On Fri, Jun 08, 2007 at 11:43:29AM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Alan Cox wrote:
> > Why does he want an unpredictable algorithm
> 
> To avoid exactly the kind of problem we have now in future: programs
> relying on specific patterns.

You are making debugging no end of fun...

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:46                                     ` Al Viro
@ 2007-06-08 18:56                                       ` Ulrich Drepper
  2007-06-08 19:07                                         ` Linus Torvalds
  2007-06-08 19:34                                         ` Alan Cox
  0 siblings, 2 replies; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-08 18:56 UTC (permalink / raw)
  To: Al Viro
  Cc: Alan Cox, Davide Libenzi, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Al Viro wrote:
> You are making debugging no end of fun...

We are talking about file descriptors here.  If you're using file
descriptors as anything other than tokens you'll find out soon enough
that your code is broken.  The new type of file descriptors cannot be
used as indeces and the randomization makes sure that no program by some
fluke happens to work.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGaaZ02ijCOnn/RHQRAkN0AJ9ZDdaoBDKbKMyDfZI7pa3E7w3pXwCeLlbB
9kRy/E09QvPhMOfjeVYPNio=
=Cjh7
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:56                                       ` Ulrich Drepper
@ 2007-06-08 19:07                                         ` Linus Torvalds
  2007-06-08 19:21                                           ` Davide Libenzi
  2007-06-08 19:34                                         ` Alan Cox
  1 sibling, 1 reply; 129+ messages in thread
From: Linus Torvalds @ 2007-06-08 19:07 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Al Viro, Alan Cox, Davide Libenzi, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar



On Fri, 8 Jun 2007, Ulrich Drepper wrote:
> 
> We are talking about file descriptors here.  If you're using file
> descriptors as anything other than tokens you'll find out soon enough
> that your code is broken.  The new type of file descriptors cannot be
> used as indeces and the randomization makes sure that no program by some
> fluke happens to work.

No, Uli.

You need things to be *repeatable* for debugging. No ifs, buts, or maybes 
about it.

I think truly random is a bad idea, or at the very least needs some way to 
disable it.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:07                                         ` Linus Torvalds
@ 2007-06-08 19:21                                           ` Davide Libenzi
  2007-06-09  0:03                                             ` Linus Torvalds
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 19:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ulrich Drepper, Al Viro, Alan Cox, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Linus Torvalds wrote:

> On Fri, 8 Jun 2007, Ulrich Drepper wrote:
> > 
> > We are talking about file descriptors here.  If you're using file
> > descriptors as anything other than tokens you'll find out soon enough
> > that your code is broken.  The new type of file descriptors cannot be
> > used as indeces and the randomization makes sure that no program by some
> > fluke happens to work.
> 
> No, Uli.
> 
> You need things to be *repeatable* for debugging. No ifs, buts, or maybes 
> about it.

It all depends on how you use the file descriptor. If you see a file 
descriptor as an opaque handle (like it should be, really), that is simply 
passed to the OS to use services exposed by the handle, you will be fine 
independently from the values handed out by the OS. It was for the exactly 
this guarantee that created the problems, with ppl relying on it for 
indexing table, closing all files < NR_FILE and so on.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:26                                 ` Alan Cox
  2007-06-08 18:43                                   ` Ulrich Drepper
@ 2007-06-08 19:22                                   ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 19:22 UTC (permalink / raw)
  To: Alan Cox
  Cc: Davide Libenzi, Theodore Tso, Ulrich Drepper, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

On Fri, 8 Jun 2007, Alan Cox wrote:

> > Errare humanum est, perseverare autem diabolicum
> 
> Speaking latin didn't make the romans too wise either to look at their
> history.

If you really look at history, they were kinda wise.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:43                                   ` Ulrich Drepper
  2007-06-08 18:46                                     ` Al Viro
@ 2007-06-08 19:30                                     ` Alan Cox
  2007-06-08 19:37                                       ` Davide Libenzi
  2007-06-11  8:24                                       ` Xavier Bestel
  1 sibling, 2 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-08 19:30 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Davide Libenzi, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 08 Jun 2007 11:43:29 -0700
Ulrich Drepper <drepper@redhat.com> wrote:

> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Alan Cox wrote:
> > Why does he want an unpredictable algorithm
> 
> To avoid exactly the kind of problem we have now in future: programs
> relying on specific patterns.

Which you seem to think is a bad thing, yet is actually a very good thing
because it means that crashes are repeatable and problems are debuggable
from end user reports.

Trying to randomize filehandles for the general case is not a productive
activity. If you want to debug cases write yourself a glibc wrapper that
does annoying things but don't inflict it on people who actually want to
build working, testable, debuggable systems (ie most of us)

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:56                                       ` Ulrich Drepper
  2007-06-08 19:07                                         ` Linus Torvalds
@ 2007-06-08 19:34                                         ` Alan Cox
  1 sibling, 0 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-08 19:34 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Al Viro, Davide Libenzi, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

> We are talking about file descriptors here.  If you're using file
> descriptors as anything other than tokens you'll find out soon enough
> that your code is broken.  The new type of file descriptors cannot be
> used as indeces and the randomization makes sure that no program by some
> fluke happens to work.

If you are building a stable system and you test it and it passes
extensive testing you don't care if it works because of a specific
pattern of accesses since your testing shows that it continues to work.

If you randomize these it becomes fundamentally untestable. There is a
role for this in fuzz testing but there is not a role for it in normal
production behaviour.

Please consign the whole funky file handle farce to the bucket labelled
"dumb ideas". I know its a bit full but there is room in there for more -
unlike the kernel.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:30                                     ` Alan Cox
@ 2007-06-08 19:37                                       ` Davide Libenzi
  2007-06-08 19:48                                         ` Alan Cox
  2007-06-11  8:24                                       ` Xavier Bestel
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 19:37 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Alan Cox wrote:

> On Fri, 08 Jun 2007 11:43:29 -0700
> Ulrich Drepper <drepper@redhat.com> wrote:
> 
> > -----BEGIN PGP SIGNED MESSAGE-----
> > Hash: SHA1
> > 
> > Alan Cox wrote:
> > > Why does he want an unpredictable algorithm
> > 
> > To avoid exactly the kind of problem we have now in future: programs
> > relying on specific patterns.
> 
> Which you seem to think is a bad thing, yet is actually a very good thing
> because it means that crashes are repeatable and problems are debuggable
> from end user reports.
> 
> Trying to randomize filehandles for the general case is not a productive
> activity. If you want to debug cases write yourself a glibc wrapper that
> does annoying things but don't inflict it on people who actually want to
> build working, testable, debuggable systems (ie most of us)

So, what do you plan to do? Those handle won't be zero-based. Your 
"working" system I immagine will do:

	bleeh[handle - BASE].duh = ...;

How nice for a working system. If you *store* the handle returned by the 
OS, and you *use* the handle to call for OS services, you will be fine 
independently from the value handed out by the OS.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:37                                       ` Davide Libenzi
@ 2007-06-08 19:48                                         ` Alan Cox
  2007-06-08 19:51                                           ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Alan Cox @ 2007-06-08 19:48 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

> So, what do you plan to do? Those handle won't be zero-based. Your 
> "working" system I immagine will do:
> 
> 	bleeh[handle - BASE].duh = ...;
> 
> How nice for a working system. If you *store* the handle returned by the 
> OS, and you *use* the handle to call for OS services, you will be fine 
> independently from the value handed out by the OS.

Well there are two ways I'd do this

#1: Throw the whole thing away and accept its not a good idea anyway

#2: If I was really going this way and I wanted to use it for serious
tricks for high performance I/O then I'd provide the handle from
userspace so that the strategy for allocation is controlled by the caller
who is the only one who can make the smart decisions

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:48                                         ` Alan Cox
@ 2007-06-08 19:51                                           ` Davide Libenzi
  2007-06-08 21:24                                             ` Alan Cox
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 19:51 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Alan Cox wrote:

> > So, what do you plan to do? Those handle won't be zero-based. Your 
> > "working" system I immagine will do:
> > 
> > 	bleeh[handle - BASE].duh = ...;
> > 
> > How nice for a working system. If you *store* the handle returned by the 
> > OS, and you *use* the handle to call for OS services, you will be fine 
> > independently from the value handed out by the OS.
> 
> Well there are two ways I'd do this
> 
> #1: Throw the whole thing away and accept its not a good idea anyway

Unfortunately (exactly because of the same guarantees you're asking for 
those handles), in order for userspace libraries to reliably internally 
use fds to interact with the kernel, you need another kind of allocation 
strategy.



> #2: If I was really going this way and I wanted to use it for serious
> tricks for high performance I/O then I'd provide the handle from
> userspace so that the strategy for allocation is controlled by the caller
> who is the only one who can make the smart decisions

It does not work. What if the main application, library A and library B 
wants to implement their own allocation strategy?



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:51                                           ` Davide Libenzi
@ 2007-06-08 21:24                                             ` Alan Cox
  2007-06-08 21:59                                               ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Alan Cox @ 2007-06-08 21:24 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

> > #1: Throw the whole thing away and accept its not a good idea anyway
> 
> Unfortunately (exactly because of the same guarantees you're asking for 
> those handles), in order for userspace libraries to reliably internally 
> use fds to interact with the kernel, you need another kind of allocation 
> strategy.

Unproven and dubious at best as a claim.

> > #2: If I was really going this way and I wanted to use it for serious
> > tricks for high performance I/O then I'd provide the handle from
> > userspace so that the strategy for allocation is controlled by the caller
> > who is the only one who can make the smart decisions
> 
> It does not work. What if the main application, library A and library B 
> wants to implement their own allocation strategy?

Its called "discipline". I would suggest that libc contains a default
allocator. You might also want to assign library and application ranges
for clarity.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 21:24                                             ` Alan Cox
@ 2007-06-08 21:59                                               ` Davide Libenzi
  2007-06-08 22:28                                                 ` Alan Cox
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 21:59 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Alan Cox wrote:

> > > #1: Throw the whole thing away and accept its not a good idea anyway
> > 
> > Unfortunately (exactly because of the same guarantees you're asking for 
> > those handles), in order for userspace libraries to reliably internally 
> > use fds to interact with the kernel, you need another kind of allocation 
> > strategy.
> 
> Unproven and dubious at best as a claim.

I really don't mean to be rude and pointing you to read the archives, but 
the proof and the reason why claims are valid is inside there.



> > > #2: If I was really going this way and I wanted to use it for serious
> > > tricks for high performance I/O then I'd provide the handle from
> > > userspace so that the strategy for allocation is controlled by the caller
> > > who is the only one who can make the smart decisions
> > 
> > It does not work. What if the main application, library A and library B 
> > wants to implement their own allocation strategy?
> 
> Its called "discipline". I would suggest that libc contains a default
> allocator. You might also want to assign library and application ranges
> for clarity.

That is really nice solution. Each library has to have each own allocator. 
Then we'll have what, a committee that assigns fd ranges?
Ranges cannot be implemented with the current fdtable allocator, because 
they'll be way far apart and they'll waste space. Multiple separate ranges 
will require multiple fdtable structures (one for each range/allocator). 
Instead of a two-way switch (legacy and non-sequential) you will have 
multiple choices to select. This multiple choices will have to be 
replicated all around the code that access directly the fdtables. I did 
the fdmap consolidation patch, and I can tell you there are quite a few 
places that access fdtables directly.




- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 21:59                                               ` Davide Libenzi
@ 2007-06-08 22:28                                                 ` Alan Cox
  2007-06-08 22:38                                                   ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Alan Cox @ 2007-06-08 22:28 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

> > Unproven and dubious at best as a claim.
> 
> I really don't mean to be rude and pointing you to read the archives, but 
> the proof and the reason why claims are valid is inside there.

I've read the archive. I'm totally unconvinced by any of the fd
allocation policy stuff. There are some good arguments about O_CLOEXEC
and threading but not about fd allocation.
 
> > > It does not work. What if the main application, library A and library B 
> > > wants to implement their own allocation strategy?
> > 
> > Its called "discipline". I would suggest that libc contains a default
> > allocator. You might also want to assign library and application ranges
> > for clarity.
> 
> That is really nice solution. Each library has to have each own allocator. 

Are you being deliberately stupid ?

I suggested *libc* contains a default allocator

> Then we'll have what, a committee that assigns fd ranges?

Currently the fd ranges are assigned by a committee called POSIX based on
Unix practice.

> replicated all around the code that access directly the fdtables. I did 
> the fdmap consolidation patch, and I can tell you there are quite a few 
> places that access fdtables directly.

This is true, but if you are worried about complexity we get back to the
original posix allocator which packs them in tight and produces a most
excellent spread in the general case (whacko apps like bash aside)


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 22:28                                                 ` Alan Cox
@ 2007-06-08 22:38                                                   ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-08 22:38 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, Theodore Tso, Eric Dumazet, Kyle Moffett,
	Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Alan Cox wrote:

> > > > It does not work. What if the main application, library A and library B 
> > > > wants to implement their own allocation strategy?
> > > 
> > > Its called "discipline". I would suggest that libc contains a default
> > > allocator. You might also want to assign library and application ranges
> > > for clarity.
> > 
> > That is really nice solution. Each library has to have each own allocator. 
> 
> Are you being deliberately stupid ?
> 
> I suggested *libc* contains a default allocator

"You might also want to assign library and application ranges for clarity."

So let's see. We have the legacy one (1st fdtable), the non-legacy app one 
(2nd fdtable) and one for the library (3rd fdtable). And which library? 
Each one his own (4th, 5th ... fdtable)? ...



> > replicated all around the code that access directly the fdtables. I did 
> > the fdmap consolidation patch, and I can tell you there are quite a few 
> > places that access fdtables directly.
> 
> This is true, but if you are worried about complexity we get back to the
> original posix allocator which packs them in tight and produces a most
> excellent spread in the general case (whacko apps like bash aside)

... complexity does not come from the allocator, but from the fact that 
direct access to the fdtable is done all over the kernel. Code that 
assumes there's only *one* fdtable.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:21                                           ` Davide Libenzi
@ 2007-06-09  0:03                                             ` Linus Torvalds
  2007-06-09  0:13                                               ` Davide Libenzi
  2007-06-09  0:36                                               ` Al Viro
  0 siblings, 2 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-09  0:03 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ulrich Drepper, Al Viro, Alan Cox, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar



On Fri, 8 Jun 2007, Davide Libenzi wrote:
>
> On Fri, 8 Jun 2007, Linus Torvalds wrote:
> > 
> > You need things to be *repeatable* for debugging. No ifs, buts, or maybes 
> > about it.
> 
> It all depends on how you use the file descriptor.

Read what I wrote. "for debugging".

If your code is bug-free, and does what you intend it to do, everything is 
fine. But you wouldn't be doing debugging then, would you?

For debugging, it does _not_ depend on "how you use the file descriptor". 
The whole _point_ is that something does something wrong. Maybe you 
_intended_ to use the file descriptor some way, and the bug was that you 
didn't.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  0:03                                             ` Linus Torvalds
@ 2007-06-09  0:13                                               ` Davide Libenzi
  2007-06-09  0:36                                               ` Al Viro
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-09  0:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ulrich Drepper, Al Viro, Alan Cox, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

On Fri, 8 Jun 2007, Linus Torvalds wrote:

> On Fri, 8 Jun 2007, Davide Libenzi wrote:
> >
> > On Fri, 8 Jun 2007, Linus Torvalds wrote:
> > > 
> > > You need things to be *repeatable* for debugging. No ifs, buts, or maybes 
> > > about it.
> > 
> > It all depends on how you use the file descriptor.
> 
> Read what I wrote. "for debugging".
> 
> If your code is bug-free, and does what you intend it to do, everything is 
> fine. But you wouldn't be doing debugging then, would you?
> 
> For debugging, it does _not_ depend on "how you use the file descriptor". 
> The whole _point_ is that something does something wrong. Maybe you 
> _intended_ to use the file descriptor some way, and the bug was that you 
> didn't.

Ok, so what's your idea? Have another POSIX-like allocator somewhere up 
there in the fd space, with a fixed and include-defined base?
Or just drop all this altogether?



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  0:03                                             ` Linus Torvalds
  2007-06-09  0:13                                               ` Davide Libenzi
@ 2007-06-09  0:36                                               ` Al Viro
  2007-06-09  1:19                                                 ` Ulrich Drepper
  1 sibling, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-09  0:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Davide Libenzi, Ulrich Drepper, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Fri, Jun 08, 2007 at 05:03:57PM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 8 Jun 2007, Davide Libenzi wrote:
> >
> > On Fri, 8 Jun 2007, Linus Torvalds wrote:
> > > 
> > > You need things to be *repeatable* for debugging. No ifs, buts, or maybes 
> > > about it.
> > 
> > It all depends on how you use the file descriptor.
> 
> Read what I wrote. "for debugging".
> 
> If your code is bug-free, and does what you intend it to do, everything is 
> fine. But you wouldn't be doing debugging then, would you?
> 
> For debugging, it does _not_ depend on "how you use the file descriptor". 
> The whole _point_ is that something does something wrong. Maybe you 
> _intended_ to use the file descriptor some way, and the bug was that you 
> didn't.

Exactly.  Put it another way, randomizer is a stress-tester.  Which is
a damn useful debugging tool in its own right.  *However*, the main thing
about debugging tools is that you need to be able to turn them on and off
individually; then you get a useful information narrowing the diagnosis
down.  If you can't do that, you lose.

	Folks, the real question is whether we consider the loops blindly
shooting down the file descriptors as a supportable thing.  Correction:
whether the code written in that style *and* *correct* *with* *current*
*semantics* is sufficiently numerous and hard to notice that we need that
sort of kludges.  Because what Uli's suggesting is certainly that - a kludge
sufficient for glibc internal needs.

Note that
#define NR_FILES <some constant>

for (i = 0; i < NR_FILES; i++)
	close(i);

is simply broken.  No ifs, no buts, it's broken on the current kernel.  So
that kind of stuff needs fixing anyway; do we have enough left to make it
worth preserving?

I don't know the answer; some data would be much appreciated.  If the broken
stuff like above outnumbers the valid uses, we probably need think of some
way to catch that kind of crap anyway...  Randomizer for open() et.al.
switchable on per-process basis would be a nice tool to catch some of those,
along with instrumenting the kernel to notice massive close() on files that
hadn't been opened, etc.

As long as all we have is a handwaving about widespread and unspecified
code doing this, this and that, but not that...

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  0:36                                               ` Al Viro
@ 2007-06-09  1:19                                                 ` Ulrich Drepper
  2007-06-09  1:41                                                   ` Al Viro
  0 siblings, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-09  1:19 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Al Viro wrote:
> Exactly.  Put it another way, randomizer is a stress-tester.

... and a security mechanism.  And as such it is only useful if it is
used.  Probably it should be policy-controlled whether you can turn it off.


> Note that
> #define NR_FILES <some constant>
> 
> for (i = 0; i < NR_FILES; i++)
> 	close(i);

You're confusing the problems.  This is not the argument for having a
separate file descriptor set.  It is the argument to have hidden file
descriptors.  Randomization has nothing whatsoever to do with this example.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGagAg2ijCOnn/RHQRAhV+AJ9qT2epTxDWWS++74f+vrV3NucVHACdGkxm
MULoyE+0NY7dSHcB2epKe7w=
=o3+1
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  1:19                                                 ` Ulrich Drepper
@ 2007-06-09  1:41                                                   ` Al Viro
  2007-06-09  2:10                                                     ` Ulrich Drepper
  0 siblings, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-09  1:41 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Fri, Jun 08, 2007 at 06:19:28PM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Al Viro wrote:
> > Exactly.  Put it another way, randomizer is a stress-tester.
> 
> ... and a security mechanism.  And as such it is only useful if it is
> used.  Probably it should be policy-controlled whether you can turn it off.
 
Any real-world examples of exploitable holes based on that?  Real blue-sky
world, not grsec-advocacy "undiscovered null ptr derefs on every corner and
all of the variety fixable by our magic goo" one, please.

> > Note that
> > #define NR_FILES <some constant>
> > 
> > for (i = 0; i < NR_FILES; i++)
> > 	close(i);
> 
> You're confusing the problems.

No, I'm not.  The entire argument for having a separate set of descriptors
is based on programs behaving in similar fashion, working correctly now but
limiting what libraries can do with opening files for internal needs.

So here's the question: how widespread it really is, considering that "working
correctly" is not a trivial constraint.  Again, I'd love to see real data;
if I need handwaving, I know where to find it.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  1:41                                                   ` Al Viro
@ 2007-06-09  2:10                                                     ` Ulrich Drepper
  2007-06-09 15:15                                                       ` Al Viro
  0 siblings, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-09  2:10 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Al Viro wrote:
> Any real-world examples of exploitable holes based on that?

Return to libc exploit, calling dup2, where some privileged data is
redirected from the normal file descriptor to one of the attackers
choosing.  The latter could be an outgoing socket connection which would
result in leaking the data to the outside.

         normal code                         intruder

                                          so = socket()

    fd = open ("local-file")

                                          dup2(so, fd);

    write (fd, privileged data)


It's just a little function call.  If the arguments of dup2() are known
this is not a big issue to construct.



>> You're confusing the problems.
> 
> No, I'm not.  The entire argument for having a separate set of descriptors
> is based on programs behaving in similar fashion, working correctly now but
> limiting what libraries can do with opening files for internal needs.

It's completely different.

The reason why runtime libraries cannot keep descriptors open unless it
is explicitly part of the API (e.g., opendir) is that POSIX and Unix
forever guarantee that descriptors are allocated sequentially.  Linus
already showed a code sequence:

	close(0);
	.. something else ..
	if (open("myfile", O_RDONLY) < 0)
		exit(1);

This occurs in the real world and it is guaranteed to work.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGagv72ijCOnn/RHQRAqxTAJwLhjuFT22SegEVXrbevpsnOkDxLQCgwgza
7ZOScxEm2lgMJNjG9UDAdfo=
=fenl
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 18:11                               ` Davide Libenzi
  2007-06-08 18:26                                 ` Alan Cox
@ 2007-06-09  5:41                                 ` Paul Mackerras
  2007-06-09 14:38                                   ` Kyle Moffett
  2007-06-09 17:00                                   ` Davide Libenzi
  1 sibling, 2 replies; 129+ messages in thread
From: Paul Mackerras @ 2007-06-09  5:41 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Alan Cox, Theodore Tso, Ulrich Drepper, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

Davide Libenzi writes:

> The only reason we use a floating base, is because Uli preferred to have 
> non-exactly predictable fd allocations. There no reason of re-doing the 
> same POSIX mistake all over again:

Why must everything that makes things a bit simpler and more
predictable for application programmers be called a "mistake"?

Paul.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  5:41                                 ` Paul Mackerras
@ 2007-06-09 14:38                                   ` Kyle Moffett
  2007-06-10  6:48                                     ` Paul Mackerras
  2007-06-09 17:00                                   ` Davide Libenzi
  1 sibling, 1 reply; 129+ messages in thread
From: Kyle Moffett @ 2007-06-09 14:38 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Davide Libenzi, Alan Cox, Theodore Tso, Ulrich Drepper,
	Eric Dumazet, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

On Jun 09, 2007, at 01:41:42, Paul Mackerras wrote:
> Davide Libenzi writes:
>
>> The only reason we use a floating base, is because Uli preferred  
>> to have non-exactly predictable fd allocations. There no reason of  
>> re-doing the same POSIX mistake all over again:
>
> Why must everything that makes things a bit simpler and more  
> predictable for application programmers be called a "mistake"?

1)  Linear FD allocation makes it IMPOSSIBLE for libraries to  
reliably use persistent FDs behind an application's back.  For  
example, this code sequence should almost always be OK: (except when  
calling into a library specifically to open a file or socket)

close(0);
some_library_function();
fd = open("some_file", O_RDWR);
/* fd == 0 here */

2)  Another common code example which causes problems:

int i;
for (i = 0; i < NR_OPEN; i++)
	if (!fd_is_special_to_us(i))
		close(i);

Note that this is conceptually buggy, but occurs in several major C  
programming books, most of the major shells, and a lot of other  
software to boot.

3) In order to allocate new FDs, the kernel has to scan over a  
(potentially very large) bitmap.  A process with 100,000 fds (not  
terribly uncommon) would have 12.5kbyte of FD bitmap and would trash  
the cache every time it tried to allocate an FD.

You can't even hope to use persistent library FDs with these  
problems, and they even cause problems with servers using lots of  
FDs.  Introducing another linear FD space will just make this sort of  
stuff happen ALL OVER AGAIN.

If you want to make things reproducible, you could have an option  
where the "random" algorithm is just pseudo-linearly allocated FDs  
(no bitmap search) XORed with a boot-time constant either randomly- 
generated or set in the boot args.  That way the people interested in  
maximum security or stress testing can use completely randomized  
numbers and the people who need reproducibility can get that too,  
*without* having the same linear FD allocation problems all over  
again.  The fundamental problem is that a series of numbers  
increasing linearly from X (especially where X == 0) are used as  
meaningful data by lazy programmers, instead of just as a cookie.  By  
nonlinearizing the data, even just a little, that becomes unlikely to  
occur.  It also forces programmers to use FDs correctly and prevents  
problems like this in the future.

Cheers,
Kyle Moffett

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  2:10                                                     ` Ulrich Drepper
@ 2007-06-09 15:15                                                       ` Al Viro
  2007-06-09 16:26                                                         ` Ulrich Drepper
  0 siblings, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-09 15:15 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Fri, Jun 08, 2007 at 07:10:03PM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Al Viro wrote:
> > Any real-world examples of exploitable holes based on that?
> 
> Return to libc exploit, calling dup2, where some privileged data is
> redirected from the normal file descriptor to one of the attackers
> choosing.  The latter could be an outgoing socket connection which would
> result in leaking the data to the outside.
> 
>          normal code                         intruder
> 
>                                           so = socket()
> 
>     fd = open ("local-file")
> 
>                                           dup2(so, fd);
> 
>     write (fd, privileged data)
> 
> 
> It's just a little function call.  If the arguments of dup2() are known
> this is not a big issue to construct.

So which code is supposed to do that open/write in your example?  Library?
Unmodified application?  Application specifically modified to make *that*
open() randomized?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 15:15                                                       ` Al Viro
@ 2007-06-09 16:26                                                         ` Ulrich Drepper
  2007-06-09 16:54                                                           ` Al Viro
  0 siblings, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-09 16:26 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Al Viro wrote:
> So which code is supposed to do that open/write in your example?  Library?
> Unmodified application?  Application specifically modified to make *that*
> open() randomized?

Why should that matter?  All of the above.  Any piece of code can of
course choose to not be safe but it should have the opportunity to be,
too.  Currently that's not the case.  With an O_NONSEQFD flag to open it
would be trivial to change any code.  Or one can introduce a new set of
userlevel interfaces to create and write new files which then can do
even more.

In whatever way you look at it, there currently is a problem which
cannot be solved except with truly horrible, horrible hack (open N
descriptors and randomly select one to use; yep, horrible, I said so).

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGatS62ijCOnn/RHQRAqcfAJ9vMc6GUFxlgxOZ9rhAcTV9N95kUACguUpq
ER8Y64pIp80NHiMHXwMxxK0=
=ytei
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 16:26                                                         ` Ulrich Drepper
@ 2007-06-09 16:54                                                           ` Al Viro
  2007-06-09 17:04                                                             ` Davide Libenzi
  2007-06-09 17:08                                                             ` Ulrich Drepper
  0 siblings, 2 replies; 129+ messages in thread
From: Al Viro @ 2007-06-09 16:54 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 09:26:34AM -0700, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> Al Viro wrote:
> > So which code is supposed to do that open/write in your example?  Library?
> > Unmodified application?  Application specifically modified to make *that*
> > open() randomized?
> 
> Why should that matter?  All of the above.

I asked for real-world example.  Can I have one, please?

> In whatever way you look at it, there currently is a problem which
> cannot be solved except with truly horrible, horrible hack (open N
> descriptors and randomly select one to use; yep, horrible, I said so).

That's simply not true.  On the current kernel nothing stops you from e.g.
picking a random number and using F_DUPFD.  Voila - there's your randomized
descriptor.  Portable to earlier kernels.

Moreover, nonsense^H^H^Hq_fd() can be implemented in userland just fine
if we allow F_DUPFD to arbitrary number - just pass it a random one *or*
base chosen like davedel is doing (constant + 20bit random chosen at start
time).

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09  5:41                                 ` Paul Mackerras
  2007-06-09 14:38                                   ` Kyle Moffett
@ 2007-06-09 17:00                                   ` Davide Libenzi
  2007-06-10  6:26                                     ` Paul Mackerras
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-09 17:00 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Alan Cox, Theodore Tso, Ulrich Drepper, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

On Sat, 9 Jun 2007, Paul Mackerras wrote:

> Davide Libenzi writes:
> 
> > The only reason we use a floating base, is because Uli preferred to have 
> > non-exactly predictable fd allocations. There no reason of re-doing the 
> > same POSIX mistake all over again:
> 
> Why must everything that makes things a bit simpler and more
> predictable for application programmers be called a "mistake"?

Because if you give guarantees on something, ppl start using such 
guarantee in the wrong way. Kyle's email summarizes it.
This should really be treated as an opaque handle, with no assumption on 
its value. And if you start handing over values that are not predictable, 
the userspace is *forced* to not use any assumption on its values. I never 
made any assumption on values returned by APIs returning "handles", and I 
never had any problem (or even care) about how those values were 
distributed in the N bit space.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 16:54                                                           ` Al Viro
@ 2007-06-09 17:04                                                             ` Davide Libenzi
  2007-06-09 17:08                                                               ` Davide Libenzi
  2007-06-09 17:08                                                             ` Ulrich Drepper
  1 sibling, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-09 17:04 UTC (permalink / raw)
  To: Al Viro
  Cc: Ulrich Drepper, Linus Torvalds, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, 9 Jun 2007, Al Viro wrote:

> That's simply not true.  On the current kernel nothing stops you from e.g.
> picking a random number and using F_DUPFD.  Voila - there's your randomized
> descriptor.  Portable to earlier kernels.
> 
> Moreover, nonsense^H^H^Hq_fd() can be implemented in userland just fine
> if we allow F_DUPFD to arbitrary number - just pass it a random one *or*
> base chosen like davedel is doing (constant + 20bit random chosen at start
> time).

That does not work. Or better, it works but it forces *huge* fdtables to 
be created. To do that, you need "start" to be over NR_FILE, and this 
creates an fdtable bigger than NR_FILE ((pointer-size + a-few-bits) * 
NR_FILE) propagated down to every app you fork.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 17:04                                                             ` Davide Libenzi
@ 2007-06-09 17:08                                                               ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-09 17:08 UTC (permalink / raw)
  To: Al Viro
  Cc: Ulrich Drepper, Linus Torvalds, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, 9 Jun 2007, Davide Libenzi wrote:

> On Sat, 9 Jun 2007, Al Viro wrote:
> 
> > That's simply not true.  On the current kernel nothing stops you from e.g.
> > picking a random number and using F_DUPFD.  Voila - there's your randomized
> > descriptor.  Portable to earlier kernels.
> > 
> > Moreover, nonsense^H^H^Hq_fd() can be implemented in userland just fine
> > if we allow F_DUPFD to arbitrary number - just pass it a random one *or*
> > base chosen like davedel is doing (constant + 20bit random chosen at start
> > time).
> 
> That does not work. Or better, it works but it forces *huge* fdtables to 
> be created. To do that, you need "start" to be over NR_FILE, and this 
> creates an fdtable bigger than NR_FILE ((pointer-size + a-few-bits) * 
> NR_FILE) propagated down to every app you fork.

Keep in mind also, that quite a few places in the code, walk through the 
whole fdtable memory. So it is not only a problem of wasted RAM.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 16:54                                                           ` Al Viro
  2007-06-09 17:04                                                             ` Davide Libenzi
@ 2007-06-09 17:08                                                             ` Ulrich Drepper
  2007-06-09 17:24                                                               ` Al Viro
  1 sibling, 1 reply; 129+ messages in thread
From: Ulrich Drepper @ 2007-06-09 17:08 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Al Viro wrote:
> That's simply not true.  On the current kernel nothing stops you from e.g.
> picking a random number and using F_DUPFD.

Of course there are things stopping one from doing this (aside from the
kernel not allowing this in the moment at all since the highest fd
number is limited severely):

- - this scheme would only be use if it would be possible to have
completely random descriptor values.  But what has been said here
already is that this is too costly.  Hence the approach to randomize
only the base value.

- - there are two interface to use: open + fcntl.  This is racy.  And
don't tell me this doesn't matter.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFGat6r2ijCOnn/RHQRAiatAKCs7RLZmpgAU5NyT58c8ueJum4fgwCgjqP0
jPVCCWEdIHVQS05oIjdsZYs=
=7UX+
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 17:08                                                             ` Ulrich Drepper
@ 2007-06-09 17:24                                                               ` Al Viro
  2007-06-09 19:27                                                                 ` Kyle Moffett
  0 siblings, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-09 17:24 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Linus Torvalds, Davide Libenzi, Alan Cox, Theodore Tso,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 10:08:59AM -0700, Ulrich Drepper wrote:
> - - there are two interface to use: open + fcntl.  This is racy.  And
> don't tell me this doesn't matter.

Racy with respect to what?  Return-to-libc exploits from another thread?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 17:24                                                               ` Al Viro
@ 2007-06-09 19:27                                                                 ` Kyle Moffett
  2007-06-09 20:06                                                                   ` Al Viro
  2007-06-10  2:40                                                                   ` Al Viro
  0 siblings, 2 replies; 129+ messages in thread
From: Kyle Moffett @ 2007-06-09 19:27 UTC (permalink / raw)
  To: Al Viro
  Cc: Ulrich Drepper, Linus Torvalds, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Jun 09, 2007, at 13:24:29, Al Viro wrote:
> On Sat, Jun 09, 2007 at 10:08:59AM -0700, Ulrich Drepper wrote:
>> - - there are two interface to use: open + fcntl.  This is racy.   
>> And don't tell me this doesn't matter.
> Racy with respect to what?  Return-to-libc exploits from another  
> thread?

How about racy with respect to normal open or fork+exec from another  
thread?  Specifically there are cases where libc or other libraries  
want to create a backend thread dealing with file descriptors in  
response to the program's straightforward calls into that library  
(Examples include using syslets or event-based polling threads).


SCENARIO 1:

Program Thread:   Library Thread:
                   fd = socket(AF_*, SOCK_*, 0);
fork();
                   int x = FD_CLOEXEC;
                   fcntl(fd, F_SETFD, &x);

New Process:
setgroups(...);
seteuid(...);
exec(....);

Whoops!!! Suddenly the user process executed by the (theoretically)  
single-threaded program got a handle to a netlink socket affecting  
some system resource!!!


SCENARIO 2:

Program Thread:     Async libc getpwent()-cache syslet
close(0);
                     fd = open("/etc/shadow");
open("/dev/null");
code_which_insecurely_reads_from_stdin();

Here we were trying to safely call into code which reads from stdin  
and shouldn't be given privileged data, but the syslet makes the  
common paradigm 'close(0); open("/dev/null");' horribly insecure.

If you extend all the FD syscalls to all take a "flags" parameter and  
add the appropriate flags, then you can pass O_CLOEXEC|O_RANDFD to  
whatever syscall you are using and both problems vanish.

Cheers,
Kyle Moffett


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 19:27                                                                 ` Kyle Moffett
@ 2007-06-09 20:06                                                                   ` Al Viro
  2007-06-09 20:21                                                                     ` Linus Torvalds
  2007-06-10  2:40                                                                   ` Al Viro
  1 sibling, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-09 20:06 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: Ulrich Drepper, Linus Torvalds, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 03:27:43PM -0400, Kyle Moffett wrote:
> On Jun 09, 2007, at 13:24:29, Al Viro wrote:
> >On Sat, Jun 09, 2007 at 10:08:59AM -0700, Ulrich Drepper wrote:
> >>- - there are two interface to use: open + fcntl.  This is racy.   
> >>And don't tell me this doesn't matter.
> >Racy with respect to what?  Return-to-libc exploits from another  
> >thread?
> 
> How about racy with respect to normal open

How the hell can it be racy wrt normal open()?  F_DUPFD is not dup2(),
it's non-overriding.

> or fork+exec from another  
> thread?  Specifically there are cases where libc or other libraries  
> want to create a backend thread dealing with file descriptors in  
> response to the program's straightforward calls into that library  
> (Examples include using syslets or event-based polling threads).
> 
> 
> SCENARIO 1:
> 
> Program Thread:   Library Thread:
>                   fd = socket(AF_*, SOCK_*, 0);
> fork();
>                   int x = FD_CLOEXEC;
>                   fcntl(fd, F_SETFD, &x);
> 
> New Process:
> setgroups(...);
> seteuid(...);
> exec(....);
>
> Whoops!!! Suddenly the user process executed by the (theoretically)  
> single-threaded program got a handle to a netlink socket affecting  
> some system resource!!!
 
Give me a break.  fork(3) is nowhere near plain fork(2); read the nptl
code for details.  Getting a low-overhead exclusion into that scheme is not
a rocket science.  And lose the bangs, please...

> SCENARIO 2:
> 
> Program Thread:     Async libc getpwent()-cache syslet
> close(0);
>                     fd = open("/etc/shadow");
> open("/dev/null");
> code_which_insecurely_reads_from_stdin();

>From what, again?  Use of stdio after that is deep in nasal demon land...

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:06                                                                   ` Al Viro
@ 2007-06-09 20:21                                                                     ` Linus Torvalds
  2007-06-09 20:31                                                                       ` Davide Libenzi
                                                                                         ` (3 more replies)
  0 siblings, 4 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-09 20:21 UTC (permalink / raw)
  To: Al Viro
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar



On Sat, 9 Jun 2007, Al Viro wrote:
> 
> How the hell can it be racy wrt normal open()?  F_DUPFD is not dup2(),
> it's non-overriding.

Al, you probably didn't read this thread from the beginning (not in this 
particular email thread - an earlier one on the whole feature).

The problem is that a thread wants to open a FD_CLOEXEC file descriptor, 
because it is doing something that is thread-local. So it does

	fd = some.op.that.returns.an.fd.like.socket();
	fcntl(fd, F_SETFD, &FD_CLOEXEC);

but *another* thread does an execve() at the same time, and the fcntl() 
never gets to happen!

Which is why you'd like to do the *initial* operation with a flag that 
says "please set the FD_CLOEXEC flag on the file descriptor", so that you 
*atomically* install the file file descriptor and set the FD_CLOEXEC bit.

It's trivial to do for open(), but there are about a million ways to get a 
file descriptor, and open() is just about the *only* one of those that 
actually takes a "flags" field that can be used to tell the kernel.

So ignore the fdmapping for now: that's just an extended thing. The 
problem is _independent_ of the fdmapping, but it turns out that a lot of 
these problems are intertwined, in that you actually want *other* flags 
than just "FD_CLOEXEC". For example, one of the flags would be "private fd 
space" (which is where fdmap comes in), so that a library can allocate its 
own internal file descriptors *without* impacting the caller that depends 
on its own file descriptor allocation.

(And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN 
iterations, no even *halfway* suspect code. It's perfectly fine to do

	close(0);
	close(1);
	close(2);
	.. generate filenames, whatever ..
	if (open(..) < 0 || open(..) < 0 || open(..) < 0)
		die("Couldn't redirect stdin/stdout/stderr");

and there's absolutely nothing wrong with this kind of setup, even if you 
could obviously have done it other ways too (ie by using "dup2()" instead 
of "close + open"),

And that means that libraries currently MUST NOT open their own file 
descriptors, exactly because they mess with the "application file 
descriptor namespace", namely the linear POSIX-defined fd allocation 
rules!

And no, "dup2(fd, SOME_BIG_FD)" is *not* the answer, exactly because we 
end up sucking like mad if you actually were to do it!

So I think both the FD_CLOEXEC _and_ the "private fd space" are real 
issues. I don't agree with the "random fd" approach. I'd much rather have 
a non-random setup for the nonlinear ones (it just shouldn't be linear).

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:21                                                                     ` Linus Torvalds
@ 2007-06-09 20:31                                                                       ` Davide Libenzi
  2007-06-09 21:41                                                                         ` Matt Mackall
  2007-06-09 20:49                                                                       ` Al Viro
                                                                                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-09 20:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Alan Cox, Theodore Tso,
	Eric Dumazet, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

On Sat, 9 Jun 2007, Linus Torvalds wrote:

> So I think both the FD_CLOEXEC _and_ the "private fd space" are real 
> issues. I don't agree with the "random fd" approach. I'd much rather have 
> a non-random setup for the nonlinear ones (it just shouldn't be linear).

That is fine for me. So what about the randomness?

A) Don't do it at all

B) Let userspace select it in some way globally

C) Let userspace select it per-fd (this won't be an O(1) anymore though)




- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:21                                                                     ` Linus Torvalds
  2007-06-09 20:31                                                                       ` Davide Libenzi
@ 2007-06-09 20:49                                                                       ` Al Viro
  2007-06-09 21:55                                                                         ` Matt Mackall
  2007-06-09 23:33                                                                         ` Linus Torvalds
  2007-06-10  3:19                                                                       ` Al Viro
  2007-06-10  9:14                                                                       ` Eric Dumazet
  3 siblings, 2 replies; 129+ messages in thread
From: Al Viro @ 2007-06-09 20:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 01:21:24PM -0700, Linus Torvalds wrote:
> Which is why you'd like to do the *initial* operation with a flag that 
> says "please set the FD_CLOEXEC flag on the file descriptor", so that you 
> *atomically* install the file file descriptor and set the FD_CLOEXEC bit.
> 
> It's trivial to do for open(), but there are about a million ways to get a 
> file descriptor, and open() is just about the *only* one of those that 
> actually takes a "flags" field that can be used to tell the kernel.

Eww...  Idea of pipe(2) taking flags as argument...

BTW, you also need that for recvmsg() (SCM_RIGHTS) and fsckloads of
syscalls we've got duplicating open() for no good reason (and no, "BSD
folks did it for sockets, so we'll do it for tons of new subsystems" doesn't
really qualify ;-/).

I don't know if your indirect is a good idea in that respect, actually.
AFAICS, you are suggesting per-syscall meanings of the flags, so it really
smells like YAMultiplexor, free for abuse.
 
> (And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN 
> iterations, no even *halfway* suspect code. It's perfectly fine to do
> 
> 	close(0);
> 	close(1);
> 	close(2);
> 	.. generate filenames, whatever ..
> 	if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> 		die("Couldn't redirect stdin/stdout/stderr");
> 
> and there's absolutely nothing wrong with this kind of setup, even if you 
> could obviously have done it other ways too (ie by using "dup2()" instead 
> of "close + open"),

Yeah, well - I wouldn't call that perfectly fine, but it's probably too
widespread to kill.  Just as use of 0 for NULL ;-)

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:31                                                                       ` Davide Libenzi
@ 2007-06-09 21:41                                                                         ` Matt Mackall
  2007-06-09 22:12                                                                           ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Matt Mackall @ 2007-06-09 21:41 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linus Torvalds, Al Viro, Kyle Moffett, Ulrich Drepper, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 01:31:56PM -0700, Davide Libenzi wrote:
> On Sat, 9 Jun 2007, Linus Torvalds wrote:
> 
> > So I think both the FD_CLOEXEC _and_ the "private fd space" are real 
> > issues. I don't agree with the "random fd" approach. I'd much rather have 
> > a non-random setup for the nonlinear ones (it just shouldn't be linear).
> 
> That is fine for me. So what about the randomness?
> 
> A) Don't do it at all
> 
> B) Let userspace select it in some way globally
> 
> C) Let userspace select it per-fd (this won't be an O(1) anymore though)

It should be handled the same way that VMA layout randomness is
handled. There are multiple knobs including ELF markers and /proc
bits.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:49                                                                       ` Al Viro
@ 2007-06-09 21:55                                                                         ` Matt Mackall
  2007-06-09 23:33                                                                         ` Linus Torvalds
  1 sibling, 0 replies; 129+ messages in thread
From: Matt Mackall @ 2007-06-09 21:55 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Kyle Moffett, Ulrich Drepper, Davide Libenzi,
	Alan Cox, Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 09:49:07PM +0100, Al Viro wrote:
> On Sat, Jun 09, 2007 at 01:21:24PM -0700, Linus Torvalds wrote:
> > Which is why you'd like to do the *initial* operation with a flag that 
> > says "please set the FD_CLOEXEC flag on the file descriptor", so that you 
> > *atomically* install the file file descriptor and set the FD_CLOEXEC bit.
> > 
> > It's trivial to do for open(), but there are about a million ways to get a 
> > file descriptor, and open() is just about the *only* one of those that 
> > actually takes a "flags" field that can be used to tell the kernel.
> 
> Eww...  Idea of pipe(2) taking flags as argument...
> 
> BTW, you also need that for recvmsg() (SCM_RIGHTS) and fsckloads of
> syscalls we've got duplicating open() for no good reason (and no, "BSD
> folks did it for sockets, so we'll do it for tons of new subsystems" doesn't
> really qualify ;-/).
> 
> I don't know if your indirect is a good idea in that respect, actually.
> AFAICS, you are suggesting per-syscall meanings of the flags, so it really
> smells like YAMultiplexor, free for abuse.

Well, of course it sucks. The question is does it suck less than
adding dozens of new syscalls to patch up problems with POSIX?

I don't think we can get this right in one iteration, so I actually
prefer Linus's approach as less likely to leave numerous vestiges of
incremental improvements around.

On the other hand, we don't want to end up in a future where the bulk
of our calls are through sys_indirect! Given the number of important
APIs that have gotten slight semantic bumps, that's a real
possibility.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 21:41                                                                         ` Matt Mackall
@ 2007-06-09 22:12                                                                           ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-09 22:12 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Linus Torvalds, Al Viro, Kyle Moffett, Ulrich Drepper, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, 9 Jun 2007, Matt Mackall wrote:

> On Sat, Jun 09, 2007 at 01:31:56PM -0700, Davide Libenzi wrote:
> > On Sat, 9 Jun 2007, Linus Torvalds wrote:
> > 
> > > So I think both the FD_CLOEXEC _and_ the "private fd space" are real 
> > > issues. I don't agree with the "random fd" approach. I'd much rather have 
> > > a non-random setup for the nonlinear ones (it just shouldn't be linear).
> > 
> > That is fine for me. So what about the randomness?
> > 
> > A) Don't do it at all
> > 
> > B) Let userspace select it in some way globally
> > 
> > C) Let userspace select it per-fd (this won't be an O(1) anymore though)
> 
> It should be handled the same way that VMA layout randomness is
> handled. There are multiple knobs including ELF markers and /proc
> bits.

You mean a global bit controlled by /proc, eventually overridden by an ELF 
flag? An maybe a prctl() to give sw configurability?



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:49                                                                       ` Al Viro
  2007-06-09 21:55                                                                         ` Matt Mackall
@ 2007-06-09 23:33                                                                         ` Linus Torvalds
  2007-06-10  3:35                                                                           ` Davide Libenzi
  1 sibling, 1 reply; 129+ messages in thread
From: Linus Torvalds @ 2007-06-09 23:33 UTC (permalink / raw)
  To: Al Viro
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar



On Sat, 9 Jun 2007, Al Viro wrote:
> 
> Eww...  Idea of pipe(2) taking flags as argument...

Right. That was one of the patches, and it was one that I said was too 
damn ugly to live.

So I instead suggested the alternate approach of adding a single new 
system call that runs another system call indirectly, with a set of flags 
and/or other behaviour modifications.

And I actually think that's a great idea, because we have *multiple* uses 
of this kind of "run system call with special flags" issues:

 - the whole "async" thing, and as Matt reminded me, this is the perfect 
   interface for also saying "run this system call asynchronously"

 - things like FD_CLOEXEC, but also things like O_NOFOLLOW and O_NDELAY 
   (which currently only "open()" supports, even though it makes sense for 
   a lot of other ops too)

 - extended flags like LOOKUP_NOSYMLINK and LOOKUP_NOMOUNT for any system 
   call that does path lookup (to make it return errors if you *ever* 
   traverse a symlink or a mount-point respectively: security conscious 
   programs tend to want this - think about apache that exports per-user 
   "public_html" stuff, but does *not* want to export /etc, and thus 
   doesn't like following symlinks).

> I don't know if your indirect is a good idea in that respect, actually.
> AFAICS, you are suggesting per-syscall meanings of the flags, so it really
> smells like YAMultiplexor, free for abuse.

Well, the actual _setup_ would be global (ie we would have no per-system 
call actions: we'd just copy the "flags" into the thread 
"system_call_flags" thing).

But yes, different system calls could decide to *interpret* the flags 
differently. Quite frankly, I'd rather have that kind of overloaded flag 
bitmap, than try to create something "extensible". 

That said, I don't think there really will be all that many flags, and we 
could even just decide that the bits have to be totally unique (and simply 
limited to 32 flags). The whole point of the flags, after all, would be 
things that *do* make sense for a whole group of system calls: if that's 
not true, and it's some single-system-call thing, then you might as well 
just always add the new system call itself!

So it makes sense for generic flags (like ASYNC and path lookup rules: not 
every system call takes a path, of course, but it's conceptually still 
pretty damn generic). but it wouldn't really make sense to do for a very 
specific system call - it would be faster and easier to just create the 
new system call in the first place.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 19:27                                                                 ` Kyle Moffett
  2007-06-09 20:06                                                                   ` Al Viro
@ 2007-06-10  2:40                                                                   ` Al Viro
  1 sibling, 0 replies; 129+ messages in thread
From: Al Viro @ 2007-06-10  2:40 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: Ulrich Drepper, Linus Torvalds, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 03:27:43PM -0400, Kyle Moffett wrote:
> SCENARIO 1:
> 
> Program Thread:   Library Thread:
>                   fd = socket(AF_*, SOCK_*, 0);
> fork();
>                   int x = FD_CLOEXEC;
>                   fcntl(fd, F_SETFD, &x);

BTW, regardless of anything else, in such situation this "library
thread" would be better off by just having independent descriptors.
We _can_ do that just fine.

That is, if library code using such stuff would be OK with being a thread.
Any specific examples one way or another?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:21                                                                     ` Linus Torvalds
  2007-06-09 20:31                                                                       ` Davide Libenzi
  2007-06-09 20:49                                                                       ` Al Viro
@ 2007-06-10  3:19                                                                       ` Al Viro
  2007-06-10  3:48                                                                         ` Linus Torvalds
  2007-06-10  9:14                                                                       ` Eric Dumazet
  3 siblings, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-10  3:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 01:21:24PM -0700, Linus Torvalds wrote:
 
> (And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN 
> iterations, no even *halfway* suspect code. It's perfectly fine to do
> 
> 	close(0);
> 	close(1);
> 	close(2);
> 	.. generate filenames, whatever ..
> 	if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> 		die("Couldn't redirect stdin/stdout/stderr");
> 
> and there's absolutely nothing wrong with this kind of setup, even if you 
> could obviously have done it other ways too (ie by using "dup2()" instead 
> of "close + open"),
> 
> And that means that libraries currently MUST NOT open their own file 
> descriptors, exactly because they mess with the "application file 
> descriptor namespace", namely the linear POSIX-defined fd allocation 
> rules!

Unless it does so in a thread that has unshared its descriptor table.
Which is certainly safer than suggested variants and which might be
natural thing to do anyway (e.g. if we do behind-the-scene getpwent()
caching, etc., we wouldn't be hurt by doing the work asynchronously
wrt the rest of program).

It sure as hell beats the "it's probably going to be safe with expected
use patterns in programs using our library" in the safety department;
that way we *know* that nothing of our stuff will leak anywhere, just as
we know that whatever other threads might be doing, they won't screw with
our descriptors in any way.

That's independent from O_CLOEXEC, etc., and for cases where such technics
is usable we get a bonus of code working with earlier kernels just fine.

Comments?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 23:33                                                                         ` Linus Torvalds
@ 2007-06-10  3:35                                                                           ` Davide Libenzi
  2007-06-10  3:49                                                                             ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-10  3:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Alan Cox, Theodore Tso,
	Eric Dumazet, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

On Sat, 9 Jun 2007, Linus Torvalds wrote:

> On Sat, 9 Jun 2007, Al Viro wrote:
> > 
> > Eww...  Idea of pipe(2) taking flags as argument...
> 
> Right. That was one of the patches, and it was one that I said was too 
> damn ugly to live.
> 
> So I instead suggested the alternate approach of adding a single new 
> system call that runs another system call indirectly, with a set of flags 
> and/or other behaviour modifications.

What if we put a data section in the vdso and we let userspace set the 
flags or whatever in there? Syslets could use one of those bits to set the 
"threadlet mode" w/out calling a syscall at all.
In that way we have no new syscalls whatsoever. No?


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  3:19                                                                       ` Al Viro
@ 2007-06-10  3:48                                                                         ` Linus Torvalds
  2007-06-10  4:00                                                                           ` Al Viro
                                                                                             ` (2 more replies)
  0 siblings, 3 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-10  3:48 UTC (permalink / raw)
  To: Al Viro
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar



On Sun, 10 Jun 2007, Al Viro wrote:
> > 
> > And that means that libraries currently MUST NOT open their own file 
> > descriptors, exactly because they mess with the "application file 
> > descriptor namespace", namely the linear POSIX-defined fd allocation 
> > rules!
> 
> Unless it does so in a thread that has unshared its descriptor table.

Agreed. That was actually part of the reason why I thought clone() was
much better than the pthreads interface.

That said, the Linux !CLONE_FILES does have downsides:

 - it is potentially much slower to do than sharing everything (if you 
   have lots of file descriptors, incrementing the refcounts etc is 
   actually a real overhead)

 - it simply doesn't work, if the library wants to run in the same 
   execution context, and just wants to open one (or more) file 
   descriptors for some helper thing.

IOW, the most common case for libraries is not that they get invoced to do 
one thing, but that they get loaded and then used over and over and over 
again, and the _reason_ for wanting to have a file descriptor open may 
well be that the library wants to cache the file descriptor, rather than 
having to open a file over and over again!

For example, a library routine that does a full

	fd = open();
	.. do something with it ..
	close(fd);

generally doesn't need any private file descriptors at all (although there 
are the threading issues with exec etc) - it will temporarily use a normal 
file descriptor, and the caller won't be any wiser. Lots of current 
library routines do this all the time.

But let's say that you want to do a library that does name resolution, and 
you actually want to create the socket that binds to the DNS server just 
once, and then re-use that socket across library calls. It's not that the 
library is a thread of its own - it's not - but with the normal linear fd 
space it really cannot do this. Sure, it could try to hide it up somewhere 
in high fd space, but that would slow down other operations, and there's 
no way to guarantee it doesn't clash with some _other_ library doing the 
same thing, so it really isn't a good idea.

Now, when you do a DNS query, the setup cost of opening the socket is the 
least of your worries, so the above example is not a very good one. I'm 
really just giving it as a concrete example of a _conceptual_ problem, 
where some other library really had more pressing performance reasons why 
they cannot keep re-opening a file descriptor and closing it each time.

So _that_ is the kind of situation where I think "anonymous file 
descriptors" make sense. 

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  3:35                                                                           ` Davide Libenzi
@ 2007-06-10  3:49                                                                             ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-10  3:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Alan Cox, Theodore Tso,
	Eric Dumazet, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

On Sat, 9 Jun 2007, Davide Libenzi wrote:

> On Sat, 9 Jun 2007, Linus Torvalds wrote:
> 
> > On Sat, 9 Jun 2007, Al Viro wrote:
> > > 
> > > Eww...  Idea of pipe(2) taking flags as argument...
> > 
> > Right. That was one of the patches, and it was one that I said was too 
> > damn ugly to live.
> > 
> > So I instead suggested the alternate approach of adding a single new 
> > system call that runs another system call indirectly, with a set of flags 
> > and/or other behaviour modifications.
> 
> What if we put a data section in the vdso and we let userspace set the 
> flags or whatever in there? Syslets could use one of those bits to set the 
> "threadlet mode" w/out calling a syscall at all.
> In that way we have no new syscalls whatsoever. No?

Uhh, never mind :) That won't work in the vdso. Maybe in a TLS area.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  3:48                                                                         ` Linus Torvalds
@ 2007-06-10  4:00                                                                           ` Al Viro
  2007-06-10  4:03                                                                             ` Linus Torvalds
  2007-06-10  4:45                                                                           ` dean gaudet
  2007-06-10  6:35                                                                           ` Kari Hurtta
  2 siblings, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-10  4:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 08:48:39PM -0700, Linus Torvalds wrote:
> Agreed. That was actually part of the reason why I thought clone() was
> much better than the pthreads interface.
> 
> That said, the Linux !CLONE_FILES does have downsides:
> 
>  - it is potentially much slower to do than sharing everything (if you 
>    have lots of file descriptors, incrementing the refcounts etc is 
>    actually a real overhead)

Huh?  We _skip_ the overhead when descriptor table is not shared.  Think
for a minute - we can skip playing with refcount of fget() in the beginning
of syscall if and only if we know that nobody will touch the reference from
the descriptor table.  I.e. if descriptor table is not shared.  IOW, it's
the other way round - it's _faster_ to not share descriptors.
 
>  - it simply doesn't work, if the library wants to run in the same 
>    execution context, and just wants to open one (or more) file 
>    descriptors for some helper thing.

True, but...  That really depends on the potential uses.  Any real-world
examples (e.g. in related threads)?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  4:00                                                                           ` Al Viro
@ 2007-06-10  4:03                                                                             ` Linus Torvalds
  2007-06-10  4:06                                                                               ` Al Viro
  0 siblings, 1 reply; 129+ messages in thread
From: Linus Torvalds @ 2007-06-10  4:03 UTC (permalink / raw)
  To: Al Viro
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar



On Sun, 10 Jun 2007, Al Viro wrote:

> On Sat, Jun 09, 2007 at 08:48:39PM -0700, Linus Torvalds wrote:
> > Agreed. That was actually part of the reason why I thought clone() was
> > much better than the pthreads interface.
> > 
> > That said, the Linux !CLONE_FILES does have downsides:
> > 
> >  - it is potentially much slower to do than sharing everything (if you 
> >    have lots of file descriptors, incrementing the refcounts etc is 
> >    actually a real overhead)
> 
> Huh?  We _skip_ the overhead when descriptor table is not shared.

Not for clone()/exit(), ie the actual _creation_ of the thread.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  4:03                                                                             ` Linus Torvalds
@ 2007-06-10  4:06                                                                               ` Al Viro
  0 siblings, 0 replies; 129+ messages in thread
From: Al Viro @ 2007-06-10  4:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 09:03:23PM -0700, Linus Torvalds wrote:
> 
> 
> On Sun, 10 Jun 2007, Al Viro wrote:
> 
> > On Sat, Jun 09, 2007 at 08:48:39PM -0700, Linus Torvalds wrote:
> > > Agreed. That was actually part of the reason why I thought clone() was
> > > much better than the pthreads interface.
> > > 
> > > That said, the Linux !CLONE_FILES does have downsides:
> > > 
> > >  - it is potentially much slower to do than sharing everything (if you 
> > >    have lots of file descriptors, incrementing the refcounts etc is 
> > >    actually a real overhead)
> > 
> > Huh?  We _skip_ the overhead when descriptor table is not shared.
> 
> Not for clone()/exit(), ie the actual _creation_ of the thread.

Umm...  Yes, but that depends on the time when that sucker is started and
the lifetime.  IOW, we really need specific examples.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  3:48                                                                         ` Linus Torvalds
  2007-06-10  4:00                                                                           ` Al Viro
@ 2007-06-10  4:45                                                                           ` dean gaudet
  2007-06-10  5:06                                                                             ` Linus Torvalds
  2007-06-10  6:35                                                                           ` Kari Hurtta
  2 siblings, 1 reply; 129+ messages in thread
From: dean gaudet @ 2007-06-10  4:45 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, 9 Jun 2007, Linus Torvalds wrote:

> IOW, the most common case for libraries is not that they get invoced to do 
> one thing, but that they get loaded and then used over and over and over 
> again, and the _reason_ for wanting to have a file descriptor open may 
> well be that the library wants to cache the file descriptor, rather than 
> having to open a file over and over again!

for an example of a library wanting to cache an open fd ... and failing 
miserably at protecting itself from the application closing its fd read:

http://bugzilla.padl.com/show_bug.cgi?id=304
http://bugzilla.padl.com/show_bug.cgi?id=305

basically libnss-ldap is trying to use getsockname/getpeername to prove 
that an fd belongs to it.  the failure modes are quite delightful.

-dean

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  4:45                                                                           ` dean gaudet
@ 2007-06-10  5:06                                                                             ` Linus Torvalds
  2007-06-10  5:46                                                                               ` Al Viro
  0 siblings, 1 reply; 129+ messages in thread
From: Linus Torvalds @ 2007-06-10  5:06 UTC (permalink / raw)
  To: dean gaudet
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar



On Sat, 9 Jun 2007, dean gaudet wrote:
> 
> for an example of a library wanting to cache an open fd ... and failing 
> miserably at protecting itself from the application closing its fd read:

Heh.

Why the hell doesn't that thing just do an "fstat()" on the thing, and 
compare the inode number? Not that I would guarantee that it works either 
for a socket, but it would seem to make more sense than what it apparently 
does now, and I think it should work at least on Linux.

(Ie linux will give fake "st_ino" numbers to sockets. Whether anybody else 
will do that, I have no friggin clue. And you probably shouldn't depend on 
it even under Linux, but damn, despite all of those issues, it's likely 
better and more logical than what libnss-ldap apparently does now ;)

Doing a "fstat()" call is how you generally test if two fd's point to the 
same file. Doing the same thing for sockets would at least be half-way 
sane, and even if the OS doesn't give new sockets individual st_ino 
numbers, they probably won't be *changing*.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  5:06                                                                             ` Linus Torvalds
@ 2007-06-10  5:46                                                                               ` Al Viro
  2007-06-10 17:23                                                                                 ` Linus Torvalds
  0 siblings, 1 reply; 129+ messages in thread
From: Al Viro @ 2007-06-10  5:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dean gaudet, Kyle Moffett, Ulrich Drepper, Davide Libenzi,
	Alan Cox, Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

On Sat, Jun 09, 2007 at 10:06:27PM -0700, Linus Torvalds wrote:
> Why the hell doesn't that thing just do an "fstat()" on the thing, and 
> compare the inode number? Not that I would guarantee that it works either 
> for a socket, but it would seem to make more sense than what it apparently 
> does now, and I think it should work at least on Linux.

It works on Linux, all right.

> (Ie linux will give fake "st_ino" numbers to sockets. Whether anybody else 
> will do that, I have no friggin clue. And you probably shouldn't depend on 
> it even under Linux, but damn, despite all of those issues, it's likely 
> better and more logical than what libnss-ldap apparently does now ;)

On FreeBSD it will simply give you zero st_ino on almost all sockets;
AF_UNIX ones get st_ino invented (and stable).  st_dev is NODEV in all
cases...

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 17:00                                   ` Davide Libenzi
@ 2007-06-10  6:26                                     ` Paul Mackerras
  2007-06-10  7:10                                       ` William Lee Irwin III
  2007-06-10 15:52                                       ` Davide Libenzi
  0 siblings, 2 replies; 129+ messages in thread
From: Paul Mackerras @ 2007-06-10  6:26 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Alan Cox, Theodore Tso, Ulrich Drepper, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

Davide Libenzi writes:

> > Why must everything that makes things a bit simpler and more
> > predictable for application programmers be called a "mistake"?
> 
> Because if you give guarantees on something, ppl start using such 
> guarantee in the wrong way. Kyle's email summarizes it.

OK, my question/protest was a bit terse.

What I am objecting to is this idea that many kernel developers seem
to have, that if there is some aspect of the kernel/user API that
becomes a bit inconvenient for the kernel to implement, then we can
put the blame on the applications that rely on that aspect, call them
names such as "legacy", "abuser", "conceptually buggy", "broken",
etc., and ultimately justify breaking the ABI -- since it's only those
applications that we have demonised that will be affected, after all.

In a way your response quoted above illustrates this perfectly.  If we
give guarantees then people will start relying on them "in the wrong
way" -- meaning, in any way that later becomes inconvenient to us.
What next?  Shall we break the guarantee that when read() returns, the
data is already in the user's buffer?  I'm sure we could improve
performance if we didn't have to give that guarantee, and after all,
only old, broken, legacy, conceptually buggy programs would be abusing
the interface by relying on that. :)

> This should really be treated as an opaque handle, with no assumption on 
> its value.

No.  This is an assertion that you have just conjured up to suit
yourself, but it is wrong.  It does not reflect either historical UNIX
practice or what is specified in POSIX.  Application writers are
perfectly justified in relying on getting the lowest-numbered unused
file descriptor.

If you don't think we should be bound by POSIX, then you are perfectly
free to go off and write your own research kernel with whatever
interface you want, and no programs available to run on it. :)

Paul.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  3:48                                                                         ` Linus Torvalds
  2007-06-10  4:00                                                                           ` Al Viro
  2007-06-10  4:45                                                                           ` dean gaudet
@ 2007-06-10  6:35                                                                           ` Kari Hurtta
  2007-06-10 15:21                                                                             ` Alan Cox
  2 siblings, 1 reply; 129+ messages in thread
From: Kari Hurtta @ 2007-06-10  6:35 UTC (permalink / raw)
  To: linux-kernel

Linus Torvalds <torvalds@linux-foundation.org> writes in gmane.linux.kernel:

<...>
> But let's say that you want to do a library that does name resolution, and 
> you actually want to create the socket that binds to the DNS server just 
> once, and then re-use that socket across library calls. It's not that the 
> library is a thread of its own - it's not - but with the normal linear fd 
> space it really cannot do this. Sure, it could try to hide it up somewhere 
> in high fd space, but that would slow down other operations, and there's 
> no way to guarantee it doesn't clash with some _other_ library doing the 
> same thing, so it really isn't a good idea.
> 
> Now, when you do a DNS query, the setup cost of opening the socket is the 
> least of your worries, so the above example is not a very good one. I'm 
> really just giving it as a concrete example of a _conceptual_ problem, 
> where some other library really had more pressing performance reasons why 
> they cannot keep re-opening a file descriptor and closing it each time.
> 
> So _that_ is the kind of situation where I think "anonymous file 
> descriptors" make sense. 
> 
> 		Linus

Real world example is nss_ldap / pam_ldap -- these needs open socket to 
ldap server.  That socket is cached. And because they can not trust that 
application does not have closed file description of them, they check it with 
getpeername + getsockname  (at least it did when I looked code on
some years ago.)   

( opening socket again includes using starttls and  authentication  .. so it is 
  quite some overhead )

/ Kari Hurtta


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 14:38                                   ` Kyle Moffett
@ 2007-06-10  6:48                                     ` Paul Mackerras
  2007-06-10 15:56                                       ` Davide Libenzi
  2007-06-10 19:16                                       ` Davide Libenzi
  0 siblings, 2 replies; 129+ messages in thread
From: Paul Mackerras @ 2007-06-10  6:48 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: Davide Libenzi, Alan Cox, Theodore Tso, Ulrich Drepper,
	Eric Dumazet, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

Kyle Moffett writes:

> 1)  Linear FD allocation makes it IMPOSSIBLE for libraries to  
> reliably use persistent FDs behind an application's back.  For  

That's not completely true; for example, openlog() opens a file
descriptor for the library's own use, as does sethostent().  I agree
that it creates difficulties if the library implementor wants to use a
file descriptor in a set of functions that didn't previously use one,
but with a bit of assistance from the kernel, that can be solved
without breaking the ABI.

> for (i = 0; i < NR_OPEN; i++)
> 	if (!fd_is_special_to_us(i))
> 		close(i);
> 
> Note that this is conceptually buggy, but occurs in several major C  
> programming books, most of the major shells, and a lot of other  
> software to boot.

Buggy in what way?  In the use of the NR_OPEN constant?

> 3) In order to allocate new FDs, the kernel has to scan over a  
> (potentially very large) bitmap.  A process with 100,000 fds (not  
> terribly uncommon) would have 12.5kbyte of FD bitmap and would trash  
> the cache every time it tried to allocate an FD.

For specialized programs like that we can offer alternative fd
allocation strategies if necessary (although I expect that with
100,000 fds other things will limit performance more than
allocation).

None of those things is an excuse for breaking the ABI, however.
As I said to Davide, I was really protesting about the attitude that
we can just break the ABI however and whenever we like and force
programs to adapt.

Paul.

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  6:26                                     ` Paul Mackerras
@ 2007-06-10  7:10                                       ` William Lee Irwin III
  2007-06-10 15:52                                       ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: William Lee Irwin III @ 2007-06-10  7:10 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Davide Libenzi, Alan Cox, Theodore Tso, Ulrich Drepper,
	Eric Dumazet, Kyle Moffett, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Sun, Jun 10, 2007 at 04:26:07PM +1000, Paul Mackerras wrote:
> If you don't think we should be bound by POSIX, then you are perfectly
> free to go off and write your own research kernel with whatever
> interface you want, and no programs available to run on it. :)

This isn't fair to research kernels. Breaking applications is not an
active area of research.


-- wli

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-09 20:21                                                                     ` Linus Torvalds
                                                                                         ` (2 preceding siblings ...)
  2007-06-10  3:19                                                                       ` Al Viro
@ 2007-06-10  9:14                                                                       ` Eric Dumazet
  2007-06-10 15:16                                                                         ` Alan Cox
  2007-06-10 18:19                                                                         ` Linus Torvalds
  3 siblings, 2 replies; 129+ messages in thread
From: Eric Dumazet @ 2007-06-10  9:14 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

Linus Torvalds a écrit :
> (And dammit, that _is_ a *real*issue*. No races necessary, no NR_OPEN 
> iterations, no even *halfway* suspect code. It's perfectly fine to do
> 
> 	close(0);
> 	close(1);
> 	close(2);
> 	.. generate filenames, whatever ..
> 	if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> 		die("Couldn't redirect stdin/stdout/stderr");
> 
> and there's absolutely nothing wrong with this kind of setup, even if you 
> could obviously have done it other ways too (ie by using "dup2()" instead 
> of "close + open"),
> 

This kind of setup was OK 25 years ago, before multithreading era.
You cannot reasonably expect it to work in a multithreaded program.

Anyway, I would like to give an alternative idea of the double fdmap, and 
probably more *secure* .

Current fd API mandates integers (32 bits)

Lot of broken code consider a fd must be >= 0, so we currently are limited to 
31 bits.

With NR_OPEN = 1024*1024 = 2^20, that give us 11 bits that we could use as a 
signature.

That is, we could use O_NONSEQ as a indication to kernel to give us a 
composite fd : 20 low order bits give the slot in file table, then 11 bits can 
  be use to make sure the fd was not stolen by malicious code.


Legacy app, (without O_NONSEQ in flags) would get POSIX compatables fd in [0, 
2^20-1] range, with the lowest available fd.

If O_NONSEQ is given, kernel is free to give an fd in [0, 2^31 - 1], with a 
strategy that could be the one Davide gave in its patch (with a list of 
available slots). But instead of FIFO, we can use now LIFO, more cache friendly.

In fget()/fget_light()/close(), we can then use 20 bits to select the slot in 
the single fdmap.
And 11 bits to check the 'signature'.

So if open( O_NONSEQFD) gave us 0x77000010, we cannot do close(0x10) or 
read(0x10, ....)

Storage for these bits is already there in Davide fd_slot structure, where we 
currently use one long to store 3 bits 'only'.

This should work even bumping NR_OPEN to say... 8*1024*1024, and 8 bits signature.


^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  9:14                                                                       ` Eric Dumazet
@ 2007-06-10 15:16                                                                         ` Alan Cox
  2007-06-10 18:19                                                                         ` Linus Torvalds
  1 sibling, 0 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-10 15:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Al Viro, Kyle Moffett, Ulrich Drepper,
	Davide Libenzi, Theodore Tso, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar

> > and there's absolutely nothing wrong with this kind of setup, even if you 
> > could obviously have done it other ways too (ie by using "dup2()" instead 
> > of "close + open"),
> > 
> 
> This kind of setup was OK 25 years ago, before multithreading era.
> You cannot reasonably expect it to work in a multithreaded program.

Why not.

When execution begins which is the normal point you do this then you've
got one thread. If you need to do this from a thread after that point
posix provides threaded applications with locking.

Not much else works in a threaded app if you get the locking wrong, and
that is considered the authors job. Why is fd allocation different ?

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  6:35                                                                           ` Kari Hurtta
@ 2007-06-10 15:21                                                                             ` Alan Cox
  0 siblings, 0 replies; 129+ messages in thread
From: Alan Cox @ 2007-06-10 15:21 UTC (permalink / raw)
  To: Kari Hurtta; +Cc: linux-kernel

> Real world example is nss_ldap / pam_ldap -- these needs open socket to 
> ldap server.  That socket is cached. And because they can not trust that 
> application does not have closed file description of them, they check it with 
> getpeername + getsockname  (at least it did when I looked code on
> some years ago.)   
> 
> ( opening socket again includes using starttls and  authentication  .. so it is 
>   quite some overhead )

And if the fd was closed because of a security transition in the
application hiding it and caching it from the application might then lead
to a security hole.

Alan

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  6:26                                     ` Paul Mackerras
  2007-06-10  7:10                                       ` William Lee Irwin III
@ 2007-06-10 15:52                                       ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-10 15:52 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Alan Cox, Theodore Tso, Ulrich Drepper, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

On Sun, 10 Jun 2007, Paul Mackerras wrote:

> What I am objecting to is this idea that many kernel developers seem
> to have, that if there is some aspect of the kernel/user API that
> becomes a bit inconvenient for the kernel to implement, then we can
> put the blame on the applications that rely on that aspect, call them
> names such as "legacy", "abuser", "conceptually buggy", "broken",
> etc., and ultimately justify breaking the ABI -- since it's only those
> applications that we have demonised that will be affected, after all.
> 
> In a way your response quoted above illustrates this perfectly.  If we
> give guarantees then people will start relying on them "in the wrong
> way" -- meaning, in any way that later becomes inconvenient to us.
> What next?  Shall we break the guarantee that when read() returns, the
> data is already in the user's buffer?  I'm sure we could improve
> performance if we didn't have to give that guarantee, and after all,
> only old, broken, legacy, conceptually buggy programs would be abusing
> the interface by relying on that. :)

This fds will be out of POSIX definition in any case. They have to be 
based at very high values in any case in order to be useful, and this 
already breaks the POSIX definition of them.



> > This should really be treated as an opaque handle, with no assumption on 
> > its value.
> 
> No.  This is an assertion that you have just conjured up to suit
> yourself, but it is wrong.  It does not reflect either historical UNIX
> practice or what is specified in POSIX.  Application writers are
> perfectly justified in relying on getting the lowest-numbered unused
> file descriptor.

You seem to deny the fact that libraries cannot reliably use fds allocated 
by POSIX rules for internal communication with the kernel. I really don't 
know what to say, if not that we must have a different definition of the 
word "reliable".
Maybe if you go back in the thread and give your solutions, using POSIX 
fds allocation semantics, to the problems that have been exposed, that 
might help understanding.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  6:48                                     ` Paul Mackerras
@ 2007-06-10 15:56                                       ` Davide Libenzi
  2007-06-10 19:16                                       ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-10 15:56 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Kyle Moffett, Davide Libenzi, Alan Cox, Theodore Tso,
	Ulrich Drepper, Eric Dumazet, Linux Kernel Mailing List,
	Linus Torvalds, Andrew Morton, Ingo Molnar

On Sun, 10 Jun 2007, Paul Mackerras wrote:

> > for (i = 0; i < NR_OPEN; i++)
> > 	if (!fd_is_special_to_us(i))
> > 		close(i);
> > 
> > Note that this is conceptually buggy, but occurs in several major C  
> > programming books, most of the major shells, and a lot of other  
> > software to boot.
> 
> Buggy in what way?  In the use of the NR_OPEN constant?

That is typical daemonize code done by quite a few apps, and even 
described in many unix network programming books. Now say that before that 
you loaded a library that had an internal fd allocated to get some sort of 
services from the kernel (say an epoll fd). What happens when the library 
uses the cached apoll fd after that loop?



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  5:46                                                                               ` Al Viro
@ 2007-06-10 17:23                                                                                 ` Linus Torvalds
  0 siblings, 0 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-10 17:23 UTC (permalink / raw)
  To: Al Viro
  Cc: dean gaudet, Kyle Moffett, Ulrich Drepper, Davide Libenzi,
	Alan Cox, Theodore Tso, Eric Dumazet, Linux Kernel Mailing List,
	Andrew Morton, Ingo Molnar



On Sun, 10 Jun 2007, Al Viro wrote:
> 
> On FreeBSD it will simply give you zero st_ino on almost all sockets;
> AF_UNIX ones get st_ino invented (and stable).  st_dev is NODEV in all
> cases...

So it will still work better than trying to do a "getsockname()" or 
something. If the file descriptor really _has_ been changed, it might give 
a false negative (somebody replaced it with *another* socket), but on the 
other hand, that's much better than a false positive. And if somebody 
replaced the socket with a real file descriptor, it will catch it..

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  9:14                                                                       ` Eric Dumazet
  2007-06-10 15:16                                                                         ` Alan Cox
@ 2007-06-10 18:19                                                                         ` Linus Torvalds
  1 sibling, 0 replies; 129+ messages in thread
From: Linus Torvalds @ 2007-06-10 18:19 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Al Viro, Kyle Moffett, Ulrich Drepper, Davide Libenzi, Alan Cox,
	Theodore Tso, Linux Kernel Mailing List, Andrew Morton,
	Ingo Molnar

[-- Attachment #1: Type: TEXT/PLAIN, Size: 793 bytes --]



On Sun, 10 Jun 2007, Eric Dumazet wrote:

> Linus Torvalds a écrit :
> > 
> > 	close(0);
> > 	close(1);
> > 	close(2);
> > 	.. generate filenames, whatever ..
> > 	if (open(..) < 0 || open(..) < 0 || open(..) < 0)
> > 		die("Couldn't redirect stdin/stdout/stderr");
> > 
> > and there's absolutely nothing wrong with this kind of setup, even if you
> > could obviously have done it other ways too (ie by using "dup2()" instead of
> > "close + open"),
> 
> This kind of setup was OK 25 years ago, before multithreading era.
> You cannot reasonably expect it to work in a multithreaded program.

Who said _anything_ about threading?

The above is a totally non-threaded app. 99% of all applications are like 
that.

Threading is for the 1%. But 99% is what we need to make sure works.

		Linus

^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-10  6:48                                     ` Paul Mackerras
  2007-06-10 15:56                                       ` Davide Libenzi
@ 2007-06-10 19:16                                       ` Davide Libenzi
  1 sibling, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-10 19:16 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Kyle Moffett, Alan Cox, Linus Torvalds, Theodore Tso,
	Ulrich Drepper, Eric Dumazet, Linux Kernel Mailing List

On Sun, 10 Jun 2007, Paul Mackerras wrote:

> > for (i = 0; i < NR_OPEN; i++)
> > 	if (!fd_is_special_to_us(i))
> > 		close(i);
> > 
> > Note that this is conceptually buggy, but occurs in several major C  
> > programming books, most of the major shells, and a lot of other  
> > software to boot.
> 
> Buggy in what way?  In the use of the NR_OPEN constant?

That is buggy because the code assumes it is the only owner of the fd 
space. Thing that is not true if runtimes have opened their own file 
descriptors to handle their own links to the kernel.
The syslog case is kinda bogus. For certain things, you can afford a retry 
policy (assuming somone else did not open another fd over the fd that you 
cached - then it's gonna be fun). It is crappy, but it *may* work. Sure 
is, does not fit any definition of "reliable" I'm aware of.
Think about a runtime that had an epoll fd open and a few fds dropped into 
it to, say, deliver some sort of events to the application. After the 
Close Loop of Death, the whole thing is gone.
As is, runtimes cannot reliably use (cached) fds for their own internal 
communication with the kernel. That, I think we can agree it is a fact.
We can then say that we do not care if runtimes cannot reliably use fds, 
and move on. But it's hard to say that the problem does not exist.



- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-08 19:30                                     ` Alan Cox
  2007-06-08 19:37                                       ` Davide Libenzi
@ 2007-06-11  8:24                                       ` Xavier Bestel
  1 sibling, 0 replies; 129+ messages in thread
From: Xavier Bestel @ 2007-06-11  8:24 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, Davide Libenzi, Theodore Tso, Eric Dumazet,
	Kyle Moffett, Linux Kernel Mailing List, Linus Torvalds,
	Andrew Morton, Ingo Molnar

On Fri, 2007-06-08 at 20:30 +0100, Alan Cox wrote:
> > To avoid exactly the kind of problem we have now in future: programs
> > relying on specific patterns.
> 
> Which you seem to think is a bad thing, yet is actually a very good thing
> because it means that crashes are repeatable and problems are debuggable
> from end user reports.

You can have both.
Look at malloc(): when you write your program you can't really guess
which address will be returned by a malloc() call, but you know that if
you launch it twice and if it has the same input, malloc()'s behavior is
repeatable so it's debuggable.

	Xav



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07 10:39     ` [patch 7/8] fdmap v2 - implement sys_socket2 Eric Dumazet
@ 2007-06-07 15:42       ` Davide Libenzi
  0 siblings, 0 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-07 15:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar

On Thu, 7 Jun 2007, Eric Dumazet wrote:

> Davide Libenzi <davidel@xmailserver.org> wrote:
> 
> > The sys_accept() system call has been modified to return a file
> > descriptor inside the non-sequential area, if the listening fd is.
> 
> > -   newfd = sock_alloc_fd(&newfile);
> > +   newfd = sock_alloc_fd(&newfile,
> > +         fd > current->signal->rlim[RLIMIT_NOFILE].rlim_cur ? O_NONSEQFD: 0);
> 
> This will break apps that change/downgrade their rlimit (after getting a high fd listen socket)
> Yes probably insane, but who knows...
> 
> sock = socket(...);
> bind(...);
> listen(sock, backlog); ...
> fd = dup2(sock, 1023);
> close(sock);
> 
> setrlimit( RLIMIT_NOFILE, rlim.rlim_cur = 256);
> ...
> while ((newsock = accept(fd, ...)) != -1) {
>      fork();...
>      Plain legacy code, expecting newsock being *small*
>      FD_SET(newsock , &rd_set);
>      ...oops... fd is too large to fit in fd_set
>      select(newsock + 1, &rd_set, ...);
>      }
> 
> 
> So you might change logic to straight :
> 
> newfd = sock_alloc_fd(&newfile, (fd >= FDMAP_NONSEQ_BASE) ? O_NONSEQFD: 0);

Yes, that makes perfectly sense to me.


- Davide



^ permalink raw reply	[flat|nested] 129+ messages in thread

* Re: [patch 7/8] fdmap v2 - implement sys_socket2
  2007-06-07  7:10   ` Davide Libenzi
@ 2007-06-07 10:39     ` Eric Dumazet
  2007-06-07 15:42       ` Davide Libenzi
  0 siblings, 1 reply; 129+ messages in thread
From: Eric Dumazet @ 2007-06-07 10:39 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
	Ulrich Drepper, Ingo Molnar

Davide Libenzi <davidel@xmailserver.org> wrote:

> The sys_accept() system call has been modified to return a file
> descriptor inside the non-sequential area, if the listening fd is.

> -   newfd = sock_alloc_fd(&newfile);
> +   newfd = sock_alloc_fd(&newfile,
> +         fd > current->signal->rlim[RLIMIT_NOFILE].rlim_cur ? O_NONSEQFD: 0);

This will break apps that change/downgrade their rlimit (after getting a high fd listen socket)
Yes probably insane, but who knows...

sock = socket(...);
bind(...);
listen(sock, backlog); ...
fd = dup2(sock, 1023);
close(sock);

setrlimit( RLIMIT_NOFILE, rlim.rlim_cur = 256);
...
while ((newsock = accept(fd, ...)) != -1) {
     fork();...
     Plain legacy code, expecting newsock being *small*
     FD_SET(newsock , &rd_set);
     ...oops... fd is too large to fit in fd_set
     select(newsock + 1, &rd_set, ...);
     }


So you might change logic to straight :

newfd = sock_alloc_fd(&newfile, (fd >= FDMAP_NONSEQ_BASE) ? O_NONSEQFD: 0);


^ permalink raw reply	[flat|nested] 129+ messages in thread

end of thread, other threads:[~2007-06-11  8:25 UTC | newest]

Thread overview: 129+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-06-06 22:30 [patch 7/8] fdmap v2 - implement sys_socket2 Davide Libenzi
2007-06-06 22:44 ` David Miller
2007-06-06 22:52   ` Davide Libenzi
2007-06-06 22:57     ` David Miller
2007-06-06 22:57   ` Ulrich Drepper
2007-06-06 23:02     ` David Miller
2007-06-06 22:59 ` Alan Cox
2007-06-06 22:58   ` Ulrich Drepper
2007-06-06 23:04   ` Davide Libenzi
2007-06-06 23:08     ` David Miller
2007-06-06 23:19     ` Alan Cox
2007-06-06 23:22       ` Ulrich Drepper
2007-06-07 10:04         ` Alan Cox
2007-06-07 11:59           ` Kyle Moffett
2007-06-07 13:12             ` Eric Dumazet
2007-06-07 15:51               ` Davide Libenzi
2007-06-07 19:49               ` Davide Libenzi
2007-06-07 20:02                 ` Ulrich Drepper
2007-06-07 20:05                 ` Eric Dumazet
2007-06-07 20:18                   ` Ulrich Drepper
2007-06-07 21:44                     ` Davide Libenzi
2007-06-07 22:03                       ` Ulrich Drepper
2007-06-07 22:40                         ` Davide Libenzi
2007-06-08 12:07                           ` Theodore Tso
2007-06-08 13:01                             ` Alan Cox
2007-06-08 18:11                               ` Davide Libenzi
2007-06-08 18:26                                 ` Alan Cox
2007-06-08 18:43                                   ` Ulrich Drepper
2007-06-08 18:46                                     ` Al Viro
2007-06-08 18:56                                       ` Ulrich Drepper
2007-06-08 19:07                                         ` Linus Torvalds
2007-06-08 19:21                                           ` Davide Libenzi
2007-06-09  0:03                                             ` Linus Torvalds
2007-06-09  0:13                                               ` Davide Libenzi
2007-06-09  0:36                                               ` Al Viro
2007-06-09  1:19                                                 ` Ulrich Drepper
2007-06-09  1:41                                                   ` Al Viro
2007-06-09  2:10                                                     ` Ulrich Drepper
2007-06-09 15:15                                                       ` Al Viro
2007-06-09 16:26                                                         ` Ulrich Drepper
2007-06-09 16:54                                                           ` Al Viro
2007-06-09 17:04                                                             ` Davide Libenzi
2007-06-09 17:08                                                               ` Davide Libenzi
2007-06-09 17:08                                                             ` Ulrich Drepper
2007-06-09 17:24                                                               ` Al Viro
2007-06-09 19:27                                                                 ` Kyle Moffett
2007-06-09 20:06                                                                   ` Al Viro
2007-06-09 20:21                                                                     ` Linus Torvalds
2007-06-09 20:31                                                                       ` Davide Libenzi
2007-06-09 21:41                                                                         ` Matt Mackall
2007-06-09 22:12                                                                           ` Davide Libenzi
2007-06-09 20:49                                                                       ` Al Viro
2007-06-09 21:55                                                                         ` Matt Mackall
2007-06-09 23:33                                                                         ` Linus Torvalds
2007-06-10  3:35                                                                           ` Davide Libenzi
2007-06-10  3:49                                                                             ` Davide Libenzi
2007-06-10  3:19                                                                       ` Al Viro
2007-06-10  3:48                                                                         ` Linus Torvalds
2007-06-10  4:00                                                                           ` Al Viro
2007-06-10  4:03                                                                             ` Linus Torvalds
2007-06-10  4:06                                                                               ` Al Viro
2007-06-10  4:45                                                                           ` dean gaudet
2007-06-10  5:06                                                                             ` Linus Torvalds
2007-06-10  5:46                                                                               ` Al Viro
2007-06-10 17:23                                                                                 ` Linus Torvalds
2007-06-10  6:35                                                                           ` Kari Hurtta
2007-06-10 15:21                                                                             ` Alan Cox
2007-06-10  9:14                                                                       ` Eric Dumazet
2007-06-10 15:16                                                                         ` Alan Cox
2007-06-10 18:19                                                                         ` Linus Torvalds
2007-06-10  2:40                                                                   ` Al Viro
2007-06-08 19:34                                         ` Alan Cox
2007-06-08 19:30                                     ` Alan Cox
2007-06-08 19:37                                       ` Davide Libenzi
2007-06-08 19:48                                         ` Alan Cox
2007-06-08 19:51                                           ` Davide Libenzi
2007-06-08 21:24                                             ` Alan Cox
2007-06-08 21:59                                               ` Davide Libenzi
2007-06-08 22:28                                                 ` Alan Cox
2007-06-08 22:38                                                   ` Davide Libenzi
2007-06-11  8:24                                       ` Xavier Bestel
2007-06-08 19:22                                   ` Davide Libenzi
2007-06-09  5:41                                 ` Paul Mackerras
2007-06-09 14:38                                   ` Kyle Moffett
2007-06-10  6:48                                     ` Paul Mackerras
2007-06-10 15:56                                       ` Davide Libenzi
2007-06-10 19:16                                       ` Davide Libenzi
2007-06-09 17:00                                   ` Davide Libenzi
2007-06-10  6:26                                     ` Paul Mackerras
2007-06-10  7:10                                       ` William Lee Irwin III
2007-06-10 15:52                                       ` Davide Libenzi
2007-06-08 18:07                             ` Davide Libenzi
2007-06-08 18:35                             ` Linus Torvalds
2007-06-07 21:57                   ` Davide Libenzi
2007-06-08  4:38                     ` Eric Dumazet
2007-06-08  5:20                       ` Davide Libenzi
2007-06-07 14:25           ` Ulrich Drepper
2007-06-07 17:56             ` Eric Dumazet
2007-06-07 18:03               ` Davide Libenzi
2007-06-07 18:57                 ` Eric Dumazet
2007-06-07 18:26               ` Ulrich Drepper
2007-06-07 18:39                 ` Davide Libenzi
2007-06-07 18:56                   ` Ulrich Drepper
2007-06-07 19:12                     ` Davide Libenzi
2007-06-07 20:03                   ` Andrew Morton
2007-06-08  2:55                     ` Ulrich Drepper
2007-06-08  5:16                       ` Davide Libenzi
2007-06-06 23:29       ` Davide Libenzi
2007-06-07 10:06         ` Alan Cox
2007-06-07 10:45           ` Eric Dumazet
2007-06-07 11:27             ` Alan Cox
2007-06-07 15:41           ` Davide Libenzi
2007-06-07 20:10   ` Linus Torvalds
2007-06-07 20:47     ` Eric Dumazet
2007-06-07 21:08       ` Linus Torvalds
2007-06-07 21:41         ` Davide Libenzi
2007-06-07 20:59     ` Guillaume Chazarain
2007-06-07 21:06       ` Guillaume Chazarain
2007-06-07 21:31     ` Ulrich Drepper
2007-06-07 22:22     ` Davide Libenzi
2007-06-07 23:42       ` Linus Torvalds
2007-06-08  0:04         ` Davide Libenzi
2007-06-08  0:59     ` Matt Mackall
2007-06-08  2:25       ` Linus Torvalds
2007-06-08 15:56     ` Jeff Dike
2007-06-07  0:29 ` Arnd Bergmann
2007-06-07  0:33   ` Davide Libenzi
2007-06-06 22:30 [patch 1/8] fdmap v2 - fdmap core Davide Libenzi
2007-06-07  6:54 ` Eric Dumazet
2007-06-07  7:10   ` Davide Libenzi
2007-06-07 10:39     ` [patch 7/8] fdmap v2 - implement sys_socket2 Eric Dumazet
2007-06-07 15:42       ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).