linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
@ 2020-10-13 14:06 Giuseppe Scrivano
  2020-10-13 14:06 ` [PATCH 1/2] " Giuseppe Scrivano
  2020-10-13 14:06 ` [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
  0 siblings, 2 replies; 11+ messages in thread
From: Giuseppe Scrivano @ 2020-10-13 14:06 UTC (permalink / raw)
  To: linux-kernel; +Cc: viro, linux-fsdevel, christian.brauner, containers

When the new flag is used, close_range will set the close-on-exec bit
for the file descriptors instead of close()-ing them.

It is useful for e.g. container runtimes that want to minimize the
number of syscalls used after a seccomp profile is installed but want
to keep some fds open until the container process is executed.

Giuseppe Scrivano (2):
  fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  selftests: add tests for CLOSE_RANGE_CLOEXEC

 fs/file.c                                     | 56 +++++++++++++------
 include/uapi/linux/close_range.h              |  3 +
 .../testing/selftests/core/close_range_test.c | 44 +++++++++++++++
 3 files changed, 86 insertions(+), 17 deletions(-)

-- 
2.26.2


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 14:06 [PATCH 0/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
@ 2020-10-13 14:06 ` Giuseppe Scrivano
  2020-10-13 20:54   ` Christian Brauner
  2020-10-13 21:09   ` Al Viro
  2020-10-13 14:06 ` [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
  1 sibling, 2 replies; 11+ messages in thread
From: Giuseppe Scrivano @ 2020-10-13 14:06 UTC (permalink / raw)
  To: linux-kernel; +Cc: viro, linux-fsdevel, christian.brauner, containers

When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
immediately close the files but it sets the close-on-exec bit.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 fs/file.c                        | 56 ++++++++++++++++++++++----------
 include/uapi/linux/close_range.h |  3 ++
 2 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..ad4ebee41e09 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+static unsigned int __get_max_fds(struct files_struct *cur_fds)
+{
+	unsigned int max_fds;
+
+	rcu_read_lock();
+	/* cap to last valid index into fdtable */
+	max_fds = files_fdtable(cur_fds)->max_fds;
+	rcu_read_unlock();
+	return max_fds;
+}
+
 /**
  * __close_range() - Close all file descriptors in a given range.
  *
@@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
  */
 int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 {
-	unsigned int cur_max;
+	unsigned int cur_max = UINT_MAX;
 	struct task_struct *me = current;
 	struct files_struct *cur_fds = me->files, *fds = NULL;
 
-	if (flags & ~CLOSE_RANGE_UNSHARE)
+	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
 		return -EINVAL;
 
 	if (fd > max_fd)
 		return -EINVAL;
 
-	rcu_read_lock();
-	cur_max = files_fdtable(cur_fds)->max_fds;
-	rcu_read_unlock();
-
-	/* cap to last valid index into fdtable */
-	cur_max--;
-
 	if (flags & CLOSE_RANGE_UNSHARE) {
 		int ret;
 		unsigned int max_unshare_fds = NR_OPEN_MAX;
 
+		/* cap to last valid index into fdtable */
+		cur_max = __get_max_fds(cur_fds) - 1;
+
 		/*
 		 * If the requested range is greater than the current maximum,
 		 * we're closing everything so only copy all file descriptors
@@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 			swap(cur_fds, fds);
 	}
 
-	max_fd = min(max_fd, cur_max);
-	while (fd <= max_fd) {
-		struct file *file;
+	if (flags & CLOSE_RANGE_CLOEXEC) {
+		struct fdtable *fdt;
 
-		file = pick_file(cur_fds, fd++);
-		if (!file)
-			continue;
+		spin_lock(&cur_fds->file_lock);
+		fdt = files_fdtable(cur_fds);
+		cur_max = fdt->max_fds - 1;
+		max_fd = min(max_fd, cur_max);
+		while (fd <= max_fd)
+			__set_close_on_exec(fd++, fdt);
+		spin_unlock(&cur_fds->file_lock);
+	} else {
+		/* Initialize cur_max if needed.  */
+		if (cur_max == UINT_MAX)
+			cur_max = __get_max_fds(cur_fds) - 1;
+		max_fd = min(max_fd, cur_max);
+		while (fd <= max_fd) {
+			struct file *file;
 
-		filp_close(file, cur_fds);
-		cond_resched();
+			file = pick_file(cur_fds, fd++);
+			if (!file)
+				continue;
+
+			filp_close(file, cur_fds);
+			cond_resched();
+		}
 	}
 
 	if (fds) {
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
index 6928a9fdee3c..2d804281554c 100644
--- a/include/uapi/linux/close_range.h
+++ b/include/uapi/linux/close_range.h
@@ -5,5 +5,8 @@
 /* Unshare the file descriptor table before closing file descriptors. */
 #define CLOSE_RANGE_UNSHARE	(1U << 1)
 
+/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */
+#define CLOSE_RANGE_CLOEXEC	(1U << 2)
+
 #endif /* _UAPI_LINUX_CLOSE_RANGE_H */
 
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC
  2020-10-13 14:06 [PATCH 0/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
  2020-10-13 14:06 ` [PATCH 1/2] " Giuseppe Scrivano
@ 2020-10-13 14:06 ` Giuseppe Scrivano
  2020-10-13 15:22   ` David Laight
  1 sibling, 1 reply; 11+ messages in thread
From: Giuseppe Scrivano @ 2020-10-13 14:06 UTC (permalink / raw)
  To: linux-kernel; +Cc: viro, linux-fsdevel, christian.brauner, containers

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 .../testing/selftests/core/close_range_test.c | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index c99b98b0d461..b8789262cd7d 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -23,6 +23,10 @@
 #define CLOSE_RANGE_UNSHARE	(1U << 1)
 #endif
 
+#ifndef CLOSE_RANGE_CLOEXEC
+#define CLOSE_RANGE_CLOEXEC	(1U << 2)
+#endif
+
 static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
 				  unsigned int flags)
 {
@@ -224,4 +228,44 @@ TEST(close_range_unshare_capped)
 	EXPECT_EQ(0, WEXITSTATUS(status));
 }
 
+TEST(close_range_cloexec)
+{
+	int i, ret;
+	int open_fds[101];
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				XFAIL(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	EXPECT_EQ(-1, sys_close_range(open_fds[0], open_fds[100], -1)) {
+		if (errno == ENOSYS)
+			XFAIL(return, "close_range() syscall not supported");
+	}
+
+	EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], CLOSE_RANGE_CLOEXEC));
+
+	for (i = 0; i <= 50; i++) {
+		int flags = fcntl(open_fds[i], F_GETFD);
+
+		EXPECT_GT(flags, -1);
+		EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC);
+	}
+
+	for (i = 51; i <= 100; i++) {
+		int flags = fcntl(open_fds[i], F_GETFD);
+
+		EXPECT_GT(flags, -1);
+		EXPECT_EQ(flags & FD_CLOEXEC, 0);
+	}
+}
+
+
 TEST_HARNESS_MAIN
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* RE: [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC
  2020-10-13 14:06 ` [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
@ 2020-10-13 15:22   ` David Laight
  0 siblings, 0 replies; 11+ messages in thread
From: David Laight @ 2020-10-13 15:22 UTC (permalink / raw)
  To: 'Giuseppe Scrivano', linux-kernel
  Cc: viro, linux-fsdevel, christian.brauner, containers

> Subject: [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC

You really ought to check that it skips closed files.
For instance using dup2() to move some open files to 'big numbers'.

Although you know how it works, a 'black box' test would also
reduce RLIMIT_NOFILES below one of the open files and check
files above the RLIMIT_NOFILES value are affected.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 14:06 ` [PATCH 1/2] " Giuseppe Scrivano
@ 2020-10-13 20:54   ` Christian Brauner
  2020-10-13 21:04     ` Rasmus Villemoes
  2020-10-13 22:45     ` Giuseppe Scrivano
  2020-10-13 21:09   ` Al Viro
  1 sibling, 2 replies; 11+ messages in thread
From: Christian Brauner @ 2020-10-13 20:54 UTC (permalink / raw)
  To: Giuseppe Scrivano; +Cc: linux-kernel, viro, linux-fsdevel, containers

On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:

Hey Guiseppe,

Thanks for the patch!

> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
> immediately close the files but it sets the close-on-exec bit.

Hm, please expand on the use-cases a little here so people know where
and how this is useful. Keeping the rationale for a change in the commit
log is really important.

> 
> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
> ---

>  fs/file.c                        | 56 ++++++++++++++++++++++----------
>  include/uapi/linux/close_range.h |  3 ++
>  2 files changed, 42 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index 21c0893f2f1d..ad4ebee41e09 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
>  }
>  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>  
> +static unsigned int __get_max_fds(struct files_struct *cur_fds)
> +{
> +	unsigned int max_fds;
> +
> +	rcu_read_lock();
> +	/* cap to last valid index into fdtable */
> +	max_fds = files_fdtable(cur_fds)->max_fds;
> +	rcu_read_unlock();
> +	return max_fds;
> +}
> +
>  /**
>   * __close_range() - Close all file descriptors in a given range.
>   *
> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>   */
>  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>  {
> -	unsigned int cur_max;
> +	unsigned int cur_max = UINT_MAX;
>  	struct task_struct *me = current;
>  	struct files_struct *cur_fds = me->files, *fds = NULL;
>  
> -	if (flags & ~CLOSE_RANGE_UNSHARE)
> +	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
>  		return -EINVAL;
>  
>  	if (fd > max_fd)
>  		return -EINVAL;
>  
> -	rcu_read_lock();
> -	cur_max = files_fdtable(cur_fds)->max_fds;
> -	rcu_read_unlock();
> -
> -	/* cap to last valid index into fdtable */
> -	cur_max--;
> -
>  	if (flags & CLOSE_RANGE_UNSHARE) {
>  		int ret;
>  		unsigned int max_unshare_fds = NR_OPEN_MAX;
>  
> +		/* cap to last valid index into fdtable */
> +		cur_max = __get_max_fds(cur_fds) - 1;
> +
>  		/*
>  		 * If the requested range is greater than the current maximum,
>  		 * we're closing everything so only copy all file descriptors
> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>  			swap(cur_fds, fds);
>  	}
>  
> -	max_fd = min(max_fd, cur_max);
> -	while (fd <= max_fd) {
> -		struct file *file;
> +	if (flags & CLOSE_RANGE_CLOEXEC) {
> +		struct fdtable *fdt;
>  
> -		file = pick_file(cur_fds, fd++);
> -		if (!file)
> -			continue;
> +		spin_lock(&cur_fds->file_lock);
> +		fdt = files_fdtable(cur_fds);
> +		cur_max = fdt->max_fds - 1;
> +		max_fd = min(max_fd, cur_max);
> +		while (fd <= max_fd)
> +			__set_close_on_exec(fd++, fdt);
> +		spin_unlock(&cur_fds->file_lock);
> +	} else {
> +		/* Initialize cur_max if needed.  */
> +		if (cur_max == UINT_MAX)
> +			cur_max = __get_max_fds(cur_fds) - 1;

The separation between how cur_fd is retrieved in the two branches makes
the code more difficult to follow imho. Unless there's a clear reason
why you've done it that way I would think that something like the patch
I appended below might be a little clearer and easier to maintain(?).

> +		max_fd = min(max_fd, cur_max);
> +		while (fd <= max_fd) {
> +			struct file *file;
>  
> -		filp_close(file, cur_fds);
> -		cond_resched();
> +			file = pick_file(cur_fds, fd++);
> +			if (!file)
> +				continue;
> +
> +			filp_close(file, cur_fds);
> +			cond_resched();
> +		}
>  	}

I think I don't have quarrels with this patch in principle but I wonder
if something like the following wouldn't be easier to follow:

diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..872a4098c3be 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+static inline void __range_cloexec(struct files_struct *cur_fds,
+				   unsigned int fd, unsigned max_fd)
+{
+	struct fdtable *fdt;
+	spin_lock(&cur_fds->file_lock);
+	fdt = files_fdtable(cur_fds);
+	while (fd <= max_fd)
+		__set_close_on_exec(fd++, fdt);
+	spin_unlock(&cur_fds->file_lock);
+}
+
+static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+				 unsigned max_fd)
+{
+	while (fd <= max_fd) {
+		struct file *file;
+
+		file = pick_file(cur_fds, fd++);
+		if (!file)
+			continue;
+
+		filp_close(file, cur_fds);
+		cond_resched();
+	}
+}
+
 /**
  * __close_range() - Close all file descriptors in a given range.
  *
@@ -687,7 +713,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	struct task_struct *me = current;
 	struct files_struct *cur_fds = me->files, *fds = NULL;
 
-	if (flags & ~CLOSE_RANGE_UNSHARE)
+	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
 		return -EINVAL;
 
 	if (fd > max_fd)
@@ -725,16 +751,10 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	}
 
 	max_fd = min(max_fd, cur_max);
-	while (fd <= max_fd) {
-		struct file *file;
-
-		file = pick_file(cur_fds, fd++);
-		if (!file)
-			continue;
-
-		filp_close(file, cur_fds);
-		cond_resched();
-	}
+	if (flags & CLOSE_RANGE_CLOEXEC)
+		__range_cloexec(cur_fds, fd, max_fd);
+	else
+		__range_close(cur_fds, fd, max_fd);
 
 	if (fds) {
 		/*
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
index 6928a9fdee3c..2d804281554c 100644
--- a/include/uapi/linux/close_range.h
+++ b/include/uapi/linux/close_range.h
@@ -5,5 +5,8 @@
 /* Unshare the file descriptor table before closing file descriptors. */
 #define CLOSE_RANGE_UNSHARE	(1U << 1)
 
+/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */
+#define CLOSE_RANGE_CLOEXEC	(1U << 2)
+
 #endif /* _UAPI_LINUX_CLOSE_RANGE_H */
 

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 20:54   ` Christian Brauner
@ 2020-10-13 21:04     ` Rasmus Villemoes
  2020-10-13 21:22       ` Christian Brauner
  2020-10-13 22:45     ` Giuseppe Scrivano
  1 sibling, 1 reply; 11+ messages in thread
From: Rasmus Villemoes @ 2020-10-13 21:04 UTC (permalink / raw)
  To: Christian Brauner, Giuseppe Scrivano
  Cc: linux-kernel, viro, linux-fsdevel, containers

On 13/10/2020 22.54, Christian Brauner wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> 
> Hey Guiseppe,
> 
> Thanks for the patch!
> 
>> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
>> immediately close the files but it sets the close-on-exec bit.
> 
> Hm, please expand on the use-cases a little here so people know where
> and how this is useful. Keeping the rationale for a change in the commit
> log is really important.
> 

> I think I don't have quarrels with this patch in principle but I wonder
> if something like the following wouldn't be easier to follow:
> 
> diff --git a/fs/file.c b/fs/file.c
> index 21c0893f2f1d..872a4098c3be 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
>  }
>  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>  
> +static inline void __range_cloexec(struct files_struct *cur_fds,
> +				   unsigned int fd, unsigned max_fd)
> +{
> +	struct fdtable *fdt;
> +	spin_lock(&cur_fds->file_lock);
> +	fdt = files_fdtable(cur_fds);
> +	while (fd <= max_fd)
> +		__set_close_on_exec(fd++, fdt);

Doesn't that want to be

  bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)

to do word-at-a-time? I assume this would mostly be called with (3, ~0U)
as arguments or something like that.

Rasmus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 14:06 ` [PATCH 1/2] " Giuseppe Scrivano
  2020-10-13 20:54   ` Christian Brauner
@ 2020-10-13 21:09   ` Al Viro
  2020-10-13 21:32     ` Christian Brauner
  2020-10-13 21:49     ` Rasmus Villemoes
  1 sibling, 2 replies; 11+ messages in thread
From: Al Viro @ 2020-10-13 21:09 UTC (permalink / raw)
  To: Giuseppe Scrivano
  Cc: linux-kernel, linux-fsdevel, christian.brauner, containers

On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> +		spin_lock(&cur_fds->file_lock);
> +		fdt = files_fdtable(cur_fds);
> +		cur_max = fdt->max_fds - 1;
> +		max_fd = min(max_fd, cur_max);
> +		while (fd <= max_fd)
> +			__set_close_on_exec(fd++, fdt);
> +		spin_unlock(&cur_fds->file_lock);

	First of all, this is an atrocious way to set all bits
in a range.  What's more, you don't want to set it for *all*
bits - only for the ones present in open bitmap.  It's probably
harmless at the moment, but let's not create interesting surprises
for the future.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 21:04     ` Rasmus Villemoes
@ 2020-10-13 21:22       ` Christian Brauner
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Brauner @ 2020-10-13 21:22 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: Giuseppe Scrivano, linux-kernel, viro, linux-fsdevel, containers

On Tue, Oct 13, 2020 at 11:04:21PM +0200, Rasmus Villemoes wrote:
> On 13/10/2020 22.54, Christian Brauner wrote:
> > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> > 
> > Hey Guiseppe,
> > 
> > Thanks for the patch!
> > 
> >> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
> >> immediately close the files but it sets the close-on-exec bit.
> > 
> > Hm, please expand on the use-cases a little here so people know where
> > and how this is useful. Keeping the rationale for a change in the commit
> > log is really important.
> > 
> 
> > I think I don't have quarrels with this patch in principle but I wonder
> > if something like the following wouldn't be easier to follow:
> > 
> > diff --git a/fs/file.c b/fs/file.c
> > index 21c0893f2f1d..872a4098c3be 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
> >  }
> >  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
> >  
> > +static inline void __range_cloexec(struct files_struct *cur_fds,
> > +				   unsigned int fd, unsigned max_fd)
> > +{
> > +	struct fdtable *fdt;
> > +	spin_lock(&cur_fds->file_lock);
> > +	fdt = files_fdtable(cur_fds);
> > +	while (fd <= max_fd)
> > +		__set_close_on_exec(fd++, fdt);
> 

(I should've warned that I just proposed this as a completely untested
brainstorm.)

> Doesn't that want to be
> 
>   bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)
> 
> to do word-at-a-time? I assume this would mostly be called with (3, ~0U)
> as arguments or something like that.

Yes, that is the common case.

Thanks Rasmus, I was unaware we had that function.

In that case I think we'd actually need sm like:
spin_lock(&cur_fds->file_lock);
fdt = files_fdtable(cur_fds);
cur_max = files_fdtable(cur_fds)->max_fds - 1;
max_fd = min(max_fd, cur_max);
bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)

so we retrieve max_fd with the spinlock held, I think.

Christian

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 21:09   ` Al Viro
@ 2020-10-13 21:32     ` Christian Brauner
  2020-10-13 21:49     ` Rasmus Villemoes
  1 sibling, 0 replies; 11+ messages in thread
From: Christian Brauner @ 2020-10-13 21:32 UTC (permalink / raw)
  To: Al Viro; +Cc: Giuseppe Scrivano, linux-kernel, linux-fsdevel, containers

On Tue, Oct 13, 2020 at 10:09:25PM +0100, Al Viro wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> > +		spin_lock(&cur_fds->file_lock);
> > +		fdt = files_fdtable(cur_fds);
> > +		cur_max = fdt->max_fds - 1;
> > +		max_fd = min(max_fd, cur_max);
> > +		while (fd <= max_fd)
> > +			__set_close_on_exec(fd++, fdt);
> > +		spin_unlock(&cur_fds->file_lock);
> 
> 	First of all, this is an atrocious way to set all bits
> in a range.  What's more, you don't want to set it for *all*

Hm, good point.

Would it make sense to just use the bitmap_set() proposal since the 3 to
~0 case is most common or to actually iterate based on the open fds?


Christian

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 21:09   ` Al Viro
  2020-10-13 21:32     ` Christian Brauner
@ 2020-10-13 21:49     ` Rasmus Villemoes
  1 sibling, 0 replies; 11+ messages in thread
From: Rasmus Villemoes @ 2020-10-13 21:49 UTC (permalink / raw)
  To: Al Viro, Giuseppe Scrivano
  Cc: linux-kernel, linux-fsdevel, christian.brauner, containers

On 13/10/2020 23.09, Al Viro wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>> +		spin_lock(&cur_fds->file_lock);
>> +		fdt = files_fdtable(cur_fds);
>> +		cur_max = fdt->max_fds - 1;
>> +		max_fd = min(max_fd, cur_max);
>> +		while (fd <= max_fd)
>> +			__set_close_on_exec(fd++, fdt);
>> +		spin_unlock(&cur_fds->file_lock);
> 
> 	First of all, this is an atrocious way to set all bits
> in a range.  What's more, you don't want to set it for *all*
> bits - only for the ones present in open bitmap.  It's probably
> harmless at the moment, but let's not create interesting surprises
> for the future.

Eh, why not? They can already be set for unallocated slots:

commit 5297908270549b734c7c2556745e2385b6d4941d
Author: Mateusz Guzik <mguzik@redhat.com>
Date:   Tue Oct 3 12:58:14 2017 +0200

    vfs: stop clearing close on exec when closing a fd

    Codepaths allocating a fd always make sure the bit is set/unset
    depending on flags, thus clearing on close is redundant.

And while we're on that subject, yours truly suggested exactly that two
years prior [1], with a follow-up [2] in 2018 to do what wasn't done in
5297908, but (still) seems like obvious micro-optimizations, given that
the close_on_exec bitmap is not maintained as a subset of the open
bitmap. Mind taking a look at [2]?

[1]
https://lore.kernel.org/lkml/1446543679-28849-1-git-send-email-linux@rasmusvillemoes.dk/t/#u
[2]
https://lore.kernel.org/lkml/20181024160159.25884-1-linux@rasmusvillemoes.dk/

Rasmus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC
  2020-10-13 20:54   ` Christian Brauner
  2020-10-13 21:04     ` Rasmus Villemoes
@ 2020-10-13 22:45     ` Giuseppe Scrivano
  1 sibling, 0 replies; 11+ messages in thread
From: Giuseppe Scrivano @ 2020-10-13 22:45 UTC (permalink / raw)
  To: Christian Brauner; +Cc: linux-kernel, viro, linux-fsdevel, containers

Christian Brauner <christian.brauner@ubuntu.com> writes:

> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>
> Hey Guiseppe,
>
> Thanks for the patch!
>
>> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
>> immediately close the files but it sets the close-on-exec bit.
>
> Hm, please expand on the use-cases a little here so people know where
> and how this is useful. Keeping the rationale for a change in the commit
> log is really important.
>
>> 
>> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
>> ---
>
>>  fs/file.c                        | 56 ++++++++++++++++++++++----------
>>  include/uapi/linux/close_range.h |  3 ++
>>  2 files changed, 42 insertions(+), 17 deletions(-)
>> 
>> diff --git a/fs/file.c b/fs/file.c
>> index 21c0893f2f1d..ad4ebee41e09 100644
>> --- a/fs/file.c
>> +++ b/fs/file.c
>> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
>>  }
>>  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>>  
>> +static unsigned int __get_max_fds(struct files_struct *cur_fds)
>> +{
>> +	unsigned int max_fds;
>> +
>> +	rcu_read_lock();
>> +	/* cap to last valid index into fdtable */
>> +	max_fds = files_fdtable(cur_fds)->max_fds;
>> +	rcu_read_unlock();
>> +	return max_fds;
>> +}
>> +
>>  /**
>>   * __close_range() - Close all file descriptors in a given range.
>>   *
>> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>>   */
>>  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>>  {
>> -	unsigned int cur_max;
>> +	unsigned int cur_max = UINT_MAX;
>>  	struct task_struct *me = current;
>>  	struct files_struct *cur_fds = me->files, *fds = NULL;
>>  
>> -	if (flags & ~CLOSE_RANGE_UNSHARE)
>> +	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
>>  		return -EINVAL;
>>  
>>  	if (fd > max_fd)
>>  		return -EINVAL;
>>  
>> -	rcu_read_lock();
>> -	cur_max = files_fdtable(cur_fds)->max_fds;
>> -	rcu_read_unlock();
>> -
>> -	/* cap to last valid index into fdtable */
>> -	cur_max--;
>> -
>>  	if (flags & CLOSE_RANGE_UNSHARE) {
>>  		int ret;
>>  		unsigned int max_unshare_fds = NR_OPEN_MAX;
>>  
>> +		/* cap to last valid index into fdtable */
>> +		cur_max = __get_max_fds(cur_fds) - 1;
>> +
>>  		/*
>>  		 * If the requested range is greater than the current maximum,
>>  		 * we're closing everything so only copy all file descriptors
>> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>>  			swap(cur_fds, fds);
>>  	}
>>  
>> -	max_fd = min(max_fd, cur_max);
>> -	while (fd <= max_fd) {
>> -		struct file *file;
>> +	if (flags & CLOSE_RANGE_CLOEXEC) {
>> +		struct fdtable *fdt;
>>  
>> -		file = pick_file(cur_fds, fd++);
>> -		if (!file)
>> -			continue;
>> +		spin_lock(&cur_fds->file_lock);
>> +		fdt = files_fdtable(cur_fds);
>> +		cur_max = fdt->max_fds - 1;
>> +		max_fd = min(max_fd, cur_max);
>> +		while (fd <= max_fd)
>> +			__set_close_on_exec(fd++, fdt);
>> +		spin_unlock(&cur_fds->file_lock);
>> +	} else {
>> +		/* Initialize cur_max if needed.  */
>> +		if (cur_max == UINT_MAX)
>> +			cur_max = __get_max_fds(cur_fds) - 1;
>
> The separation between how cur_fd is retrieved in the two branches makes
> the code more difficult to follow imho. Unless there's a clear reason
> why you've done it that way I would think that something like the patch
> I appended below might be a little clearer and easier to maintain(?).

Thanks for the review!

I've opted for this version as in the flags=CLOSE_RANGE_CLOEXEC case we
can read max_fds directly from the fds table and avoid doing it from the
RCU critical section as well.  I'll change it in favor of more readable
code.

Giuseppe


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-10-14  9:20 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-13 14:06 [PATCH 0/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
2020-10-13 14:06 ` [PATCH 1/2] " Giuseppe Scrivano
2020-10-13 20:54   ` Christian Brauner
2020-10-13 21:04     ` Rasmus Villemoes
2020-10-13 21:22       ` Christian Brauner
2020-10-13 22:45     ` Giuseppe Scrivano
2020-10-13 21:09   ` Al Viro
2020-10-13 21:32     ` Christian Brauner
2020-10-13 21:49     ` Rasmus Villemoes
2020-10-13 14:06 ` [PATCH 2/2] selftests: add tests for CLOSE_RANGE_CLOEXEC Giuseppe Scrivano
2020-10-13 15:22   ` David Laight

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).