linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2 v2] fdmap(2)
@ 2017-09-24 20:06 Alexey Dobriyan
  2017-09-24 20:08 ` [PATCH v2 2/2] pidmap(2) Alexey Dobriyan
                   ` (4 more replies)
  0 siblings, 5 replies; 31+ messages in thread
From: Alexey Dobriyan @ 2017-09-24 20:06 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey

From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>

Implement system call for bulk retrieveing of opened descriptors
in binary form.

Some daemons could use it to reliably close file descriptors
before starting. Currently they close everything upto some number
which formally is not reliable. Other natural users are lsof(1) and CRIU
(although lsof does so much in /proc that the effect is thoroughly buried).

/proc, the only way to learn anything about file descriptors may not be
available. There is unavoidable overhead associated with instantiating
3 dentries and 3 inodes and converting integers to strings and back.

Benchmark:

	N=1<<22 times
	4 opened descriptors (0, 1, 2, 3)
	opendir+readdir+closedir /proc/self/fd vs fdmap

	/proc 8.31 ± 0.37%
	fdmap 0.32 ± 0.72%


FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)

NAME
       fdmap - get open file descriptors of the process

SYNOPSIS
       long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);

DESCRIPTION
       fdmap()  writes  open  file  descriptors  of the process into buffer fd
       starting from the start descriptor. At most nfd elements  are  written.
       flags argument is reserved and must be zero.

       If pid is zero, syscall will work with the current process.

RETURN VALUE
       On success, number of descriptors written is returned.  On error, -1 is
       returned, and errno is set appropriately.

ERRORS
       ESRCH  No such process.

       EACCES Permission denied.

       EFAULT Invalid fd pointer.

       EINVAL Negative start argument.

NOTES
       Glibc does not provide a wrapper for these system call; call  it  using
       syscall(2).

EXAMPLE
       The program below demonstrates fdmap() usage.

           $ ./a.out $$
           0 1 2 255

           $ ./a.out 42</dev/null 1023</dev/null
           0 1 2 42
           1023

   Program source

       #include <sys/types.h>
       #include <stdlib.h>
       #include <stdio.h>

       static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
       {
            register long r10 asm ("r10") = start;
            register long r8 asm ("r8") = flags;
            long rv;
            asm volatile (
                 "syscall"
                 : "=a" (rv)
                 : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
                 : "rcx", "r11", "cc", "memory"
            );
            return rv;
       }

       int main(int argc, char *argv[])
       {
            pid_t pid;
            int fd[4];
            unsigned int start;
            int n;

            pid = 0;
            if (argc > 1)
                 pid = atoi(argv[1]);

            start = 0;
            while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
                 unsigned int i;

                 for (i = 0; i < n; i++)
                      printf("%u ", fd[i]);
                 printf("\n");

                 start = fd[n - 1] + 1;
            }
            return 0;
       }

Linux                             2017-09-21                          FDMAP(2)

Changelog:

	CONFIG_PIDMAP option
	manpage


Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

---
 arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
 fs/Makefile                                |   2 +
 fs/fdmap.c                                 | 105 ++++++++++++++++++++
 include/linux/syscalls.h                   |   2 +
 init/Kconfig                               |   7 ++
 kernel/sys_ni.c                            |   2 +
 tools/testing/selftests/Makefile           |   1 +
 tools/testing/selftests/fdmap/.gitignore   |   1 +
 tools/testing/selftests/fdmap/Makefile     |   7 ++
 tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
 tools/testing/selftests/fdmap/fdmap.h      |  12 +++
 tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
 12 files changed, 405 insertions(+)
 create mode 100644 fs/fdmap.c
 create mode 100644 tools/testing/selftests/fdmap/.gitignore
 create mode 100644 tools/testing/selftests/fdmap/Makefile
 create mode 100644 tools/testing/selftests/fdmap/fdmap.c
 create mode 100644 tools/testing/selftests/fdmap/fdmap.h
 create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..9bfe5f79674f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	fdmap			sys_fdmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9c67b1..27476a66c18e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
 
+obj-$(CONFIG_FDMAP) += fdmap.o
+
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
diff --git a/fs/fdmap.c b/fs/fdmap.c
new file mode 100644
index 000000000000..274e6c5b7c9c
--- /dev/null
+++ b/fs/fdmap.c
@@ -0,0 +1,105 @@
+#include <linux/bitops.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+/**
+ * fdmap - get opened file descriptors of a process
+ * @pid: the pid of the target process
+ * @fds: allocated userspace buffer
+ * @count: buffer size (in descriptors)
+ * @start_fd: first descriptor to search from (inclusive)
+ * @flags: reserved for future functionality, must be zero
+ *
+ * If @pid is zero then it's current process.
+ * Return: number of descriptors written. An error code otherwise.
+ */
+SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
+		int, start_fd, int, flags)
+{
+	struct task_struct *task;
+	struct files_struct *files;
+	unsigned long search_mask;
+	unsigned int user_index, offset;
+	int masksize;
+
+	if (start_fd < 0 || flags != 0)
+		return -EINVAL;
+
+	if (!pid) {
+		files = get_files_struct(current);
+	} else {
+		rcu_read_lock();
+		task = find_task_by_vpid(pid);
+		if (!task) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+			rcu_read_unlock();
+			return -EACCES;
+		}
+		files = get_files_struct(task);
+		rcu_read_unlock();
+	}
+	if (!files)
+		return 0;
+
+	offset = start_fd / BITS_PER_LONG;
+	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
+	user_index = 0;
+#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
+	masksize = FDS_BUF_SIZE;
+	while (user_index < count && masksize == FDS_BUF_SIZE) {
+		unsigned long open_fds[FDS_BUF_SIZE];
+		struct fdtable *fdt;
+		unsigned int i;
+
+		/*
+		 * fdt->max_fds can grow, get it every time
+		 * before copying part into internal buffer.
+		 */
+		rcu_read_lock();
+		fdt = files_fdtable(files);
+		masksize = fdt->max_fds / 8 - offset * sizeof(long);
+		if (masksize < 0) {
+			rcu_read_unlock();
+			break;
+		}
+		masksize = min(masksize, (int)sizeof(open_fds));
+		memcpy(open_fds, fdt->open_fds + offset, masksize);
+		rcu_read_unlock();
+
+		open_fds[0] &= search_mask;
+		search_mask = ULONG_MAX;
+		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
+		start_fd = offset * BITS_PER_LONG;
+		/*
+		 * for_each_set_bit_from() can re-read first word
+		 * multiple times which is not optimal.
+		 */
+		for (i = 0; i < masksize; i++) {
+			unsigned long mask = open_fds[i];
+
+			while (mask) {
+				unsigned int real_fd = start_fd + __ffs(mask);
+
+				if (put_user(real_fd, fds + user_index)) {
+					put_files_struct(files);
+					return -EFAULT;
+				}
+				if (++user_index >= count)
+					goto out;
+				mask &= mask - 1;
+			}
+			start_fd += BITS_PER_LONG;
+		}
+		offset += FDS_BUF_SIZE;
+	}
+out:
+	put_files_struct(files);
+
+	return user_index;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 95606a2d556f..d393d844facb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
+			  int start_fd, int flags);
 
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 78cb2461012e..952d13b7326d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1400,6 +1400,13 @@ config MEMBARRIER
 
 	  If unsure, say Y.
 
+config FDMAP
+	bool "fdmap() system call" if EXPERT
+	default y
+	help
+	  Enable fdmap() system call that allows to query file descriptors
+	  in binary form avoiding /proc overhead.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef8576ce9..d61fa27d021e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
 cond_syscall(sys_pkey_mprotect);
 cond_syscall(sys_pkey_alloc);
 cond_syscall(sys_pkey_free);
+
+cond_syscall(sys_fdmap);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 26ce4f7168be..e8d63c27c865 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -5,6 +5,7 @@ TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += efivarfs
 TARGETS += exec
+TARGETS += fdmap
 TARGETS += firmware
 TARGETS += ftrace
 TARGETS += futex
diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
new file mode 100644
index 000000000000..9a9bfdac1cc0
--- /dev/null
+++ b/tools/testing/selftests/fdmap/.gitignore
@@ -0,0 +1 @@
+fdmap_test
diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
new file mode 100644
index 000000000000..bf9f051f4b63
--- /dev/null
+++ b/tools/testing/selftests/fdmap/Makefile
@@ -0,0 +1,7 @@
+TEST_GEN_PROGS := fdmap_test
+CFLAGS += -Wall
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
+	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
new file mode 100644
index 000000000000..66725b0201e0
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include "fdmap.h"
+
+#define	BUF_SIZE	1024
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
+{
+	register int64_t r10 asm("r10") = start_fd;
+	register int64_t r8 asm("r8") = flags;
+	long ret;
+
+	asm volatile (
+		"syscall"
+		: "=a"(ret)
+		: "0" (333),
+		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
+		: "rcx", "r11", "cc", "memory"
+	);
+	return ret;
+}
+
+int fdmap_full(pid_t pid, int **fds, size_t *n)
+{
+	int buf[BUF_SIZE], start_fd = 0;
+	long ret;
+
+	*n = 0;
+	*fds = NULL;
+	for (;;) {
+		int *new_buff;
+
+		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
+		if (ret < 0)
+			break;
+		if (!ret)
+			return 0;
+
+		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
+		if (!new_buff) {
+			ret = -errno;
+			break;
+		}
+		*fds = new_buff;
+		memcpy(*fds + *n, buf, ret * sizeof(int));
+		*n += ret;
+		start_fd = (*fds)[*n - 1] + 1;
+	}
+	free(*fds);
+	*fds = NULL;
+	return -ret;
+}
+
+int fdmap_proc(pid_t pid, int **fds, size_t *n)
+{
+	char fds_path[20];
+	int dir_fd = 0;
+	struct dirent *fd_link;
+	DIR *fds_dir;
+
+	*fds = NULL;
+	*n = 0;
+	if (!pid)
+		strcpy(fds_path, "/proc/self/fd");
+	else
+		sprintf(fds_path, "/proc/%d/fd", pid);
+
+	fds_dir = opendir(fds_path);
+	if (!fds_dir)
+		return errno == ENOENT ? ESRCH : errno;
+	if (!pid)
+		dir_fd = dirfd(fds_dir);
+
+	while ((fd_link = readdir(fds_dir))) {
+		if (fd_link->d_name[0] < '0'
+		    || fd_link->d_name[0] > '9')
+			continue;
+		if (*n % BUF_SIZE == 0) {
+			int *new_buff;
+
+			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
+			if (!new_buff) {
+				int ret = errno;
+
+				free(*fds);
+				*fds = NULL;
+				return ret;
+			}
+			*fds = new_buff;
+		}
+		(*fds)[*n] = atoi(fd_link->d_name);
+		*n += 1;
+	}
+	closedir(fds_dir);
+
+	if (!pid) {
+		size_t i;
+
+		for (i = 0; i < *n; i++)
+			if ((*fds)[i] == dir_fd)
+				break;
+		i++;
+		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
+		(*n)--;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
new file mode 100644
index 000000000000..c501111b2bbd
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.h
@@ -0,0 +1,12 @@
+#ifndef FDMAP_H
+#define FDMAP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
+int fdmap_full(pid_t pid, int **fds, size_t *n);
+int fdmap_proc(pid_t pid, int **fds, size_t *n);
+
+#endif
diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
new file mode 100644
index 000000000000..6f448406d96a
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap_test.c
@@ -0,0 +1,153 @@
+#include <errno.h>
+#include <syscall.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <limits.h>
+#include "../kselftest_harness.h"
+#include "fdmap.h"
+
+TEST(efault) {
+	int ret;
+
+	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EFAULT, errno);
+}
+
+TEST(big_start_fd) {
+	int fds[1];
+	int ret;
+
+	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
+	ASSERT_EQ(0, ret);
+}
+
+TEST(einval) {
+	int ret;
+
+	ret = syscall(333, 0, NULL, 0, -1, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+
+	ret = syscall(333, 0, NULL, 0, 0, 1);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(esrch) {
+	int fds[1], ret;
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid)
+		exit(0);
+	waitpid(pid, NULL, 0);
+
+	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(ESRCH, errno);
+}
+
+TEST(simple) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(init) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(1, &fds1, &size1);
+	ret2 = fdmap_proc(1, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(zero) {
+	int *fds, i;
+	size_t size;
+	int ret;
+
+	ret = fdmap_proc(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	for (i = 0; i < size; i++)
+		close(fds[i]);
+	free(fds);
+	fds = NULL;
+
+	ret = fdmap_full(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	ASSERT_EQ(0, size);
+}
+
+TEST(more_fds) {
+	int *fds1, *fds2, ret1, ret2;
+	size_t size1, size2, i;
+
+	struct rlimit rlim = {
+		.rlim_cur = 600000,
+		.rlim_max = 600000
+	};
+	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
+	for (int i = 0; i < 500000; i++)
+		dup(0);
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(child) {
+	int pipefd[2];
+	int *fds1, *fds2, ret1, ret2, i;
+	size_t size1, size2;
+	char byte = 0;
+	pid_t pid;
+
+	ASSERT_NE(-1, pipe(pipefd));
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid) {
+		read(pipefd[0], &byte, 1);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+
+	write(pipefd[1], &byte, 1);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN

^ permalink raw reply related	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2017-10-26  7:53 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-24 20:06 [PATCH 1/2 v2] fdmap(2) Alexey Dobriyan
2017-09-24 20:08 ` [PATCH v2 2/2] pidmap(2) Alexey Dobriyan
2017-09-24 21:27   ` Andy Lutomirski
2017-09-26 18:46     ` Alexey Dobriyan
2017-09-27 15:04       ` Andy Lutomirski
2017-09-25  7:43   ` Michael Kerrisk (man-pages)
2017-09-25 10:47   ` Djalal Harouni
2017-09-26  5:44   ` kbuild test robot
2017-09-24 21:31 ` [PATCH 1/2 v2] fdmap(2) Andy Lutomirski
2017-09-26 18:43   ` Alexey Dobriyan
2017-09-25  7:42 ` Michael Kerrisk (man-pages)
2017-09-26 19:00   ` Alexey Dobriyan
2017-09-27 15:03     ` Andy Lutomirski
2017-09-28  7:26       ` Michael Kerrisk (man-pages)
2017-09-28 10:55         ` Alexey Dobriyan
2017-09-28 15:02           ` Andy Lutomirski
2017-10-11 17:37             ` Alexey Dobriyan
2017-09-28 10:10       ` Alexey Dobriyan
2017-10-23  9:29   ` Pavel Machek
2017-10-25 12:45     ` Alexey Dobriyan
2017-10-25 13:48       ` Pavel Machek
2017-09-26  4:25 ` kbuild test robot
2017-10-10 22:08 ` [1/2,v2] fdmap(2) Andrei Vagin
2017-10-11 18:12   ` Alexey Dobriyan
2017-10-12  8:06     ` Andrei Vagin
2017-10-18 11:35       ` Alexey Dobriyan
2017-10-18 17:47         ` Andy Lutomirski
2017-10-19 15:34           ` Alexey Dobriyan
2017-10-20  7:48             ` Greg KH
2017-10-25 13:11               ` Alexey Dobriyan
2017-10-26  7:53             ` Andy Lutomirski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).