All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2 v2] fdmap(2)
@ 2017-09-24 20:06 ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-24 20:06 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey

From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>

Implement system call for bulk retrieveing of opened descriptors
in binary form.

Some daemons could use it to reliably close file descriptors
before starting. Currently they close everything upto some number
which formally is not reliable. Other natural users are lsof(1) and CRIU
(although lsof does so much in /proc that the effect is thoroughly buried).

/proc, the only way to learn anything about file descriptors may not be
available. There is unavoidable overhead associated with instantiating
3 dentries and 3 inodes and converting integers to strings and back.

Benchmark:

	N=1<<22 times
	4 opened descriptors (0, 1, 2, 3)
	opendir+readdir+closedir /proc/self/fd vs fdmap

	/proc 8.31 ± 0.37%
	fdmap 0.32 ± 0.72%


FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)

NAME
       fdmap - get open file descriptors of the process

SYNOPSIS
       long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);

DESCRIPTION
       fdmap()  writes  open  file  descriptors  of the process into buffer fd
       starting from the start descriptor. At most nfd elements  are  written.
       flags argument is reserved and must be zero.

       If pid is zero, syscall will work with the current process.

RETURN VALUE
       On success, number of descriptors written is returned.  On error, -1 is
       returned, and errno is set appropriately.

ERRORS
       ESRCH  No such process.

       EACCES Permission denied.

       EFAULT Invalid fd pointer.

       EINVAL Negative start argument.

NOTES
       Glibc does not provide a wrapper for these system call; call  it  using
       syscall(2).

EXAMPLE
       The program below demonstrates fdmap() usage.

           $ ./a.out $$
           0 1 2 255

           $ ./a.out 42</dev/null 1023</dev/null
           0 1 2 42
           1023

   Program source

       #include <sys/types.h>
       #include <stdlib.h>
       #include <stdio.h>

       static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
       {
            register long r10 asm ("r10") = start;
            register long r8 asm ("r8") = flags;
            long rv;
            asm volatile (
                 "syscall"
                 : "=a" (rv)
                 : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
                 : "rcx", "r11", "cc", "memory"
            );
            return rv;
       }

       int main(int argc, char *argv[])
       {
            pid_t pid;
            int fd[4];
            unsigned int start;
            int n;

            pid = 0;
            if (argc > 1)
                 pid = atoi(argv[1]);

            start = 0;
            while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
                 unsigned int i;

                 for (i = 0; i < n; i++)
                      printf("%u ", fd[i]);
                 printf("\n");

                 start = fd[n - 1] + 1;
            }
            return 0;
       }

Linux                             2017-09-21                          FDMAP(2)

Changelog:

	CONFIG_PIDMAP option
	manpage


Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

---
 arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
 fs/Makefile                                |   2 +
 fs/fdmap.c                                 | 105 ++++++++++++++++++++
 include/linux/syscalls.h                   |   2 +
 init/Kconfig                               |   7 ++
 kernel/sys_ni.c                            |   2 +
 tools/testing/selftests/Makefile           |   1 +
 tools/testing/selftests/fdmap/.gitignore   |   1 +
 tools/testing/selftests/fdmap/Makefile     |   7 ++
 tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
 tools/testing/selftests/fdmap/fdmap.h      |  12 +++
 tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
 12 files changed, 405 insertions(+)
 create mode 100644 fs/fdmap.c
 create mode 100644 tools/testing/selftests/fdmap/.gitignore
 create mode 100644 tools/testing/selftests/fdmap/Makefile
 create mode 100644 tools/testing/selftests/fdmap/fdmap.c
 create mode 100644 tools/testing/selftests/fdmap/fdmap.h
 create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..9bfe5f79674f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	fdmap			sys_fdmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9c67b1..27476a66c18e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
 
+obj-$(CONFIG_FDMAP) += fdmap.o
+
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
diff --git a/fs/fdmap.c b/fs/fdmap.c
new file mode 100644
index 000000000000..274e6c5b7c9c
--- /dev/null
+++ b/fs/fdmap.c
@@ -0,0 +1,105 @@
+#include <linux/bitops.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+/**
+ * fdmap - get opened file descriptors of a process
+ * @pid: the pid of the target process
+ * @fds: allocated userspace buffer
+ * @count: buffer size (in descriptors)
+ * @start_fd: first descriptor to search from (inclusive)
+ * @flags: reserved for future functionality, must be zero
+ *
+ * If @pid is zero then it's current process.
+ * Return: number of descriptors written. An error code otherwise.
+ */
+SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
+		int, start_fd, int, flags)
+{
+	struct task_struct *task;
+	struct files_struct *files;
+	unsigned long search_mask;
+	unsigned int user_index, offset;
+	int masksize;
+
+	if (start_fd < 0 || flags != 0)
+		return -EINVAL;
+
+	if (!pid) {
+		files = get_files_struct(current);
+	} else {
+		rcu_read_lock();
+		task = find_task_by_vpid(pid);
+		if (!task) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+			rcu_read_unlock();
+			return -EACCES;
+		}
+		files = get_files_struct(task);
+		rcu_read_unlock();
+	}
+	if (!files)
+		return 0;
+
+	offset = start_fd / BITS_PER_LONG;
+	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
+	user_index = 0;
+#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
+	masksize = FDS_BUF_SIZE;
+	while (user_index < count && masksize == FDS_BUF_SIZE) {
+		unsigned long open_fds[FDS_BUF_SIZE];
+		struct fdtable *fdt;
+		unsigned int i;
+
+		/*
+		 * fdt->max_fds can grow, get it every time
+		 * before copying part into internal buffer.
+		 */
+		rcu_read_lock();
+		fdt = files_fdtable(files);
+		masksize = fdt->max_fds / 8 - offset * sizeof(long);
+		if (masksize < 0) {
+			rcu_read_unlock();
+			break;
+		}
+		masksize = min(masksize, (int)sizeof(open_fds));
+		memcpy(open_fds, fdt->open_fds + offset, masksize);
+		rcu_read_unlock();
+
+		open_fds[0] &= search_mask;
+		search_mask = ULONG_MAX;
+		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
+		start_fd = offset * BITS_PER_LONG;
+		/*
+		 * for_each_set_bit_from() can re-read first word
+		 * multiple times which is not optimal.
+		 */
+		for (i = 0; i < masksize; i++) {
+			unsigned long mask = open_fds[i];
+
+			while (mask) {
+				unsigned int real_fd = start_fd + __ffs(mask);
+
+				if (put_user(real_fd, fds + user_index)) {
+					put_files_struct(files);
+					return -EFAULT;
+				}
+				if (++user_index >= count)
+					goto out;
+				mask &= mask - 1;
+			}
+			start_fd += BITS_PER_LONG;
+		}
+		offset += FDS_BUF_SIZE;
+	}
+out:
+	put_files_struct(files);
+
+	return user_index;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 95606a2d556f..d393d844facb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
+			  int start_fd, int flags);
 
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 78cb2461012e..952d13b7326d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1400,6 +1400,13 @@ config MEMBARRIER
 
 	  If unsure, say Y.
 
+config FDMAP
+	bool "fdmap() system call" if EXPERT
+	default y
+	help
+	  Enable fdmap() system call that allows to query file descriptors
+	  in binary form avoiding /proc overhead.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef8576ce9..d61fa27d021e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
 cond_syscall(sys_pkey_mprotect);
 cond_syscall(sys_pkey_alloc);
 cond_syscall(sys_pkey_free);
+
+cond_syscall(sys_fdmap);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 26ce4f7168be..e8d63c27c865 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -5,6 +5,7 @@ TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += efivarfs
 TARGETS += exec
+TARGETS += fdmap
 TARGETS += firmware
 TARGETS += ftrace
 TARGETS += futex
diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
new file mode 100644
index 000000000000..9a9bfdac1cc0
--- /dev/null
+++ b/tools/testing/selftests/fdmap/.gitignore
@@ -0,0 +1 @@
+fdmap_test
diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
new file mode 100644
index 000000000000..bf9f051f4b63
--- /dev/null
+++ b/tools/testing/selftests/fdmap/Makefile
@@ -0,0 +1,7 @@
+TEST_GEN_PROGS := fdmap_test
+CFLAGS += -Wall
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
+	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
new file mode 100644
index 000000000000..66725b0201e0
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include "fdmap.h"
+
+#define	BUF_SIZE	1024
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
+{
+	register int64_t r10 asm("r10") = start_fd;
+	register int64_t r8 asm("r8") = flags;
+	long ret;
+
+	asm volatile (
+		"syscall"
+		: "=a"(ret)
+		: "0" (333),
+		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
+		: "rcx", "r11", "cc", "memory"
+	);
+	return ret;
+}
+
+int fdmap_full(pid_t pid, int **fds, size_t *n)
+{
+	int buf[BUF_SIZE], start_fd = 0;
+	long ret;
+
+	*n = 0;
+	*fds = NULL;
+	for (;;) {
+		int *new_buff;
+
+		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
+		if (ret < 0)
+			break;
+		if (!ret)
+			return 0;
+
+		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
+		if (!new_buff) {
+			ret = -errno;
+			break;
+		}
+		*fds = new_buff;
+		memcpy(*fds + *n, buf, ret * sizeof(int));
+		*n += ret;
+		start_fd = (*fds)[*n - 1] + 1;
+	}
+	free(*fds);
+	*fds = NULL;
+	return -ret;
+}
+
+int fdmap_proc(pid_t pid, int **fds, size_t *n)
+{
+	char fds_path[20];
+	int dir_fd = 0;
+	struct dirent *fd_link;
+	DIR *fds_dir;
+
+	*fds = NULL;
+	*n = 0;
+	if (!pid)
+		strcpy(fds_path, "/proc/self/fd");
+	else
+		sprintf(fds_path, "/proc/%d/fd", pid);
+
+	fds_dir = opendir(fds_path);
+	if (!fds_dir)
+		return errno == ENOENT ? ESRCH : errno;
+	if (!pid)
+		dir_fd = dirfd(fds_dir);
+
+	while ((fd_link = readdir(fds_dir))) {
+		if (fd_link->d_name[0] < '0'
+		    || fd_link->d_name[0] > '9')
+			continue;
+		if (*n % BUF_SIZE == 0) {
+			int *new_buff;
+
+			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
+			if (!new_buff) {
+				int ret = errno;
+
+				free(*fds);
+				*fds = NULL;
+				return ret;
+			}
+			*fds = new_buff;
+		}
+		(*fds)[*n] = atoi(fd_link->d_name);
+		*n += 1;
+	}
+	closedir(fds_dir);
+
+	if (!pid) {
+		size_t i;
+
+		for (i = 0; i < *n; i++)
+			if ((*fds)[i] == dir_fd)
+				break;
+		i++;
+		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
+		(*n)--;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
new file mode 100644
index 000000000000..c501111b2bbd
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.h
@@ -0,0 +1,12 @@
+#ifndef FDMAP_H
+#define FDMAP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
+int fdmap_full(pid_t pid, int **fds, size_t *n);
+int fdmap_proc(pid_t pid, int **fds, size_t *n);
+
+#endif
diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
new file mode 100644
index 000000000000..6f448406d96a
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap_test.c
@@ -0,0 +1,153 @@
+#include <errno.h>
+#include <syscall.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <limits.h>
+#include "../kselftest_harness.h"
+#include "fdmap.h"
+
+TEST(efault) {
+	int ret;
+
+	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EFAULT, errno);
+}
+
+TEST(big_start_fd) {
+	int fds[1];
+	int ret;
+
+	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
+	ASSERT_EQ(0, ret);
+}
+
+TEST(einval) {
+	int ret;
+
+	ret = syscall(333, 0, NULL, 0, -1, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+
+	ret = syscall(333, 0, NULL, 0, 0, 1);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(esrch) {
+	int fds[1], ret;
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid)
+		exit(0);
+	waitpid(pid, NULL, 0);
+
+	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(ESRCH, errno);
+}
+
+TEST(simple) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(init) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(1, &fds1, &size1);
+	ret2 = fdmap_proc(1, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(zero) {
+	int *fds, i;
+	size_t size;
+	int ret;
+
+	ret = fdmap_proc(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	for (i = 0; i < size; i++)
+		close(fds[i]);
+	free(fds);
+	fds = NULL;
+
+	ret = fdmap_full(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	ASSERT_EQ(0, size);
+}
+
+TEST(more_fds) {
+	int *fds1, *fds2, ret1, ret2;
+	size_t size1, size2, i;
+
+	struct rlimit rlim = {
+		.rlim_cur = 600000,
+		.rlim_max = 600000
+	};
+	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
+	for (int i = 0; i < 500000; i++)
+		dup(0);
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(child) {
+	int pipefd[2];
+	int *fds1, *fds2, ret1, ret2, i;
+	size_t size1, size2;
+	char byte = 0;
+	pid_t pid;
+
+	ASSERT_NE(-1, pipe(pipefd));
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid) {
+		read(pipefd[0], &byte, 1);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+
+	write(pipefd[1], &byte, 1);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 1/2 v2] fdmap(2)
@ 2017-09-24 20:06 ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-24 20:06 UTC (permalink / raw)
  To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w

From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>

Implement system call for bulk retrieveing of opened descriptors
in binary form.

Some daemons could use it to reliably close file descriptors
before starting. Currently they close everything upto some number
which formally is not reliable. Other natural users are lsof(1) and CRIU
(although lsof does so much in /proc that the effect is thoroughly buried).

/proc, the only way to learn anything about file descriptors may not be
available. There is unavoidable overhead associated with instantiating
3 dentries and 3 inodes and converting integers to strings and back.

Benchmark:

	N=1<<22 times
	4 opened descriptors (0, 1, 2, 3)
	opendir+readdir+closedir /proc/self/fd vs fdmap

	/proc 8.31 ± 0.37%
	fdmap 0.32 ± 0.72%


FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)

NAME
       fdmap - get open file descriptors of the process

SYNOPSIS
       long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);

DESCRIPTION
       fdmap()  writes  open  file  descriptors  of the process into buffer fd
       starting from the start descriptor. At most nfd elements  are  written.
       flags argument is reserved and must be zero.

       If pid is zero, syscall will work with the current process.

RETURN VALUE
       On success, number of descriptors written is returned.  On error, -1 is
       returned, and errno is set appropriately.

ERRORS
       ESRCH  No such process.

       EACCES Permission denied.

       EFAULT Invalid fd pointer.

       EINVAL Negative start argument.

NOTES
       Glibc does not provide a wrapper for these system call; call  it  using
       syscall(2).

EXAMPLE
       The program below demonstrates fdmap() usage.

           $ ./a.out $$
           0 1 2 255

           $ ./a.out 42</dev/null 1023</dev/null
           0 1 2 42
           1023

   Program source

       #include <sys/types.h>
       #include <stdlib.h>
       #include <stdio.h>

       static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
       {
            register long r10 asm ("r10") = start;
            register long r8 asm ("r8") = flags;
            long rv;
            asm volatile (
                 "syscall"
                 : "=a" (rv)
                 : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
                 : "rcx", "r11", "cc", "memory"
            );
            return rv;
       }

       int main(int argc, char *argv[])
       {
            pid_t pid;
            int fd[4];
            unsigned int start;
            int n;

            pid = 0;
            if (argc > 1)
                 pid = atoi(argv[1]);

            start = 0;
            while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
                 unsigned int i;

                 for (i = 0; i < n; i++)
                      printf("%u ", fd[i]);
                 printf("\n");

                 start = fd[n - 1] + 1;
            }
            return 0;
       }

Linux                             2017-09-21                          FDMAP(2)

Changelog:

	CONFIG_PIDMAP option
	manpage


Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
Signed-off-by: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

---
 arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
 fs/Makefile                                |   2 +
 fs/fdmap.c                                 | 105 ++++++++++++++++++++
 include/linux/syscalls.h                   |   2 +
 init/Kconfig                               |   7 ++
 kernel/sys_ni.c                            |   2 +
 tools/testing/selftests/Makefile           |   1 +
 tools/testing/selftests/fdmap/.gitignore   |   1 +
 tools/testing/selftests/fdmap/Makefile     |   7 ++
 tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
 tools/testing/selftests/fdmap/fdmap.h      |  12 +++
 tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
 12 files changed, 405 insertions(+)
 create mode 100644 fs/fdmap.c
 create mode 100644 tools/testing/selftests/fdmap/.gitignore
 create mode 100644 tools/testing/selftests/fdmap/Makefile
 create mode 100644 tools/testing/selftests/fdmap/fdmap.c
 create mode 100644 tools/testing/selftests/fdmap/fdmap.h
 create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..9bfe5f79674f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	fdmap			sys_fdmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9c67b1..27476a66c18e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
 
+obj-$(CONFIG_FDMAP) += fdmap.o
+
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
diff --git a/fs/fdmap.c b/fs/fdmap.c
new file mode 100644
index 000000000000..274e6c5b7c9c
--- /dev/null
+++ b/fs/fdmap.c
@@ -0,0 +1,105 @@
+#include <linux/bitops.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+/**
+ * fdmap - get opened file descriptors of a process
+ * @pid: the pid of the target process
+ * @fds: allocated userspace buffer
+ * @count: buffer size (in descriptors)
+ * @start_fd: first descriptor to search from (inclusive)
+ * @flags: reserved for future functionality, must be zero
+ *
+ * If @pid is zero then it's current process.
+ * Return: number of descriptors written. An error code otherwise.
+ */
+SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
+		int, start_fd, int, flags)
+{
+	struct task_struct *task;
+	struct files_struct *files;
+	unsigned long search_mask;
+	unsigned int user_index, offset;
+	int masksize;
+
+	if (start_fd < 0 || flags != 0)
+		return -EINVAL;
+
+	if (!pid) {
+		files = get_files_struct(current);
+	} else {
+		rcu_read_lock();
+		task = find_task_by_vpid(pid);
+		if (!task) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+			rcu_read_unlock();
+			return -EACCES;
+		}
+		files = get_files_struct(task);
+		rcu_read_unlock();
+	}
+	if (!files)
+		return 0;
+
+	offset = start_fd / BITS_PER_LONG;
+	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
+	user_index = 0;
+#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
+	masksize = FDS_BUF_SIZE;
+	while (user_index < count && masksize == FDS_BUF_SIZE) {
+		unsigned long open_fds[FDS_BUF_SIZE];
+		struct fdtable *fdt;
+		unsigned int i;
+
+		/*
+		 * fdt->max_fds can grow, get it every time
+		 * before copying part into internal buffer.
+		 */
+		rcu_read_lock();
+		fdt = files_fdtable(files);
+		masksize = fdt->max_fds / 8 - offset * sizeof(long);
+		if (masksize < 0) {
+			rcu_read_unlock();
+			break;
+		}
+		masksize = min(masksize, (int)sizeof(open_fds));
+		memcpy(open_fds, fdt->open_fds + offset, masksize);
+		rcu_read_unlock();
+
+		open_fds[0] &= search_mask;
+		search_mask = ULONG_MAX;
+		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
+		start_fd = offset * BITS_PER_LONG;
+		/*
+		 * for_each_set_bit_from() can re-read first word
+		 * multiple times which is not optimal.
+		 */
+		for (i = 0; i < masksize; i++) {
+			unsigned long mask = open_fds[i];
+
+			while (mask) {
+				unsigned int real_fd = start_fd + __ffs(mask);
+
+				if (put_user(real_fd, fds + user_index)) {
+					put_files_struct(files);
+					return -EFAULT;
+				}
+				if (++user_index >= count)
+					goto out;
+				mask &= mask - 1;
+			}
+			start_fd += BITS_PER_LONG;
+		}
+		offset += FDS_BUF_SIZE;
+	}
+out:
+	put_files_struct(files);
+
+	return user_index;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 95606a2d556f..d393d844facb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
+			  int start_fd, int flags);
 
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 78cb2461012e..952d13b7326d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1400,6 +1400,13 @@ config MEMBARRIER
 
 	  If unsure, say Y.
 
+config FDMAP
+	bool "fdmap() system call" if EXPERT
+	default y
+	help
+	  Enable fdmap() system call that allows to query file descriptors
+	  in binary form avoiding /proc overhead.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef8576ce9..d61fa27d021e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
 cond_syscall(sys_pkey_mprotect);
 cond_syscall(sys_pkey_alloc);
 cond_syscall(sys_pkey_free);
+
+cond_syscall(sys_fdmap);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 26ce4f7168be..e8d63c27c865 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -5,6 +5,7 @@ TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += efivarfs
 TARGETS += exec
+TARGETS += fdmap
 TARGETS += firmware
 TARGETS += ftrace
 TARGETS += futex
diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
new file mode 100644
index 000000000000..9a9bfdac1cc0
--- /dev/null
+++ b/tools/testing/selftests/fdmap/.gitignore
@@ -0,0 +1 @@
+fdmap_test
diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
new file mode 100644
index 000000000000..bf9f051f4b63
--- /dev/null
+++ b/tools/testing/selftests/fdmap/Makefile
@@ -0,0 +1,7 @@
+TEST_GEN_PROGS := fdmap_test
+CFLAGS += -Wall
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
+	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
new file mode 100644
index 000000000000..66725b0201e0
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include "fdmap.h"
+
+#define	BUF_SIZE	1024
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
+{
+	register int64_t r10 asm("r10") = start_fd;
+	register int64_t r8 asm("r8") = flags;
+	long ret;
+
+	asm volatile (
+		"syscall"
+		: "=a"(ret)
+		: "0" (333),
+		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
+		: "rcx", "r11", "cc", "memory"
+	);
+	return ret;
+}
+
+int fdmap_full(pid_t pid, int **fds, size_t *n)
+{
+	int buf[BUF_SIZE], start_fd = 0;
+	long ret;
+
+	*n = 0;
+	*fds = NULL;
+	for (;;) {
+		int *new_buff;
+
+		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
+		if (ret < 0)
+			break;
+		if (!ret)
+			return 0;
+
+		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
+		if (!new_buff) {
+			ret = -errno;
+			break;
+		}
+		*fds = new_buff;
+		memcpy(*fds + *n, buf, ret * sizeof(int));
+		*n += ret;
+		start_fd = (*fds)[*n - 1] + 1;
+	}
+	free(*fds);
+	*fds = NULL;
+	return -ret;
+}
+
+int fdmap_proc(pid_t pid, int **fds, size_t *n)
+{
+	char fds_path[20];
+	int dir_fd = 0;
+	struct dirent *fd_link;
+	DIR *fds_dir;
+
+	*fds = NULL;
+	*n = 0;
+	if (!pid)
+		strcpy(fds_path, "/proc/self/fd");
+	else
+		sprintf(fds_path, "/proc/%d/fd", pid);
+
+	fds_dir = opendir(fds_path);
+	if (!fds_dir)
+		return errno == ENOENT ? ESRCH : errno;
+	if (!pid)
+		dir_fd = dirfd(fds_dir);
+
+	while ((fd_link = readdir(fds_dir))) {
+		if (fd_link->d_name[0] < '0'
+		    || fd_link->d_name[0] > '9')
+			continue;
+		if (*n % BUF_SIZE == 0) {
+			int *new_buff;
+
+			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
+			if (!new_buff) {
+				int ret = errno;
+
+				free(*fds);
+				*fds = NULL;
+				return ret;
+			}
+			*fds = new_buff;
+		}
+		(*fds)[*n] = atoi(fd_link->d_name);
+		*n += 1;
+	}
+	closedir(fds_dir);
+
+	if (!pid) {
+		size_t i;
+
+		for (i = 0; i < *n; i++)
+			if ((*fds)[i] == dir_fd)
+				break;
+		i++;
+		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
+		(*n)--;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
new file mode 100644
index 000000000000..c501111b2bbd
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.h
@@ -0,0 +1,12 @@
+#ifndef FDMAP_H
+#define FDMAP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
+int fdmap_full(pid_t pid, int **fds, size_t *n);
+int fdmap_proc(pid_t pid, int **fds, size_t *n);
+
+#endif
diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
new file mode 100644
index 000000000000..6f448406d96a
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap_test.c
@@ -0,0 +1,153 @@
+#include <errno.h>
+#include <syscall.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <limits.h>
+#include "../kselftest_harness.h"
+#include "fdmap.h"
+
+TEST(efault) {
+	int ret;
+
+	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EFAULT, errno);
+}
+
+TEST(big_start_fd) {
+	int fds[1];
+	int ret;
+
+	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
+	ASSERT_EQ(0, ret);
+}
+
+TEST(einval) {
+	int ret;
+
+	ret = syscall(333, 0, NULL, 0, -1, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+
+	ret = syscall(333, 0, NULL, 0, 0, 1);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(esrch) {
+	int fds[1], ret;
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid)
+		exit(0);
+	waitpid(pid, NULL, 0);
+
+	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(ESRCH, errno);
+}
+
+TEST(simple) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(init) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(1, &fds1, &size1);
+	ret2 = fdmap_proc(1, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(zero) {
+	int *fds, i;
+	size_t size;
+	int ret;
+
+	ret = fdmap_proc(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	for (i = 0; i < size; i++)
+		close(fds[i]);
+	free(fds);
+	fds = NULL;
+
+	ret = fdmap_full(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	ASSERT_EQ(0, size);
+}
+
+TEST(more_fds) {
+	int *fds1, *fds2, ret1, ret2;
+	size_t size1, size2, i;
+
+	struct rlimit rlim = {
+		.rlim_cur = 600000,
+		.rlim_max = 600000
+	};
+	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
+	for (int i = 0; i < 500000; i++)
+		dup(0);
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(child) {
+	int pipefd[2];
+	int *fds1, *fds2, ret1, ret2, i;
+	size_t size1, size2;
+	char byte = 0;
+	pid_t pid;
+
+	ASSERT_NE(-1, pipe(pipefd));
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid) {
+		read(pipefd[0], &byte, 1);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+
+	write(pipefd[1], &byte, 1);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v2 2/2] pidmap(2)
  2017-09-24 20:06 ` Alexey Dobriyan
@ 2017-09-24 20:08   ` Alexey Dobriyan
  -1 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-24 20:08 UTC (permalink / raw)
  To: akpm
  Cc: linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey,
	Tatsiana_Brouka, Aliaksandr_Patseyenak1

From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>

Implement system call for bulk retrieveing of pids in binary form.

Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
converting with atoi() + instantiating dentries and inodes.

/proc may be not mounted especially in containers. Natural extension of
hidepid=2 efforts is to not mount /proc at all.

It could be used by programs like ps, top or CRIU. Speed increase will
become more drastic once combined with bulk retrieval of process statistics.

Benchmark:

	N=1<<16 times
	~130 processes (~250 task_structs) on a regular desktop system
	opendir + readdir + closedir /proc + the same for every /proc/$PID/task
	(roughly what htop(1) does) vs pidmap

	/proc 16.80 ± 0.73%
	pidmap 0.06 ± 0.31%

PIDMAP_* flags are modelled after /proc/task_diag patchset.


PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)

NAME
       pidmap - get allocated PIDs

SYNOPSIS
       long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);

DESCRIPTION
       The system call pidmap(2) writes process IDs in buffer pointed by pids.
       At most count pids are written. The pid argument specifies  process  ID
       in  several  values in flags.  If pid equals to zero, syscall will work
       with current process.  The argument start depends on  the  flags.   The
       argument  flags  must include one of the following modes: PIDMAP_TASKS,
       PIDMAP_PROC, PIDMAP_CHILDREN, or PIDMAP_THREADS.  For PIDMAP_TASKS  and
       PIDMAP_PROC exists optional PIDMAP_IGNORE_KTHREADS flag.

       PIDs  are  filled  from  pid  namespace  of  the  calling  process POV:
       unshare(CLONE_NEWPID) + fork + pidmap in child will always return 1/1.

       pidmap(2) hides  PIDs  inaccessible  at  /proc  mounted  with  hide_pid
       option.

       Note,  pidmap(2)  does not guarantee that any of returned PID exists by
       the time system call exists.

       Full list of flags and options is below:

       PIDMAP_TASKS
              Get PIDs of all tasks, including  threads  starting  from  start
              inclusive. First argument pid will be ignored.

       PIDMAP_PROC
              Get  all  process IDs starting from start inclusive. First argu‐
              ment pid will be ignored.

       PIDMAP_CHILDREN
              Get children IDs of  the  process  specified  by  pid  argument.
              start  argument  specifies  number  of  children to skip in this
              case.

       PIDMAP_THREADS
              Get threads IDs of the process specified by pid argument.  start
              argument specifies number of threads to skip in this case.

       PIDMAP_IGNORE_KTHREADS
              Ignore  kernel  threads.  Optional  and  will  be  ignored  with
              PIDMAP_CHILDREN and PIDMAP_THREADS flags.

RETURN VALUE
       On success, number of PIDs read is returned. Otherwise, error  code  is
       returned.

ERRORS
       ESRCH  No such process.

       EACCES Permission denied.

       EFAULT Invalid pids pointer.

       EINVAL Invalid flags value.

NOTES
       Glibc  does  not  provide a wrapper for this system call; call it using
       syscall(2).

EXAMPLE
       #include <stdio.h>
       #include <linux/pidmap.h>

       static inline long pidmap(int pid, int *pids, unsigned int n, unsigned int start, int flags)
       {
               register long r10 asm("r10") = start;
               register long r8 asm("r8") = flags;
               long ret;
               asm volatile (
                       "syscall"
                       : "=a" (ret)
                       : "0" (334), "D" (pid), "S" (pids), "d" (n), "r" (r10), "r" (r8)
                       : "rcx", "r11", "cc", "memory"
               );
               return ret;
       }

       int main(void)
       {
               int pids[5];
               unsigned int start;
               unsigned int i;
               int n;

               start = 0;
               while ((n = pidmap(0, pids, sizeof(pids)/sizeof(pids[0]),
                            start, PIDMAP_PROC | PIDMAP_IGNORE_KTHREADS)) > 0) {

                       for (i = 0; i < n; i++)
                               printf("%d ", pids[i]);
                       printf("\n");

                       start = pids[n - 1] + 1;
               }
               return 0;
       }

Linux                             2017-09-21                         PIDMAP(2)

Changelog:

	CONFIG_PIDMAP option
	PIDMAP_* options
	PIDMAP_IGNORE_KTHREADS
	manpage

Signed-off-by: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl    |   1 +
 include/linux/syscalls.h                  |   5 +
 include/uapi/linux/pidmap.h               |  10 +
 init/Kconfig                              |   7 +
 kernel/Makefile                           |   2 +
 kernel/pidmap.c                           | 287 ++++++++++++++++++++++++++++
 kernel/sys_ni.c                           |   1 +
 tools/testing/selftests/Makefile          |   1 +
 tools/testing/selftests/pidmap/.gitignore |   1 +
 tools/testing/selftests/pidmap/Makefile   |   5 +
 tools/testing/selftests/pidmap/pidmap.c   | 298 ++++++++++++++++++++++++++++++
 tools/testing/selftests/pidmap/pidmap.h   |   1 +
 12 files changed, 619 insertions(+)
 create mode 100644 include/uapi/linux/pidmap.h
 create mode 100644 kernel/pidmap.c
 create mode 100644 tools/testing/selftests/pidmap/.gitignore
 create mode 100644 tools/testing/selftests/pidmap/Makefile
 create mode 100644 tools/testing/selftests/pidmap/pidmap.c
 create mode 120000 tools/testing/selftests/pidmap/pidmap.h

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9bfe5f79674f..8ce611f14969 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -340,6 +340,7 @@
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
 333	common	fdmap			sys_fdmap
+334	common	pidmap			sys_pidmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d393d844facb..cc1ef71dbb4a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -939,4 +939,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
 			  int start_fd, int flags);
 
+asmlinkage long sys_pidmap(pid_t pid,
+			   int __user *pids,
+			   unsigned int pids_count,
+			   unsigned int start_pid,
+			   int flags);
 #endif
diff --git a/include/uapi/linux/pidmap.h b/include/uapi/linux/pidmap.h
new file mode 100644
index 000000000000..75a7557c22eb
--- /dev/null
+++ b/include/uapi/linux/pidmap.h
@@ -0,0 +1,10 @@
+#ifndef _UAPI_LINUX_PIDMAP_H
+#define _UAPI_LINUX_PIDMAP_H
+
+#define PIDMAP_TASKS	1
+#define PIDMAP_PROC	2
+#define PIDMAP_CHILDREN	3
+#define PIDMAP_THREADS	4
+#define PIDMAP_IGNORE_KTHREADS	(1 << 30)
+
+#endif /* _UAPI_LINUX_PIDMAP_H */
diff --git a/init/Kconfig b/init/Kconfig
index 952d13b7326d..163155e0cfb4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1407,6 +1407,13 @@ config FDMAP
 	  Enable fdmap() system call that allows to query file descriptors
 	  in binary form avoiding /proc overhead.
 
+config PIDMAP
+	bool "pidmap() system call" if EXPERT
+	default y
+	help
+	  Enable pidmap() system call that allows to query PIDs in binary form
+	  avoiding /proc overhead.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/Makefile b/kernel/Makefile
index ed470aac53da..f8833e5b27e5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,8 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-$(CONFIG_PIDMAP) += pidmap.o
+
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/pidmap.c b/kernel/pidmap.c
new file mode 100644
index 000000000000..0392bc6935b6
--- /dev/null
+++ b/kernel/pidmap.c
@@ -0,0 +1,287 @@
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/kernel.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/rcupdate.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/pidmap.h>
+
+#define PIDMAP_PARAM	(~PIDMAP_IGNORE_KTHREADS)
+
+static inline bool pidmap_perm(const struct pid_namespace *pid_ns)
+{
+	return pid_ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(pid_ns->pid_gid);
+}
+
+static bool skip_task(struct task_struct *task, bool has_perms, int flags)
+{
+	int param = flags & PIDMAP_PARAM;
+
+	if (!task)
+		return true;
+	if (!has_perms && !ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+		return true;
+	if ((flags & PIDMAP_IGNORE_KTHREADS) && (task->flags & PF_KTHREAD))
+		return true;
+	if (param == PIDMAP_PROC && !thread_group_leader(task))
+		return true;
+	return false;
+}
+
+static long pidmap_tasks(int __user *pids, unsigned int count,
+		  unsigned int start, int flags)
+{
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
+	unsigned int start_page, start_elem;
+	unsigned int last_pos = 0;
+	unsigned int last_set_pid = 0;
+	unsigned long mask;
+	bool has_perms;
+	unsigned int i;
+
+	/*
+	 * Pid 0 does not exist, however, corresponding bit is always set in
+	 * ->pidmap[0].page, so we should skip it.
+	 */
+	if (start == 0)
+		start = 1;
+
+	if (start > pid_ns->last_pid)
+		return 0;
+
+	has_perms = pidmap_perm(pid_ns);
+
+	start_page = start / BITS_PER_PAGE;
+	start_elem = (start % BITS_PER_PAGE) / BITS_PER_LONG;
+	mask = ~0UL << (start % BITS_PER_LONG);
+
+	for (i = start_page; i < PIDMAP_ENTRIES; i++) {
+		unsigned int j;
+
+		/*
+		 * ->pidmap[].page is set once to a valid pointer,
+		 *  therefore do not take any locks.
+		 */
+		if (!pid_ns->pidmap[i].page)
+			continue;
+
+		for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) {
+			unsigned long val;
+
+			val = *((unsigned long *)pid_ns->pidmap[i].page + j);
+			val &= mask;
+			mask = ~0UL;
+			while (val != 0) {
+				struct task_struct *task;
+
+				if (last_pos == count)
+					return last_pos;
+
+				last_set_pid = i * BITS_PER_PAGE +
+					j * BITS_PER_LONG + __ffs(val);
+
+				rcu_read_lock();
+				task = find_task_by_pid_ns(last_set_pid, pid_ns);
+				if (skip_task(task, has_perms, flags)) {
+					rcu_read_unlock();
+					goto next;
+				}
+				rcu_read_unlock();
+
+				if (put_user(last_set_pid, pids + last_pos))
+					return -EFAULT;
+				last_pos++;
+				if (last_set_pid == pid_ns->last_pid)
+					return last_pos;
+next:
+				val &= (val - 1);
+			}
+		}
+		start_elem = 0;
+	}
+	if (last_set_pid == 0)
+		return 0;
+	else
+		return last_pos;
+}
+
+static struct task_struct *pidmap_get_task(pid_t pid, bool *has_perms)
+{
+	struct pid_namespace *pid_ns;
+	struct task_struct *task;
+
+	if (pid == 0) {
+		*has_perms = true;
+		return current;
+	}
+
+	pid_ns = task_active_pid_ns(current);
+	task = find_task_by_pid_ns(pid, pid_ns);
+	if (!task)
+		return ERR_PTR(-ESRCH);
+	*has_perms = pidmap_perm(pid_ns);
+	if (!*has_perms && !ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+		return ERR_PTR(-EACCES);
+	return task;
+}
+
+static long pidmap_children(pid_t pid, int __user *upid,
+			    unsigned int count, unsigned int start)
+{
+	struct task_struct *task, *child;
+	bool has_perms;
+	int pids[64];
+	unsigned int i;
+	unsigned int ret;
+
+	rcu_read_lock();
+	task = pidmap_get_task(pid, &has_perms);
+	if (IS_ERR(task)) {
+		rcu_read_unlock();
+		return PTR_ERR(task);
+	}
+
+	i = 0;
+	ret = 0;
+	list_for_each_entry(child, &task->children, sibling) {
+		if (start) {
+			start--;
+			continue;
+		}
+
+		if (!has_perms &&
+		    !ptrace_may_access(child, PTRACE_MODE_READ_FSCREDS))
+			continue;
+
+		pids[i++] = child->tgid;
+		if (i >= ARRAY_SIZE(pids)) {
+			get_task_struct(task);
+			get_task_struct(child);
+			rcu_read_unlock();
+
+			if (copy_to_user(upid, pids, i * sizeof(int))) {
+				put_task_struct(child);
+				put_task_struct(task);
+				return -EFAULT;
+			}
+			upid += i;
+			ret += i;
+			i = 0;
+
+			rcu_read_lock();
+			put_task_struct(child);
+			put_task_struct(task);
+
+			if (!pid_alive(task) || !pid_alive(child))
+				break;
+		}
+		if (--count == 0)
+			break;
+	}
+	rcu_read_unlock();
+	if (i > 0) {
+		if (copy_to_user(upid, pids, i * sizeof(int)))
+			return -EFAULT;
+		ret += i;
+	}
+	return ret;
+}
+
+static long pidmap_threads(pid_t pid, int __user *upid,
+			   unsigned int count, unsigned int start)
+{
+	struct task_struct *task, *thread;
+	bool has_perms;
+	int pids[64];
+	unsigned int i;
+	unsigned int ret;
+
+	rcu_read_lock();
+	task = pidmap_get_task(pid, &has_perms);
+	if (IS_ERR(task)) {
+		rcu_read_unlock();
+		return PTR_ERR(task);
+	}
+
+	i = 0;
+	ret = 0;
+	for_each_thread(task, thread) {
+		if (start) {
+			start--;
+			continue;
+		}
+
+		pids[i++] = thread->pid;
+		if (i >= ARRAY_SIZE(pids)) {
+			get_task_struct(task);
+			get_task_struct(thread);
+			rcu_read_unlock();
+
+			if (copy_to_user(upid, pids, i * sizeof(int))) {
+				put_task_struct(thread);
+				put_task_struct(task);
+				return -EFAULT;
+			}
+			upid += i;
+			ret += i;
+			i = 0;
+
+			rcu_read_lock();
+			put_task_struct(thread);
+			put_task_struct(task);
+
+			if (!pid_alive(task) || !pid_alive(thread))
+				break;
+		}
+		if (--count == 0)
+			break;
+	}
+	rcu_read_unlock();
+	if (i > 0) {
+		if (copy_to_user(upid, pids, i * sizeof(int)))
+			return -EFAULT;
+		ret += i;
+	}
+	return ret;
+}
+
+/**
+ * pidmap - get allocated PIDs
+ * @pids: destination buffer.
+ * @count: number of elements in the buffer.
+ * @start: PID to start from or PIDs number already readed.
+ * @flags: flags.
+ *
+ * Write allocated PIDs to a buffer. @start specifies PID to start from
+ * with PIDMAP_TASKS or PIDMAP_PROC flags, or number of PIDs already
+ * readed otherwise.
+ *
+ * PIDs are filled from pid namespace of the calling process POV:
+ * unshare(CLONE_NEWPID)+fork+pidmap in child will always return 1/1.
+ *
+ * pidmap(2) hides PIDs inaccessible at /proc mounted with "hidepid" option.
+ *
+ * Note, pidmap(2) does not guarantee that any of returned PID exists
+ * by the time system call exits.
+ *
+ * Return: number of PIDs written to the buffer or error code otherwise.
+ */
+SYSCALL_DEFINE5(pidmap, pid_t, pid, int __user *, pids,
+		unsigned int, count, unsigned int, start, int, flags)
+{
+	int param = flags & PIDMAP_PARAM;
+
+	switch (param) {
+	case PIDMAP_TASKS:
+	case PIDMAP_PROC:
+		return pidmap_tasks(pids, count, start, flags);
+	case PIDMAP_CHILDREN:
+		return pidmap_children(pid, pids, count, start);
+	case PIDMAP_THREADS:
+		return pidmap_threads(pid, pids, count, start);
+	}
+	return -EINVAL;
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d61fa27d021e..a600d458c1d9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -260,3 +260,4 @@ cond_syscall(sys_pkey_alloc);
 cond_syscall(sys_pkey_free);
 
 cond_syscall(sys_fdmap);
+cond_syscall(sys_pidmap);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index e8d63c27c865..4d1443a83121 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -21,6 +21,7 @@ TARGETS += mount
 TARGETS += mqueue
 TARGETS += net
 TARGETS += nsfs
+TARGETS += pidmap
 TARGETS += powerpc
 TARGETS += pstore
 TARGETS += ptrace
diff --git a/tools/testing/selftests/pidmap/.gitignore b/tools/testing/selftests/pidmap/.gitignore
new file mode 100644
index 000000000000..a762199f2637
--- /dev/null
+++ b/tools/testing/selftests/pidmap/.gitignore
@@ -0,0 +1 @@
+pidmap
diff --git a/tools/testing/selftests/pidmap/Makefile b/tools/testing/selftests/pidmap/Makefile
new file mode 100644
index 000000000000..3deae4ef7295
--- /dev/null
+++ b/tools/testing/selftests/pidmap/Makefile
@@ -0,0 +1,5 @@
+CFLAGS = -Wall
+
+TEST_GEN_PROGS := pidmap
+
+include ../lib.mk
diff --git a/tools/testing/selftests/pidmap/pidmap.c b/tools/testing/selftests/pidmap/pidmap.c
new file mode 100644
index 000000000000..76a9ec57d466
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.c
@@ -0,0 +1,298 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <dirent.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <signal.h>
+#include <assert.h>
+#include "pidmap.h"
+#include "../kselftest_harness.h"
+
+#define SIZE 512
+
+static inline long pidmap(pid_t pid, int *pids, unsigned int count,
+			  unsigned int start_pid, int flags)
+{
+	long ret;
+
+	register long r10 asm("r10") = start_pid;
+	register long r8 asm("r8") = flags;
+
+	asm volatile ("syscall" : "=a"(ret) :
+		"0"(334), "D"(pid), "S"(pids), "d"(count), "r"(r10), "r"(r8) :
+		"rcx", "r11", "cc", "memory");
+	return ret;
+}
+
+static int compare(const void *a, const void *b)
+{
+	return *((int *)a) > *((int *)b);
+}
+
+int pidmap_full(int **pid, unsigned int *res_count)
+{
+	int n;
+	int start_pid = 1;
+	*pid = (int *)malloc(SIZE * sizeof(int));
+	*res_count = 0;
+
+	while ((n = pidmap(0, *pid + *res_count, SIZE, start_pid,
+			   PIDMAP_TASKS)) > 0) {
+		*res_count += n;
+		*pid = (int *)realloc(*pid, (*res_count + SIZE) * sizeof(int));
+		start_pid = (*pid)[*res_count - 1] + 1;
+	}
+	return n;
+}
+
+int pidmap_proc(int **pid, unsigned int *n)
+{
+	DIR *dir = opendir("/proc");
+	struct dirent *dirs;
+
+	*n = 0;
+	*pid = NULL;
+
+	while ((dirs = readdir(dir))) {
+		char dname[32] = "";
+		DIR *task_dir;
+
+		if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9')
+			continue;
+
+		strcpy(dname, "/proc/");
+		strcat(dname, dirs->d_name);
+		strcat(dname, "/task");
+		task_dir = opendir(dname);
+
+		if (task_dir) {
+			struct dirent *task_dirs;
+
+			while ((task_dirs = readdir(task_dir))) {
+				if (task_dirs->d_name[0] < '0' ||
+						task_dirs->d_name[0] > '9')
+					continue;
+
+				*pid = (int *)realloc(*pid, (*n + 1) *
+								sizeof(int));
+				if (*pid == NULL)
+					return -1;
+				*(*pid + *n) = atoi(task_dirs->d_name);
+				*n += 1;
+			}
+		} else {
+			*pid = (int *)realloc(*pid, (*n + 1) * sizeof(int));
+			if (*pid == NULL)
+				return -1;
+			*(*pid + *n) = atoi(dirs->d_name);
+			*n += 1;
+		}
+		closedir(task_dir);
+	}
+	closedir(dir);
+	return 0;
+}
+
+TEST(bufsize)
+{
+	int pid[SIZE];
+
+	EXPECT_EQ(0, pidmap(0, pid, 0, 1, PIDMAP_TASKS));
+}
+
+TEST(get_pid)
+{
+	int pid;
+	int ret;
+
+	ret = pidmap(0, &pid, 1, getpid(), PIDMAP_TASKS);
+	ASSERT_LE(0, ret);
+	EXPECT_EQ(getpid(), pid);
+}
+
+TEST(bad_start)
+{
+	int pid[SIZE];
+
+	ASSERT_LE(0, pidmap(0, pid, SIZE, -1, PIDMAP_TASKS));
+	ASSERT_LE(0, pidmap(0, pid, SIZE, ~0U, PIDMAP_TASKS));
+	ASSERT_LE(0, pidmap(0, pid, SIZE, 0, PIDMAP_TASKS));
+	EXPECT_EQ(1, pid[0]);
+}
+
+TEST(child_pid)
+{
+	pid_t pid = fork();
+
+	if (pid == 0)
+		pause();
+	else {
+		int ret;
+		int result = 0;
+
+		ret = pidmap(0, &result, 1, pid, PIDMAP_TASKS);
+		EXPECT_LE(0, ret);
+		EXPECT_EQ(pid, result);
+		kill(pid, SIGTERM);
+	}
+}
+
+TEST(pidmap_children_flag)
+{
+	int real_pids[SIZE], pids[SIZE];
+	int i;
+
+	for (i = 0; i < SIZE; i++) {
+		pid_t pid = fork();
+		if (!pid) {
+			pause();
+			exit(0);
+		} else if (pid < 0) {
+			perror("fork");
+			exit(1);
+		}
+		real_pids[i] = pid;
+	}
+
+	ASSERT_EQ(SIZE, pidmap(0, pids, SIZE, 0, PIDMAP_CHILDREN));
+	for (i = 0; i < SIZE; i++) {
+		ASSERT_EQ(real_pids[i], pids[i]);
+		kill(real_pids[i], SIGKILL);
+	}
+}
+
+int write_pidmax(int new_pidmax)
+{
+	char old_pidmax[32];
+	char new[32];
+	int fd = open("/proc/sys/kernel/pid_max", O_RDWR);
+
+	if (read(fd, old_pidmax, 32) <= 0)
+		printf("Read failed\n");
+	lseek(fd, 0, 0);
+	snprintf(new, sizeof(new), "%d", new_pidmax);
+	if (write(fd, new, strlen(new)) <= 0)
+		printf("Write failed\n");
+	close(fd);
+	return atoi(old_pidmax);
+}
+
+void do_forks(unsigned int n)
+{
+	while (n--) {
+		pid_t pid = fork();
+
+		if (pid == 0)
+			exit(0);
+		waitpid(pid, NULL, 0);
+	}
+}
+
+TEST(pid_max)
+{
+	int *pid;
+	unsigned int n;
+	int ret, p;
+	int old_pidmax;
+
+	old_pidmax = write_pidmax(50000);
+
+	do_forks(40000);
+
+	p = fork();
+
+	if (p == 0)
+		pause();
+
+	ret = pidmap_full(&pid, &n);
+	kill(p, SIGKILL);
+
+	EXPECT_LE(0, ret);
+	EXPECT_LE(1, n);
+	if (ret < 0 || n <= 0)
+		goto exit;
+	EXPECT_EQ(p, pid[n - 1]);
+exit:
+	write_pidmax(old_pidmax);
+}
+
+void sigquit_h(int sig)
+{
+	assert(sig == SIGQUIT);
+	if (getgid() != getpid())
+		exit(0);
+}
+
+TEST(compare_proc)
+{
+	pid_t pid;
+
+	if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1)
+		return;
+
+	pid = fork();
+
+	if (pid == 0) {
+		pid_t p;
+		int i = 0;
+
+		signal(SIGQUIT, sigquit_h);
+
+		mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("proc", "/proc", "proc",
+			MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
+
+		while (i < 150) {
+			i++;
+
+			p = fork();
+
+			if (p == -1) {
+				umount("/proc");
+				return;
+			}
+			if (p == 0) {
+				pause();
+				return;
+			}
+		}
+
+		int *pids, *pids_proc;
+		unsigned int n = 0;
+		unsigned int n_proc = 0;
+		int ret, ret_proc;
+
+		ret = pidmap_full(&pids, &n);
+
+		ret_proc = pidmap_proc(&pids_proc, &n_proc);
+		qsort(pids_proc, n_proc, sizeof(int), compare);
+
+		EXPECT_LE(0, ret);
+		if (ret < 0 || ret_proc < 0)
+			goto exit;
+
+		EXPECT_EQ(n_proc, n);
+		if (n != n_proc)
+			goto exit;
+
+		for (int i = 0; i < n; i++) {
+			EXPECT_EQ(pids_proc[i], pids[i]);
+			if (pids_proc[i] != pids[i])
+				break;
+		}
+exit:
+		free(pids_proc);
+		free(pids);
+		umount("/proc");
+		kill(-getpid(), SIGQUIT);
+	}
+	wait(NULL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidmap/pidmap.h b/tools/testing/selftests/pidmap/pidmap.h
new file mode 120000
index 000000000000..3abbde34fee9
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.h
@@ -0,0 +1 @@
+../../../../include/uapi/linux/pidmap.h
\ No newline at end of file

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v2 2/2] pidmap(2)
@ 2017-09-24 20:08   ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-24 20:08 UTC (permalink / raw)
  To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w,
	Tatsiana_Brouka-uRwfk40T5oI, Aliaksandr_Patseyenak1-uRwfk40T5oI

From: Tatsiana Brouka <Tatsiana_Brouka-uRwfk40T5oI@public.gmane.org>

Implement system call for bulk retrieveing of pids in binary form.

Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
converting with atoi() + instantiating dentries and inodes.

/proc may be not mounted especially in containers. Natural extension of
hidepid=2 efforts is to not mount /proc at all.

It could be used by programs like ps, top or CRIU. Speed increase will
become more drastic once combined with bulk retrieval of process statistics.

Benchmark:

	N=1<<16 times
	~130 processes (~250 task_structs) on a regular desktop system
	opendir + readdir + closedir /proc + the same for every /proc/$PID/task
	(roughly what htop(1) does) vs pidmap

	/proc 16.80 ± 0.73%
	pidmap 0.06 ± 0.31%

PIDMAP_* flags are modelled after /proc/task_diag patchset.


PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)

NAME
       pidmap - get allocated PIDs

SYNOPSIS
       long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);

DESCRIPTION
       The system call pidmap(2) writes process IDs in buffer pointed by pids.
       At most count pids are written. The pid argument specifies  process  ID
       in  several  values in flags.  If pid equals to zero, syscall will work
       with current process.  The argument start depends on  the  flags.   The
       argument  flags  must include one of the following modes: PIDMAP_TASKS,
       PIDMAP_PROC, PIDMAP_CHILDREN, or PIDMAP_THREADS.  For PIDMAP_TASKS  and
       PIDMAP_PROC exists optional PIDMAP_IGNORE_KTHREADS flag.

       PIDs  are  filled  from  pid  namespace  of  the  calling  process POV:
       unshare(CLONE_NEWPID) + fork + pidmap in child will always return 1/1.

       pidmap(2) hides  PIDs  inaccessible  at  /proc  mounted  with  hide_pid
       option.

       Note,  pidmap(2)  does not guarantee that any of returned PID exists by
       the time system call exists.

       Full list of flags and options is below:

       PIDMAP_TASKS
              Get PIDs of all tasks, including  threads  starting  from  start
              inclusive. First argument pid will be ignored.

       PIDMAP_PROC
              Get  all  process IDs starting from start inclusive. First argu‐
              ment pid will be ignored.

       PIDMAP_CHILDREN
              Get children IDs of  the  process  specified  by  pid  argument.
              start  argument  specifies  number  of  children to skip in this
              case.

       PIDMAP_THREADS
              Get threads IDs of the process specified by pid argument.  start
              argument specifies number of threads to skip in this case.

       PIDMAP_IGNORE_KTHREADS
              Ignore  kernel  threads.  Optional  and  will  be  ignored  with
              PIDMAP_CHILDREN and PIDMAP_THREADS flags.

RETURN VALUE
       On success, number of PIDs read is returned. Otherwise, error  code  is
       returned.

ERRORS
       ESRCH  No such process.

       EACCES Permission denied.

       EFAULT Invalid pids pointer.

       EINVAL Invalid flags value.

NOTES
       Glibc  does  not  provide a wrapper for this system call; call it using
       syscall(2).

EXAMPLE
       #include <stdio.h>
       #include <linux/pidmap.h>

       static inline long pidmap(int pid, int *pids, unsigned int n, unsigned int start, int flags)
       {
               register long r10 asm("r10") = start;
               register long r8 asm("r8") = flags;
               long ret;
               asm volatile (
                       "syscall"
                       : "=a" (ret)
                       : "0" (334), "D" (pid), "S" (pids), "d" (n), "r" (r10), "r" (r8)
                       : "rcx", "r11", "cc", "memory"
               );
               return ret;
       }

       int main(void)
       {
               int pids[5];
               unsigned int start;
               unsigned int i;
               int n;

               start = 0;
               while ((n = pidmap(0, pids, sizeof(pids)/sizeof(pids[0]),
                            start, PIDMAP_PROC | PIDMAP_IGNORE_KTHREADS)) > 0) {

                       for (i = 0; i < n; i++)
                               printf("%d ", pids[i]);
                       printf("\n");

                       start = pids[n - 1] + 1;
               }
               return 0;
       }

Linux                             2017-09-21                         PIDMAP(2)

Changelog:

	CONFIG_PIDMAP option
	PIDMAP_* options
	PIDMAP_IGNORE_KTHREADS
	manpage

Signed-off-by: Tatsiana Brouka <Tatsiana_Brouka-uRwfk40T5oI@public.gmane.org>
Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
Signed-off-by: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl    |   1 +
 include/linux/syscalls.h                  |   5 +
 include/uapi/linux/pidmap.h               |  10 +
 init/Kconfig                              |   7 +
 kernel/Makefile                           |   2 +
 kernel/pidmap.c                           | 287 ++++++++++++++++++++++++++++
 kernel/sys_ni.c                           |   1 +
 tools/testing/selftests/Makefile          |   1 +
 tools/testing/selftests/pidmap/.gitignore |   1 +
 tools/testing/selftests/pidmap/Makefile   |   5 +
 tools/testing/selftests/pidmap/pidmap.c   | 298 ++++++++++++++++++++++++++++++
 tools/testing/selftests/pidmap/pidmap.h   |   1 +
 12 files changed, 619 insertions(+)
 create mode 100644 include/uapi/linux/pidmap.h
 create mode 100644 kernel/pidmap.c
 create mode 100644 tools/testing/selftests/pidmap/.gitignore
 create mode 100644 tools/testing/selftests/pidmap/Makefile
 create mode 100644 tools/testing/selftests/pidmap/pidmap.c
 create mode 120000 tools/testing/selftests/pidmap/pidmap.h

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9bfe5f79674f..8ce611f14969 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -340,6 +340,7 @@
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
 333	common	fdmap			sys_fdmap
+334	common	pidmap			sys_pidmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d393d844facb..cc1ef71dbb4a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -939,4 +939,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
 			  int start_fd, int flags);
 
+asmlinkage long sys_pidmap(pid_t pid,
+			   int __user *pids,
+			   unsigned int pids_count,
+			   unsigned int start_pid,
+			   int flags);
 #endif
diff --git a/include/uapi/linux/pidmap.h b/include/uapi/linux/pidmap.h
new file mode 100644
index 000000000000..75a7557c22eb
--- /dev/null
+++ b/include/uapi/linux/pidmap.h
@@ -0,0 +1,10 @@
+#ifndef _UAPI_LINUX_PIDMAP_H
+#define _UAPI_LINUX_PIDMAP_H
+
+#define PIDMAP_TASKS	1
+#define PIDMAP_PROC	2
+#define PIDMAP_CHILDREN	3
+#define PIDMAP_THREADS	4
+#define PIDMAP_IGNORE_KTHREADS	(1 << 30)
+
+#endif /* _UAPI_LINUX_PIDMAP_H */
diff --git a/init/Kconfig b/init/Kconfig
index 952d13b7326d..163155e0cfb4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1407,6 +1407,13 @@ config FDMAP
 	  Enable fdmap() system call that allows to query file descriptors
 	  in binary form avoiding /proc overhead.
 
+config PIDMAP
+	bool "pidmap() system call" if EXPERT
+	default y
+	help
+	  Enable pidmap() system call that allows to query PIDs in binary form
+	  avoiding /proc overhead.
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/Makefile b/kernel/Makefile
index ed470aac53da..f8833e5b27e5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,8 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-$(CONFIG_PIDMAP) += pidmap.o
+
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/pidmap.c b/kernel/pidmap.c
new file mode 100644
index 000000000000..0392bc6935b6
--- /dev/null
+++ b/kernel/pidmap.c
@@ -0,0 +1,287 @@
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/kernel.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/rcupdate.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/pidmap.h>
+
+#define PIDMAP_PARAM	(~PIDMAP_IGNORE_KTHREADS)
+
+static inline bool pidmap_perm(const struct pid_namespace *pid_ns)
+{
+	return pid_ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(pid_ns->pid_gid);
+}
+
+static bool skip_task(struct task_struct *task, bool has_perms, int flags)
+{
+	int param = flags & PIDMAP_PARAM;
+
+	if (!task)
+		return true;
+	if (!has_perms && !ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+		return true;
+	if ((flags & PIDMAP_IGNORE_KTHREADS) && (task->flags & PF_KTHREAD))
+		return true;
+	if (param == PIDMAP_PROC && !thread_group_leader(task))
+		return true;
+	return false;
+}
+
+static long pidmap_tasks(int __user *pids, unsigned int count,
+		  unsigned int start, int flags)
+{
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
+	unsigned int start_page, start_elem;
+	unsigned int last_pos = 0;
+	unsigned int last_set_pid = 0;
+	unsigned long mask;
+	bool has_perms;
+	unsigned int i;
+
+	/*
+	 * Pid 0 does not exist, however, corresponding bit is always set in
+	 * ->pidmap[0].page, so we should skip it.
+	 */
+	if (start == 0)
+		start = 1;
+
+	if (start > pid_ns->last_pid)
+		return 0;
+
+	has_perms = pidmap_perm(pid_ns);
+
+	start_page = start / BITS_PER_PAGE;
+	start_elem = (start % BITS_PER_PAGE) / BITS_PER_LONG;
+	mask = ~0UL << (start % BITS_PER_LONG);
+
+	for (i = start_page; i < PIDMAP_ENTRIES; i++) {
+		unsigned int j;
+
+		/*
+		 * ->pidmap[].page is set once to a valid pointer,
+		 *  therefore do not take any locks.
+		 */
+		if (!pid_ns->pidmap[i].page)
+			continue;
+
+		for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) {
+			unsigned long val;
+
+			val = *((unsigned long *)pid_ns->pidmap[i].page + j);
+			val &= mask;
+			mask = ~0UL;
+			while (val != 0) {
+				struct task_struct *task;
+
+				if (last_pos == count)
+					return last_pos;
+
+				last_set_pid = i * BITS_PER_PAGE +
+					j * BITS_PER_LONG + __ffs(val);
+
+				rcu_read_lock();
+				task = find_task_by_pid_ns(last_set_pid, pid_ns);
+				if (skip_task(task, has_perms, flags)) {
+					rcu_read_unlock();
+					goto next;
+				}
+				rcu_read_unlock();
+
+				if (put_user(last_set_pid, pids + last_pos))
+					return -EFAULT;
+				last_pos++;
+				if (last_set_pid == pid_ns->last_pid)
+					return last_pos;
+next:
+				val &= (val - 1);
+			}
+		}
+		start_elem = 0;
+	}
+	if (last_set_pid == 0)
+		return 0;
+	else
+		return last_pos;
+}
+
+static struct task_struct *pidmap_get_task(pid_t pid, bool *has_perms)
+{
+	struct pid_namespace *pid_ns;
+	struct task_struct *task;
+
+	if (pid == 0) {
+		*has_perms = true;
+		return current;
+	}
+
+	pid_ns = task_active_pid_ns(current);
+	task = find_task_by_pid_ns(pid, pid_ns);
+	if (!task)
+		return ERR_PTR(-ESRCH);
+	*has_perms = pidmap_perm(pid_ns);
+	if (!*has_perms && !ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+		return ERR_PTR(-EACCES);
+	return task;
+}
+
+static long pidmap_children(pid_t pid, int __user *upid,
+			    unsigned int count, unsigned int start)
+{
+	struct task_struct *task, *child;
+	bool has_perms;
+	int pids[64];
+	unsigned int i;
+	unsigned int ret;
+
+	rcu_read_lock();
+	task = pidmap_get_task(pid, &has_perms);
+	if (IS_ERR(task)) {
+		rcu_read_unlock();
+		return PTR_ERR(task);
+	}
+
+	i = 0;
+	ret = 0;
+	list_for_each_entry(child, &task->children, sibling) {
+		if (start) {
+			start--;
+			continue;
+		}
+
+		if (!has_perms &&
+		    !ptrace_may_access(child, PTRACE_MODE_READ_FSCREDS))
+			continue;
+
+		pids[i++] = child->tgid;
+		if (i >= ARRAY_SIZE(pids)) {
+			get_task_struct(task);
+			get_task_struct(child);
+			rcu_read_unlock();
+
+			if (copy_to_user(upid, pids, i * sizeof(int))) {
+				put_task_struct(child);
+				put_task_struct(task);
+				return -EFAULT;
+			}
+			upid += i;
+			ret += i;
+			i = 0;
+
+			rcu_read_lock();
+			put_task_struct(child);
+			put_task_struct(task);
+
+			if (!pid_alive(task) || !pid_alive(child))
+				break;
+		}
+		if (--count == 0)
+			break;
+	}
+	rcu_read_unlock();
+	if (i > 0) {
+		if (copy_to_user(upid, pids, i * sizeof(int)))
+			return -EFAULT;
+		ret += i;
+	}
+	return ret;
+}
+
+static long pidmap_threads(pid_t pid, int __user *upid,
+			   unsigned int count, unsigned int start)
+{
+	struct task_struct *task, *thread;
+	bool has_perms;
+	int pids[64];
+	unsigned int i;
+	unsigned int ret;
+
+	rcu_read_lock();
+	task = pidmap_get_task(pid, &has_perms);
+	if (IS_ERR(task)) {
+		rcu_read_unlock();
+		return PTR_ERR(task);
+	}
+
+	i = 0;
+	ret = 0;
+	for_each_thread(task, thread) {
+		if (start) {
+			start--;
+			continue;
+		}
+
+		pids[i++] = thread->pid;
+		if (i >= ARRAY_SIZE(pids)) {
+			get_task_struct(task);
+			get_task_struct(thread);
+			rcu_read_unlock();
+
+			if (copy_to_user(upid, pids, i * sizeof(int))) {
+				put_task_struct(thread);
+				put_task_struct(task);
+				return -EFAULT;
+			}
+			upid += i;
+			ret += i;
+			i = 0;
+
+			rcu_read_lock();
+			put_task_struct(thread);
+			put_task_struct(task);
+
+			if (!pid_alive(task) || !pid_alive(thread))
+				break;
+		}
+		if (--count == 0)
+			break;
+	}
+	rcu_read_unlock();
+	if (i > 0) {
+		if (copy_to_user(upid, pids, i * sizeof(int)))
+			return -EFAULT;
+		ret += i;
+	}
+	return ret;
+}
+
+/**
+ * pidmap - get allocated PIDs
+ * @pids: destination buffer.
+ * @count: number of elements in the buffer.
+ * @start: PID to start from or PIDs number already readed.
+ * @flags: flags.
+ *
+ * Write allocated PIDs to a buffer. @start specifies PID to start from
+ * with PIDMAP_TASKS or PIDMAP_PROC flags, or number of PIDs already
+ * readed otherwise.
+ *
+ * PIDs are filled from pid namespace of the calling process POV:
+ * unshare(CLONE_NEWPID)+fork+pidmap in child will always return 1/1.
+ *
+ * pidmap(2) hides PIDs inaccessible at /proc mounted with "hidepid" option.
+ *
+ * Note, pidmap(2) does not guarantee that any of returned PID exists
+ * by the time system call exits.
+ *
+ * Return: number of PIDs written to the buffer or error code otherwise.
+ */
+SYSCALL_DEFINE5(pidmap, pid_t, pid, int __user *, pids,
+		unsigned int, count, unsigned int, start, int, flags)
+{
+	int param = flags & PIDMAP_PARAM;
+
+	switch (param) {
+	case PIDMAP_TASKS:
+	case PIDMAP_PROC:
+		return pidmap_tasks(pids, count, start, flags);
+	case PIDMAP_CHILDREN:
+		return pidmap_children(pid, pids, count, start);
+	case PIDMAP_THREADS:
+		return pidmap_threads(pid, pids, count, start);
+	}
+	return -EINVAL;
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d61fa27d021e..a600d458c1d9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -260,3 +260,4 @@ cond_syscall(sys_pkey_alloc);
 cond_syscall(sys_pkey_free);
 
 cond_syscall(sys_fdmap);
+cond_syscall(sys_pidmap);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index e8d63c27c865..4d1443a83121 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -21,6 +21,7 @@ TARGETS += mount
 TARGETS += mqueue
 TARGETS += net
 TARGETS += nsfs
+TARGETS += pidmap
 TARGETS += powerpc
 TARGETS += pstore
 TARGETS += ptrace
diff --git a/tools/testing/selftests/pidmap/.gitignore b/tools/testing/selftests/pidmap/.gitignore
new file mode 100644
index 000000000000..a762199f2637
--- /dev/null
+++ b/tools/testing/selftests/pidmap/.gitignore
@@ -0,0 +1 @@
+pidmap
diff --git a/tools/testing/selftests/pidmap/Makefile b/tools/testing/selftests/pidmap/Makefile
new file mode 100644
index 000000000000..3deae4ef7295
--- /dev/null
+++ b/tools/testing/selftests/pidmap/Makefile
@@ -0,0 +1,5 @@
+CFLAGS = -Wall
+
+TEST_GEN_PROGS := pidmap
+
+include ../lib.mk
diff --git a/tools/testing/selftests/pidmap/pidmap.c b/tools/testing/selftests/pidmap/pidmap.c
new file mode 100644
index 000000000000..76a9ec57d466
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.c
@@ -0,0 +1,298 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <dirent.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <signal.h>
+#include <assert.h>
+#include "pidmap.h"
+#include "../kselftest_harness.h"
+
+#define SIZE 512
+
+static inline long pidmap(pid_t pid, int *pids, unsigned int count,
+			  unsigned int start_pid, int flags)
+{
+	long ret;
+
+	register long r10 asm("r10") = start_pid;
+	register long r8 asm("r8") = flags;
+
+	asm volatile ("syscall" : "=a"(ret) :
+		"0"(334), "D"(pid), "S"(pids), "d"(count), "r"(r10), "r"(r8) :
+		"rcx", "r11", "cc", "memory");
+	return ret;
+}
+
+static int compare(const void *a, const void *b)
+{
+	return *((int *)a) > *((int *)b);
+}
+
+int pidmap_full(int **pid, unsigned int *res_count)
+{
+	int n;
+	int start_pid = 1;
+	*pid = (int *)malloc(SIZE * sizeof(int));
+	*res_count = 0;
+
+	while ((n = pidmap(0, *pid + *res_count, SIZE, start_pid,
+			   PIDMAP_TASKS)) > 0) {
+		*res_count += n;
+		*pid = (int *)realloc(*pid, (*res_count + SIZE) * sizeof(int));
+		start_pid = (*pid)[*res_count - 1] + 1;
+	}
+	return n;
+}
+
+int pidmap_proc(int **pid, unsigned int *n)
+{
+	DIR *dir = opendir("/proc");
+	struct dirent *dirs;
+
+	*n = 0;
+	*pid = NULL;
+
+	while ((dirs = readdir(dir))) {
+		char dname[32] = "";
+		DIR *task_dir;
+
+		if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9')
+			continue;
+
+		strcpy(dname, "/proc/");
+		strcat(dname, dirs->d_name);
+		strcat(dname, "/task");
+		task_dir = opendir(dname);
+
+		if (task_dir) {
+			struct dirent *task_dirs;
+
+			while ((task_dirs = readdir(task_dir))) {
+				if (task_dirs->d_name[0] < '0' ||
+						task_dirs->d_name[0] > '9')
+					continue;
+
+				*pid = (int *)realloc(*pid, (*n + 1) *
+								sizeof(int));
+				if (*pid == NULL)
+					return -1;
+				*(*pid + *n) = atoi(task_dirs->d_name);
+				*n += 1;
+			}
+		} else {
+			*pid = (int *)realloc(*pid, (*n + 1) * sizeof(int));
+			if (*pid == NULL)
+				return -1;
+			*(*pid + *n) = atoi(dirs->d_name);
+			*n += 1;
+		}
+		closedir(task_dir);
+	}
+	closedir(dir);
+	return 0;
+}
+
+TEST(bufsize)
+{
+	int pid[SIZE];
+
+	EXPECT_EQ(0, pidmap(0, pid, 0, 1, PIDMAP_TASKS));
+}
+
+TEST(get_pid)
+{
+	int pid;
+	int ret;
+
+	ret = pidmap(0, &pid, 1, getpid(), PIDMAP_TASKS);
+	ASSERT_LE(0, ret);
+	EXPECT_EQ(getpid(), pid);
+}
+
+TEST(bad_start)
+{
+	int pid[SIZE];
+
+	ASSERT_LE(0, pidmap(0, pid, SIZE, -1, PIDMAP_TASKS));
+	ASSERT_LE(0, pidmap(0, pid, SIZE, ~0U, PIDMAP_TASKS));
+	ASSERT_LE(0, pidmap(0, pid, SIZE, 0, PIDMAP_TASKS));
+	EXPECT_EQ(1, pid[0]);
+}
+
+TEST(child_pid)
+{
+	pid_t pid = fork();
+
+	if (pid == 0)
+		pause();
+	else {
+		int ret;
+		int result = 0;
+
+		ret = pidmap(0, &result, 1, pid, PIDMAP_TASKS);
+		EXPECT_LE(0, ret);
+		EXPECT_EQ(pid, result);
+		kill(pid, SIGTERM);
+	}
+}
+
+TEST(pidmap_children_flag)
+{
+	int real_pids[SIZE], pids[SIZE];
+	int i;
+
+	for (i = 0; i < SIZE; i++) {
+		pid_t pid = fork();
+		if (!pid) {
+			pause();
+			exit(0);
+		} else if (pid < 0) {
+			perror("fork");
+			exit(1);
+		}
+		real_pids[i] = pid;
+	}
+
+	ASSERT_EQ(SIZE, pidmap(0, pids, SIZE, 0, PIDMAP_CHILDREN));
+	for (i = 0; i < SIZE; i++) {
+		ASSERT_EQ(real_pids[i], pids[i]);
+		kill(real_pids[i], SIGKILL);
+	}
+}
+
+int write_pidmax(int new_pidmax)
+{
+	char old_pidmax[32];
+	char new[32];
+	int fd = open("/proc/sys/kernel/pid_max", O_RDWR);
+
+	if (read(fd, old_pidmax, 32) <= 0)
+		printf("Read failed\n");
+	lseek(fd, 0, 0);
+	snprintf(new, sizeof(new), "%d", new_pidmax);
+	if (write(fd, new, strlen(new)) <= 0)
+		printf("Write failed\n");
+	close(fd);
+	return atoi(old_pidmax);
+}
+
+void do_forks(unsigned int n)
+{
+	while (n--) {
+		pid_t pid = fork();
+
+		if (pid == 0)
+			exit(0);
+		waitpid(pid, NULL, 0);
+	}
+}
+
+TEST(pid_max)
+{
+	int *pid;
+	unsigned int n;
+	int ret, p;
+	int old_pidmax;
+
+	old_pidmax = write_pidmax(50000);
+
+	do_forks(40000);
+
+	p = fork();
+
+	if (p == 0)
+		pause();
+
+	ret = pidmap_full(&pid, &n);
+	kill(p, SIGKILL);
+
+	EXPECT_LE(0, ret);
+	EXPECT_LE(1, n);
+	if (ret < 0 || n <= 0)
+		goto exit;
+	EXPECT_EQ(p, pid[n - 1]);
+exit:
+	write_pidmax(old_pidmax);
+}
+
+void sigquit_h(int sig)
+{
+	assert(sig == SIGQUIT);
+	if (getgid() != getpid())
+		exit(0);
+}
+
+TEST(compare_proc)
+{
+	pid_t pid;
+
+	if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1)
+		return;
+
+	pid = fork();
+
+	if (pid == 0) {
+		pid_t p;
+		int i = 0;
+
+		signal(SIGQUIT, sigquit_h);
+
+		mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("proc", "/proc", "proc",
+			MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
+
+		while (i < 150) {
+			i++;
+
+			p = fork();
+
+			if (p == -1) {
+				umount("/proc");
+				return;
+			}
+			if (p == 0) {
+				pause();
+				return;
+			}
+		}
+
+		int *pids, *pids_proc;
+		unsigned int n = 0;
+		unsigned int n_proc = 0;
+		int ret, ret_proc;
+
+		ret = pidmap_full(&pids, &n);
+
+		ret_proc = pidmap_proc(&pids_proc, &n_proc);
+		qsort(pids_proc, n_proc, sizeof(int), compare);
+
+		EXPECT_LE(0, ret);
+		if (ret < 0 || ret_proc < 0)
+			goto exit;
+
+		EXPECT_EQ(n_proc, n);
+		if (n != n_proc)
+			goto exit;
+
+		for (int i = 0; i < n; i++) {
+			EXPECT_EQ(pids_proc[i], pids[i]);
+			if (pids_proc[i] != pids[i])
+				break;
+		}
+exit:
+		free(pids_proc);
+		free(pids);
+		umount("/proc");
+		kill(-getpid(), SIGQUIT);
+	}
+	wait(NULL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidmap/pidmap.h b/tools/testing/selftests/pidmap/pidmap.h
new file mode 120000
index 000000000000..3abbde34fee9
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.h
@@ -0,0 +1 @@
+../../../../include/uapi/linux/pidmap.h
\ No newline at end of file

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
  2017-09-24 20:08   ` Alexey Dobriyan
@ 2017-09-24 21:27     ` Andy Lutomirski
  -1 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-24 21:27 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov, Tatsiana Brouka,
	Aliaksandr Patseyenak

On Sun, Sep 24, 2017 at 1:08 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
>
> Implement system call for bulk retrieveing of pids in binary form.
>
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi() + instantiating dentries and inodes.
>
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
>
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.
>
> Benchmark:
>
>         N=1<<16 times
>         ~130 processes (~250 task_structs) on a regular desktop system
>         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
>         (roughly what htop(1) does) vs pidmap
>
>         /proc 16.80 ± 0.73%
>         pidmap 0.06 ± 0.31%
>
> PIDMAP_* flags are modelled after /proc/task_diag patchset.
>
>
> PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)
>
> NAME
>        pidmap - get allocated PIDs
>
> SYNOPSIS
>        long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);

I think we will seriously regret a syscall that does this.  Djalal is
working on fixing the turd that is hidepid, and this syscall is
basically incompatible with ever fixing hidepids.  I think that, to
make it less regrettable, it needs to take an fd to a proc mount as a
parameter.  This makes me wonder why it's a syscall at all -- why not
just create a new file like /proc/pids?

--Andy

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
@ 2017-09-24 21:27     ` Andy Lutomirski
  0 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-24 21:27 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Tatsiana Brouka, Aliaksandr Patseyenak

On Sun, Sep 24, 2017 at 1:08 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> From: Tatsiana Brouka <Tatsiana_Brouka-uRwfk40T5oI@public.gmane.org>
>
> Implement system call for bulk retrieveing of pids in binary form.
>
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi() + instantiating dentries and inodes.
>
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
>
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.
>
> Benchmark:
>
>         N=1<<16 times
>         ~130 processes (~250 task_structs) on a regular desktop system
>         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
>         (roughly what htop(1) does) vs pidmap
>
>         /proc 16.80 ± 0.73%
>         pidmap 0.06 ± 0.31%
>
> PIDMAP_* flags are modelled after /proc/task_diag patchset.
>
>
> PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)
>
> NAME
>        pidmap - get allocated PIDs
>
> SYNOPSIS
>        long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);

I think we will seriously regret a syscall that does this.  Djalal is
working on fixing the turd that is hidepid, and this syscall is
basically incompatible with ever fixing hidepids.  I think that, to
make it less regrettable, it needs to take an fd to a proc mount as a
parameter.  This makes me wonder why it's a syscall at all -- why not
just create a new file like /proc/pids?

--Andy

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-09-24 20:06 ` Alexey Dobriyan
  (?)
  (?)
@ 2017-09-24 21:31 ` Andy Lutomirski
  2017-09-26 18:43     ` Alexey Dobriyan
  -1 siblings, 1 reply; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-24 21:31 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov

On Sun, Sep 24, 2017 at 1:06 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
>
> Implement system call for bulk retrieveing of opened descriptors
> in binary form.
>
> Some daemons could use it to reliably close file descriptors
> before starting. Currently they close everything upto some number
> which formally is not reliable. Other natural users are lsof(1) and CRIU
> (although lsof does so much in /proc that the effect is thoroughly buried).
>
> /proc, the only way to learn anything about file descriptors may not be
> available. There is unavoidable overhead associated with instantiating
> 3 dentries and 3 inodes and converting integers to strings and back.
>
> Benchmark:
>
>         N=1<<22 times
>         4 opened descriptors (0, 1, 2, 3)
>         opendir+readdir+closedir /proc/self/fd vs fdmap
>
>         /proc 8.31 ą 0.37%
>         fdmap 0.32 ą 0.72%

This doesn't have the semantic problem that pidmap does, but I still
wonder why this can't be accomplished by adding a new file in /proc.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-09-24 20:06 ` Alexey Dobriyan
@ 2017-09-25  7:42   ` Michael Kerrisk (man-pages)
  -1 siblings, 0 replies; 51+ messages in thread
From: Michael Kerrisk (man-pages) @ 2017-09-25  7:42 UTC (permalink / raw)
  To: Alexey Dobriyan, akpm
  Cc: mtk.manpages, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey, Aliaksandr Patseyenak

[Not sure why original author is not in CC; added]

Hello Alexey,

On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
> From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> 
> Implement system call for bulk retrieveing of opened descriptors
> in binary form.
> 
> Some daemons could use it to reliably close file descriptors
> before starting. Currently they close everything upto some number
> which formally is not reliable. Other natural users are lsof(1) and CRIU
> (although lsof does so much in /proc that the effect is thoroughly buried).
> 
> /proc, the only way to learn anything about file descriptors may not be
> available. There is unavoidable overhead associated with instantiating
> 3 dentries and 3 inodes and converting integers to strings and back.
> 
> Benchmark:
> 
> 	N=1<<22 times
> 	4 opened descriptors (0, 1, 2, 3)
> 	opendir+readdir+closedir /proc/self/fd vs fdmap
> 
> 	/proc 8.31 ± 0.37%
> 	fdmap 0.32 ± 0.72%

>From the text above, I'm still trying to understand: whose problem
does this solve? I mean, we've lived with the daemon-close-all-files
technique forever (and I'm not sure that performance is really an 
important issue for the daemon case) . And you say that the effect
for lsof(1) will be buried. So, who does this new system call
really help? (Note: I'm not saying don't add the syscall, but from
explanation given here, it's not clear why we should.)

Thanks,

Michael


> FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)
> 
> NAME
>        fdmap - get open file descriptors of the process
> 
> SYNOPSIS
>        long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);
> 
> DESCRIPTION
>        fdmap()  writes  open  file  descriptors  of the process into buffer fd
>        starting from the start descriptor. At most nfd elements  are  written.
>        flags argument is reserved and must be zero.
> 
>        If pid is zero, syscall will work with the current process.
> 
> RETURN VALUE
>        On success, number of descriptors written is returned.  On error, -1 is
>        returned, and errno is set appropriately.
> 
> ERRORS
>        ESRCH  No such process.
> 
>        EACCES Permission denied.
> 
>        EFAULT Invalid fd pointer.
> 
>        EINVAL Negative start argument.
> 
> NOTES
>        Glibc does not provide a wrapper for these system call; call  it  using
>        syscall(2).
> 
> EXAMPLE
>        The program below demonstrates fdmap() usage.
> 
>            $ ./a.out $$
>            0 1 2 255
> 
>            $ ./a.out 42</dev/null 1023</dev/null
>            0 1 2 42
>            1023
> 
>    Program source
> 
>        #include <sys/types.h>
>        #include <stdlib.h>
>        #include <stdio.h>
> 
>        static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
>        {
>             register long r10 asm ("r10") = start;
>             register long r8 asm ("r8") = flags;
>             long rv;
>             asm volatile (
>                  "syscall"
>                  : "=a" (rv)
>                  : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
>                  : "rcx", "r11", "cc", "memory"
>             );
>             return rv;
>        }
> 
>        int main(int argc, char *argv[])
>        {
>             pid_t pid;
>             int fd[4];
>             unsigned int start;
>             int n;
> 
>             pid = 0;
>             if (argc > 1)
>                  pid = atoi(argv[1]);
> 
>             start = 0;
>             while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
>                  unsigned int i;
> 
>                  for (i = 0; i < n; i++)
>                       printf("%u ", fd[i]);
>                  printf("\n");
> 
>                  start = fd[n - 1] + 1;
>             }
>             return 0;
>        }
> 
> Linux                             2017-09-21                          FDMAP(2)
> 
> Changelog:
> 
> 	CONFIG_PIDMAP option
> 	manpage
> 
> 
> Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> 
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
>  fs/Makefile                                |   2 +
>  fs/fdmap.c                                 | 105 ++++++++++++++++++++
>  include/linux/syscalls.h                   |   2 +
>  init/Kconfig                               |   7 ++
>  kernel/sys_ni.c                            |   2 +
>  tools/testing/selftests/Makefile           |   1 +
>  tools/testing/selftests/fdmap/.gitignore   |   1 +
>  tools/testing/selftests/fdmap/Makefile     |   7 ++
>  tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
>  tools/testing/selftests/fdmap/fdmap.h      |  12 +++
>  tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
>  12 files changed, 405 insertions(+)
>  create mode 100644 fs/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/.gitignore
>  create mode 100644 tools/testing/selftests/fdmap/Makefile
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.h
>  create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183e2f85..9bfe5f79674f 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	common	fdmap			sys_fdmap
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/Makefile b/fs/Makefile
> index 7bbaca9c67b1..27476a66c18e 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
>  		pnode.o splice.o sync.o utimes.o \
>  		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
>  
> +obj-$(CONFIG_FDMAP) += fdmap.o
> +
>  ifeq ($(CONFIG_BLOCK),y)
>  obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
>  else
> diff --git a/fs/fdmap.c b/fs/fdmap.c
> new file mode 100644
> index 000000000000..274e6c5b7c9c
> --- /dev/null
> +++ b/fs/fdmap.c
> @@ -0,0 +1,105 @@
> +#include <linux/bitops.h>
> +#include <linux/fdtable.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +
> +/**
> + * fdmap - get opened file descriptors of a process
> + * @pid: the pid of the target process
> + * @fds: allocated userspace buffer
> + * @count: buffer size (in descriptors)
> + * @start_fd: first descriptor to search from (inclusive)
> + * @flags: reserved for future functionality, must be zero
> + *
> + * If @pid is zero then it's current process.
> + * Return: number of descriptors written. An error code otherwise.
> + */
> +SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
> +		int, start_fd, int, flags)
> +{
> +	struct task_struct *task;
> +	struct files_struct *files;
> +	unsigned long search_mask;
> +	unsigned int user_index, offset;
> +	int masksize;
> +
> +	if (start_fd < 0 || flags != 0)
> +		return -EINVAL;
> +
> +	if (!pid) {
> +		files = get_files_struct(current);
> +	} else {
> +		rcu_read_lock();
> +		task = find_task_by_vpid(pid);
> +		if (!task) {
> +			rcu_read_unlock();
> +			return -ESRCH;
> +		}
> +		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> +			rcu_read_unlock();
> +			return -EACCES;
> +		}
> +		files = get_files_struct(task);
> +		rcu_read_unlock();
> +	}
> +	if (!files)
> +		return 0;
> +
> +	offset = start_fd / BITS_PER_LONG;
> +	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
> +	user_index = 0;
> +#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
> +	masksize = FDS_BUF_SIZE;
> +	while (user_index < count && masksize == FDS_BUF_SIZE) {
> +		unsigned long open_fds[FDS_BUF_SIZE];
> +		struct fdtable *fdt;
> +		unsigned int i;
> +
> +		/*
> +		 * fdt->max_fds can grow, get it every time
> +		 * before copying part into internal buffer.
> +		 */
> +		rcu_read_lock();
> +		fdt = files_fdtable(files);
> +		masksize = fdt->max_fds / 8 - offset * sizeof(long);
> +		if (masksize < 0) {
> +			rcu_read_unlock();
> +			break;
> +		}
> +		masksize = min(masksize, (int)sizeof(open_fds));
> +		memcpy(open_fds, fdt->open_fds + offset, masksize);
> +		rcu_read_unlock();
> +
> +		open_fds[0] &= search_mask;
> +		search_mask = ULONG_MAX;
> +		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
> +		start_fd = offset * BITS_PER_LONG;
> +		/*
> +		 * for_each_set_bit_from() can re-read first word
> +		 * multiple times which is not optimal.
> +		 */
> +		for (i = 0; i < masksize; i++) {
> +			unsigned long mask = open_fds[i];
> +
> +			while (mask) {
> +				unsigned int real_fd = start_fd + __ffs(mask);
> +
> +				if (put_user(real_fd, fds + user_index)) {
> +					put_files_struct(files);
> +					return -EFAULT;
> +				}
> +				if (++user_index >= count)
> +					goto out;
> +				mask &= mask - 1;
> +			}
> +			start_fd += BITS_PER_LONG;
> +		}
> +		offset += FDS_BUF_SIZE;
> +	}
> +out:
> +	put_files_struct(files);
> +
> +	return user_index;
> +}
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 95606a2d556f..d393d844facb 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
> +			  int start_fd, int flags);
>  
>  #endif
> diff --git a/init/Kconfig b/init/Kconfig
> index 78cb2461012e..952d13b7326d 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1400,6 +1400,13 @@ config MEMBARRIER
>  
>  	  If unsure, say Y.
>  
> +config FDMAP
> +	bool "fdmap() system call" if EXPERT
> +	default y
> +	help
> +	  Enable fdmap() system call that allows to query file descriptors
> +	  in binary form avoiding /proc overhead.
> +
>  config EMBEDDED
>  	bool "Embedded system"
>  	option allnoconfig_y
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef8576ce9..d61fa27d021e 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
>  cond_syscall(sys_pkey_mprotect);
>  cond_syscall(sys_pkey_alloc);
>  cond_syscall(sys_pkey_free);
> +
> +cond_syscall(sys_fdmap);
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index 26ce4f7168be..e8d63c27c865 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -5,6 +5,7 @@ TARGETS += cpufreq
>  TARGETS += cpu-hotplug
>  TARGETS += efivarfs
>  TARGETS += exec
> +TARGETS += fdmap
>  TARGETS += firmware
>  TARGETS += ftrace
>  TARGETS += futex
> diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
> new file mode 100644
> index 000000000000..9a9bfdac1cc0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/.gitignore
> @@ -0,0 +1 @@
> +fdmap_test
> diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
> new file mode 100644
> index 000000000000..bf9f051f4b63
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/Makefile
> @@ -0,0 +1,7 @@
> +TEST_GEN_PROGS := fdmap_test
> +CFLAGS += -Wall
> +
> +include ../lib.mk
> +
> +$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
> +	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
> diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
> new file mode 100644
> index 000000000000..66725b0201e0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.c
> @@ -0,0 +1,112 @@
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <dirent.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/types.h>
> +#include "fdmap.h"
> +
> +#define	BUF_SIZE	1024
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
> +{
> +	register int64_t r10 asm("r10") = start_fd;
> +	register int64_t r8 asm("r8") = flags;
> +	long ret;
> +
> +	asm volatile (
> +		"syscall"
> +		: "=a"(ret)
> +		: "0" (333),
> +		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
> +		: "rcx", "r11", "cc", "memory"
> +	);
> +	return ret;
> +}
> +
> +int fdmap_full(pid_t pid, int **fds, size_t *n)
> +{
> +	int buf[BUF_SIZE], start_fd = 0;
> +	long ret;
> +
> +	*n = 0;
> +	*fds = NULL;
> +	for (;;) {
> +		int *new_buff;
> +
> +		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
> +		if (ret < 0)
> +			break;
> +		if (!ret)
> +			return 0;
> +
> +		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
> +		if (!new_buff) {
> +			ret = -errno;
> +			break;
> +		}
> +		*fds = new_buff;
> +		memcpy(*fds + *n, buf, ret * sizeof(int));
> +		*n += ret;
> +		start_fd = (*fds)[*n - 1] + 1;
> +	}
> +	free(*fds);
> +	*fds = NULL;
> +	return -ret;
> +}
> +
> +int fdmap_proc(pid_t pid, int **fds, size_t *n)
> +{
> +	char fds_path[20];
> +	int dir_fd = 0;
> +	struct dirent *fd_link;
> +	DIR *fds_dir;
> +
> +	*fds = NULL;
> +	*n = 0;
> +	if (!pid)
> +		strcpy(fds_path, "/proc/self/fd");
> +	else
> +		sprintf(fds_path, "/proc/%d/fd", pid);
> +
> +	fds_dir = opendir(fds_path);
> +	if (!fds_dir)
> +		return errno == ENOENT ? ESRCH : errno;
> +	if (!pid)
> +		dir_fd = dirfd(fds_dir);
> +
> +	while ((fd_link = readdir(fds_dir))) {
> +		if (fd_link->d_name[0] < '0'
> +		    || fd_link->d_name[0] > '9')
> +			continue;
> +		if (*n % BUF_SIZE == 0) {
> +			int *new_buff;
> +
> +			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
> +			if (!new_buff) {
> +				int ret = errno;
> +
> +				free(*fds);
> +				*fds = NULL;
> +				return ret;
> +			}
> +			*fds = new_buff;
> +		}
> +		(*fds)[*n] = atoi(fd_link->d_name);
> +		*n += 1;
> +	}
> +	closedir(fds_dir);
> +
> +	if (!pid) {
> +		size_t i;
> +
> +		for (i = 0; i < *n; i++)
> +			if ((*fds)[i] == dir_fd)
> +				break;
> +		i++;
> +		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
> +		(*n)--;
> +	}
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
> new file mode 100644
> index 000000000000..c501111b2bbd
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.h
> @@ -0,0 +1,12 @@
> +#ifndef FDMAP_H
> +#define FDMAP_H
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
> +int fdmap_full(pid_t pid, int **fds, size_t *n);
> +int fdmap_proc(pid_t pid, int **fds, size_t *n);
> +
> +#endif
> diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
> new file mode 100644
> index 000000000000..6f448406d96a
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap_test.c
> @@ -0,0 +1,153 @@
> +#include <errno.h>
> +#include <syscall.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
> +#include <limits.h>
> +#include "../kselftest_harness.h"
> +#include "fdmap.h"
> +
> +TEST(efault) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EFAULT, errno);
> +}
> +
> +TEST(big_start_fd) {
> +	int fds[1];
> +	int ret;
> +
> +	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
> +	ASSERT_EQ(0, ret);
> +}
> +
> +TEST(einval) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 0, -1, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +
> +	ret = syscall(333, 0, NULL, 0, 0, 1);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +}
> +
> +TEST(esrch) {
> +	int fds[1], ret;
> +	pid_t pid;
> +
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid)
> +		exit(0);
> +	waitpid(pid, NULL, 0);
> +
> +	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(ESRCH, errno);
> +}
> +
> +TEST(simple) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(init) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(1, &fds1, &size1);
> +	ret2 = fdmap_proc(1, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(zero) {
> +	int *fds, i;
> +	size_t size;
> +	int ret;
> +
> +	ret = fdmap_proc(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	for (i = 0; i < size; i++)
> +		close(fds[i]);
> +	free(fds);
> +	fds = NULL;
> +
> +	ret = fdmap_full(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	ASSERT_EQ(0, size);
> +}
> +
> +TEST(more_fds) {
> +	int *fds1, *fds2, ret1, ret2;
> +	size_t size1, size2, i;
> +
> +	struct rlimit rlim = {
> +		.rlim_cur = 600000,
> +		.rlim_max = 600000
> +	};
> +	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
> +	for (int i = 0; i < 500000; i++)
> +		dup(0);
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(child) {
> +	int pipefd[2];
> +	int *fds1, *fds2, ret1, ret2, i;
> +	size_t size1, size2;
> +	char byte = 0;
> +	pid_t pid;
> +
> +	ASSERT_NE(-1, pipe(pipefd));
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid) {
> +		read(pipefd[0], &byte, 1);
> +		close(pipefd[0]);
> +		close(pipefd[1]);
> +		exit(0);
> +	}
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +
> +	write(pipefd[1], &byte, 1);
> +	close(pipefd[0]);
> +	close(pipefd[1]);
> +	waitpid(pid, NULL, 0);
> +}
> +
> +TEST_HARNESS_MAIN
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-25  7:42   ` Michael Kerrisk (man-pages)
  0 siblings, 0 replies; 51+ messages in thread
From: Michael Kerrisk (man-pages) @ 2017-09-25  7:42 UTC (permalink / raw)
  To: Alexey Dobriyan, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w, Aliaksandr Patseyenak

[Not sure why original author is not in CC; added]

Hello Alexey,

On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
> From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> 
> Implement system call for bulk retrieveing of opened descriptors
> in binary form.
> 
> Some daemons could use it to reliably close file descriptors
> before starting. Currently they close everything upto some number
> which formally is not reliable. Other natural users are lsof(1) and CRIU
> (although lsof does so much in /proc that the effect is thoroughly buried).
> 
> /proc, the only way to learn anything about file descriptors may not be
> available. There is unavoidable overhead associated with instantiating
> 3 dentries and 3 inodes and converting integers to strings and back.
> 
> Benchmark:
> 
> 	N=1<<22 times
> 	4 opened descriptors (0, 1, 2, 3)
> 	opendir+readdir+closedir /proc/self/fd vs fdmap
> 
> 	/proc 8.31 ± 0.37%
> 	fdmap 0.32 ± 0.72%

>From the text above, I'm still trying to understand: whose problem
does this solve? I mean, we've lived with the daemon-close-all-files
technique forever (and I'm not sure that performance is really an 
important issue for the daemon case) . And you say that the effect
for lsof(1) will be buried. So, who does this new system call
really help? (Note: I'm not saying don't add the syscall, but from
explanation given here, it's not clear why we should.)

Thanks,

Michael


> FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)
> 
> NAME
>        fdmap - get open file descriptors of the process
> 
> SYNOPSIS
>        long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);
> 
> DESCRIPTION
>        fdmap()  writes  open  file  descriptors  of the process into buffer fd
>        starting from the start descriptor. At most nfd elements  are  written.
>        flags argument is reserved and must be zero.
> 
>        If pid is zero, syscall will work with the current process.
> 
> RETURN VALUE
>        On success, number of descriptors written is returned.  On error, -1 is
>        returned, and errno is set appropriately.
> 
> ERRORS
>        ESRCH  No such process.
> 
>        EACCES Permission denied.
> 
>        EFAULT Invalid fd pointer.
> 
>        EINVAL Negative start argument.
> 
> NOTES
>        Glibc does not provide a wrapper for these system call; call  it  using
>        syscall(2).
> 
> EXAMPLE
>        The program below demonstrates fdmap() usage.
> 
>            $ ./a.out $$
>            0 1 2 255
> 
>            $ ./a.out 42</dev/null 1023</dev/null
>            0 1 2 42
>            1023
> 
>    Program source
> 
>        #include <sys/types.h>
>        #include <stdlib.h>
>        #include <stdio.h>
> 
>        static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
>        {
>             register long r10 asm ("r10") = start;
>             register long r8 asm ("r8") = flags;
>             long rv;
>             asm volatile (
>                  "syscall"
>                  : "=a" (rv)
>                  : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
>                  : "rcx", "r11", "cc", "memory"
>             );
>             return rv;
>        }
> 
>        int main(int argc, char *argv[])
>        {
>             pid_t pid;
>             int fd[4];
>             unsigned int start;
>             int n;
> 
>             pid = 0;
>             if (argc > 1)
>                  pid = atoi(argv[1]);
> 
>             start = 0;
>             while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
>                  unsigned int i;
> 
>                  for (i = 0; i < n; i++)
>                       printf("%u ", fd[i]);
>                  printf("\n");
> 
>                  start = fd[n - 1] + 1;
>             }
>             return 0;
>        }
> 
> Linux                             2017-09-21                          FDMAP(2)
> 
> Changelog:
> 
> 	CONFIG_PIDMAP option
> 	manpage
> 
> 
> Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> Signed-off-by: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> 
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
>  fs/Makefile                                |   2 +
>  fs/fdmap.c                                 | 105 ++++++++++++++++++++
>  include/linux/syscalls.h                   |   2 +
>  init/Kconfig                               |   7 ++
>  kernel/sys_ni.c                            |   2 +
>  tools/testing/selftests/Makefile           |   1 +
>  tools/testing/selftests/fdmap/.gitignore   |   1 +
>  tools/testing/selftests/fdmap/Makefile     |   7 ++
>  tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
>  tools/testing/selftests/fdmap/fdmap.h      |  12 +++
>  tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
>  12 files changed, 405 insertions(+)
>  create mode 100644 fs/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/.gitignore
>  create mode 100644 tools/testing/selftests/fdmap/Makefile
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.h
>  create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183e2f85..9bfe5f79674f 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	common	fdmap			sys_fdmap
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/Makefile b/fs/Makefile
> index 7bbaca9c67b1..27476a66c18e 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
>  		pnode.o splice.o sync.o utimes.o \
>  		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
>  
> +obj-$(CONFIG_FDMAP) += fdmap.o
> +
>  ifeq ($(CONFIG_BLOCK),y)
>  obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
>  else
> diff --git a/fs/fdmap.c b/fs/fdmap.c
> new file mode 100644
> index 000000000000..274e6c5b7c9c
> --- /dev/null
> +++ b/fs/fdmap.c
> @@ -0,0 +1,105 @@
> +#include <linux/bitops.h>
> +#include <linux/fdtable.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +
> +/**
> + * fdmap - get opened file descriptors of a process
> + * @pid: the pid of the target process
> + * @fds: allocated userspace buffer
> + * @count: buffer size (in descriptors)
> + * @start_fd: first descriptor to search from (inclusive)
> + * @flags: reserved for future functionality, must be zero
> + *
> + * If @pid is zero then it's current process.
> + * Return: number of descriptors written. An error code otherwise.
> + */
> +SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
> +		int, start_fd, int, flags)
> +{
> +	struct task_struct *task;
> +	struct files_struct *files;
> +	unsigned long search_mask;
> +	unsigned int user_index, offset;
> +	int masksize;
> +
> +	if (start_fd < 0 || flags != 0)
> +		return -EINVAL;
> +
> +	if (!pid) {
> +		files = get_files_struct(current);
> +	} else {
> +		rcu_read_lock();
> +		task = find_task_by_vpid(pid);
> +		if (!task) {
> +			rcu_read_unlock();
> +			return -ESRCH;
> +		}
> +		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> +			rcu_read_unlock();
> +			return -EACCES;
> +		}
> +		files = get_files_struct(task);
> +		rcu_read_unlock();
> +	}
> +	if (!files)
> +		return 0;
> +
> +	offset = start_fd / BITS_PER_LONG;
> +	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
> +	user_index = 0;
> +#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
> +	masksize = FDS_BUF_SIZE;
> +	while (user_index < count && masksize == FDS_BUF_SIZE) {
> +		unsigned long open_fds[FDS_BUF_SIZE];
> +		struct fdtable *fdt;
> +		unsigned int i;
> +
> +		/*
> +		 * fdt->max_fds can grow, get it every time
> +		 * before copying part into internal buffer.
> +		 */
> +		rcu_read_lock();
> +		fdt = files_fdtable(files);
> +		masksize = fdt->max_fds / 8 - offset * sizeof(long);
> +		if (masksize < 0) {
> +			rcu_read_unlock();
> +			break;
> +		}
> +		masksize = min(masksize, (int)sizeof(open_fds));
> +		memcpy(open_fds, fdt->open_fds + offset, masksize);
> +		rcu_read_unlock();
> +
> +		open_fds[0] &= search_mask;
> +		search_mask = ULONG_MAX;
> +		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
> +		start_fd = offset * BITS_PER_LONG;
> +		/*
> +		 * for_each_set_bit_from() can re-read first word
> +		 * multiple times which is not optimal.
> +		 */
> +		for (i = 0; i < masksize; i++) {
> +			unsigned long mask = open_fds[i];
> +
> +			while (mask) {
> +				unsigned int real_fd = start_fd + __ffs(mask);
> +
> +				if (put_user(real_fd, fds + user_index)) {
> +					put_files_struct(files);
> +					return -EFAULT;
> +				}
> +				if (++user_index >= count)
> +					goto out;
> +				mask &= mask - 1;
> +			}
> +			start_fd += BITS_PER_LONG;
> +		}
> +		offset += FDS_BUF_SIZE;
> +	}
> +out:
> +	put_files_struct(files);
> +
> +	return user_index;
> +}
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 95606a2d556f..d393d844facb 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
> +			  int start_fd, int flags);
>  
>  #endif
> diff --git a/init/Kconfig b/init/Kconfig
> index 78cb2461012e..952d13b7326d 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1400,6 +1400,13 @@ config MEMBARRIER
>  
>  	  If unsure, say Y.
>  
> +config FDMAP
> +	bool "fdmap() system call" if EXPERT
> +	default y
> +	help
> +	  Enable fdmap() system call that allows to query file descriptors
> +	  in binary form avoiding /proc overhead.
> +
>  config EMBEDDED
>  	bool "Embedded system"
>  	option allnoconfig_y
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef8576ce9..d61fa27d021e 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
>  cond_syscall(sys_pkey_mprotect);
>  cond_syscall(sys_pkey_alloc);
>  cond_syscall(sys_pkey_free);
> +
> +cond_syscall(sys_fdmap);
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index 26ce4f7168be..e8d63c27c865 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -5,6 +5,7 @@ TARGETS += cpufreq
>  TARGETS += cpu-hotplug
>  TARGETS += efivarfs
>  TARGETS += exec
> +TARGETS += fdmap
>  TARGETS += firmware
>  TARGETS += ftrace
>  TARGETS += futex
> diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
> new file mode 100644
> index 000000000000..9a9bfdac1cc0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/.gitignore
> @@ -0,0 +1 @@
> +fdmap_test
> diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
> new file mode 100644
> index 000000000000..bf9f051f4b63
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/Makefile
> @@ -0,0 +1,7 @@
> +TEST_GEN_PROGS := fdmap_test
> +CFLAGS += -Wall
> +
> +include ../lib.mk
> +
> +$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
> +	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
> diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
> new file mode 100644
> index 000000000000..66725b0201e0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.c
> @@ -0,0 +1,112 @@
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <dirent.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/types.h>
> +#include "fdmap.h"
> +
> +#define	BUF_SIZE	1024
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
> +{
> +	register int64_t r10 asm("r10") = start_fd;
> +	register int64_t r8 asm("r8") = flags;
> +	long ret;
> +
> +	asm volatile (
> +		"syscall"
> +		: "=a"(ret)
> +		: "0" (333),
> +		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
> +		: "rcx", "r11", "cc", "memory"
> +	);
> +	return ret;
> +}
> +
> +int fdmap_full(pid_t pid, int **fds, size_t *n)
> +{
> +	int buf[BUF_SIZE], start_fd = 0;
> +	long ret;
> +
> +	*n = 0;
> +	*fds = NULL;
> +	for (;;) {
> +		int *new_buff;
> +
> +		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
> +		if (ret < 0)
> +			break;
> +		if (!ret)
> +			return 0;
> +
> +		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
> +		if (!new_buff) {
> +			ret = -errno;
> +			break;
> +		}
> +		*fds = new_buff;
> +		memcpy(*fds + *n, buf, ret * sizeof(int));
> +		*n += ret;
> +		start_fd = (*fds)[*n - 1] + 1;
> +	}
> +	free(*fds);
> +	*fds = NULL;
> +	return -ret;
> +}
> +
> +int fdmap_proc(pid_t pid, int **fds, size_t *n)
> +{
> +	char fds_path[20];
> +	int dir_fd = 0;
> +	struct dirent *fd_link;
> +	DIR *fds_dir;
> +
> +	*fds = NULL;
> +	*n = 0;
> +	if (!pid)
> +		strcpy(fds_path, "/proc/self/fd");
> +	else
> +		sprintf(fds_path, "/proc/%d/fd", pid);
> +
> +	fds_dir = opendir(fds_path);
> +	if (!fds_dir)
> +		return errno == ENOENT ? ESRCH : errno;
> +	if (!pid)
> +		dir_fd = dirfd(fds_dir);
> +
> +	while ((fd_link = readdir(fds_dir))) {
> +		if (fd_link->d_name[0] < '0'
> +		    || fd_link->d_name[0] > '9')
> +			continue;
> +		if (*n % BUF_SIZE == 0) {
> +			int *new_buff;
> +
> +			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
> +			if (!new_buff) {
> +				int ret = errno;
> +
> +				free(*fds);
> +				*fds = NULL;
> +				return ret;
> +			}
> +			*fds = new_buff;
> +		}
> +		(*fds)[*n] = atoi(fd_link->d_name);
> +		*n += 1;
> +	}
> +	closedir(fds_dir);
> +
> +	if (!pid) {
> +		size_t i;
> +
> +		for (i = 0; i < *n; i++)
> +			if ((*fds)[i] == dir_fd)
> +				break;
> +		i++;
> +		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
> +		(*n)--;
> +	}
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
> new file mode 100644
> index 000000000000..c501111b2bbd
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.h
> @@ -0,0 +1,12 @@
> +#ifndef FDMAP_H
> +#define FDMAP_H
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
> +int fdmap_full(pid_t pid, int **fds, size_t *n);
> +int fdmap_proc(pid_t pid, int **fds, size_t *n);
> +
> +#endif
> diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
> new file mode 100644
> index 000000000000..6f448406d96a
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap_test.c
> @@ -0,0 +1,153 @@
> +#include <errno.h>
> +#include <syscall.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
> +#include <limits.h>
> +#include "../kselftest_harness.h"
> +#include "fdmap.h"
> +
> +TEST(efault) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EFAULT, errno);
> +}
> +
> +TEST(big_start_fd) {
> +	int fds[1];
> +	int ret;
> +
> +	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
> +	ASSERT_EQ(0, ret);
> +}
> +
> +TEST(einval) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 0, -1, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +
> +	ret = syscall(333, 0, NULL, 0, 0, 1);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +}
> +
> +TEST(esrch) {
> +	int fds[1], ret;
> +	pid_t pid;
> +
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid)
> +		exit(0);
> +	waitpid(pid, NULL, 0);
> +
> +	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(ESRCH, errno);
> +}
> +
> +TEST(simple) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(init) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(1, &fds1, &size1);
> +	ret2 = fdmap_proc(1, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(zero) {
> +	int *fds, i;
> +	size_t size;
> +	int ret;
> +
> +	ret = fdmap_proc(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	for (i = 0; i < size; i++)
> +		close(fds[i]);
> +	free(fds);
> +	fds = NULL;
> +
> +	ret = fdmap_full(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	ASSERT_EQ(0, size);
> +}
> +
> +TEST(more_fds) {
> +	int *fds1, *fds2, ret1, ret2;
> +	size_t size1, size2, i;
> +
> +	struct rlimit rlim = {
> +		.rlim_cur = 600000,
> +		.rlim_max = 600000
> +	};
> +	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
> +	for (int i = 0; i < 500000; i++)
> +		dup(0);
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(child) {
> +	int pipefd[2];
> +	int *fds1, *fds2, ret1, ret2, i;
> +	size_t size1, size2;
> +	char byte = 0;
> +	pid_t pid;
> +
> +	ASSERT_NE(-1, pipe(pipefd));
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid) {
> +		read(pipefd[0], &byte, 1);
> +		close(pipefd[0]);
> +		close(pipefd[1]);
> +		exit(0);
> +	}
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +
> +	write(pipefd[1], &byte, 1);
> +	close(pipefd[0]);
> +	close(pipefd[1]);
> +	waitpid(pid, NULL, 0);
> +}
> +
> +TEST_HARNESS_MAIN
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
  2017-09-24 20:08   ` Alexey Dobriyan
  (?)
  (?)
@ 2017-09-25  7:43   ` Michael Kerrisk (man-pages)
  -1 siblings, 0 replies; 51+ messages in thread
From: Michael Kerrisk (man-pages) @ 2017-09-25  7:43 UTC (permalink / raw)
  To: Alexey Dobriyan, akpm
  Cc: mtk.manpages, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey, Tatsiana_Brouka, Aliaksandr_Patseyenak1

Hello Alexey,

On 09/24/2017 10:08 PM, Alexey Dobriyan wrote:
> From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
> 
> Implement system call for bulk retrieveing of pids in binary form.
> 
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi() + instantiating dentries and inodes.
> 
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
> 
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.

Similar to my comments on fdmap(). The last paragraph here is 
rather hypothetical. Who specifically needs this right now, 
and why? This sort of detail should be part of the commit
message, IMHO.

Thanks,

Michael


> Benchmark:
> 
> 	N=1<<16 times
> 	~130 processes (~250 task_structs) on a regular desktop system
> 	opendir + readdir + closedir /proc + the same for every /proc/$PID/task
> 	(roughly what htop(1) does) vs pidmap
> 
> 	/proc 16.80 ± 0.73%
> 	pidmap 0.06 ± 0.31%
> 
> PIDMAP_* flags are modelled after /proc/task_diag patchset.
> 
> 
> PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)
> 
> NAME
>        pidmap - get allocated PIDs
> 
> SYNOPSIS
>        long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);
> 
> DESCRIPTION
>        The system call pidmap(2) writes process IDs in buffer pointed by pids.
>        At most count pids are written. The pid argument specifies  process  ID
>        in  several  values in flags.  If pid equals to zero, syscall will work
>        with current process.  The argument start depends on  the  flags.   The
>        argument  flags  must include one of the following modes: PIDMAP_TASKS,
>        PIDMAP_PROC, PIDMAP_CHILDREN, or PIDMAP_THREADS.  For PIDMAP_TASKS  and
>        PIDMAP_PROC exists optional PIDMAP_IGNORE_KTHREADS flag.
> 
>        PIDs  are  filled  from  pid  namespace  of  the  calling  process POV:
>        unshare(CLONE_NEWPID) + fork + pidmap in child will always return 1/1.
> 
>        pidmap(2) hides  PIDs  inaccessible  at  /proc  mounted  with  hide_pid
>        option.
> 
>        Note,  pidmap(2)  does not guarantee that any of returned PID exists by
>        the time system call exists.
> 
>        Full list of flags and options is below:
> 
>        PIDMAP_TASKS
>               Get PIDs of all tasks, including  threads  starting  from  start
>               inclusive. First argument pid will be ignored.
> 
>        PIDMAP_PROC
>               Get  all  process IDs starting from start inclusive. First argu‐
>               ment pid will be ignored.
> 
>        PIDMAP_CHILDREN
>               Get children IDs of  the  process  specified  by  pid  argument.
>               start  argument  specifies  number  of  children to skip in this
>               case.
> 
>        PIDMAP_THREADS
>               Get threads IDs of the process specified by pid argument.  start
>               argument specifies number of threads to skip in this case.
> 
>        PIDMAP_IGNORE_KTHREADS
>               Ignore  kernel  threads.  Optional  and  will  be  ignored  with
>               PIDMAP_CHILDREN and PIDMAP_THREADS flags.
> 
> RETURN VALUE
>        On success, number of PIDs read is returned. Otherwise, error  code  is
>        returned.
> 
> ERRORS
>        ESRCH  No such process.
> 
>        EACCES Permission denied.
> 
>        EFAULT Invalid pids pointer.
> 
>        EINVAL Invalid flags value.
> 
> NOTES
>        Glibc  does  not  provide a wrapper for this system call; call it using
>        syscall(2).
> 
> EXAMPLE
>        #include <stdio.h>
>        #include <linux/pidmap.h>
> 
>        static inline long pidmap(int pid, int *pids, unsigned int n, unsigned int start, int flags)
>        {
>                register long r10 asm("r10") = start;
>                register long r8 asm("r8") = flags;
>                long ret;
>                asm volatile (
>                        "syscall"
>                        : "=a" (ret)
>                        : "0" (334), "D" (pid), "S" (pids), "d" (n), "r" (r10), "r" (r8)
>                        : "rcx", "r11", "cc", "memory"
>                );
>                return ret;
>        }
> 
>        int main(void)
>        {
>                int pids[5];
>                unsigned int start;
>                unsigned int i;
>                int n;
> 
>                start = 0;
>                while ((n = pidmap(0, pids, sizeof(pids)/sizeof(pids[0]),
>                             start, PIDMAP_PROC | PIDMAP_IGNORE_KTHREADS)) > 0) {
> 
>                        for (i = 0; i < n; i++)
>                                printf("%d ", pids[i]);
>                        printf("\n");
> 
>                        start = pids[n - 1] + 1;
>                }
>                return 0;
>        }
> 
> Linux                             2017-09-21                         PIDMAP(2)
> 
> Changelog:
> 
> 	CONFIG_PIDMAP option
> 	PIDMAP_* options
> 	PIDMAP_IGNORE_KTHREADS
> 	manpage
> 
> Signed-off-by: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
> Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl    |   1 +
>  include/linux/syscalls.h                  |   5 +
>  include/uapi/linux/pidmap.h               |  10 +
>  init/Kconfig                              |   7 +
>  kernel/Makefile                           |   2 +
>  kernel/pidmap.c                           | 287 ++++++++++++++++++++++++++++
>  kernel/sys_ni.c                           |   1 +
>  tools/testing/selftests/Makefile          |   1 +
>  tools/testing/selftests/pidmap/.gitignore |   1 +
>  tools/testing/selftests/pidmap/Makefile   |   5 +
>  tools/testing/selftests/pidmap/pidmap.c   | 298 ++++++++++++++++++++++++++++++
>  tools/testing/selftests/pidmap/pidmap.h   |   1 +
>  12 files changed, 619 insertions(+)
>  create mode 100644 include/uapi/linux/pidmap.h
>  create mode 100644 kernel/pidmap.c
>  create mode 100644 tools/testing/selftests/pidmap/.gitignore
>  create mode 100644 tools/testing/selftests/pidmap/Makefile
>  create mode 100644 tools/testing/selftests/pidmap/pidmap.c
>  create mode 120000 tools/testing/selftests/pidmap/pidmap.h
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 9bfe5f79674f..8ce611f14969 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -340,6 +340,7 @@
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
>  333	common	fdmap			sys_fdmap
> +334	common	pidmap			sys_pidmap
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index d393d844facb..cc1ef71dbb4a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -939,4 +939,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
>  			  int start_fd, int flags);
>  
> +asmlinkage long sys_pidmap(pid_t pid,
> +			   int __user *pids,
> +			   unsigned int pids_count,
> +			   unsigned int start_pid,
> +			   int flags);
>  #endif
> diff --git a/include/uapi/linux/pidmap.h b/include/uapi/linux/pidmap.h
> new file mode 100644
> index 000000000000..75a7557c22eb
> --- /dev/null
> +++ b/include/uapi/linux/pidmap.h
> @@ -0,0 +1,10 @@
> +#ifndef _UAPI_LINUX_PIDMAP_H
> +#define _UAPI_LINUX_PIDMAP_H
> +
> +#define PIDMAP_TASKS	1
> +#define PIDMAP_PROC	2
> +#define PIDMAP_CHILDREN	3
> +#define PIDMAP_THREADS	4
> +#define PIDMAP_IGNORE_KTHREADS	(1 << 30)
> +
> +#endif /* _UAPI_LINUX_PIDMAP_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index 952d13b7326d..163155e0cfb4 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1407,6 +1407,13 @@ config FDMAP
>  	  Enable fdmap() system call that allows to query file descriptors
>  	  in binary form avoiding /proc overhead.
>  
> +config PIDMAP
> +	bool "pidmap() system call" if EXPERT
> +	default y
> +	help
> +	  Enable pidmap() system call that allows to query PIDs in binary form
> +	  avoiding /proc overhead.
> +
>  config EMBEDDED
>  	bool "Embedded system"
>  	option allnoconfig_y
> diff --git a/kernel/Makefile b/kernel/Makefile
> index ed470aac53da..f8833e5b27e5 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -11,6 +11,8 @@ obj-y     = fork.o exec_domain.o panic.o \
>  	    notifier.o ksysfs.o cred.o reboot.o \
>  	    async.o range.o smpboot.o ucount.o
>  
> +obj-$(CONFIG_PIDMAP) += pidmap.o
> +
>  obj-$(CONFIG_MODULES) += kmod.o
>  obj-$(CONFIG_MULTIUSER) += groups.o
>  
> diff --git a/kernel/pidmap.c b/kernel/pidmap.c
> new file mode 100644
> index 000000000000..0392bc6935b6
> --- /dev/null
> +++ b/kernel/pidmap.c
> @@ -0,0 +1,287 @@
> +#include <linux/bitops.h>
> +#include <linux/cred.h>
> +#include <linux/kernel.h>
> +#include <linux/pid.h>
> +#include <linux/ptrace.h>
> +#include <linux/rcupdate.h>
> +#include <linux/syscalls.h>
> +#include <linux/sched.h>
> +#include <linux/uaccess.h>
> +#include <linux/pidmap.h>
> +
> +#define PIDMAP_PARAM	(~PIDMAP_IGNORE_KTHREADS)
> +
> +static inline bool pidmap_perm(const struct pid_namespace *pid_ns)
> +{
> +	return pid_ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(pid_ns->pid_gid);
> +}
> +
> +static bool skip_task(struct task_struct *task, bool has_perms, int flags)
> +{
> +	int param = flags & PIDMAP_PARAM;
> +
> +	if (!task)
> +		return true;
> +	if (!has_perms && !ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
> +		return true;
> +	if ((flags & PIDMAP_IGNORE_KTHREADS) && (task->flags & PF_KTHREAD))
> +		return true;
> +	if (param == PIDMAP_PROC && !thread_group_leader(task))
> +		return true;
> +	return false;
> +}
> +
> +static long pidmap_tasks(int __user *pids, unsigned int count,
> +		  unsigned int start, int flags)
> +{
> +	struct pid_namespace *pid_ns = task_active_pid_ns(current);
> +	unsigned int start_page, start_elem;
> +	unsigned int last_pos = 0;
> +	unsigned int last_set_pid = 0;
> +	unsigned long mask;
> +	bool has_perms;
> +	unsigned int i;
> +
> +	/*
> +	 * Pid 0 does not exist, however, corresponding bit is always set in
> +	 * ->pidmap[0].page, so we should skip it.
> +	 */
> +	if (start == 0)
> +		start = 1;
> +
> +	if (start > pid_ns->last_pid)
> +		return 0;
> +
> +	has_perms = pidmap_perm(pid_ns);
> +
> +	start_page = start / BITS_PER_PAGE;
> +	start_elem = (start % BITS_PER_PAGE) / BITS_PER_LONG;
> +	mask = ~0UL << (start % BITS_PER_LONG);
> +
> +	for (i = start_page; i < PIDMAP_ENTRIES; i++) {
> +		unsigned int j;
> +
> +		/*
> +		 * ->pidmap[].page is set once to a valid pointer,
> +		 *  therefore do not take any locks.
> +		 */
> +		if (!pid_ns->pidmap[i].page)
> +			continue;
> +
> +		for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) {
> +			unsigned long val;
> +
> +			val = *((unsigned long *)pid_ns->pidmap[i].page + j);
> +			val &= mask;
> +			mask = ~0UL;
> +			while (val != 0) {
> +				struct task_struct *task;
> +
> +				if (last_pos == count)
> +					return last_pos;
> +
> +				last_set_pid = i * BITS_PER_PAGE +
> +					j * BITS_PER_LONG + __ffs(val);
> +
> +				rcu_read_lock();
> +				task = find_task_by_pid_ns(last_set_pid, pid_ns);
> +				if (skip_task(task, has_perms, flags)) {
> +					rcu_read_unlock();
> +					goto next;
> +				}
> +				rcu_read_unlock();
> +
> +				if (put_user(last_set_pid, pids + last_pos))
> +					return -EFAULT;
> +				last_pos++;
> +				if (last_set_pid == pid_ns->last_pid)
> +					return last_pos;
> +next:
> +				val &= (val - 1);
> +			}
> +		}
> +		start_elem = 0;
> +	}
> +	if (last_set_pid == 0)
> +		return 0;
> +	else
> +		return last_pos;
> +}
> +
> +static struct task_struct *pidmap_get_task(pid_t pid, bool *has_perms)
> +{
> +	struct pid_namespace *pid_ns;
> +	struct task_struct *task;
> +
> +	if (pid == 0) {
> +		*has_perms = true;
> +		return current;
> +	}
> +
> +	pid_ns = task_active_pid_ns(current);
> +	task = find_task_by_pid_ns(pid, pid_ns);
> +	if (!task)
> +		return ERR_PTR(-ESRCH);
> +	*has_perms = pidmap_perm(pid_ns);
> +	if (!*has_perms && !ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
> +		return ERR_PTR(-EACCES);
> +	return task;
> +}
> +
> +static long pidmap_children(pid_t pid, int __user *upid,
> +			    unsigned int count, unsigned int start)
> +{
> +	struct task_struct *task, *child;
> +	bool has_perms;
> +	int pids[64];
> +	unsigned int i;
> +	unsigned int ret;
> +
> +	rcu_read_lock();
> +	task = pidmap_get_task(pid, &has_perms);
> +	if (IS_ERR(task)) {
> +		rcu_read_unlock();
> +		return PTR_ERR(task);
> +	}
> +
> +	i = 0;
> +	ret = 0;
> +	list_for_each_entry(child, &task->children, sibling) {
> +		if (start) {
> +			start--;
> +			continue;
> +		}
> +
> +		if (!has_perms &&
> +		    !ptrace_may_access(child, PTRACE_MODE_READ_FSCREDS))
> +			continue;
> +
> +		pids[i++] = child->tgid;
> +		if (i >= ARRAY_SIZE(pids)) {
> +			get_task_struct(task);
> +			get_task_struct(child);
> +			rcu_read_unlock();
> +
> +			if (copy_to_user(upid, pids, i * sizeof(int))) {
> +				put_task_struct(child);
> +				put_task_struct(task);
> +				return -EFAULT;
> +			}
> +			upid += i;
> +			ret += i;
> +			i = 0;
> +
> +			rcu_read_lock();
> +			put_task_struct(child);
> +			put_task_struct(task);
> +
> +			if (!pid_alive(task) || !pid_alive(child))
> +				break;
> +		}
> +		if (--count == 0)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	if (i > 0) {
> +		if (copy_to_user(upid, pids, i * sizeof(int)))
> +			return -EFAULT;
> +		ret += i;
> +	}
> +	return ret;
> +}
> +
> +static long pidmap_threads(pid_t pid, int __user *upid,
> +			   unsigned int count, unsigned int start)
> +{
> +	struct task_struct *task, *thread;
> +	bool has_perms;
> +	int pids[64];
> +	unsigned int i;
> +	unsigned int ret;
> +
> +	rcu_read_lock();
> +	task = pidmap_get_task(pid, &has_perms);
> +	if (IS_ERR(task)) {
> +		rcu_read_unlock();
> +		return PTR_ERR(task);
> +	}
> +
> +	i = 0;
> +	ret = 0;
> +	for_each_thread(task, thread) {
> +		if (start) {
> +			start--;
> +			continue;
> +		}
> +
> +		pids[i++] = thread->pid;
> +		if (i >= ARRAY_SIZE(pids)) {
> +			get_task_struct(task);
> +			get_task_struct(thread);
> +			rcu_read_unlock();
> +
> +			if (copy_to_user(upid, pids, i * sizeof(int))) {
> +				put_task_struct(thread);
> +				put_task_struct(task);
> +				return -EFAULT;
> +			}
> +			upid += i;
> +			ret += i;
> +			i = 0;
> +
> +			rcu_read_lock();
> +			put_task_struct(thread);
> +			put_task_struct(task);
> +
> +			if (!pid_alive(task) || !pid_alive(thread))
> +				break;
> +		}
> +		if (--count == 0)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	if (i > 0) {
> +		if (copy_to_user(upid, pids, i * sizeof(int)))
> +			return -EFAULT;
> +		ret += i;
> +	}
> +	return ret;
> +}
> +
> +/**
> + * pidmap - get allocated PIDs
> + * @pids: destination buffer.
> + * @count: number of elements in the buffer.
> + * @start: PID to start from or PIDs number already readed.
> + * @flags: flags.
> + *
> + * Write allocated PIDs to a buffer. @start specifies PID to start from
> + * with PIDMAP_TASKS or PIDMAP_PROC flags, or number of PIDs already
> + * readed otherwise.
> + *
> + * PIDs are filled from pid namespace of the calling process POV:
> + * unshare(CLONE_NEWPID)+fork+pidmap in child will always return 1/1.
> + *
> + * pidmap(2) hides PIDs inaccessible at /proc mounted with "hidepid" option.
> + *
> + * Note, pidmap(2) does not guarantee that any of returned PID exists
> + * by the time system call exits.
> + *
> + * Return: number of PIDs written to the buffer or error code otherwise.
> + */
> +SYSCALL_DEFINE5(pidmap, pid_t, pid, int __user *, pids,
> +		unsigned int, count, unsigned int, start, int, flags)
> +{
> +	int param = flags & PIDMAP_PARAM;
> +
> +	switch (param) {
> +	case PIDMAP_TASKS:
> +	case PIDMAP_PROC:
> +		return pidmap_tasks(pids, count, start, flags);
> +	case PIDMAP_CHILDREN:
> +		return pidmap_children(pid, pids, count, start);
> +	case PIDMAP_THREADS:
> +		return pidmap_threads(pid, pids, count, start);
> +	}
> +	return -EINVAL;
> +}
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index d61fa27d021e..a600d458c1d9 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -260,3 +260,4 @@ cond_syscall(sys_pkey_alloc);
>  cond_syscall(sys_pkey_free);
>  
>  cond_syscall(sys_fdmap);
> +cond_syscall(sys_pidmap);
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index e8d63c27c865..4d1443a83121 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -21,6 +21,7 @@ TARGETS += mount
>  TARGETS += mqueue
>  TARGETS += net
>  TARGETS += nsfs
> +TARGETS += pidmap
>  TARGETS += powerpc
>  TARGETS += pstore
>  TARGETS += ptrace
> diff --git a/tools/testing/selftests/pidmap/.gitignore b/tools/testing/selftests/pidmap/.gitignore
> new file mode 100644
> index 000000000000..a762199f2637
> --- /dev/null
> +++ b/tools/testing/selftests/pidmap/.gitignore
> @@ -0,0 +1 @@
> +pidmap
> diff --git a/tools/testing/selftests/pidmap/Makefile b/tools/testing/selftests/pidmap/Makefile
> new file mode 100644
> index 000000000000..3deae4ef7295
> --- /dev/null
> +++ b/tools/testing/selftests/pidmap/Makefile
> @@ -0,0 +1,5 @@
> +CFLAGS = -Wall
> +
> +TEST_GEN_PROGS := pidmap
> +
> +include ../lib.mk
> diff --git a/tools/testing/selftests/pidmap/pidmap.c b/tools/testing/selftests/pidmap/pidmap.c
> new file mode 100644
> index 000000000000..76a9ec57d466
> --- /dev/null
> +++ b/tools/testing/selftests/pidmap/pidmap.c
> @@ -0,0 +1,298 @@
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <sys/wait.h>
> +#include <stdlib.h>
> +#include <sched.h>
> +#include <dirent.h>
> +#include <string.h>
> +#include <sys/mount.h>
> +#include <signal.h>
> +#include <assert.h>
> +#include "pidmap.h"
> +#include "../kselftest_harness.h"
> +
> +#define SIZE 512
> +
> +static inline long pidmap(pid_t pid, int *pids, unsigned int count,
> +			  unsigned int start_pid, int flags)
> +{
> +	long ret;
> +
> +	register long r10 asm("r10") = start_pid;
> +	register long r8 asm("r8") = flags;
> +
> +	asm volatile ("syscall" : "=a"(ret) :
> +		"0"(334), "D"(pid), "S"(pids), "d"(count), "r"(r10), "r"(r8) :
> +		"rcx", "r11", "cc", "memory");
> +	return ret;
> +}
> +
> +static int compare(const void *a, const void *b)
> +{
> +	return *((int *)a) > *((int *)b);
> +}
> +
> +int pidmap_full(int **pid, unsigned int *res_count)
> +{
> +	int n;
> +	int start_pid = 1;
> +	*pid = (int *)malloc(SIZE * sizeof(int));
> +	*res_count = 0;
> +
> +	while ((n = pidmap(0, *pid + *res_count, SIZE, start_pid,
> +			   PIDMAP_TASKS)) > 0) {
> +		*res_count += n;
> +		*pid = (int *)realloc(*pid, (*res_count + SIZE) * sizeof(int));
> +		start_pid = (*pid)[*res_count - 1] + 1;
> +	}
> +	return n;
> +}
> +
> +int pidmap_proc(int **pid, unsigned int *n)
> +{
> +	DIR *dir = opendir("/proc");
> +	struct dirent *dirs;
> +
> +	*n = 0;
> +	*pid = NULL;
> +
> +	while ((dirs = readdir(dir))) {
> +		char dname[32] = "";
> +		DIR *task_dir;
> +
> +		if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9')
> +			continue;
> +
> +		strcpy(dname, "/proc/");
> +		strcat(dname, dirs->d_name);
> +		strcat(dname, "/task");
> +		task_dir = opendir(dname);
> +
> +		if (task_dir) {
> +			struct dirent *task_dirs;
> +
> +			while ((task_dirs = readdir(task_dir))) {
> +				if (task_dirs->d_name[0] < '0' ||
> +						task_dirs->d_name[0] > '9')
> +					continue;
> +
> +				*pid = (int *)realloc(*pid, (*n + 1) *
> +								sizeof(int));
> +				if (*pid == NULL)
> +					return -1;
> +				*(*pid + *n) = atoi(task_dirs->d_name);
> +				*n += 1;
> +			}
> +		} else {
> +			*pid = (int *)realloc(*pid, (*n + 1) * sizeof(int));
> +			if (*pid == NULL)
> +				return -1;
> +			*(*pid + *n) = atoi(dirs->d_name);
> +			*n += 1;
> +		}
> +		closedir(task_dir);
> +	}
> +	closedir(dir);
> +	return 0;
> +}
> +
> +TEST(bufsize)
> +{
> +	int pid[SIZE];
> +
> +	EXPECT_EQ(0, pidmap(0, pid, 0, 1, PIDMAP_TASKS));
> +}
> +
> +TEST(get_pid)
> +{
> +	int pid;
> +	int ret;
> +
> +	ret = pidmap(0, &pid, 1, getpid(), PIDMAP_TASKS);
> +	ASSERT_LE(0, ret);
> +	EXPECT_EQ(getpid(), pid);
> +}
> +
> +TEST(bad_start)
> +{
> +	int pid[SIZE];
> +
> +	ASSERT_LE(0, pidmap(0, pid, SIZE, -1, PIDMAP_TASKS));
> +	ASSERT_LE(0, pidmap(0, pid, SIZE, ~0U, PIDMAP_TASKS));
> +	ASSERT_LE(0, pidmap(0, pid, SIZE, 0, PIDMAP_TASKS));
> +	EXPECT_EQ(1, pid[0]);
> +}
> +
> +TEST(child_pid)
> +{
> +	pid_t pid = fork();
> +
> +	if (pid == 0)
> +		pause();
> +	else {
> +		int ret;
> +		int result = 0;
> +
> +		ret = pidmap(0, &result, 1, pid, PIDMAP_TASKS);
> +		EXPECT_LE(0, ret);
> +		EXPECT_EQ(pid, result);
> +		kill(pid, SIGTERM);
> +	}
> +}
> +
> +TEST(pidmap_children_flag)
> +{
> +	int real_pids[SIZE], pids[SIZE];
> +	int i;
> +
> +	for (i = 0; i < SIZE; i++) {
> +		pid_t pid = fork();
> +		if (!pid) {
> +			pause();
> +			exit(0);
> +		} else if (pid < 0) {
> +			perror("fork");
> +			exit(1);
> +		}
> +		real_pids[i] = pid;
> +	}
> +
> +	ASSERT_EQ(SIZE, pidmap(0, pids, SIZE, 0, PIDMAP_CHILDREN));
> +	for (i = 0; i < SIZE; i++) {
> +		ASSERT_EQ(real_pids[i], pids[i]);
> +		kill(real_pids[i], SIGKILL);
> +	}
> +}
> +
> +int write_pidmax(int new_pidmax)
> +{
> +	char old_pidmax[32];
> +	char new[32];
> +	int fd = open("/proc/sys/kernel/pid_max", O_RDWR);
> +
> +	if (read(fd, old_pidmax, 32) <= 0)
> +		printf("Read failed\n");
> +	lseek(fd, 0, 0);
> +	snprintf(new, sizeof(new), "%d", new_pidmax);
> +	if (write(fd, new, strlen(new)) <= 0)
> +		printf("Write failed\n");
> +	close(fd);
> +	return atoi(old_pidmax);
> +}
> +
> +void do_forks(unsigned int n)
> +{
> +	while (n--) {
> +		pid_t pid = fork();
> +
> +		if (pid == 0)
> +			exit(0);
> +		waitpid(pid, NULL, 0);
> +	}
> +}
> +
> +TEST(pid_max)
> +{
> +	int *pid;
> +	unsigned int n;
> +	int ret, p;
> +	int old_pidmax;
> +
> +	old_pidmax = write_pidmax(50000);
> +
> +	do_forks(40000);
> +
> +	p = fork();
> +
> +	if (p == 0)
> +		pause();
> +
> +	ret = pidmap_full(&pid, &n);
> +	kill(p, SIGKILL);
> +
> +	EXPECT_LE(0, ret);
> +	EXPECT_LE(1, n);
> +	if (ret < 0 || n <= 0)
> +		goto exit;
> +	EXPECT_EQ(p, pid[n - 1]);
> +exit:
> +	write_pidmax(old_pidmax);
> +}
> +
> +void sigquit_h(int sig)
> +{
> +	assert(sig == SIGQUIT);
> +	if (getgid() != getpid())
> +		exit(0);
> +}
> +
> +TEST(compare_proc)
> +{
> +	pid_t pid;
> +
> +	if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1)
> +		return;
> +
> +	pid = fork();
> +
> +	if (pid == 0) {
> +		pid_t p;
> +		int i = 0;
> +
> +		signal(SIGQUIT, sigquit_h);
> +
> +		mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
> +		mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL);
> +		mount("proc", "/proc", "proc",
> +			MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
> +
> +		while (i < 150) {
> +			i++;
> +
> +			p = fork();
> +
> +			if (p == -1) {
> +				umount("/proc");
> +				return;
> +			}
> +			if (p == 0) {
> +				pause();
> +				return;
> +			}
> +		}
> +
> +		int *pids, *pids_proc;
> +		unsigned int n = 0;
> +		unsigned int n_proc = 0;
> +		int ret, ret_proc;
> +
> +		ret = pidmap_full(&pids, &n);
> +
> +		ret_proc = pidmap_proc(&pids_proc, &n_proc);
> +		qsort(pids_proc, n_proc, sizeof(int), compare);
> +
> +		EXPECT_LE(0, ret);
> +		if (ret < 0 || ret_proc < 0)
> +			goto exit;
> +
> +		EXPECT_EQ(n_proc, n);
> +		if (n != n_proc)
> +			goto exit;
> +
> +		for (int i = 0; i < n; i++) {
> +			EXPECT_EQ(pids_proc[i], pids[i]);
> +			if (pids_proc[i] != pids[i])
> +				break;
> +		}
> +exit:
> +		free(pids_proc);
> +		free(pids);
> +		umount("/proc");
> +		kill(-getpid(), SIGQUIT);
> +	}
> +	wait(NULL);
> +}
> +
> +TEST_HARNESS_MAIN
> diff --git a/tools/testing/selftests/pidmap/pidmap.h b/tools/testing/selftests/pidmap/pidmap.h
> new file mode 120000
> index 000000000000..3abbde34fee9
> --- /dev/null
> +++ b/tools/testing/selftests/pidmap/pidmap.h
> @@ -0,0 +1 @@
> +../../../../include/uapi/linux/pidmap.h
> \ No newline at end of file
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
  2017-09-24 20:08   ` Alexey Dobriyan
@ 2017-09-25 10:47     ` Djalal Harouni
  -1 siblings, 0 replies; 51+ messages in thread
From: Djalal Harouni @ 2017-09-25 10:47 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap, tglx,
	Alexey Gladkov, Tatsiana Brouka, Aliaksandr Patseyenak,
	Andy Lutomirski

Hi Alexey,

On Sun, Sep 24, 2017 at 9:08 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
>
> Implement system call for bulk retrieveing of pids in binary form.
>
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi() + instantiating dentries and inodes.
>
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.

Actually I am not sure if software will work if /proc is not mounted,
last time (years) I
checked glibc was doing extra checks during initialization using
/proc/self/* memory
inodes and it may fail. Also fexecve() glibc is implemented using
/proc/self/... so it
depends on which library and the use case for cloud containers...

Also for the natural extension of hidepid=2 where we only want pids inside /proc
without kernel data, we have already a clean patch on top of the procfs
modernization [1] , this is the result of the previous months.


>
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.

Yes the numbers are nice, seems that you want to move from filesystem syscalls
on procfs, to only use direct syscalls, hmm this does not help to fix
procfs. Tools
like ps, top and others can be updated, but anyone can *continue* to use
open+read on procfs and access the data.

I think this will be a bit hard to fix from our side, since with your
patches you are
doing it from current context, where from procfs it will be from:
current+procfs mount context.

What if procfs is mounted with "ptracepids=true" the new "hidepid=" but whithout
"gid=" interaction, and then you read from /proc/<pid>/pidmap/* as suggested
by Andy ? /proc/<pid>/pidmap/{tasks|proc|children} I am not sure about the
PIDMAP_IGNORE_KTHREADS case...


> Benchmark:
>
>         N=1<<16 times
>         ~130 processes (~250 task_structs) on a regular desktop system
>         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
>         (roughly what htop(1) does) vs pidmap
>
>         /proc 16.80 ± 0.73%
>         pidmap 0.06 ± 0.31%

Thanks!


[1]  https://github.com/legionus/linux/commit/993a2a5b9af95b0ac901ff41d32124b72ed676e3

P.S. for the procfs modernization we are planning patches next days.

-- 
tixxdz

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
@ 2017-09-25 10:47     ` Djalal Harouni
  0 siblings, 0 replies; 51+ messages in thread
From: Djalal Harouni @ 2017-09-25 10:47 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	tglx-hfZtesqFncYOwBW4kG4KsQ, Alexey Gladkov, Tatsiana Brouka,
	Aliaksandr Patseyenak, Andy Lutomirski

Hi Alexey,

On Sun, Sep 24, 2017 at 9:08 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> From: Tatsiana Brouka <Tatsiana_Brouka-uRwfk40T5oI@public.gmane.org>
>
> Implement system call for bulk retrieveing of pids in binary form.
>
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi() + instantiating dentries and inodes.
>
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.

Actually I am not sure if software will work if /proc is not mounted,
last time (years) I
checked glibc was doing extra checks during initialization using
/proc/self/* memory
inodes and it may fail. Also fexecve() glibc is implemented using
/proc/self/... so it
depends on which library and the use case for cloud containers...

Also for the natural extension of hidepid=2 where we only want pids inside /proc
without kernel data, we have already a clean patch on top of the procfs
modernization [1] , this is the result of the previous months.


>
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.

Yes the numbers are nice, seems that you want to move from filesystem syscalls
on procfs, to only use direct syscalls, hmm this does not help to fix
procfs. Tools
like ps, top and others can be updated, but anyone can *continue* to use
open+read on procfs and access the data.

I think this will be a bit hard to fix from our side, since with your
patches you are
doing it from current context, where from procfs it will be from:
current+procfs mount context.

What if procfs is mounted with "ptracepids=true" the new "hidepid=" but whithout
"gid=" interaction, and then you read from /proc/<pid>/pidmap/* as suggested
by Andy ? /proc/<pid>/pidmap/{tasks|proc|children} I am not sure about the
PIDMAP_IGNORE_KTHREADS case...


> Benchmark:
>
>         N=1<<16 times
>         ~130 processes (~250 task_structs) on a regular desktop system
>         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
>         (roughly what htop(1) does) vs pidmap
>
>         /proc 16.80 ± 0.73%
>         pidmap 0.06 ± 0.31%

Thanks!


[1]  https://github.com/legionus/linux/commit/993a2a5b9af95b0ac901ff41d32124b72ed676e3

P.S. for the procfs modernization we are planning patches next days.

-- 
tixxdz

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-09-24 20:06 ` Alexey Dobriyan
                   ` (3 preceding siblings ...)
  (?)
@ 2017-09-26  4:25 ` kbuild test robot
  -1 siblings, 0 replies; 51+ messages in thread
From: kbuild test robot @ 2017-09-26  4:25 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: kbuild-all, akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey

[-- Attachment #1: Type: text/plain, Size: 16883 bytes --]

Hi Aliaksandr,

[auto build test WARNING on linus/master]
[also build test WARNING on v4.14-rc2 next-20170922]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alexey-Dobriyan/fdmap-2/20170926-105150
config: frv-defconfig (attached as .config)
compiler: frv-linux-gcc (GCC) 6.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=frv 

All warnings (new ones prefixed by >>):

   frv-linux-ld: Warning: size of symbol `sys_setuid' changed from 272 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setregid' changed from 308 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setgid' changed from 212 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setreuid' changed from 456 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setresuid' changed from 424 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getresuid' changed from 220 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setresgid' changed from 352 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getresgid' changed from 216 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setfsuid' changed from 248 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setfsgid' changed from 224 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_capget' changed from 448 in kernel/capability.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_capset' changed from 492 in kernel/capability.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getgroups' changed from 8 in kernel/sys_ni.o to 176 in kernel/groups.o
   frv-linux-ld: Warning: size of symbol `sys_setgroups' changed from 8 in kernel/sys_ni.o to 392 in kernel/groups.o
   frv-linux-ld: Warning: size of symbol `sched_clock' changed from 56 in arch/frv/kernel/time.o to 40 in kernel/sched/clock.o
   frv-linux-ld: Warning: size of symbol `arch_cpu_idle' changed from 64 in arch/frv/kernel/process.o to 20 in kernel/sched/idle.o
   frv-linux-ld: Warning: size of symbol `sys_membarrier' changed from 8 in kernel/sys_ni.o to 44 in kernel/sched/membarrier.o
   frv-linux-ld: Warning: size of symbol `sys_syslog' changed from 8 in kernel/sys_ni.o to 16 in kernel/printk/printk.o
   frv-linux-ld: Warning: size of symbol `early_irq_init' changed from 8 in kernel/softirq.o to 240 in kernel/irq/irqdesc.o
   frv-linux-ld: Warning: size of symbol `arch_show_interrupts' changed from 84 in arch/frv/kernel/irq.o to 8 in kernel/irq/proc.o
   frv-linux-ld: Warning: size of symbol `read_persistent_clock' changed from 160 in arch/frv/kernel/time.o to 12 in kernel/time/timekeeping.o
   frv-linux-ld: Warning: size of symbol `sys_set_robust_list' changed from 8 in kernel/sys_ni.o to 48 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_get_robust_list' changed from 8 in kernel/sys_ni.o to 276 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_futex' changed from 8 in kernel/sys_ni.o to 400 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_chown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_lchown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_fchown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setregid16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setreuid16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setresuid16' changed from 8 in kernel/sys_ni.o to 120 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getresuid16' changed from 8 in kernel/sys_ni.o to 312 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setresgid16' changed from 8 in kernel/sys_ni.o to 120 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getresgid16' changed from 8 in kernel/sys_ni.o to 312 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setfsuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setfsgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getgroups16' changed from 8 in kernel/sys_ni.o to 240 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setgroups16' changed from 8 in kernel/sys_ni.o to 348 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_geteuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getegid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_memfd_create' changed from 8 in kernel/sys_ni.o to 584 in mm/shmem.o
   frv-linux-ld: Warning: size of symbol `sys_mincore' changed from 8 in kernel/sys_ni.o to 608 in mm/mincore.o
   frv-linux-ld: Warning: size of symbol `sys_mlock' changed from 8 in kernel/sys_ni.o to 16 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_mlock2' changed from 8 in kernel/sys_ni.o to 52 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_munlock' changed from 8 in kernel/sys_ni.o to 128 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_mlockall' changed from 8 in kernel/sys_ni.o to 356 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_munlockall' changed from 8 in kernel/sys_ni.o to 84 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_remap_file_pages' changed from 8 in kernel/sys_ni.o to 712 in mm/mmap.o
   frv-linux-ld: Warning: size of symbol `sys_mprotect' changed from 8 in kernel/sys_ni.o to 628 in mm/mprotect.o
   frv-linux-ld: Warning: size of symbol `sys_mremap' changed from 8 in kernel/sys_ni.o to 1148 in mm/mremap.o
   frv-linux-ld: Warning: size of symbol `sys_msync' changed from 8 in kernel/sys_ni.o to 560 in mm/msync.o
   frv-linux-ld: Warning: size of symbol `sys_process_vm_readv' changed from 8 in kernel/sys_ni.o to 32 in mm/process_vm_access.o
   frv-linux-ld: Warning: size of symbol `sys_process_vm_writev' changed from 8 in kernel/sys_ni.o to 36 in mm/process_vm_access.o
   frv-linux-ld: Warning: size of symbol `sys_fadvise64_64' changed from 8 in kernel/sys_ni.o to 804 in mm/fadvise.o
   frv-linux-ld: Warning: size of symbol `sys_fadvise64' changed from 8 in kernel/sys_ni.o to 28 in mm/fadvise.o
   frv-linux-ld: Warning: size of symbol `sys_madvise' changed from 8 in kernel/sys_ni.o to 2156 in mm/madvise.o
   frv-linux-ld: Warning: size of symbol `sys_swapoff' changed from 8 in kernel/sys_ni.o to 1356 in mm/swapfile.o
   frv-linux-ld: Warning: size of symbol `sys_swapon' changed from 8 in kernel/sys_ni.o to 3968 in mm/swapfile.o
   frv-linux-ld: Warning: size of symbol `sys_copy_file_range' changed from 8 in kernel/sys_ni.o to 700 in fs/read_write.o
   frv-linux-ld: Warning: size of symbol `sys_execveat' changed from 8 in kernel/sys_ni.o to 128 in fs/exec.o
   frv-linux-ld: Warning: size of symbol `sys_sysfs' changed from 8 in kernel/sys_ni.o to 436 in fs/filesystems.o
>> frv-linux-ld: Warning: size of symbol `sys_fdmap' changed from 8 in kernel/sys_ni.o to 704 in fs/fdmap.o
   frv-linux-ld: Warning: size of symbol `sys_bdflush' changed from 8 in kernel/sys_ni.o to 132 in fs/buffer.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_init1' changed from 8 in kernel/sys_ni.o to 340 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_init' changed from 8 in kernel/sys_ni.o to 16 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_add_watch' changed from 8 in kernel/sys_ni.o to 832 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_rm_watch' changed from 8 in kernel/sys_ni.o to 204 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_create1' changed from 8 in kernel/sys_ni.o to 384 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_create' changed from 8 in kernel/sys_ni.o to 32 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_ctl' changed from 8 in kernel/sys_ni.o to 2856 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_wait' changed from 8 in kernel/sys_ni.o to 1152 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_pwait' changed from 8 in kernel/sys_ni.o to 344 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_signalfd4' changed from 8 in kernel/sys_ni.o to 468 in fs/signalfd.o
   frv-linux-ld: Warning: size of symbol `sys_signalfd' changed from 8 in kernel/sys_ni.o to 16 in fs/signalfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_create' changed from 8 in kernel/sys_ni.o to 428 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_settime' changed from 8 in kernel/sys_ni.o to 1248 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_gettime' changed from 8 in kernel/sys_ni.o to 468 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_eventfd2' changed from 8 in kernel/sys_ni.o to 208 in fs/eventfd.o
   frv-linux-ld: Warning: size of symbol `sys_eventfd' changed from 8 in kernel/sys_ni.o to 16 in fs/eventfd.o
   frv-linux-ld: Warning: size of symbol `sys_io_setup' changed from 8 in kernel/sys_ni.o to 1944 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_destroy' changed from 8 in kernel/sys_ni.o to 232 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_submit' changed from 8 in kernel/sys_ni.o to 1620 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_cancel' changed from 8 in kernel/sys_ni.o to 348 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_getevents' changed from 8 in kernel/sys_ni.o to 732 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_flock' changed from 8 in kernel/sys_ni.o to 448 in fs/locks.o
   frv-linux-ld: Warning: size of symbol `sys_name_to_handle_at' changed from 8 in kernel/sys_ni.o to 640 in fs/fhandle.o
   frv-linux-ld: Warning: size of symbol `sys_open_by_handle_at' changed from 8 in kernel/sys_ni.o to 12 in fs/fhandle.o
   frv-linux-ld: Warning: size of symbol `sys_msgget' changed from 8 in kernel/sys_ni.o to 60 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgctl' changed from 8 in kernel/sys_ni.o to 1856 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgsnd' changed from 8 in kernel/sys_ni.o to 948 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgrcv' changed from 8 in kernel/sys_ni.o to 1052 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_semget' changed from 8 in kernel/sys_ni.o to 112 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semctl' changed from 8 in kernel/sys_ni.o to 2828 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semtimedop' changed from 8 in kernel/sys_ni.o to 148 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semop' changed from 8 in kernel/sys_ni.o to 16 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_shmget' changed from 8 in kernel/sys_ni.o to 64 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmctl' changed from 8 in kernel/sys_ni.o to 2008 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmat' changed from 8 in kernel/sys_ni.o to 52 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmdt' changed from 8 in kernel/sys_ni.o to 472 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_ipc' changed from 8 in kernel/sys_ni.o to 560 in ipc/syscall.o
   frv-linux-ld: Warning: size of symbol `sys_mq_open' changed from 8 in kernel/sys_ni.o to 748 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_unlink' changed from 8 in kernel/sys_ni.o to 308 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_timedsend' changed from 8 in kernel/sys_ni.o to 812 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_timedreceive' changed from 8 in kernel/sys_ni.o to 1180 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_notify' changed from 8 in kernel/sys_ni.o to 1036 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_getsetattr' changed from 8 in kernel/sys_ni.o to 588 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_ioprio_set' changed from 8 in kernel/sys_ni.o to 732 in block/ioprio.o
   frv-linux-ld: Warning: size of symbol `sys_ioprio_get' changed from 8 in kernel/sys_ni.o to 852 in block/ioprio.o
   frv-linux-ld: Warning: size of symbol `pcibios_fixup_bus' changed from 108 in arch/frv/mb93090-mb00/pci-vdk.o to 4 in drivers/pci/probe.o
   frv-linux-ld: Warning: size of symbol `pcibios_enable_device' changed from 100 in arch/frv/mb93090-mb00/pci-vdk.o to 12 in drivers/pci/pci.o
   frv-linux-ld: Warning: size of symbol `pcibios_setup' changed from 76 in arch/frv/mb93090-mb00/pci-vdk.o to 4 in drivers/pci/pci.o
   frv-linux-ld: Warning: size of symbol `pcibios_align_resource' changed from 60 in arch/frv/mb93090-mb00/pci-frv.o to 8 in drivers/pci/setup-res.o
   frv-linux-ld: Warning: size of symbol `sys_socket' changed from 8 in kernel/sys_ni.o to 248 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_socketpair' changed from 8 in kernel/sys_ni.o to 644 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_bind' changed from 8 in kernel/sys_ni.o to 176 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_listen' changed from 8 in kernel/sys_ni.o to 144 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_accept4' changed from 8 in kernel/sys_ni.o to 456 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_accept' changed from 8 in kernel/sys_ni.o to 16 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_connect' changed from 8 in kernel/sys_ni.o to 184 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_getsockname' changed from 8 in kernel/sys_ni.o to 188 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_getpeername' changed from 8 in kernel/sys_ni.o to 188 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_sendto' changed from 8 in kernel/sys_ni.o to 256 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_send' changed from 8 in kernel/sys_ni.o to 20 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_recvfrom' changed from 8 in kernel/sys_ni.o to 308 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_recv' changed from 8 in kernel/sys_ni.o to 20 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_setsockopt' changed from 8 in kernel/sys_ni.o to 192 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_getsockopt' changed from 8 in kernel/sys_ni.o to 168 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_shutdown' changed from 8 in kernel/sys_ni.o to 124 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_sendmsg' changed from 8 in kernel/sys_ni.o to 12 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_sendmmsg' changed from 8 in kernel/sys_ni.o to 12 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_recvmsg' changed from 8 in kernel/sys_ni.o to 12 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_recvmmsg' changed from 8 in kernel/sys_ni.o to 292 in net/socket.o
   frv-linux-ld: Warning: size of symbol `sys_socketcall' changed from 8 in kernel/sys_ni.o to 496 in net/socket.o
   frv-linux-ld: Warning: size of symbol `skb_copy_bits' changed from 8 in kernel/bpf/core.o to 716 in net/core/skbuff.o
   frv-linux-ld: Warning: size of symbol `bpf_helper_changes_pkt_data' changed from 8 in kernel/bpf/core.o to 216 in net/core/filter.o

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 8895 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
  2017-09-24 20:08   ` Alexey Dobriyan
@ 2017-09-26  5:44     ` kbuild test robot
  -1 siblings, 0 replies; 51+ messages in thread
From: kbuild test robot @ 2017-09-26  5:44 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: kbuild-all, akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey, Tatsiana_Brouka, Aliaksandr_Patseyenak1

[-- Attachment #1: Type: text/plain, Size: 13955 bytes --]

Hi Tatsiana,

[auto build test WARNING on linus/master]
[also build test WARNING on v4.14-rc2 next-20170922]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alexey-Dobriyan/fdmap-2/20170926-105150
config: frv-defconfig (attached as .config)
compiler: frv-linux-gcc (GCC) 6.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=frv 

All warnings (new ones prefixed by >>):

   frv-linux-ld: Warning: size of symbol `sys_setuid' changed from 272 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setregid' changed from 308 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setgid' changed from 212 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setreuid' changed from 456 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setresuid' changed from 424 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getresuid' changed from 220 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setresgid' changed from 352 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getresgid' changed from 216 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setfsuid' changed from 248 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setfsgid' changed from 224 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_capget' changed from 448 in kernel/capability.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_capset' changed from 492 in kernel/capability.o to 8 in kernel/sys_ni.o
>> frv-linux-ld: Warning: size of symbol `sys_pidmap' changed from 8 in kernel/sys_ni.o to 720 in kernel/pidmap.o
   frv-linux-ld: Warning: size of symbol `sys_getgroups' changed from 8 in kernel/sys_ni.o to 176 in kernel/groups.o
   frv-linux-ld: Warning: size of symbol `sys_setgroups' changed from 8 in kernel/sys_ni.o to 392 in kernel/groups.o
   frv-linux-ld: Warning: size of symbol `sched_clock' changed from 56 in arch/frv/kernel/time.o to 40 in kernel/sched/clock.o
   frv-linux-ld: Warning: size of symbol `arch_cpu_idle' changed from 64 in arch/frv/kernel/process.o to 20 in kernel/sched/idle.o
   frv-linux-ld: Warning: size of symbol `sys_membarrier' changed from 8 in kernel/sys_ni.o to 44 in kernel/sched/membarrier.o
   frv-linux-ld: Warning: size of symbol `sys_syslog' changed from 8 in kernel/sys_ni.o to 16 in kernel/printk/printk.o
   frv-linux-ld: Warning: size of symbol `early_irq_init' changed from 8 in kernel/softirq.o to 240 in kernel/irq/irqdesc.o
   frv-linux-ld: Warning: size of symbol `arch_show_interrupts' changed from 84 in arch/frv/kernel/irq.o to 8 in kernel/irq/proc.o
   frv-linux-ld: Warning: size of symbol `read_persistent_clock' changed from 160 in arch/frv/kernel/time.o to 12 in kernel/time/timekeeping.o
   frv-linux-ld: Warning: size of symbol `sys_set_robust_list' changed from 8 in kernel/sys_ni.o to 48 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_get_robust_list' changed from 8 in kernel/sys_ni.o to 276 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_futex' changed from 8 in kernel/sys_ni.o to 400 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_chown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_lchown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_fchown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setregid16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setreuid16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setresuid16' changed from 8 in kernel/sys_ni.o to 120 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getresuid16' changed from 8 in kernel/sys_ni.o to 312 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setresgid16' changed from 8 in kernel/sys_ni.o to 120 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getresgid16' changed from 8 in kernel/sys_ni.o to 312 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setfsuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setfsgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getgroups16' changed from 8 in kernel/sys_ni.o to 240 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setgroups16' changed from 8 in kernel/sys_ni.o to 348 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_geteuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getegid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_memfd_create' changed from 8 in kernel/sys_ni.o to 584 in mm/shmem.o
   frv-linux-ld: Warning: size of symbol `sys_mincore' changed from 8 in kernel/sys_ni.o to 608 in mm/mincore.o
   frv-linux-ld: Warning: size of symbol `sys_mlock' changed from 8 in kernel/sys_ni.o to 16 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_mlock2' changed from 8 in kernel/sys_ni.o to 52 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_munlock' changed from 8 in kernel/sys_ni.o to 128 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_mlockall' changed from 8 in kernel/sys_ni.o to 356 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_munlockall' changed from 8 in kernel/sys_ni.o to 84 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_remap_file_pages' changed from 8 in kernel/sys_ni.o to 712 in mm/mmap.o
   frv-linux-ld: Warning: size of symbol `sys_mprotect' changed from 8 in kernel/sys_ni.o to 628 in mm/mprotect.o
   frv-linux-ld: Warning: size of symbol `sys_mremap' changed from 8 in kernel/sys_ni.o to 1148 in mm/mremap.o
   frv-linux-ld: Warning: size of symbol `sys_msync' changed from 8 in kernel/sys_ni.o to 560 in mm/msync.o
   frv-linux-ld: Warning: size of symbol `sys_process_vm_readv' changed from 8 in kernel/sys_ni.o to 32 in mm/process_vm_access.o
   frv-linux-ld: Warning: size of symbol `sys_process_vm_writev' changed from 8 in kernel/sys_ni.o to 36 in mm/process_vm_access.o
   frv-linux-ld: Warning: size of symbol `sys_fadvise64_64' changed from 8 in kernel/sys_ni.o to 804 in mm/fadvise.o
   frv-linux-ld: Warning: size of symbol `sys_fadvise64' changed from 8 in kernel/sys_ni.o to 28 in mm/fadvise.o
   frv-linux-ld: Warning: size of symbol `sys_madvise' changed from 8 in kernel/sys_ni.o to 2156 in mm/madvise.o
   frv-linux-ld: Warning: size of symbol `sys_swapoff' changed from 8 in kernel/sys_ni.o to 1356 in mm/swapfile.o
   frv-linux-ld: Warning: size of symbol `sys_swapon' changed from 8 in kernel/sys_ni.o to 3968 in mm/swapfile.o
   frv-linux-ld: Warning: size of symbol `sys_copy_file_range' changed from 8 in kernel/sys_ni.o to 700 in fs/read_write.o
   frv-linux-ld: Warning: size of symbol `sys_execveat' changed from 8 in kernel/sys_ni.o to 128 in fs/exec.o
   frv-linux-ld: Warning: size of symbol `sys_sysfs' changed from 8 in kernel/sys_ni.o to 436 in fs/filesystems.o
   frv-linux-ld: Warning: size of symbol `sys_fdmap' changed from 8 in kernel/sys_ni.o to 704 in fs/fdmap.o
   frv-linux-ld: Warning: size of symbol `sys_bdflush' changed from 8 in kernel/sys_ni.o to 132 in fs/buffer.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_init1' changed from 8 in kernel/sys_ni.o to 340 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_init' changed from 8 in kernel/sys_ni.o to 16 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_add_watch' changed from 8 in kernel/sys_ni.o to 832 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_rm_watch' changed from 8 in kernel/sys_ni.o to 204 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_create1' changed from 8 in kernel/sys_ni.o to 384 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_create' changed from 8 in kernel/sys_ni.o to 32 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_ctl' changed from 8 in kernel/sys_ni.o to 2856 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_wait' changed from 8 in kernel/sys_ni.o to 1152 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_pwait' changed from 8 in kernel/sys_ni.o to 344 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_signalfd4' changed from 8 in kernel/sys_ni.o to 468 in fs/signalfd.o
   frv-linux-ld: Warning: size of symbol `sys_signalfd' changed from 8 in kernel/sys_ni.o to 16 in fs/signalfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_create' changed from 8 in kernel/sys_ni.o to 428 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_settime' changed from 8 in kernel/sys_ni.o to 1248 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_gettime' changed from 8 in kernel/sys_ni.o to 468 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_eventfd2' changed from 8 in kernel/sys_ni.o to 208 in fs/eventfd.o
   frv-linux-ld: Warning: size of symbol `sys_eventfd' changed from 8 in kernel/sys_ni.o to 16 in fs/eventfd.o
   frv-linux-ld: Warning: size of symbol `sys_io_setup' changed from 8 in kernel/sys_ni.o to 1944 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_destroy' changed from 8 in kernel/sys_ni.o to 232 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_submit' changed from 8 in kernel/sys_ni.o to 1620 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_cancel' changed from 8 in kernel/sys_ni.o to 348 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_getevents' changed from 8 in kernel/sys_ni.o to 732 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_flock' changed from 8 in kernel/sys_ni.o to 448 in fs/locks.o
   frv-linux-ld: Warning: size of symbol `sys_name_to_handle_at' changed from 8 in kernel/sys_ni.o to 640 in fs/fhandle.o
   frv-linux-ld: Warning: size of symbol `sys_open_by_handle_at' changed from 8 in kernel/sys_ni.o to 12 in fs/fhandle.o
   frv-linux-ld: Warning: size of symbol `sys_msgget' changed from 8 in kernel/sys_ni.o to 60 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgctl' changed from 8 in kernel/sys_ni.o to 1856 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgsnd' changed from 8 in kernel/sys_ni.o to 948 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgrcv' changed from 8 in kernel/sys_ni.o to 1052 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_semget' changed from 8 in kernel/sys_ni.o to 112 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semctl' changed from 8 in kernel/sys_ni.o to 2828 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semtimedop' changed from 8 in kernel/sys_ni.o to 148 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semop' changed from 8 in kernel/sys_ni.o to 16 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_shmget' changed from 8 in kernel/sys_ni.o to 64 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmctl' changed from 8 in kernel/sys_ni.o to 2008 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmat' changed from 8 in kernel/sys_ni.o to 52 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmdt' changed from 8 in kernel/sys_ni.o to 472 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_ipc' changed from 8 in kernel/sys_ni.o to 560 in ipc/syscall.o
   frv-linux-ld: Warning: size of symbol `sys_mq_open' changed from 8 in kernel/sys_ni.o to 748 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_unlink' changed from 8 in kernel/sys_ni.o to 308 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_timedsend' changed from 8 in kernel/sys_ni.o to 812 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_timedreceive' changed from 8 in kernel/sys_ni.o to 1180 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_notify' changed from 8 in kernel/sys_ni.o to 1036 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_getsetattr' changed from 8 in kernel/sys_ni.o to 588 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_ioprio_set' changed from 8 in kernel/sys_ni.o to 732 in block/ioprio.o
   frv-linux-ld: Warning: size of symbol `sys_ioprio_get' changed from 8 in kernel/sys_ni.o to 852 in block/ioprio.o
   frv-linux-ld: Warning: size of symbol `pcibios_fixup_bus' changed from 108 in arch/frv/mb93090-mb00/pci-vdk.o to 4 in drivers/pci/probe.o

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 8898 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
@ 2017-09-26  5:44     ` kbuild test robot
  0 siblings, 0 replies; 51+ messages in thread
From: kbuild test robot @ 2017-09-26  5:44 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: kbuild-all-JC7UmRfGjtg, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w,
	Tatsiana_Brouka-uRwfk40T5oI, Aliaksandr_Patseyenak1-uRwfk40T5oI

[-- Attachment #1: Type: text/plain, Size: 13955 bytes --]

Hi Tatsiana,

[auto build test WARNING on linus/master]
[also build test WARNING on v4.14-rc2 next-20170922]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alexey-Dobriyan/fdmap-2/20170926-105150
config: frv-defconfig (attached as .config)
compiler: frv-linux-gcc (GCC) 6.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=frv 

All warnings (new ones prefixed by >>):

   frv-linux-ld: Warning: size of symbol `sys_setuid' changed from 272 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setregid' changed from 308 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setgid' changed from 212 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setreuid' changed from 456 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setresuid' changed from 424 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getresuid' changed from 220 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setresgid' changed from 352 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_getresgid' changed from 216 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setfsuid' changed from 248 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_setfsgid' changed from 224 in kernel/sys.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_capget' changed from 448 in kernel/capability.o to 8 in kernel/sys_ni.o
   frv-linux-ld: Warning: size of symbol `sys_capset' changed from 492 in kernel/capability.o to 8 in kernel/sys_ni.o
>> frv-linux-ld: Warning: size of symbol `sys_pidmap' changed from 8 in kernel/sys_ni.o to 720 in kernel/pidmap.o
   frv-linux-ld: Warning: size of symbol `sys_getgroups' changed from 8 in kernel/sys_ni.o to 176 in kernel/groups.o
   frv-linux-ld: Warning: size of symbol `sys_setgroups' changed from 8 in kernel/sys_ni.o to 392 in kernel/groups.o
   frv-linux-ld: Warning: size of symbol `sched_clock' changed from 56 in arch/frv/kernel/time.o to 40 in kernel/sched/clock.o
   frv-linux-ld: Warning: size of symbol `arch_cpu_idle' changed from 64 in arch/frv/kernel/process.o to 20 in kernel/sched/idle.o
   frv-linux-ld: Warning: size of symbol `sys_membarrier' changed from 8 in kernel/sys_ni.o to 44 in kernel/sched/membarrier.o
   frv-linux-ld: Warning: size of symbol `sys_syslog' changed from 8 in kernel/sys_ni.o to 16 in kernel/printk/printk.o
   frv-linux-ld: Warning: size of symbol `early_irq_init' changed from 8 in kernel/softirq.o to 240 in kernel/irq/irqdesc.o
   frv-linux-ld: Warning: size of symbol `arch_show_interrupts' changed from 84 in arch/frv/kernel/irq.o to 8 in kernel/irq/proc.o
   frv-linux-ld: Warning: size of symbol `read_persistent_clock' changed from 160 in arch/frv/kernel/time.o to 12 in kernel/time/timekeeping.o
   frv-linux-ld: Warning: size of symbol `sys_set_robust_list' changed from 8 in kernel/sys_ni.o to 48 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_get_robust_list' changed from 8 in kernel/sys_ni.o to 276 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_futex' changed from 8 in kernel/sys_ni.o to 400 in kernel/futex.o
   frv-linux-ld: Warning: size of symbol `sys_chown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_lchown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_fchown16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setregid16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setreuid16' changed from 8 in kernel/sys_ni.o to 84 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setresuid16' changed from 8 in kernel/sys_ni.o to 120 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getresuid16' changed from 8 in kernel/sys_ni.o to 312 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setresgid16' changed from 8 in kernel/sys_ni.o to 120 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getresgid16' changed from 8 in kernel/sys_ni.o to 312 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setfsuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setfsgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getgroups16' changed from 8 in kernel/sys_ni.o to 240 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_setgroups16' changed from 8 in kernel/sys_ni.o to 348 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_geteuid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getgid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_getegid16' changed from 8 in kernel/sys_ni.o to 48 in kernel/uid16.o
   frv-linux-ld: Warning: size of symbol `sys_memfd_create' changed from 8 in kernel/sys_ni.o to 584 in mm/shmem.o
   frv-linux-ld: Warning: size of symbol `sys_mincore' changed from 8 in kernel/sys_ni.o to 608 in mm/mincore.o
   frv-linux-ld: Warning: size of symbol `sys_mlock' changed from 8 in kernel/sys_ni.o to 16 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_mlock2' changed from 8 in kernel/sys_ni.o to 52 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_munlock' changed from 8 in kernel/sys_ni.o to 128 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_mlockall' changed from 8 in kernel/sys_ni.o to 356 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_munlockall' changed from 8 in kernel/sys_ni.o to 84 in mm/mlock.o
   frv-linux-ld: Warning: size of symbol `sys_remap_file_pages' changed from 8 in kernel/sys_ni.o to 712 in mm/mmap.o
   frv-linux-ld: Warning: size of symbol `sys_mprotect' changed from 8 in kernel/sys_ni.o to 628 in mm/mprotect.o
   frv-linux-ld: Warning: size of symbol `sys_mremap' changed from 8 in kernel/sys_ni.o to 1148 in mm/mremap.o
   frv-linux-ld: Warning: size of symbol `sys_msync' changed from 8 in kernel/sys_ni.o to 560 in mm/msync.o
   frv-linux-ld: Warning: size of symbol `sys_process_vm_readv' changed from 8 in kernel/sys_ni.o to 32 in mm/process_vm_access.o
   frv-linux-ld: Warning: size of symbol `sys_process_vm_writev' changed from 8 in kernel/sys_ni.o to 36 in mm/process_vm_access.o
   frv-linux-ld: Warning: size of symbol `sys_fadvise64_64' changed from 8 in kernel/sys_ni.o to 804 in mm/fadvise.o
   frv-linux-ld: Warning: size of symbol `sys_fadvise64' changed from 8 in kernel/sys_ni.o to 28 in mm/fadvise.o
   frv-linux-ld: Warning: size of symbol `sys_madvise' changed from 8 in kernel/sys_ni.o to 2156 in mm/madvise.o
   frv-linux-ld: Warning: size of symbol `sys_swapoff' changed from 8 in kernel/sys_ni.o to 1356 in mm/swapfile.o
   frv-linux-ld: Warning: size of symbol `sys_swapon' changed from 8 in kernel/sys_ni.o to 3968 in mm/swapfile.o
   frv-linux-ld: Warning: size of symbol `sys_copy_file_range' changed from 8 in kernel/sys_ni.o to 700 in fs/read_write.o
   frv-linux-ld: Warning: size of symbol `sys_execveat' changed from 8 in kernel/sys_ni.o to 128 in fs/exec.o
   frv-linux-ld: Warning: size of symbol `sys_sysfs' changed from 8 in kernel/sys_ni.o to 436 in fs/filesystems.o
   frv-linux-ld: Warning: size of symbol `sys_fdmap' changed from 8 in kernel/sys_ni.o to 704 in fs/fdmap.o
   frv-linux-ld: Warning: size of symbol `sys_bdflush' changed from 8 in kernel/sys_ni.o to 132 in fs/buffer.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_init1' changed from 8 in kernel/sys_ni.o to 340 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_init' changed from 8 in kernel/sys_ni.o to 16 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_add_watch' changed from 8 in kernel/sys_ni.o to 832 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_inotify_rm_watch' changed from 8 in kernel/sys_ni.o to 204 in fs/notify/inotify/inotify_user.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_create1' changed from 8 in kernel/sys_ni.o to 384 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_create' changed from 8 in kernel/sys_ni.o to 32 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_ctl' changed from 8 in kernel/sys_ni.o to 2856 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_wait' changed from 8 in kernel/sys_ni.o to 1152 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_epoll_pwait' changed from 8 in kernel/sys_ni.o to 344 in fs/eventpoll.o
   frv-linux-ld: Warning: size of symbol `sys_signalfd4' changed from 8 in kernel/sys_ni.o to 468 in fs/signalfd.o
   frv-linux-ld: Warning: size of symbol `sys_signalfd' changed from 8 in kernel/sys_ni.o to 16 in fs/signalfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_create' changed from 8 in kernel/sys_ni.o to 428 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_settime' changed from 8 in kernel/sys_ni.o to 1248 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_timerfd_gettime' changed from 8 in kernel/sys_ni.o to 468 in fs/timerfd.o
   frv-linux-ld: Warning: size of symbol `sys_eventfd2' changed from 8 in kernel/sys_ni.o to 208 in fs/eventfd.o
   frv-linux-ld: Warning: size of symbol `sys_eventfd' changed from 8 in kernel/sys_ni.o to 16 in fs/eventfd.o
   frv-linux-ld: Warning: size of symbol `sys_io_setup' changed from 8 in kernel/sys_ni.o to 1944 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_destroy' changed from 8 in kernel/sys_ni.o to 232 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_submit' changed from 8 in kernel/sys_ni.o to 1620 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_cancel' changed from 8 in kernel/sys_ni.o to 348 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_io_getevents' changed from 8 in kernel/sys_ni.o to 732 in fs/aio.o
   frv-linux-ld: Warning: size of symbol `sys_flock' changed from 8 in kernel/sys_ni.o to 448 in fs/locks.o
   frv-linux-ld: Warning: size of symbol `sys_name_to_handle_at' changed from 8 in kernel/sys_ni.o to 640 in fs/fhandle.o
   frv-linux-ld: Warning: size of symbol `sys_open_by_handle_at' changed from 8 in kernel/sys_ni.o to 12 in fs/fhandle.o
   frv-linux-ld: Warning: size of symbol `sys_msgget' changed from 8 in kernel/sys_ni.o to 60 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgctl' changed from 8 in kernel/sys_ni.o to 1856 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgsnd' changed from 8 in kernel/sys_ni.o to 948 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_msgrcv' changed from 8 in kernel/sys_ni.o to 1052 in ipc/msg.o
   frv-linux-ld: Warning: size of symbol `sys_semget' changed from 8 in kernel/sys_ni.o to 112 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semctl' changed from 8 in kernel/sys_ni.o to 2828 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semtimedop' changed from 8 in kernel/sys_ni.o to 148 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_semop' changed from 8 in kernel/sys_ni.o to 16 in ipc/sem.o
   frv-linux-ld: Warning: size of symbol `sys_shmget' changed from 8 in kernel/sys_ni.o to 64 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmctl' changed from 8 in kernel/sys_ni.o to 2008 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmat' changed from 8 in kernel/sys_ni.o to 52 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_shmdt' changed from 8 in kernel/sys_ni.o to 472 in ipc/shm.o
   frv-linux-ld: Warning: size of symbol `sys_ipc' changed from 8 in kernel/sys_ni.o to 560 in ipc/syscall.o
   frv-linux-ld: Warning: size of symbol `sys_mq_open' changed from 8 in kernel/sys_ni.o to 748 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_unlink' changed from 8 in kernel/sys_ni.o to 308 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_timedsend' changed from 8 in kernel/sys_ni.o to 812 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_timedreceive' changed from 8 in kernel/sys_ni.o to 1180 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_notify' changed from 8 in kernel/sys_ni.o to 1036 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_mq_getsetattr' changed from 8 in kernel/sys_ni.o to 588 in ipc/mqueue.o
   frv-linux-ld: Warning: size of symbol `sys_ioprio_set' changed from 8 in kernel/sys_ni.o to 732 in block/ioprio.o
   frv-linux-ld: Warning: size of symbol `sys_ioprio_get' changed from 8 in kernel/sys_ni.o to 852 in block/ioprio.o
   frv-linux-ld: Warning: size of symbol `pcibios_fixup_bus' changed from 108 in arch/frv/mb93090-mb00/pci-vdk.o to 4 in drivers/pci/probe.o

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 8898 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-26 18:43     ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-26 18:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov, Tatsiana_Brouka,
	Aliaksandr_Patseyenak1

On Sun, Sep 24, 2017 at 02:31:23PM -0700, Andy Lutomirski wrote:
> On Sun, Sep 24, 2017 at 1:06 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> >
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> >
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> >
> > /proc, the only way to learn anything about file descriptors may not be
> > available. There is unavoidable overhead associated with instantiating
> > 3 dentries and 3 inodes and converting integers to strings and back.
> >
> > Benchmark:
> >
> >         N=1<<22 times
> >         4 opened descriptors (0, 1, 2, 3)
> >         opendir+readdir+closedir /proc/self/fd vs fdmap
> >
> >         /proc 8.31 ą 0.37%
> >         fdmap 0.32 ą 0.72%
> 
> This doesn't have the semantic problem that pidmap does, but I still
> wonder why this can't be accomplished by adding a new file in /proc.

It can be done in /proc but the point of the exercise is to skip all the
overhead: in this case dcache, 1 descriptor for readdir, conversion
from binary to string.

The problem is much deeper, namely, EIATF people force everyone else
to cater to Unix shells so that they can do read() on them because
Unix shells can't do system calls like real programming languages.
The only way to fix this problem is to ignore Unix shells and start
introducing binary system calls so that normal people aren't forced
to make their programs slower than necessary.

Example: lsof(1) does close() from 3 to 1023 inclusive on startup.
I don't know why but it does it. 1 syscall = 1 us, 1000 syscalls = 1 ms
wasted because all of them will return -EBADF normally. With fdmap(2),
lsof would do 2 fdmap() calls (1 real + 1 to confirm no more descriptors
are available + 0 closes in normal situation). That's 2 syscalls vs 1020.

Obviously, for binary model to work fdmap(2) needs to be complemented
by other system calls all of which will bypass /proc for, say, extracting
/proc/$PID/fd/$i symlink content and fdinfo. Currently, if you use
fdmap(2) you still have to fish in /proc for the rest of the data.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-26 18:43     ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-26 18:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrew Morton, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Tatsiana_Brouka-uRwfk40T5oI, Aliaksandr_Patseyenak1-uRwfk40T5oI

On Sun, Sep 24, 2017 at 02:31:23PM -0700, Andy Lutomirski wrote:
> On Sun, Sep 24, 2017 at 1:06 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> >
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> >
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> >
> > /proc, the only way to learn anything about file descriptors may not be
> > available. There is unavoidable overhead associated with instantiating
> > 3 dentries and 3 inodes and converting integers to strings and back.
> >
> > Benchmark:
> >
> >         N=1<<22 times
> >         4 opened descriptors (0, 1, 2, 3)
> >         opendir+readdir+closedir /proc/self/fd vs fdmap
> >
> >         /proc 8.31 ą 0.37%
> >         fdmap 0.32 ą 0.72%
> 
> This doesn't have the semantic problem that pidmap does, but I still
> wonder why this can't be accomplished by adding a new file in /proc.

It can be done in /proc but the point of the exercise is to skip all the
overhead: in this case dcache, 1 descriptor for readdir, conversion
from binary to string.

The problem is much deeper, namely, EIATF people force everyone else
to cater to Unix shells so that they can do read() on them because
Unix shells can't do system calls like real programming languages.
The only way to fix this problem is to ignore Unix shells and start
introducing binary system calls so that normal people aren't forced
to make their programs slower than necessary.

Example: lsof(1) does close() from 3 to 1023 inclusive on startup.
I don't know why but it does it. 1 syscall = 1 us, 1000 syscalls = 1 ms
wasted because all of them will return -EBADF normally. With fdmap(2),
lsof would do 2 fdmap() calls (1 real + 1 to confirm no more descriptors
are available + 0 closes in normal situation). That's 2 syscalls vs 1020.

Obviously, for binary model to work fdmap(2) needs to be complemented
by other system calls all of which will bypass /proc for, say, extracting
/proc/$PID/fd/$i symlink content and fdinfo. Currently, if you use
fdmap(2) you still have to fish in /proc for the rest of the data.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
@ 2017-09-26 18:46       ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-26 18:46 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov, Tatsiana Brouka,
	Aliaksandr Patseyenak

On Sun, Sep 24, 2017 at 02:27:00PM -0700, Andy Lutomirski wrote:
> On Sun, Sep 24, 2017 at 1:08 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> > From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
> >
> > Implement system call for bulk retrieveing of pids in binary form.
> >
> > Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> > converting with atoi() + instantiating dentries and inodes.
> >
> > /proc may be not mounted especially in containers. Natural extension of
> > hidepid=2 efforts is to not mount /proc at all.
> >
> > It could be used by programs like ps, top or CRIU. Speed increase will
> > become more drastic once combined with bulk retrieval of process statistics.
> >
> > Benchmark:
> >
> >         N=1<<16 times
> >         ~130 processes (~250 task_structs) on a regular desktop system
> >         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
> >         (roughly what htop(1) does) vs pidmap
> >
> >         /proc 16.80 ± 0.73%
> >         pidmap 0.06 ± 0.31%
> >
> > PIDMAP_* flags are modelled after /proc/task_diag patchset.
> >
> >
> > PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)
> >
> > NAME
> >        pidmap - get allocated PIDs
> >
> > SYNOPSIS
> >        long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);
> 
> I think we will seriously regret a syscall that does this.  Djalal is
> working on fixing the turd that is hidepid, and this syscall is
> basically incompatible with ever fixing hidepids.  I think that, to
> make it less regrettable, it needs to take an fd to a proc mount as a
> parameter.  This makes me wonder why it's a syscall at all -- why not
> just create a new file like /proc/pids?

See reply to fdmap(2).

pidmap(2) is indeed more complex case exactly because of
pid/tgid/tid/everything else + pidnamespaces + ->hide_pid.
However the problem remains: query task tree without all the bullshit.
C/R people succumbed with /proc/*/children, it was a mistake IMO.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
@ 2017-09-26 18:46       ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-26 18:46 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrew Morton, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Tatsiana Brouka, Aliaksandr Patseyenak

On Sun, Sep 24, 2017 at 02:27:00PM -0700, Andy Lutomirski wrote:
> On Sun, Sep 24, 2017 at 1:08 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> > From: Tatsiana Brouka <Tatsiana_Brouka-uRwfk40T5oI@public.gmane.org>
> >
> > Implement system call for bulk retrieveing of pids in binary form.
> >
> > Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> > converting with atoi() + instantiating dentries and inodes.
> >
> > /proc may be not mounted especially in containers. Natural extension of
> > hidepid=2 efforts is to not mount /proc at all.
> >
> > It could be used by programs like ps, top or CRIU. Speed increase will
> > become more drastic once combined with bulk retrieval of process statistics.
> >
> > Benchmark:
> >
> >         N=1<<16 times
> >         ~130 processes (~250 task_structs) on a regular desktop system
> >         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
> >         (roughly what htop(1) does) vs pidmap
> >
> >         /proc 16.80 ± 0.73%
> >         pidmap 0.06 ± 0.31%
> >
> > PIDMAP_* flags are modelled after /proc/task_diag patchset.
> >
> >
> > PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)
> >
> > NAME
> >        pidmap - get allocated PIDs
> >
> > SYNOPSIS
> >        long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);
> 
> I think we will seriously regret a syscall that does this.  Djalal is
> working on fixing the turd that is hidepid, and this syscall is
> basically incompatible with ever fixing hidepids.  I think that, to
> make it less regrettable, it needs to take an fd to a proc mount as a
> parameter.  This makes me wonder why it's a syscall at all -- why not
> just create a new file like /proc/pids?

See reply to fdmap(2).

pidmap(2) is indeed more complex case exactly because of
pid/tgid/tid/everything else + pidnamespaces + ->hide_pid.
However the problem remains: query task tree without all the bullshit.
C/R people succumbed with /proc/*/children, it was a mistake IMO.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-26 19:00     ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-26 19:00 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey, Aliaksandr Patseyenak, Tatsiana_Brouka

On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages) wrote:
> [Not sure why original author is not in CC; added]
> 
> Hello Alexey,
> 
> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> > 
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> > 
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> > 
> > /proc, the only way to learn anything about file descriptors may not be
> > available. There is unavoidable overhead associated with instantiating
> > 3 dentries and 3 inodes and converting integers to strings and back.
> > 
> > Benchmark:
> > 
> > 	N=1<<22 times
> > 	4 opened descriptors (0, 1, 2, 3)
> > 	opendir+readdir+closedir /proc/self/fd vs fdmap
> > 
> > 	/proc 8.31 ± 0.37%
> > 	fdmap 0.32 ± 0.72%
> 
> From the text above, I'm still trying to understand: whose problem
> does this solve? I mean, we've lived with the daemon-close-all-files
> technique forever (and I'm not sure that performance is really an 
> important issue for the daemon case) .

> And you say that the effect for lsof(1) will be buried.

If only fdmap(2) is added, then effect will be negligible for lsof
because it has to go through /proc anyway.

The idea is to start process. In ideal world, only bynary system calls
would exist and shells could emulate /proc/* same way bash implement
/dev/tcp

> So, who does this new system call
> really help? (Note: I'm not saying don't add the syscall, but from
> explanation given here, it's not clear why we should.)

For fdmap(2) natural users are lsof(), CRIU.

At some point, checkpointing was moved to userspace forcing them
to run all over /proc extracting information which could be recovered in
couple of locks, bunch of list iterations and dereferences (just read CRIU).
All of this could not be beneficial for performance.

Parsing text files doesn't help either: most of the numbers in
/proc/*/stat et al are unpadded decimals so that user can't rewind to
exact field he wants.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-26 19:00     ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-26 19:00 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w, Aliaksandr Patseyenak,
	Tatsiana_Brouka-uRwfk40T5oI

On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages) wrote:
> [Not sure why original author is not in CC; added]
> 
> Hello Alexey,
> 
> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> > 
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> > 
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> > 
> > /proc, the only way to learn anything about file descriptors may not be
> > available. There is unavoidable overhead associated with instantiating
> > 3 dentries and 3 inodes and converting integers to strings and back.
> > 
> > Benchmark:
> > 
> > 	N=1<<22 times
> > 	4 opened descriptors (0, 1, 2, 3)
> > 	opendir+readdir+closedir /proc/self/fd vs fdmap
> > 
> > 	/proc 8.31 ± 0.37%
> > 	fdmap 0.32 ± 0.72%
> 
> From the text above, I'm still trying to understand: whose problem
> does this solve? I mean, we've lived with the daemon-close-all-files
> technique forever (and I'm not sure that performance is really an 
> important issue for the daemon case) .

> And you say that the effect for lsof(1) will be buried.

If only fdmap(2) is added, then effect will be negligible for lsof
because it has to go through /proc anyway.

The idea is to start process. In ideal world, only bynary system calls
would exist and shells could emulate /proc/* same way bash implement
/dev/tcp

> So, who does this new system call
> really help? (Note: I'm not saying don't add the syscall, but from
> explanation given here, it's not clear why we should.)

For fdmap(2) natural users are lsof(), CRIU.

At some point, checkpointing was moved to userspace forcing them
to run all over /proc extracting information which could be recovered in
couple of locks, bunch of list iterations and dereferences (just read CRIU).
All of this could not be beneficial for performance.

Parsing text files doesn't help either: most of the numbers in
/proc/*/stat et al are unpadded decimals so that user can't rewind to
exact field he wants.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-09-26 19:00     ` Alexey Dobriyan
@ 2017-09-27 15:03       ` Andy Lutomirski
  -1 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-27 15:03 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Michael Kerrisk (man-pages),
	Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On Tue, Sep 26, 2017 at 12:00 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages) wrote:
>> [Not sure why original author is not in CC; added]
>>
>> Hello Alexey,
>>
>> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
>> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
>> >
>> > Implement system call for bulk retrieveing of opened descriptors
>> > in binary form.
>> >
>> > Some daemons could use it to reliably close file descriptors
>> > before starting. Currently they close everything upto some number
>> > which formally is not reliable. Other natural users are lsof(1) and CRIU
>> > (although lsof does so much in /proc that the effect is thoroughly buried).
>> >
>> > /proc, the only way to learn anything about file descriptors may not be
>> > available. There is unavoidable overhead associated with instantiating
>> > 3 dentries and 3 inodes and converting integers to strings and back.
>> >
>> > Benchmark:
>> >
>> >     N=1<<22 times
>> >     4 opened descriptors (0, 1, 2, 3)
>> >     opendir+readdir+closedir /proc/self/fd vs fdmap
>> >
>> >     /proc 8.31 ą 0.37%
>> >     fdmap 0.32 ą 0.72%
>>
>> From the text above, I'm still trying to understand: whose problem
>> does this solve? I mean, we've lived with the daemon-close-all-files
>> technique forever (and I'm not sure that performance is really an
>> important issue for the daemon case) .
>
>> And you say that the effect for lsof(1) will be buried.
>
> If only fdmap(2) is added, then effect will be negligible for lsof
> because it has to go through /proc anyway.
>
> The idea is to start process. In ideal world, only bynary system calls
> would exist and shells could emulate /proc/* same way bash implement
> /dev/tcp

Then start the process by doing it for real and making it obviously
useful.  We should not add a pair of vaguely useful, rather weak
syscalls just to start a process of modernizing /proc.

>
>> So, who does this new system call
>> really help? (Note: I'm not saying don't add the syscall, but from
>> explanation given here, it's not clear why we should.)
>
> For fdmap(2) natural users are lsof(), CRIU.

lsof does:

int
main(argc, argv)
        int argc;
        char *argv[];
{
...
        if ((MaxFd = (int) GET_MAX_FD()) < 53)
            MaxFd = 53;
        for (i = 3; i < MaxFd; i++)
            (void) close(i);

The solution isn't to wrangle fdmap(2) into this code.  The solution
is to remove the code entirely.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-27 15:03       ` Andy Lutomirski
  0 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-27 15:03 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Michael Kerrisk (man-pages),
	Andrew Morton, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On Tue, Sep 26, 2017 at 12:00 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages) wrote:
>> [Not sure why original author is not in CC; added]
>>
>> Hello Alexey,
>>
>> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
>> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
>> >
>> > Implement system call for bulk retrieveing of opened descriptors
>> > in binary form.
>> >
>> > Some daemons could use it to reliably close file descriptors
>> > before starting. Currently they close everything upto some number
>> > which formally is not reliable. Other natural users are lsof(1) and CRIU
>> > (although lsof does so much in /proc that the effect is thoroughly buried).
>> >
>> > /proc, the only way to learn anything about file descriptors may not be
>> > available. There is unavoidable overhead associated with instantiating
>> > 3 dentries and 3 inodes and converting integers to strings and back.
>> >
>> > Benchmark:
>> >
>> >     N=1<<22 times
>> >     4 opened descriptors (0, 1, 2, 3)
>> >     opendir+readdir+closedir /proc/self/fd vs fdmap
>> >
>> >     /proc 8.31 ą 0.37%
>> >     fdmap 0.32 ą 0.72%
>>
>> From the text above, I'm still trying to understand: whose problem
>> does this solve? I mean, we've lived with the daemon-close-all-files
>> technique forever (and I'm not sure that performance is really an
>> important issue for the daemon case) .
>
>> And you say that the effect for lsof(1) will be buried.
>
> If only fdmap(2) is added, then effect will be negligible for lsof
> because it has to go through /proc anyway.
>
> The idea is to start process. In ideal world, only bynary system calls
> would exist and shells could emulate /proc/* same way bash implement
> /dev/tcp

Then start the process by doing it for real and making it obviously
useful.  We should not add a pair of vaguely useful, rather weak
syscalls just to start a process of modernizing /proc.

>
>> So, who does this new system call
>> really help? (Note: I'm not saying don't add the syscall, but from
>> explanation given here, it's not clear why we should.)
>
> For fdmap(2) natural users are lsof(), CRIU.

lsof does:

int
main(argc, argv)
        int argc;
        char *argv[];
{
...
        if ((MaxFd = (int) GET_MAX_FD()) < 53)
            MaxFd = 53;
        for (i = 3; i < MaxFd; i++)
            (void) close(i);

The solution isn't to wrangle fdmap(2) into this code.  The solution
is to remove the code entirely.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v2 2/2] pidmap(2)
  2017-09-26 18:46       ` Alexey Dobriyan
  (?)
@ 2017-09-27 15:04       ` Andy Lutomirski
  -1 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-27 15:04 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov, Tatsiana Brouka,
	Aliaksandr Patseyenak

On Tue, Sep 26, 2017 at 11:46 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> On Sun, Sep 24, 2017 at 02:27:00PM -0700, Andy Lutomirski wrote:
>> On Sun, Sep 24, 2017 at 1:08 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
>> > From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
>> >
>> > Implement system call for bulk retrieveing of pids in binary form.
>> >
>> > Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
>> > converting with atoi() + instantiating dentries and inodes.
>> >
>> > /proc may be not mounted especially in containers. Natural extension of
>> > hidepid=2 efforts is to not mount /proc at all.
>> >
>> > It could be used by programs like ps, top or CRIU. Speed increase will
>> > become more drastic once combined with bulk retrieval of process statistics.
>> >
>> > Benchmark:
>> >
>> >         N=1<<16 times
>> >         ~130 processes (~250 task_structs) on a regular desktop system
>> >         opendir + readdir + closedir /proc + the same for every /proc/$PID/task
>> >         (roughly what htop(1) does) vs pidmap
>> >
>> >         /proc 16.80 ą 0.73%
>> >         pidmap 0.06 ą 0.31%
>> >
>> > PIDMAP_* flags are modelled after /proc/task_diag patchset.
>> >
>> >
>> > PIDMAP(2)                  Linux Programmer's Manual                 PIDMAP(2)
>> >
>> > NAME
>> >        pidmap - get allocated PIDs
>> >
>> > SYNOPSIS
>> >        long pidmap(pid_t pid, int *pids, unsigned int count , unsigned int start, int flags);
>>
>> I think we will seriously regret a syscall that does this.  Djalal is
>> working on fixing the turd that is hidepid, and this syscall is
>> basically incompatible with ever fixing hidepids.  I think that, to
>> make it less regrettable, it needs to take an fd to a proc mount as a
>> parameter.  This makes me wonder why it's a syscall at all -- why not
>> just create a new file like /proc/pids?
>
> See reply to fdmap(2).
>
> pidmap(2) is indeed more complex case exactly because of
> pid/tgid/tid/everything else + pidnamespaces + ->hide_pid.
> However the problem remains: query task tree without all the bullshit.
> C/R people succumbed with /proc/*/children, it was a mistake IMO.

Your syscall cannot be implemented sanely.  It doesn't remove bullshit
-- it adds bullshit.  NAK.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-28  7:26         ` Michael Kerrisk (man-pages)
  0 siblings, 0 replies; 51+ messages in thread
From: Michael Kerrisk (man-pages) @ 2017-09-28  7:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexey Dobriyan, Andrew Morton, linux-kernel, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On 27 September 2017 at 17:03, Andy Lutomirski <luto@amacapital.net> wrote:
> On Tue, Sep 26, 2017 at 12:00 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
>> On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages) wrote:
>>> [Not sure why original author is not in CC; added]
>>>
>>> Hello Alexey,
>>>
>>> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
>>> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
>>> >
>>> > Implement system call for bulk retrieveing of opened descriptors
>>> > in binary form.
>>> >
>>> > Some daemons could use it to reliably close file descriptors
>>> > before starting. Currently they close everything upto some number
>>> > which formally is not reliable. Other natural users are lsof(1) and CRIU
>>> > (although lsof does so much in /proc that the effect is thoroughly buried).
>>> >
>>> > /proc, the only way to learn anything about file descriptors may not be
>>> > available. There is unavoidable overhead associated with instantiating
>>> > 3 dentries and 3 inodes and converting integers to strings and back.
>>> >
>>> > Benchmark:
>>> >
>>> >     N=1<<22 times
>>> >     4 opened descriptors (0, 1, 2, 3)
>>> >     opendir+readdir+closedir /proc/self/fd vs fdmap
>>> >
>>> >     /proc 8.31 ą 0.37%
>>> >     fdmap 0.32 ą 0.72%
>>>
>>> From the text above, I'm still trying to understand: whose problem
>>> does this solve? I mean, we've lived with the daemon-close-all-files
>>> technique forever (and I'm not sure that performance is really an
>>> important issue for the daemon case) .
>>
>>> And you say that the effect for lsof(1) will be buried.
>>
>> If only fdmap(2) is added, then effect will be negligible for lsof
>> because it has to go through /proc anyway.
>>
>> The idea is to start process. In ideal world, only bynary system calls
>> would exist and shells could emulate /proc/* same way bash implement
>> /dev/tcp
>
> Then start the process by doing it for real and making it obviously
> useful.  We should not add a pair of vaguely useful, rather weak
> syscalls just to start a process of modernizing /proc.

I concur.

Alexey, you still have not wxplained who specifically needs this
right now, and how, precisely, they plan to use the new system calls.
It is all very arm-wavey so far.

Thanks,

Michael

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-28  7:26         ` Michael Kerrisk (man-pages)
  0 siblings, 0 replies; 51+ messages in thread
From: Michael Kerrisk (man-pages) @ 2017-09-28  7:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexey Dobriyan, Andrew Morton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On 27 September 2017 at 17:03, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
> On Tue, Sep 26, 2017 at 12:00 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages) wrote:
>>> [Not sure why original author is not in CC; added]
>>>
>>> Hello Alexey,
>>>
>>> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
>>> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
>>> >
>>> > Implement system call for bulk retrieveing of opened descriptors
>>> > in binary form.
>>> >
>>> > Some daemons could use it to reliably close file descriptors
>>> > before starting. Currently they close everything upto some number
>>> > which formally is not reliable. Other natural users are lsof(1) and CRIU
>>> > (although lsof does so much in /proc that the effect is thoroughly buried).
>>> >
>>> > /proc, the only way to learn anything about file descriptors may not be
>>> > available. There is unavoidable overhead associated with instantiating
>>> > 3 dentries and 3 inodes and converting integers to strings and back.
>>> >
>>> > Benchmark:
>>> >
>>> >     N=1<<22 times
>>> >     4 opened descriptors (0, 1, 2, 3)
>>> >     opendir+readdir+closedir /proc/self/fd vs fdmap
>>> >
>>> >     /proc 8.31 ą 0.37%
>>> >     fdmap 0.32 ą 0.72%
>>>
>>> From the text above, I'm still trying to understand: whose problem
>>> does this solve? I mean, we've lived with the daemon-close-all-files
>>> technique forever (and I'm not sure that performance is really an
>>> important issue for the daemon case) .
>>
>>> And you say that the effect for lsof(1) will be buried.
>>
>> If only fdmap(2) is added, then effect will be negligible for lsof
>> because it has to go through /proc anyway.
>>
>> The idea is to start process. In ideal world, only bynary system calls
>> would exist and shells could emulate /proc/* same way bash implement
>> /dev/tcp
>
> Then start the process by doing it for real and making it obviously
> useful.  We should not add a pair of vaguely useful, rather weak
> syscalls just to start a process of modernizing /proc.

I concur.

Alexey, you still have not wxplained who specifically needs this
right now, and how, precisely, they plan to use the new system calls.
It is all very arm-wavey so far.

Thanks,

Michael

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-28 10:10         ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-28 10:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Michael Kerrisk (man-pages),
	Andrew Morton, linux-kernel, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On 9/27/17, Andy Lutomirski <luto@amacapital.net> wrote:
> On Tue, Sep 26, 2017 at 12:00 PM, Alexey Dobriyan <adobriyan@gmail.com>
> wrote:
>> On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages)
>> wrote:
>>> [Not sure why original author is not in CC; added]
>>>
>>> Hello Alexey,
>>>
>>> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
>>> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
>>> >
>>> > Implement system call for bulk retrieveing of opened descriptors
>>> > in binary form.
>>> >
>>> > Some daemons could use it to reliably close file descriptors
>>> > before starting. Currently they close everything upto some number
>>> > which formally is not reliable. Other natural users are lsof(1) and
>>> > CRIU
>>> > (although lsof does so much in /proc that the effect is thoroughly
>>> > buried).
>>> >
>>> > /proc, the only way to learn anything about file descriptors may not
>>> > be
>>> > available. There is unavoidable overhead associated with instantiating
>>> > 3 dentries and 3 inodes and converting integers to strings and back.
>>> >
>>> > Benchmark:
>>> >
>>> >     N=1<<22 times
>>> >     4 opened descriptors (0, 1, 2, 3)
>>> >     opendir+readdir+closedir /proc/self/fd vs fdmap
>>> >
>>> >     /proc 8.31 ą 0.37%
>>> >     fdmap 0.32 ą 0.72%
>>>
>>> From the text above, I'm still trying to understand: whose problem
>>> does this solve? I mean, we've lived with the daemon-close-all-files
>>> technique forever (and I'm not sure that performance is really an
>>> important issue for the daemon case) .
>>
>>> And you say that the effect for lsof(1) will be buried.
>>
>> If only fdmap(2) is added, then effect will be negligible for lsof
>> because it has to go through /proc anyway.
>>
>> The idea is to start process. In ideal world, only bynary system calls
>> would exist and shells could emulate /proc/* same way bash implement
>> /dev/tcp
>
> Then start the process by doing it for real and making it obviously
> useful.  We should not add a pair of vaguely useful, rather weak
> syscalls just to start a process of modernizing /proc.
>
>>
>>> So, who does this new system call
>>> really help? (Note: I'm not saying don't add the syscall, but from
>>> explanation given here, it's not clear why we should.)
>>
>> For fdmap(2) natural users are lsof(), CRIU.
>
> lsof does:
>
> int
> main(argc, argv)
>         int argc;
>         char *argv[];
> {
> ...
>         if ((MaxFd = (int) GET_MAX_FD()) < 53)
>             MaxFd = 53;
>         for (i = 3; i < MaxFd; i++)
>             (void) close(i);
>
> The solution isn't to wrangle fdmap(2) into this code.  The solution
> is to remove the code entirely.

What do you think about this code from OpenSSH?

/*
 * Discard other fds that are hanging around. These can cause problem
 * with backgrounded ssh processes started by ControlPersist.
 */
closefrom(STDERR_FILENO + 1);

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-28 10:10         ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-28 10:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Michael Kerrisk (man-pages),
	Andrew Morton, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On 9/27/17, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
> On Tue, Sep 26, 2017 at 12:00 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> wrote:
>> On Mon, Sep 25, 2017 at 09:42:58AM +0200, Michael Kerrisk (man-pages)
>> wrote:
>>> [Not sure why original author is not in CC; added]
>>>
>>> Hello Alexey,
>>>
>>> On 09/24/2017 10:06 PM, Alexey Dobriyan wrote:
>>> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
>>> >
>>> > Implement system call for bulk retrieveing of opened descriptors
>>> > in binary form.
>>> >
>>> > Some daemons could use it to reliably close file descriptors
>>> > before starting. Currently they close everything upto some number
>>> > which formally is not reliable. Other natural users are lsof(1) and
>>> > CRIU
>>> > (although lsof does so much in /proc that the effect is thoroughly
>>> > buried).
>>> >
>>> > /proc, the only way to learn anything about file descriptors may not
>>> > be
>>> > available. There is unavoidable overhead associated with instantiating
>>> > 3 dentries and 3 inodes and converting integers to strings and back.
>>> >
>>> > Benchmark:
>>> >
>>> >     N=1<<22 times
>>> >     4 opened descriptors (0, 1, 2, 3)
>>> >     opendir+readdir+closedir /proc/self/fd vs fdmap
>>> >
>>> >     /proc 8.31 ą 0.37%
>>> >     fdmap 0.32 ą 0.72%
>>>
>>> From the text above, I'm still trying to understand: whose problem
>>> does this solve? I mean, we've lived with the daemon-close-all-files
>>> technique forever (and I'm not sure that performance is really an
>>> important issue for the daemon case) .
>>
>>> And you say that the effect for lsof(1) will be buried.
>>
>> If only fdmap(2) is added, then effect will be negligible for lsof
>> because it has to go through /proc anyway.
>>
>> The idea is to start process. In ideal world, only bynary system calls
>> would exist and shells could emulate /proc/* same way bash implement
>> /dev/tcp
>
> Then start the process by doing it for real and making it obviously
> useful.  We should not add a pair of vaguely useful, rather weak
> syscalls just to start a process of modernizing /proc.
>
>>
>>> So, who does this new system call
>>> really help? (Note: I'm not saying don't add the syscall, but from
>>> explanation given here, it's not clear why we should.)
>>
>> For fdmap(2) natural users are lsof(), CRIU.
>
> lsof does:
>
> int
> main(argc, argv)
>         int argc;
>         char *argv[];
> {
> ...
>         if ((MaxFd = (int) GET_MAX_FD()) < 53)
>             MaxFd = 53;
>         for (i = 3; i < MaxFd; i++)
>             (void) close(i);
>
> The solution isn't to wrangle fdmap(2) into this code.  The solution
> is to remove the code entirely.

What do you think about this code from OpenSSH?

/*
 * Discard other fds that are hanging around. These can cause problem
 * with backgrounded ssh processes started by ControlPersist.
 */
closefrom(STDERR_FILENO + 1);

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-09-28  7:26         ` Michael Kerrisk (man-pages)
  (?)
@ 2017-09-28 10:55         ` Alexey Dobriyan
  2017-09-28 15:02             ` Andy Lutomirski
  -1 siblings, 1 reply; 51+ messages in thread
From: Alexey Dobriyan @ 2017-09-28 10:55 UTC (permalink / raw)
  To: mtk.manpages
  Cc: Andy Lutomirski, Andrew Morton, linux-kernel, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On 9/28/17, Michael Kerrisk (man-pages) <mtk.manpages@gmail.com> wrote:
> On 27 September 2017 at 17:03, Andy Lutomirski <luto@amacapital.net> wrote:

>>> The idea is to start process. In ideal world, only bynary system calls
>>> would exist and shells could emulate /proc/* same way bash implement
>>> /dev/tcp
>>
>> Then start the process by doing it for real and making it obviously
>> useful.  We should not add a pair of vaguely useful, rather weak
>> syscalls just to start a process of modernizing /proc.

Before doing it for real it would be nice to have at least a nod
from people in charge that syscalls which return binary
information are OK. Otherwise some EIATF guy will just say
"NAK /proc is fine, it always was fine".

Or look from another angle: sched_setaffinity exists but there is
no /proc counterpart, shells must use taskset(1) and world didn't end.

> I concur.
>
> Alexey, you still have not wxplained who specifically needs this
> right now, and how, precisely, they plan to use the new system calls.
> It is all very arm-wavey so far.

It is not if you read even example program in the original patch.
Any program which queries information about file descriptors
will benefit both in CPU and memory usage.

void closefrom(int start)
{
	int fd[1024];
	int n;

	while ((n = fdmap(0, fd, sizeof(fd)/sizeof(fd[0]), start)) > 0) {
		unsigned int i;

		for (i = 0; i < n; i++)
			close(fd[i]);

		start = fd[n - 1] + 1;
	}
}

CRIU naturally to know everything about descriptors of target processes:
It does:

int predump_task_files(int pid)
{
        struct dirent *de;
        DIR *fd_dir;
        int ret = -1;

        pr_info("Pre-dump fds for %d)\n", pid);

        fd_dir = opendir_proc(pid, "fd");
        if (!fd_dir)
                return -1;

        while ((de = readdir(fd_dir))) {
                if (dir_dots(de))
                        continue;

                if (predump_one_fd(pid, atoi(de->d_name)))
                        goto out;
        }

        ret = 0;
out:
        closedir(fd_dir);
        return ret;
}

which is again inefficient.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-28 15:02             ` Andy Lutomirski
  0 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-28 15:02 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Michael Kerrisk, Andrew Morton, linux-kernel, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On Thu, Sep 28, 2017 at 3:55 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> On 9/28/17, Michael Kerrisk (man-pages) <mtk.manpages@gmail.com> wrote:
>> On 27 September 2017 at 17:03, Andy Lutomirski <luto@amacapital.net> wrote:
>
>>>> The idea is to start process. In ideal world, only bynary system calls
>>>> would exist and shells could emulate /proc/* same way bash implement
>>>> /dev/tcp
>>>
>>> Then start the process by doing it for real and making it obviously
>>> useful.  We should not add a pair of vaguely useful, rather weak
>>> syscalls just to start a process of modernizing /proc.
>
> Before doing it for real it would be nice to have at least a nod
> from people in charge that syscalls which return binary
> information are OK. Otherwise some EIATF guy will just say
> "NAK /proc is fine, it always was fine".

There's nothing inherently wrong with syscalls that return binary
information.  There is something wrong with reinventing the world with
insufficient justification, though.

/proc may be old, clunky, and kind of slow, but it has a lot of good
things going for it.  It supports access control (DAC and MAC).  It
handles namespacing in a way that's awkward but supported by all the
existing namespace managers.  It may soon support mount options, which
is rather important.

I feel like we've been discussing this performance issue for over a
year, and I distinctly recall discussing it in Santa Fe.  I suggested
a two-pronged approach:

1. Add a new syscall that will, in a single call, open, read, and
close a proc file and maybe even a regular non-proc file.  Like this:

long readfileat(int dirfd, const char *path, void *buf, size_t len, int flags);

2. Where needed, add new /proc files with lighter-weight
representations.  I think we discussed something that's like status
but in nl_attr format.

This doesn't end up with a bunch of code duplication the way that a
full-blown syscall-based reimplementation would.  It supports all the
/proc features rather than just a subset.  It fully respects access
control, future mount options, and the potential lack of a /proc
mount.

>
> Or look from another angle: sched_setaffinity exists but there is
> no /proc counterpart, shells must use taskset(1) and world didn't end.

sched_setaffinity() modifies the caller.  /proc wouldn't have made much sense.

>
>> I concur.
>>
>> Alexey, you still have not wxplained who specifically needs this
>> right now, and how, precisely, they plan to use the new system calls.
>> It is all very arm-wavey so far.
>
> It is not if you read even example program in the original patch.
> Any program which queries information about file descriptors
> will benefit both in CPU and memory usage.
>
> void closefrom(int start)
> {
>         int fd[1024];
>         int n;
>
>         while ((n = fdmap(0, fd, sizeof(fd)/sizeof(fd[0]), start)) > 0) {
>                 unsigned int i;
>
>                 for (i = 0; i < n; i++)
>                         close(fd[i]);
>
>                 start = fd[n - 1] + 1;
>         }
> }
>
> CRIU naturally to know everything about descriptors of target processes:
> It does:
>
> int predump_task_files(int pid)
> {
>         struct dirent *de;
>         DIR *fd_dir;
>         int ret = -1;
>
>         pr_info("Pre-dump fds for %d)\n", pid);
>
>         fd_dir = opendir_proc(pid, "fd");
>         if (!fd_dir)
>                 return -1;
>
>         while ((de = readdir(fd_dir))) {
>                 if (dir_dots(de))
>                         continue;
>
>                 if (predump_one_fd(pid, atoi(de->d_name)))
>                         goto out;
>         }
>
>         ret = 0;
> out:
>         closedir(fd_dir);
>         return ret;
> }
>
> which is again inefficient.

And /proc/PID/fds would solve this.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-09-28 15:02             ` Andy Lutomirski
  0 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-09-28 15:02 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Michael Kerrisk, Andrew Morton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On Thu, Sep 28, 2017 at 3:55 AM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> On 9/28/17, Michael Kerrisk (man-pages) <mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> On 27 September 2017 at 17:03, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>
>>>> The idea is to start process. In ideal world, only bynary system calls
>>>> would exist and shells could emulate /proc/* same way bash implement
>>>> /dev/tcp
>>>
>>> Then start the process by doing it for real and making it obviously
>>> useful.  We should not add a pair of vaguely useful, rather weak
>>> syscalls just to start a process of modernizing /proc.
>
> Before doing it for real it would be nice to have at least a nod
> from people in charge that syscalls which return binary
> information are OK. Otherwise some EIATF guy will just say
> "NAK /proc is fine, it always was fine".

There's nothing inherently wrong with syscalls that return binary
information.  There is something wrong with reinventing the world with
insufficient justification, though.

/proc may be old, clunky, and kind of slow, but it has a lot of good
things going for it.  It supports access control (DAC and MAC).  It
handles namespacing in a way that's awkward but supported by all the
existing namespace managers.  It may soon support mount options, which
is rather important.

I feel like we've been discussing this performance issue for over a
year, and I distinctly recall discussing it in Santa Fe.  I suggested
a two-pronged approach:

1. Add a new syscall that will, in a single call, open, read, and
close a proc file and maybe even a regular non-proc file.  Like this:

long readfileat(int dirfd, const char *path, void *buf, size_t len, int flags);

2. Where needed, add new /proc files with lighter-weight
representations.  I think we discussed something that's like status
but in nl_attr format.

This doesn't end up with a bunch of code duplication the way that a
full-blown syscall-based reimplementation would.  It supports all the
/proc features rather than just a subset.  It fully respects access
control, future mount options, and the potential lack of a /proc
mount.

>
> Or look from another angle: sched_setaffinity exists but there is
> no /proc counterpart, shells must use taskset(1) and world didn't end.

sched_setaffinity() modifies the caller.  /proc wouldn't have made much sense.

>
>> I concur.
>>
>> Alexey, you still have not wxplained who specifically needs this
>> right now, and how, precisely, they plan to use the new system calls.
>> It is all very arm-wavey so far.
>
> It is not if you read even example program in the original patch.
> Any program which queries information about file descriptors
> will benefit both in CPU and memory usage.
>
> void closefrom(int start)
> {
>         int fd[1024];
>         int n;
>
>         while ((n = fdmap(0, fd, sizeof(fd)/sizeof(fd[0]), start)) > 0) {
>                 unsigned int i;
>
>                 for (i = 0; i < n; i++)
>                         close(fd[i]);
>
>                 start = fd[n - 1] + 1;
>         }
> }
>
> CRIU naturally to know everything about descriptors of target processes:
> It does:
>
> int predump_task_files(int pid)
> {
>         struct dirent *de;
>         DIR *fd_dir;
>         int ret = -1;
>
>         pr_info("Pre-dump fds for %d)\n", pid);
>
>         fd_dir = opendir_proc(pid, "fd");
>         if (!fd_dir)
>                 return -1;
>
>         while ((de = readdir(fd_dir))) {
>                 if (dir_dots(de))
>                         continue;
>
>                 if (predump_one_fd(pid, atoi(de->d_name)))
>                         goto out;
>         }
>
>         ret = 0;
> out:
>         closedir(fd_dir);
>         return ret;
> }
>
> which is again inefficient.

And /proc/PID/fds would solve this.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
  2017-09-24 20:06 ` Alexey Dobriyan
@ 2017-10-10 22:08   ` Andrei Vagin
  -1 siblings, 0 replies; 51+ messages in thread
From: Andrei Vagin @ 2017-10-10 22:08 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey

On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> 
> Implement system call for bulk retrieveing of opened descriptors
> in binary form.
> 
> Some daemons could use it to reliably close file descriptors
> before starting. Currently they close everything upto some number
> which formally is not reliable. Other natural users are lsof(1) and CRIU
> (although lsof does so much in /proc that the effect is thoroughly buried).

Hello Alexey,

I am not sure about the idea to add syscalls for all sort of process
attributes. For example, in CRIU we need file descriptors with their
properties, which we currently get from /proc/pid/fdinfo/. How can
this interface be extended to achieve our goal?

Have you seen the task-diag interface what I sent about a year ago?

We had a discussion on the previous kernel summit how to rework
task-diag, so that it can be merged into the upstream kernel.
Unfortunately, I didn't send a summary for this discussion. But it's
better now than never. We decided to do something like this:

1. Add a new syscall readfile(fname, buf, size), which can be
used to read small files without opening a file descriptor. It will be
useful for proc files, configs, etc.

2. bin/text/bin conversion is very slow
 - 65.47% proc_pid_status
  - 20.81% render_sigset_t
   - 18.27% seq_printf
    + 15.77% seq_vprintf
  - 10.65% task_mem
    + 8.78% seq_print
    + 1.02% hugetlb_rep
  + 7.40% seq_printf
so a new interface has to use a binary format and the format of netlink
messages can be used here. It should be possible to extend a file
without breaking backward compatibility.

3. There are a lot of objection to use a netlink sockets out of the network
subsystem. The idea of using a "transaction" file looks weird for many
people, so we decided to add a few files in /proc/pid/. I see
minimum two files. One file contains information about a task, it is
mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
describes a task memory, it is what we have now in /proc/pid/smaps.
Here is one more major idea. All attributes in a file has to be equal in
term of performance, or by other words there should not be attributes,
which significantly affect a generation time of a whole file.

If we look at /proc/pid/smaps, we spend a lot of time to get memory
statistics. This file contains a lot of data and if you read it to get
VmFlags, the kernel will waste your time by generating a useless data
for you.

Here is my slides for this discussion:
https://www.linuxplumbersconf.org/2016/ocw/system/presentations/4599/original/Netlink-issues.pdf

Could you add me into recipients for this sort of patches in a future?

Thanks,
Andrei

> 
> /proc, the only way to learn anything about file descriptors may not be
> available. There is unavoidable overhead associated with instantiating
> 3 dentries and 3 inodes and converting integers to strings and back.
> 
> Benchmark:
> 
> 	N=1<<22 times
> 	4 opened descriptors (0, 1, 2, 3)
> 	opendir+readdir+closedir /proc/self/fd vs fdmap
> 
> 	/proc 8.31 ± 0.37%
> 	fdmap 0.32 ± 0.72%
> 
> 
> FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)
> 
> NAME
>        fdmap - get open file descriptors of the process
> 
> SYNOPSIS
>        long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);
> 
> DESCRIPTION
>        fdmap()  writes  open  file  descriptors  of the process into buffer fd
>        starting from the start descriptor. At most nfd elements  are  written.
>        flags argument is reserved and must be zero.
> 
>        If pid is zero, syscall will work with the current process.
> 
> RETURN VALUE
>        On success, number of descriptors written is returned.  On error, -1 is
>        returned, and errno is set appropriately.
> 
> ERRORS
>        ESRCH  No such process.
> 
>        EACCES Permission denied.
> 
>        EFAULT Invalid fd pointer.
> 
>        EINVAL Negative start argument.
> 
> NOTES
>        Glibc does not provide a wrapper for these system call; call  it  using
>        syscall(2).
> 
> EXAMPLE
>        The program below demonstrates fdmap() usage.
> 
>            $ ./a.out $$
>            0 1 2 255
> 
>            $ ./a.out 42</dev/null 1023</dev/null
>            0 1 2 42
>            1023
> 
>    Program source
> 
>        #include <sys/types.h>
>        #include <stdlib.h>
>        #include <stdio.h>
> 
>        static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
>        {
>             register long r10 asm ("r10") = start;
>             register long r8 asm ("r8") = flags;
>             long rv;
>             asm volatile (
>                  "syscall"
>                  : "=a" (rv)
>                  : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
>                  : "rcx", "r11", "cc", "memory"
>             );
>             return rv;
>        }
> 
>        int main(int argc, char *argv[])
>        {
>             pid_t pid;
>             int fd[4];
>             unsigned int start;
>             int n;
> 
>             pid = 0;
>             if (argc > 1)
>                  pid = atoi(argv[1]);
> 
>             start = 0;
>             while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
>                  unsigned int i;
> 
>                  for (i = 0; i < n; i++)
>                       printf("%u ", fd[i]);
>                  printf("\n");
> 
>                  start = fd[n - 1] + 1;
>             }
>             return 0;
>        }
> 
> Linux                             2017-09-21                          FDMAP(2)
> 
> Changelog:
> 
> 	CONFIG_PIDMAP option
> 	manpage
> 
> 
> Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
>  fs/Makefile                                |   2 +
>  fs/fdmap.c                                 | 105 ++++++++++++++++++++
>  include/linux/syscalls.h                   |   2 +
>  init/Kconfig                               |   7 ++
>  kernel/sys_ni.c                            |   2 +
>  tools/testing/selftests/Makefile           |   1 +
>  tools/testing/selftests/fdmap/.gitignore   |   1 +
>  tools/testing/selftests/fdmap/Makefile     |   7 ++
>  tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
>  tools/testing/selftests/fdmap/fdmap.h      |  12 +++
>  tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
>  12 files changed, 405 insertions(+)
>  create mode 100644 fs/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/.gitignore
>  create mode 100644 tools/testing/selftests/fdmap/Makefile
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.h
>  create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183e2f85..9bfe5f79674f 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	common	fdmap			sys_fdmap
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/Makefile b/fs/Makefile
> index 7bbaca9c67b1..27476a66c18e 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
>  		pnode.o splice.o sync.o utimes.o \
>  		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
>  
> +obj-$(CONFIG_FDMAP) += fdmap.o
> +
>  ifeq ($(CONFIG_BLOCK),y)
>  obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
>  else
> diff --git a/fs/fdmap.c b/fs/fdmap.c
> new file mode 100644
> index 000000000000..274e6c5b7c9c
> --- /dev/null
> +++ b/fs/fdmap.c
> @@ -0,0 +1,105 @@
> +#include <linux/bitops.h>
> +#include <linux/fdtable.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +
> +/**
> + * fdmap - get opened file descriptors of a process
> + * @pid: the pid of the target process
> + * @fds: allocated userspace buffer
> + * @count: buffer size (in descriptors)
> + * @start_fd: first descriptor to search from (inclusive)
> + * @flags: reserved for future functionality, must be zero
> + *
> + * If @pid is zero then it's current process.
> + * Return: number of descriptors written. An error code otherwise.
> + */
> +SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
> +		int, start_fd, int, flags)
> +{
> +	struct task_struct *task;
> +	struct files_struct *files;
> +	unsigned long search_mask;
> +	unsigned int user_index, offset;
> +	int masksize;
> +
> +	if (start_fd < 0 || flags != 0)
> +		return -EINVAL;
> +
> +	if (!pid) {
> +		files = get_files_struct(current);
> +	} else {
> +		rcu_read_lock();
> +		task = find_task_by_vpid(pid);
> +		if (!task) {
> +			rcu_read_unlock();
> +			return -ESRCH;
> +		}
> +		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> +			rcu_read_unlock();
> +			return -EACCES;
> +		}
> +		files = get_files_struct(task);
> +		rcu_read_unlock();
> +	}
> +	if (!files)
> +		return 0;
> +
> +	offset = start_fd / BITS_PER_LONG;
> +	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
> +	user_index = 0;
> +#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
> +	masksize = FDS_BUF_SIZE;
> +	while (user_index < count && masksize == FDS_BUF_SIZE) {
> +		unsigned long open_fds[FDS_BUF_SIZE];
> +		struct fdtable *fdt;
> +		unsigned int i;
> +
> +		/*
> +		 * fdt->max_fds can grow, get it every time
> +		 * before copying part into internal buffer.
> +		 */
> +		rcu_read_lock();
> +		fdt = files_fdtable(files);
> +		masksize = fdt->max_fds / 8 - offset * sizeof(long);
> +		if (masksize < 0) {
> +			rcu_read_unlock();
> +			break;
> +		}
> +		masksize = min(masksize, (int)sizeof(open_fds));
> +		memcpy(open_fds, fdt->open_fds + offset, masksize);
> +		rcu_read_unlock();
> +
> +		open_fds[0] &= search_mask;
> +		search_mask = ULONG_MAX;
> +		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
> +		start_fd = offset * BITS_PER_LONG;
> +		/*
> +		 * for_each_set_bit_from() can re-read first word
> +		 * multiple times which is not optimal.
> +		 */
> +		for (i = 0; i < masksize; i++) {
> +			unsigned long mask = open_fds[i];
> +
> +			while (mask) {
> +				unsigned int real_fd = start_fd + __ffs(mask);
> +
> +				if (put_user(real_fd, fds + user_index)) {
> +					put_files_struct(files);
> +					return -EFAULT;
> +				}
> +				if (++user_index >= count)
> +					goto out;
> +				mask &= mask - 1;
> +			}
> +			start_fd += BITS_PER_LONG;
> +		}
> +		offset += FDS_BUF_SIZE;
> +	}
> +out:
> +	put_files_struct(files);
> +
> +	return user_index;
> +}
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 95606a2d556f..d393d844facb 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
> +			  int start_fd, int flags);
>  
>  #endif
> diff --git a/init/Kconfig b/init/Kconfig
> index 78cb2461012e..952d13b7326d 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1400,6 +1400,13 @@ config MEMBARRIER
>  
>  	  If unsure, say Y.
>  
> +config FDMAP
> +	bool "fdmap() system call" if EXPERT
> +	default y
> +	help
> +	  Enable fdmap() system call that allows to query file descriptors
> +	  in binary form avoiding /proc overhead.
> +
>  config EMBEDDED
>  	bool "Embedded system"
>  	option allnoconfig_y
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef8576ce9..d61fa27d021e 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
>  cond_syscall(sys_pkey_mprotect);
>  cond_syscall(sys_pkey_alloc);
>  cond_syscall(sys_pkey_free);
> +
> +cond_syscall(sys_fdmap);
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index 26ce4f7168be..e8d63c27c865 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -5,6 +5,7 @@ TARGETS += cpufreq
>  TARGETS += cpu-hotplug
>  TARGETS += efivarfs
>  TARGETS += exec
> +TARGETS += fdmap
>  TARGETS += firmware
>  TARGETS += ftrace
>  TARGETS += futex
> diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
> new file mode 100644
> index 000000000000..9a9bfdac1cc0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/.gitignore
> @@ -0,0 +1 @@
> +fdmap_test
> diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
> new file mode 100644
> index 000000000000..bf9f051f4b63
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/Makefile
> @@ -0,0 +1,7 @@
> +TEST_GEN_PROGS := fdmap_test
> +CFLAGS += -Wall
> +
> +include ../lib.mk
> +
> +$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
> +	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
> diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
> new file mode 100644
> index 000000000000..66725b0201e0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.c
> @@ -0,0 +1,112 @@
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <dirent.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/types.h>
> +#include "fdmap.h"
> +
> +#define	BUF_SIZE	1024
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
> +{
> +	register int64_t r10 asm("r10") = start_fd;
> +	register int64_t r8 asm("r8") = flags;
> +	long ret;
> +
> +	asm volatile (
> +		"syscall"
> +		: "=a"(ret)
> +		: "0" (333),
> +		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
> +		: "rcx", "r11", "cc", "memory"
> +	);
> +	return ret;
> +}
> +
> +int fdmap_full(pid_t pid, int **fds, size_t *n)
> +{
> +	int buf[BUF_SIZE], start_fd = 0;
> +	long ret;
> +
> +	*n = 0;
> +	*fds = NULL;
> +	for (;;) {
> +		int *new_buff;
> +
> +		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
> +		if (ret < 0)
> +			break;
> +		if (!ret)
> +			return 0;
> +
> +		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
> +		if (!new_buff) {
> +			ret = -errno;
> +			break;
> +		}
> +		*fds = new_buff;
> +		memcpy(*fds + *n, buf, ret * sizeof(int));
> +		*n += ret;
> +		start_fd = (*fds)[*n - 1] + 1;
> +	}
> +	free(*fds);
> +	*fds = NULL;
> +	return -ret;
> +}
> +
> +int fdmap_proc(pid_t pid, int **fds, size_t *n)
> +{
> +	char fds_path[20];
> +	int dir_fd = 0;
> +	struct dirent *fd_link;
> +	DIR *fds_dir;
> +
> +	*fds = NULL;
> +	*n = 0;
> +	if (!pid)
> +		strcpy(fds_path, "/proc/self/fd");
> +	else
> +		sprintf(fds_path, "/proc/%d/fd", pid);
> +
> +	fds_dir = opendir(fds_path);
> +	if (!fds_dir)
> +		return errno == ENOENT ? ESRCH : errno;
> +	if (!pid)
> +		dir_fd = dirfd(fds_dir);
> +
> +	while ((fd_link = readdir(fds_dir))) {
> +		if (fd_link->d_name[0] < '0'
> +		    || fd_link->d_name[0] > '9')
> +			continue;
> +		if (*n % BUF_SIZE == 0) {
> +			int *new_buff;
> +
> +			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
> +			if (!new_buff) {
> +				int ret = errno;
> +
> +				free(*fds);
> +				*fds = NULL;
> +				return ret;
> +			}
> +			*fds = new_buff;
> +		}
> +		(*fds)[*n] = atoi(fd_link->d_name);
> +		*n += 1;
> +	}
> +	closedir(fds_dir);
> +
> +	if (!pid) {
> +		size_t i;
> +
> +		for (i = 0; i < *n; i++)
> +			if ((*fds)[i] == dir_fd)
> +				break;
> +		i++;
> +		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
> +		(*n)--;
> +	}
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
> new file mode 100644
> index 000000000000..c501111b2bbd
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.h
> @@ -0,0 +1,12 @@
> +#ifndef FDMAP_H
> +#define FDMAP_H
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
> +int fdmap_full(pid_t pid, int **fds, size_t *n);
> +int fdmap_proc(pid_t pid, int **fds, size_t *n);
> +
> +#endif
> diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
> new file mode 100644
> index 000000000000..6f448406d96a
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap_test.c
> @@ -0,0 +1,153 @@
> +#include <errno.h>
> +#include <syscall.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
> +#include <limits.h>
> +#include "../kselftest_harness.h"
> +#include "fdmap.h"
> +
> +TEST(efault) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EFAULT, errno);
> +}
> +
> +TEST(big_start_fd) {
> +	int fds[1];
> +	int ret;
> +
> +	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
> +	ASSERT_EQ(0, ret);
> +}
> +
> +TEST(einval) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 0, -1, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +
> +	ret = syscall(333, 0, NULL, 0, 0, 1);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +}
> +
> +TEST(esrch) {
> +	int fds[1], ret;
> +	pid_t pid;
> +
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid)
> +		exit(0);
> +	waitpid(pid, NULL, 0);
> +
> +	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(ESRCH, errno);
> +}
> +
> +TEST(simple) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(init) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(1, &fds1, &size1);
> +	ret2 = fdmap_proc(1, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(zero) {
> +	int *fds, i;
> +	size_t size;
> +	int ret;
> +
> +	ret = fdmap_proc(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	for (i = 0; i < size; i++)
> +		close(fds[i]);
> +	free(fds);
> +	fds = NULL;
> +
> +	ret = fdmap_full(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	ASSERT_EQ(0, size);
> +}
> +
> +TEST(more_fds) {
> +	int *fds1, *fds2, ret1, ret2;
> +	size_t size1, size2, i;
> +
> +	struct rlimit rlim = {
> +		.rlim_cur = 600000,
> +		.rlim_max = 600000
> +	};
> +	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
> +	for (int i = 0; i < 500000; i++)
> +		dup(0);
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(child) {
> +	int pipefd[2];
> +	int *fds1, *fds2, ret1, ret2, i;
> +	size_t size1, size2;
> +	char byte = 0;
> +	pid_t pid;
> +
> +	ASSERT_NE(-1, pipe(pipefd));
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid) {
> +		read(pipefd[0], &byte, 1);
> +		close(pipefd[0]);
> +		close(pipefd[1]);
> +		exit(0);
> +	}
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +
> +	write(pipefd[1], &byte, 1);
> +	close(pipefd[0]);
> +	close(pipefd[1]);
> +	waitpid(pid, NULL, 0);
> +}
> +
> +TEST_HARNESS_MAIN

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-10 22:08   ` Andrei Vagin
  0 siblings, 0 replies; 51+ messages in thread
From: Andrei Vagin @ 2017-10-10 22:08 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w

On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> 
> Implement system call for bulk retrieveing of opened descriptors
> in binary form.
> 
> Some daemons could use it to reliably close file descriptors
> before starting. Currently they close everything upto some number
> which formally is not reliable. Other natural users are lsof(1) and CRIU
> (although lsof does so much in /proc that the effect is thoroughly buried).

Hello Alexey,

I am not sure about the idea to add syscalls for all sort of process
attributes. For example, in CRIU we need file descriptors with their
properties, which we currently get from /proc/pid/fdinfo/. How can
this interface be extended to achieve our goal?

Have you seen the task-diag interface what I sent about a year ago?

We had a discussion on the previous kernel summit how to rework
task-diag, so that it can be merged into the upstream kernel.
Unfortunately, I didn't send a summary for this discussion. But it's
better now than never. We decided to do something like this:

1. Add a new syscall readfile(fname, buf, size), which can be
used to read small files without opening a file descriptor. It will be
useful for proc files, configs, etc.

2. bin/text/bin conversion is very slow
 - 65.47% proc_pid_status
  - 20.81% render_sigset_t
   - 18.27% seq_printf
    + 15.77% seq_vprintf
  - 10.65% task_mem
    + 8.78% seq_print
    + 1.02% hugetlb_rep
  + 7.40% seq_printf
so a new interface has to use a binary format and the format of netlink
messages can be used here. It should be possible to extend a file
without breaking backward compatibility.

3. There are a lot of objection to use a netlink sockets out of the network
subsystem. The idea of using a "transaction" file looks weird for many
people, so we decided to add a few files in /proc/pid/. I see
minimum two files. One file contains information about a task, it is
mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
describes a task memory, it is what we have now in /proc/pid/smaps.
Here is one more major idea. All attributes in a file has to be equal in
term of performance, or by other words there should not be attributes,
which significantly affect a generation time of a whole file.

If we look at /proc/pid/smaps, we spend a lot of time to get memory
statistics. This file contains a lot of data and if you read it to get
VmFlags, the kernel will waste your time by generating a useless data
for you.

Here is my slides for this discussion:
https://www.linuxplumbersconf.org/2016/ocw/system/presentations/4599/original/Netlink-issues.pdf

Could you add me into recipients for this sort of patches in a future?

Thanks,
Andrei

> 
> /proc, the only way to learn anything about file descriptors may not be
> available. There is unavoidable overhead associated with instantiating
> 3 dentries and 3 inodes and converting integers to strings and back.
> 
> Benchmark:
> 
> 	N=1<<22 times
> 	4 opened descriptors (0, 1, 2, 3)
> 	opendir+readdir+closedir /proc/self/fd vs fdmap
> 
> 	/proc 8.31 ± 0.37%
> 	fdmap 0.32 ± 0.72%
> 
> 
> FDMAP(2)                   Linux Programmer's Manual                  FDMAP(2)
> 
> NAME
>        fdmap - get open file descriptors of the process
> 
> SYNOPSIS
>        long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);
> 
> DESCRIPTION
>        fdmap()  writes  open  file  descriptors  of the process into buffer fd
>        starting from the start descriptor. At most nfd elements  are  written.
>        flags argument is reserved and must be zero.
> 
>        If pid is zero, syscall will work with the current process.
> 
> RETURN VALUE
>        On success, number of descriptors written is returned.  On error, -1 is
>        returned, and errno is set appropriately.
> 
> ERRORS
>        ESRCH  No such process.
> 
>        EACCES Permission denied.
> 
>        EFAULT Invalid fd pointer.
> 
>        EINVAL Negative start argument.
> 
> NOTES
>        Glibc does not provide a wrapper for these system call; call  it  using
>        syscall(2).
> 
> EXAMPLE
>        The program below demonstrates fdmap() usage.
> 
>            $ ./a.out $$
>            0 1 2 255
> 
>            $ ./a.out 42</dev/null 1023</dev/null
>            0 1 2 42
>            1023
> 
>    Program source
> 
>        #include <sys/types.h>
>        #include <stdlib.h>
>        #include <stdio.h>
> 
>        static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned int start, int flags)
>        {
>             register long r10 asm ("r10") = start;
>             register long r8 asm ("r8") = flags;
>             long rv;
>             asm volatile (
>                  "syscall"
>                  : "=a" (rv)
>                  : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" (r8)
>                  : "rcx", "r11", "cc", "memory"
>             );
>             return rv;
>        }
> 
>        int main(int argc, char *argv[])
>        {
>             pid_t pid;
>             int fd[4];
>             unsigned int start;
>             int n;
> 
>             pid = 0;
>             if (argc > 1)
>                  pid = atoi(argv[1]);
> 
>             start = 0;
>             while ((n = fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start, 0)) > 0) {
>                  unsigned int i;
> 
>                  for (i = 0; i < n; i++)
>                       printf("%u ", fd[i]);
>                  printf("\n");
> 
>                  start = fd[n - 1] + 1;
>             }
>             return 0;
>        }
> 
> Linux                             2017-09-21                          FDMAP(2)
> 
> Changelog:
> 
> 	CONFIG_PIDMAP option
> 	manpage
> 
> 
> Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> Signed-off-by: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl     |   1 +
>  fs/Makefile                                |   2 +
>  fs/fdmap.c                                 | 105 ++++++++++++++++++++
>  include/linux/syscalls.h                   |   2 +
>  init/Kconfig                               |   7 ++
>  kernel/sys_ni.c                            |   2 +
>  tools/testing/selftests/Makefile           |   1 +
>  tools/testing/selftests/fdmap/.gitignore   |   1 +
>  tools/testing/selftests/fdmap/Makefile     |   7 ++
>  tools/testing/selftests/fdmap/fdmap.c      | 112 +++++++++++++++++++++
>  tools/testing/selftests/fdmap/fdmap.h      |  12 +++
>  tools/testing/selftests/fdmap/fdmap_test.c | 153 +++++++++++++++++++++++++++++
>  12 files changed, 405 insertions(+)
>  create mode 100644 fs/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/.gitignore
>  create mode 100644 tools/testing/selftests/fdmap/Makefile
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.c
>  create mode 100644 tools/testing/selftests/fdmap/fdmap.h
>  create mode 100644 tools/testing/selftests/fdmap/fdmap_test.c
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183e2f85..9bfe5f79674f 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330	common	pkey_alloc		sys_pkey_alloc
>  331	common	pkey_free		sys_pkey_free
>  332	common	statx			sys_statx
> +333	common	fdmap			sys_fdmap
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/Makefile b/fs/Makefile
> index 7bbaca9c67b1..27476a66c18e 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -13,6 +13,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
>  		pnode.o splice.o sync.o utimes.o \
>  		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
>  
> +obj-$(CONFIG_FDMAP) += fdmap.o
> +
>  ifeq ($(CONFIG_BLOCK),y)
>  obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
>  else
> diff --git a/fs/fdmap.c b/fs/fdmap.c
> new file mode 100644
> index 000000000000..274e6c5b7c9c
> --- /dev/null
> +++ b/fs/fdmap.c
> @@ -0,0 +1,105 @@
> +#include <linux/bitops.h>
> +#include <linux/fdtable.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +
> +/**
> + * fdmap - get opened file descriptors of a process
> + * @pid: the pid of the target process
> + * @fds: allocated userspace buffer
> + * @count: buffer size (in descriptors)
> + * @start_fd: first descriptor to search from (inclusive)
> + * @flags: reserved for future functionality, must be zero
> + *
> + * If @pid is zero then it's current process.
> + * Return: number of descriptors written. An error code otherwise.
> + */
> +SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
> +		int, start_fd, int, flags)
> +{
> +	struct task_struct *task;
> +	struct files_struct *files;
> +	unsigned long search_mask;
> +	unsigned int user_index, offset;
> +	int masksize;
> +
> +	if (start_fd < 0 || flags != 0)
> +		return -EINVAL;
> +
> +	if (!pid) {
> +		files = get_files_struct(current);
> +	} else {
> +		rcu_read_lock();
> +		task = find_task_by_vpid(pid);
> +		if (!task) {
> +			rcu_read_unlock();
> +			return -ESRCH;
> +		}
> +		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> +			rcu_read_unlock();
> +			return -EACCES;
> +		}
> +		files = get_files_struct(task);
> +		rcu_read_unlock();
> +	}
> +	if (!files)
> +		return 0;
> +
> +	offset = start_fd / BITS_PER_LONG;
> +	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
> +	user_index = 0;
> +#define FDS_BUF_SIZE	(512/sizeof(unsigned long))
> +	masksize = FDS_BUF_SIZE;
> +	while (user_index < count && masksize == FDS_BUF_SIZE) {
> +		unsigned long open_fds[FDS_BUF_SIZE];
> +		struct fdtable *fdt;
> +		unsigned int i;
> +
> +		/*
> +		 * fdt->max_fds can grow, get it every time
> +		 * before copying part into internal buffer.
> +		 */
> +		rcu_read_lock();
> +		fdt = files_fdtable(files);
> +		masksize = fdt->max_fds / 8 - offset * sizeof(long);
> +		if (masksize < 0) {
> +			rcu_read_unlock();
> +			break;
> +		}
> +		masksize = min(masksize, (int)sizeof(open_fds));
> +		memcpy(open_fds, fdt->open_fds + offset, masksize);
> +		rcu_read_unlock();
> +
> +		open_fds[0] &= search_mask;
> +		search_mask = ULONG_MAX;
> +		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
> +		start_fd = offset * BITS_PER_LONG;
> +		/*
> +		 * for_each_set_bit_from() can re-read first word
> +		 * multiple times which is not optimal.
> +		 */
> +		for (i = 0; i < masksize; i++) {
> +			unsigned long mask = open_fds[i];
> +
> +			while (mask) {
> +				unsigned int real_fd = start_fd + __ffs(mask);
> +
> +				if (put_user(real_fd, fds + user_index)) {
> +					put_files_struct(files);
> +					return -EFAULT;
> +				}
> +				if (++user_index >= count)
> +					goto out;
> +				mask &= mask - 1;
> +			}
> +			start_fd += BITS_PER_LONG;
> +		}
> +		offset += FDS_BUF_SIZE;
> +	}
> +out:
> +	put_files_struct(files);
> +
> +	return user_index;
> +}
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 95606a2d556f..d393d844facb 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -936,5 +936,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>  			  unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
> +			  int start_fd, int flags);
>  
>  #endif
> diff --git a/init/Kconfig b/init/Kconfig
> index 78cb2461012e..952d13b7326d 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1400,6 +1400,13 @@ config MEMBARRIER
>  
>  	  If unsure, say Y.
>  
> +config FDMAP
> +	bool "fdmap() system call" if EXPERT
> +	default y
> +	help
> +	  Enable fdmap() system call that allows to query file descriptors
> +	  in binary form avoiding /proc overhead.
> +
>  config EMBEDDED
>  	bool "Embedded system"
>  	option allnoconfig_y
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef8576ce9..d61fa27d021e 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -258,3 +258,5 @@ cond_syscall(sys_membarrier);
>  cond_syscall(sys_pkey_mprotect);
>  cond_syscall(sys_pkey_alloc);
>  cond_syscall(sys_pkey_free);
> +
> +cond_syscall(sys_fdmap);
> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
> index 26ce4f7168be..e8d63c27c865 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -5,6 +5,7 @@ TARGETS += cpufreq
>  TARGETS += cpu-hotplug
>  TARGETS += efivarfs
>  TARGETS += exec
> +TARGETS += fdmap
>  TARGETS += firmware
>  TARGETS += ftrace
>  TARGETS += futex
> diff --git a/tools/testing/selftests/fdmap/.gitignore b/tools/testing/selftests/fdmap/.gitignore
> new file mode 100644
> index 000000000000..9a9bfdac1cc0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/.gitignore
> @@ -0,0 +1 @@
> +fdmap_test
> diff --git a/tools/testing/selftests/fdmap/Makefile b/tools/testing/selftests/fdmap/Makefile
> new file mode 100644
> index 000000000000..bf9f051f4b63
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/Makefile
> @@ -0,0 +1,7 @@
> +TEST_GEN_PROGS := fdmap_test
> +CFLAGS += -Wall
> +
> +include ../lib.mk
> +
> +$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
> +	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
> diff --git a/tools/testing/selftests/fdmap/fdmap.c b/tools/testing/selftests/fdmap/fdmap.c
> new file mode 100644
> index 000000000000..66725b0201e0
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.c
> @@ -0,0 +1,112 @@
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <dirent.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/types.h>
> +#include "fdmap.h"
> +
> +#define	BUF_SIZE	1024
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
> +{
> +	register int64_t r10 asm("r10") = start_fd;
> +	register int64_t r8 asm("r8") = flags;
> +	long ret;
> +
> +	asm volatile (
> +		"syscall"
> +		: "=a"(ret)
> +		: "0" (333),
> +		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
> +		: "rcx", "r11", "cc", "memory"
> +	);
> +	return ret;
> +}
> +
> +int fdmap_full(pid_t pid, int **fds, size_t *n)
> +{
> +	int buf[BUF_SIZE], start_fd = 0;
> +	long ret;
> +
> +	*n = 0;
> +	*fds = NULL;
> +	for (;;) {
> +		int *new_buff;
> +
> +		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
> +		if (ret < 0)
> +			break;
> +		if (!ret)
> +			return 0;
> +
> +		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
> +		if (!new_buff) {
> +			ret = -errno;
> +			break;
> +		}
> +		*fds = new_buff;
> +		memcpy(*fds + *n, buf, ret * sizeof(int));
> +		*n += ret;
> +		start_fd = (*fds)[*n - 1] + 1;
> +	}
> +	free(*fds);
> +	*fds = NULL;
> +	return -ret;
> +}
> +
> +int fdmap_proc(pid_t pid, int **fds, size_t *n)
> +{
> +	char fds_path[20];
> +	int dir_fd = 0;
> +	struct dirent *fd_link;
> +	DIR *fds_dir;
> +
> +	*fds = NULL;
> +	*n = 0;
> +	if (!pid)
> +		strcpy(fds_path, "/proc/self/fd");
> +	else
> +		sprintf(fds_path, "/proc/%d/fd", pid);
> +
> +	fds_dir = opendir(fds_path);
> +	if (!fds_dir)
> +		return errno == ENOENT ? ESRCH : errno;
> +	if (!pid)
> +		dir_fd = dirfd(fds_dir);
> +
> +	while ((fd_link = readdir(fds_dir))) {
> +		if (fd_link->d_name[0] < '0'
> +		    || fd_link->d_name[0] > '9')
> +			continue;
> +		if (*n % BUF_SIZE == 0) {
> +			int *new_buff;
> +
> +			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
> +			if (!new_buff) {
> +				int ret = errno;
> +
> +				free(*fds);
> +				*fds = NULL;
> +				return ret;
> +			}
> +			*fds = new_buff;
> +		}
> +		(*fds)[*n] = atoi(fd_link->d_name);
> +		*n += 1;
> +	}
> +	closedir(fds_dir);
> +
> +	if (!pid) {
> +		size_t i;
> +
> +		for (i = 0; i < *n; i++)
> +			if ((*fds)[i] == dir_fd)
> +				break;
> +		i++;
> +		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
> +		(*n)--;
> +	}
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/fdmap/fdmap.h b/tools/testing/selftests/fdmap/fdmap.h
> new file mode 100644
> index 000000000000..c501111b2bbd
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap.h
> @@ -0,0 +1,12 @@
> +#ifndef FDMAP_H
> +#define FDMAP_H
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +
> +long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
> +int fdmap_full(pid_t pid, int **fds, size_t *n);
> +int fdmap_proc(pid_t pid, int **fds, size_t *n);
> +
> +#endif
> diff --git a/tools/testing/selftests/fdmap/fdmap_test.c b/tools/testing/selftests/fdmap/fdmap_test.c
> new file mode 100644
> index 000000000000..6f448406d96a
> --- /dev/null
> +++ b/tools/testing/selftests/fdmap/fdmap_test.c
> @@ -0,0 +1,153 @@
> +#include <errno.h>
> +#include <syscall.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
> +#include <limits.h>
> +#include "../kselftest_harness.h"
> +#include "fdmap.h"
> +
> +TEST(efault) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 20 * sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EFAULT, errno);
> +}
> +
> +TEST(big_start_fd) {
> +	int fds[1];
> +	int ret;
> +
> +	ret = syscall(333, 0, fds, sizeof(int), INT_MAX, 0);
> +	ASSERT_EQ(0, ret);
> +}
> +
> +TEST(einval) {
> +	int ret;
> +
> +	ret = syscall(333, 0, NULL, 0, -1, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +
> +	ret = syscall(333, 0, NULL, 0, 0, 1);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(EINVAL, errno);
> +}
> +
> +TEST(esrch) {
> +	int fds[1], ret;
> +	pid_t pid;
> +
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid)
> +		exit(0);
> +	waitpid(pid, NULL, 0);
> +
> +	ret = syscall(333, pid, fds, sizeof(int), 0, 0);
> +	ASSERT_EQ(-1, ret);
> +	ASSERT_EQ(ESRCH, errno);
> +}
> +
> +TEST(simple) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(init) {
> +	int *fds1, *fds2;
> +	size_t size1, size2, i;
> +	int ret1, ret2;
> +
> +	ret1 = fdmap_full(1, &fds1, &size1);
> +	ret2 = fdmap_proc(1, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(zero) {
> +	int *fds, i;
> +	size_t size;
> +	int ret;
> +
> +	ret = fdmap_proc(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	for (i = 0; i < size; i++)
> +		close(fds[i]);
> +	free(fds);
> +	fds = NULL;
> +
> +	ret = fdmap_full(0, &fds, &size);
> +	ASSERT_EQ(0, ret);
> +	ASSERT_EQ(0, size);
> +}
> +
> +TEST(more_fds) {
> +	int *fds1, *fds2, ret1, ret2;
> +	size_t size1, size2, i;
> +
> +	struct rlimit rlim = {
> +		.rlim_cur = 600000,
> +		.rlim_max = 600000
> +	};
> +	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
> +	for (int i = 0; i < 500000; i++)
> +		dup(0);
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +}
> +
> +TEST(child) {
> +	int pipefd[2];
> +	int *fds1, *fds2, ret1, ret2, i;
> +	size_t size1, size2;
> +	char byte = 0;
> +	pid_t pid;
> +
> +	ASSERT_NE(-1, pipe(pipefd));
> +	pid = fork();
> +	ASSERT_NE(-1, pid);
> +	if (!pid) {
> +		read(pipefd[0], &byte, 1);
> +		close(pipefd[0]);
> +		close(pipefd[1]);
> +		exit(0);
> +	}
> +
> +	ret1 = fdmap_full(0, &fds1, &size1);
> +	ret2 = fdmap_proc(0, &fds2, &size2);
> +	ASSERT_EQ(ret2, ret1);
> +	ASSERT_EQ(size2, size1);
> +	for (i = 0; i < size1; i++)
> +		ASSERT_EQ(fds2[i], fds1[i]);
> +	free(fds1);
> +	free(fds2);
> +
> +	write(pipefd[1], &byte, 1);
> +	close(pipefd[0]);
> +	close(pipefd[1]);
> +	waitpid(pid, NULL, 0);
> +}
> +
> +TEST_HARNESS_MAIN

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-09-28 15:02             ` Andy Lutomirski
  (?)
@ 2017-10-11 17:37             ` Alexey Dobriyan
  -1 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-11 17:37 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Michael Kerrisk, Andrew Morton, linux-kernel, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov,
	Aliaksandr Patseyenak, Tatsiana Brouka

On Thu, Sep 28, 2017 at 08:02:23AM -0700, Andy Lutomirski wrote:
> On Thu, Sep 28, 2017 at 3:55 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> > On 9/28/17, Michael Kerrisk (man-pages) <mtk.manpages@gmail.com> wrote:
> >> On 27 September 2017 at 17:03, Andy Lutomirski <luto@amacapital.net> wrote:
> >
> >>>> The idea is to start process. In ideal world, only bynary system calls
> >>>> would exist and shells could emulate /proc/* same way bash implement
> >>>> /dev/tcp
> >>>
> >>> Then start the process by doing it for real and making it obviously
> >>> useful.  We should not add a pair of vaguely useful, rather weak
> >>> syscalls just to start a process of modernizing /proc.
> >
> > Before doing it for real it would be nice to have at least a nod
> > from people in charge that syscalls which return binary
> > information are OK. Otherwise some EIATF guy will just say
> > "NAK /proc is fine, it always was fine".
> 
> There's nothing inherently wrong with syscalls that return binary
> information.  There is something wrong with reinventing the world with
> insufficient justification, though.
> 
> /proc may be old, clunky, and kind of slow, but it has a lot of good
> things going for it.  It supports access control (DAC and MAC).  It
> handles namespacing in a way that's awkward but supported by all the
> existing namespace managers.  It may soon support mount options, which
> is rather important.
> 
> I feel like we've been discussing this performance issue for over a
> year, and I distinctly recall discussing it in Santa Fe.  I suggested
> a two-pronged approach:
> 
> 1. Add a new syscall that will, in a single call, open, read, and
> close a proc file and maybe even a regular non-proc file.  Like this:
> 
> long readfileat(int dirfd, const char *path, void *buf, size_t len, int flags);
> 
> 2. Where needed, add new /proc files with lighter-weight
> representations.  I think we discussed something that's like status
> but in nl_attr format.
> 
> This doesn't end up with a bunch of code duplication the way that a
> full-blown syscall-based reimplementation would.  It supports all the
> /proc features rather than just a subset.  It fully respects access
> control, future mount options, and the potential lack of a /proc
> mount.

Aplogies for delayed answer.

The code duplication exists solely because sometime in the beginning
/proc was chosen. There was another route: design system calls for
getting process statistics and add another binary to coreutils
which uses said system calls so that shell scripts could use them.

It is _never_ late to simply stop expanding /proc.

> > Or look from another angle: sched_setaffinity exists but there is
> > no /proc counterpart, shells must use taskset(1) and world didn't end.
> 
> sched_setaffinity() modifies the caller.  /proc wouldn't have made much sense.

I think you're technically wrong: sched_setaffinity(2) doesn't modify
supplied cpumask in userspace. But even if it did it doesn't matter:
imaginary /proc/*/affinity file would simply be re-read to verify that
it was indeed set correctly to mimic umask(2) behaviour.

> >> I concur.
> >>
> >> Alexey, you still have not wxplained who specifically needs this
> >> right now, and how, precisely, they plan to use the new system calls.
> >> It is all very arm-wavey so far.
> >
> > It is not if you read even example program in the original patch.
> > Any program which queries information about file descriptors
> > will benefit both in CPU and memory usage.
> >
> > void closefrom(int start)
> > {
> >         int fd[1024];
> >         int n;
> >
> >         while ((n = fdmap(0, fd, sizeof(fd)/sizeof(fd[0]), start)) > 0) {
> >                 unsigned int i;
> >
> >                 for (i = 0; i < n; i++)
> >                         close(fd[i]);
> >
> >                 start = fd[n - 1] + 1;
> >         }
> > }
> >
> > CRIU naturally to know everything about descriptors of target processes:
> > It does:
> >
> > int predump_task_files(int pid)
> > {
> >         struct dirent *de;
> >         DIR *fd_dir;
> >         int ret = -1;
> >
> >         pr_info("Pre-dump fds for %d)\n", pid);
> >
> >         fd_dir = opendir_proc(pid, "fd");
> >         if (!fd_dir)
> >                 return -1;
> >
> >         while ((de = readdir(fd_dir))) {
> >                 if (dir_dots(de))
> >                         continue;
> >
> >                 if (predump_one_fd(pid, atoi(de->d_name)))
> >                         goto out;
> >         }
> >
> >         ret = 0;
> > out:
> >         closedir(fd_dir);
> >         return ret;
> > }
> >
> > which is again inefficient.
> 
> And /proc/PID/fds would solve this.

Retaining all /proc problems, to reiterate:
* 3 dentries, 3 inodes (allocating and instantiating),
* 1 struct file + descriptor
* 1 page for seqfile buffer,
* converting, copying more than necessary (strings are never better than binary)

readfileat() would help with "struct file" part, but not with anything else.

fdmap(2) simply avoids _all_ overhead except of that truly necessary:
security checks, and shipping data to userspace.

Originally I thought about using direct copy_to_user() from in-kernel
bitmaps which would have been _the_ fastest way possible, but seeing
IDR descriptors patches that idea was scraped. :-(

So my counter suggestion is to merge fdmap(2), it is small and simple.
By itself it can be used to implement "close all descriptors" idiom already
used by userspace and BSD closefrom(2).

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
  2017-10-10 22:08   ` Andrei Vagin
  (?)
@ 2017-10-11 18:12   ` Alexey Dobriyan
  2017-10-12  8:06       ` Andrei Vagin
  -1 siblings, 1 reply; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-11 18:12 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey

On Tue, Oct 10, 2017 at 03:08:06PM -0700, Andrei Vagin wrote:
> On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> > 
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> > 
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> 
> Hello Alexey,
> 
> I am not sure about the idea to add syscalls for all sort of process
> attributes. For example, in CRIU we need file descriptors with their
> properties, which we currently get from /proc/pid/fdinfo/. How can
> this interface be extended to achieve our goal?
> 
> Have you seen the task-diag interface what I sent about a year ago?

Of course, let's discuss /proc/task_diag.

Adding it as /proc file is obviously unnecessary: you do it only
to hook ->read and ->write netlink style
(and BTW you don't need .THIS_MODULE anymore ;-)

Transactional netlink send and recv aren't necessary either.
As I understand it, it comes from old times when netlink was async,
so 2 syscalls were neccesary. Netlink is not async anymore.

Basically you want to do sys_task_diag(2) which accepts set of pids
(maybe) and a mask (see statx()) and returns synchronously result into
a buffer.

> We had a discussion on the previous kernel summit how to rework
> task-diag, so that it can be merged into the upstream kernel.
> Unfortunately, I didn't send a summary for this discussion. But it's
> better now than never. We decided to do something like this:
> 
> 1. Add a new syscall readfile(fname, buf, size), which can be
> used to read small files without opening a file descriptor. It will be
> useful for proc files, configs, etc.

If nothing, it should be done because the number of programmers capable
of writing readfile() in userspace correctly handling all errors and
short reads is very small indeed. Out of curiosity I once booted a kernel
which made all reads short by default. It was fascinating I can tell you.

> 2. bin/text/bin conversion is very slow
>  - 65.47% proc_pid_status
>   - 20.81% render_sigset_t
>    - 18.27% seq_printf
>     + 15.77% seq_vprintf
>   - 10.65% task_mem
>     + 8.78% seq_print
>     + 1.02% hugetlb_rep
>   + 7.40% seq_printf
> so a new interface has to use a binary format and the format of netlink
> messages can be used here. It should be possible to extend a file
> without breaking backward compatibility.

Binary -- yes.
netlink attributes -- maybe.

There is statx() model which is perfect for this usecase:
do not want pagecache of all block devices? sure, no problem.

> 3. There are a lot of objection to use a netlink sockets out of the network
> subsystem. The idea of using a "transaction" file looks weird for many
> people, so we decided to add a few files in /proc/pid/. I see
> minimum two files. One file contains information about a task, it is
> mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
> describes a task memory, it is what we have now in /proc/pid/smaps.
> Here is one more major idea. All attributes in a file has to be equal in
> term of performance, or by other words there should not be attributes,
> which significantly affect a generation time of a whole file.
> 
> If we look at /proc/pid/smaps, we spend a lot of time to get memory
> statistics. This file contains a lot of data and if you read it to get
> VmFlags, the kernel will waste your time by generating a useless data
> for you.

There is a unsolvable problem with /proc/*/stat style files. Anyone
who wants to add new stuff has a desicion to make, whether add new /proc
file or extend existing /proc file.

Adding new /proc file means 3 syscalls currently, it surely will become
better with aforementioned readfileat() but even adding tons of symlinks
like this:

	$ readlink /proc/self/affinity
	0f

would have been better -- readlink doesn't open files.

Adding to existing file means _all_ users have to eat the cost as
read(2) doesn't accept any sort of mask to filter data. Most /proc files
are seqfiles now which most of the time internally generates whole buffer
before shipping data to userspace. cat(1) does 32KB read by default
which is bigger than most of files in /proc and stat'ing /proc files is
useless because they're all 0 length. Reliable rewinding to necessary data
is possible only with memchr() which misses the point.

Basically, those sacred text files the Universe consists of suck.

With statx() model the cost of extending result with new data is very
small -- 1 branch to skip generation of data.

I suggest that anyone who dares to improve the situation with process
statistics and anything /proc related uses it as a model.

Of course, I also suggest to freeze /proc for new stuff to press
the issue but one can only dream.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
  2017-10-11 18:12   ` Alexey Dobriyan
@ 2017-10-12  8:06       ` Andrei Vagin
  0 siblings, 0 replies; 51+ messages in thread
From: Andrei Vagin @ 2017-10-12  8:06 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey

On Wed, Oct 11, 2017 at 09:12:34PM +0300, Alexey Dobriyan wrote:
> On Tue, Oct 10, 2017 at 03:08:06PM -0700, Andrei Vagin wrote:
> > On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> > > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> > > 
> > > Implement system call for bulk retrieveing of opened descriptors
> > > in binary form.
> > > 
> > > Some daemons could use it to reliably close file descriptors
> > > before starting. Currently they close everything upto some number
> > > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > > (although lsof does so much in /proc that the effect is thoroughly buried).
> > 
> > Hello Alexey,
> > 
> > I am not sure about the idea to add syscalls for all sort of process
> > attributes. For example, in CRIU we need file descriptors with their
> > properties, which we currently get from /proc/pid/fdinfo/. How can
> > this interface be extended to achieve our goal?
> > 
> > Have you seen the task-diag interface what I sent about a year ago?
> 
> Of course, let's discuss /proc/task_diag.
> 
> Adding it as /proc file is obviously unnecessary: you do it only
> to hook ->read and ->write netlink style
> (and BTW you don't need .THIS_MODULE anymore ;-)
> 
> Transactional netlink send and recv aren't necessary either.
> As I understand it, it comes from old times when netlink was async,
> so 2 syscalls were neccesary. Netlink is not async anymore.
> 
> Basically you want to do sys_task_diag(2) which accepts set of pids
> (maybe) and a mask (see statx()) and returns synchronously result into
> a buffer.

You are not quite right here. We send a request and then we read a
response, which can be bigger than what we can read for one call.

So we need something like a cursor, in your case it is the "start"
argument. But sometimes this cursor contains a kernel internal data
to have a better performance. We need to have a way to address this
cursor from userspace, and it is a reason why we need a file
descriptor in this scheme.

For example, you can look at the proc_maps_private structure.


> 
> > We had a discussion on the previous kernel summit how to rework
> > task-diag, so that it can be merged into the upstream kernel.
> > Unfortunately, I didn't send a summary for this discussion. But it's
> > better now than never. We decided to do something like this:
> > 
> > 1. Add a new syscall readfile(fname, buf, size), which can be
> > used to read small files without opening a file descriptor. It will be
> > useful for proc files, configs, etc.
> 
> If nothing, it should be done because the number of programmers capable
> of writing readfile() in userspace correctly handling all errors and
> short reads is very small indeed. Out of curiosity I once booted a kernel
> which made all reads short by default. It was fascinating I can tell you.
> 
> > 2. bin/text/bin conversion is very slow
> >  - 65.47% proc_pid_status
> >   - 20.81% render_sigset_t
> >    - 18.27% seq_printf
> >     + 15.77% seq_vprintf
> >   - 10.65% task_mem
> >     + 8.78% seq_print
> >     + 1.02% hugetlb_rep
> >   + 7.40% seq_printf
> > so a new interface has to use a binary format and the format of netlink
> > messages can be used here. It should be possible to extend a file
> > without breaking backward compatibility.
> 
> Binary -- yes.
> netlink attributes -- maybe.
> 
> There is statx() model which is perfect for this usecase:
> do not want pagecache of all block devices? sure, no problem.
> 
> > 3. There are a lot of objection to use a netlink sockets out of the network
> > subsystem. The idea of using a "transaction" file looks weird for many
> > people, so we decided to add a few files in /proc/pid/. I see
> > minimum two files. One file contains information about a task, it is
> > mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
> > describes a task memory, it is what we have now in /proc/pid/smaps.
> > Here is one more major idea. All attributes in a file has to be equal in
> > term of performance, or by other words there should not be attributes,
> > which significantly affect a generation time of a whole file.
> > 
> > If we look at /proc/pid/smaps, we spend a lot of time to get memory
> > statistics. This file contains a lot of data and if you read it to get
> > VmFlags, the kernel will waste your time by generating a useless data
> > for you.
> 
> There is a unsolvable problem with /proc/*/stat style files. Anyone
> who wants to add new stuff has a desicion to make, whether add new /proc
> file or extend existing /proc file.
> 
> Adding new /proc file means 3 syscalls currently, it surely will become
> better with aforementioned readfileat() but even adding tons of symlinks
> like this:
> 
> 	$ readlink /proc/self/affinity
> 	0f
> 
> would have been better -- readlink doesn't open files.
> 
> Adding to existing file means _all_ users have to eat the cost as
> read(2) doesn't accept any sort of mask to filter data. Most /proc files
> are seqfiles now which most of the time internally generates whole buffer
> before shipping data to userspace. cat(1) does 32KB read by default
> which is bigger than most of files in /proc and stat'ing /proc files is
> useless because they're all 0 length. Reliable rewinding to necessary data
> is possible only with memchr() which misses the point.
> 
> Basically, those sacred text files the Universe consists of suck.
> 
> With statx() model the cost of extending result with new data is very
> small -- 1 branch to skip generation of data.
> 
> I suggest that anyone who dares to improve the situation with process
> statistics and anything /proc related uses it as a model.
> 
> Of course, I also suggest to freeze /proc for new stuff to press
> the issue but one can only dream.


I'm agree with your points, but I think you choose a wrong set of data
to make an example of a new approach.

You are talking a lot about statx, but for me it is unclear how fdmap
follows the idea of statx. Let's imagine that I want to extend fdmap to
return mnt_id for each file descriptor?

Or it may be more complex case, when we decided to provide all data
from /proc/pid/fdinfo/X for each descriptor. A set of fields in fdinfo
depends on a type of a file descriptor, it is different for epoll,
signalfd, inotify, sockets, etc.

For inotify file descriptors, there are information about all watches,
so it is not possible to use a fixed size struture to present this data.

I like the interface of statx, but this case is more complex.

Thanks,
Andrei

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-12  8:06       ` Andrei Vagin
  0 siblings, 0 replies; 51+ messages in thread
From: Andrei Vagin @ 2017-10-12  8:06 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w

On Wed, Oct 11, 2017 at 09:12:34PM +0300, Alexey Dobriyan wrote:
> On Tue, Oct 10, 2017 at 03:08:06PM -0700, Andrei Vagin wrote:
> > On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> > > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> > > 
> > > Implement system call for bulk retrieveing of opened descriptors
> > > in binary form.
> > > 
> > > Some daemons could use it to reliably close file descriptors
> > > before starting. Currently they close everything upto some number
> > > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > > (although lsof does so much in /proc that the effect is thoroughly buried).
> > 
> > Hello Alexey,
> > 
> > I am not sure about the idea to add syscalls for all sort of process
> > attributes. For example, in CRIU we need file descriptors with their
> > properties, which we currently get from /proc/pid/fdinfo/. How can
> > this interface be extended to achieve our goal?
> > 
> > Have you seen the task-diag interface what I sent about a year ago?
> 
> Of course, let's discuss /proc/task_diag.
> 
> Adding it as /proc file is obviously unnecessary: you do it only
> to hook ->read and ->write netlink style
> (and BTW you don't need .THIS_MODULE anymore ;-)
> 
> Transactional netlink send and recv aren't necessary either.
> As I understand it, it comes from old times when netlink was async,
> so 2 syscalls were neccesary. Netlink is not async anymore.
> 
> Basically you want to do sys_task_diag(2) which accepts set of pids
> (maybe) and a mask (see statx()) and returns synchronously result into
> a buffer.

You are not quite right here. We send a request and then we read a
response, which can be bigger than what we can read for one call.

So we need something like a cursor, in your case it is the "start"
argument. But sometimes this cursor contains a kernel internal data
to have a better performance. We need to have a way to address this
cursor from userspace, and it is a reason why we need a file
descriptor in this scheme.

For example, you can look at the proc_maps_private structure.


> 
> > We had a discussion on the previous kernel summit how to rework
> > task-diag, so that it can be merged into the upstream kernel.
> > Unfortunately, I didn't send a summary for this discussion. But it's
> > better now than never. We decided to do something like this:
> > 
> > 1. Add a new syscall readfile(fname, buf, size), which can be
> > used to read small files without opening a file descriptor. It will be
> > useful for proc files, configs, etc.
> 
> If nothing, it should be done because the number of programmers capable
> of writing readfile() in userspace correctly handling all errors and
> short reads is very small indeed. Out of curiosity I once booted a kernel
> which made all reads short by default. It was fascinating I can tell you.
> 
> > 2. bin/text/bin conversion is very slow
> >  - 65.47% proc_pid_status
> >   - 20.81% render_sigset_t
> >    - 18.27% seq_printf
> >     + 15.77% seq_vprintf
> >   - 10.65% task_mem
> >     + 8.78% seq_print
> >     + 1.02% hugetlb_rep
> >   + 7.40% seq_printf
> > so a new interface has to use a binary format and the format of netlink
> > messages can be used here. It should be possible to extend a file
> > without breaking backward compatibility.
> 
> Binary -- yes.
> netlink attributes -- maybe.
> 
> There is statx() model which is perfect for this usecase:
> do not want pagecache of all block devices? sure, no problem.
> 
> > 3. There are a lot of objection to use a netlink sockets out of the network
> > subsystem. The idea of using a "transaction" file looks weird for many
> > people, so we decided to add a few files in /proc/pid/. I see
> > minimum two files. One file contains information about a task, it is
> > mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
> > describes a task memory, it is what we have now in /proc/pid/smaps.
> > Here is one more major idea. All attributes in a file has to be equal in
> > term of performance, or by other words there should not be attributes,
> > which significantly affect a generation time of a whole file.
> > 
> > If we look at /proc/pid/smaps, we spend a lot of time to get memory
> > statistics. This file contains a lot of data and if you read it to get
> > VmFlags, the kernel will waste your time by generating a useless data
> > for you.
> 
> There is a unsolvable problem with /proc/*/stat style files. Anyone
> who wants to add new stuff has a desicion to make, whether add new /proc
> file or extend existing /proc file.
> 
> Adding new /proc file means 3 syscalls currently, it surely will become
> better with aforementioned readfileat() but even adding tons of symlinks
> like this:
> 
> 	$ readlink /proc/self/affinity
> 	0f
> 
> would have been better -- readlink doesn't open files.
> 
> Adding to existing file means _all_ users have to eat the cost as
> read(2) doesn't accept any sort of mask to filter data. Most /proc files
> are seqfiles now which most of the time internally generates whole buffer
> before shipping data to userspace. cat(1) does 32KB read by default
> which is bigger than most of files in /proc and stat'ing /proc files is
> useless because they're all 0 length. Reliable rewinding to necessary data
> is possible only with memchr() which misses the point.
> 
> Basically, those sacred text files the Universe consists of suck.
> 
> With statx() model the cost of extending result with new data is very
> small -- 1 branch to skip generation of data.
> 
> I suggest that anyone who dares to improve the situation with process
> statistics and anything /proc related uses it as a model.
> 
> Of course, I also suggest to freeze /proc for new stuff to press
> the issue but one can only dream.


I'm agree with your points, but I think you choose a wrong set of data
to make an example of a new approach.

You are talking a lot about statx, but for me it is unclear how fdmap
follows the idea of statx. Let's imagine that I want to extend fdmap to
return mnt_id for each file descriptor?

Or it may be more complex case, when we decided to provide all data
from /proc/pid/fdinfo/X for each descriptor. A set of fields in fdinfo
depends on a type of a file descriptor, it is different for epoll,
signalfd, inotify, sockets, etc.

For inotify file descriptors, there are information about all watches,
so it is not possible to use a fixed size struture to present this data.

I like the interface of statx, but this case is more complex.

Thanks,
Andrei

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-18 11:35         ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-18 11:35 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz, gladkov.alexey

On 10/12/17, Andrei Vagin <avagin@virtuozzo.com> wrote:

> I'm agree with your points, but I think you choose a wrong set of data
> to make an example of a new approach.
>
> You are talking a lot about statx, but for me it is unclear how fdmap
> follows the idea of statx. Let's imagine that I want to extend fdmap to
> return mnt_id for each file descriptor?

fdmap() is standalone thing.

Next step is to design fdinfo(2) (?) which uses descriptors from fdmap(2).
Extending structures is done as usual: but version, add new fields to the end.

> Or it may be more complex case, when we decided to provide all data
> from /proc/pid/fdinfo/X for each descriptor. A set of fields in fdinfo
> depends on a type of a file descriptor, it is different for epoll,
> signalfd, inotify, sockets, etc.
>
> For inotify file descriptors, there are information about all watches,
> so it is not possible to use a fixed size struture to present this data.

Now I didn't look closely at inotify watches but this is done with unions,
zero-length trailing arrays, and other usual stuff. If OpenVZ/Virtuozzo is
doing C/R of inotify then you already have all serializing code.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-18 11:35         ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-18 11:35 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w

On 10/12/17, Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> wrote:

> I'm agree with your points, but I think you choose a wrong set of data
> to make an example of a new approach.
>
> You are talking a lot about statx, but for me it is unclear how fdmap
> follows the idea of statx. Let's imagine that I want to extend fdmap to
> return mnt_id for each file descriptor?

fdmap() is standalone thing.

Next step is to design fdinfo(2) (?) which uses descriptors from fdmap(2).
Extending structures is done as usual: but version, add new fields to the end.

> Or it may be more complex case, when we decided to provide all data
> from /proc/pid/fdinfo/X for each descriptor. A set of fields in fdinfo
> depends on a type of a file descriptor, it is different for epoll,
> signalfd, inotify, sockets, etc.
>
> For inotify file descriptors, there are information about all watches,
> so it is not possible to use a fixed size struture to present this data.

Now I didn't look closely at inotify watches but this is done with unions,
zero-length trailing arrays, and other usual stuff. If OpenVZ/Virtuozzo is
doing C/R of inotify then you already have all serializing code.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
  2017-10-18 11:35         ` Alexey Dobriyan
  (?)
@ 2017-10-18 17:47         ` Andy Lutomirski
  2017-10-19 15:34           ` Alexey Dobriyan
  -1 siblings, 1 reply; 51+ messages in thread
From: Andy Lutomirski @ 2017-10-18 17:47 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrei Vagin, Andrew Morton, linux-kernel, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov

> On Oct 18, 2017, at 4:35 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
>
>> On 10/12/17, Andrei Vagin <avagin@virtuozzo.com> wrote:
>>
>> I'm agree with your points, but I think you choose a wrong set of data
>> to make an example of a new approach.
>>
>> You are talking a lot about statx, but for me it is unclear how fdmap
>> follows the idea of statx. Let's imagine that I want to extend fdmap to
>> return mnt_id for each file descriptor?
>
> fdmap() is standalone thing.
>
> Next step is to design fdinfo(2) (?) which uses descriptors from fdmap(2).
> Extending structures is done as usual: but version, add new fields to the end.
>

I very strongly disagree.  If you really want a new interface for
reading out information about other processes, design one that makes
sense as a whole.  Don't design it piecemeal. The last thing we need
is a crappy /proc alternative that covers a small fraction of use
cases.  And demonstrate that it actually has a material benefit over
fixing /proc.

Meanwhile, why not just fix /proc?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
  2017-10-18 17:47         ` Andy Lutomirski
@ 2017-10-19 15:34           ` Alexey Dobriyan
  2017-10-20  7:48             ` Greg KH
  2017-10-26  7:53               ` Andy Lutomirski
  0 siblings, 2 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-19 15:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrei Vagin, Andrew Morton, linux-kernel, Linux API,
	Randy Dunlap, Thomas Gleixner, Djalal Harouni, Alexey Gladkov

On 10/18/17, Andy Lutomirski <luto@kernel.org> wrote:
>> fdmap() is standalone thing.
>>
>> Next step is to design fdinfo(2) (?) which uses descriptors from
>> fdmap(2). Extending structures is done as usual: but version,
>> add new fields to the end.
>
> I very strongly disagree.  If you really want a new interface for
> reading out information about other processes, design one that makes
> sense as a whole.  Don't design it piecemeal. The last thing we need
> is a crappy /proc alternative that covers a small fraction of use
> cases.

Oh well.

> And demonstrate that it actually has a material benefit over
> fixing /proc.

> Meanwhile, why not just fix /proc?

/proc can not be fixed. Let me reiterate the arguments one more time.

Using /proc
* overallocates memory even if temporarily,
* instantiates dentries and inodes nobody cares about,
* make expansion path more painful than necessary:
  1) adding field to the end of the file is the least risky move,
  2) decimal radix and unfixed positioning are used often
  3) read() doesn't accept any sort of filter for data

  (1)+(3) = those who added their fields later eat the cost of all
  previous fields even if their field is very simple and easy
  to calculate.

  (2)+(3) = lseek() can not be used as a substitute.

  4) adding new file is cleaner however every virtual file creates
     kernel objects which aren't interesting themselves.
     This has direct costs (more syscalls and lookups) and
     indirect costs (more garbage in d/icache and more flushing
     when the process dies)

  5) converting to strings eats cycles (and more cycles are consumed
     if userspace decides to convert it back to binary for whatever
     reasons)

     For those who didn't pay attention, first patch to make integer
     conversion faster made /proc/*/stat (?) 30% faster!
     This is how slow text can be.

Sysfs is overall better but solely because it has strict one-value-per-file
rule, so interfaces are cleaner and there is less room for mistakes.


Philosophically, text files create false impression that parsing text
is easy.

As an example grab a bunch of students and ask them to write a program
which parses something innocent like /proc/*/stat for process state.

The beartrap is that ->comm is not escaped for Space and ')' so naive
strchr() from the beginning doesn't work reliably. The only reliable
way is to count spaces from the end(!) and pray kernel devs do not
extend the file with another field or, worse, with another textual
field.

ESCAPE_SPACE doesn't escape Space, funny, isn't it?

/proc/*/status kind of has the same problem however forward strstr()
works with "\nState:\t" because \n and \t will be escaped. But nobody
would search for just "State:", especially in scripts, right?

EIATF people often critique binary people for their mistakes
but themselves make equally stupid ones. "(unreachable)" anyone?.

So the answer it not to fix /proc, the answer it to leave /proc alone.
The answer is make Unix shell people move their lazy asses and
implement minimal type system and a way to execute raw system calls
like all normal programming languages do. They still haven't done it
after dozens of years and are arrogant enough to say "oh uh we can't
use cat and pipe to awk".

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
  2017-10-19 15:34           ` Alexey Dobriyan
@ 2017-10-20  7:48             ` Greg KH
  2017-10-25 13:11                 ` Alexey Dobriyan
  2017-10-26  7:53               ` Andy Lutomirski
  1 sibling, 1 reply; 51+ messages in thread
From: Greg KH @ 2017-10-20  7:48 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andy Lutomirski, Andrei Vagin, Andrew Morton, linux-kernel,
	Linux API, Randy Dunlap, Thomas Gleixner, Djalal Harouni,
	Alexey Gladkov

On Thu, Oct 19, 2017 at 05:34:35PM +0200, Alexey Dobriyan wrote:
> So the answer it not to fix /proc, the answer it to leave /proc alone.

No, because:

> The answer is make Unix shell people move their lazy asses and
> implement minimal type system and a way to execute raw system calls
> like all normal programming languages do. They still haven't done it
> after dozens of years and are arrogant enough to say "oh uh we can't
> use cat and pipe to awk".

Have you submitted changes and patches to those projects in order to
implement such a thing?

If not, just complaining about it isn't going to change anything, you
know better than that :)

sorry,

greg k-h

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-10-23  9:29     ` Pavel Machek
  0 siblings, 0 replies; 51+ messages in thread
From: Pavel Machek @ 2017-10-23  9:29 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Alexey Dobriyan, akpm, linux-kernel, linux-api, rdunlap, tglx,
	tixxdz, gladkov.alexey, Aliaksandr Patseyenak

[-- Attachment #1: Type: text/plain, Size: 1651 bytes --]

Hi!

> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
> > 
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> > 
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> > 
> > /proc, the only way to learn anything about file descriptors may not be
> > available. There is unavoidable overhead associated with instantiating
> > 3 dentries and 3 inodes and converting integers to strings and back.
> > 
> > Benchmark:
> > 
> > 	N=1<<22 times
> > 	4 opened descriptors (0, 1, 2, 3)
> > 	opendir+readdir+closedir /proc/self/fd vs fdmap
> > 
> > 	/proc 8.31 ± 0.37%
> > 	fdmap 0.32 ± 0.72%
> 
> >From the text above, I'm still trying to understand: whose problem
> does this solve? I mean, we've lived with the daemon-close-all-files
> technique forever (and I'm not sure that performance is really an 
> important issue for the daemon case) . And you say that the effect

Binary fdmap looks... quite ugly to me. But close_all(from, to)
syscall kindof makes sense to me... and is not that ugly.

Given that openbsd has something similar... perhaps we can take that?

Except during startup, application should have good idea what fds it
has open, so maybe that's enough?
								Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-10-23  9:29     ` Pavel Machek
  0 siblings, 0 replies; 51+ messages in thread
From: Pavel Machek @ 2017-10-23  9:29 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Alexey Dobriyan, akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w, Aliaksandr Patseyenak

[-- Attachment #1: Type: text/plain, Size: 1671 bytes --]

Hi!

> > From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1-uRwfk40T5oI@public.gmane.org>
> > 
> > Implement system call for bulk retrieveing of opened descriptors
> > in binary form.
> > 
> > Some daemons could use it to reliably close file descriptors
> > before starting. Currently they close everything upto some number
> > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > (although lsof does so much in /proc that the effect is thoroughly buried).
> > 
> > /proc, the only way to learn anything about file descriptors may not be
> > available. There is unavoidable overhead associated with instantiating
> > 3 dentries and 3 inodes and converting integers to strings and back.
> > 
> > Benchmark:
> > 
> > 	N=1<<22 times
> > 	4 opened descriptors (0, 1, 2, 3)
> > 	opendir+readdir+closedir /proc/self/fd vs fdmap
> > 
> > 	/proc 8.31 ± 0.37%
> > 	fdmap 0.32 ± 0.72%
> 
> >From the text above, I'm still trying to understand: whose problem
> does this solve? I mean, we've lived with the daemon-close-all-files
> technique forever (and I'm not sure that performance is really an 
> important issue for the daemon case) . And you say that the effect

Binary fdmap looks... quite ugly to me. But close_all(from, to)
syscall kindof makes sense to me... and is not that ugly.

Given that openbsd has something similar... perhaps we can take that?

Except during startup, application should have good idea what fds it
has open, so maybe that's enough?
								Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-10-23  9:29     ` Pavel Machek
@ 2017-10-25 12:45       ` Alexey Dobriyan
  -1 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-25 12:45 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Michael Kerrisk (man-pages),
	akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey, Aliaksandr Patseyenak

On 10/23/17, Pavel Machek <pavel@ucw.cz> wrote:

> Binary fdmap looks... quite ugly to me. But close_all(from, to)
> syscall kindof makes sense to me... and is not that ugly.
>
> Given that openbsd has something similar... perhaps we can take that?

closefrom() can be implemented on top of fdmap().
fdmap as is is useful for other purposes.

> Except during startup, application should have good idea what fds it
> has open, so maybe that's enough?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
@ 2017-10-25 12:45       ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-25 12:45 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Michael Kerrisk (man-pages),
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, rdunlap-wEGCiKHe2LqWVfeAwA7xHQ,
	tglx-hfZtesqFncYOwBW4kG4KsQ, tixxdz-Re5JQEeQqe8AvxtiuMwx3w,
	gladkov.alexey-Re5JQEeQqe8AvxtiuMwx3w, Aliaksandr Patseyenak

On 10/23/17, Pavel Machek <pavel-+ZI9xUNit7I@public.gmane.org> wrote:

> Binary fdmap looks... quite ugly to me. But close_all(from, to)
> syscall kindof makes sense to me... and is not that ugly.
>
> Given that openbsd has something similar... perhaps we can take that?

closefrom() can be implemented on top of fdmap().
fdmap as is is useful for other purposes.

> Except during startup, application should have good idea what fds it
> has open, so maybe that's enough?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-25 13:11                 ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-25 13:11 UTC (permalink / raw)
  To: Greg KH
  Cc: Andy Lutomirski, Andrei Vagin, Andrew Morton, linux-kernel,
	Linux API, Randy Dunlap, Thomas Gleixner, Djalal Harouni,
	Alexey Gladkov

On 10/20/17, Greg KH <gregkh@linuxfoundation.org> wrote:
> On Thu, Oct 19, 2017 at 05:34:35PM +0200, Alexey Dobriyan wrote:
>> So the answer it not to fix /proc, the answer it to leave /proc alone.
>
> No, because:
>
>> The answer is make Unix shell people move their lazy asses and
>> implement minimal type system and a way to execute raw system calls
>> like all normal programming languages do. They still haven't done it
>> after dozens of years and are arrogant enough to say "oh uh we can't
>> use cat and pipe to awk".
>
> Have you submitted changes and patches to those projects in order to
> implement such a thing?

This is open source equivalent of "sod off".

> If not, just complaining about it isn't going to change anything, you
> know better than that :)

Greg, I'm not "just complaining", fdmap() was implemented and
sent for discussion.

Unix shell people seem to me to be anti-intellectual if after all
these years they didn't realize that their core ideas of "universal"
interface and serialization/deserialization at every opportunity
are wrong and stupid. It is best to avoid such people and not
interact with them. However what they can do is to not become
anchors asking for the world to revolve around them.

I'm not proposing to delete /proc, I'm proposing (kind of)
to leave it as is.

P.S.: even Perl 5 has "syscall" function!

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-25 13:11                 ` Alexey Dobriyan
  0 siblings, 0 replies; 51+ messages in thread
From: Alexey Dobriyan @ 2017-10-25 13:11 UTC (permalink / raw)
  To: Greg KH
  Cc: Andy Lutomirski, Andrei Vagin, Andrew Morton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov

On 10/20/17, Greg KH <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org> wrote:
> On Thu, Oct 19, 2017 at 05:34:35PM +0200, Alexey Dobriyan wrote:
>> So the answer it not to fix /proc, the answer it to leave /proc alone.
>
> No, because:
>
>> The answer is make Unix shell people move their lazy asses and
>> implement minimal type system and a way to execute raw system calls
>> like all normal programming languages do. They still haven't done it
>> after dozens of years and are arrogant enough to say "oh uh we can't
>> use cat and pipe to awk".
>
> Have you submitted changes and patches to those projects in order to
> implement such a thing?

This is open source equivalent of "sod off".

> If not, just complaining about it isn't going to change anything, you
> know better than that :)

Greg, I'm not "just complaining", fdmap() was implemented and
sent for discussion.

Unix shell people seem to me to be anti-intellectual if after all
these years they didn't realize that their core ideas of "universal"
interface and serialization/deserialization at every opportunity
are wrong and stupid. It is best to avoid such people and not
interact with them. However what they can do is to not become
anchors asking for the world to revolve around them.

I'm not proposing to delete /proc, I'm proposing (kind of)
to leave it as is.

P.S.: even Perl 5 has "syscall" function!

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 1/2 v2] fdmap(2)
  2017-10-25 12:45       ` Alexey Dobriyan
  (?)
@ 2017-10-25 13:48       ` Pavel Machek
  -1 siblings, 0 replies; 51+ messages in thread
From: Pavel Machek @ 2017-10-25 13:48 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Michael Kerrisk (man-pages),
	akpm, linux-kernel, linux-api, rdunlap, tglx, tixxdz,
	gladkov.alexey, Aliaksandr Patseyenak

[-- Attachment #1: Type: text/plain, Size: 762 bytes --]

On Wed 2017-10-25 14:45:31, Alexey Dobriyan wrote:
> On 10/23/17, Pavel Machek <pavel@ucw.cz> wrote:
> 
> > Binary fdmap looks... quite ugly to me. But close_all(from, to)
> > syscall kindof makes sense to me... and is not that ugly.
> >
> > Given that openbsd has something similar... perhaps we can take that?
> 
> closefrom() can be implemented on top of fdmap().
> fdmap as is is useful for other purposes.

fdmap can be implemented on top of open('/dev/kmem').
open('/dev/kmem') as is is useful for other purposes.

Yes, fdmap is more powerful. No, that does not mean it is better idea.
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-26  7:53               ` Andy Lutomirski
  0 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-10-26  7:53 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andy Lutomirski, Andrei Vagin, Andrew Morton, linux-kernel,
	Linux API, Randy Dunlap, Thomas Gleixner, Djalal Harouni,
	Alexey Gladkov

> On Oct 19, 2017, at 5:34 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
>
> On 10/18/17, Andy Lutomirski <luto@kernel.org> wrote:
>>> fdmap() is standalone thing.
>>>
>>> Next step is to design fdinfo(2) (?) which uses descriptors from
>>> fdmap(2). Extending structures is done as usual: but version,
>>> add new fields to the end.
>>
>> I very strongly disagree.  If you really want a new interface for
>> reading out information about other processes, design one that makes
>> sense as a whole.  Don't design it piecemeal. The last thing we need
>> is a crappy /proc alternative that covers a small fraction of use
>> cases.
>
> Oh well.
>
>> And demonstrate that it actually has a material benefit over
>> fixing /proc.
>
>> Meanwhile, why not just fix /proc?
>
> /proc can not be fixed. Let me reiterate the arguments one more time.
>
> Using /proc
> * overallocates memory even if temporarily,

This is fixable.  Maybe hard, but fixable.

> * instantiates dentries and inodes nobody cares about,

Your syscalls won't make a difference here because we can't just
remove /proc.  But IIRC we could give it a backing store like sysfs
has.

> * make expansion path more painful than necessary:
>  1) adding field to the end of the file is the least risky move,
>  2) decimal radix and unfixed positioning are used often

These two are fixable by adding extensible binary files.

>  3) read() doesn't accept any sort of filter for data

Indeed.  Doing this directly in /proc is hard.

>
>  (1)+(3) = those who added their fields later eat the cost of all
>  previous fields even if their field is very simple and easy
>  to calculate.
>
>  (2)+(3) = lseek() can not be used as a substitute.
>
>  4) adding new file is cleaner however every virtual file creates
>     kernel objects which aren't interesting themselves.
>     This has direct costs (more syscalls and lookups) and
>     indirect costs (more garbage in d/icache and more flushing
>     when the process dies)
>
>  5) converting to strings eats cycles (and more cycles are consumed
>     if userspace decides to convert it back to binary for whatever
>     reasons)

As above, fixable with new files.

>
>     For those who didn't pay attention, first patch to make integer
>     conversion faster made /proc/*/stat (?) 30% faster!
>     This is how slow text can be.

Alternatively, that's how much improvement is available without any
ABI change at all.

>
> Sysfs is overall better but solely because it has strict one-value-per-file
> rule, so interfaces are cleaner and there is less room for mistakes.
>
>
> Philosophically, text files create false impression that parsing text
> is easy.
>
> As an example grab a bunch of students and ask them to write a program
> which parses something innocent like /proc/*/stat for process state.
>
> The beartrap is that ->comm is not escaped for Space and ')' so naive
> strchr() from the beginning doesn't work reliably. The only reliable
> way is to count spaces from the end(!) and pray kernel devs do not
> extend the file with another field or, worse, with another textual
> field.
>
> ESCAPE_SPACE doesn't escape Space, funny, isn't it?
>
> /proc/*/status kind of has the same problem however forward strstr()
> works with "\nState:\t" because \n and \t will be escaped. But nobody
> would search for just "State:", especially in scripts, right?
>
> EIATF people often critique binary people for their mistakes
> but themselves make equally stupid ones. "(unreachable)" anyone?.
>
> So the answer it not to fix /proc, the answer it to leave /proc alone.
> The answer is make Unix shell people move their lazy asses and
> implement minimal type system and a way to execute raw system calls
> like all normal programming languages do. They still haven't done it
> after dozens of years and are arrogant enough to say "oh uh we can't
> use cat and pipe to awk"

I think that asking the bash maintainers to implement PowerShell does
not fall into the lazy ass category.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [1/2,v2] fdmap(2)
@ 2017-10-26  7:53               ` Andy Lutomirski
  0 siblings, 0 replies; 51+ messages in thread
From: Andy Lutomirski @ 2017-10-26  7:53 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andy Lutomirski, Andrei Vagin, Andrew Morton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux API, Randy Dunlap,
	Thomas Gleixner, Djalal Harouni, Alexey Gladkov

> On Oct 19, 2017, at 5:34 PM, Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>
> On 10/18/17, Andy Lutomirski <luto-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
>>> fdmap() is standalone thing.
>>>
>>> Next step is to design fdinfo(2) (?) which uses descriptors from
>>> fdmap(2). Extending structures is done as usual: but version,
>>> add new fields to the end.
>>
>> I very strongly disagree.  If you really want a new interface for
>> reading out information about other processes, design one that makes
>> sense as a whole.  Don't design it piecemeal. The last thing we need
>> is a crappy /proc alternative that covers a small fraction of use
>> cases.
>
> Oh well.
>
>> And demonstrate that it actually has a material benefit over
>> fixing /proc.
>
>> Meanwhile, why not just fix /proc?
>
> /proc can not be fixed. Let me reiterate the arguments one more time.
>
> Using /proc
> * overallocates memory even if temporarily,

This is fixable.  Maybe hard, but fixable.

> * instantiates dentries and inodes nobody cares about,

Your syscalls won't make a difference here because we can't just
remove /proc.  But IIRC we could give it a backing store like sysfs
has.

> * make expansion path more painful than necessary:
>  1) adding field to the end of the file is the least risky move,
>  2) decimal radix and unfixed positioning are used often

These two are fixable by adding extensible binary files.

>  3) read() doesn't accept any sort of filter for data

Indeed.  Doing this directly in /proc is hard.

>
>  (1)+(3) = those who added their fields later eat the cost of all
>  previous fields even if their field is very simple and easy
>  to calculate.
>
>  (2)+(3) = lseek() can not be used as a substitute.
>
>  4) adding new file is cleaner however every virtual file creates
>     kernel objects which aren't interesting themselves.
>     This has direct costs (more syscalls and lookups) and
>     indirect costs (more garbage in d/icache and more flushing
>     when the process dies)
>
>  5) converting to strings eats cycles (and more cycles are consumed
>     if userspace decides to convert it back to binary for whatever
>     reasons)

As above, fixable with new files.

>
>     For those who didn't pay attention, first patch to make integer
>     conversion faster made /proc/*/stat (?) 30% faster!
>     This is how slow text can be.

Alternatively, that's how much improvement is available without any
ABI change at all.

>
> Sysfs is overall better but solely because it has strict one-value-per-file
> rule, so interfaces are cleaner and there is less room for mistakes.
>
>
> Philosophically, text files create false impression that parsing text
> is easy.
>
> As an example grab a bunch of students and ask them to write a program
> which parses something innocent like /proc/*/stat for process state.
>
> The beartrap is that ->comm is not escaped for Space and ')' so naive
> strchr() from the beginning doesn't work reliably. The only reliable
> way is to count spaces from the end(!) and pray kernel devs do not
> extend the file with another field or, worse, with another textual
> field.
>
> ESCAPE_SPACE doesn't escape Space, funny, isn't it?
>
> /proc/*/status kind of has the same problem however forward strstr()
> works with "\nState:\t" because \n and \t will be escaped. But nobody
> would search for just "State:", especially in scripts, right?
>
> EIATF people often critique binary people for their mistakes
> but themselves make equally stupid ones. "(unreachable)" anyone?.
>
> So the answer it not to fix /proc, the answer it to leave /proc alone.
> The answer is make Unix shell people move their lazy asses and
> implement minimal type system and a way to execute raw system calls
> like all normal programming languages do. They still haven't done it
> after dozens of years and are arrogant enough to say "oh uh we can't
> use cat and pipe to awk"

I think that asking the bash maintainers to implement PowerShell does
not fall into the lazy ass category.

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2017-10-26  7:53 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-24 20:06 [PATCH 1/2 v2] fdmap(2) Alexey Dobriyan
2017-09-24 20:06 ` Alexey Dobriyan
2017-09-24 20:08 ` [PATCH v2 2/2] pidmap(2) Alexey Dobriyan
2017-09-24 20:08   ` Alexey Dobriyan
2017-09-24 21:27   ` Andy Lutomirski
2017-09-24 21:27     ` Andy Lutomirski
2017-09-26 18:46     ` Alexey Dobriyan
2017-09-26 18:46       ` Alexey Dobriyan
2017-09-27 15:04       ` Andy Lutomirski
2017-09-25  7:43   ` Michael Kerrisk (man-pages)
2017-09-25 10:47   ` Djalal Harouni
2017-09-25 10:47     ` Djalal Harouni
2017-09-26  5:44   ` kbuild test robot
2017-09-26  5:44     ` kbuild test robot
2017-09-24 21:31 ` [PATCH 1/2 v2] fdmap(2) Andy Lutomirski
2017-09-26 18:43   ` Alexey Dobriyan
2017-09-26 18:43     ` Alexey Dobriyan
2017-09-25  7:42 ` Michael Kerrisk (man-pages)
2017-09-25  7:42   ` Michael Kerrisk (man-pages)
2017-09-26 19:00   ` Alexey Dobriyan
2017-09-26 19:00     ` Alexey Dobriyan
2017-09-27 15:03     ` Andy Lutomirski
2017-09-27 15:03       ` Andy Lutomirski
2017-09-28  7:26       ` Michael Kerrisk (man-pages)
2017-09-28  7:26         ` Michael Kerrisk (man-pages)
2017-09-28 10:55         ` Alexey Dobriyan
2017-09-28 15:02           ` Andy Lutomirski
2017-09-28 15:02             ` Andy Lutomirski
2017-10-11 17:37             ` Alexey Dobriyan
2017-09-28 10:10       ` Alexey Dobriyan
2017-09-28 10:10         ` Alexey Dobriyan
2017-10-23  9:29   ` Pavel Machek
2017-10-23  9:29     ` Pavel Machek
2017-10-25 12:45     ` Alexey Dobriyan
2017-10-25 12:45       ` Alexey Dobriyan
2017-10-25 13:48       ` Pavel Machek
2017-09-26  4:25 ` kbuild test robot
2017-10-10 22:08 ` [1/2,v2] fdmap(2) Andrei Vagin
2017-10-10 22:08   ` Andrei Vagin
2017-10-11 18:12   ` Alexey Dobriyan
2017-10-12  8:06     ` Andrei Vagin
2017-10-12  8:06       ` Andrei Vagin
2017-10-18 11:35       ` Alexey Dobriyan
2017-10-18 11:35         ` Alexey Dobriyan
2017-10-18 17:47         ` Andy Lutomirski
2017-10-19 15:34           ` Alexey Dobriyan
2017-10-20  7:48             ` Greg KH
2017-10-25 13:11               ` Alexey Dobriyan
2017-10-25 13:11                 ` Alexey Dobriyan
2017-10-26  7:53             ` Andy Lutomirski
2017-10-26  7:53               ` Andy Lutomirski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.