linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] pidmap(2)
@ 2017-09-05 19:05 Alexey Dobriyan
  2017-09-05 19:06 ` [PATCH 2/2] fdmap(2) Alexey Dobriyan
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Alexey Dobriyan @ 2017-09-05 19:05 UTC (permalink / raw)
  To: akpm; +Cc: Tatsiana Brouka, linux-kernel, linux-api, Aliaksandr Patseyenak

From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>

Implement system call for bulk retrieveing of pids in binary form.

Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
converting with atoi().

/proc may be not mounted especially in containers. Natural extension of
hidepid=2 efforts is to not mount /proc at all.

It could be used by programs like ps, top or CRIU. Speed increase will
become more drastic once combined with bulk retrieval of process statistics.

Sample program:

#include <stdio.h>
static inline long sys_pidmap(int *pid, unsigned int n, int start)
{
	register long r10 asm ("r10") = 0;
	long rv;
	asm volatile (
		"syscall"
		: "=a" (rv)
		: "0" (333), "D" (pid), "S" (n), "d" (start), "r" (r10)
		: "rcx", "r11", "cc", "memory"
	);
	return rv;
}

int main(void)
{
	int pid[5];
	unsigned int start;
	int n;

	start = 0;
	while ((n = sys_pidmap(pid, sizeof(pid)/sizeof(pid[0]), start)) > 0) {
		int i;

		for (i = 0; i < n; i++) {
			printf(" %u", pid[i]);
		}
		printf("\n");
		start = pid[n - 1] + 1;
	}

	return 0;
}

Signed-off-by: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 arch/x86/entry/syscalls/syscall_64.tbl  |    1
 include/linux/syscalls.h                |    4
 kernel/Makefile                         |    2
 kernel/pidmap.c                         |  116 ++++++++++++++
 tools/testing/selftests/Makefile        |    1
 tools/testing/selftests/pidmap/Makefile |    5
 tools/testing/selftests/pidmap/pidmap.c |  263 ++++++++++++++++++++++++++++++++
 7 files changed, 392 insertions(+)

--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	pidmap			sys_pidmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -923,4 +923,8 @@ asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 
+asmlinkage long sys_pidmap(int __user *pids,
+			   unsigned int pids_count,
+			   unsigned int start_pid,
+			   int flags);
 #endif
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,8 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-y += pidmap.o
+
 obj-$(CONFIG_MULTIUSER) += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
--- /dev/null
+++ b/kernel/pidmap.c
@@ -0,0 +1,116 @@
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/kernel.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/rcupdate.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+/**
+ * pidmap - get allocated PIDs
+ * @pids: Destination buffer.
+ * @pids_count: number of elements in the buffer.
+ * @start_pid: PID to start from.
+ * @flags: flags, must be 0.
+ *
+ * Write allocated PIDs to a buffer starting from @start_pid (inclusive).
+ * PIDs are filled from pid namespace of the calling process POV:
+ * unshare(CLONE_NEWPID)+fork+pidmap in child will always return 1/1.
+ *
+ * pidmap(2) hides PIDs inaccessible at /proc mounted with "hide_pid" option.
+ *
+ * Note, pidmap(2) does not guarantee that any of returned PID exists
+ * by the time system call exits.
+ *
+ * Return: number of PIDs written to the buffer or error code otherwise.
+ */
+SYSCALL_DEFINE4(pidmap, int __user *, pids, unsigned int, pids_count,
+		unsigned int, start_pid, int, flags)
+{
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	unsigned int start_page, start_elem;
+	unsigned int last_pos = 0;
+	unsigned int last_set_pid = 0;
+	unsigned long mask;
+	bool has_perms = false;
+	unsigned int i;
+
+	if (flags)
+		return -EINVAL;
+
+	/*
+	 * Pid 0 does not exist, however, corresponding bit is always set in
+	 * ->pidmap[0].page, so we should skip it.
+	 */
+	if (start_pid == 0)
+		start_pid = 1;
+
+	if (start_pid > ns->last_pid)
+		return 0;
+
+	if (ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(ns->pid_gid))
+		has_perms = true;
+
+	start_page = start_pid / BITS_PER_PAGE;
+	start_elem = (start_pid % BITS_PER_PAGE) / BITS_PER_LONG;
+	mask = ~0UL << (start_pid % BITS_PER_LONG);
+
+	for (i = start_page; i < PIDMAP_ENTRIES; i++) {
+		unsigned int j;
+
+		/*
+		 * ->pidmap[].page is set once to a valid pointer,
+		 *  therefore do not take any locks.
+		 */
+		if (ns->pidmap[i].page == NULL)
+			continue;
+
+		for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) {
+			unsigned long val;
+
+			val = *((unsigned long *)ns->pidmap[i].page + j);
+			val &= mask;
+			mask = ~0UL;
+			while (val != 0) {
+				struct task_struct *task;
+
+				if (last_pos == pids_count)
+					return last_pos;
+
+				last_set_pid = i * BITS_PER_PAGE +
+					j * BITS_PER_LONG + __ffs(val);
+
+				if (has_perms)
+					goto write;
+
+				rcu_read_lock();
+				task = find_task_by_pid_ns(last_set_pid, ns);
+				if (!task) {
+					rcu_read_unlock();
+					goto next;
+				}
+				if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+					rcu_read_unlock();
+					goto next;
+				}
+				rcu_read_unlock();
+write:
+				if (put_user(last_set_pid, pids + last_pos))
+					return -EFAULT;
+				last_pos++;
+				if (last_set_pid == ns->last_pid)
+					return last_pos;
+next:
+				val &= (val - 1);
+			}
+
+		}
+		start_elem = 0;
+	}
+	if (last_set_pid == 0)
+		return 0;
+	else
+		return last_pos;
+}
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -20,6 +20,7 @@ TARGETS += mount
 TARGETS += mqueue
 TARGETS += net
 TARGETS += nsfs
+TARGETS += pidmap
 TARGETS += powerpc
 TARGETS += pstore
 TARGETS += ptrace
--- /dev/null
+++ b/tools/testing/selftests/pidmap/Makefile
@@ -0,0 +1,5 @@
+CFLAGS = -Wall
+
+TEST_GEN_PROGS := pidmap
+
+include ../lib.mk
--- /dev/null
+++ b/tools/testing/selftests/pidmap/pidmap.c
@@ -0,0 +1,263 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <dirent.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <signal.h>
+#include "../kselftest_harness.h"
+
+#define SIZE 512
+
+static inline long pidmap(int *pid, unsigned int count, unsigned int start_pid)
+{
+	long ret;
+
+	register long r10 asm("r10") = 0;
+
+	asm volatile ("syscall" : "=a"(ret) :
+		"0"(333), "D"(pid), "S"(count), "d"(start_pid), "r"(r10) :
+		"rcx", "r11", "cc", "memory");
+	return ret;
+}
+
+static int compare(const void *a, const void *b)
+{
+	return *((int *)a) > *((int *)b);
+}
+
+int pidmap_full(int **pid, unsigned int *res_count)
+{
+	int n;
+	int start_pid = 1;
+	*pid = (int *)malloc(SIZE * sizeof(int));
+	*res_count = 0;
+
+	while ((n = pidmap(*pid + *res_count, SIZE, start_pid)) > 0) {
+		*res_count += n;
+		*pid = (int *)realloc(*pid, (*res_count + SIZE) * sizeof(int));
+		start_pid = (*pid)[*res_count - 1] + 1;
+	}
+	return n;
+}
+
+int pidmap_proc(int **pid, unsigned int *n)
+{
+	DIR *dir = opendir("/proc");
+	struct dirent *dirs;
+
+	*n = 0;
+	*pid = NULL;
+
+	while ((dirs = readdir(dir))) {
+		char dname[32] = "";
+		DIR *task_dir;
+
+		if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9')
+			continue;
+
+		strcpy(dname, "/proc/");
+		strcat(dname, dirs->d_name);
+		strcat(dname, "/task");
+		task_dir = opendir(dname);
+
+		if (task_dir) {
+			struct dirent *task_dirs;
+
+			while ((task_dirs = readdir(task_dir))) {
+				if (task_dirs->d_name[0] < '0' ||
+						task_dirs->d_name[0] > '9')
+					continue;
+
+				*pid = (int *)realloc(*pid, (*n + 1) *
+								sizeof(int));
+				if (*pid == NULL)
+					return -1;
+				*(*pid + *n) = atoi(task_dirs->d_name);
+				*n += 1;
+			}
+		} else {
+			*pid = (int *)realloc(*pid, (*n + 1) * sizeof(int));
+			if (*pid == NULL)
+				return -1;
+			*(*pid + *n) = atoi(dirs->d_name);
+			*n += 1;
+		}
+		closedir(task_dir);
+	}
+	closedir(dir);
+	return 0;
+}
+
+TEST(bufsize)
+{
+	int pid[SIZE];
+
+	EXPECT_EQ(0, pidmap(pid, 0, 1));
+}
+
+TEST(get_pid)
+{
+	int pid;
+	int ret;
+
+	ret = pidmap(&pid, 1, getpid());
+	ASSERT_LE(0, ret);
+	EXPECT_EQ(getpid(), pid);
+}
+
+TEST(bad_start)
+{
+	int pid[SIZE];
+
+	ASSERT_LE(0, pidmap(pid, SIZE, -1));
+	ASSERT_LE(0, pidmap(pid, SIZE, ~0U));
+	ASSERT_LE(0, pidmap(pid, SIZE, 0));
+	EXPECT_EQ(1, pid[0]);
+}
+
+TEST(child_pid)
+{
+	pid_t pid = fork();
+
+	if (pid == 0)
+		pause();
+	else {
+		int ret;
+		int result = 0;
+
+		ret = pidmap(&result, 1, pid);
+		EXPECT_LE(0, ret);
+		EXPECT_EQ(pid, result);
+		kill(pid, SIGTERM);
+	}
+}
+
+int write_pidmax(int new_pidmax)
+{
+	char old_pidmax[32];
+	char new[32];
+	int fd = open("/proc/sys/kernel/pid_max", O_RDWR);
+
+	if (read(fd, old_pidmax, 32) <= 0)
+		printf("Read failed\n");
+	lseek(fd, 0, 0);
+	snprintf(new, sizeof(new), "%d", new_pidmax);
+	if (write(fd, new, strlen(new)) <= 0)
+		printf("Write failed\n");
+	close(fd);
+	return atoi(old_pidmax);
+}
+
+void do_forks(unsigned int n)
+{
+	while (n--) {
+		pid_t pid = fork();
+
+		if (pid == 0)
+			exit(0);
+		waitpid(pid, NULL, 0);
+	}
+}
+
+TEST(pid_max)
+{
+	int *pid;
+	unsigned int n;
+	int ret, p;
+	int old_pidmax;
+
+	old_pidmax = write_pidmax(50000);
+
+	do_forks(40000);
+
+	p = fork();
+
+	if (p == 0)
+		pause();
+
+	ret = pidmap_full(&pid, &n);
+
+	EXPECT_LE(0, ret);
+	EXPECT_EQ(p, pid[n - 1]);
+
+	kill(p, SIGKILL);
+	write_pidmax(old_pidmax);
+}
+
+TEST(compare_proc)
+{
+	pid_t pid;
+
+	if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1)
+		return;
+
+	pid = fork();
+
+	if (pid == 0) {
+		pid_t pid;
+		int i = 0;
+
+		mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL);
+		mount("proc", "/proc", "proc",
+			MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
+
+		while (i < 150) {
+			i++;
+
+			pid = fork();
+
+			if (pid == -1) {
+				wait(NULL);
+				umount("/proc");
+				return;
+			}
+			if (pid == 0) {
+				pause();
+				return;
+			}
+		}
+
+		int *pids, *pids_proc;
+		unsigned int n = 0;
+		unsigned int n_proc = 0;
+		int ret, ret_proc;
+
+		ret = pidmap_full(&pids, &n);
+
+		ret_proc = pidmap_proc(&pids_proc, &n_proc);
+		qsort(pids_proc, n_proc, sizeof(int), compare);
+
+		EXPECT_LE(0, ret);
+		EXPECT_EQ(n_proc, n);
+
+		if (ret <= 0 || ret_proc <= 0 || n != n_proc) {
+			killpg(0, SIGTERM);
+			wait(NULL);
+			umount("/proc");
+			free(pids);
+			free(pids_proc);
+			return;
+		}
+
+		for (int i = 0; i < n; i++) {
+			EXPECT_EQ(pids_proc[i], pids[i]);
+			if (pids_proc[i] != pids[i])
+				break;
+		}
+		EXPECT_EQ(1, pids[0]);
+
+		free(pids_proc);
+		free(pids);
+		killpg(0, SIGTERM);
+		wait(NULL);
+		umount("/proc");
+	}
+}
+
+TEST_HARNESS_MAIN

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/2] fdmap(2)
  2017-09-05 19:05 [PATCH 1/2] pidmap(2) Alexey Dobriyan
@ 2017-09-05 19:06 ` Alexey Dobriyan
  2017-09-05 22:53 ` [PATCH 1/2] pidmap(2) Andrew Morton
  2017-09-07 10:08 ` Dmitry V. Levin
  2 siblings, 0 replies; 12+ messages in thread
From: Alexey Dobriyan @ 2017-09-05 19:06 UTC (permalink / raw)
  To: akpm; +Cc: Tatsiana Brouka, linux-kernel, linux-api, Aliaksandr Patseyenak

From: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>

Implement system call for bulk retrieveing of opened descriptors
in binary form.

Some daemons could use it to reliably close file descriptors
before starting. Currently they close everything upto some number
which formally is not reliable. Other natural users are lsof(1) and CRIU
(although lsof does so much in /proc that the effect is thoroughly buried).

Once again, /proc, the only way to learn anything about file descriptors
may not be available.

Sample program:

#include <stdlib.h>
#include <stdio.h>

static inline long sys_fdmap(int pid, int *fd, unsigned int n, int start)
{
	register long r10 asm ("r10") = start;
	register long r8 asm ("r8") = 0;
	long rv;
	asm volatile (
		"syscall"
		: "=a" (rv)
		: "0" (334), "D" (pid), "S" (fd), "d" (n), "r" (r10), "r" (r8)
		: "rcx", "r11", "cc", "memory"
	);
	return rv;
}

int main(int argc, char *argv[])
{
	int fd[3];
	int pid;
	unsigned int start;
	int n;

	pid = 0;
	if (argc > 1)
		pid = atoi(argv[1]);

	start = 0;
	while ((n = sys_fdmap(pid, fd, sizeof(fd)/sizeof(fd[0]), start)) > 0) {
		unsigned int i;

		for (i = 0; i < n; i++) {
			printf(" %u", fd[i]);
		}
		printf("\n");
		start = fd[n - 1] + 1;
	}

	return 0;
}

Signed-off-by: Aliaksandr Patseyenak <Aliaksandr_Patseyenak1@epam.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 arch/x86/entry/syscalls/syscall_64.tbl     |    1 
 fs/Makefile                                |    2 
 fs/fdmap.c                                 |  105 +++++++++++++++++++
 include/linux/syscalls.h                   |    2 
 tools/testing/selftests/fdmap/.gitignore   |    1 
 tools/testing/selftests/fdmap/Makefile     |    7 +
 tools/testing/selftests/fdmap/fdmap.c      |  112 +++++++++++++++++++++
 tools/testing/selftests/fdmap/fdmap.h      |   12 ++
 tools/testing/selftests/fdmap/fdmap_test.c |  153 +++++++++++++++++++++++++++++
 9 files changed, 394 insertions(+), 1 deletion(-)

--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -340,6 +340,7 @@
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
 333	common	pidmap			sys_pidmap
+334	common	fdmap			sys_fdmap
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o fdmap.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
--- /dev/null
+++ b/fs/fdmap.c
@@ -0,0 +1,105 @@
+#include <linux/bitops.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+/**
+ * fdmap - get opened file descriptors of a process
+ * @pid: the pid of the target process
+ * @fds: allocated userspace buffer
+ * @count: buffer size (in descriptors)
+ * @start_fd: first descriptor to search from (inclusive)
+ * @flags: reserved for future functionality, must be zero
+ *
+ * If @pid is zero then it's current process.
+ * Return: number of descriptors written. An error code otherwise.
+ */
+SYSCALL_DEFINE5(fdmap, pid_t, pid, int __user *, fds, unsigned int, count,
+		int, start_fd, int, flags)
+{
+	struct task_struct *task;
+	struct files_struct *files;
+	unsigned long search_mask;
+	unsigned int user_index, offset;
+	int masksize;
+
+	if (start_fd < 0 || flags != 0)
+		return -EINVAL;
+
+	if (!pid) {
+		files = get_files_struct(current);
+	} else {
+		rcu_read_lock();
+		task = find_task_by_vpid(pid);
+		if (!task) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+			rcu_read_unlock();
+			return -EACCES;
+		}
+		files = get_files_struct(task);
+		rcu_read_unlock();
+	}
+	if (!files)
+		return 0;
+
+	offset = start_fd / BITS_PER_LONG;
+	search_mask = ULONG_MAX << (start_fd % BITS_PER_LONG);
+	user_index = 0;
+#define FDS_BUF_SIZE	(1024/sizeof(unsigned long))
+	masksize = FDS_BUF_SIZE;
+	while (user_index < count && masksize == FDS_BUF_SIZE) {
+		unsigned long open_fds[FDS_BUF_SIZE];
+		struct fdtable *fdt;
+		unsigned int i;
+
+		/*
+		 * fdt->max_fds can grow, get it every time
+		 * before copying part into internal buffer.
+		 */
+		rcu_read_lock();
+		fdt = files_fdtable(files);
+		masksize = fdt->max_fds / 8 - offset * sizeof(long);
+		if (masksize < 0) {
+			rcu_read_unlock();
+			break;
+		}
+		masksize = min(masksize, (int)sizeof(open_fds));
+		memcpy(open_fds, fdt->open_fds + offset, masksize);
+		rcu_read_unlock();
+
+		open_fds[0] &= search_mask;
+		search_mask = ULONG_MAX;
+		masksize = (masksize + sizeof(long) - 1) / sizeof(long);
+		start_fd = offset * BITS_PER_LONG;
+		/*
+		 * for_each_set_bit_from() can re-read first word
+		 * multiple times which is not optimal.
+		 */
+		for (i = 0; i < masksize; i++) {
+			unsigned long mask = open_fds[i];
+
+			while (mask) {
+				unsigned int real_fd = start_fd + __ffs(mask);
+
+				if (put_user(real_fd, fds + user_index)) {
+					put_files_struct(files);
+					return -EFAULT;
+				}
+				if (++user_index >= count)
+					goto out;
+				mask &= mask - 1;
+			}
+			start_fd += BITS_PER_LONG;
+		}
+		offset += FDS_BUF_SIZE;
+	}
+out:
+	put_files_struct(files);
+
+	return user_index;
+}
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -922,6 +922,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_fdmap(pid_t pid, int __user *fds, unsigned int count,
+			  int start_fd, int flags);
 
 asmlinkage long sys_pidmap(int __user *pids,
 			   unsigned int pids_count,
--- /dev/null
+++ b/tools/testing/selftests/fdmap/.gitignore
@@ -0,0 +1 @@
+fdmap_test
--- /dev/null
+++ b/tools/testing/selftests/fdmap/Makefile
@@ -0,0 +1,7 @@
+TEST_GEN_PROGS := fdmap_test
+CFLAGS += -Wall
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): fdmap_test.c fdmap.c fdmap.h ../kselftest_harness.h
+	$(CC) $(CFLAGS) $(LDFLAGS) $< fdmap.c -o $@
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include "fdmap.h"
+
+#define	BUF_SIZE	1024
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags)
+{
+	register int64_t r10 asm("r10") = start_fd;
+	register int64_t r8 asm("r8") = flags;
+	long ret;
+
+	asm volatile (
+		"syscall"
+		: "=a"(ret)
+		: "0" (334),
+		  "D" (pid), "S" (fds), "d" (count), "r" (r10), "r" (r8)
+		: "rcx", "r11", "cc", "memory"
+	);
+	return ret;
+}
+
+int fdmap_full(pid_t pid, int **fds, size_t *n)
+{
+	int buf[BUF_SIZE], start_fd = 0;
+	long ret;
+
+	*n = 0;
+	*fds = NULL;
+	for (;;) {
+		int *new_buff;
+
+		ret = fdmap(pid, buf, BUF_SIZE, start_fd, 0);
+		if (ret < 0)
+			break;
+		if (!ret)
+			return 0;
+
+		new_buff = realloc(*fds, (*n + ret) * sizeof(int));
+		if (!new_buff) {
+			ret = -errno;
+			break;
+		}
+		*fds = new_buff;
+		memcpy(*fds + *n, buf, ret * sizeof(int));
+		*n += ret;
+		start_fd = (*fds)[*n - 1] + 1;
+	}
+	free(*fds);
+	*fds = NULL;
+	return -ret;
+}
+
+int fdmap_proc(pid_t pid, int **fds, size_t *n)
+{
+	char fds_path[20];
+	int dir_fd = 0;
+	struct dirent *fd_link;
+	DIR *fds_dir;
+
+	*fds = NULL;
+	*n = 0;
+	if (!pid)
+		strcpy(fds_path, "/proc/self/fd");
+	else
+		sprintf(fds_path, "/proc/%d/fd", pid);
+
+	fds_dir = opendir(fds_path);
+	if (!fds_dir)
+		return errno == ENOENT ? ESRCH : errno;
+	if (!pid)
+		dir_fd = dirfd(fds_dir);
+
+	while ((fd_link = readdir(fds_dir))) {
+		if (fd_link->d_name[0] < '0'
+		    || fd_link->d_name[0] > '9')
+			continue;
+		if (*n % BUF_SIZE == 0) {
+			int *new_buff;
+
+			new_buff = realloc(*fds, (*n + BUF_SIZE) * sizeof(int));
+			if (!new_buff) {
+				int ret = errno;
+
+				free(*fds);
+				*fds = NULL;
+				return ret;
+			}
+			*fds = new_buff;
+		}
+		(*fds)[*n] = atoi(fd_link->d_name);
+		*n += 1;
+	}
+	closedir(fds_dir);
+
+	if (!pid) {
+		size_t i;
+
+		for (i = 0; i < *n; i++)
+			if ((*fds)[i] == dir_fd)
+				break;
+		i++;
+		memmove(*fds + i - 1, *fds + i, (*n - i) * sizeof(int));
+		(*n)--;
+	}
+	return 0;
+}
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap.h
@@ -0,0 +1,12 @@
+#ifndef FDMAP_H
+#define FDMAP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+long fdmap(pid_t pid, int *fds, size_t count, int start_fd, int flags);
+int fdmap_full(pid_t pid, int **fds, size_t *n);
+int fdmap_proc(pid_t pid, int **fds, size_t *n);
+
+#endif
--- /dev/null
+++ b/tools/testing/selftests/fdmap/fdmap_test.c
@@ -0,0 +1,153 @@
+#include <errno.h>
+#include <syscall.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <limits.h>
+#include "../kselftest_harness.h"
+#include "fdmap.h"
+
+TEST(efault) {
+	int ret;
+
+	ret = syscall(334, 0, NULL, 20 * sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EFAULT, errno);
+}
+
+TEST(big_start_fd) {
+	int fds[1];
+	int ret;
+
+	ret = syscall(334, 0, fds, sizeof(int), INT_MAX, 0);
+	ASSERT_EQ(0, ret);
+}
+
+TEST(einval) {
+	int ret;
+
+	ret = syscall(334, 0, NULL, 0, -1, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+
+	ret = syscall(334, 0, NULL, 0, 0, 1);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(esrch) {
+	int fds[1], ret;
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid)
+		exit(0);
+	waitpid(pid, NULL, 0);
+
+	ret = syscall(334, pid, fds, sizeof(int), 0, 0);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(ESRCH, errno);
+}
+
+TEST(simple) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(init) {
+	int *fds1, *fds2;
+	size_t size1, size2, i;
+	int ret1, ret2;
+
+	ret1 = fdmap_full(1, &fds1, &size1);
+	ret2 = fdmap_proc(1, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(zero) {
+	int *fds, i;
+	size_t size;
+	int ret;
+
+	ret = fdmap_proc(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	for (i = 0; i < size; i++)
+		close(fds[i]);
+	free(fds);
+	fds = NULL;
+
+	ret = fdmap_full(0, &fds, &size);
+	ASSERT_EQ(0, ret);
+	ASSERT_EQ(0, size);
+}
+
+TEST(more_fds) {
+	int *fds1, *fds2, ret1, ret2;
+	size_t size1, size2, i;
+
+	struct rlimit rlim = {
+		.rlim_cur = 600000,
+		.rlim_max = 600000
+	};
+	ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlim));
+	for (int i = 0; i < 500000; i++)
+		dup(0);
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+}
+
+TEST(child) {
+	int pipefd[2];
+	int *fds1, *fds2, ret1, ret2, i;
+	size_t size1, size2;
+	char byte = 0;
+	pid_t pid;
+
+	ASSERT_NE(-1, pipe(pipefd));
+	pid = fork();
+	ASSERT_NE(-1, pid);
+	if (!pid) {
+		read(pipefd[0], &byte, 1);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	ret1 = fdmap_full(0, &fds1, &size1);
+	ret2 = fdmap_proc(0, &fds2, &size2);
+	ASSERT_EQ(ret2, ret1);
+	ASSERT_EQ(size2, size1);
+	for (i = 0; i < size1; i++)
+		ASSERT_EQ(fds2[i], fds1[i]);
+	free(fds1);
+	free(fds2);
+
+	write(pipefd[1], &byte, 1);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-05 19:05 [PATCH 1/2] pidmap(2) Alexey Dobriyan
  2017-09-05 19:06 ` [PATCH 2/2] fdmap(2) Alexey Dobriyan
@ 2017-09-05 22:53 ` Andrew Morton
  2017-09-05 23:02   ` Randy Dunlap
  2017-09-06  8:55   ` Alexey Dobriyan
  2017-09-07 10:08 ` Dmitry V. Levin
  2 siblings, 2 replies; 12+ messages in thread
From: Andrew Morton @ 2017-09-05 22:53 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Tatsiana Brouka, linux-kernel, linux-api, Aliaksandr Patseyenak

On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com> wrote:

> Implement system call for bulk retrieveing of pids in binary form.
> 
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi().
> 
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
> 
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.

The patches are performance optimizations, but their changelogs contain
no performance measurements!

Demonstration of some compelling real-world performance benefits would
help things along a lot.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-05 22:53 ` [PATCH 1/2] pidmap(2) Andrew Morton
@ 2017-09-05 23:02   ` Randy Dunlap
  2017-09-06  8:30     ` Thomas Gleixner
  2017-09-06  9:04     ` Alexey Dobriyan
  2017-09-06  8:55   ` Alexey Dobriyan
  1 sibling, 2 replies; 12+ messages in thread
From: Randy Dunlap @ 2017-09-05 23:02 UTC (permalink / raw)
  To: Andrew Morton, Alexey Dobriyan
  Cc: Tatsiana Brouka, linux-kernel, linux-api, Aliaksandr Patseyenak

On 09/05/17 15:53, Andrew Morton wrote:
> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com> wrote:
> 
>> Implement system call for bulk retrieveing of pids in binary form.
>>
>> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
>> converting with atoi().
>>
>> /proc may be not mounted especially in containers. Natural extension of
>> hidepid=2 efforts is to not mount /proc at all.
>>
>> It could be used by programs like ps, top or CRIU. Speed increase will
>> become more drastic once combined with bulk retrieval of process statistics.
> 
> The patches are performance optimizations, but their changelogs contain
> no performance measurements!
> 
> Demonstration of some compelling real-world performance benefits would
> help things along a lot.
> 

also, I expect that the tiny kernel people will want kconfig options for
these syscalls.

-- 
~Randy

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-05 23:02   ` Randy Dunlap
@ 2017-09-06  8:30     ` Thomas Gleixner
  2017-09-06  9:04     ` Alexey Dobriyan
  1 sibling, 0 replies; 12+ messages in thread
From: Thomas Gleixner @ 2017-09-06  8:30 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Andrew Morton, Alexey Dobriyan, Tatsiana Brouka, linux-kernel,
	linux-api, Aliaksandr Patseyenak

On Tue, 5 Sep 2017, Randy Dunlap wrote:
> On 09/05/17 15:53, Andrew Morton wrote:
> > On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com> wrote:
> > 
> >> Implement system call for bulk retrieveing of pids in binary form.
> >>
> >> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> >> converting with atoi().
> >>
> >> /proc may be not mounted especially in containers. Natural extension of
> >> hidepid=2 efforts is to not mount /proc at all.
> >>
> >> It could be used by programs like ps, top or CRIU. Speed increase will
> >> become more drastic once combined with bulk retrieval of process statistics.
> > 
> > The patches are performance optimizations, but their changelogs contain
> > no performance measurements!
> > 
> > Demonstration of some compelling real-world performance benefits would
> > help things along a lot.
> > 
> 
> also, I expect that the tiny kernel people will want kconfig options for
> these syscalls.

And of course that stuff wants the corresponding man pages written up.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-05 22:53 ` [PATCH 1/2] pidmap(2) Andrew Morton
  2017-09-05 23:02   ` Randy Dunlap
@ 2017-09-06  8:55   ` Alexey Dobriyan
  1 sibling, 0 replies; 12+ messages in thread
From: Alexey Dobriyan @ 2017-09-06  8:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tatsiana Brouka, linux-kernel, linux-api, Aliaksandr Patseyenak

[-- Attachment #1: Type: text/plain, Size: 1545 bytes --]

On 9/6/17, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com>
> wrote:
>
>> Implement system call for bulk retrieveing of pids in binary form.
>>
>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>> thread +
>> converting with atoi().
>>
>> /proc may be not mounted especially in containers. Natural extension of
>> hidepid=2 efforts is to not mount /proc at all.
>>
>> It could be used by programs like ps, top or CRIU. Speed increase will
>> become more drastic once combined with bulk retrieval of process
>> statistics.
>
> The patches are performance optimizations, but their changelogs contain
> no performance measurements!
>
> Demonstration of some compelling real-world performance benefits would
> help things along a lot.

I forgot the sheet with numbers at work. :^)
They're very embarrassing for /proc.

pidmap:
N=1<<16 times
~130 processes (~250 task_structs) on a regular desktop system
opendir + readdir + closedir /proc + the same for every /proc/$PID/task
(roughly what htop(1) does) vs pidmap

/proc 16.80+-0.73%
pidmap 0.06+-0.31%

fdmap:
N=1<<22 times
4 opened descriptors (0, 1, 2, 3)
opendir+readdir+closedir /proc/self/fd (lsof(1)) vs fdmap

/proc 8.31+-0.37%
fdmap 0.32+-0.72%

Currently performance improvements may not be huge or even visible.
That's because programs like ps/top/lsof _have_ to use /proc to retrieve
other information. If combined with bulk taskstats-ish retrieval interfaces
they should run around /proc.

[-- Attachment #2: b-pidmap.c --]
[-- Type: text/x-csrc, Size: 1087 bytes --]

#include <sys/types.h>
#include <stdio.h>
#include <dirent.h>
#include <stdlib.h>
#include <unistd.h>

void f(void)
{
	DIR *d;
	struct dirent *de;

	d = opendir("/proc/");
	while ((de = readdir(d))) {
		if ('1' <= de->d_name[0] && de->d_name[0] <= '9') {
			int pid = atoi(de->d_name);
			char buf[32];
			DIR *dt;
			struct dirent *dte;

			snprintf(buf, sizeof(buf), "/proc/%d/task", pid);
			dt = opendir(buf);
			readdir(dt);
			readdir(dt);
			while ((dte = readdir(dt))) {
				int tid = atoi(dte->d_name);
				asm volatile ("" :: "g" (&tid) : "memory");
			}
			closedir(dt);
		}
	}
	closedir(d);
}

static inline long sys_pidmap(int *pid, unsigned int n, int start)
{
	register long r10 asm ("r10") = 0;
	long rv;
	asm volatile (
		"syscall"
		: "=a" (rv)
		: "0" (333), "D" (pid), "S" (n), "d" (start), "r" (r10)
		: "rcx", "r11", "cc", "memory"
	);
	return rv;
}

void g(void)
{
	int pid[1024];

	sys_pidmap(pid, sizeof(pid)/sizeof(pid[0]), 0);
}

int main(void)
{
	unsigned int i;

//	for (i = 0; i < (1<<16); i++)
//		f();

	for (i = 0; i < (1<<16); i++)
		g();

	return 0;
}

[-- Attachment #3: b-fdmap.c --]
[-- Type: text/x-csrc, Size: 823 bytes --]

#include <sys/types.h>
#include <dirent.h>
#include <stdlib.h>
#include <unistd.h>

void f(void)
{
	DIR *d;
	struct dirent *de;

	d = opendir("/proc/self/fd");
	while ((de = readdir(d))) {
		int fd = atoi(de->d_name);
		asm volatile ("" :: "g" (&fd) : "memory");
	}
	closedir(d);
}

static inline long sys_fdmap(int pid, int *fd, unsigned int n, int start)
{
	register long r10 asm ("r10") = start;
	register long r8 asm ("r8") = 0;
	long rv;
	asm volatile (
		"syscall"
		: "=a" (rv)
		: "0" (334), "D" (pid), "S" (fd), "d" (n), "r" (r10), "r" (r8)
		: "rcx", "r11", "cc", "memory"
	);
	return rv;
}

void g(void)
{
	int fd[1024];

	sys_fdmap(0, fd, sizeof(fd)/sizeof(fd[0]), 0);
}

int main(void)
{
	unsigned int i;

//	for (i = 0; i < (1<<22); i++)
//		f();

	dup(0);
	for (i = 0; i < (1<<22); i++)
		g();

	return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-05 23:02   ` Randy Dunlap
  2017-09-06  8:30     ` Thomas Gleixner
@ 2017-09-06  9:04     ` Alexey Dobriyan
  2017-09-07  2:04       ` Andy Lutomirski
  1 sibling, 1 reply; 12+ messages in thread
From: Alexey Dobriyan @ 2017-09-06  9:04 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Andrew Morton, Tatsiana Brouka, linux-kernel, linux-api,
	Aliaksandr Patseyenak

On 9/6/17, Randy Dunlap <rdunlap@infradead.org> wrote:
> On 09/05/17 15:53, Andrew Morton wrote:
>> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com>
>> wrote:
>>
>>> Implement system call for bulk retrieveing of pids in binary form.
>>>
>>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>>> thread +
>>> converting with atoi().
>>>
>>> /proc may be not mounted especially in containers. Natural extension of
>>> hidepid=2 efforts is to not mount /proc at all.
>>>
>>> It could be used by programs like ps, top or CRIU. Speed increase will
>>> become more drastic once combined with bulk retrieval of process
>>> statistics.
>>
>> The patches are performance optimizations, but their changelogs contain
>> no performance measurements!
>>
>> Demonstration of some compelling real-world performance benefits would
>> help things along a lot.
>>
>
> also, I expect that the tiny kernel people will want kconfig options for
> these syscalls.

We'll add it but the question if it is a good idea. Ideally these system calls
should be mandatory and /proc optional.

$ size kernel/pidmap.o fs/fdmap.o
   text    data     bss     dec     hex filename
    560       0       0     560     230 kernel/pidmap.o
    617       0       0     617     269 fs/fdmap.o

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-06  9:04     ` Alexey Dobriyan
@ 2017-09-07  2:04       ` Andy Lutomirski
  2017-09-07  5:06         ` Djalal Harouni
  2017-09-07  9:43         ` Alexey Dobriyan
  0 siblings, 2 replies; 12+ messages in thread
From: Andy Lutomirski @ 2017-09-07  2:04 UTC (permalink / raw)
  To: Alexey Dobriyan, Djalal Harouni
  Cc: Randy Dunlap, Andrew Morton, Tatsiana Brouka, linux-kernel,
	Linux API, Aliaksandr Patseyenak

On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> On 9/6/17, Randy Dunlap <rdunlap@infradead.org> wrote:
>> On 09/05/17 15:53, Andrew Morton wrote:
>>> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com>
>>> wrote:
>>>
>>>> Implement system call for bulk retrieveing of pids in binary form.
>>>>
>>>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>>>> thread +
>>>> converting with atoi().
>>>>
>>>> /proc may be not mounted especially in containers. Natural extension of
>>>> hidepid=2 efforts is to not mount /proc at all.
>>>>
>>>> It could be used by programs like ps, top or CRIU. Speed increase will
>>>> become more drastic once combined with bulk retrieval of process
>>>> statistics.
>>>
>>> The patches are performance optimizations, but their changelogs contain
>>> no performance measurements!
>>>
>>> Demonstration of some compelling real-world performance benefits would
>>> help things along a lot.
>>>
>>
>> also, I expect that the tiny kernel people will want kconfig options for
>> these syscalls.
>
> We'll add it but the question if it is a good idea. Ideally these system calls
> should be mandatory and /proc optional.
>
> $ size kernel/pidmap.o fs/fdmap.o
>    text    data     bss     dec     hex filename
>     560       0       0     560     230 kernel/pidmap.o
>     617       0       0     617     269 fs/fdmap.o

After much discussion at LPC/KS last year, I thought the idea was to
try to speed up /proc rather than replacing it outright.  The two
specific ideas I recall were:

1. Add a syscall like readfileat() that you can use to, in a single
operation, open, read, and close a /proc file (or other file).  This
should vastly reduce locking and RCU overhead.

2. Add a /proc file that has a nice binary format for task info.  (nl_attr?)

I don't see why pidmap() deserves to be significantly faster than getdents().

Also, a pidmap() syscall like this inherently bypasses any security
restrictions implied by the way that /proc is mounted.  It can respect
hidepid, but hidepid (as a per-namespace concept) is an enormous turd
that badly needs to be deprecated, and Djalal is working on exactly
that.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-07  2:04       ` Andy Lutomirski
@ 2017-09-07  5:06         ` Djalal Harouni
  2017-09-07  9:47           ` Alexey Dobriyan
  2017-09-07  9:43         ` Alexey Dobriyan
  1 sibling, 1 reply; 12+ messages in thread
From: Djalal Harouni @ 2017-09-07  5:06 UTC (permalink / raw)
  To: Andy Lutomirski, Alexey Dobriyan
  Cc: Randy Dunlap, Andrew Morton, Tatsiana Brouka, linux-kernel,
	Linux API, Aliaksandr Patseyenak, Alexey Gladkov

Hi Alexey,

On Thu, Sep 7, 2017 at 4:04 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
>> On 9/6/17, Randy Dunlap <rdunlap@infradead.org> wrote:
>>> On 09/05/17 15:53, Andrew Morton wrote:
[...]
>>>
>>> also, I expect that the tiny kernel people will want kconfig options for
>>> these syscalls.
>>
>> We'll add it but the question if it is a good idea. Ideally these system calls
>> should be mandatory and /proc optional.
>>
>> $ size kernel/pidmap.o fs/fdmap.o
>>    text    data     bss     dec     hex filename
>>     560       0       0     560     230 kernel/pidmap.o
>>     617       0       0     617     269 fs/fdmap.o
>
> After much discussion at LPC/KS last year, I thought the idea was to
> try to speed up /proc rather than replacing it outright.  The two
> specific ideas I recall were:
>
> 1. Add a syscall like readfileat() that you can use to, in a single
> operation, open, read, and close a /proc file (or other file).  This
> should vastly reduce locking and RCU overhead.
>
> 2. Add a /proc file that has a nice binary format for task info.  (nl_attr?)
>
> I don't see why pidmap() deserves to be significantly faster than getdents().
>
> Also, a pidmap() syscall like this inherently bypasses any security
> restrictions implied by the way that /proc is mounted.  It can respect
> hidepid, but hidepid (as a per-namespace concept) is an enormous turd
> that badly needs to be deprecated, and Djalal is working on exactly
> that.

Yes as noted by Andy, me and Alexey Gladkov are working on modernizing
procfs [1] and to reduce/remove ties within pid namespaces which has lot
of problems now.

We just picked the task again, and this was the result of discussion
with Andy some months ago, on how to improve hidepid, but also how to
improve procfs in general, so we can add other mechanisms to hide or return
NULL on other /proc/_file_not_needed_by_containers_  or
/proc/_specific_module_files_ everything that is not virtualized , or mount only
some specific view of the whole /proc API this will also be used by containers.
This also should make it hard for attackers since we are planning to have
a backward compatible options on how to better treat some of these files in
regard of some namespaces.

The syscall or readfileat() for one operation is a nice addition
definitively. But
in general it would be better to treat /proc as a filesystem and not add other
specific interfaces that may abstract it with pidns, as it is the situation now
which make it from userspace perspective: hard to use especially for security
context.

Alexey, could you please Cc'us on future, thank you very much!


[1] https://lkml.org/lkml/2017/4/25/282

-- 
tixxdz

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-07  2:04       ` Andy Lutomirski
  2017-09-07  5:06         ` Djalal Harouni
@ 2017-09-07  9:43         ` Alexey Dobriyan
  1 sibling, 0 replies; 12+ messages in thread
From: Alexey Dobriyan @ 2017-09-07  9:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Djalal Harouni, Randy Dunlap, Andrew Morton, Tatsiana Brouka,
	linux-kernel, Linux API, Aliaksandr Patseyenak

On 9/7/17, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <adobriyan@gmail.com>
> wrote:
>> On 9/6/17, Randy Dunlap <rdunlap@infradead.org> wrote:
>>> On 09/05/17 15:53, Andrew Morton wrote:
>>>> On Tue, 5 Sep 2017 22:05:00 +0300 Alexey Dobriyan <adobriyan@gmail.com>
>>>> wrote:
>>>>
>>>>> Implement system call for bulk retrieveing of pids in binary form.
>>>>>
>>>>> Using /proc is slower than necessary: 3 syscalls + another 3 for each
>>>>> thread +
>>>>> converting with atoi().
>>>>>
>>>>> /proc may be not mounted especially in containers. Natural extension
>>>>> of
>>>>> hidepid=2 efforts is to not mount /proc at all.
>>>>>
>>>>> It could be used by programs like ps, top or CRIU. Speed increase will
>>>>> become more drastic once combined with bulk retrieval of process
>>>>> statistics.
>>>>
>>>> The patches are performance optimizations, but their changelogs contain
>>>> no performance measurements!
>>>>
>>>> Demonstration of some compelling real-world performance benefits would
>>>> help things along a lot.
>>>>
>>>
>>> also, I expect that the tiny kernel people will want kconfig options for
>>> these syscalls.
>>
>> We'll add it but the question if it is a good idea. Ideally these system
>> calls
>> should be mandatory and /proc optional.
>>
>> $ size kernel/pidmap.o fs/fdmap.o
>>    text    data     bss     dec     hex filename
>>     560       0       0     560     230 kernel/pidmap.o
>>     617       0       0     617     269 fs/fdmap.o
>
> After much discussion at LPC/KS last year, I thought the idea was to
> try to speed up /proc rather than replacing it outright.  The two
> specific ideas I recall were:
>
> 1. Add a syscall like readfileat() that you can use to, in a single
> operation, open, read, and close a /proc file (or other file).  This
> should vastly reduce locking and RCU overhead.
>
> 2. Add a /proc file that has a nice binary format for task info.
> (nl_attr?)

If you do binary data in /proc there is no need for /proc part.
System call can do everything /proc/$PID/bstat (or whatever the name)
does.

> I don't see why pidmap() deserves to be significantly faster than
> getdents().

Just look at profile. XXX is pure slowdown. _Some_ of it can be deleted
or sped up but not everything. All dcache stuff is unavoidable.

XXX	6.35% [k] number
OK*	5.21% [k] proc_readfd_common  (* partially XXX)
OK	4.19% [k] __rcu_read_unlock
XXX	4.05% [.] __GI_____strtoll_l_internal
XXX	3.73% [k] dput
OK	3.64% [k] entry_SYSCALL_64_fastpath
XXX	3.23% [k] proc_fill_cache
XXX	3.10% [k] __d_lookup
XXX	3.09% [k] filldir
XXX	2.74% [k] format_decode
XXX	2.47% [k] link_path_walk
OK*	2.26% [k] _raw_spin_lock
OK	1.73% [k] get_files_struct
XXX	1.64% [k] __d_lookup_rcu
XXX	1.61% [k] do_sys_open
XXX	1.49% [k] pid_revalidate
OK	1.48% [k] __check_object_size
XXX	1.47% [k] do_filp_open
?	1.44% [.] __memmove_sse2
OK	1.40% [k] __rcu_read_lock
XXX	1.33% [.] __readdir64
XXX	1.32% [k] __follow_mount_rcu.isra.6
XXX	1.30% [k] set_root
XXX	1.27% [k] lookup_fast
XXX	1.23% [k] full_name_hash
OK?	1.17% [k] call_rcu
XXX	1.17% [k] sys_open
?	1.02% [k] lockref_put_or_lock
XXX	1.00% [k] pid_delete_dentry
XXX	0.99% [k] iterate_dir
XXX	0.95% [k] inode_permission
XXX	0.94% [k] __slab_alloc.isra.22.constprop.26
OK	0.93% [k] rcu_process_callbacks
XXX	0.93% [.] __getdents64
XXX	0.93% [k] vsnprintf
XXX	0.92% [k] sys_close

> Also, a pidmap() syscall like this inherently bypasses any security
> restrictions implied by the way that /proc is mounted.  It can respect
> hidepid, but hidepid (as a per-namespace concept) is an enormous turd
> that badly needs to be deprecated, and Djalal is working on exactly
> that.

I agree pid_ns->hide_pid is silly idea. It should be a property of
an individual mount but as posted pidmap() respect it (at a cost of
some slowdown).

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-07  5:06         ` Djalal Harouni
@ 2017-09-07  9:47           ` Alexey Dobriyan
  0 siblings, 0 replies; 12+ messages in thread
From: Alexey Dobriyan @ 2017-09-07  9:47 UTC (permalink / raw)
  To: Djalal Harouni
  Cc: Andy Lutomirski, Randy Dunlap, Andrew Morton, Tatsiana Brouka,
	linux-kernel, Linux API, Aliaksandr Patseyenak, Alexey Gladkov

On 9/7/17, Djalal Harouni <tixxdz@gmail.com> wrote:
> Hi Alexey,
>
> On Thu, Sep 7, 2017 at 4:04 AM, Andy Lutomirski <luto@amacapital.net>
> wrote:
>> On Wed, Sep 6, 2017 at 2:04 AM, Alexey Dobriyan <adobriyan@gmail.com>
>> wrote:
>>> On 9/6/17, Randy Dunlap <rdunlap@infradead.org> wrote:
>>>> On 09/05/17 15:53, Andrew Morton wrote:
> [...]
>>>>
>>>> also, I expect that the tiny kernel people will want kconfig options
>>>> for
>>>> these syscalls.
>>>
>>> We'll add it but the question if it is a good idea. Ideally these system
>>> calls
>>> should be mandatory and /proc optional.
>>>
>>> $ size kernel/pidmap.o fs/fdmap.o
>>>    text    data     bss     dec     hex filename
>>>     560       0       0     560     230 kernel/pidmap.o
>>>     617       0       0     617     269 fs/fdmap.o
>>
>> After much discussion at LPC/KS last year, I thought the idea was to
>> try to speed up /proc rather than replacing it outright.  The two
>> specific ideas I recall were:
>>
>> 1. Add a syscall like readfileat() that you can use to, in a single
>> operation, open, read, and close a /proc file (or other file).  This
>> should vastly reduce locking and RCU overhead.
>>
>> 2. Add a /proc file that has a nice binary format for task info.
>> (nl_attr?)
>>
>> I don't see why pidmap() deserves to be significantly faster than
>> getdents().
>>
>> Also, a pidmap() syscall like this inherently bypasses any security
>> restrictions implied by the way that /proc is mounted.  It can respect
>> hidepid, but hidepid (as a per-namespace concept) is an enormous turd
>> that badly needs to be deprecated, and Djalal is working on exactly
>> that.
>
> Yes as noted by Andy, me and Alexey Gladkov are working on modernizing
> procfs [1] and to reduce/remove ties within pid namespaces which has lot
> of problems now.
> ...

Kudos for digging into this mess.
But the question will remain: how get pids of existing processes quickly.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] pidmap(2)
  2017-09-05 19:05 [PATCH 1/2] pidmap(2) Alexey Dobriyan
  2017-09-05 19:06 ` [PATCH 2/2] fdmap(2) Alexey Dobriyan
  2017-09-05 22:53 ` [PATCH 1/2] pidmap(2) Andrew Morton
@ 2017-09-07 10:08 ` Dmitry V. Levin
  2 siblings, 0 replies; 12+ messages in thread
From: Dmitry V. Levin @ 2017-09-07 10:08 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: akpm, Tatsiana Brouka, linux-kernel, linux-api,
	Aliaksandr Patseyenak, Andrey Vagin

[-- Attachment #1: Type: text/plain, Size: 756 bytes --]

On Tue, Sep 05, 2017 at 10:05:00PM +0300, Alexey Dobriyan wrote:
> From: Tatsiana Brouka <Tatsiana_Brouka@epam.com>
> 
> Implement system call for bulk retrieveing of pids in binary form.
> 
> Using /proc is slower than necessary: 3 syscalls + another 3 for each thread +
> converting with atoi().
> 
> /proc may be not mounted especially in containers. Natural extension of
> hidepid=2 efforts is to not mount /proc at all.
> 
> It could be used by programs like ps, top or CRIU. Speed increase will
> become more drastic once combined with bulk retrieval of process statistics.

What could give a noticeable performance gain in a less ridiculous way
is an interface like task_diag, see https://lkml.org/lkml/2016/4/11/924 .


-- 
ldv

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2017-09-07 10:08 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-05 19:05 [PATCH 1/2] pidmap(2) Alexey Dobriyan
2017-09-05 19:06 ` [PATCH 2/2] fdmap(2) Alexey Dobriyan
2017-09-05 22:53 ` [PATCH 1/2] pidmap(2) Andrew Morton
2017-09-05 23:02   ` Randy Dunlap
2017-09-06  8:30     ` Thomas Gleixner
2017-09-06  9:04     ` Alexey Dobriyan
2017-09-07  2:04       ` Andy Lutomirski
2017-09-07  5:06         ` Djalal Harouni
2017-09-07  9:47           ` Alexey Dobriyan
2017-09-07  9:43         ` Alexey Dobriyan
2017-09-06  8:55   ` Alexey Dobriyan
2017-09-07 10:08 ` Dmitry V. Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).