* [PATCH v3] proc: faster open/read/close with "permanent" files
@ 2020-02-22 20:15 Alexey Dobriyan
2020-02-22 20:39 ` Joe Perches
0 siblings, 1 reply; 5+ messages in thread
From: Alexey Dobriyan @ 2020-02-22 20:15 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-fsdevel
Now that "struct proc_ops" exist we can start putting there stuff which
could not fly with VFS "struct file_operations"...
Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
in the event of disappearing /proc entries which usually happens if module is
getting removed. Files like /proc/cpuinfo which never disappear simply do not
need such protection.
Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
"permanent" files.
Enable "permanent" flag for
/proc/cpuinfo
/proc/kmsg
/proc/modules
/proc/slabinfo
/proc/stat
/proc/sysvipc/*
/proc/swaps
More will come once I figure out foolproof way to prevent out module
authors from marking their stuff "permanent" for performance reasons
when it is not.
This should help with scalability: benchmark is "read /proc/cpuinfo R times
by N threads scattered over the system".
N R t, s (before) t, s (after)
-----------------------------------------------------
64 4096 1.582458 1.530502 -3.2%
256 4096 6.371926 6.125168 -3.9%
1024 4096 25.64888 24.47528 -4.6%
Benchmark source:
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
const int NR_CPUS = sysconf(_SC_NPROCESSORS_ONLN);
int N;
const char *filename;
int R;
int xxx = 0;
int glue(int n)
{
cpu_set_t m;
CPU_ZERO(&m);
CPU_SET(n, &m);
return sched_setaffinity(0, sizeof(cpu_set_t), &m);
}
void f(int n)
{
glue(n % NR_CPUS);
while (*(volatile int *)&xxx == 0) {
}
for (int i = 0; i < R; i++) {
int fd = open(filename, O_RDONLY);
char buf[4096];
ssize_t rv = read(fd, buf, sizeof(buf));
asm volatile ("" :: "g" (rv));
close(fd);
}
}
int main(int argc, char *argv[])
{
if (argc < 4) {
std::cerr << "usage: " << argv[0] << ' ' << "N /proc/filename R\n";
return 1;
}
N = atoi(argv[1]);
filename = argv[2];
R = atoi(argv[3]);
for (int i = 0; i < NR_CPUS; i++) {
if (glue(i) == 0)
break;
}
std::vector<std::thread> T;
T.reserve(N);
for (int i = 0; i < N; i++) {
T.emplace_back(f, i);
}
auto t0 = std::chrono::system_clock::now();
{
*(volatile int *)&xxx = 1;
for (auto& t: T) {
t.join();
}
}
auto t1 = std::chrono::system_clock::now();
std::chrono::duration<double> dt = t1 - t0;
std::cout << dt.count() << '\n';
return 0;
}
P.S.:
Explicit randomization marker is added because adding non-function pointer
will silently disable structure layout randomization.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
---
v2: fix unlock on error path
v3: more comments
fs/proc/cpuinfo.c | 1
fs/proc/generic.c | 33 +++++++-
fs/proc/inode.c | 187 +++++++++++++++++++++++++++++++++++-------------
fs/proc/internal.h | 6 +
fs/proc/kmsg.c | 1
fs/proc/stat.c | 1
include/linux/proc_fs.h | 17 ++++
ipc/util.c | 1
kernel/module.c | 1
mm/slab_common.c | 1
mm/swapfile.c | 1
11 files changed, 196 insertions(+), 54 deletions(-)
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
}
static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = cpuinfo_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -531,6 +531,13 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+ if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) {
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ }
+}
+
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -541,6 +548,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -572,6 +580,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
}
static const struct proc_ops proc_seq_ops = {
+ /* not permanent -- can call into arbitrary seq_operations */
.proc_open = proc_seq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
@@ -602,6 +611,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
}
static const struct proc_ops proc_single_ops = {
+ /* not permanent -- can call into arbitrary ->single_show */
.proc_open = proc_single_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
@@ -662,9 +672,14 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de) {
- rb_erase(&de->subdir_node, &parent->subdir);
- if (S_ISDIR(de->mode)) {
- parent->nlink--;
+ if (unlikely(pde_is_permanent(de))) {
+ WARN(1, "removing permanent /proc entry '%s'", de->name);
+ de = NULL;
+ } else {
+ rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode)) {
+ parent->nlink--;
+ }
}
}
write_unlock(&proc_subdir_lock);
@@ -700,12 +715,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ root->parent->name, root->name);
+ return -EINVAL;
+ }
rb_erase(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ next->parent->name, next->name);
+ return -EINVAL;
+ }
rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -195,135 +195,204 @@ void proc_entry_rundown(struct proc_dir_entry *de)
spin_unlock(&de->pde_unload_lock);
}
+static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
+{
+ typeof_member(struct proc_ops, proc_lseek) lseek;
+
+ lseek = pde->proc_ops->proc_lseek;
+ if (!lseek)
+ lseek = default_llseek;
+ return lseek(file, offset, whence);
+}
+
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_lseek) lseek;
- lseek = pde->proc_ops->proc_lseek;
- if (!lseek)
- lseek = default_llseek;
- rv = lseek(file, offset, whence);
+ if (pde_is_permanent(pde)) {
+ return pde_lseek(pde, file, offset, whence);
+ } else if (use_pde(pde)) {
+ rv = pde_lseek(pde, file, offset, whence);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_read) read;
+
+ read = pde->proc_ops->proc_read;
+ if (read)
+ return read(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_read) read;
- read = pde->proc_ops->proc_read;
- if (read)
- rv = read(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_read(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_read(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_write) write;
+
+ write = pde->proc_ops->proc_write;
+ if (write)
+ return write(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_write) write;
- write = pde->proc_ops->proc_write;
- if (write)
- rv = write(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_write(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_write(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+ typeof_member(struct proc_ops, proc_poll) poll;
+
+ poll = pde->proc_ops->proc_poll;
+ if (poll)
+ return poll(file, pts);
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
__poll_t rv = DEFAULT_POLLMASK;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_poll) poll;
- poll = pde->proc_ops->proc_poll;
- if (poll)
- rv = poll(file, pts);
+ if (pde_is_permanent(pde)) {
+ return pde_poll(pde, file, pts);
+ } else if (use_pde(pde)) {
+ rv = pde_poll(pde, file, pts);
unuse_pde(pde);
}
return rv;
}
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+ ioctl = pde->proc_ops->proc_ioctl;
+ if (ioctl)
+ return ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
- ioctl = pde->proc_ops->proc_ioctl;
- if (ioctl)
- rv = ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+ compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ if (compat_ioctl)
+ return compat_ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_compat_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_compat_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#endif
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+ typeof_member(struct proc_ops, proc_mmap) mmap;
+
+ mmap = pde->proc_ops->proc_mmap;
+ if (mmap)
+ return mmap(file, vma);
+ return -EIO;
+}
+
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_mmap) mmap;
- mmap = pde->proc_ops->proc_mmap;
- if (mmap)
- rv = mmap(file, vma);
+ if (pde_is_permanent(pde)) {
+ return pde_mmap(pde, file, vma);
+ } else if (use_pde(pde)) {
+ rv = pde_mmap(pde, file, vma);
unuse_pde(pde);
}
return rv;
}
static unsigned long
-proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- struct proc_dir_entry *pde = PDE(file_inode(file));
- unsigned long rv = -EIO;
-
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+ typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
- get_area = pde->proc_ops->proc_get_unmapped_area;
+ get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
#endif
+ if (get_area)
+ return get_area(file, orig_addr, len, pgoff, flags);
+ return orig_addr;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ unsigned long rv = -EIO;
- if (get_area)
- rv = get_area(file, orig_addr, len, pgoff, flags);
- else
- rv = orig_addr;
+ if (pde_is_permanent(pde)) {
+ return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ } else if (use_pde(pde)) {
+ rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
unuse_pde(pde);
}
return rv;
@@ -337,6 +406,13 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (pde_is_permanent(pde)) {
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+ return rv;
+ }
+
/*
* Ensure that
* 1) PDE's ->release hook will be called no matter what
@@ -386,6 +462,17 @@ static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
struct pde_opener *pdeo;
+
+ if (pde_is_permanent(pde)) {
+ typeof_member(struct proc_ops, proc_release) release;
+
+ release = pde->proc_ops->proc_release;
+ if (release) {
+ return release(inode, file);
+ }
+ return 0;
+ }
+
spin_lock(&pde->pde_unload_lock);
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,7 @@ struct proc_dir_entry {
struct rb_node subdir_node;
char *name;
umode_t mode;
+ u8 flags;
u8 namelen;
char inline_name[];
} __randomize_layout;
@@ -73,6 +74,11 @@ struct proc_dir_entry {
0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)
static const struct proc_ops kmsg_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_read = kmsg_read,
.proc_poll = kmsg_poll,
.proc_open = kmsg_open,
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file)
}
static const struct proc_ops stat_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -5,6 +5,7 @@
#ifndef _LINUX_PROC_FS_H
#define _LINUX_PROC_FS_H
+#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/fs.h>
@@ -12,7 +13,21 @@ struct proc_dir_entry;
struct seq_file;
struct seq_operations;
+enum {
+ /*
+ * All /proc entries using this ->proc_ops instance are never removed.
+ *
+ * If in doubt, ignore this flag.
+ */
+#ifdef MODULE
+ PROC_ENTRY_PERMANENT = 0U,
+#else
+ PROC_ENTRY_PERMANENT = 1U << 0,
+#endif
+};
+
struct proc_ops {
+ unsigned int proc_flags;
int (*proc_open)(struct inode *, struct file *);
ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *);
ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
@@ -25,7 +40,7 @@ struct proc_ops {
#endif
int (*proc_mmap)(struct file *, struct vm_area_struct *);
unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-};
+} __randomize_layout;
#ifdef CONFIG_PROC_FS
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -885,6 +885,7 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file)
}
static const struct proc_ops sysvipc_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = sysvipc_proc_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file)
}
static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = modules_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file)
}
static const struct proc_ops slabinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = slabinfo_open,
.proc_read = seq_read,
.proc_write = slabinfo_write,
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file)
}
static const struct proc_ops swaps_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = swaps_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3] proc: faster open/read/close with "permanent" files
2020-02-22 20:15 [PATCH v3] proc: faster open/read/close with "permanent" files Alexey Dobriyan
@ 2020-02-22 20:39 ` Joe Perches
2020-02-23 11:30 ` Alexey Dobriyan
0 siblings, 1 reply; 5+ messages in thread
From: Joe Perches @ 2020-02-22 20:39 UTC (permalink / raw)
To: Alexey Dobriyan, akpm; +Cc: linux-kernel, linux-fsdevel
On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> Now that "struct proc_ops" exist we can start putting there stuff which
> could not fly with VFS "struct file_operations"...
>
> Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> in the event of disappearing /proc entries which usually happens if module is
> getting removed. Files like /proc/cpuinfo which never disappear simply do not
> need such protection.
>
> Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> "permanent" files.
>
> Enable "permanent" flag for
>
> /proc/cpuinfo
> /proc/kmsg
> /proc/modules
> /proc/slabinfo
> /proc/stat
> /proc/sysvipc/*
> /proc/swaps
>
> More will come once I figure out foolproof way to prevent out module
> authors from marking their stuff "permanent" for performance reasons
> when it is not.
>
> This should help with scalability: benchmark is "read /proc/cpuinfo R times
> by N threads scattered over the system".
Is this an actual expected use-case?
Is there some additional unnecessary memory consumption
in the unscaled systems?
And trivia:
> static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
> {
[]
> + if (pde_is_permanent(pde)) {
> + return pde_lseek(pde, file, offset, whence);
> + } else if (use_pde(pde)) {
> + rv = pde_lseek(pde, file, offset, whence);
> unuse_pde(pde);
> }
> return rv;
> }
[]
> static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> {
> struct proc_dir_entry *pde = PDE(file_inode(file));
> ssize_t rv = -EIO;
> - if (use_pde(pde)) {
> - typeof_member(struct proc_ops, proc_read) read;
>
> - read = pde->proc_ops->proc_read;
> - if (read)
> - rv = read(file, buf, count, ppos);
> + if (pde_is_permanent(pde)) {
> + return pde_read(pde, file, buf, count, ppos);
> + } else if (use_pde(pde)) {
> + rv = pde_read(pde, file, buf, count, ppos);
> unuse_pde(pde);
Perhaps all the function call duplication could be minimized
by using code without direct returns like:
rv = pde_read(pde, file, buf, count, pos);
if (!pde_is_permanent(pde))
unuse_pde(pde);
return rv;
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3] proc: faster open/read/close with "permanent" files
2020-02-22 20:39 ` Joe Perches
@ 2020-02-23 11:30 ` Alexey Dobriyan
2020-02-24 2:48 ` Joe Perches
0 siblings, 1 reply; 5+ messages in thread
From: Alexey Dobriyan @ 2020-02-23 11:30 UTC (permalink / raw)
To: Joe Perches; +Cc: akpm, linux-kernel, linux-fsdevel
On Sat, Feb 22, 2020 at 12:39:39PM -0800, Joe Perches wrote:
> On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> > Now that "struct proc_ops" exist we can start putting there stuff which
> > could not fly with VFS "struct file_operations"...
> >
> > Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> > in the event of disappearing /proc entries which usually happens if module is
> > getting removed. Files like /proc/cpuinfo which never disappear simply do not
> > need such protection.
> >
> > Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> > "permanent" files.
> >
> > Enable "permanent" flag for
> >
> > /proc/cpuinfo
> > /proc/kmsg
> > /proc/modules
> > /proc/slabinfo
> > /proc/stat
> > /proc/sysvipc/*
> > /proc/swaps
> >
> > More will come once I figure out foolproof way to prevent out module
> > authors from marking their stuff "permanent" for performance reasons
> > when it is not.
> >
> > This should help with scalability: benchmark is "read /proc/cpuinfo R times
> > by N threads scattered over the system".
>
> Is this an actual expected use-case?
Yes.
> Is there some additional unnecessary memory consumption
> in the unscaled systems?
No, it's the opposite. Less memory usage for everyone and noticeable
performance improvement for contented case.
> > static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> > {
> > struct proc_dir_entry *pde = PDE(file_inode(file));
> > ssize_t rv = -EIO;
> > - if (use_pde(pde)) {
> > - typeof_member(struct proc_ops, proc_read) read;
> >
> > - read = pde->proc_ops->proc_read;
> > - if (read)
> > - rv = read(file, buf, count, ppos);
> > + if (pde_is_permanent(pde)) {
> > + return pde_read(pde, file, buf, count, ppos);
> > + } else if (use_pde(pde)) {
> > + rv = pde_read(pde, file, buf, count, ppos);
> > unuse_pde(pde);
>
> Perhaps all the function call duplication could be minimized
> by using code without direct returns like:
>
> rv = pde_read(pde, file, buf, count, pos);
> if (!pde_is_permanent(pde))
> unuse_pde(pde);
>
> return rv;
Function call non-duplication is false goal.
Surprisingly it makes code bigger:
$ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10)
Function old new delta
proc_reg_read 108 118 +10
and worse too: "rv" is carried on stack through "unuse_pde" call.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3] proc: faster open/read/close with "permanent" files
2020-02-23 11:30 ` Alexey Dobriyan
@ 2020-02-24 2:48 ` Joe Perches
2020-02-24 17:55 ` Alexey Dobriyan
0 siblings, 1 reply; 5+ messages in thread
From: Joe Perches @ 2020-02-24 2:48 UTC (permalink / raw)
To: Alexey Dobriyan; +Cc: akpm, linux-kernel, linux-fsdevel
On Sun, 2020-02-23 at 14:30 +0300, Alexey Dobriyan wrote:
> On Sat, Feb 22, 2020 at 12:39:39PM -0800, Joe Perches wrote:
> > On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> > > Now that "struct proc_ops" exist we can start putting there stuff which
> > > could not fly with VFS "struct file_operations"...
> > >
> > > Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> > > in the event of disappearing /proc entries which usually happens if module is
> > > getting removed. Files like /proc/cpuinfo which never disappear simply do not
> > > need such protection.
> > >
> > > Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> > > "permanent" files.
> > >
> > > Enable "permanent" flag for
> > >
> > > /proc/cpuinfo
> > > /proc/kmsg
> > > /proc/modules
> > > /proc/slabinfo
> > > /proc/stat
> > > /proc/sysvipc/*
> > > /proc/swaps
> > >
> > > More will come once I figure out foolproof way to prevent out module
> > > authors from marking their stuff "permanent" for performance reasons
> > > when it is not.
> > >
> > > This should help with scalability: benchmark is "read /proc/cpuinfo R times
> > > by N threads scattered over the system".
> >
> > Is this an actual expected use-case?
>
> Yes.
>
> > Is there some additional unnecessary memory consumption
> > in the unscaled systems?
>
> No, it's the opposite. Less memory usage for everyone and noticeable
> performance improvement for contented case.
>
> > > static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> > > {
> > > struct proc_dir_entry *pde = PDE(file_inode(file));
> > > ssize_t rv = -EIO;
> > > - if (use_pde(pde)) {
> > > - typeof_member(struct proc_ops, proc_read) read;
> > >
> > > - read = pde->proc_ops->proc_read;
> > > - if (read)
> > > - rv = read(file, buf, count, ppos);
> > > + if (pde_is_permanent(pde)) {
> > > + return pde_read(pde, file, buf, count, ppos);
> > > + } else if (use_pde(pde)) {
> > > + rv = pde_read(pde, file, buf, count, ppos);
> > > unuse_pde(pde);
> >
> > Perhaps all the function call duplication could be minimized
> > by using code without direct returns like:
> >
> > rv = pde_read(pde, file, buf, count, pos);
> > if (!pde_is_permanent(pde))
> > unuse_pde(pde);
> >
> > return rv;
>
> Function call non-duplication is false goal.
Depends, copy/paste errors are common and object code
size generally increases.
> Surprisingly it makes code bigger:
Not so far as I can tell. Are you sure?
> $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
> add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10)
> Function old new delta
> proc_reg_read 108 118 +10
>
> and worse too: "rv" is carried on stack through "unuse_pde" call.
With gcc 9.2.1 x86-64 defconfig:
Changing just proc_reg_read to:
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv;
rv = pde_read(pde, file, buf, count, ppos);
if (use_pde(pde))
unuse_pde(pde);
return rv;
}
gives:
$ size fs/proc/inode.o*
text data bss dec hex filename
5448 28 0 5476 1564 fs/proc/inode.o.suggested
5526 28 0 5554 15b2 fs/proc/inode.o.alexey
$ objdump -d fs/proc/inode.o.suggested (grep for proc_reg_read)
0000000000000410 <proc_reg_read>:
410: 41 54 push %r12
412: 55 push %rbp
413: 48 8b 47 20 mov 0x20(%rdi),%rax
417: 48 8b 68 d0 mov -0x30(%rax),%rbp
41b: 48 8b 45 30 mov 0x30(%rbp),%rax
41f: 48 8b 40 10 mov 0x10(%rax),%rax
423: 48 85 c0 test %rax,%rax
426: 74 28 je 450 <proc_reg_read+0x40>
428: e8 00 00 00 00 callq 42d <proc_reg_read+0x1d>
42d: 49 89 c4 mov %rax,%r12
430: 8b 45 00 mov 0x0(%rbp),%eax
433: 85 c0 test %eax,%eax
435: 78 12 js 449 <proc_reg_read+0x39>
437: 8d 50 01 lea 0x1(%rax),%edx
43a: f0 0f b1 55 00 lock cmpxchg %edx,0x0(%rbp)
43f: 75 f2 jne 433 <proc_reg_read+0x23>
441: 48 89 ef mov %rbp,%rdi
444: e8 e7 fc ff ff callq 130 <unuse_pde>
449: 4c 89 e0 mov %r12,%rax
44c: 5d pop %rbp
44d: 41 5c pop %r12
44f: c3 retq
450: 49 c7 c4 fb ff ff ff mov $0xfffffffffffffffb,%r12
457: eb d7 jmp 430 <proc_reg_read+0x20>
459: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
vs
$ objdump -d fs/proc/inode.o.alexey (grep for proc_reg_read)
00000000000006e0 <proc_reg_read>:
6e0: 41 54 push %r12
6e2: 55 push %rbp
6e3: 48 8b 47 20 mov 0x20(%rdi),%rax
6e7: 48 8b 68 d0 mov -0x30(%rax),%rbp
6eb: f6 85 aa 00 00 00 01 testb $0x1,0xaa(%rbp)
6f2: 74 15 je 709 <proc_reg_read+0x29>
6f4: 48 8b 45 30 mov 0x30(%rbp),%rax
6f8: 48 8b 40 10 mov 0x10(%rax),%rax
6fc: 48 85 c0 test %rax,%rax
6ff: 74 3f je 740 <proc_reg_read+0x60>
701: 5d pop %rbp
702: 41 5c pop %r12
704: e9 00 00 00 00 jmpq 709 <proc_reg_read+0x29>
709: 8b 45 00 mov 0x0(%rbp),%eax
70c: 85 c0 test %eax,%eax
70e: 78 30 js 740 <proc_reg_read+0x60>
710: 44 8d 40 01 lea 0x1(%rax),%r8d
714: f0 44 0f b1 45 00 lock cmpxchg %r8d,0x0(%rbp)
71a: 75 f0 jne 70c <proc_reg_read+0x2c>
71c: 48 8b 45 30 mov 0x30(%rbp),%rax
720: 48 8b 40 10 mov 0x10(%rax),%rax
724: 48 85 c0 test %rax,%rax
727: 74 20 je 749 <proc_reg_read+0x69>
729: e8 00 00 00 00 callq 72e <proc_reg_read+0x4e>
72e: 49 89 c4 mov %rax,%r12
731: 48 89 ef mov %rbp,%rdi
734: e8 f7 f9 ff ff callq 130 <unuse_pde>
739: 4c 89 e0 mov %r12,%rax
73c: 5d pop %rbp
73d: 41 5c pop %r12
73f: c3 retq
740: 49 c7 c4 fb ff ff ff mov $0xfffffffffffffffb,%r12
747: eb f0 jmp 739 <proc_reg_read+0x59>
749: 49 c7 c4 fb ff ff ff mov $0xfffffffffffffffb,%r12
750: eb df jmp 731 <proc_reg_read+0x51>
752: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
759: 00 00 00 00
75d: 0f 1f 00 nopl (%rax)
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3] proc: faster open/read/close with "permanent" files
2020-02-24 2:48 ` Joe Perches
@ 2020-02-24 17:55 ` Alexey Dobriyan
0 siblings, 0 replies; 5+ messages in thread
From: Alexey Dobriyan @ 2020-02-24 17:55 UTC (permalink / raw)
To: Joe Perches; +Cc: akpm, linux-kernel, linux-fsdevel
On Sun, Feb 23, 2020 at 06:48:38PM -0800, Joe Perches wrote:
> On Sun, 2020-02-23 at 14:30 +0300, Alexey Dobriyan wrote:
> > On Sat, Feb 22, 2020 at 12:39:39PM -0800, Joe Perches wrote:
> > > On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> > > > Now that "struct proc_ops" exist we can start putting there stuff which
> > > > could not fly with VFS "struct file_operations"...
> > > >
> > > > Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> > > > in the event of disappearing /proc entries which usually happens if module is
> > > > getting removed. Files like /proc/cpuinfo which never disappear simply do not
> > > > need such protection.
> > > >
> > > > Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> > > > "permanent" files.
> > > >
> > > > Enable "permanent" flag for
> > > >
> > > > /proc/cpuinfo
> > > > /proc/kmsg
> > > > /proc/modules
> > > > /proc/slabinfo
> > > > /proc/stat
> > > > /proc/sysvipc/*
> > > > /proc/swaps
> > > >
> > > > More will come once I figure out foolproof way to prevent out module
> > > > authors from marking their stuff "permanent" for performance reasons
> > > > when it is not.
> > > >
> > > > This should help with scalability: benchmark is "read /proc/cpuinfo R times
> > > > by N threads scattered over the system".
> > >
> > > Is this an actual expected use-case?
> >
> > Yes.
> >
> > > Is there some additional unnecessary memory consumption
> > > in the unscaled systems?
> >
> > No, it's the opposite. Less memory usage for everyone and noticeable
> > performance improvement for contented case.
> >
> > > > static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> > > > {
> > > > struct proc_dir_entry *pde = PDE(file_inode(file));
> > > > ssize_t rv = -EIO;
> > > > - if (use_pde(pde)) {
> > > > - typeof_member(struct proc_ops, proc_read) read;
> > > >
> > > > - read = pde->proc_ops->proc_read;
> > > > - if (read)
> > > > - rv = read(file, buf, count, ppos);
> > > > + if (pde_is_permanent(pde)) {
> > > > + return pde_read(pde, file, buf, count, ppos);
> > > > + } else if (use_pde(pde)) {
> > > > + rv = pde_read(pde, file, buf, count, ppos);
> > > > unuse_pde(pde);
> > >
> > > Perhaps all the function call duplication could be minimized
> > > by using code without direct returns like:
> > >
> > > rv = pde_read(pde, file, buf, count, pos);
> > > if (!pde_is_permanent(pde))
> > > unuse_pde(pde);
> > >
> > > return rv;
> >
> > Function call non-duplication is false goal.
>
> Depends, copy/paste errors are common and object code
> size generally increases.
>
> > Surprisingly it makes code bigger:
>
> Not so far as I can tell. Are you sure?
>
> > $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
> > add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10)
> > Function old new delta
> > proc_reg_read 108 118 +10
> >
> > and worse too: "rv" is carried on stack through "unuse_pde" call.
>
> With gcc 9.2.1 x86-64 defconfig:
>
> Changing just proc_reg_read to:
>
> static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> {
> struct proc_dir_entry *pde = PDE(file_inode(file));
> ssize_t rv;
>
> rv = pde_read(pde, file, buf, count, ppos);
> if (use_pde(pde))
> unuse_pde(pde);
What?
Please make non-racy patch before doing anything.
>
> return rv;
> }
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2020-02-24 17:55 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-22 20:15 [PATCH v3] proc: faster open/read/close with "permanent" files Alexey Dobriyan
2020-02-22 20:39 ` Joe Perches
2020-02-23 11:30 ` Alexey Dobriyan
2020-02-24 2:48 ` Joe Perches
2020-02-24 17:55 ` Alexey Dobriyan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).