[v3] proc: faster open/read/close with "permanent" files
diff mbox series

Message ID 20200222201539.GA22576@avx2
State In Next
Commit 04470c4b33ead8af1476553fc110b29dd035020a
Headers show
Series
  • [v3] proc: faster open/read/close with "permanent" files
Related show

Commit Message

Alexey Dobriyan Feb. 22, 2020, 8:15 p.m. UTC
Now that "struct proc_ops" exist we can start putting there stuff which
could not fly with VFS "struct file_operations"...

Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
in the event of disappearing /proc entries which usually happens if module is
getting removed. Files like /proc/cpuinfo which never disappear simply do not
need such protection.

Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
"permanent" files.

Enable "permanent" flag for

	/proc/cpuinfo
	/proc/kmsg
	/proc/modules
	/proc/slabinfo
	/proc/stat
	/proc/sysvipc/*
	/proc/swaps

More will come once I figure out foolproof way to prevent out module
authors from marking their stuff "permanent" for performance reasons
when it is not.

This should help with scalability: benchmark is "read /proc/cpuinfo R times
by N threads scattered over the system".

	N	R	t, s (before)	t, s (after)
	-----------------------------------------------------
	64	4096	1.582458	1.530502	-3.2%
	256	4096	6.371926	6.125168	-3.9%
	1024	4096	25.64888	24.47528	-4.6%

Benchmark source:

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

const int NR_CPUS = sysconf(_SC_NPROCESSORS_ONLN);
int N;
const char *filename;
int R;

int xxx = 0;

int glue(int n)
{
	cpu_set_t m;
	CPU_ZERO(&m);
	CPU_SET(n, &m);
	return sched_setaffinity(0, sizeof(cpu_set_t), &m);
}

void f(int n)
{
	glue(n % NR_CPUS);

	while (*(volatile int *)&xxx == 0) {
	}

	for (int i = 0; i < R; i++) {
		int fd = open(filename, O_RDONLY);
		char buf[4096];
		ssize_t rv = read(fd, buf, sizeof(buf));
		asm volatile ("" :: "g" (rv));
		close(fd);
	}
}

int main(int argc, char *argv[])
{
	if (argc < 4) {
		std::cerr << "usage: " << argv[0] << ' ' << "N /proc/filename R\n";
		return 1;
	}

	N = atoi(argv[1]);
	filename = argv[2];
	R = atoi(argv[3]);

	for (int i = 0; i < NR_CPUS; i++) {
		if (glue(i) == 0)
			break;
	}

	std::vector<std::thread> T;
	T.reserve(N);
	for (int i = 0; i < N; i++) {
		T.emplace_back(f, i);
	}

	auto t0 = std::chrono::system_clock::now();
	{
		*(volatile int *)&xxx = 1;
		for (auto& t: T) {
			t.join();
		}
	}
	auto t1 = std::chrono::system_clock::now();
	std::chrono::duration<double> dt = t1 - t0;
	std::cout << dt.count() << '\n';

	return 0;
}

P.S.:
Explicit randomization marker is added because adding non-function pointer
will silently disable structure layout randomization.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
---

	v2: fix unlock on error path
	v3: more comments

 fs/proc/cpuinfo.c       |    1 
 fs/proc/generic.c       |   33 +++++++-
 fs/proc/inode.c         |  187 +++++++++++++++++++++++++++++++++++-------------
 fs/proc/internal.h      |    6 +
 fs/proc/kmsg.c          |    1 
 fs/proc/stat.c          |    1 
 include/linux/proc_fs.h |   17 ++++
 ipc/util.c              |    1 
 kernel/module.c         |    1 
 mm/slab_common.c        |    1 
 mm/swapfile.c           |    1 
 11 files changed, 196 insertions(+), 54 deletions(-)

Comments

Joe Perches Feb. 22, 2020, 8:39 p.m. UTC | #1
On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> Now that "struct proc_ops" exist we can start putting there stuff which
> could not fly with VFS "struct file_operations"...
> 
> Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> in the event of disappearing /proc entries which usually happens if module is
> getting removed. Files like /proc/cpuinfo which never disappear simply do not
> need such protection.
> 
> Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> "permanent" files.
> 
> Enable "permanent" flag for
> 
> 	/proc/cpuinfo
> 	/proc/kmsg
> 	/proc/modules
> 	/proc/slabinfo
> 	/proc/stat
> 	/proc/sysvipc/*
> 	/proc/swaps
> 
> More will come once I figure out foolproof way to prevent out module
> authors from marking their stuff "permanent" for performance reasons
> when it is not.
> 
> This should help with scalability: benchmark is "read /proc/cpuinfo R times
> by N threads scattered over the system".

Is this an actual expected use-case?
Is there some additional unnecessary memory consumption
in the unscaled systems?

And trivia:

>  static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
>  {
[]
> +	if (pde_is_permanent(pde)) {
> +		return pde_lseek(pde, file, offset, whence);
> +	} else if (use_pde(pde)) {
> +		rv = pde_lseek(pde, file, offset, whence);
>  		unuse_pde(pde);
>  	}
>  	return rv;
>  }
[]
>  static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>  {
>  	struct proc_dir_entry *pde = PDE(file_inode(file));
>  	ssize_t rv = -EIO;
> -	if (use_pde(pde)) {
> -		typeof_member(struct proc_ops, proc_read) read;
>  
> -		read = pde->proc_ops->proc_read;
> -		if (read)
> -			rv = read(file, buf, count, ppos);
> +	if (pde_is_permanent(pde)) {
> +		return pde_read(pde, file, buf, count, ppos);
> +	} else if (use_pde(pde)) {
> +		rv = pde_read(pde, file, buf, count, ppos);
>  		unuse_pde(pde);

Perhaps all the function call duplication could be minimized
by using code without direct returns like:

	rv = pde_read(pde, file, buf, count, pos);
	if (!pde_is_permanent(pde))
		unuse_pde(pde);

	return rv;
Alexey Dobriyan Feb. 23, 2020, 11:30 a.m. UTC | #2
On Sat, Feb 22, 2020 at 12:39:39PM -0800, Joe Perches wrote:
> On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> > Now that "struct proc_ops" exist we can start putting there stuff which
> > could not fly with VFS "struct file_operations"...
> > 
> > Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> > in the event of disappearing /proc entries which usually happens if module is
> > getting removed. Files like /proc/cpuinfo which never disappear simply do not
> > need such protection.
> > 
> > Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> > "permanent" files.
> > 
> > Enable "permanent" flag for
> > 
> > 	/proc/cpuinfo
> > 	/proc/kmsg
> > 	/proc/modules
> > 	/proc/slabinfo
> > 	/proc/stat
> > 	/proc/sysvipc/*
> > 	/proc/swaps
> > 
> > More will come once I figure out foolproof way to prevent out module
> > authors from marking their stuff "permanent" for performance reasons
> > when it is not.
> > 
> > This should help with scalability: benchmark is "read /proc/cpuinfo R times
> > by N threads scattered over the system".
> 
> Is this an actual expected use-case?

Yes.

> Is there some additional unnecessary memory consumption
> in the unscaled systems?

No, it's the opposite. Less memory usage for everyone and noticeable
performance improvement for contented case.

> >  static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> >  {
> >  	struct proc_dir_entry *pde = PDE(file_inode(file));
> >  	ssize_t rv = -EIO;
> > -	if (use_pde(pde)) {
> > -		typeof_member(struct proc_ops, proc_read) read;
> >  
> > -		read = pde->proc_ops->proc_read;
> > -		if (read)
> > -			rv = read(file, buf, count, ppos);
> > +	if (pde_is_permanent(pde)) {
> > +		return pde_read(pde, file, buf, count, ppos);
> > +	} else if (use_pde(pde)) {
> > +		rv = pde_read(pde, file, buf, count, ppos);
> >  		unuse_pde(pde);
> 
> Perhaps all the function call duplication could be minimized
> by using code without direct returns like:
> 
> 	rv = pde_read(pde, file, buf, count, pos);
> 	if (!pde_is_permanent(pde))
> 		unuse_pde(pde);
> 
> 	return rv;

Function call non-duplication is false goal.
Surprisingly it makes code bigger:

	$ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
	add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10)
	Function                                     old     new   delta
	proc_reg_read                                108     118     +10

and worse too: "rv" is carried on stack through "unuse_pde" call.
Joe Perches Feb. 24, 2020, 2:48 a.m. UTC | #3
On Sun, 2020-02-23 at 14:30 +0300, Alexey Dobriyan wrote:
> On Sat, Feb 22, 2020 at 12:39:39PM -0800, Joe Perches wrote:
> > On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> > > Now that "struct proc_ops" exist we can start putting there stuff which
> > > could not fly with VFS "struct file_operations"...
> > > 
> > > Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> > > in the event of disappearing /proc entries which usually happens if module is
> > > getting removed. Files like /proc/cpuinfo which never disappear simply do not
> > > need such protection.
> > > 
> > > Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> > > "permanent" files.
> > > 
> > > Enable "permanent" flag for
> > > 
> > > 	/proc/cpuinfo
> > > 	/proc/kmsg
> > > 	/proc/modules
> > > 	/proc/slabinfo
> > > 	/proc/stat
> > > 	/proc/sysvipc/*
> > > 	/proc/swaps
> > > 
> > > More will come once I figure out foolproof way to prevent out module
> > > authors from marking their stuff "permanent" for performance reasons
> > > when it is not.
> > > 
> > > This should help with scalability: benchmark is "read /proc/cpuinfo R times
> > > by N threads scattered over the system".
> > 
> > Is this an actual expected use-case?
> 
> Yes.
> 
> > Is there some additional unnecessary memory consumption
> > in the unscaled systems?
> 
> No, it's the opposite. Less memory usage for everyone and noticeable
> performance improvement for contented case.
> 
> > >  static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> > >  {
> > >  	struct proc_dir_entry *pde = PDE(file_inode(file));
> > >  	ssize_t rv = -EIO;
> > > -	if (use_pde(pde)) {
> > > -		typeof_member(struct proc_ops, proc_read) read;
> > >  
> > > -		read = pde->proc_ops->proc_read;
> > > -		if (read)
> > > -			rv = read(file, buf, count, ppos);
> > > +	if (pde_is_permanent(pde)) {
> > > +		return pde_read(pde, file, buf, count, ppos);
> > > +	} else if (use_pde(pde)) {
> > > +		rv = pde_read(pde, file, buf, count, ppos);
> > >  		unuse_pde(pde);
> > 
> > Perhaps all the function call duplication could be minimized
> > by using code without direct returns like:
> > 
> > 	rv = pde_read(pde, file, buf, count, pos);
> > 	if (!pde_is_permanent(pde))
> > 		unuse_pde(pde);
> > 
> > 	return rv;
> 
> Function call non-duplication is false goal.

Depends, copy/paste errors are common and object code
size generally increases.

> Surprisingly it makes code bigger:

Not so far as I can tell.  Are you sure?

> 	$ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
> 	add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10)
> 	Function                                     old     new   delta
> 	proc_reg_read                                108     118     +10
> 
> and worse too: "rv" is carried on stack through "unuse_pde" call.

With gcc 9.2.1 x86-64 defconfig:

Changing just proc_reg_read to:

static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t rv;

	rv = pde_read(pde, file, buf, count, ppos);
	if (use_pde(pde))
		unuse_pde(pde);

	return rv;
}

gives:

$ size fs/proc/inode.o*
   text	   data	    bss	    dec	    hex	filename
   5448	     28	      0	   5476	   1564	fs/proc/inode.o.suggested
   5526	     28	      0	   5554	   15b2	fs/proc/inode.o.alexey

$ objdump -d fs/proc/inode.o.suggested (grep for proc_reg_read)
0000000000000410 <proc_reg_read>:
 410:	41 54                	push   %r12
 412:	55                   	push   %rbp
 413:	48 8b 47 20          	mov    0x20(%rdi),%rax
 417:	48 8b 68 d0          	mov    -0x30(%rax),%rbp
 41b:	48 8b 45 30          	mov    0x30(%rbp),%rax
 41f:	48 8b 40 10          	mov    0x10(%rax),%rax
 423:	48 85 c0             	test   %rax,%rax
 426:	74 28                	je     450 <proc_reg_read+0x40>
 428:	e8 00 00 00 00       	callq  42d <proc_reg_read+0x1d>
 42d:	49 89 c4             	mov    %rax,%r12
 430:	8b 45 00             	mov    0x0(%rbp),%eax
 433:	85 c0                	test   %eax,%eax
 435:	78 12                	js     449 <proc_reg_read+0x39>
 437:	8d 50 01             	lea    0x1(%rax),%edx
 43a:	f0 0f b1 55 00       	lock cmpxchg %edx,0x0(%rbp)
 43f:	75 f2                	jne    433 <proc_reg_read+0x23>
 441:	48 89 ef             	mov    %rbp,%rdi
 444:	e8 e7 fc ff ff       	callq  130 <unuse_pde>
 449:	4c 89 e0             	mov    %r12,%rax
 44c:	5d                   	pop    %rbp
 44d:	41 5c                	pop    %r12
 44f:	c3                   	retq   
 450:	49 c7 c4 fb ff ff ff 	mov    $0xfffffffffffffffb,%r12
 457:	eb d7                	jmp    430 <proc_reg_read+0x20>
 459:	0f 1f 80 00 00 00 00 	nopl   0x0(%rax)

vs
 
$ objdump -d fs/proc/inode.o.alexey (grep for proc_reg_read)

00000000000006e0 <proc_reg_read>:
 6e0:	41 54                	push   %r12
 6e2:	55                   	push   %rbp
 6e3:	48 8b 47 20          	mov    0x20(%rdi),%rax
 6e7:	48 8b 68 d0          	mov    -0x30(%rax),%rbp
 6eb:	f6 85 aa 00 00 00 01 	testb  $0x1,0xaa(%rbp)
 6f2:	74 15                	je     709 <proc_reg_read+0x29>
 6f4:	48 8b 45 30          	mov    0x30(%rbp),%rax
 6f8:	48 8b 40 10          	mov    0x10(%rax),%rax
 6fc:	48 85 c0             	test   %rax,%rax
 6ff:	74 3f                	je     740 <proc_reg_read+0x60>
 701:	5d                   	pop    %rbp
 702:	41 5c                	pop    %r12
 704:	e9 00 00 00 00       	jmpq   709 <proc_reg_read+0x29>
 709:	8b 45 00             	mov    0x0(%rbp),%eax
 70c:	85 c0                	test   %eax,%eax
 70e:	78 30                	js     740 <proc_reg_read+0x60>
 710:	44 8d 40 01          	lea    0x1(%rax),%r8d
 714:	f0 44 0f b1 45 00    	lock cmpxchg %r8d,0x0(%rbp)
 71a:	75 f0                	jne    70c <proc_reg_read+0x2c>
 71c:	48 8b 45 30          	mov    0x30(%rbp),%rax
 720:	48 8b 40 10          	mov    0x10(%rax),%rax
 724:	48 85 c0             	test   %rax,%rax
 727:	74 20                	je     749 <proc_reg_read+0x69>
 729:	e8 00 00 00 00       	callq  72e <proc_reg_read+0x4e>
 72e:	49 89 c4             	mov    %rax,%r12
 731:	48 89 ef             	mov    %rbp,%rdi
 734:	e8 f7 f9 ff ff       	callq  130 <unuse_pde>
 739:	4c 89 e0             	mov    %r12,%rax
 73c:	5d                   	pop    %rbp
 73d:	41 5c                	pop    %r12
 73f:	c3                   	retq   
 740:	49 c7 c4 fb ff ff ff 	mov    $0xfffffffffffffffb,%r12
 747:	eb f0                	jmp    739 <proc_reg_read+0x59>
 749:	49 c7 c4 fb ff ff ff 	mov    $0xfffffffffffffffb,%r12
 750:	eb df                	jmp    731 <proc_reg_read+0x51>
 752:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
 759:	00 00 00 00 
 75d:	0f 1f 00             	nopl   (%rax)
Alexey Dobriyan Feb. 24, 2020, 5:55 p.m. UTC | #4
On Sun, Feb 23, 2020 at 06:48:38PM -0800, Joe Perches wrote:
> On Sun, 2020-02-23 at 14:30 +0300, Alexey Dobriyan wrote:
> > On Sat, Feb 22, 2020 at 12:39:39PM -0800, Joe Perches wrote:
> > > On Sat, 2020-02-22 at 23:15 +0300, Alexey Dobriyan wrote:
> > > > Now that "struct proc_ops" exist we can start putting there stuff which
> > > > could not fly with VFS "struct file_operations"...
> > > > 
> > > > Most of fs/proc/inode.c file is dedicated to make open/read/.../close reliable
> > > > in the event of disappearing /proc entries which usually happens if module is
> > > > getting removed. Files like /proc/cpuinfo which never disappear simply do not
> > > > need such protection.
> > > > 
> > > > Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
> > > > "permanent" files.
> > > > 
> > > > Enable "permanent" flag for
> > > > 
> > > > 	/proc/cpuinfo
> > > > 	/proc/kmsg
> > > > 	/proc/modules
> > > > 	/proc/slabinfo
> > > > 	/proc/stat
> > > > 	/proc/sysvipc/*
> > > > 	/proc/swaps
> > > > 
> > > > More will come once I figure out foolproof way to prevent out module
> > > > authors from marking their stuff "permanent" for performance reasons
> > > > when it is not.
> > > > 
> > > > This should help with scalability: benchmark is "read /proc/cpuinfo R times
> > > > by N threads scattered over the system".
> > > 
> > > Is this an actual expected use-case?
> > 
> > Yes.
> > 
> > > Is there some additional unnecessary memory consumption
> > > in the unscaled systems?
> > 
> > No, it's the opposite. Less memory usage for everyone and noticeable
> > performance improvement for contented case.
> > 
> > > >  static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> > > >  {
> > > >  	struct proc_dir_entry *pde = PDE(file_inode(file));
> > > >  	ssize_t rv = -EIO;
> > > > -	if (use_pde(pde)) {
> > > > -		typeof_member(struct proc_ops, proc_read) read;
> > > >  
> > > > -		read = pde->proc_ops->proc_read;
> > > > -		if (read)
> > > > -			rv = read(file, buf, count, ppos);
> > > > +	if (pde_is_permanent(pde)) {
> > > > +		return pde_read(pde, file, buf, count, ppos);
> > > > +	} else if (use_pde(pde)) {
> > > > +		rv = pde_read(pde, file, buf, count, ppos);
> > > >  		unuse_pde(pde);
> > > 
> > > Perhaps all the function call duplication could be minimized
> > > by using code without direct returns like:
> > > 
> > > 	rv = pde_read(pde, file, buf, count, pos);
> > > 	if (!pde_is_permanent(pde))
> > > 		unuse_pde(pde);
> > > 
> > > 	return rv;
> > 
> > Function call non-duplication is false goal.
> 
> Depends, copy/paste errors are common and object code
> size generally increases.
> 
> > Surprisingly it makes code bigger:
> 
> Not so far as I can tell.  Are you sure?
> 
> > 	$ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux
> > 	add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10)
> > 	Function                                     old     new   delta
> > 	proc_reg_read                                108     118     +10
> > 
> > and worse too: "rv" is carried on stack through "unuse_pde" call.
> 
> With gcc 9.2.1 x86-64 defconfig:
> 
> Changing just proc_reg_read to:
> 
> static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> {
> 	struct proc_dir_entry *pde = PDE(file_inode(file));
> 	ssize_t rv;
> 
> 	rv = pde_read(pde, file, buf, count, ppos);
> 	if (use_pde(pde))
> 		unuse_pde(pde);

What?

Please make non-racy patch before doing anything.

> 
> 	return rv;
> }

Patch
diff mbox series

--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -17,6 +17,7 @@  static int cpuinfo_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops cpuinfo_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= cpuinfo_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -531,6 +531,13 @@  struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
 	return p;
 }
 
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+	if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) {
+		pde->flags |= PROC_ENTRY_PERMANENT;
+	}
+}
+
 struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 		struct proc_dir_entry *parent,
 		const struct proc_ops *proc_ops, void *data)
@@ -541,6 +548,7 @@  struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 	if (!p)
 		return NULL;
 	p->proc_ops = proc_ops;
+	pde_set_flags(p);
 	return proc_register(parent, p);
 }
 EXPORT_SYMBOL(proc_create_data);
@@ -572,6 +580,7 @@  static int proc_seq_release(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops proc_seq_ops = {
+	/* not permanent -- can call into arbitrary seq_operations */
 	.proc_open	= proc_seq_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
@@ -602,6 +611,7 @@  static int proc_single_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops proc_single_ops = {
+	/* not permanent -- can call into arbitrary ->single_show */
 	.proc_open	= proc_single_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
@@ -662,9 +672,14 @@  void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 
 	de = pde_subdir_find(parent, fn, len);
 	if (de) {
-		rb_erase(&de->subdir_node, &parent->subdir);
-		if (S_ISDIR(de->mode)) {
-			parent->nlink--;
+		if (unlikely(pde_is_permanent(de))) {
+			WARN(1, "removing permanent /proc entry '%s'", de->name);
+			de = NULL;
+		} else {
+			rb_erase(&de->subdir_node, &parent->subdir);
+			if (S_ISDIR(de->mode)) {
+				parent->nlink--;
+			}
 		}
 	}
 	write_unlock(&proc_subdir_lock);
@@ -700,12 +715,24 @@  int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
 		write_unlock(&proc_subdir_lock);
 		return -ENOENT;
 	}
+	if (unlikely(pde_is_permanent(root))) {
+		write_unlock(&proc_subdir_lock);
+		WARN(1, "removing permanent /proc entry '%s/%s'",
+			root->parent->name, root->name);
+		return -EINVAL;
+	}
 	rb_erase(&root->subdir_node, &parent->subdir);
 
 	de = root;
 	while (1) {
 		next = pde_subdir_first(de);
 		if (next) {
+			if (unlikely(pde_is_permanent(root))) {
+				write_unlock(&proc_subdir_lock);
+				WARN(1, "removing permanent /proc entry '%s/%s'",
+					next->parent->name, next->name);
+				return -EINVAL;
+			}
 			rb_erase(&next->subdir_node, &de->subdir);
 			de = next;
 			continue;
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -195,135 +195,204 @@  void proc_entry_rundown(struct proc_dir_entry *de)
 	spin_unlock(&de->pde_unload_lock);
 }
 
+static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
+{
+	typeof_member(struct proc_ops, proc_lseek) lseek;
+
+	lseek = pde->proc_ops->proc_lseek;
+	if (!lseek)
+		lseek = default_llseek;
+	return lseek(file, offset, whence);
+}
+
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	loff_t rv = -EINVAL;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_lseek) lseek;
 
-		lseek = pde->proc_ops->proc_lseek;
-		if (!lseek)
-			lseek = default_llseek;
-		rv = lseek(file, offset, whence);
+	if (pde_is_permanent(pde)) {
+		return pde_lseek(pde, file, offset, whence);
+	} else if (use_pde(pde)) {
+		rv = pde_lseek(pde, file, offset, whence);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	typeof_member(struct proc_ops, proc_read) read;
+
+	read = pde->proc_ops->proc_read;
+	if (read)
+		return read(file, buf, count, ppos);
+	return -EIO;
+}
+
 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	ssize_t rv = -EIO;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_read) read;
 
-		read = pde->proc_ops->proc_read;
-		if (read)
-			rv = read(file, buf, count, ppos);
+	if (pde_is_permanent(pde)) {
+		return pde_read(pde, file, buf, count, ppos);
+	} else if (use_pde(pde)) {
+		rv = pde_read(pde, file, buf, count, ppos);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	typeof_member(struct proc_ops, proc_write) write;
+
+	write = pde->proc_ops->proc_write;
+	if (write)
+		return write(file, buf, count, ppos);
+	return -EIO;
+}
+
 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	ssize_t rv = -EIO;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_write) write;
 
-		write = pde->proc_ops->proc_write;
-		if (write)
-			rv = write(file, buf, count, ppos);
+	if (pde_is_permanent(pde)) {
+		return pde_write(pde, file, buf, count, ppos);
+	} else if (use_pde(pde)) {
+		rv = pde_write(pde, file, buf, count, ppos);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+	typeof_member(struct proc_ops, proc_poll) poll;
+
+	poll = pde->proc_ops->proc_poll;
+	if (poll)
+		return poll(file, pts);
+	return DEFAULT_POLLMASK;
+}
+
 static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	__poll_t rv = DEFAULT_POLLMASK;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_poll) poll;
 
-		poll = pde->proc_ops->proc_poll;
-		if (poll)
-			rv = poll(file, pts);
+	if (pde_is_permanent(pde)) {
+		return pde_poll(pde, file, pts);
+	} else if (use_pde(pde)) {
+		rv = pde_poll(pde, file, pts);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+	typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+	ioctl = pde->proc_ops->proc_ioctl;
+	if (ioctl)
+		return ioctl(file, cmd, arg);
+	return -ENOTTY;
+}
+
 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	long rv = -ENOTTY;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_ioctl) ioctl;
 
-		ioctl = pde->proc_ops->proc_ioctl;
-		if (ioctl)
-			rv = ioctl(file, cmd, arg);
+	if (pde_is_permanent(pde)) {
+		return pde_ioctl(pde, file, cmd, arg);
+	} else if (use_pde(pde)) {
+		rv = pde_ioctl(pde, file, cmd, arg);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 
 #ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+	typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+	compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+	if (compat_ioctl)
+		return compat_ioctl(file, cmd, arg);
+	return -ENOTTY;
+}
+
 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	long rv = -ENOTTY;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
-		compat_ioctl = pde->proc_ops->proc_compat_ioctl;
-		if (compat_ioctl)
-			rv = compat_ioctl(file, cmd, arg);
+	if (pde_is_permanent(pde)) {
+		return pde_compat_ioctl(pde, file, cmd, arg);
+	} else if (use_pde(pde)) {
+		rv = pde_compat_ioctl(pde, file, cmd, arg);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 #endif
 
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+	typeof_member(struct proc_ops, proc_mmap) mmap;
+
+	mmap = pde->proc_ops->proc_mmap;
+	if (mmap)
+		return mmap(file, vma);
+	return -EIO;
+}
+
 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	int rv = -EIO;
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_mmap) mmap;
 
-		mmap = pde->proc_ops->proc_mmap;
-		if (mmap)
-			rv = mmap(file, vma);
+	if (pde_is_permanent(pde)) {
+		return pde_mmap(pde, file, vma);
+	} else if (use_pde(pde)) {
+		rv = pde_mmap(pde, file, vma);
 		unuse_pde(pde);
 	}
 	return rv;
 }
 
 static unsigned long
-proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
 			   unsigned long len, unsigned long pgoff,
 			   unsigned long flags)
 {
-	struct proc_dir_entry *pde = PDE(file_inode(file));
-	unsigned long rv = -EIO;
-
-	if (use_pde(pde)) {
-		typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+	typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
 
-		get_area = pde->proc_ops->proc_get_unmapped_area;
+	get_area = pde->proc_ops->proc_get_unmapped_area;
 #ifdef CONFIG_MMU
-		if (!get_area)
-			get_area = current->mm->get_unmapped_area;
+	if (!get_area)
+		get_area = current->mm->get_unmapped_area;
 #endif
+	if (get_area)
+		return get_area(file, orig_addr, len, pgoff, flags);
+	return orig_addr;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+			   unsigned long len, unsigned long pgoff,
+			   unsigned long flags)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	unsigned long rv = -EIO;
 
-		if (get_area)
-			rv = get_area(file, orig_addr, len, pgoff, flags);
-		else
-			rv = orig_addr;
+	if (pde_is_permanent(pde)) {
+		return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+	} else if (use_pde(pde)) {
+		rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
 		unuse_pde(pde);
 	}
 	return rv;
@@ -337,6 +406,13 @@  static int proc_reg_open(struct inode *inode, struct file *file)
 	typeof_member(struct proc_ops, proc_release) release;
 	struct pde_opener *pdeo;
 
+	if (pde_is_permanent(pde)) {
+		open = pde->proc_ops->proc_open;
+		if (open)
+			rv = open(inode, file);
+		return rv;
+	}
+
 	/*
 	 * Ensure that
 	 * 1) PDE's ->release hook will be called no matter what
@@ -386,6 +462,17 @@  static int proc_reg_release(struct inode *inode, struct file *file)
 {
 	struct proc_dir_entry *pde = PDE(inode);
 	struct pde_opener *pdeo;
+
+	if (pde_is_permanent(pde)) {
+		typeof_member(struct proc_ops, proc_release) release;
+
+		release = pde->proc_ops->proc_release;
+		if (release) {
+			return release(inode, file);
+		}
+		return 0;
+	}
+
 	spin_lock(&pde->pde_unload_lock);
 	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
 		if (pdeo->file == file) {
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,7 @@  struct proc_dir_entry {
 	struct rb_node subdir_node;
 	char *name;
 	umode_t mode;
+	u8 flags;
 	u8 namelen;
 	char inline_name[];
 } __randomize_layout;
@@ -73,6 +74,11 @@  struct proc_dir_entry {
 	0)
 #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
 
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+	return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
 extern struct kmem_cache *proc_dir_entry_cache;
 void pde_free(struct proc_dir_entry *pde);
 
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -50,6 +50,7 @@  static __poll_t kmsg_poll(struct file *file, poll_table *wait)
 
 
 static const struct proc_ops kmsg_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_read	= kmsg_read,
 	.proc_poll	= kmsg_poll,
 	.proc_open	= kmsg_open,
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -224,6 +224,7 @@  static int stat_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops stat_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= stat_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -5,6 +5,7 @@ 
 #ifndef _LINUX_PROC_FS_H
 #define _LINUX_PROC_FS_H
 
+#include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/fs.h>
 
@@ -12,7 +13,21 @@  struct proc_dir_entry;
 struct seq_file;
 struct seq_operations;
 
+enum {
+	/*
+	 * All /proc entries using this ->proc_ops instance are never removed.
+	 *
+	 * If in doubt, ignore this flag.
+	 */
+#ifdef MODULE
+	PROC_ENTRY_PERMANENT = 0U,
+#else
+	PROC_ENTRY_PERMANENT = 1U << 0,
+#endif
+};
+
 struct proc_ops {
+	unsigned int proc_flags;
 	int	(*proc_open)(struct inode *, struct file *);
 	ssize_t	(*proc_read)(struct file *, char __user *, size_t, loff_t *);
 	ssize_t	(*proc_write)(struct file *, const char __user *, size_t, loff_t *);
@@ -25,7 +40,7 @@  struct proc_ops {
 #endif
 	int	(*proc_mmap)(struct file *, struct vm_area_struct *);
 	unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-};
+} __randomize_layout;
 
 #ifdef CONFIG_PROC_FS
 
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -885,6 +885,7 @@  static int sysvipc_proc_release(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops sysvipc_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= sysvipc_proc_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4355,6 +4355,7 @@  static int modules_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops modules_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= modules_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1581,6 +1581,7 @@  static int slabinfo_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops slabinfo_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= slabinfo_open,
 	.proc_read	= seq_read,
 	.proc_write	= slabinfo_write,
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2797,6 +2797,7 @@  static int swaps_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops swaps_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_open	= swaps_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,