All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 resend] procfs: Improve Scaling in proc
@ 2013-02-15 20:47 Nathan Zimmer
  2013-02-15 22:12 ` Andrew Morton
  2013-03-04 20:16 ` Stephen Warren
  0 siblings, 2 replies; 8+ messages in thread
From: Nathan Zimmer @ 2013-02-15 20:47 UTC (permalink / raw)
  To: viro, eric.dumazet
  Cc: linux-kernel, linux-fsdevel, Nathan Zimmer, Andrew Morton,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney

I am currently tracking a hotlock reported by a customer on a large system,
512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
this way for a very long time.
The offending lock is proc_dir_entry->pde_unload_lock.

This patch converts the replaces the lock with the rcu. However the pde_openers
list still is controlled by a spin lock. I tested on a 4096 machine and the lock
doesn't seem hot at least according to perf.

This is a refresh/resend of what was orignally suggested by Eric Dumazet some 
time ago.  

Supporting numbers, lower is better, they are from the test I posted earlier.
cpuinfo baseline        Rcu
tasks   read-sec        read-sec
1       0.0141          0.0141
2       0.0140          0.0142
4       0.0140          0.0141
8       0.0145          0.0140
16      0.0553          0.0168
32      0.1688          0.0549
64      0.5017          0.1690
128     1.7005          0.5038
256     5.2513          2.0804
512     8.0529          3.0162

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com> 
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Nathan Zimmer <nzimmer@sgi.com>
---
 fs/proc/generic.c       |  58 +++++++++---------
 fs/proc/inode.c         | 156 ++++++++++++++++++++++++++----------------------
 include/linux/proc_fs.h |   6 +-
 3 files changed, 118 insertions(+), 102 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76ddae8..6896a70 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -191,13 +191,16 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	ssize_t rv = -EIO;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	const struct file_operations *fops;
+
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
-	spin_unlock(&pde->pde_unload_lock);
+	atomic_inc(&pde->pde_users);
+	rcu_read_unlock();
 
 	rv = __proc_file_read(file, buf, nbytes, ppos);
 
@@ -213,13 +216,16 @@ proc_file_write(struct file *file, const char __user *buffer,
 	ssize_t rv = -EIO;
 
 	if (pde->write_proc) {
-		spin_lock(&pde->pde_unload_lock);
-		if (!pde->proc_fops) {
-			spin_unlock(&pde->pde_unload_lock);
+		const struct file_operations *fops;
+
+		rcu_read_lock();
+		fops = rcu_dereference(pde->proc_fops);
+		if (!fops) {
+			rcu_read_unlock();
 			return rv;
 		}
-		pde->pde_users++;
-		spin_unlock(&pde->pde_unload_lock);
+		atomic_inc(&pde->pde_users);
+		rcu_read_unlock();
 
 		/* FIXME: does this routine need ppos?  probably... */
 		rv = pde->write_proc(file, buffer, count, pde->data);
@@ -558,7 +564,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 
 	if (S_ISDIR(dp->mode)) {
 		if (dp->proc_iops == NULL) {
-			dp->proc_fops = &proc_dir_operations;
+			RCU_INIT_POINTER(dp->proc_fops, &proc_dir_operations);
 			dp->proc_iops = &proc_dir_inode_operations;
 		}
 		dir->nlink++;
@@ -567,7 +573,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 			dp->proc_iops = &proc_link_inode_operations;
 	} else if (S_ISREG(dp->mode)) {
 		if (dp->proc_fops == NULL)
-			dp->proc_fops = &proc_file_operations;
+			RCU_INIT_POINTER(dp->proc_fops, &proc_file_operations);
 		if (dp->proc_iops == NULL)
 			dp->proc_iops = &proc_file_inode_operations;
 	}
@@ -620,7 +626,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	ent->mode = mode;
 	ent->nlink = nlink;
 	atomic_set(&ent->count, 1);
-	spin_lock_init(&ent->pde_unload_lock);
+	atomic_set(&ent->pde_users, 1);
+	spin_lock_init(&ent->pde_openers_lock);
 	INIT_LIST_HEAD(&ent->pde_openers);
 out:
 	return ent;
@@ -744,7 +751,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 	pde = __proc_create(&parent, name, mode, nlink);
 	if (!pde)
 		goto out;
-	pde->proc_fops = proc_fops;
+	rcu_assign_pointer(pde->proc_fops, proc_fops);
 	pde->data = data;
 	if (proc_register(parent, pde) < 0)
 		goto out_free;
@@ -802,37 +809,30 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 		return;
 	}
 
-	spin_lock(&de->pde_unload_lock);
 	/*
 	 * Stop accepting new callers into module. If you're
 	 * dynamically allocating ->proc_fops, save a pointer somewhere.
 	 */
-	de->proc_fops = NULL;
-	/* Wait until all existing callers into module are done. */
-	if (de->pde_users > 0) {
-		DECLARE_COMPLETION_ONSTACK(c);
-
-		if (!de->pde_unload_completion)
-			de->pde_unload_completion = &c;
 
-		spin_unlock(&de->pde_unload_lock);
+	rcu_assign_pointer(de->proc_fops, NULL);
+	synchronize_rcu();
+	/* Wait until all existing callers into module are done. */
 
+	DECLARE_COMPLETION_ONSTACK(c);
+	de->pde_unload_completion = &c;
+	if (!atomic_dec_and_test(&de->pde_users))
 		wait_for_completion(de->pde_unload_completion);
 
-		spin_lock(&de->pde_unload_lock);
-	}
-
+	spin_lock(&de->pde_openers_lock);
 	while (!list_empty(&de->pde_openers)) {
 		struct pde_opener *pdeo;
 
 		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
 		list_del(&pdeo->lh);
-		spin_unlock(&de->pde_unload_lock);
 		pdeo->release(pdeo->inode, pdeo->file);
 		kfree(pdeo);
-		spin_lock(&de->pde_unload_lock);
 	}
-	spin_unlock(&de->pde_unload_lock);
+	spin_unlock(&de->pde_openers_lock);
 
 	if (S_ISDIR(de->mode))
 		parent->nlink--;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 439ae688..031af27 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -128,46 +128,40 @@ static const struct super_operations proc_sops = {
 	.show_options	= proc_show_options,
 };
 
-static void __pde_users_dec(struct proc_dir_entry *pde)
-{
-	pde->pde_users--;
-	if (pde->pde_unload_completion && pde->pde_users == 0)
-		complete(pde->pde_unload_completion);
-}
-
 void pde_users_dec(struct proc_dir_entry *pde)
 {
-	spin_lock(&pde->pde_unload_lock);
-	__pde_users_dec(pde);
-	spin_unlock(&pde->pde_unload_lock);
+	if (atomic_dec_and_test(&pde->pde_users) && pde->pde_unload_completion)
+		complete(pde->pde_unload_completion);
 }
 
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
 {
+	const struct file_operations *fops;
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	loff_t rv = -EINVAL;
 	loff_t (*llseek)(struct file *, loff_t, int);
 
-	spin_lock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
 	/*
 	 * remove_proc_entry() is going to delete PDE (as part of module
 	 * cleanup sequence). No new callers into module allowed.
 	 */
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
 	/*
 	 * Bump refcount so that remove_proc_entry will wail for ->llseek to
 	 * complete.
 	 */
-	pde->pde_users++;
+	atomic_inc(&pde->pde_users);
 	/*
 	 * Save function pointer under lock, to protect against ->proc_fops
 	 * NULL'ifying right after ->pde_unload_lock is dropped.
 	 */
-	llseek = pde->proc_fops->llseek;
-	spin_unlock(&pde->pde_unload_lock);
+	llseek = fops->llseek;
+	rcu_read_unlock();
 
 	if (!llseek)
 		llseek = default_llseek;
@@ -182,15 +176,17 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	ssize_t rv = -EIO;
 	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
-	read = pde->proc_fops->read;
-	spin_unlock(&pde->pde_unload_lock);
+	atomic_inc(&pde->pde_users);
+	read = fops->read;
+	rcu_read_unlock();
 
 	if (read)
 		rv = read(file, buf, count, ppos);
@@ -204,15 +200,17 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	ssize_t rv = -EIO;
 	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
-	write = pde->proc_fops->write;
-	spin_unlock(&pde->pde_unload_lock);
+	atomic_inc(&pde->pde_users);
+	write = fops->write;
+	rcu_read_unlock();
 
 	if (write)
 		rv = write(file, buf, count, ppos);
@@ -226,15 +224,17 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	unsigned int rv = DEFAULT_POLLMASK;
 	unsigned int (*poll)(struct file *, struct poll_table_struct *);
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
-	poll = pde->proc_fops->poll;
-	spin_unlock(&pde->pde_unload_lock);
+	atomic_inc(&pde->pde_users);
+	poll = fops->poll;
+	rcu_read_unlock();
 
 	if (poll)
 		rv = poll(file, pts);
@@ -248,15 +248,17 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	long rv = -ENOTTY;
 	long (*ioctl)(struct file *, unsigned int, unsigned long);
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
-	ioctl = pde->proc_fops->unlocked_ioctl;
-	spin_unlock(&pde->pde_unload_lock);
+	atomic_inc(&pde->pde_users);
+	ioctl = fops->unlocked_ioctl;
+	rcu_read_unlock();
 
 	if (ioctl)
 		rv = ioctl(file, cmd, arg);
@@ -271,15 +273,17 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	long rv = -ENOTTY;
 	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
+	atomic_inc(&pde->pde_users);
 	compat_ioctl = pde->proc_fops->compat_ioctl;
-	spin_unlock(&pde->pde_unload_lock);
+	rcu_read_unlock();
 
 	if (compat_ioctl)
 		rv = compat_ioctl(file, cmd, arg);
@@ -294,15 +298,17 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	int rv = -EIO;
 	int (*mmap)(struct file *, struct vm_area_struct *);
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		return rv;
 	}
-	pde->pde_users++;
+	atomic_inc(&pde->pde_users);
 	mmap = pde->proc_fops->mmap;
-	spin_unlock(&pde->pde_unload_lock);
+	rcu_read_unlock();
 
 	if (mmap)
 		rv = mmap(file, vma);
@@ -318,6 +324,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	int (*open)(struct inode *, struct file *);
 	int (*release)(struct inode *, struct file *);
 	struct pde_opener *pdeo;
+	const struct file_operations *fops;
 
 	/*
 	 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
@@ -333,32 +340,33 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	if (!pdeo)
 		return -ENOMEM;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
+		rcu_read_unlock();
 		kfree(pdeo);
 		return -ENOENT;
 	}
-	pde->pde_users++;
-	open = pde->proc_fops->open;
-	release = pde->proc_fops->release;
-	spin_unlock(&pde->pde_unload_lock);
+	atomic_inc(&pde->pde_users);
+	open = fops->open;
+	release = fops->release;
+	rcu_read_lock();
 
 	if (open)
 		rv = open(inode, file);
 
-	spin_lock(&pde->pde_unload_lock);
 	if (rv == 0 && release) {
 		/* To know what to release. */
 		pdeo->inode = inode;
 		pdeo->file = file;
 		/* Strictly for "too late" ->release in proc_reg_release(). */
 		pdeo->release = release;
+		spin_lock(&pde->pde_openers_lock);
 		list_add(&pdeo->lh, &pde->pde_openers);
+		spin_unlock(&pde->pde_openers_lock);
 	} else
 		kfree(pdeo);
-	__pde_users_dec(pde);
-	spin_unlock(&pde->pde_unload_lock);
+	pde_users_dec(pde);
 	return rv;
 }
 
@@ -367,10 +375,14 @@ static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
 {
 	struct pde_opener *pdeo;
 
+	spin_lock(&pde->pde_openers_lock);
 	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
-		if (pdeo->inode == inode && pdeo->file == file)
+		if (pdeo->inode == inode && pdeo->file == file) {
+			spin_unlock(&pde->pde_openers_lock);
 			return pdeo;
+		}
 	}
+	spin_unlock(&pde->pde_openers_lock);
 	return NULL;
 }
 
@@ -380,10 +392,12 @@ static int proc_reg_release(struct inode *inode, struct file *file)
 	int rv = 0;
 	int (*release)(struct inode *, struct file *);
 	struct pde_opener *pdeo;
+	const struct file_operations *fops;
 
-	spin_lock(&pde->pde_unload_lock);
 	pdeo = find_pde_opener(pde, inode, file);
-	if (!pde->proc_fops) {
+	rcu_read_lock();
+	fops = rcu_dereference(pde->proc_fops);
+	if (!fops) {
 		/*
 		 * Can't simply exit, __fput() will think that everything is OK,
 		 * and move on to freeing struct file. remove_proc_entry() will
@@ -393,21 +407,23 @@ static int proc_reg_release(struct inode *inode, struct file *file)
 		 * But if opener is removed from list, who will ->release it?
 		 */
 		if (pdeo) {
+			spin_lock(&pde->pde_openers_lock);
 			list_del(&pdeo->lh);
-			spin_unlock(&pde->pde_unload_lock);
+			spin_unlock(&pde->pde_openers_lock);
 			rv = pdeo->release(inode, file);
 			kfree(pdeo);
-		} else
-			spin_unlock(&pde->pde_unload_lock);
+		}
 		return rv;
 	}
-	pde->pde_users++;
-	release = pde->proc_fops->release;
+	atomic_inc(&pde->pde_users);
+	release = fops->release;
+	rcu_read_unlock();
 	if (pdeo) {
+		spin_lock(&pde->pde_openers_lock);
 		list_del(&pdeo->lh);
-		kfree(pdeo);
+		spin_unlock(&pde->pde_openers_lock);
 	}
-	spin_unlock(&pde->pde_unload_lock);
+	kfree(pdeo);
 
 	if (release)
 		rv = release(inode, file);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 32676b3..82628d9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -68,16 +68,16 @@ struct proc_dir_entry {
 	 * If you're allocating ->proc_fops dynamically, save a pointer
 	 * somewhere.
 	 */
-	const struct file_operations *proc_fops;
+	const struct file_operations __rcu *proc_fops;
 	struct proc_dir_entry *next, *parent, *subdir;
 	void *data;
 	read_proc_t *read_proc;
 	write_proc_t *write_proc;
 	atomic_t count;		/* use count */
-	int pde_users;	/* number of callers into module in progress */
+	atomic_t pde_users;	/* number of callers into module in progress */
 	struct completion *pde_unload_completion;
+	spinlock_t pde_openers_lock;
 	struct list_head pde_openers;	/* who did ->open, but not ->release */
-	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
 	u8 namelen;
 	char name[];
 };
-- 
1.8.1.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 resend] procfs: Improve Scaling in proc
  2013-02-15 20:47 [PATCH v3 resend] procfs: Improve Scaling in proc Nathan Zimmer
@ 2013-02-15 22:12 ` Andrew Morton
  2013-02-15 23:39   ` Nathan Zimmer
  2013-03-04 20:16 ` Stephen Warren
  1 sibling, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2013-02-15 22:12 UTC (permalink / raw)
  To: Nathan Zimmer
  Cc: viro, eric.dumazet, linux-kernel, linux-fsdevel,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney

On Fri, 15 Feb 2013 14:47:54 -0600
Nathan Zimmer <nzimmer@sgi.com> wrote:

> I am currently tracking a hotlock reported by a customer on a large system,
> 512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
> this way for a very long time.
> The offending lock is proc_dir_entry->pde_unload_lock.
> 
> This patch converts the replaces the lock with the rcu. However the pde_openers
> list still is controlled by a spin lock. I tested on a 4096 machine and the lock
> doesn't seem hot at least according to perf.
> 
> This is a refresh/resend of what was orignally suggested by Eric Dumazet some 
> time ago.  
> 
> Supporting numbers, lower is better, they are from the test I posted earlier.
> cpuinfo baseline        Rcu
> tasks   read-sec        read-sec
> 1       0.0141          0.0141
> 2       0.0140          0.0142
> 4       0.0140          0.0141
> 8       0.0145          0.0140
> 16      0.0553          0.0168
> 32      0.1688          0.0549
> 64      0.5017          0.1690
> 128     1.7005          0.5038
> 256     5.2513          2.0804
> 512     8.0529          3.0162
>
> ...
> 
> diff --git a/fs/proc/generic.c b/fs/proc/generic.c
> index 76ddae8..6896a70 100644
> --- a/fs/proc/generic.c
> +++ b/fs/proc/generic.c
> @@ -191,13 +191,16 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>  	ssize_t rv = -EIO;
>  
> -	spin_lock(&pde->pde_unload_lock);
> -	if (!pde->proc_fops) {
> -		spin_unlock(&pde->pde_unload_lock);
> +	const struct file_operations *fops;

There's now a stray newline in the definitions section.

> +	rcu_read_lock();
> +	fops = rcu_dereference(pde->proc_fops);
> +	if (!fops) {
> +		rcu_read_unlock();
>  		return rv;
>  	}
> -	pde->pde_users++;
> -	spin_unlock(&pde->pde_unload_lock);
> +	atomic_inc(&pde->pde_users);
> +	rcu_read_unlock();

So what's up with pde_users?  Seems that it's atomic_t *and* uses a
form of RCU protection.  We can't make it a plain old integer because
it's modified under rcu_read_lock() and we can't move the atomic_inc()
outside rcu_read_lock() because of the synchronization games in
remove_proc_entry()?

>  	rv = __proc_file_read(file, buf, nbytes, ppos);
>  
>
> ...
>
> @@ -802,37 +809,30 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
>  		return;
>  	}
>  
> -	spin_lock(&de->pde_unload_lock);
>  	/*
>  	 * Stop accepting new callers into module. If you're
>  	 * dynamically allocating ->proc_fops, save a pointer somewhere.
>  	 */
> -	de->proc_fops = NULL;
> -	/* Wait until all existing callers into module are done. */
> -	if (de->pde_users > 0) {
> -		DECLARE_COMPLETION_ONSTACK(c);
> -
> -		if (!de->pde_unload_completion)
> -			de->pde_unload_completion = &c;
>  
> -		spin_unlock(&de->pde_unload_lock);
> +	rcu_assign_pointer(de->proc_fops, NULL);
> +	synchronize_rcu();
> +	/* Wait until all existing callers into module are done. */
>  
> +	DECLARE_COMPLETION_ONSTACK(c);

This should have generated a c99-style definition warning.  Did your
compiler version not do this?

> +	de->pde_unload_completion = &c;
> +	if (!atomic_dec_and_test(&de->pde_users))
>  		wait_for_completion(de->pde_unload_completion);
>  
> -		spin_lock(&de->pde_unload_lock);
> -	}
> -
> +	spin_lock(&de->pde_openers_lock);
>  	while (!list_empty(&de->pde_openers)) {
>  		struct pde_opener *pdeo;
>  
>  		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
>  		list_del(&pdeo->lh);
> -		spin_unlock(&de->pde_unload_lock);
>  		pdeo->release(pdeo->inode, pdeo->file);
>  		kfree(pdeo);
> -		spin_lock(&de->pde_unload_lock);
>  	}
> -	spin_unlock(&de->pde_unload_lock);
> +	spin_unlock(&de->pde_openers_lock);
>  
>  	if (S_ISDIR(de->mode))
>  		parent->nlink--;
>
> ...
>
>  static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
>  {
> +	const struct file_operations *fops;
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>  	loff_t rv = -EINVAL;
>  	loff_t (*llseek)(struct file *, loff_t, int);
>  
> -	spin_lock(&pde->pde_unload_lock);
> +	rcu_read_lock();
> +	fops = rcu_dereference(pde->proc_fops);
>  	/*
>  	 * remove_proc_entry() is going to delete PDE (as part of module
>  	 * cleanup sequence). No new callers into module allowed.
>  	 */
> -	if (!pde->proc_fops) {
> -		spin_unlock(&pde->pde_unload_lock);
> +	if (!fops) {
> +		rcu_read_unlock();
>  		return rv;
>  	}
>  	/*
>  	 * Bump refcount so that remove_proc_entry will wail for ->llseek to
>  	 * complete.
>  	 */
> -	pde->pde_users++;
> +	atomic_inc(&pde->pde_users);
>  	/*
>  	 * Save function pointer under lock, to protect against ->proc_fops
>  	 * NULL'ifying right after ->pde_unload_lock is dropped.
>  	 */

This comment needs updating.

However, it doesn't appear to be true any more.  With this patch we no
longer set ->fops to NULL in remove_proc_entry().  (What replaced that
logic?)

So are all these games with local variable `llseek' still needed? 
afaict the increment of pde_users will stabilize ->fops?


> -	llseek = pde->proc_fops->llseek;
> -	spin_unlock(&pde->pde_unload_lock);
> +	llseek = fops->llseek;
> +	rcu_read_unlock();
>  
>  	if (!llseek)
>  		llseek = default_llseek;
> @@ -182,15 +176,17 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>  	ssize_t rv = -EIO;
>  	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
> +	const struct file_operations *fops;
>  
> -	spin_lock(&pde->pde_unload_lock);
> -	if (!pde->proc_fops) {
> -		spin_unlock(&pde->pde_unload_lock);
> +	rcu_read_lock();
> +	fops = rcu_dereference(pde->proc_fops);
> +	if (!fops) {
> +		rcu_read_unlock();
>  		return rv;
>  	}
> -	pde->pde_users++;
> -	read = pde->proc_fops->read;
> -	spin_unlock(&pde->pde_unload_lock);
> +	atomic_inc(&pde->pde_users);
> +	read = fops->read;
> +	rcu_read_unlock();

Many dittoes.

>  	if (read)
>  		rv = read(file, buf, count, ppos);
> @@ -204,15 +200,17 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>  	ssize_t rv = -EIO;
>  	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
> +	const struct file_operations *fops;
>  
> -	spin_lock(&pde->pde_unload_lock);
> -	if (!pde->proc_fops) {
> -		spin_unlock(&pde->pde_unload_lock);
> +	rcu_read_lock();
> +	fops = rcu_dereference(pde->proc_fops);
> +	if (!fops) {
> +		rcu_read_unlock();
>  		return rv;
>  	}
> -	pde->pde_users++;
> -	write = pde->proc_fops->write;
> -	spin_unlock(&pde->pde_unload_lock);
> +	atomic_inc(&pde->pde_users);
> +	write = fops->write;
> +	rcu_read_unlock();
>
> ...
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 resend] procfs: Improve Scaling in proc
  2013-02-15 22:12 ` Andrew Morton
@ 2013-02-15 23:39   ` Nathan Zimmer
  2013-02-15 23:44     ` Andrew Morton
  0 siblings, 1 reply; 8+ messages in thread
From: Nathan Zimmer @ 2013-02-15 23:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: viro, eric.dumazet, linux-kernel, linux-fsdevel,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney

On 02/15/2013 04:12 PM, Andrew Morton wrote:
> On Fri, 15 Feb 2013 14:47:54 -0600
> Nathan Zimmer <nzimmer@sgi.com> wrote:
>
>> I am currently tracking a hotlock reported by a customer on a large system,
>> 512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
>> this way for a very long time.
>> The offending lock is proc_dir_entry->pde_unload_lock.
>>
>> This patch converts the replaces the lock with the rcu. However the pde_openers
>> list still is controlled by a spin lock. I tested on a 4096 machine and the lock
>> doesn't seem hot at least according to perf.
>>
>> This is a refresh/resend of what was orignally suggested by Eric Dumazet some
>> time ago.
>>
>> Supporting numbers, lower is better, they are from the test I posted earlier.
>> cpuinfo baseline        Rcu
>> tasks   read-sec        read-sec
>> 1       0.0141          0.0141
>> 2       0.0140          0.0142
>> 4       0.0140          0.0141
>> 8       0.0145          0.0140
>> 16      0.0553          0.0168
>> 32      0.1688          0.0549
>> 64      0.5017          0.1690
>> 128     1.7005          0.5038
>> 256     5.2513          2.0804
>> 512     8.0529          3.0162
>>
>> ...
>>
>> diff --git a/fs/proc/generic.c b/fs/proc/generic.c
>> index 76ddae8..6896a70 100644
>> --- a/fs/proc/generic.c
>> +++ b/fs/proc/generic.c
>> @@ -191,13 +191,16 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
>>   	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>>   	ssize_t rv = -EIO;
>>   
>> -	spin_lock(&pde->pde_unload_lock);
>> -	if (!pde->proc_fops) {
>> -		spin_unlock(&pde->pde_unload_lock);
>> +	const struct file_operations *fops;
> There's now a stray newline in the definitions section.
Noted and corrected, in a few places.
>> +	rcu_read_lock();
>> +	fops = rcu_dereference(pde->proc_fops);
>> +	if (!fops) {
>> +		rcu_read_unlock();
>>   		return rv;
>>   	}
>> -	pde->pde_users++;
>> -	spin_unlock(&pde->pde_unload_lock);
>> +	atomic_inc(&pde->pde_users);
>> +	rcu_read_unlock();
> So what's up with pde_users?  Seems that it's atomic_t *and* uses a
> form of RCU protection.  We can't make it a plain old integer because
> it's modified under rcu_read_lock() and we can't move the atomic_inc()
> outside rcu_read_lock() because of the synchronization games in
> remove_proc_entry()?
The intent of pde_users is to let us know when it is safe to clean out 
the pde_openers.
I probably should comment this.
>>   	rv = __proc_file_read(file, buf, nbytes, ppos);
>>   
>>
>> ...
>>
>> @@ -802,37 +809,30 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
>>   		return;
>>   	}
>>   
>> -	spin_lock(&de->pde_unload_lock);
>>   	/*
>>   	 * Stop accepting new callers into module. If you're
>>   	 * dynamically allocating ->proc_fops, save a pointer somewhere.
>>   	 */
>> -	de->proc_fops = NULL;
>> -	/* Wait until all existing callers into module are done. */
>> -	if (de->pde_users > 0) {
>> -		DECLARE_COMPLETION_ONSTACK(c);
>> -
>> -		if (!de->pde_unload_completion)
>> -			de->pde_unload_completion = &c;
>>   
>> -		spin_unlock(&de->pde_unload_lock);
>> +	rcu_assign_pointer(de->proc_fops, NULL);
>> +	synchronize_rcu();
>> +	/* Wait until all existing callers into module are done. */
>>   
>> +	DECLARE_COMPLETION_ONSTACK(c);
> This should have generated a c99-style definition warning.  Did your
> compiler version not do this?
A clear over site on my part.

>> +	de->pde_unload_completion = &c;
>> +	if (!atomic_dec_and_test(&de->pde_users))
>>   		wait_for_completion(de->pde_unload_completion);
>>   
>> -		spin_lock(&de->pde_unload_lock);
>> -	}
>> -
>> +	spin_lock(&de->pde_openers_lock);
>>   	while (!list_empty(&de->pde_openers)) {
>>   		struct pde_opener *pdeo;
>>   
>>   		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
>>   		list_del(&pdeo->lh);
>> -		spin_unlock(&de->pde_unload_lock);
>>   		pdeo->release(pdeo->inode, pdeo->file);
>>   		kfree(pdeo);
>> -		spin_lock(&de->pde_unload_lock);
>>   	}
>> -	spin_unlock(&de->pde_unload_lock);
>> +	spin_unlock(&de->pde_openers_lock);
>>   
>>   	if (S_ISDIR(de->mode))
>>   		parent->nlink--;
>>
>> ...
>>
>>   static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
>>   {
>> +	const struct file_operations *fops;
>>   	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>>   	loff_t rv = -EINVAL;
>>   	loff_t (*llseek)(struct file *, loff_t, int);
>>   
>> -	spin_lock(&pde->pde_unload_lock);
>> +	rcu_read_lock();
>> +	fops = rcu_dereference(pde->proc_fops);
>>   	/*
>>   	 * remove_proc_entry() is going to delete PDE (as part of module
>>   	 * cleanup sequence). No new callers into module allowed.
>>   	 */
>> -	if (!pde->proc_fops) {
>> -		spin_unlock(&pde->pde_unload_lock);
>> +	if (!fops) {
>> +		rcu_read_unlock();
>>   		return rv;
>>   	}
>>   	/*
>>   	 * Bump refcount so that remove_proc_entry will wail for ->llseek to
>>   	 * complete.
>>   	 */
>> -	pde->pde_users++;
>> +	atomic_inc(&pde->pde_users);
>>   	/*
>>   	 * Save function pointer under lock, to protect against ->proc_fops
>>   	 * NULL'ifying right after ->pde_unload_lock is dropped.
>>   	 */
> This comment needs updating.
>
> However, it doesn't appear to be true any more.  With this patch we no
> longer set ->fops to NULL in remove_proc_entry().  (What replaced that
> logic?)
>
> So are all these games with local variable `llseek' still needed?
> afaict the increment of pde_users will stabilize ->fops?
We still are setting de->proc_fops to NULL to prevent new callers.
Also we still have to save fops-> since we cannot use fops outside the 
rcu_read_un/lock.
Unless I misunderstood your question.
But yes the comment needs to be updated.
>
>> -	llseek = pde->proc_fops->llseek;
>> -	spin_unlock(&pde->pde_unload_lock);
>> +	llseek = fops->llseek;
>> +	rcu_read_unlock();
>>   
>>   	if (!llseek)
>>   		llseek = default_llseek;
>> @@ -182,15 +176,17 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
>>   	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>>   	ssize_t rv = -EIO;
>>   	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
>> +	const struct file_operations *fops;
>>   
>> -	spin_lock(&pde->pde_unload_lock);
>> -	if (!pde->proc_fops) {
>> -		spin_unlock(&pde->pde_unload_lock);
>> +	rcu_read_lock();
>> +	fops = rcu_dereference(pde->proc_fops);
>> +	if (!fops) {
>> +		rcu_read_unlock();
>>   		return rv;
>>   	}
>> -	pde->pde_users++;
>> -	read = pde->proc_fops->read;
>> -	spin_unlock(&pde->pde_unload_lock);
>> +	atomic_inc(&pde->pde_users);
>> +	read = fops->read;
>> +	rcu_read_unlock();
> Many dittoes.
>
>>   	if (read)
>>   		rv = read(file, buf, count, ppos);
>> @@ -204,15 +200,17 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
>>   	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
>>   	ssize_t rv = -EIO;
>>   	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
>> +	const struct file_operations *fops;
>>   
>> -	spin_lock(&pde->pde_unload_lock);
>> -	if (!pde->proc_fops) {
>> -		spin_unlock(&pde->pde_unload_lock);
>> +	rcu_read_lock();
>> +	fops = rcu_dereference(pde->proc_fops);
>> +	if (!fops) {
>> +		rcu_read_unlock();
>>   		return rv;
>>   	}
>> -	pde->pde_users++;
>> -	write = pde->proc_fops->write;
>> -	spin_unlock(&pde->pde_unload_lock);
>> +	atomic_inc(&pde->pde_users);
>> +	write = fops->write;
>> +	rcu_read_unlock();
>>
>> ...
>>

Thanks,
Nate

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 resend] procfs: Improve Scaling in proc
  2013-02-15 23:39   ` Nathan Zimmer
@ 2013-02-15 23:44     ` Andrew Morton
  0 siblings, 0 replies; 8+ messages in thread
From: Andrew Morton @ 2013-02-15 23:44 UTC (permalink / raw)
  To: Nathan Zimmer
  Cc: viro, eric.dumazet, linux-kernel, linux-fsdevel,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney

On Fri, 15 Feb 2013 17:39:38 -0600
Nathan Zimmer <nzimmer@sgi.com> wrote:

> > So are all these games with local variable `llseek' still needed?
> > afaict the increment of pde_users will stabilize ->fops?
> We still are setting de->proc_fops to NULL to prevent new callers.
> Also we still have to save fops-> since we cannot use fops outside the 
> rcu_read_un/lock.
> Unless I misunderstood your question.

ah, so we are.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 resend] procfs: Improve Scaling in proc
  2013-02-15 20:47 [PATCH v3 resend] procfs: Improve Scaling in proc Nathan Zimmer
  2013-02-15 22:12 ` Andrew Morton
@ 2013-03-04 20:16 ` Stephen Warren
  2013-03-04 21:00   ` Nathan Zimmer
  1 sibling, 1 reply; 8+ messages in thread
From: Stephen Warren @ 2013-03-04 20:16 UTC (permalink / raw)
  To: Nathan Zimmer
  Cc: viro, eric.dumazet, linux-kernel, linux-fsdevel, Andrew Morton,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney

On 02/15/2013 01:47 PM, Nathan Zimmer wrote:
> I am currently tracking a hotlock reported by a customer on a large system,
> 512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
> this way for a very long time.
> The offending lock is proc_dir_entry->pde_unload_lock.
> 
> This patch converts the replaces the lock with the rcu. However the pde_openers
> list still is controlled by a spin lock. I tested on a 4096 machine and the lock
> doesn't seem hot at least according to perf.

Nathan,

This patch causes RCU errors on my ARM systems (which have 2 or 4 CPUs)
during boot; see the log below. Reverting it fixes the problem. Let me
know if you need any more information to diagnose the problem. Thanks.

> [ 3.652115] VFS: Mounted root (ext3 filesystem) on device 179:1.
> [ 3.664856] devtmpfs: mounted
> [ 3.668877] Freeing init memory: 212K
...
> [ 25.765891] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=2102 jiffies, g=4294967050, c=4294967049, q=5749)
> [ 25.777786] INFO: Stall ended before state dump start
> [ 88.815886] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=8407 jiffies, g=4294967050, c=4294967049, q=5818)
> [ 88.827777] INFO: Stall ended before state dump start
> [ 151.865886] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=14712 jiffies, g=4294967050, c=4294967049, q=5869)
> [ 151.877863] INFO: Stall ended before state dump start
> [ 214.915885] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=21017 jiffies, g=4294967050, c=4294967049, q=5869)
> [ 214.927857] INFO: Stall ended before state dump start
> [ 240.525981] INFO: task mountall:71 blocked for more than 120 seconds.
> [ 240.532435] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 240.541452] INFO: task init:150 blocked for more than 120 seconds.
> [ 240.547687] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 240.556352] INFO: task udevd:151 blocked for more than 120 seconds.
> [ 240.562623] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 240.571262] INFO: task ata_id:197 blocked for more than 120 seconds.
> [ 240.577662] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 277.965881] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 0, t=27322 jiffies, g=4294967050, c=4294967049, q=5869)
> [ 277.977860] INFO: Stall ended before state dump start 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH v3 resend] procfs: Improve Scaling in proc
  2013-03-04 20:16 ` Stephen Warren
@ 2013-03-04 21:00   ` Nathan Zimmer
  2013-03-04 22:00     ` Stephen Warren
  0 siblings, 1 reply; 8+ messages in thread
From: Nathan Zimmer @ 2013-03-04 21:00 UTC (permalink / raw)
  To: Stephen Warren
  Cc: viro, eric.dumazet, linux-kernel, linux-fsdevel, Andrew Morton,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney

I found and fixed a number of them, with the help of sparse. 
http://marc.info/?l=linux-fsdevel&m=136242660212004&w=2
If there are any I didn't find and fix let me know.


________________________________________
From: Stephen Warren [swarren@wwwdotorg.org]
Sent: Monday, March 04, 2013 2:16 PM
To: Nathan Zimmer
Cc: viro@zeniv.linux.org.uk; eric.dumazet@gmail.com; linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org; Andrew Morton; Eric W. Biederman; David Woodhouse; Alexey Dobriyan; Paul E. McKenney
Subject: Re: [PATCH v3 resend] procfs: Improve Scaling in proc

On 02/15/2013 01:47 PM, Nathan Zimmer wrote:
> I am currently tracking a hotlock reported by a customer on a large system,
> 512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
> this way for a very long time.
> The offending lock is proc_dir_entry->pde_unload_lock.
>
> This patch converts the replaces the lock with the rcu. However the pde_openers
> list still is controlled by a spin lock. I tested on a 4096 machine and the lock
> doesn't seem hot at least according to perf.

Nathan,

This patch causes RCU errors on my ARM systems (which have 2 or 4 CPUs)
during boot; see the log below. Reverting it fixes the problem. Let me
know if you need any more information to diagnose the problem. Thanks.

> [ 3.652115] VFS: Mounted root (ext3 filesystem) on device 179:1.
> [ 3.664856] devtmpfs: mounted
> [ 3.668877] Freeing init memory: 212K
...
> [ 25.765891] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=2102 jiffies, g=4294967050, c=4294967049, q=5749)
> [ 25.777786] INFO: Stall ended before state dump start
> [ 88.815886] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=8407 jiffies, g=4294967050, c=4294967049, q=5818)
> [ 88.827777] INFO: Stall ended before state dump start
> [ 151.865886] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=14712 jiffies, g=4294967050, c=4294967049, q=5869)
> [ 151.877863] INFO: Stall ended before state dump start
> [ 214.915885] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=21017 jiffies, g=4294967050, c=4294967049, q=5869)
> [ 214.927857] INFO: Stall ended before state dump start
> [ 240.525981] INFO: task mountall:71 blocked for more than 120 seconds.
> [ 240.532435] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 240.541452] INFO: task init:150 blocked for more than 120 seconds.
> [ 240.547687] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 240.556352] INFO: task udevd:151 blocked for more than 120 seconds.
> [ 240.562623] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 240.571262] INFO: task ata_id:197 blocked for more than 120 seconds.
> [ 240.577662] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 277.965881] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 0, t=27322 jiffies, g=4294967050, c=4294967049, q=5869)
> [ 277.977860] INFO: Stall ended before state dump start

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 resend] procfs: Improve Scaling in proc
  2013-03-04 21:00   ` Nathan Zimmer
@ 2013-03-04 22:00     ` Stephen Warren
  0 siblings, 0 replies; 8+ messages in thread
From: Stephen Warren @ 2013-03-04 22:00 UTC (permalink / raw)
  To: Nathan Zimmer
  Cc: viro, eric.dumazet, linux-kernel, linux-fsdevel, Andrew Morton,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney, linux-next

On 03/04/2013 02:00 PM, Nathan Zimmer wrote:
> Stephen Warren wrote at Monday, March 04, 2013 2:16 PM:
>> On 02/15/2013 01:47 PM, Nathan Zimmer wrote:
>>> I am currently tracking a hotlock reported by a customer on a large system,
>>> 512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
>>> this way for a very long time.
>>> The offending lock is proc_dir_entry->pde_unload_lock.
>>>
>>> This patch converts the replaces the lock with the rcu. However the pde_openers
>>> list still is controlled by a spin lock. I tested on a 4096 machine and the lock
>>> doesn't seem hot at least according to perf.
>> 
>> Nathan,
>> 
>> This patch causes RCU errors on my ARM systems (which have 2 or 4 CPUs)
>> during boot; see the log below. Reverting it fixes the problem. Let me
>> know if you need any more information to diagnose the problem. Thanks.
>
> I found and fixed a number of them, with the help of sparse. 
> http://marc.info/?l=linux-fsdevel&m=136242660212004&w=2
> If there are any I didn't find and fix let me know.

Yes, if I revert the version of this patch that's in next-20130304, and
replace it with V5 that you linked to above, then the problem is no
longer present. Thanks.

>> [ 3.652115] VFS: Mounted root (ext3 filesystem) on device 179:1.
>> [ 3.664856] devtmpfs: mounted
>> [ 3.668877] Freeing init memory: 212K
> ...
>> [ 25.765891] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=2102 jiffies, g=4294967050, c=4294967049, q=5749)
>> [ 25.777786] INFO: Stall ended before state dump start
>> [ 88.815886] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=8407 jiffies, g=4294967050, c=4294967049, q=5818)
>> [ 88.827777] INFO: Stall ended before state dump start
>> [ 151.865886] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=14712 jiffies, g=4294967050, c=4294967049, q=5869)
>> [ 151.877863] INFO: Stall ended before state dump start
>> [ 214.915885] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 1, t=21017 jiffies, g=4294967050, c=4294967049, q=5869)
>> [ 214.927857] INFO: Stall ended before state dump start
>> [ 240.525981] INFO: task mountall:71 blocked for more than 120 seconds.
>> [ 240.532435] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 240.541452] INFO: task init:150 blocked for more than 120 seconds.
>> [ 240.547687] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 240.556352] INFO: task udevd:151 blocked for more than 120 seconds.
>> [ 240.562623] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 240.571262] INFO: task ata_id:197 blocked for more than 120 seconds.
>> [ 240.577662] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [ 277.965881] INFO: rcu_preempt detected stalls on CPUs/tasks: {} (detected by 0, t=27322 jiffies, g=4294967050, c=4294967049, q=5869)
>> [ 277.977860] INFO: Stall ended before state dump start--


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 resend] procfs: Improve Scaling in proc
       [not found] <fa.Y6PHWInLhuIydVYAcT5/eAuyF3Y@ifi.uio.no>
@ 2013-02-22  7:06 ` Zheng Huai Cheng
  0 siblings, 0 replies; 8+ messages in thread
From: Zheng Huai Cheng @ 2013-02-22  7:06 UTC (permalink / raw)
  To: fa.linux.kernel
  Cc: viro, eric.dumazet, linux-kernel, Nathan Zimmer, Andrew Morton,
	Eric W. Biederman, David Woodhouse, Alexey Dobriyan,
	Paul E. McKenney, zhenghch

在 2013年2月16日星期六UTC+8上午4时49分15秒,Nathan Zimmer写道:
> I am currently tracking a hotlock reported by a customer on a large system,
> 
> 512 cores.  I am currently running 3.8-rc7 but the issue looks like it has been
> 
> this way for a very long time.
> 
> The offending lock is proc_dir_entry->pde_unload_lock.
> 
> 
> 
> This patch converts the replaces the lock with the rcu. However the pde_openers
> 
> list still is controlled by a spin lock. I tested on a 4096 machine and the lock
> 
> doesn't seem hot at least according to perf.
> 
> 
> 
> This is a refresh/resend of what was orignally suggested by Eric Dumazet some 
> 
> time ago.  
> 
> 
> 
> Supporting numbers, lower is better, they are from the test I posted earlier.
> 
> cpuinfo baseline        Rcu
> 
> tasks   read-sec        read-sec
> 
> 1       0.0141          0.0141
> 
> 2       0.0140          0.0142
> 
> 4       0.0140          0.0141
> 
> 8       0.0145          0.0140
> 
> 16      0.0553          0.0168
> 
> 32      0.1688          0.0549
> 
> 64      0.5017          0.1690
> 
> 128     1.7005          0.5038
> 
> 256     5.2513          2.0804
> 
> 512     8.0529          3.0162
> 
> 
> 
> Cc: Andrew Morton <akpm@linux-foundation.org>
> 
> Cc: "Eric W. Biederman" <ebiederm@xmission.com> 
> 
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> 
> Cc: David Woodhouse <dwmw2@infradead.org>
> 
> Cc: Alexey Dobriyan <adobriyan@gmail.com>
> 
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> 
> Signed-off-by: Nathan Zimmer <nzimmer@sgi.com>
> 
> ---
> 
>  fs/proc/generic.c       |  58 +++++++++---------
> 
>  fs/proc/inode.c         | 156 ++++++++++++++++++++++++++----------------------
> 
>  include/linux/proc_fs.h |   6 +-
> 
>  3 files changed, 118 insertions(+), 102 deletions(-)
> 
> 
> 
> diff --git a/fs/proc/generic.c b/fs/proc/generic.c
> 
> index 76ddae8..6896a70 100644
> 
> --- a/fs/proc/generic.c
> 
> +++ b/fs/proc/generic.c
> 
> @@ -191,13 +191,16 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	ssize_t rv = -EIO;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	const struct file_operations *fops;
> 
> +
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	rv = __proc_file_read(file, buf, nbytes, ppos);
> 
>  
> 
> @@ -213,13 +216,16 @@ proc_file_write(struct file *file, const char __user *buffer,
> 
>  	ssize_t rv = -EIO;
> 
>  
> 
>  	if (pde->write_proc) {
> 
> -		spin_lock(&pde->pde_unload_lock);
> 
> -		if (!pde->proc_fops) {
> 
> -			spin_unlock(&pde->pde_unload_lock);
> 
> +		const struct file_operations *fops;
> 
> +
> 
> +		rcu_read_lock();
> 
> +		fops = rcu_dereference(pde->proc_fops);
> 
> +		if (!fops) {
> 
> +			rcu_read_unlock();
> 
>  			return rv;
> 
>  		}
> 
> -		pde->pde_users++;
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +		atomic_inc(&pde->pde_users);
> 
> +		rcu_read_unlock();
> 
>  
> 
>  		/* FIXME: does this routine need ppos?  probably... */
> 
>  		rv = pde->write_proc(file, buffer, count, pde->data);
> 
> @@ -558,7 +564,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
> 
>  
> 
>  	if (S_ISDIR(dp->mode)) {
> 
>  		if (dp->proc_iops == NULL) {
> 
> -			dp->proc_fops = &proc_dir_operations;
> 
> +			RCU_INIT_POINTER(dp->proc_fops, &proc_dir_operations);
> 
>  			dp->proc_iops = &proc_dir_inode_operations;
> 
>  		}
> 
>  		dir->nlink++;
> 
> @@ -567,7 +573,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
> 
>  			dp->proc_iops = &proc_link_inode_operations;
> 
>  	} else if (S_ISREG(dp->mode)) {
> 
>  		if (dp->proc_fops == NULL)
> 
> -			dp->proc_fops = &proc_file_operations;
> 
> +			RCU_INIT_POINTER(dp->proc_fops, &proc_file_operations);
> 
>  		if (dp->proc_iops == NULL)
> 
>  			dp->proc_iops = &proc_file_inode_operations;
> 
>  	}
> 
> @@ -620,7 +626,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
> 
>  	ent->mode = mode;
> 
>  	ent->nlink = nlink;
> 
>  	atomic_set(&ent->count, 1);
> 
> -	spin_lock_init(&ent->pde_unload_lock);
> 
> +	atomic_set(&ent->pde_users, 1);
> 
> +	spin_lock_init(&ent->pde_openers_lock);
> 
>  	INIT_LIST_HEAD(&ent->pde_openers);
> 
>  out:
> 
>  	return ent;
> 
> @@ -744,7 +751,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
> 
>  	pde = __proc_create(&parent, name, mode, nlink);
> 
>  	if (!pde)
> 
>  		goto out;
> 
> -	pde->proc_fops = proc_fops;
> 
> +	rcu_assign_pointer(pde->proc_fops, proc_fops);
> 
>  	pde->data = data;
> 
>  	if (proc_register(parent, pde) < 0)
> 
>  		goto out_free;
> 
> @@ -802,37 +809,30 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
> 
>  		return;
> 
>  	}
> 
>  
> 
> -	spin_lock(&de->pde_unload_lock);
> 
>  	/*
> 
>  	 * Stop accepting new callers into module. If you're
> 
>  	 * dynamically allocating ->proc_fops, save a pointer somewhere.
> 
>  	 */
> 
> -	de->proc_fops = NULL;
> 
> -	/* Wait until all existing callers into module are done. */
> 
> -	if (de->pde_users > 0) {
> 
> -		DECLARE_COMPLETION_ONSTACK(c);
> 
> -
> 
> -		if (!de->pde_unload_completion)
> 
> -			de->pde_unload_completion = &c;
> 
>  
> 
> -		spin_unlock(&de->pde_unload_lock);
> 
> +	rcu_assign_pointer(de->proc_fops, NULL);
> 
> +	synchronize_rcu();
> 
> +	/* Wait until all existing callers into module are done. */
> 
>  
> 
> +	DECLARE_COMPLETION_ONSTACK(c);
> 
> +	de->pde_unload_completion = &c;
> 
> +	if (!atomic_dec_and_test(&de->pde_users))
> 
>  		wait_for_completion(de->pde_unload_completion);
> 
>  
> 
> -		spin_lock(&de->pde_unload_lock);
> 
> -	}
> 
> -
> 
> +	spin_lock(&de->pde_openers_lock);
> 
>  	while (!list_empty(&de->pde_openers)) {
> 
>  		struct pde_opener *pdeo;
> 
>  
> 
>  		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
> 
>  		list_del(&pdeo->lh);
> 
> -		spin_unlock(&de->pde_unload_lock);
> 
>  		pdeo->release(pdeo->inode, pdeo->file);
> 
>  		kfree(pdeo);
> 
> -		spin_lock(&de->pde_unload_lock);
> 
>  	}
> 
> -	spin_unlock(&de->pde_unload_lock);
> 
> +	spin_unlock(&de->pde_openers_lock);
> 
>  
> 
>  	if (S_ISDIR(de->mode))
> 
>  		parent->nlink--;
> 
> diff --git a/fs/proc/inode.c b/fs/proc/inode.c
> 
> index 439ae688..031af27 100644
> 
> --- a/fs/proc/inode.c
> 
> +++ b/fs/proc/inode.c
> 
> @@ -128,46 +128,40 @@ static const struct super_operations proc_sops = {
> 
>  	.show_options	= proc_show_options,
> 
>  };
> 
>  
> 
> -static void __pde_users_dec(struct proc_dir_entry *pde)
> 
> -{
> 
> -	pde->pde_users--;
> 
> -	if (pde->pde_unload_completion && pde->pde_users == 0)
> 
> -		complete(pde->pde_unload_completion);
> 
> -}
> 
> -
> 
>  void pde_users_dec(struct proc_dir_entry *pde)
> 
>  {
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	__pde_users_dec(pde);
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	if (atomic_dec_and_test(&pde->pde_users) && pde->pde_unload_completion)
> 
> +		complete(pde->pde_unload_completion);
> 
>  }
> 
>  
> 
>  static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
> 
>  {
> 
> +	const struct file_operations *fops;
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	loff_t rv = -EINVAL;
> 
>  	loff_t (*llseek)(struct file *, loff_t, int);
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
>  	/*
> 
>  	 * remove_proc_entry() is going to delete PDE (as part of module
> 
>  	 * cleanup sequence). No new callers into module allowed.
> 
>  	 */
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
>  	/*
> 
>  	 * Bump refcount so that remove_proc_entry will wail for ->llseek to
> 
>  	 * complete.
> 
>  	 */
> 
> -	pde->pde_users++;
> 
> +	atomic_inc(&pde->pde_users);
> 
>  	/*
> 
>  	 * Save function pointer under lock, to protect against ->proc_fops
> 
>  	 * NULL'ifying right after ->pde_unload_lock is dropped.
> 
>  	 */
> 
> -	llseek = pde->proc_fops->llseek;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	llseek = fops->llseek;
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (!llseek)
> 
>  		llseek = default_llseek;
> 
> @@ -182,15 +176,17 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	ssize_t rv = -EIO;
> 
>  	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	read = pde->proc_fops->read;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	read = fops->read;
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (read)
> 
>  		rv = read(file, buf, count, ppos);
> 
> @@ -204,15 +200,17 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	ssize_t rv = -EIO;
> 
>  	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	write = pde->proc_fops->write;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	write = fops->write;
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (write)
> 
>  		rv = write(file, buf, count, ppos);
> 
> @@ -226,15 +224,17 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	unsigned int rv = DEFAULT_POLLMASK;
> 
>  	unsigned int (*poll)(struct file *, struct poll_table_struct *);
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	poll = pde->proc_fops->poll;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	poll = fops->poll;
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (poll)
> 
>  		rv = poll(file, pts);
> 
> @@ -248,15 +248,17 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	long rv = -ENOTTY;
> 
>  	long (*ioctl)(struct file *, unsigned int, unsigned long);
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	ioctl = pde->proc_fops->unlocked_ioctl;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	ioctl = fops->unlocked_ioctl;
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (ioctl)
> 
>  		rv = ioctl(file, cmd, arg);
> 
> @@ -271,15 +273,17 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	long rv = -ENOTTY;
> 
>  	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> +	atomic_inc(&pde->pde_users);
> 
>  	compat_ioctl = pde->proc_fops->compat_ioctl;
should this be compat_ioctl = fops->compat_ioctl ?

> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (compat_ioctl)
> 
>  		rv = compat_ioctl(file, cmd, arg);
> 
> @@ -294,15 +298,17 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
> 
>  	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
> 
>  	int rv = -EIO;
> 
>  	int (*mmap)(struct file *, struct vm_area_struct *);
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> +	atomic_inc(&pde->pde_users);
> 
>  	mmap = pde->proc_fops->mmap;
also for this, should it be compat_ioctl = fops->compat_ioctl ?

> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_unlock();
> 
>  
> 
>  	if (mmap)
> 
>  		rv = mmap(file, vma);
> 
> @@ -318,6 +324,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
> 
>  	int (*open)(struct inode *, struct file *);
> 
>  	int (*release)(struct inode *, struct file *);
> 
>  	struct pde_opener *pdeo;
> 
> +	const struct file_operations *fops;
> 
>  
> 
>  	/*
> 
>  	 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
> 
> @@ -333,32 +340,33 @@ static int proc_reg_open(struct inode *inode, struct file *file)
> 
>  	if (!pdeo)
> 
>  		return -ENOMEM;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
> -	if (!pde->proc_fops) {
> 
> -		spin_unlock(&pde->pde_unload_lock);
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
> +		rcu_read_unlock();
> 
>  		kfree(pdeo);
> 
>  		return -ENOENT;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	open = pde->proc_fops->open;
> 
> -	release = pde->proc_fops->release;
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	open = fops->open;
> 
> +	release = fops->release;
> 
> +	rcu_read_lock();
> 
>  
> 
>  	if (open)
> 
>  		rv = open(inode, file);
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
>  	if (rv == 0 && release) {
> 
>  		/* To know what to release. */
> 
>  		pdeo->inode = inode;
> 
>  		pdeo->file = file;
> 
>  		/* Strictly for "too late" ->release in proc_reg_release(). */
> 
>  		pdeo->release = release;
> 
> +		spin_lock(&pde->pde_openers_lock);
> 
>  		list_add(&pdeo->lh, &pde->pde_openers);
> 
> +		spin_unlock(&pde->pde_openers_lock);
> 
>  	} else
> 
>  		kfree(pdeo);
> 
> -	__pde_users_dec(pde);
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	pde_users_dec(pde);
> 
>  	return rv;
> 
>  }
> 
>  
> 
> @@ -367,10 +375,14 @@ static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
> 
>  {
> 
>  	struct pde_opener *pdeo;
> 
>  
> 
> +	spin_lock(&pde->pde_openers_lock);
> 
>  	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
> 
> -		if (pdeo->inode == inode && pdeo->file == file)
> 
> +		if (pdeo->inode == inode && pdeo->file == file) {
> 
> +			spin_unlock(&pde->pde_openers_lock);
> 
>  			return pdeo;
> 
> +		}
> 
>  	}
> 
> +	spin_unlock(&pde->pde_openers_lock);
> 
>  	return NULL;
> 
>  }
> 
>  
> 
> @@ -380,10 +392,12 @@ static int proc_reg_release(struct inode *inode, struct file *file)
> 
>  	int rv = 0;
> 
>  	int (*release)(struct inode *, struct file *);
> 
>  	struct pde_opener *pdeo;
> 
> +	const struct file_operations *fops;
> 
>  
> 
> -	spin_lock(&pde->pde_unload_lock);
> 
>  	pdeo = find_pde_opener(pde, inode, file);
> 
> -	if (!pde->proc_fops) {
> 
> +	rcu_read_lock();
> 
> +	fops = rcu_dereference(pde->proc_fops);
> 
> +	if (!fops) {
> 
>  		/*
> 
>  		 * Can't simply exit, __fput() will think that everything is OK,
> 
>  		 * and move on to freeing struct file. remove_proc_entry() will
> 
> @@ -393,21 +407,23 @@ static int proc_reg_release(struct inode *inode, struct file *file)
> 
>  		 * But if opener is removed from list, who will ->release it?
> 
>  		 */
> 
>  		if (pdeo) {
> 
> +			spin_lock(&pde->pde_openers_lock);
> 
>  			list_del(&pdeo->lh);
> 
> -			spin_unlock(&pde->pde_unload_lock);
> 
> +			spin_unlock(&pde->pde_openers_lock);
> 
>  			rv = pdeo->release(inode, file);
> 
>  			kfree(pdeo);
> 
> -		} else
> 
> -			spin_unlock(&pde->pde_unload_lock);
> 
> +		}
> 
>  		return rv;
> 
>  	}
> 
> -	pde->pde_users++;
> 
> -	release = pde->proc_fops->release;
> 
> +	atomic_inc(&pde->pde_users);
> 
> +	release = fops->release;
> 
> +	rcu_read_unlock();
> 
>  	if (pdeo) {
> 
> +		spin_lock(&pde->pde_openers_lock);
> 
>  		list_del(&pdeo->lh);
> 
> -		kfree(pdeo);
> 
> +		spin_unlock(&pde->pde_openers_lock);
> 
>  	}
> 
> -	spin_unlock(&pde->pde_unload_lock);
> 
> +	kfree(pdeo);
> 
>  
> 
>  	if (release)
> 
>  		rv = release(inode, file);
> 
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> 
> index 32676b3..82628d9 100644
> 
> --- a/include/linux/proc_fs.h
> 
> +++ b/include/linux/proc_fs.h
> 
> @@ -68,16 +68,16 @@ struct proc_dir_entry {
> 
>  	 * If you're allocating ->proc_fops dynamically, save a pointer
> 
>  	 * somewhere.
> 
>  	 */
> 
> -	const struct file_operations *proc_fops;
> 
> +	const struct file_operations __rcu *proc_fops;
> 
>  	struct proc_dir_entry *next, *parent, *subdir;
> 
>  	void *data;
> 
>  	read_proc_t *read_proc;
> 
>  	write_proc_t *write_proc;
> 
>  	atomic_t count;		/* use count */
> 
> -	int pde_users;	/* number of callers into module in progress */
> 
> +	atomic_t pde_users;	/* number of callers into module in progress */
> 
>  	struct completion *pde_unload_completion;
> 
> +	spinlock_t pde_openers_lock;
> 
>  	struct list_head pde_openers;	/* who did ->open, but not ->release */
> 
> -	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
> 
>  	u8 namelen;
> 
>  	char name[];
> 
>  };
> 
> -- 
> 
> 1.8.1.2
> 
> 
> 
> --
> 
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> 
> the body of a message to majordomo@vger.kernel.org
> 
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2013-03-04 22:01 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-15 20:47 [PATCH v3 resend] procfs: Improve Scaling in proc Nathan Zimmer
2013-02-15 22:12 ` Andrew Morton
2013-02-15 23:39   ` Nathan Zimmer
2013-02-15 23:44     ` Andrew Morton
2013-03-04 20:16 ` Stephen Warren
2013-03-04 21:00   ` Nathan Zimmer
2013-03-04 22:00     ` Stephen Warren
     [not found] <fa.Y6PHWInLhuIydVYAcT5/eAuyF3Y@ifi.uio.no>
2013-02-22  7:06 ` Zheng Huai Cheng

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.