From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751013AbdBSImx (ORCPT ); Sun, 19 Feb 2017 03:42:53 -0500 Received: from zeniv.linux.org.uk ([195.92.253.2]:59502 "EHLO ZenIV.linux.org.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750812AbdBSImw (ORCPT ); Sun, 19 Feb 2017 03:42:52 -0500 Date: Sun, 19 Feb 2017 08:42:12 +0000 From: Al Viro To: Konstantin Khlebnikov Cc: linux-kernel@vger.kernel.org, Andrew Morton , "Eric W. Biederman" Subject: Re: [PATCH] proc/sysctl: prune stale dentries during unregistering Message-ID: <20170219084202.GJ29622@ZenIV.linux.org.uk> References: <20170209084016.GL13195@ZenIV.linux.org.uk> <148671210259.52694.13774349516906955456.stgit@buzz> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.7.1 (2016-10-04) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Sat, Feb 18, 2017 at 09:55:28PM +0300, Konstantin Khlebnikov wrote: > This patch has locking problem. I've got lockdep splat under LTP. > > d_lock nests inside i_lock > sysctl_lock nests inside d_lock in d_compare > > This patch adds i_lock nesting inside sysctl_lock. Once ->unregistering is set, you can drop sysctl_lock just fine. So I'd try something like this - use rcu_read_lock() in proc_sys_prune_dcache(), drop sysctl_lock() before it and regain after. Make sure that no inodes are added to the list ones ->unregistering has been set and use RCU list primitives for modifying the inode list, with sysctl_lock still used to serialize its modifications. Freeing struct inode is RCU-delayed (see proc_destroy_inode()), so doing igrab() is safe there. Since we don't drop inode reference until after we'd passed beyond it in the list, list_for_each_entry_rcu() should be fine, AFAICS. Below is a completely untested modification of your patch along those lines: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 842a5ff5b85c..7ad9ed7958af 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode) de = PDE(inode); if (de) pde_put(de); + head = PROC_I(inode)->sysctl; if (head) { RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); - sysctl_head_put(head); + proc_sys_evict_inode(inode, head); } } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 2de5194ba378..ed1d762160e6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -65,6 +65,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; + struct list_head sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; @@ -249,10 +250,12 @@ extern void proc_thread_self_init(void); */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); -extern void sysctl_head_put(struct ctl_table_header *); +extern void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } -static inline void sysctl_head_put(struct ctl_table_header *head) { } +static inline void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head) { } #endif /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55313d994895..6477c4a2dc6c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head, head->set = set; head->parent = NULL; head->node = node; + INIT_LIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; for (entry = table; entry->procname; entry++, node++) @@ -259,6 +260,26 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } +static void proc_sys_prune_dcache(struct ctl_table_header *head) +{ + struct inode *inode, *prev = NULL; + struct proc_inode *ei; + + rcu_read_lock(); + list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) { + inode = igrab(&ei->vfs_inode); + if (inode) { + rcu_read_unlock(); + iput(prev); + prev = inode; + d_prune_aliases(inode); + rcu_read_lock(); + } + } + rcu_read_unlock(); + iput(prev); +} + /* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { @@ -272,31 +293,22 @@ static void start_unregistering(struct ctl_table_header *p) p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); - spin_lock(&sysctl_lock); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); + spin_unlock(&sysctl_lock); } /* + * Prune dentries for unregistered sysctls: namespaced sysctls + * can have duplicate names and contaminate dcache very badly. + */ + proc_sys_prune_dcache(p); + /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ - erase_header(p); -} - -static void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); + erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) @@ -440,10 +452,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_ino = get_next_ino(); - sysctl_head_get(head); ei = PROC_I(inode); + + spin_lock(&sysctl_lock); + if (unlikely(head->unregistering)) { + spin_unlock(&sysctl_lock); + iput(inode); + inode = NULL; + goto out; + } ei->sysctl = head; ei->sysctl_entry = table; + list_add_rcu(&ei->sysctl_inodes, &head->inodes); + head->count++; + spin_unlock(&sysctl_lock); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = table->mode; @@ -466,6 +488,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, return inode; } +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + list_del_rcu(&PROC_I(inode)->sysctl_inodes); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index adf4e51cf597..b7e82049fec7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -143,6 +143,7 @@ struct ctl_table_header struct ctl_table_set *set; struct ctl_dir *parent; struct ctl_node *node; + struct list_head inodes; /* head for proc_inode->sysctl_inodes */ }; struct ctl_dir {