linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] uprobes: Kill uprobes_mutex[]
@ 2012-11-25 22:33 Oleg Nesterov
  2012-11-25 22:33 ` [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead Oleg Nesterov
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Oleg Nesterov @ 2012-11-25 22:33 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, linux-kernel

Hello.

This series applies on top of "[PATCH 0/4] uprobes: locking changes for filtering".

No functional changes. But uprobes_mutex[] must die, imho.
And alloc_uprobe() should be separated from __uprobe_register().

Oleg.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead
  2012-11-25 22:33 [PATCH 0/3] uprobes: Kill uprobes_mutex[] Oleg Nesterov
@ 2012-11-25 22:33 ` Oleg Nesterov
  2012-12-13 14:35   ` Srikar Dronamraju
  2013-01-02 12:56   ` Anton Arapov
  2012-11-25 22:33 ` [PATCH 2/3] uprobes: Introduce uprobe_is_active() Oleg Nesterov
  2012-11-25 22:33 ` [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() Oleg Nesterov
  2 siblings, 2 replies; 11+ messages in thread
From: Oleg Nesterov @ 2012-11-25 22:33 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, linux-kernel

uprobe_events counts the number of uprobes in uprobes_tree but
it is used as a boolean. We can use RB_EMPTY_ROOT() instead.

Probably no_uprobe_events() added by this patch can have more
callers, say, mmf_recalc_uprobes().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |   19 +++++++------------
 1 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1e047f8..53dc2eb 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -41,6 +41,11 @@
 #define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE
 
 static struct rb_root uprobes_tree = RB_ROOT;
+/*
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time.  Probably a fine grained per inode count is better?
+ */
+#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)
 
 static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
 
@@ -74,13 +79,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 
 static struct percpu_rw_semaphore dup_mmap_sem;
 
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time.  Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
-
 /* Have a copy of original instruction */
 #define UPROBE_COPY_INSN	0
 /* Can skip singlestep */
@@ -460,8 +458,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 		kfree(uprobe);
 		uprobe = cur_uprobe;
 		iput(inode);
-	} else {
-		atomic_inc(&uprobe_events);
 	}
 
 	return uprobe;
@@ -685,7 +681,6 @@ static void delete_uprobe(struct uprobe *uprobe)
 	spin_unlock(&uprobes_treelock);
 	iput(uprobe->inode);
 	put_uprobe(uprobe);
-	atomic_dec(&uprobe_events);
 }
 
 struct map_info {
@@ -975,7 +970,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 	struct uprobe *uprobe, *u;
 	struct inode *inode;
 
-	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+	if (no_uprobe_events() || !valid_vma(vma, true))
 		return 0;
 
 	inode = vma->vm_file->f_mapping->host;
@@ -1021,7 +1016,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
  */
 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
-	if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+	if (no_uprobe_events() || !valid_vma(vma, false))
 		return;
 
 	if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/3] uprobes: Introduce uprobe_is_active()
  2012-11-25 22:33 [PATCH 0/3] uprobes: Kill uprobes_mutex[] Oleg Nesterov
  2012-11-25 22:33 ` [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead Oleg Nesterov
@ 2012-11-25 22:33 ` Oleg Nesterov
  2013-01-02 12:57   ` Anton Arapov
  2013-01-03  9:04   ` Srikar Dronamraju
  2012-11-25 22:33 ` [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() Oleg Nesterov
  2 siblings, 2 replies; 11+ messages in thread
From: Oleg Nesterov @ 2012-11-25 22:33 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, linux-kernel

The lifetime of uprobe->rb_node and uprobe->inode is not refcounted,
delete_uprobe() is called when we detect that uprobe has no consumers,
and it would be deadly wrong to do this twice.

Change delete_uprobe() to WARN() if it was already called. We use
RB_CLEAR_NODE() to mark uprobe "inactive", then RB_EMPTY_NODE() can
be used to detect this case.

RB_EMPTY_NODE() is not used directly, we add the trivial helper for
the next change.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 53dc2eb..2886c82 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -669,6 +669,10 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
 	return set_orig_insn(&uprobe->arch, mm, vaddr);
 }
 
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+	return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
 /*
  * There could be threads that have already hit the breakpoint. They
  * will recheck the current insn and restart if find_uprobe() fails.
@@ -676,9 +680,13 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
  */
 static void delete_uprobe(struct uprobe *uprobe)
 {
+	if (WARN_ON(!uprobe_is_active(uprobe)))
+		return;
+
 	spin_lock(&uprobes_treelock);
 	rb_erase(&uprobe->rb_node, &uprobes_tree);
 	spin_unlock(&uprobes_treelock);
+	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
 	iput(uprobe->inode);
 	put_uprobe(uprobe);
 }
-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register()
  2012-11-25 22:33 [PATCH 0/3] uprobes: Kill uprobes_mutex[] Oleg Nesterov
  2012-11-25 22:33 ` [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead Oleg Nesterov
  2012-11-25 22:33 ` [PATCH 2/3] uprobes: Introduce uprobe_is_active() Oleg Nesterov
@ 2012-11-25 22:33 ` Oleg Nesterov
  2013-01-02 12:57   ` Anton Arapov
  2013-01-03  9:05   ` Srikar Dronamraju
  2 siblings, 2 replies; 11+ messages in thread
From: Oleg Nesterov @ 2012-11-25 22:33 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, linux-kernel

uprobe_register() and uprobe_unregister() are the only users of
mutex_lock(uprobes_hash(inode)), and the only reason why we can't
simply remove it is that we need to ensure that delete_uprobe() is
not possible after alloc_uprobe() and before consumer_add().

IOW, we need to ensure that when we take uprobe->register_rwsem
this uprobe is still valid and we didn't race with _unregister()
which called delete_uprobe() in between.

With this patch uprobe_register() simply checks uprobe_is_active()
and retries if it hits this very unlikely race. uprobes_mutex[] is
no longer needed and can be removed.

There is another reason for this change, prepare_uprobe() should be
folded into alloc_uprobe() and we do not want to hold the extra locks
around read_mapping_page/etc.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |   51 +++++++++++++---------------------------------
 1 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2886c82..105ac0d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -50,29 +50,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
 static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
 
 #define UPROBES_HASH_SZ	13
-
-/*
- * We need separate register/unregister and mmap/munmap lock hashes because
- * of mmap_sem nesting.
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
- */
-
-/* serialize (un)register */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-
-#define uprobes_hash(v)		(&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
-
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 #define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
@@ -865,20 +842,26 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
 	if (offset > i_size_read(inode))
 		return -EINVAL;
 
-	ret = -ENOMEM;
-	mutex_lock(uprobes_hash(inode));
+ retry:
 	uprobe = alloc_uprobe(inode, offset);
-	if (uprobe) {
-		down_write(&uprobe->register_rwsem);
+	if (!uprobe)
+		return -ENOMEM;
+	/*
+	 * We can race with uprobe_unregister()->delete_uprobe().
+	 * Check uprobe_is_active() and retry if it is false.
+	 */
+	down_write(&uprobe->register_rwsem);
+	ret = -EAGAIN;
+	if (likely(uprobe_is_active(uprobe))) {
 		ret = __uprobe_register(uprobe, uc);
 		if (ret)
 			__uprobe_unregister(uprobe, uc);
-		up_write(&uprobe->register_rwsem);
 	}
-	mutex_unlock(uprobes_hash(inode));
-	if (uprobe)
-		put_uprobe(uprobe);
+	up_write(&uprobe->register_rwsem);
+	put_uprobe(uprobe);
 
+	if (unlikely(ret == -EAGAIN))
+		goto retry;
 	return ret;
 }
 
@@ -896,11 +879,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 	if (!uprobe)
 		return;
 
-	mutex_lock(uprobes_hash(inode));
 	down_write(&uprobe->register_rwsem);
 	__uprobe_unregister(uprobe, uc);
 	up_write(&uprobe->register_rwsem);
-	mutex_unlock(uprobes_hash(inode));
 	put_uprobe(uprobe);
 }
 
@@ -1609,10 +1590,8 @@ static int __init init_uprobes(void)
 {
 	int i;
 
-	for (i = 0; i < UPROBES_HASH_SZ; i++) {
-		mutex_init(&uprobes_mutex[i]);
+	for (i = 0; i < UPROBES_HASH_SZ; i++)
 		mutex_init(&uprobes_mmap_mutex[i]);
-	}
 
 	if (percpu_init_rwsem(&dup_mmap_sem))
 		return -ENOMEM;
-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead
  2012-11-25 22:33 ` [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead Oleg Nesterov
@ 2012-12-13 14:35   ` Srikar Dronamraju
  2012-12-23 19:21     ` Oleg Nesterov
  2013-01-02 12:56   ` Anton Arapov
  1 sibling, 1 reply; 11+ messages in thread
From: Srikar Dronamraju @ 2012-12-13 14:35 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, linux-kernel

* Oleg Nesterov <oleg@redhat.com> [2012-11-25 23:33:44]:

> uprobe_events counts the number of uprobes in uprobes_tree but
> it is used as a boolean. We can use RB_EMPTY_ROOT() instead.
> 

Nice idea.

> Probably no_uprobe_events() added by this patch can have more
> callers, say, mmf_recalc_uprobes().
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---

Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

>  kernel/events/uprobes.c |   19 +++++++------------
>  1 files changed, 7 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 1e047f8..53dc2eb 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -41,6 +41,11 @@
>  #define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE
> 
>  static struct rb_root uprobes_tree = RB_ROOT;
> +/*
> + * allows us to skip the uprobe_mmap if there are no uprobe events active
> + * at this time.  Probably a fine grained per inode count is better?
> + */
> +#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)
> 
>  static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
> 
> @@ -74,13 +79,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
> 
>  static struct percpu_rw_semaphore dup_mmap_sem;
> 
> -/*
> - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
> - * events active at this time.  Probably a fine grained per inode count is
> - * better?
> - */
> -static atomic_t uprobe_events = ATOMIC_INIT(0);
> -
>  /* Have a copy of original instruction */
>  #define UPROBE_COPY_INSN	0
>  /* Can skip singlestep */
> @@ -460,8 +458,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
>  		kfree(uprobe);
>  		uprobe = cur_uprobe;
>  		iput(inode);
> -	} else {
> -		atomic_inc(&uprobe_events);
>  	}
> 
>  	return uprobe;
> @@ -685,7 +681,6 @@ static void delete_uprobe(struct uprobe *uprobe)
>  	spin_unlock(&uprobes_treelock);
>  	iput(uprobe->inode);
>  	put_uprobe(uprobe);
> -	atomic_dec(&uprobe_events);
>  }
> 
>  struct map_info {
> @@ -975,7 +970,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
>  	struct uprobe *uprobe, *u;
>  	struct inode *inode;
> 
> -	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
> +	if (no_uprobe_events() || !valid_vma(vma, true))
>  		return 0;
> 
>  	inode = vma->vm_file->f_mapping->host;
> @@ -1021,7 +1016,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
>   */
>  void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
>  {
> -	if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
> +	if (no_uprobe_events() || !valid_vma(vma, false))
>  		return;
> 
>  	if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
> -- 
> 1.5.5.1
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead
  2012-12-13 14:35   ` Srikar Dronamraju
@ 2012-12-23 19:21     ` Oleg Nesterov
  0 siblings, 0 replies; 11+ messages in thread
From: Oleg Nesterov @ 2012-12-23 19:21 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, linux-kernel

On 12/13, Srikar Dronamraju wrote:
>
> * Oleg Nesterov <oleg@redhat.com> [2012-11-25 23:33:44]:

almost a month ago ;)

> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Thanks!

but what about 2/3 and 3/3?

I'd like to send the final (and technically trivial) series which
actually turns the filtering on.

Oleg.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead
  2012-11-25 22:33 ` [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead Oleg Nesterov
  2012-12-13 14:35   ` Srikar Dronamraju
@ 2013-01-02 12:56   ` Anton Arapov
  1 sibling, 0 replies; 11+ messages in thread
From: Anton Arapov @ 2013-01-02 12:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju,
	Ananth N Mavinakayanahalli, linux-kernel

On Sun, Nov 25, 2012 at 11:33:44PM +0100, Oleg Nesterov wrote:
> uprobe_events counts the number of uprobes in uprobes_tree but
> it is used as a boolean. We can use RB_EMPTY_ROOT() instead.
> 
> Probably no_uprobe_events() added by this patch can have more
> callers, say, mmf_recalc_uprobes().
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  kernel/events/uprobes.c |   19 +++++++------------
>  1 files changed, 7 insertions(+), 12 deletions(-)

Acked-by: Anton Arapov <anton@redhat.com>

> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 1e047f8..53dc2eb 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -41,6 +41,11 @@
>  #define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE
>  
>  static struct rb_root uprobes_tree = RB_ROOT;
> +/*
> + * allows us to skip the uprobe_mmap if there are no uprobe events active
> + * at this time.  Probably a fine grained per inode count is better?
> + */
> +#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)
>  
>  static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
>  
> @@ -74,13 +79,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
>  
>  static struct percpu_rw_semaphore dup_mmap_sem;
>  
> -/*
> - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
> - * events active at this time.  Probably a fine grained per inode count is
> - * better?
> - */
> -static atomic_t uprobe_events = ATOMIC_INIT(0);
> -
>  /* Have a copy of original instruction */
>  #define UPROBE_COPY_INSN	0
>  /* Can skip singlestep */
> @@ -460,8 +458,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
>  		kfree(uprobe);
>  		uprobe = cur_uprobe;
>  		iput(inode);
> -	} else {
> -		atomic_inc(&uprobe_events);
>  	}
>  
>  	return uprobe;
> @@ -685,7 +681,6 @@ static void delete_uprobe(struct uprobe *uprobe)
>  	spin_unlock(&uprobes_treelock);
>  	iput(uprobe->inode);
>  	put_uprobe(uprobe);
> -	atomic_dec(&uprobe_events);
>  }
>  
>  struct map_info {
> @@ -975,7 +970,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
>  	struct uprobe *uprobe, *u;
>  	struct inode *inode;
>  
> -	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
> +	if (no_uprobe_events() || !valid_vma(vma, true))
>  		return 0;
>  
>  	inode = vma->vm_file->f_mapping->host;
> @@ -1021,7 +1016,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
>   */
>  void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
>  {
> -	if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
> +	if (no_uprobe_events() || !valid_vma(vma, false))
>  		return;
>  
>  	if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
> -- 
> 1.5.5.1
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/3] uprobes: Introduce uprobe_is_active()
  2012-11-25 22:33 ` [PATCH 2/3] uprobes: Introduce uprobe_is_active() Oleg Nesterov
@ 2013-01-02 12:57   ` Anton Arapov
  2013-01-03  9:04   ` Srikar Dronamraju
  1 sibling, 0 replies; 11+ messages in thread
From: Anton Arapov @ 2013-01-02 12:57 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju,
	Ananth N Mavinakayanahalli, linux-kernel

On Sun, Nov 25, 2012 at 11:33:47PM +0100, Oleg Nesterov wrote:
> The lifetime of uprobe->rb_node and uprobe->inode is not refcounted,
> delete_uprobe() is called when we detect that uprobe has no consumers,
> and it would be deadly wrong to do this twice.
> 
> Change delete_uprobe() to WARN() if it was already called. We use
> RB_CLEAR_NODE() to mark uprobe "inactive", then RB_EMPTY_NODE() can
> be used to detect this case.
> 
> RB_EMPTY_NODE() is not used directly, we add the trivial helper for
> the next change.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  kernel/events/uprobes.c |    8 ++++++++
>  1 files changed, 8 insertions(+), 0 deletions(-)

Acked-by: Anton Arapov <anton@redhat.com>


> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 53dc2eb..2886c82 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -669,6 +669,10 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
>  	return set_orig_insn(&uprobe->arch, mm, vaddr);
>  }
>  
> +static inline bool uprobe_is_active(struct uprobe *uprobe)
> +{
> +	return !RB_EMPTY_NODE(&uprobe->rb_node);
> +}
>  /*
>   * There could be threads that have already hit the breakpoint. They
>   * will recheck the current insn and restart if find_uprobe() fails.
> @@ -676,9 +680,13 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
>   */
>  static void delete_uprobe(struct uprobe *uprobe)
>  {
> +	if (WARN_ON(!uprobe_is_active(uprobe)))
> +		return;
> +
>  	spin_lock(&uprobes_treelock);
>  	rb_erase(&uprobe->rb_node, &uprobes_tree);
>  	spin_unlock(&uprobes_treelock);
> +	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
>  	iput(uprobe->inode);
>  	put_uprobe(uprobe);
>  }
> -- 
> 1.5.5.1
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register()
  2012-11-25 22:33 ` [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() Oleg Nesterov
@ 2013-01-02 12:57   ` Anton Arapov
  2013-01-03  9:05   ` Srikar Dronamraju
  1 sibling, 0 replies; 11+ messages in thread
From: Anton Arapov @ 2013-01-02 12:57 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju,
	Ananth N Mavinakayanahalli, linux-kernel

On Sun, Nov 25, 2012 at 11:33:50PM +0100, Oleg Nesterov wrote:
> uprobe_register() and uprobe_unregister() are the only users of
> mutex_lock(uprobes_hash(inode)), and the only reason why we can't
> simply remove it is that we need to ensure that delete_uprobe() is
> not possible after alloc_uprobe() and before consumer_add().
> 
> IOW, we need to ensure that when we take uprobe->register_rwsem
> this uprobe is still valid and we didn't race with _unregister()
> which called delete_uprobe() in between.
> 
> With this patch uprobe_register() simply checks uprobe_is_active()
> and retries if it hits this very unlikely race. uprobes_mutex[] is
> no longer needed and can be removed.
> 
> There is another reason for this change, prepare_uprobe() should be
> folded into alloc_uprobe() and we do not want to hold the extra locks
> around read_mapping_page/etc.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  kernel/events/uprobes.c |   51 +++++++++++++---------------------------------
>  1 files changed, 15 insertions(+), 36 deletions(-)

Acked-by: Anton Arapov <anton@redhat.com>

> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2886c82..105ac0d 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -50,29 +50,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
>  static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
>  
>  #define UPROBES_HASH_SZ	13
> -
> -/*
> - * We need separate register/unregister and mmap/munmap lock hashes because
> - * of mmap_sem nesting.
> - *
> - * uprobe_register() needs to install probes on (potentially) all processes
> - * and thus needs to acquire multiple mmap_sems (consequtively, not
> - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
> - * for the particular process doing the mmap.
> - *
> - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
> - * because of lock order against i_mmap_mutex. This means there's a hole in
> - * the register vma iteration where a mmap() can happen.
> - *
> - * Thus uprobe_register() can race with uprobe_mmap() and we can try and
> - * install a probe where one is already installed.
> - */
> -
> -/* serialize (un)register */
> -static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
> -
> -#define uprobes_hash(v)		(&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
> -
>  /* serialize uprobe->pending_list */
>  static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
>  #define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
> @@ -865,20 +842,26 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
>  	if (offset > i_size_read(inode))
>  		return -EINVAL;
>  
> -	ret = -ENOMEM;
> -	mutex_lock(uprobes_hash(inode));
> + retry:
>  	uprobe = alloc_uprobe(inode, offset);
> -	if (uprobe) {
> -		down_write(&uprobe->register_rwsem);
> +	if (!uprobe)
> +		return -ENOMEM;
> +	/*
> +	 * We can race with uprobe_unregister()->delete_uprobe().
> +	 * Check uprobe_is_active() and retry if it is false.
> +	 */
> +	down_write(&uprobe->register_rwsem);
> +	ret = -EAGAIN;
> +	if (likely(uprobe_is_active(uprobe))) {
>  		ret = __uprobe_register(uprobe, uc);
>  		if (ret)
>  			__uprobe_unregister(uprobe, uc);
> -		up_write(&uprobe->register_rwsem);
>  	}
> -	mutex_unlock(uprobes_hash(inode));
> -	if (uprobe)
> -		put_uprobe(uprobe);
> +	up_write(&uprobe->register_rwsem);
> +	put_uprobe(uprobe);
>  
> +	if (unlikely(ret == -EAGAIN))
> +		goto retry;
>  	return ret;
>  }
>  
> @@ -896,11 +879,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
>  	if (!uprobe)
>  		return;
>  
> -	mutex_lock(uprobes_hash(inode));
>  	down_write(&uprobe->register_rwsem);
>  	__uprobe_unregister(uprobe, uc);
>  	up_write(&uprobe->register_rwsem);
> -	mutex_unlock(uprobes_hash(inode));
>  	put_uprobe(uprobe);
>  }
>  
> @@ -1609,10 +1590,8 @@ static int __init init_uprobes(void)
>  {
>  	int i;
>  
> -	for (i = 0; i < UPROBES_HASH_SZ; i++) {
> -		mutex_init(&uprobes_mutex[i]);
> +	for (i = 0; i < UPROBES_HASH_SZ; i++)
>  		mutex_init(&uprobes_mmap_mutex[i]);
> -	}
>  
>  	if (percpu_init_rwsem(&dup_mmap_sem))
>  		return -ENOMEM;
> -- 
> 1.5.5.1
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/3] uprobes: Introduce uprobe_is_active()
  2012-11-25 22:33 ` [PATCH 2/3] uprobes: Introduce uprobe_is_active() Oleg Nesterov
  2013-01-02 12:57   ` Anton Arapov
@ 2013-01-03  9:04   ` Srikar Dronamraju
  1 sibling, 0 replies; 11+ messages in thread
From: Srikar Dronamraju @ 2013-01-03  9:04 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, linux-kernel

* Oleg Nesterov <oleg@redhat.com> [2012-11-25 23:33:47]:

> The lifetime of uprobe->rb_node and uprobe->inode is not refcounted,
> delete_uprobe() is called when we detect that uprobe has no consumers,
> and it would be deadly wrong to do this twice.
> 
> Change delete_uprobe() to WARN() if it was already called. We use
> RB_CLEAR_NODE() to mark uprobe "inactive", then RB_EMPTY_NODE() can
> be used to detect this case.
> 
> RB_EMPTY_NODE() is not used directly, we add the trivial helper for
> the next change.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

> ---
>  kernel/events/uprobes.c |    8 ++++++++
>  1 files changed, 8 insertions(+), 0 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 53dc2eb..2886c82 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -669,6 +669,10 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
>  	return set_orig_insn(&uprobe->arch, mm, vaddr);
>  }
> 
> +static inline bool uprobe_is_active(struct uprobe *uprobe)
> +{
> +	return !RB_EMPTY_NODE(&uprobe->rb_node);
> +}
>  /*
>   * There could be threads that have already hit the breakpoint. They
>   * will recheck the current insn and restart if find_uprobe() fails.
> @@ -676,9 +680,13 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
>   */
>  static void delete_uprobe(struct uprobe *uprobe)
>  {
> +	if (WARN_ON(!uprobe_is_active(uprobe)))
> +		return;
> +
>  	spin_lock(&uprobes_treelock);
>  	rb_erase(&uprobe->rb_node, &uprobes_tree);
>  	spin_unlock(&uprobes_treelock);
> +	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
>  	iput(uprobe->inode);
>  	put_uprobe(uprobe);
>  }
> -- 
> 1.5.5.1
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register()
  2012-11-25 22:33 ` [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() Oleg Nesterov
  2013-01-02 12:57   ` Anton Arapov
@ 2013-01-03  9:05   ` Srikar Dronamraju
  1 sibling, 0 replies; 11+ messages in thread
From: Srikar Dronamraju @ 2013-01-03  9:05 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, linux-kernel

* Oleg Nesterov <oleg@redhat.com> [2012-11-25 23:33:50]:

> uprobe_register() and uprobe_unregister() are the only users of
> mutex_lock(uprobes_hash(inode)), and the only reason why we can't
> simply remove it is that we need to ensure that delete_uprobe() is
> not possible after alloc_uprobe() and before consumer_add().
> 
> IOW, we need to ensure that when we take uprobe->register_rwsem
> this uprobe is still valid and we didn't race with _unregister()
> which called delete_uprobe() in between.
> 
> With this patch uprobe_register() simply checks uprobe_is_active()
> and retries if it hits this very unlikely race. uprobes_mutex[] is
> no longer needed and can be removed.
> 
> There is another reason for this change, prepare_uprobe() should be
> folded into alloc_uprobe() and we do not want to hold the extra locks
> around read_mapping_page/etc.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

> ---
>  kernel/events/uprobes.c |   51 +++++++++++++---------------------------------
>  1 files changed, 15 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2886c82..105ac0d 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -50,29 +50,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
>  static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
> 
>  #define UPROBES_HASH_SZ	13
> -
> -/*
> - * We need separate register/unregister and mmap/munmap lock hashes because
> - * of mmap_sem nesting.
> - *
> - * uprobe_register() needs to install probes on (potentially) all processes
> - * and thus needs to acquire multiple mmap_sems (consequtively, not
> - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
> - * for the particular process doing the mmap.
> - *
> - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
> - * because of lock order against i_mmap_mutex. This means there's a hole in
> - * the register vma iteration where a mmap() can happen.
> - *
> - * Thus uprobe_register() can race with uprobe_mmap() and we can try and
> - * install a probe where one is already installed.
> - */
> -
> -/* serialize (un)register */
> -static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
> -
> -#define uprobes_hash(v)		(&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
> -
>  /* serialize uprobe->pending_list */
>  static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
>  #define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
> @@ -865,20 +842,26 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
>  	if (offset > i_size_read(inode))
>  		return -EINVAL;
> 
> -	ret = -ENOMEM;
> -	mutex_lock(uprobes_hash(inode));
> + retry:
>  	uprobe = alloc_uprobe(inode, offset);
> -	if (uprobe) {
> -		down_write(&uprobe->register_rwsem);
> +	if (!uprobe)
> +		return -ENOMEM;
> +	/*
> +	 * We can race with uprobe_unregister()->delete_uprobe().
> +	 * Check uprobe_is_active() and retry if it is false.
> +	 */
> +	down_write(&uprobe->register_rwsem);
> +	ret = -EAGAIN;
> +	if (likely(uprobe_is_active(uprobe))) {
>  		ret = __uprobe_register(uprobe, uc);
>  		if (ret)
>  			__uprobe_unregister(uprobe, uc);
> -		up_write(&uprobe->register_rwsem);
>  	}
> -	mutex_unlock(uprobes_hash(inode));
> -	if (uprobe)
> -		put_uprobe(uprobe);
> +	up_write(&uprobe->register_rwsem);
> +	put_uprobe(uprobe);
> 
> +	if (unlikely(ret == -EAGAIN))
> +		goto retry;
>  	return ret;
>  }
> 
> @@ -896,11 +879,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
>  	if (!uprobe)
>  		return;
> 
> -	mutex_lock(uprobes_hash(inode));
>  	down_write(&uprobe->register_rwsem);
>  	__uprobe_unregister(uprobe, uc);
>  	up_write(&uprobe->register_rwsem);
> -	mutex_unlock(uprobes_hash(inode));
>  	put_uprobe(uprobe);
>  }
> 
> @@ -1609,10 +1590,8 @@ static int __init init_uprobes(void)
>  {
>  	int i;
> 
> -	for (i = 0; i < UPROBES_HASH_SZ; i++) {
> -		mutex_init(&uprobes_mutex[i]);
> +	for (i = 0; i < UPROBES_HASH_SZ; i++)
>  		mutex_init(&uprobes_mmap_mutex[i]);
> -	}
> 
>  	if (percpu_init_rwsem(&dup_mmap_sem))
>  		return -ENOMEM;
> -- 
> 1.5.5.1
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2013-01-03  9:06 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-11-25 22:33 [PATCH 0/3] uprobes: Kill uprobes_mutex[] Oleg Nesterov
2012-11-25 22:33 ` [PATCH 1/3] uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead Oleg Nesterov
2012-12-13 14:35   ` Srikar Dronamraju
2012-12-23 19:21     ` Oleg Nesterov
2013-01-02 12:56   ` Anton Arapov
2012-11-25 22:33 ` [PATCH 2/3] uprobes: Introduce uprobe_is_active() Oleg Nesterov
2013-01-02 12:57   ` Anton Arapov
2013-01-03  9:04   ` Srikar Dronamraju
2012-11-25 22:33 ` [PATCH 3/3] uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() Oleg Nesterov
2013-01-02 12:57   ` Anton Arapov
2013-01-03  9:05   ` Srikar Dronamraju

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).