linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] uprobes: make register/unregister O(n)
@ 2012-06-04 14:52 Oleg Nesterov
  2012-06-04 14:53 ` [PATCH 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
                   ` (5 more replies)
  0 siblings, 6 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-04 14:52 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

Hello,

On top of "[PATCH 0/3] uprobes: misc minor changes".

Peter, I forgot about your patch initially, so I am re-sending it
as 3/3 on top of 1-2. Hopefully you do not mind but I can rebase
1-2 on top of your change.

It is not easy to read 1/3, so I attached the resulting code below,
to simplify the review.

Oleg.

/*
 * One node in the singly-linked list built by build_map_info():
 * an mm that maps the probed file, plus the probe's address in it.
 * Each node pins its mm via mm_users; the consumer must mmput().
 */
struct map_info {
	struct map_info *next;	/* next node, NULL terminates the list */
	struct mm_struct *mm;	/* target mm, reference held */
	loff_t vaddr;		/* probe address inside this mm */
};

/*
 * Release a single map_info node and hand back its successor,
 * so callers can free a whole list with a simple while loop.
 */
static inline struct map_info *free_map_info(struct map_info *info)
{
	struct map_info *successor;

	successor = info->next;
	kfree(info);
	return successor;
}

/*
 * Build the list of (mm, vaddr) pairs for every currently valid mapping
 * of the probed file at @offset.  Every entry holds an mm_users
 * reference; the caller must mmput() each mm and free each node.
 *
 * Nodes are not allocated with GFP_KERNEL under i_mmap_mutex (that can
 * deadlock with page reclaim), so two lists are used: "curr" is the
 * result being built, "prev" holds already-allocated spare nodes.  If
 * an opportunistic GFP_NOWAIT allocation fails inside the lock, the
 * shortfall is counted in "more"; the lock is dropped, the missing
 * nodes are pre-allocated with GFP_KERNEL, and the walk restarts.
 *
 * Returns the list head, NULL if there are no mappings, or
 * ERR_PTR(-ENOMEM).
 */
static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
	unsigned long pgoff = offset >> PAGE_SHIFT;
	struct prio_tree_iter iter;
	struct vm_area_struct *vma;
	struct map_info *curr = NULL;
	struct map_info *prev = NULL;
	struct map_info *info;
	int more = 0;

 again:
	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if (!valid_vma(vma, is_register))
			continue;

		if (!prev) {
			/*
			 * Optimistic allocation under the lock; GFP_NOWAIT
			 * avoids i_mmap_mutex recursion through reclaim.
			 * No harm if it fails - count it and retry later.
			 */
			prev = kmalloc(sizeof(struct map_info),
					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (!prev) {
				more++;
				continue;
			}
			prev->next = NULL;
		}

		/* Skip an mm which is already on its way out. */
		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
			continue;

		/* Dequeue a spare node from "prev", push it onto "curr". */
		info = prev;
		prev = prev->next;
		info->next = curr;
		curr = info;

		info->mm = vma->vm_mm;
		info->vaddr = vma_address(vma, offset);
	}
	mutex_unlock(&mapping->i_mmap_mutex);

	if (!more)
		goto out;

	/*
	 * Allocation failed "more" times inside the lock.  The walk will
	 * restart from scratch, so drop the mm references taken so far
	 * and recycle the already-built nodes back into the spare list.
	 */
	prev = curr;
	while (curr) {
		mmput(curr->mm);
		curr = curr->next;
	}

	/* Pre-allocate the missing nodes with GFP_KERNEL, then retry. */
	do {
		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
		if (!info) {
			curr = ERR_PTR(-ENOMEM);
			goto out;
		}
		info->next = prev;
		prev = info;
	} while (--more);

	goto again;
 out:
	/* Free unused spare nodes; "curr" (the result) is untouched. */
	while (prev)
		prev = free_map_info(prev);
	return curr;
}

/*
 * Install (is_register) or remove the breakpoint for @uprobe in every
 * mm that maps uprobe->inode at uprobe->offset.
 *
 * Returns 0 or the first install_breakpoint() error.  Even after an
 * error the loop keeps running so that every mm pinned by
 * build_map_info() is mmput()'ed and every list node is freed.
 */
static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
{
	struct map_info *info;
	int err = 0;

	info = build_map_info(uprobe->inode->i_mapping,
					uprobe->offset, is_register);
	if (IS_ERR(info))
		return PTR_ERR(info);

	while (info) {
		struct mm_struct *mm = info->mm;
		struct vm_area_struct *vma;
		loff_t vaddr;

		/* After a failure, only do the cleanup for the rest. */
		if (err)
			goto free;

		down_write(&mm->mmap_sem);
		vma = find_vma(mm, (unsigned long)info->vaddr);
		if (!vma || !valid_vma(vma, is_register))
			goto unlock;

		/*
		 * Revalidate: the mapping may have changed since
		 * build_map_info() dropped i_mmap_mutex.
		 */
		vaddr = vma_address(vma, uprobe->offset);
		if (vma->vm_file->f_mapping->host != uprobe->inode ||
						vaddr != info->vaddr)
			goto unlock;

		if (is_register) {
			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
			/*
			 * We can race against uprobe_mmap(), see the
			 * comment near uprobe_hash().
			 */
			if (err == -EEXIST)
				err = 0;
		} else {
			remove_breakpoint(uprobe, mm, info->vaddr);
		}
 unlock:
		up_write(&mm->mmap_sem);
 free:
		mmput(mm);
		info = free_map_info(info);
	}

	return err;
}


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/3] uprobes: rework register_for_each_vma() to make it O(n)
  2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
@ 2012-06-04 14:53 ` Oleg Nesterov
  2012-06-04 14:53 ` [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-04 14:53 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

Currently register_for_each_vma() is O(n ** 2) + O(n ** 3), every
time find_next_vma_info() "restarts" the vma_prio_tree_foreach()
loop and each iteration rechecks the whole try_list. This also
means that try_list can grow "indefinitely" if register/unregister
races with munmap/mmap activity even if the number of mapping is
bounded at any time.

With this patch register_for_each_vma() builds the list of mm/vaddr
structures only once and does install_breakpoint() for each entry.

We do not care about the new mappings which can be created after
build_map_info() drops mapping->i_mmap_mutex, uprobe_mmap() should
do its work.

Note that we do not allocate map_info under i_mmap_mutex, this can
deadlock with page reclaim (but see the next patch). So we use 2
lists, "curr" which we are going to return, and "prev" which holds
the already allocated memory. The main loop deques the entry from
"prev" (initially it is empty), and if "prev" becomes empty again
it counts the number of entries we need to pre-allocate outside of
i_mmap_mutex.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |  199 ++++++++++++++++++++---------------------------
 1 files changed, 86 insertions(+), 113 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index de931e7..a1bbcab 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -60,17 +60,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
  */
 static atomic_t uprobe_events = ATOMIC_INIT(0);
 
-/*
- * Maintain a temporary per vma info that can be used to search if a vma
- * has already been handled. This structure is introduced since extending
- * vm_area_struct wasnt recommended.
- */
-struct vma_info {
-	struct list_head	probe_list;
-	struct mm_struct	*mm;
-	loff_t			vaddr;
-};
-
 struct uprobe {
 	struct rb_node		rb_node;	/* node in the rb tree */
 	atomic_t		ref;
@@ -752,139 +741,123 @@ static void delete_uprobe(struct uprobe *uprobe)
 	atomic_dec(&uprobe_events);
 }
 
-static struct vma_info *
-__find_next_vma_info(struct address_space *mapping, struct list_head *head,
-			struct vma_info *vi, loff_t offset, bool is_register)
+struct map_info {
+	struct map_info *next;
+	struct mm_struct *mm;
+	loff_t vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
 {
+	struct map_info *next = info->next;
+	kfree(info);
+	return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+{
+	unsigned long pgoff = offset >> PAGE_SHIFT;
 	struct prio_tree_iter iter;
 	struct vm_area_struct *vma;
-	struct vma_info *tmpvi;
-	unsigned long pgoff;
-	int existing_vma;
-	loff_t vaddr;
-
-	pgoff = offset >> PAGE_SHIFT;
+	struct map_info *curr = NULL;
+	struct map_info *prev = NULL;
+	struct map_info *info;
+	int more = 0;
 
+ again:
+	mutex_lock(&mapping->i_mmap_mutex);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (!valid_vma(vma, is_register))
 			continue;
 
-		existing_vma = 0;
-		vaddr = vma_address(vma, offset);
-
-		list_for_each_entry(tmpvi, head, probe_list) {
-			if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
-				existing_vma = 1;
-				break;
-			}
-		}
-
-		/*
-		 * Another vma needs a probe to be installed. However skip
-		 * installing the probe if the vma is about to be unlinked.
-		 */
-		if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
-			vi->mm = vma->vm_mm;
-			vi->vaddr = vaddr;
-			list_add(&vi->probe_list, head);
-
-			return vi;
+		if (!prev) {
+			more++;
+			continue;
 		}
-	}
 
-	return NULL;
-}
-
-/*
- * Iterate in the rmap prio tree  and find a vma where a probe has not
- * yet been inserted.
- */
-static struct vma_info *
-find_next_vma_info(struct address_space *mapping, struct list_head *head,
-		loff_t offset, bool is_register)
-{
-	struct vma_info *vi, *retvi;
+		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+			continue;
 
-	vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
-	if (!vi)
-		return ERR_PTR(-ENOMEM);
+		info = prev;
+		prev = prev->next;
+		info->next = curr;
+		curr = info;
 
-	mutex_lock(&mapping->i_mmap_mutex);
-	retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
+		info->mm = vma->vm_mm;
+		info->vaddr = vma_address(vma, offset);
+	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
-	if (!retvi)
-		kfree(vi);
+	if (!more)
+		goto out;
+
+	prev = curr;
+	while (curr) {
+		mmput(curr->mm);
+		curr = curr->next;
+	}
 
-	return retvi;
+	do {
+		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+		if (!info) {
+			curr = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+		info->next = prev;
+		prev = info;
+	} while (--more);
+
+	goto again;
+ out:
+	while (prev)
+		prev = free_map_info(prev);
+	return curr;
 }
 
 static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 {
-	struct list_head try_list;
-	struct vm_area_struct *vma;
-	struct address_space *mapping;
-	struct vma_info *vi, *tmpvi;
-	struct mm_struct *mm;
-	loff_t vaddr;
-	int ret;
+	struct map_info *info;
+	int err = 0;
 
-	mapping = uprobe->inode->i_mapping;
-	INIT_LIST_HEAD(&try_list);
+	info = build_map_info(uprobe->inode->i_mapping,
+					uprobe->offset, is_register);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
 
-	ret = 0;
-
-	for (;;) {
-		vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
-		if (!vi)
-			break;
+	while (info) {
+		struct mm_struct *mm = info->mm;
+		struct vm_area_struct *vma;
+		loff_t vaddr;
 
-		if (IS_ERR(vi)) {
-			ret = PTR_ERR(vi);
-			break;
-		}
+		if (err)
+			goto free;
 
-		mm = vi->mm;
 		down_write(&mm->mmap_sem);
-		vma = find_vma(mm, (unsigned long)vi->vaddr);
-		if (!vma || !valid_vma(vma, is_register)) {
-			list_del(&vi->probe_list);
-			kfree(vi);
-			up_write(&mm->mmap_sem);
-			mmput(mm);
-			continue;
-		}
+		vma = find_vma(mm, (unsigned long)info->vaddr);
+		if (!vma || !valid_vma(vma, is_register))
+			goto unlock;
+
 		vaddr = vma_address(vma, uprobe->offset);
 		if (vma->vm_file->f_mapping->host != uprobe->inode ||
-						vaddr != vi->vaddr) {
-			list_del(&vi->probe_list);
-			kfree(vi);
-			up_write(&mm->mmap_sem);
-			mmput(mm);
-			continue;
-		}
-
-		if (is_register)
-			ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
-		else
-			remove_breakpoint(uprobe, mm, vi->vaddr);
+						vaddr != info->vaddr)
+			goto unlock;
 
-		up_write(&mm->mmap_sem);
-		mmput(mm);
 		if (is_register) {
-			if (ret && ret == -EEXIST)
-				ret = 0;
-			if (ret)
-				break;
+			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+			if (err == -EEXIST)
+				err = 0;
+		} else {
+			remove_breakpoint(uprobe, mm, info->vaddr);
 		}
+ unlock:
+		up_write(&mm->mmap_sem);
+ free:
+		mmput(mm);
+		info = free_map_info(info);
 	}
 
-	list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
-		list_del(&vi->probe_list);
-		kfree(vi);
-	}
-
-	return ret;
+	return err;
 }
 
 static int __uprobe_register(struct uprobe *uprobe)
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first
  2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
  2012-06-04 14:53 ` [PATCH 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
@ 2012-06-04 14:53 ` Oleg Nesterov
  2012-06-04 14:59   ` Peter Zijlstra
  2012-06-05 10:10   ` Oleg Nesterov
  2012-06-04 14:53 ` [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-04 14:53 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

build_map_info() doesn't allocate the memory under i_mmap_mutex
to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT
first, it should work in the likely case and thus we almost never
need the pre-alloc-and-retry path.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |    9 +++++++--
 1 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a1bbcab..c9836ae 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -772,8 +772,13 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 			continue;
 
 		if (!prev) {
-			more++;
-			continue;
+			prev = kmalloc(sizeof(struct map_info),
+					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+			if (!prev) {
+				more++;
+				continue;
+			}
+			prev->next = NULL;
 		}
 
 		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race
  2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
  2012-06-04 14:53 ` [PATCH 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
  2012-06-04 14:53 ` [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
@ 2012-06-04 14:53 ` Oleg Nesterov
  2012-06-04 15:00   ` Peter Zijlstra
  2012-06-04 14:57 ` [PATCH 0/3] uprobes: make register/unregister O(n) Peter Zijlstra
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-04 14:53 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Because the mind is treacherous and makes us forget we need to write
stuff down.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |   31 ++++++++++++++++++++++++++++---
 1 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c9836ae..87a47c6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -44,6 +44,23 @@ static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
 
 #define UPROBES_HASH_SZ	13
 
+/*
+ * We need separate register/unregister and mmap/munmap lock hashes because
+ * of mmap_sem nesting.
+ *
+ * uprobe_register() needs to install probes on (potentially) all processes
+ * and thus needs to acquire multiple mmap_sems (consequtively, not
+ * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
+ * for the particular process doing the mmap.
+ *
+ * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
+ * because of lock order against i_mmap_mutex. This means there's a hole in
+ * the register vma iteration where a mmap() can happen.
+ *
+ * Thus uprobe_register() can race with uprobe_mmap() and we can try and
+ * install a probe where one is already installed.
+ */
+
 /* serialize (un)register */
 static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
 
@@ -353,7 +370,9 @@ out:
 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
 	int result;
-
+	/*
+	 * See the comment near uprobes_hash().
+	 */
 	result = is_swbp_at_addr(mm, vaddr);
 	if (result == 1)
 		return -EEXIST;
@@ -850,6 +869,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 
 		if (is_register) {
 			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+			/*
+			 * We can race against uprobe_register(), see the
+			 * comment near uprobe_hash().
+			 */
 			if (err == -EEXIST)
 				err = 0;
 		} else {
@@ -1058,8 +1081,10 @@ int uprobe_mmap(struct vm_area_struct *vma)
 			}
 
 			ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
-
-			/* Ignore double add: */
+			/*
+			 * We can race against uprobe_register(), see the
+			 * comment near uprobe_hash().
+			 */
 			if (ret == -EEXIST) {
 				ret = 0;
 
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] uprobes: make register/unregister O(n)
  2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
                   ` (2 preceding siblings ...)
  2012-06-04 14:53 ` [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov
@ 2012-06-04 14:57 ` Peter Zijlstra
  2012-06-04 17:37 ` Srikar Dronamraju
  2012-06-06 14:49 ` [PATCH v2 " Oleg Nesterov
  5 siblings, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2012-06-04 14:57 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Srikar Dronamraju, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel

On Mon, 2012-06-04 at 16:52 +0200, Oleg Nesterov wrote:
> Peter, I forgot about your patch initially, so I am re-sending it
> as 3/3 on top of 1-2. Hopefully you do not mind but I can rebase
> 1-2 on top of your change. 

Nah, I don't mind. Your change to register_for_each_vma() looks good
too.

Thanks!

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first
  2012-06-04 14:53 ` [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
@ 2012-06-04 14:59   ` Peter Zijlstra
  2012-06-05 10:10   ` Oleg Nesterov
  1 sibling, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2012-06-04 14:59 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Srikar Dronamraju, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel

On Mon, 2012-06-04 at 16:53 +0200, Oleg Nesterov wrote:
> build_map_info() doesn't allocate the memory under i_mmap_mutex
> to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT
> first, it should work in the likely case and thus we almost never
> need the pre-alloc-and-retry path.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  kernel/events/uprobes.c |    9 +++++++--
>  1 files changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index a1bbcab..c9836ae 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -772,8 +772,13 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
>  			continue;
>  
>  		if (!prev) {
> -			more++;
> -			continue;


Should we add something like this:

			/*
			 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through reclaim.
			 * This is optimistic, no harm done if it fails.
			 */


> +			prev = kmalloc(sizeof(struct map_info),
> +					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
> +			if (!prev) {
> +				more++;
> +				continue;
> +			}
> +			prev->next = NULL;
>  		}
>  
>  		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race
  2012-06-04 14:53 ` [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov
@ 2012-06-04 15:00   ` Peter Zijlstra
  2012-06-04 15:40     ` Oleg Nesterov
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2012-06-04 15:00 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Srikar Dronamraju, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel

On Mon, 2012-06-04 at 16:53 +0200, Oleg Nesterov wrote:

I didn't check if its my error or not, however:

> @@ -850,6 +869,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
>  
>  		if (is_register) {
>  			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
> +			/*
> +			 * We can race against uprobe_register(), see the

that should be uprobe_mmap(), since this is the register path ;-)

> +			 * comment near uprobe_hash().
> +			 */
>  			if (err == -EEXIST)
>  				err = 0;
>  		} else {
> @@ -1058,8 +1081,10 @@ int uprobe_mmap(struct vm_area_struct *vma)
>  			}
>  
>  			ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
> -
> -			/* Ignore double add: */
> +			/*
> +			 * We can race against uprobe_register(), see the
> +			 * comment near uprobe_hash().
> +			 */
>  			if (ret == -EEXIST) {
>  				ret = 0;
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race
  2012-06-04 15:00   ` Peter Zijlstra
@ 2012-06-04 15:40     ` Oleg Nesterov
  0 siblings, 0 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-04 15:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Srikar Dronamraju, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel

On 06/04, Peter Zijlstra wrote:
>
> On Mon, 2012-06-04 at 16:53 +0200, Oleg Nesterov wrote:
>
> I didn't check if its my error or not, however:
>
> > @@ -850,6 +869,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
> >
> >  		if (is_register) {
> >  			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
> > +			/*
> > +			 * We can race against uprobe_register(), see the
>
> that should be uprobe_mmap(), since this is the register path ;-)

Oops. Fixed. 2/2 is also changed as you suggested.

I'll wait a bit to collect more comments/reviews and resend.

Thanks!

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] uprobes: make register/unregister O(n)
  2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
                   ` (3 preceding siblings ...)
  2012-06-04 14:57 ` [PATCH 0/3] uprobes: make register/unregister O(n) Peter Zijlstra
@ 2012-06-04 17:37 ` Srikar Dronamraju
  2012-06-04 18:41   ` Oleg Nesterov
  2012-06-06 14:49 ` [PATCH v2 " Oleg Nesterov
  5 siblings, 1 reply; 16+ messages in thread
From: Srikar Dronamraju @ 2012-06-04 17:37 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel


I read this code a few times, but I still think I am getting confused
about few scenarios.

So please correct me.

> struct map_info {
> 	struct map_info *next;
> 	struct mm_struct *mm;
> 	loff_t vaddr;
> };
> 
> static inline struct map_info *free_map_info(struct map_info *info)
> {
> 	struct map_info *next = info->next;
> 	kfree(info);
> 	return next;
> }
> 
> static struct map_info *
> build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
> {
> 	unsigned long pgoff = offset >> PAGE_SHIFT;
> 	struct prio_tree_iter iter;
> 	struct vm_area_struct *vma;
> 	struct map_info *curr = NULL;
> 	struct map_info *prev = NULL;
> 	struct map_info *info;
> 	int more = 0;
> 
>  again:
> 	mutex_lock(&mapping->i_mmap_mutex);
> 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
> 		if (!valid_vma(vma, is_register))
> 			continue;
> 
> 		if (!prev) {
> 			prev = kmalloc(sizeof(struct map_info),
> 					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
> 			if (!prev) {
> 				more++;
> 				continue;
> 			}
> 			prev->next = NULL;
> 		}
> 
> 		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
> 			continue;
> 
> 		info = prev;
> 		prev = prev->next;
> 		info->next = curr;
> 		curr = info;
> 
> 		info->mm = vma->vm_mm;
> 		info->vaddr = vma_address(vma, offset);
> 	}
> 	mutex_unlock(&mapping->i_mmap_mutex);
> 
> 	if (!more)
> 		goto out;
> 
> 	prev = curr;
> 	while (curr) {
> 		mmput(curr->mm);
> 		curr = curr->next;
> 	}
> 
> 	do {
> 		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
> 		if (!info) {
> 			curr = ERR_PTR(-ENOMEM);
> 			goto out;
> 		}
> 		info->next = prev;
> 		prev = info;
> 	} while (--more);
> 
> 	goto again;

This is more theory:
If the number of vmas in the priority tree keeps increasing in every
iteration, and the kmalloc(GFP_NOWAIT) fails, i.e. more is !0, then
don't we end up in a forever loop?

Can't we just restrict this to just 2 iterations? [And depend on
uprobe_mmap() to do the necessary if new vmas come in].

>  out:

> 	while (prev)
> 		prev = free_map_info(prev);

If we were able to allocate all map_info objects in the first pass but
the last vma belonged to a mm thats at exit, i.e atomic_inc_non_zero
returned 0 , then prev is !NULL and more is 0.  Then we seem to clear
all the map_info objects without even decreasing the mm counts for which
atomic_inc_non_zero() was successful. Will curr be proper in this case.

Should this while be an if?

I am sure I am missing something here. Probably I should take a look
again. 

> 	return curr;
> }
> 
> static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
> {
> 	struct map_info *info;
> 	int err = 0;
> 
> 	info = build_map_info(uprobe->inode->i_mapping,
> 					uprobe->offset, is_register);
> 	if (IS_ERR(info))
> 		return PTR_ERR(info);
> 
> 	while (info) {
> 		struct mm_struct *mm = info->mm;
> 		struct vm_area_struct *vma;
> 		loff_t vaddr;
> 
> 		if (err)
> 			goto free;
> 
> 		down_write(&mm->mmap_sem);
> 		vma = find_vma(mm, (unsigned long)info->vaddr);
> 		if (!vma || !valid_vma(vma, is_register))
> 			goto unlock;
> 
> 		vaddr = vma_address(vma, uprobe->offset);
> 		if (vma->vm_file->f_mapping->host != uprobe->inode ||
> 						vaddr != info->vaddr)
> 			goto unlock;
> 
> 		if (is_register) {
> 			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
> 			/*
> 			 * We can race against uprobe_register(), see the
> 			 * comment near uprobe_hash().
> 			 */
> 			if (err == -EEXIST)
> 				err = 0;
> 		} else {
> 			remove_breakpoint(uprobe, mm, info->vaddr);
> 		}
>  unlock:
> 		up_write(&mm->mmap_sem);
>  free:
> 		mmput(mm);
> 		info = free_map_info(info);
> 	}
> 
> 	return err;
> }
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] uprobes: make register/unregister O(n)
  2012-06-04 17:37 ` Srikar Dronamraju
@ 2012-06-04 18:41   ` Oleg Nesterov
  2012-06-05  9:28     ` Srikar Dronamraju
  0 siblings, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-04 18:41 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel

On 06/04, Srikar Dronamraju wrote:
>
> >  again:
> > 	mutex_lock(&mapping->i_mmap_mutex);
> > 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
> > 		if (!valid_vma(vma, is_register))
> > 			continue;
> >
> > 		if (!prev) {
> > 			prev = kmalloc(sizeof(struct map_info),
> > 					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
> > 			if (!prev) {
> > 				more++;
> > 				continue;
> > 			}
> > 			prev->next = NULL;
> > 		}
> >
> > 		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
> > 			continue;
> >
> > 		info = prev;
> > 		prev = prev->next;
> > 		info->next = curr;
> > 		curr = info;
> >
> > 		info->mm = vma->vm_mm;
> > 		info->vaddr = vma_address(vma, offset);
> > 	}
> > 	mutex_unlock(&mapping->i_mmap_mutex);
> >
> > 	if (!more)
> > 		goto out;
> >
> > 	prev = curr;
> > 	while (curr) {
> > 		mmput(curr->mm);
> > 		curr = curr->next;
> > 	}
> >
> > 	do {
> > 		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
> > 		if (!info) {
> > 			curr = ERR_PTR(-ENOMEM);
> > 			goto out;
> > 		}
> > 		info->next = prev;
> > 		prev = info;
> > 	} while (--more);
> >
> > 	goto again;
>
> This is more theory
> If the number of vmas in the priority tree keeps increasing in every
> iteration, and the kmalloc(GFP_NOWAIT) fails i.e more is !0, then
> dont we end up in a forever loop?

But this can only happen if the number of mappings keeps increasing
indefinitely, which is not possible.

And please note that the current code is worse if the number of mappings
grows. In fact it can livelock even if this number is limited. Suppose
you are trying to probe /bin/true and some process does
"while true; do /bin/true; done". It is possible that every time
register_for_each_vma() finds the new mapping because we do not free
the previous entries which refer to the already dead process/mm.


> Cant we just restrict this to just 2 iterations? [And depend on
> uprobe_mmap() to do the necessary if new vmas come in].

Probably yes, we can improve this. But I don't think this is that
easy, we can't rely on vma_prio_tree_foreach's ordering.

> >  out:
>
> > 	while (prev)
> > 		prev = free_map_info(prev);
>
> If we were able to allocate all map_info objects in the first pass but
> the last vma belonged to a mm thats at exit, i.e atomic_inc_non_zero
> returned 0 , then prev is !NULL and more is 0.  Then we seem to clear
> all the map_info objects without even decreasing the mm counts for which
> atomic_inc_non_zero() was successful. Will curr be proper in this case.

Not sure I understand. To simplify, suppose that we have a single mapping
but atomic_inc_non_zero() fails. In this case, yes, prev != NULL but
prev->mm is not valid. We only need to free the "extra" memory and return
curr == NULL (empty list). This is what the loop above does.

We only need mmput(mm) if atomic_inc_non_zero() succeeds, and in this
case this "mm" should live in "curr" list. If we need more memory, then
build_map_info() does this right after it detects more != 0. Otherwise
the caller should do this.

> Should this while be an if?

But we can have more entries to free? Not only because atomic_inc_non_zero
failed.

> I am sure I am missing something here.

Or me. Thanks for looking!

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/3] uprobes: make register/unregister O(n)
  2012-06-04 18:41   ` Oleg Nesterov
@ 2012-06-05  9:28     ` Srikar Dronamraju
  0 siblings, 0 replies; 16+ messages in thread
From: Srikar Dronamraju @ 2012-06-05  9:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Peter Zijlstra, Ananth N Mavinakayanahalli,
	Anton Arapov, Linus Torvalds, Masami Hiramatsu, linux-kernel

> > This is more theory
> > If the number of vmas in the priority tree keeps increasing in every
> > iteration, and the kmalloc(GFP_NOWAIT) fails i.e more is !0, then
> > dont we end up in a forever loop?
> 
> But this can only hapen if the number of mappings keeps increasing
> indefinitely, this is not possible.

Agree, that's why I said "in theory".

> 
> And please not that the current code is worse if the number of mappings
> grows. In fact it can livelock even if this number is limited. Suppose
> you are trying to probe /bin/true and some process does
> "while true; do /bin/true; done". It is possible that every time
> register_for_each_vma() finds the new mapping because we do not free
> the previous entries which refer to the already dead process/mm.
> 

Completely agree that build_map_info is much better than the current implementation.
I was only trying to understand and, if possible, simplify.

> 
> > Cant we just restrict this to just 2 iterations? [And depend on
> > uprobe_mmap() to do the necessary if new vmas come in].
> 

> Probably yes, we can improve this. But I don't think this is that
> easy, we can't rely on vma_prio_tree_foreach's ordering.
> 

Correct, I forgot that we can't rely on vma_prio_tree_for_each ordering.

> > >  out:
> >
> > > 	while (prev)
> > > 		prev = free_map_info(prev);
> >
> > If we were able to allocate all map_info objects in the first pass but
> > the last vma belonged to a mm thats at exit, i.e atomic_inc_non_zero
> > returned 0 , then prev is !NULL and more is 0.  Then we seem to clear
> > all the map_info objects without even decreasing the mm counts for which
> > atomic_inc_non_zero() was successful. Will curr be proper in this case.


I missed the point that prev->next is NULL when atomic_inc_non_zero
fails.

> 
> Not sure I understand. To simplify, suppose that we have a single mapping
> but atomic_inc_non_zero() fails. In this case, yes, prev != NULL but
> prev->mm is not valid. We only need to free the "extra" memory and return
> curr == NULL (empty list). This is what the loop above does.
> 
> We only need mmput(mm) if atomic_inc_non_zero() suceeeds, and in this
> case this "mm" should live in "curr" list. If we need more memory, then
> build_map_info() does this right after it detects more != 0. Otherwise
> the caller should do this.
> 

Correct. I understand now.

> > Should this while be an if?
> 
> But we can have more entries to free? Not only because atomic_inc_non_zero
> failed.
> 

Oh Yes, we could have entries to free because the vmas in the prio-tree
might have decreased by the time we walk the prio-tree the second time.

-- 
Thanks and Regards
Srikar


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first
  2012-06-04 14:53 ` [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
  2012-06-04 14:59   ` Peter Zijlstra
@ 2012-06-05 10:10   ` Oleg Nesterov
  1 sibling, 0 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-05 10:10 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

On 06/04, Oleg Nesterov wrote:
>
> @@ -772,8 +772,13 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
>  			continue;
>
>  		if (!prev) {
> -			more++;
> -			continue;
> +			prev = kmalloc(sizeof(struct map_info),
> +					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
> +			if (!prev) {
> +				more++;
> +				continue;
> +			}
> +			prev->next = NULL;
>  		}

OK, this is not exactly right, it is pointless and wrong to
do kmalloc(GFP_NOWAIT) if it failed before (iow, more != 0).

Easy to fix, I'll send v2 tomorrow.

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 0/3] uprobes: make register/unregister O(n)
  2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
                   ` (4 preceding siblings ...)
  2012-06-04 17:37 ` Srikar Dronamraju
@ 2012-06-06 14:49 ` Oleg Nesterov
  2012-06-06 14:49   ` [PATCH v2 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
                     ` (2 more replies)
  5 siblings, 3 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-06 14:49 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

Hello,

On top of "[PATCH 0/3] uprobes: misc minor changes".

Sorry for delay. When I tried to test these changes, I hit the
bug report from bad_page(). However, it turns out this is another
unrelated problem, hopefully I'll send the fix soon. In short,
replace_page() is buggy.

Changes:

	- do not try GFP_NOWAIT if it failed before and more != 0,
	  this is pointless and wrong

	- update the comments (thanks Peter)

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 1/3] uprobes: rework register_for_each_vma() to make it O(n)
  2012-06-06 14:49 ` [PATCH v2 " Oleg Nesterov
@ 2012-06-06 14:49   ` Oleg Nesterov
  2012-06-06 14:50   ` [PATCH v2 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
  2012-06-06 14:50   ` [PATCH v2 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov
  2 siblings, 0 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-06 14:49 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

Currently register_for_each_vma() is O(n ** 2) + O(n ** 3), every
time find_next_vma_info() "restarts" the vma_prio_tree_foreach()
loop and each iteration rechecks the whole try_list. This also
means that try_list can grow "indefinitely" if register/unregister
races with munmap/mmap activity even if the number of mapping is
bounded at any time.

With this patch register_for_each_vma() builds the list of mm/vaddr
structures only once and does install_breakpoint() for each entry.

We do not care about the new mappings which can be created after
build_map_info() drops mapping->i_mmap_mutex, uprobe_mmap() should
do its work.

Note that we do not allocate map_info under i_mmap_mutex, this can
deadlock with page reclaim (but see the next patch). So we use 2
lists, "curr" which we are going to return, and "prev" which holds
the already allocated memory. The main loop deques the entry from
"prev" (initially it is empty), and if "prev" becomes empty again
it counts the number of entries we need to pre-allocate outside of
i_mmap_mutex.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |  199 ++++++++++++++++++++---------------------------
 1 files changed, 86 insertions(+), 113 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index de931e7..a1bbcab 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -60,17 +60,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
  */
 static atomic_t uprobe_events = ATOMIC_INIT(0);
 
-/*
- * Maintain a temporary per vma info that can be used to search if a vma
- * has already been handled. This structure is introduced since extending
- * vm_area_struct wasnt recommended.
- */
-struct vma_info {
-	struct list_head	probe_list;
-	struct mm_struct	*mm;
-	loff_t			vaddr;
-};
-
 struct uprobe {
 	struct rb_node		rb_node;	/* node in the rb tree */
 	atomic_t		ref;
@@ -752,139 +741,123 @@ static void delete_uprobe(struct uprobe *uprobe)
 	atomic_dec(&uprobe_events);
 }
 
-static struct vma_info *
-__find_next_vma_info(struct address_space *mapping, struct list_head *head,
-			struct vma_info *vi, loff_t offset, bool is_register)
+struct map_info {
+	struct map_info *next;
+	struct mm_struct *mm;
+	loff_t vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
 {
+	struct map_info *next = info->next;
+	kfree(info);
+	return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+{
+	unsigned long pgoff = offset >> PAGE_SHIFT;
 	struct prio_tree_iter iter;
 	struct vm_area_struct *vma;
-	struct vma_info *tmpvi;
-	unsigned long pgoff;
-	int existing_vma;
-	loff_t vaddr;
-
-	pgoff = offset >> PAGE_SHIFT;
+	struct map_info *curr = NULL;
+	struct map_info *prev = NULL;
+	struct map_info *info;
+	int more = 0;
 
+ again:
+	mutex_lock(&mapping->i_mmap_mutex);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (!valid_vma(vma, is_register))
 			continue;
 
-		existing_vma = 0;
-		vaddr = vma_address(vma, offset);
-
-		list_for_each_entry(tmpvi, head, probe_list) {
-			if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
-				existing_vma = 1;
-				break;
-			}
-		}
-
-		/*
-		 * Another vma needs a probe to be installed. However skip
-		 * installing the probe if the vma is about to be unlinked.
-		 */
-		if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
-			vi->mm = vma->vm_mm;
-			vi->vaddr = vaddr;
-			list_add(&vi->probe_list, head);
-
-			return vi;
+		if (!prev) {
+			more++;
+			continue;
 		}
-	}
 
-	return NULL;
-}
-
-/*
- * Iterate in the rmap prio tree  and find a vma where a probe has not
- * yet been inserted.
- */
-static struct vma_info *
-find_next_vma_info(struct address_space *mapping, struct list_head *head,
-		loff_t offset, bool is_register)
-{
-	struct vma_info *vi, *retvi;
+		if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+			continue;
 
-	vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
-	if (!vi)
-		return ERR_PTR(-ENOMEM);
+		info = prev;
+		prev = prev->next;
+		info->next = curr;
+		curr = info;
 
-	mutex_lock(&mapping->i_mmap_mutex);
-	retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
+		info->mm = vma->vm_mm;
+		info->vaddr = vma_address(vma, offset);
+	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
-	if (!retvi)
-		kfree(vi);
+	if (!more)
+		goto out;
+
+	prev = curr;
+	while (curr) {
+		mmput(curr->mm);
+		curr = curr->next;
+	}
 
-	return retvi;
+	do {
+		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+		if (!info) {
+			curr = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+		info->next = prev;
+		prev = info;
+	} while (--more);
+
+	goto again;
+ out:
+	while (prev)
+		prev = free_map_info(prev);
+	return curr;
 }
 
 static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 {
-	struct list_head try_list;
-	struct vm_area_struct *vma;
-	struct address_space *mapping;
-	struct vma_info *vi, *tmpvi;
-	struct mm_struct *mm;
-	loff_t vaddr;
-	int ret;
+	struct map_info *info;
+	int err = 0;
 
-	mapping = uprobe->inode->i_mapping;
-	INIT_LIST_HEAD(&try_list);
+	info = build_map_info(uprobe->inode->i_mapping,
+					uprobe->offset, is_register);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
 
-	ret = 0;
-
-	for (;;) {
-		vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
-		if (!vi)
-			break;
+	while (info) {
+		struct mm_struct *mm = info->mm;
+		struct vm_area_struct *vma;
+		loff_t vaddr;
 
-		if (IS_ERR(vi)) {
-			ret = PTR_ERR(vi);
-			break;
-		}
+		if (err)
+			goto free;
 
-		mm = vi->mm;
 		down_write(&mm->mmap_sem);
-		vma = find_vma(mm, (unsigned long)vi->vaddr);
-		if (!vma || !valid_vma(vma, is_register)) {
-			list_del(&vi->probe_list);
-			kfree(vi);
-			up_write(&mm->mmap_sem);
-			mmput(mm);
-			continue;
-		}
+		vma = find_vma(mm, (unsigned long)info->vaddr);
+		if (!vma || !valid_vma(vma, is_register))
+			goto unlock;
+
 		vaddr = vma_address(vma, uprobe->offset);
 		if (vma->vm_file->f_mapping->host != uprobe->inode ||
-						vaddr != vi->vaddr) {
-			list_del(&vi->probe_list);
-			kfree(vi);
-			up_write(&mm->mmap_sem);
-			mmput(mm);
-			continue;
-		}
-
-		if (is_register)
-			ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
-		else
-			remove_breakpoint(uprobe, mm, vi->vaddr);
+						vaddr != info->vaddr)
+			goto unlock;
 
-		up_write(&mm->mmap_sem);
-		mmput(mm);
 		if (is_register) {
-			if (ret && ret == -EEXIST)
-				ret = 0;
-			if (ret)
-				break;
+			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+			if (err == -EEXIST)
+				err = 0;
+		} else {
+			remove_breakpoint(uprobe, mm, info->vaddr);
 		}
+ unlock:
+		up_write(&mm->mmap_sem);
+ free:
+		mmput(mm);
+		info = free_map_info(info);
 	}
 
-	list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
-		list_del(&vi->probe_list);
-		kfree(vi);
-	}
-
-	return ret;
+	return err;
 }
 
 static int __uprobe_register(struct uprobe *uprobe)
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first
  2012-06-06 14:49 ` [PATCH v2 " Oleg Nesterov
  2012-06-06 14:49   ` [PATCH v2 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
@ 2012-06-06 14:50   ` Oleg Nesterov
  2012-06-06 14:50   ` [PATCH v2 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov
  2 siblings, 0 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-06 14:50 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

build_map_info() doesn't allocate the memory under i_mmap_mutex
to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT
first, it should work in the likely case and thus we almost never
need the pre-alloc-and-retry path.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |   10 ++++++++++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a1bbcab..88e903d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -771,6 +771,16 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 		if (!valid_vma(vma, is_register))
 			continue;
 
+		if (!prev && !more) {
+			/*
+			 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+			 * reclaim. This is optimistic, no harm done if it fails.
+			 */
+			prev = kmalloc(sizeof(struct map_info),
+					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+			if (prev)
+				prev->next = NULL;
+		}
 		if (!prev) {
 			more++;
 			continue;
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race
  2012-06-06 14:49 ` [PATCH v2 " Oleg Nesterov
  2012-06-06 14:49   ` [PATCH v2 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
  2012-06-06 14:50   ` [PATCH v2 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
@ 2012-06-06 14:50   ` Oleg Nesterov
  2 siblings, 0 replies; 16+ messages in thread
From: Oleg Nesterov @ 2012-06-06 14:50 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Srikar Dronamraju
  Cc: Ananth N Mavinakayanahalli, Anton Arapov, Linus Torvalds,
	Masami Hiramatsu, linux-kernel

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Because the mind is treacherous and makes us forget we need to write
stuff down.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/events/uprobes.c |   31 ++++++++++++++++++++++++++++---
 1 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 88e903d..6e3b181 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -44,6 +44,23 @@ static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
 
 #define UPROBES_HASH_SZ	13
 
+/*
+ * We need separate register/unregister and mmap/munmap lock hashes because
+ * of mmap_sem nesting.
+ *
+ * uprobe_register() needs to install probes on (potentially) all processes
+ * and thus needs to acquire multiple mmap_sems (consequtively, not
+ * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
+ * for the particular process doing the mmap.
+ *
+ * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
+ * because of lock order against i_mmap_mutex. This means there's a hole in
+ * the register vma iteration where a mmap() can happen.
+ *
+ * Thus uprobe_register() can race with uprobe_mmap() and we can try and
+ * install a probe where one is already installed.
+ */
+
 /* serialize (un)register */
 static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
 
@@ -353,7 +370,9 @@ out:
 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
 	int result;
-
+	/*
+	 * See the comment near uprobes_hash().
+	 */
 	result = is_swbp_at_addr(mm, vaddr);
 	if (result == 1)
 		return -EEXIST;
@@ -855,6 +874,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 
 		if (is_register) {
 			err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+			/*
+			 * We can race against uprobe_mmap(), see the
+			 * comment near uprobe_hash().
+			 */
 			if (err == -EEXIST)
 				err = 0;
 		} else {
@@ -1063,8 +1086,10 @@ int uprobe_mmap(struct vm_area_struct *vma)
 			}
 
 			ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
-
-			/* Ignore double add: */
+			/*
+			 * We can race against uprobe_register(), see the
+			 * comment near uprobe_hash().
+			 */
 			if (ret == -EEXIST) {
 				ret = 0;
 
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2012-06-06 14:52 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-04 14:52 [PATCH 0/3] uprobes: make register/unregister O(n) Oleg Nesterov
2012-06-04 14:53 ` [PATCH 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
2012-06-04 14:53 ` [PATCH 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
2012-06-04 14:59   ` Peter Zijlstra
2012-06-05 10:10   ` Oleg Nesterov
2012-06-04 14:53 ` [PATCH 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov
2012-06-04 15:00   ` Peter Zijlstra
2012-06-04 15:40     ` Oleg Nesterov
2012-06-04 14:57 ` [PATCH 0/3] uprobes: make register/unregister O(n) Peter Zijlstra
2012-06-04 17:37 ` Srikar Dronamraju
2012-06-04 18:41   ` Oleg Nesterov
2012-06-05  9:28     ` Srikar Dronamraju
2012-06-06 14:49 ` [PATCH v2 " Oleg Nesterov
2012-06-06 14:49   ` [PATCH v2 1/3] uprobes: rework register_for_each_vma() to make it O(n) Oleg Nesterov
2012-06-06 14:50   ` [PATCH v2 2/3] uprobes: change build_map_info() to try kmalloc(GFP_NOWAIT) first Oleg Nesterov
2012-06-06 14:50   ` [PATCH v2 3/3] uprobes: document uprobe_register() vs uprobe_mmap() race Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).