All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrew Morton <akpm@linux-foundation.org>
To: Nico Pache <npache@redhat.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Rafael Aquini <aquini@redhat.com>,
	Waiman Long <longman@redhat.com>, Baoquan He <bhe@redhat.com>,
	Christoph von Recklinghausen <crecklin@redhat.com>,
	Don Dutile <ddutile@redhat.com>,
	"Herton R . Krzesinski" <herton@redhat.com>,
	David Rientjes <rientjes@google.com>,
	Michal Hocko <mhocko@suse.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Thomas Gleixner <tglx@linutronix.de>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>, Joel Savitz <jsavitz@redhat.com>,
	Darren Hart <dvhart@infradead.org>,
	stable@kernel.org
Subject: Re: [PATCH v7] oom_kill.c: futex: Don't OOM reap the VMA containing the robust_list_head
Date: Thu, 7 Apr 2022 20:12:50 -0700	[thread overview]
Message-ID: <20220407201250.b4ebaae0cb327cad7b2eb3cf@linux-foundation.org> (raw)
In-Reply-To: <20220408030137.3693195-1-npache@redhat.com>

On Thu,  7 Apr 2022 23:01:37 -0400 Nico Pache <npache@redhat.com> wrote:

> The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can
> be targeted by the oom reaper. This mapping is used to store the futex
> robust list head; the kernel does not keep a copy of the robust list and
> instead references a userspace address to maintain the robustness during
> a process death. A race can occur between exit_mm and the oom reaper that
> allows the oom reaper to free the memory of the futex robust list before
> the exit path has handled the futex death:
> 
>     CPU1                               CPU2
> ------------------------------------------------------------------------
>     page_fault
>     do_exit "signal"
>     wake_oom_reaper
>                                         oom_reaper
>                                         oom_reap_task_mm (invalidates mm)
>     exit_mm
>     exit_mm_release
>     futex_exit_release
>     futex_cleanup
>     exit_robust_list
>     get_user (EFAULT- can't access memory)
> 
> If the get_user EFAULT's, the kernel will be unable to recover the
> waiters on the robust_list, leaving userspace mutexes hung indefinitely.
> 
> Use the robust_list address stored in the kernel to skip the VMA that holds
> it, allowing a successful futex_cleanup.
> 
> Theoretically a failure can still occur if there are locks mapped as
> PRIVATE|ANON; however, the robust futexes are a best-effort approach.
> This patch only strengthens that best-effort.
> 
> The following case can still fail:
> robust head (skipped) -> private lock (reaped) -> shared lock (skipped)
> 
> Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer
> 
> [1] https://elixir.bootlin.com/glibc/latest/source/nptl/allocatestack.c#L370
> 
> ...
>
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -106,7 +106,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
>  	return 0;
>  }
>  
> -bool __oom_reap_task_mm(struct mm_struct *mm);
> +bool __oom_reap_task_mm(struct mm_struct *mm, void *robust_list);
>  
>  long oom_badness(struct task_struct *p,
>  		unsigned long totalpages);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3aa839f81e63..d5af1b83cbb2 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3109,6 +3109,11 @@ void exit_mmap(struct mm_struct *mm)
>  	struct mmu_gather tlb;
>  	struct vm_area_struct *vma;
>  	unsigned long nr_accounted = 0;
> +	void *robust_list;
> +
> +#ifdef CONFIG_FUTEX
> +	robust_list = current->robust_list;
> +#endif
>  
>  	/* mm's last user has gone, and its about to be pulled down */
>  	mmu_notifier_release(mm);
> @@ -3126,7 +3131,8 @@ void exit_mmap(struct mm_struct *mm)
>  		 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
>  		 * __oom_reap_task_mm() will not block.
>  		 */
> -		(void)__oom_reap_task_mm(mm);
> +		(void)__oom_reap_task_mm(mm, robust_list);

uninitialized var warning when CONFIG_FUTEX=n?

> +
>  		set_bit(MMF_OOM_SKIP, &mm->flags);
>  	}
>  
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -509,9 +509,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
>  static struct task_struct *oom_reaper_list;
>  static DEFINE_SPINLOCK(oom_reaper_lock);
>  
> -bool __oom_reap_task_mm(struct mm_struct *mm)
> +bool __oom_reap_task_mm(struct mm_struct *mm, void *robust_list)

Well, this is no longer necessarily a robust_list*.  It's just an
address to skip and the name should reflect that?

>  {
>  	struct vm_area_struct *vma;
> +	unsigned long skip_vma = (unsigned long) robust_list;
>  	bool ret = true;
>  
>  	/*
> @@ -526,6 +527,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>  		if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
>  			continue;
>  
> +#ifdef CONFIG_FUTEX
> +		/*
> +		 * The OOM reaper runs concurrently with do_exit.
> +		 * The robust_list_head is stored in userspace and is required
> +		 * by the exit path to recover the robust futex waiters.
> +		 * Skip the VMA that contains the robust_list to allow for
> +		 * proper cleanup.
> +		 */
> +		if (vma->vm_start <= skip_vma && vma->vm_end > skip_vma) {
> +			pr_info("oom_reaper: skipping vma, contains robust_list");
> +			continue;
> +		}
> +#endif
> +
>  		/*
>  		 * Only anonymous pages have a good chance to be dropped
>  		 * without additional steps which we cannot afford as we
> @@ -567,6 +582,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>  static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  {
>  	bool ret = true;
> +	void *robust_list;
>  
>  	if (!mmap_read_trylock(mm)) {
>  		trace_skip_task_reaping(tsk->pid);
> @@ -586,8 +602,11 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  
>  	trace_start_task_reaping(tsk->pid);
>  
> +#ifdef CONFIG_FUTEX
> +	robust_list = tsk->robust_list;
> +#endif
>  	/* failed to reap part of the address space. Try again later */
> -	ret = __oom_reap_task_mm(mm);
> +	ret = __oom_reap_task_mm(mm, robust_list);

unintialized var when CONFIG_FUTEX=n?

>  	if (!ret)
>  		goto out_finish;
>  
> @@ -1149,6 +1168,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
>  	unsigned int f_flags;
>  	bool reap = false;
>  	long ret = 0;
> +	void *robust_list;
>  
>  	if (flags)
>  		return -EINVAL;
> @@ -1186,11 +1206,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
>  		ret = -EINTR;
>  		goto drop_mm;
>  	}
> +
> +#ifdef CONFIG_FUTEX
> +	robust_list = p->robust_list;
> +#endif
>  	/*
>  	 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
>  	 * possible change in exit_mmap is seen
>  	 */
> -	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
> +	if (!test_bit(MMF_OOM_SKIP, &mm->flags) &&
> +			!__oom_reap_task_mm(mm, robust_list))

again

>  		ret = -EAGAIN;
>  	mmap_read_unlock(mm);


  reply	other threads:[~2022-04-08  3:12 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-08  3:01 [PATCH v7] oom_kill.c: futex: Don't OOM reap the VMA containing the robust_list_head Nico Pache
2022-04-08  3:12 ` Andrew Morton [this message]
2022-04-08  3:24   ` Nico Pache

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220407201250.b4ebaae0cb327cad7b2eb3cf@linux-foundation.org \
    --to=akpm@linux-foundation.org \
    --cc=aarcange@redhat.com \
    --cc=aquini@redhat.com \
    --cc=bhe@redhat.com \
    --cc=crecklin@redhat.com \
    --cc=dave@stgolabs.net \
    --cc=ddutile@redhat.com \
    --cc=dvhart@infradead.org \
    --cc=herton@redhat.com \
    --cc=jsavitz@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longman@redhat.com \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=npache@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rientjes@google.com \
    --cc=stable@kernel.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.