> -----Original Message-----
> From: Michal Hocko [mailto:mhocko@kernel.org]
> Sent: Friday, August 04, 2017 10:57 PM
> To: Tetsuo Handa
> Cc: linux-mm@kvack.org; akpm@linux-foundation.org; 院文苇; oleg@redhat.com; rientjes@google.com; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] mm, oom: fix potential data corruption when oom_reaper races with writer
>
> On Fri 04-08-17 13:00:47, Michal Hocko wrote:
> > On Fri 04-08-17 19:41:42, Tetsuo Handa wrote:
> [...]
> > > Yes. Data corruption still happens.
> >
> > I guess I managed to reproduce finally. Will investigate further.
>
> One limitation of the current MMF_UNSTABLE implementation is that it still keeps the new page mapped and only sends EFAULT/kill to the consumer. If somebody tries to re-read the same content nothing will really happen. I went this way because it was much simpler and memory consumers usually do not retry on EFAULT. Maybe this is not the case here.
>
> I've been staring into iov_iter_copy_from_user_atomic, which I believe should be the common write path that reads the user buffer where the corruption caused by the oom_reaper would come from. iov_iter_fault_in_readable should be called before this function. If this happened after MMF_UNSTABLE was set then we should get EFAULT and bail out early. Let's assume this wasn't the case. Then we should get down to iov_iter_copy_from_user_atomic and that one shouldn't copy any data because __copy_from_user_inatomic says
>
>  * If copying succeeds, the return value must be 0. If some data cannot be
>  * fetched, it is permitted to copy less than had been fetched; the only
>  * hard requirement is that not storing anything at all (i.e. returning size)
>  * should happen only when nothing could be copied. In other words, you don't
>  * have to squeeze as much as possible - it is allowed, but not necessary.
>
> which should be our case.
>
> I was testing with xfs (but generic_perform_write seems to be doing the same thing) and that one does
>
>         if (unlikely(copied == 0)) {
>                 /*
>                  * If we were unable to copy any data at all, we must
>                  * fall back to a single segment length write.
>                  *
>                  * If we didn't fallback here, we could livelock
>                  * because not all segments in the iov can be copied at
>                  * once without a pagefault.
>                  */
>                 bytes = min_t(unsigned long, PAGE_SIZE - offset,
>                               iov_iter_single_seg_count(i));
>                 goto again;
>         }
>
> and that again will go through iov_iter_fault_in_readable again and that will succeed now.

Agree, didn't notice this case before.

> And that's why we still see the corruption. That, however, means that the MMF_UNSTABLE implementation has to be more complex and we have to hook into all anonymous memory fault paths, which I hoped I could avoid previously.
>
> This is a rough first draft that passes the test case from Tetsuo on my system. It will need much more eyes on it and I will return to it with a fresh brain next week. I would appreciate as much testing as possible.
>
> Note that this is on top of the previous attempt for the fix but I will squash the result into one patch because the previous one is not sufficient.
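For reference, the fallback quoted above sits in the buffered write loop roughly as sketched below. This is a trimmed and annotated sketch of generic_perform_write(), not the exact kernel code (the ->write_begin()/->write_end() error handling, dirty throttling and signal checks are cut down), just to make it clear why the early EFAULT from the pre-fault is not enough on its own:

ssize_t generic_perform_write_sketch(struct file *file, struct iov_iter *i,
                                     loff_t pos)
{
        struct address_space *mapping = file->f_mapping;
        long status = 0;
        ssize_t written = 0;

        do {
                struct page *page;
                void *fsdata;
                unsigned long offset = pos & (PAGE_SIZE - 1);
                unsigned long bytes = min_t(unsigned long, PAGE_SIZE - offset,
                                            iov_iter_count(i));
                size_t copied;

again:
                /*
                 * Pre-fault the user buffer.  If the anonymous fault paths
                 * honour MMF_UNSTABLE, a reaped mm fails here with -EFAULT
                 * and we never touch the page cache.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                        break;
                }

                status = mapping->a_ops->write_begin(file, mapping, pos, bytes,
                                                     0, &page, &fsdata);
                if (unlikely(status < 0))
                        break;

                /*
                 * If the oom_reaper unmapped the buffer after the pre-fault,
                 * this atomic copy cannot fault and returns 0.
                 */
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

                status = mapping->a_ops->write_end(file, mapping, pos, bytes,
                                                   copied, page, fsdata);
                if (unlikely(status < 0))
                        break;
                copied = status;

                iov_iter_advance(i, copied);
                if (unlikely(copied == 0)) {
                        /*
                         * The single-segment fallback quoted above: it jumps
                         * back to the pre-fault.  Without MMF_UNSTABLE checks
                         * in the anonymous fault handlers the refault installs
                         * a fresh zeroed page, the retry "succeeds" and zeroes
                         * get written out -- the corruption being discussed.
                         */
                        bytes = min_t(unsigned long, PAGE_SIZE - offset,
                                      iov_iter_single_seg_count(i));
                        goto again;
                }
                pos += copied;
                written += copied;
        } while (iov_iter_count(i));

        return written ? written : status;
}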
> ---
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 86975dec0ba1..1fbc78d423d7 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -550,6 +550,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
>  	struct mem_cgroup *memcg;
>  	pgtable_t pgtable;
>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +	int ret;
>  
>  	VM_BUG_ON_PAGE(!PageCompound(page), page);
>  
> @@ -561,9 +562,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
>  
>  	pgtable = pte_alloc_one(vma->vm_mm, haddr);
>  	if (unlikely(!pgtable)) {
> -		mem_cgroup_cancel_charge(page, memcg, true);
> -		put_page(page);
> -		return VM_FAULT_OOM;
> +		ret = VM_FAULT_OOM;
> +		goto release;
>  	}
>  
>  	clear_huge_page(page, haddr, HPAGE_PMD_NR);
> @@ -583,6 +583,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
>  	} else {
>  		pmd_t entry;
>  
> +		/*
> +		 * range could have been already torn down by
> +		 * the oom reaper
> +		 */
> +		if (test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)) {
> +			spin_unlock(vmf->ptl);
> +			ret = VM_FAULT_SIGBUS;
> +			goto release;
> +		}
>  		/* Deliver the page fault to userland */
>  		if (userfaultfd_missing(vma)) {
>  			int ret;
> @@ -610,6 +619,13 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
>  	}
>  
>  	return 0;
> +release:
> +	if (pgtable)
> +		pte_free(vma->vm_mm, pgtable);
> +	mem_cgroup_cancel_charge(page, memcg, true);
> +	put_page(page);
> +	return ret;
> +
>  }
>  
>  /*
> @@ -688,7 +704,14 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  		ret = 0;
>  		set = false;
>  		if (pmd_none(*vmf->pmd)) {
> -			if (userfaultfd_missing(vma)) {
> +			/*
> +			 * range could have been already torn down by
> +			 * the oom reaper
> +			 */
> +			if (test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)) {
> +				spin_unlock(vmf->ptl);
> +				ret = VM_FAULT_SIGBUS;
> +			} else if (userfaultfd_missing(vma)) {
>  				spin_unlock(vmf->ptl);
>  				ret = handle_userfault(vmf, VM_UFFD_MISSING);
>  				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
> diff --git a/mm/memory.c b/mm/memory.c
> index e7308e633b52..7de9508e38e4 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2864,6 +2864,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct mem_cgroup *memcg;
>  	struct page *page;
> +	int ret = 0;
>  	pte_t entry;
>  
>  	/* File mapping without ->vm_ops ? */
> @@ -2896,6 +2897,14 @@ static int do_anonymous_page(struct vm_fault *vmf)
>  						vmf->address, &vmf->ptl);
>  		if (!pte_none(*vmf->pte))
>  			goto unlock;
> +		/*
> +		 * range could have been already torn down by
> +		 * the oom reaper
> +		 */
> +		if (test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)) {
> +			ret = VM_FAULT_SIGBUS;
> +			goto unlock;
> +		}
>  		/* Deliver the page fault to userland, check inside PT lock */
>  		if (userfaultfd_missing(vma)) {
>  			pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -2930,6 +2939,15 @@ static int do_anonymous_page(struct vm_fault *vmf)
>  	if (!pte_none(*vmf->pte))
>  		goto release;
>  
> +	/*
> +	 * range could have been already torn down by
> +	 * the oom reaper
> +	 */
> +	if (test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto release;
> +	}
> +
>  	/* Deliver the page fault to userland, check inside PT lock */
>  	if (userfaultfd_missing(vma)) {
>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -2949,7 +2967,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
>  	update_mmu_cache(vma, vmf->address, vmf->pte);
>  unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
> -	return 0;
> +	return ret;
>  release:
>  	mem_cgroup_cancel_charge(page, memcg, false);
>  	put_page(page);
> @@ -3231,7 +3249,10 @@ int finish_fault(struct vm_fault *vmf)
>  		page = vmf->cow_page;
>  	else
>  		page = vmf->page;
> -	ret = alloc_set_pte(vmf, vmf->memcg, page);
> +	if (!test_bit(MMF_UNSTABLE, &vmf->vma->vm_mm->flags))
> +		ret = alloc_set_pte(vmf, vmf->memcg, page);
> +	else
> +		ret = VM_FAULT_SIGBUS;
>  	if (vmf->pte)
>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
>  	return ret;
> @@ -3871,24 +3892,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>  		mem_cgroup_oom_synchronize(false);
>  	}
>  
> -	/*
> -	 * This mm has been already reaped by the oom reaper and so the
> -	 * refault cannot be trusted in general. Anonymous refaults would
> -	 * lose data and give a zero page instead e.g.
> -	 */
> -	if (unlikely(!(ret & VM_FAULT_ERROR)
> -			&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) {
> -		/*
> -		 * We are going to enforce SIGBUS but the PF path might have
> -		 * dropped the mmap_sem already so take it again so that
> -		 * we do not break expectations of all arch specific PF paths
> -		 * and g-u-p
> -		 */
> -		if (ret & VM_FAULT_RETRY)
> -			down_read(&vma->vm_mm->mmap_sem);
> -		ret = VM_FAULT_SIGBUS;
> -	}
> -
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(handle_mm_fault);
> --
> Michal Hocko
> SUSE Labs
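To make the user-visible pattern concrete: the problematic case is a write(2) whose source buffer is private anonymous memory belonging to a task the OOM killer may select. The program below is only an illustration of that pattern (it is not the test case referenced above; the file path, size and fill pattern are made up):

/* Illustrative only: write(2) from a private anonymous buffer. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 1UL << 20;         /* arbitrary 1 MiB */
        uint8_t *buf;
        int fd;

        fd = open("/tmp/oom-writer-demo", O_CREAT | O_TRUNC | O_WRONLY, 0600);
        if (fd < 0)
                return 1;

        /* Private anonymous memory is exactly what the oom_reaper tears down. */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;
        memset(buf, 0xaa, len);         /* any non-zero pattern */

        /*
         * If the task is OOM-killed while write() sits between the pre-fault
         * and the atomic copy, an unfixed kernel can refault zeroed pages and
         * the file ends up containing zeroes instead of 0xaa.  With the
         * MMF_UNSTABLE checks in the anonymous fault paths the write fails
         * (EFAULT/SIGBUS) instead of silently corrupting data.
         */
        if (write(fd, buf, len) != (ssize_t)len)
                return 1;

        close(fd);
        return 0;
}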