From: Mike Kravetz <mike.kravetz@oracle.com>
To: riel@surriel.com
Cc: linux-kernel@vger.kernel.org, kernel-team@meta.com,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	muchun.song@linux.dev, leit@meta.com, willy@infradead.org,
	stable@kernel.org
Subject: Re: [PATCH 2/3] hugetlbfs: close race between MADV_DONTNEED and page fault
Date: Wed, 4 Oct 2023 20:19:59 -0700	[thread overview]
Message-ID: <20231005031959.GA13437@monkey> (raw)
In-Reply-To: <20231004032814.3108383-3-riel@surriel.com>

On 10/03/23 23:25, riel@surriel.com wrote:
> From: Rik van Riel <riel@surriel.com>
> 
> Malloc libraries, like jemalloc and tcmalloc, make decisions about when
> to call madvise independently from the code in the main application.
> 
> This sometimes results in the application page faulting on an address,
> right after the malloc library has shot down the backing memory with
> MADV_DONTNEED.
> 
> Usually this is harmless, because we always have some 4kB pages
> sitting around to satisfy a page fault. However, with hugetlbfs,
> systems often allocate only the exact number of huge pages that
> the application wants.
> 
> Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
> any lock taken on the page fault path, which can open up the following
> race condition:
> 
>        CPU 1                            CPU 2
> 
>        MADV_DONTNEED
>        unmap page
>        shoot down TLB entry
>                                        page fault
>                                        fail to allocate a huge page
>                                        killed with SIGBUS
>        free page
> 
> Fix that race by pulling the locking from __unmap_hugepage_final_range
> into helper functions called from zap_page_range_single. This ensures
> page faults stay locked out of the MADV_DONTNEED VMA until the
> huge pages have actually been freed.
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> Cc: stable@kernel.org
> Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
> ---
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ee7497f37098..424bb8da9519 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5435,16 +5435,19 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>  		tlb_flush_mmu_tlbonly(tlb);
>  }
>  
> -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> -			  struct vm_area_struct *vma, unsigned long start,
> -			  unsigned long end, struct page *ref_page,
> -			  zap_flags_t zap_flags)
> +void __hugetlb_zap_begin(struct vm_area_struct *vma,
> +			 unsigned long *start, unsigned long *end)
>  {
> +	adjust_range_if_pmd_sharing_possible(vma, start, end);
>  	hugetlb_vma_lock_write(vma);
> -	i_mmap_lock_write(vma->vm_file->f_mapping);
> +	if (vma->vm_file)
> +		i_mmap_lock_write(vma->vm_file->f_mapping);
> +}
>  
> -	/* mmu notification performed in caller */
> -	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
> +void __hugetlb_zap_end(struct vm_area_struct *vma,
> +		       struct zap_details *details)
> +{
> +	zap_flags_t zap_flags = details ? details->zap_flags : 0;
>  
>  	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
>  		/*
> @@ -5457,11 +5460,12 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
>  		 * someone else.
>  		 */
>  		__hugetlb_vma_unlock_write_free(vma);
> -		i_mmap_unlock_write(vma->vm_file->f_mapping);
>  	} else {
> -		i_mmap_unlock_write(vma->vm_file->f_mapping);
>  		hugetlb_vma_unlock_write(vma);
>  	}
> +
> +	if (vma->vm_file)
> +		i_mmap_unlock_write(vma->vm_file->f_mapping);
>  }

In the case of an mmap (hugetlbfs_file_mmap) error, the per-vma hugetlb
lock will not be set up.  The hugetlb_vma_lock/unlock routines do not
check for this, as they were previously always called after the lock was
set up.  So we can now get:

[   47.653806] BUG: kernel NULL pointer dereference, address: 00000000000000c8
[   47.654967] #PF: supervisor read access in kernel mode
[   47.655900] #PF: error_code(0x0000) - not-present page
[   47.656814] PGD 8000000307415067 P4D 8000000307415067 PUD 30587b067 PMD 0 
[   47.658005] Oops: 0000 [#1] PREEMPT SMP PTI
[   47.658777] CPU: 3 PID: 1224 Comm: heap-overflow Tainted: G        W          6.6.0-rc3-next-20230925+ #19
[   47.660428] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
[   47.661931] RIP: 0010:__lock_acquire+0x1e6/0x2390
[   47.662784] Code: 46 24 4c 89 e8 25 ff 1f 00 00 48 0f a3 05 f2 84 0f 02 0f 83 e9 05 00 00 48 8d 14 40 48 8d 04 90 48 c1 e0 04 48 05 a0 89 27 83 <0f> b6 98 c8 00 00 00 41 0f b7 46 20 66 25 ff 1f 0f b7 c0 48 0f a3
[   47.665890] RSP: 0018:ffffc90004a03ac8 EFLAGS: 00010046
[   47.667009] RAX: 0000000000000000 RBX: 000000000004138c RCX: 0000000000000000
[   47.668321] RDX: 0000000000000002 RSI: ffffffff8246ce26 RDI: 00000000ffffffff
[   47.669580] RBP: 0000000000000000 R08: 0000000000009ffb R09: 0000000000000001
[   47.670825] R10: ffff888303c51ac0 R11: ffff888303c52430 R12: 0000000000000001
[   47.672070] R13: 3b97e6ab9880538c R14: ffff888303c52458 R15: 0000000000000000
[   47.673285] FS:  00007f3065e7a0c0(0000) GS:ffff888477d00000(0000) knlGS:0000000000000000
[   47.675504] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   47.676646] CR2: 00000000000000c8 CR3: 000000030409a004 CR4: 0000000000370ef0
[   47.677975] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   47.679264] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   47.680603] Call Trace:
[   47.681196]  <TASK>
[   47.681723]  ? __die+0x1f/0x70
[   47.682440]  ? page_fault_oops+0x159/0x450
[   47.683246]  ? do_user_addr_fault+0x65/0x850
[   47.684082]  ? exc_page_fault+0x6d/0x1c0
[   47.684838]  ? asm_exc_page_fault+0x22/0x30
[   47.685611]  ? __lock_acquire+0x1e6/0x2390
[   47.686360]  ? __lock_acquire+0xab9/0x2390
[   47.687123]  lock_acquire+0xd4/0x2c0
[   47.687811]  ? __hugetlb_zap_begin+0x6e/0xa0
[   47.688595]  ? mark_held_locks+0x49/0x80
[   47.689321]  down_write+0x2a/0xc0
[   47.689976]  ? __hugetlb_zap_begin+0x6e/0xa0
[   47.690862]  __hugetlb_zap_begin+0x6e/0xa0
[   47.691707]  unmap_vmas+0xb3/0x100
[   47.692480]  unmap_region.constprop.0+0xcc/0x140
[   47.693518]  ? lock_release+0x142/0x290
[   47.694304]  ? preempt_count_add+0x47/0xa0
[   47.695109]  mmap_region+0x565/0xab0
[   47.695831]  do_mmap+0x35a/0x520
[   47.696511]  vm_mmap_pgoff+0xdf/0x200
[   47.697419]  ksys_mmap_pgoff+0xdb/0x200
[   47.698368]  do_syscall_64+0x37/0x90
[   47.699148]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[   47.700307] RIP: 0033:0x7f3065f77086

In my environment, I added the following to this patch to resolve the
issue.

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d25db18c9526..48370f5b70f5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5503,10 +5503,12 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 void __hugetlb_zap_begin(struct vm_area_struct *vma,
 			 unsigned long *start, unsigned long *end)
 {
+	if (!vma->vm_file)	/* hugetlbfs_file_mmap error */
+		return;
+
 	adjust_range_if_pmd_sharing_possible(vma, start, end);
 	hugetlb_vma_lock_write(vma);
-	if (vma->vm_file)
-		i_mmap_lock_write(vma->vm_file->f_mapping);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
 }
 
 void __hugetlb_zap_end(struct vm_area_struct *vma,
@@ -5514,6 +5516,9 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
 {
 	zap_flags_t zap_flags = details ? details->zap_flags : 0;
 
+	if (!vma->vm_file)	/* hugetlbfs_file_mmap error */
+		return;
+
 	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
 		/*
 		 * Unlock and free the vma lock before releasing i_mmap_rwsem.
@@ -5529,8 +5534,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
 		hugetlb_vma_unlock_write(vma);
 	}
 
-	if (vma->vm_file)
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,

Another way to resolve this would be to fix up the hugetlb_vma_lock/unlock
routines to check for and handle a null lock.
-- 
Mike Kravetz
