All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
@ 2023-03-03 21:32 Suren Baghdasaryan
  2023-03-04  9:14 ` kernel test robot
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Suren Baghdasaryan @ 2023-03-03 21:32 UTC (permalink / raw)
  To: akpm
  Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
	liam.howlett, songliubraving, peterx, david, dhowells, hughd,
	bigeasy, punit.agrawal, lstoakes, rppt, jannh, shakeelb, gthelen,
	gurua, arjunroy, posk, linux-mm, linux-kernel, kernel-team,
	Suren Baghdasaryan, syzbot+8955a9646d1a48b8be92

Internal syzkaller instance running on linux-next reported a lock inversion
caused by vm_lock being taken after i_mmap_rwsem:

======================================================
WARNING: possible circular locking dependency detected
6.2.0-next-20230301-syzkaller #0 Not tainted
------------------------------------------------------
syz-executor115/5084 is trying to acquire lock:
ffff888078307a90 (&vma->vm_lock->lock){++++}-{3:3}, at: vma_start_write include/linux/mm.h:678 [inline]
ffff888078307a90 (&vma->vm_lock->lock){++++}-{3:3}, at: retract_page_tables mm/khugepaged.c:1826 [inline]
ffff888078307a90 (&vma->vm_lock->lock){++++}-{3:3}, at: collapse_file+0x4fa5/0x5980 mm/khugepaged.c:2204

but task is already holding lock:
ffff88801f93efa8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: i_mmap_lock_write include/linux/fs.h:468 [inline]
ffff88801f93efa8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: retract_page_tables mm/khugepaged.c:1745 [inline]
ffff88801f93efa8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: collapse_file+0x3da6/0x5980 mm/khugepaged.c:2204

retract_page_tables takes i_mmap_rwsem before exclusive mmap_lock, which
is the inverse of the normal order. Deadlock is avoided by try-locking
mmap_lock and skipping on failure to obtain it. Locking the VMA should
use the same locking pattern to avoid this lock inversion.

Fixes: 44a83f2083bd ("mm/khugepaged: write-lock VMA while collapsing a huge page")
Reported-by: syzbot+8955a9646d1a48b8be92@syzkaller.appspotmail.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h | 39 ++++++++++++++++++++++++++++-----------
 mm/khugepaged.c    |  5 ++++-
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1b9be34a24fb..5f16263d176d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -676,18 +676,23 @@ static inline void vma_end_read(struct vm_area_struct *vma)
 	rcu_read_unlock();
 }
 
-static inline void vma_start_write(struct vm_area_struct *vma)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
 {
-	int mm_lock_seq;
-
 	mmap_assert_write_locked(vma->vm_mm);
 
 	/*
 	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 	 * mm->mm_lock_seq can't be concurrently modified.
 	 */
-	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
-	if (vma->vm_lock_seq == mm_lock_seq)
+	*mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
 	down_write(&vma->vm_lock->lock);
@@ -695,14 +700,26 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	up_write(&vma->vm_lock->lock);
 }
 
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return true;
+
+	if (!down_write_trylock(&vma->vm_lock->lock))
+		return false;
+
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->vm_lock->lock);
+	return true;
+}
+
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
-	mmap_assert_write_locked(vma->vm_mm);
-	/*
-	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
-	 * mm->mm_lock_seq can't be concurrently modified.
-	 */
-	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+	int mm_lock_seq;
+
+	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
 
 static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c64e01f03f27..408fed42c9f5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1795,6 +1795,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
 		result = SCAN_PTE_MAPPED_HUGEPAGE;
 		if ((cc->is_khugepaged || is_target) &&
 		    mmap_write_trylock(mm)) {
+			/* trylock for the same lock inversion as above */
+			if (!vma_try_start_write(vma))
+				goto unlock_next;
+
 			/*
 			 * Re-check whether we have an ->anon_vma, because
 			 * collapse_and_free_pmd() requires that either no
@@ -1823,7 +1827,6 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
 				result = SCAN_PTE_UFFD_WP;
 				goto unlock_next;
 			}
-			vma_start_write(vma);
 			collapse_and_free_pmd(mm, vma, addr, pmd);
 			if (!cc->is_khugepaged && is_target)
 				result = set_huge_pmd(vma, addr, pmd, hpage);
-- 
2.40.0.rc0.216.gc4246ad0f0-goog


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
  2023-03-03 21:32 [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables Suren Baghdasaryan
@ 2023-03-04  9:14 ` kernel test robot
  2023-03-04  9:14 ` kernel test robot
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2023-03-04  9:14 UTC (permalink / raw)
  To: Suren Baghdasaryan; +Cc: llvm, oe-kbuild-all

Hi Suren,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230303213250.3555716-1-surenb%40google.com
patch subject: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
config: s390-randconfig-r024-20230302 (https://download.01.org/0day-ci/archive/20230304/202303041731.bKMGi6Dx-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project 67409911353323ca5edf2049ef0df54132fa1ca7)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
        git checkout 338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303041731.bKMGi6Dx-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/khugepaged.c:1799:9: error: call to undeclared function 'vma_try_start_write'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                           if (!vma_try_start_write(vma))
                                ^
   mm/khugepaged.c:1799:9: note: did you mean 'vma_start_write'?
   include/linux/mm.h:742:20: note: 'vma_start_write' declared here
   static inline void vma_start_write(struct vm_area_struct *vma) {}
                      ^
   1 error generated.


vim +/vma_try_start_write +1799 mm/khugepaged.c

  1736	
  1737	static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
  1738				       struct mm_struct *target_mm,
  1739				       unsigned long target_addr, struct page *hpage,
  1740				       struct collapse_control *cc)
  1741	{
  1742		struct vm_area_struct *vma;
  1743		int target_result = SCAN_FAIL;
  1744	
  1745		i_mmap_lock_write(mapping);
  1746		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
  1747			int result = SCAN_FAIL;
  1748			struct mm_struct *mm = NULL;
  1749			unsigned long addr = 0;
  1750			pmd_t *pmd;
  1751			bool is_target = false;
  1752	
  1753			/*
  1754			 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
  1755			 * got written to. These VMAs are likely not worth investing
  1756			 * mmap_write_lock(mm) as PMD-mapping is likely to be split
  1757			 * later.
  1758			 *
  1759			 * Note that vma->anon_vma check is racy: it can be set up after
  1760			 * the check but before we took mmap_lock by the fault path.
  1761			 * But page lock would prevent establishing any new ptes of the
  1762			 * page, so we are safe.
  1763			 *
  1764			 * An alternative would be drop the check, but check that page
  1765			 * table is clear before calling pmdp_collapse_flush() under
  1766			 * ptl. It has higher chance to recover THP for the VMA, but
  1767			 * has higher cost too. It would also probably require locking
  1768			 * the anon_vma.
  1769			 */
  1770			if (READ_ONCE(vma->anon_vma)) {
  1771				result = SCAN_PAGE_ANON;
  1772				goto next;
  1773			}
  1774			addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1775			if (addr & ~HPAGE_PMD_MASK ||
  1776			    vma->vm_end < addr + HPAGE_PMD_SIZE) {
  1777				result = SCAN_VMA_CHECK;
  1778				goto next;
  1779			}
  1780			mm = vma->vm_mm;
  1781			is_target = mm == target_mm && addr == target_addr;
  1782			result = find_pmd_or_thp_or_none(mm, addr, &pmd);
  1783			if (result != SCAN_SUCCEED)
  1784				goto next;
  1785			/*
  1786			 * We need exclusive mmap_lock to retract page table.
  1787			 *
  1788			 * We use trylock due to lock inversion: we need to acquire
  1789			 * mmap_lock while holding page lock. Fault path does it in
  1790			 * reverse order. Trylock is a way to avoid deadlock.
  1791			 *
  1792			 * Also, it's not MADV_COLLAPSE's job to collapse other
  1793			 * mappings - let khugepaged take care of them later.
  1794			 */
  1795			result = SCAN_PTE_MAPPED_HUGEPAGE;
  1796			if ((cc->is_khugepaged || is_target) &&
  1797			    mmap_write_trylock(mm)) {
  1798				/* trylock for the same lock inversion as above */
> 1799				if (!vma_try_start_write(vma))
  1800					goto unlock_next;
  1801	
  1802				/*
  1803				 * Re-check whether we have an ->anon_vma, because
  1804				 * collapse_and_free_pmd() requires that either no
  1805				 * ->anon_vma exists or the anon_vma is locked.
  1806				 * We already checked ->anon_vma above, but that check
  1807				 * is racy because ->anon_vma can be populated under the
  1808				 * mmap lock in read mode.
  1809				 */
  1810				if (vma->anon_vma) {
  1811					result = SCAN_PAGE_ANON;
  1812					goto unlock_next;
  1813				}
  1814				/*
  1815				 * When a vma is registered with uffd-wp, we can't
  1816				 * recycle the pmd pgtable because there can be pte
  1817				 * markers installed.  Skip it only, so the rest mm/vma
  1818				 * can still have the same file mapped hugely, however
  1819				 * it'll always mapped in small page size for uffd-wp
  1820				 * registered ranges.
  1821				 */
  1822				if (hpage_collapse_test_exit(mm)) {
  1823					result = SCAN_ANY_PROCESS;
  1824					goto unlock_next;
  1825				}
  1826				if (userfaultfd_wp(vma)) {
  1827					result = SCAN_PTE_UFFD_WP;
  1828					goto unlock_next;
  1829				}
  1830				collapse_and_free_pmd(mm, vma, addr, pmd);
  1831				if (!cc->is_khugepaged && is_target)
  1832					result = set_huge_pmd(vma, addr, pmd, hpage);
  1833				else
  1834					result = SCAN_SUCCEED;
  1835	
  1836	unlock_next:
  1837				mmap_write_unlock(mm);
  1838				goto next;
  1839			}
  1840			/*
  1841			 * Calling context will handle target mm/addr. Otherwise, let
  1842			 * khugepaged try again later.
  1843			 */
  1844			if (!is_target) {
  1845				khugepaged_add_pte_mapped_thp(mm, addr);
  1846				continue;
  1847			}
  1848	next:
  1849			if (is_target)
  1850				target_result = result;
  1851		}
  1852		i_mmap_unlock_write(mapping);
  1853		return target_result;
  1854	}
  1855	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
  2023-03-03 21:32 [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables Suren Baghdasaryan
  2023-03-04  9:14 ` kernel test robot
@ 2023-03-04  9:14 ` kernel test robot
  2023-03-04 10:57 ` kernel test robot
  2023-03-04 23:25 ` Suren Baghdasaryan
  3 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2023-03-04  9:14 UTC (permalink / raw)
  To: Suren Baghdasaryan; +Cc: oe-kbuild-all

Hi Suren,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230303213250.3555716-1-surenb%40google.com
patch subject: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
config: x86_64-randconfig-a013 (https://download.01.org/0day-ci/archive/20230304/202303041743.Zyg0Voaz-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
        git checkout 338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303041743.Zyg0Voaz-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/khugepaged.c: In function 'retract_page_tables':
>> mm/khugepaged.c:1799:30: error: implicit declaration of function 'vma_try_start_write'; did you mean 'vma_start_write'? [-Werror=implicit-function-declaration]
    1799 |                         if (!vma_try_start_write(vma))
         |                              ^~~~~~~~~~~~~~~~~~~
         |                              vma_start_write
   cc1: some warnings being treated as errors


vim +1799 mm/khugepaged.c

  1736	
  1737	static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
  1738				       struct mm_struct *target_mm,
  1739				       unsigned long target_addr, struct page *hpage,
  1740				       struct collapse_control *cc)
  1741	{
  1742		struct vm_area_struct *vma;
  1743		int target_result = SCAN_FAIL;
  1744	
  1745		i_mmap_lock_write(mapping);
  1746		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
  1747			int result = SCAN_FAIL;
  1748			struct mm_struct *mm = NULL;
  1749			unsigned long addr = 0;
  1750			pmd_t *pmd;
  1751			bool is_target = false;
  1752	
  1753			/*
  1754			 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
  1755			 * got written to. These VMAs are likely not worth investing
  1756			 * mmap_write_lock(mm) as PMD-mapping is likely to be split
  1757			 * later.
  1758			 *
  1759			 * Note that vma->anon_vma check is racy: it can be set up after
  1760			 * the check but before we took mmap_lock by the fault path.
  1761			 * But page lock would prevent establishing any new ptes of the
  1762			 * page, so we are safe.
  1763			 *
  1764			 * An alternative would be drop the check, but check that page
  1765			 * table is clear before calling pmdp_collapse_flush() under
  1766			 * ptl. It has higher chance to recover THP for the VMA, but
  1767			 * has higher cost too. It would also probably require locking
  1768			 * the anon_vma.
  1769			 */
  1770			if (READ_ONCE(vma->anon_vma)) {
  1771				result = SCAN_PAGE_ANON;
  1772				goto next;
  1773			}
  1774			addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1775			if (addr & ~HPAGE_PMD_MASK ||
  1776			    vma->vm_end < addr + HPAGE_PMD_SIZE) {
  1777				result = SCAN_VMA_CHECK;
  1778				goto next;
  1779			}
  1780			mm = vma->vm_mm;
  1781			is_target = mm == target_mm && addr == target_addr;
  1782			result = find_pmd_or_thp_or_none(mm, addr, &pmd);
  1783			if (result != SCAN_SUCCEED)
  1784				goto next;
  1785			/*
  1786			 * We need exclusive mmap_lock to retract page table.
  1787			 *
  1788			 * We use trylock due to lock inversion: we need to acquire
  1789			 * mmap_lock while holding page lock. Fault path does it in
  1790			 * reverse order. Trylock is a way to avoid deadlock.
  1791			 *
  1792			 * Also, it's not MADV_COLLAPSE's job to collapse other
  1793			 * mappings - let khugepaged take care of them later.
  1794			 */
  1795			result = SCAN_PTE_MAPPED_HUGEPAGE;
  1796			if ((cc->is_khugepaged || is_target) &&
  1797			    mmap_write_trylock(mm)) {
  1798				/* trylock for the same lock inversion as above */
> 1799				if (!vma_try_start_write(vma))
  1800					goto unlock_next;
  1801	
  1802				/*
  1803				 * Re-check whether we have an ->anon_vma, because
  1804				 * collapse_and_free_pmd() requires that either no
  1805				 * ->anon_vma exists or the anon_vma is locked.
  1806				 * We already checked ->anon_vma above, but that check
  1807				 * is racy because ->anon_vma can be populated under the
  1808				 * mmap lock in read mode.
  1809				 */
  1810				if (vma->anon_vma) {
  1811					result = SCAN_PAGE_ANON;
  1812					goto unlock_next;
  1813				}
  1814				/*
  1815				 * When a vma is registered with uffd-wp, we can't
  1816				 * recycle the pmd pgtable because there can be pte
  1817				 * markers installed.  Skip it only, so the rest mm/vma
  1818				 * can still have the same file mapped hugely, however
  1819				 * it'll always mapped in small page size for uffd-wp
  1820				 * registered ranges.
  1821				 */
  1822				if (hpage_collapse_test_exit(mm)) {
  1823					result = SCAN_ANY_PROCESS;
  1824					goto unlock_next;
  1825				}
  1826				if (userfaultfd_wp(vma)) {
  1827					result = SCAN_PTE_UFFD_WP;
  1828					goto unlock_next;
  1829				}
  1830				collapse_and_free_pmd(mm, vma, addr, pmd);
  1831				if (!cc->is_khugepaged && is_target)
  1832					result = set_huge_pmd(vma, addr, pmd, hpage);
  1833				else
  1834					result = SCAN_SUCCEED;
  1835	
  1836	unlock_next:
  1837				mmap_write_unlock(mm);
  1838				goto next;
  1839			}
  1840			/*
  1841			 * Calling context will handle target mm/addr. Otherwise, let
  1842			 * khugepaged try again later.
  1843			 */
  1844			if (!is_target) {
  1845				khugepaged_add_pte_mapped_thp(mm, addr);
  1846				continue;
  1847			}
  1848	next:
  1849			if (is_target)
  1850				target_result = result;
  1851		}
  1852		i_mmap_unlock_write(mapping);
  1853		return target_result;
  1854	}
  1855	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
  2023-03-03 21:32 [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables Suren Baghdasaryan
  2023-03-04  9:14 ` kernel test robot
  2023-03-04  9:14 ` kernel test robot
@ 2023-03-04 10:57 ` kernel test robot
  2023-03-04 23:24   ` Suren Baghdasaryan
  2023-03-04 23:25 ` Suren Baghdasaryan
  3 siblings, 1 reply; 6+ messages in thread
From: kernel test robot @ 2023-03-04 10:57 UTC (permalink / raw)
  To: Suren Baghdasaryan; +Cc: llvm, oe-kbuild-all

Hi Suren,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230303213250.3555716-1-surenb%40google.com
patch subject: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20230304/202303041842.Cv8cTjlL-lkp@intel.com/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
        git checkout 338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303041842.Cv8cTjlL-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/khugepaged.c:1799:9: error: implicit declaration of function 'vma_try_start_write' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
                           if (!vma_try_start_write(vma))
                                ^
   mm/khugepaged.c:1799:9: note: did you mean 'vma_start_write'?
   include/linux/mm.h:742:20: note: 'vma_start_write' declared here
   static inline void vma_start_write(struct vm_area_struct *vma) {}
                      ^
   1 error generated.


vim +/vma_try_start_write +1799 mm/khugepaged.c

  1736	
  1737	static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
  1738				       struct mm_struct *target_mm,
  1739				       unsigned long target_addr, struct page *hpage,
  1740				       struct collapse_control *cc)
  1741	{
  1742		struct vm_area_struct *vma;
  1743		int target_result = SCAN_FAIL;
  1744	
  1745		i_mmap_lock_write(mapping);
  1746		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
  1747			int result = SCAN_FAIL;
  1748			struct mm_struct *mm = NULL;
  1749			unsigned long addr = 0;
  1750			pmd_t *pmd;
  1751			bool is_target = false;
  1752	
  1753			/*
  1754			 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
  1755			 * got written to. These VMAs are likely not worth investing
  1756			 * mmap_write_lock(mm) as PMD-mapping is likely to be split
  1757			 * later.
  1758			 *
  1759			 * Note that vma->anon_vma check is racy: it can be set up after
  1760			 * the check but before we took mmap_lock by the fault path.
  1761			 * But page lock would prevent establishing any new ptes of the
  1762			 * page, so we are safe.
  1763			 *
  1764			 * An alternative would be drop the check, but check that page
  1765			 * table is clear before calling pmdp_collapse_flush() under
  1766			 * ptl. It has higher chance to recover THP for the VMA, but
  1767			 * has higher cost too. It would also probably require locking
  1768			 * the anon_vma.
  1769			 */
  1770			if (READ_ONCE(vma->anon_vma)) {
  1771				result = SCAN_PAGE_ANON;
  1772				goto next;
  1773			}
  1774			addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1775			if (addr & ~HPAGE_PMD_MASK ||
  1776			    vma->vm_end < addr + HPAGE_PMD_SIZE) {
  1777				result = SCAN_VMA_CHECK;
  1778				goto next;
  1779			}
  1780			mm = vma->vm_mm;
  1781			is_target = mm == target_mm && addr == target_addr;
  1782			result = find_pmd_or_thp_or_none(mm, addr, &pmd);
  1783			if (result != SCAN_SUCCEED)
  1784				goto next;
  1785			/*
  1786			 * We need exclusive mmap_lock to retract page table.
  1787			 *
  1788			 * We use trylock due to lock inversion: we need to acquire
  1789			 * mmap_lock while holding page lock. Fault path does it in
  1790			 * reverse order. Trylock is a way to avoid deadlock.
  1791			 *
  1792			 * Also, it's not MADV_COLLAPSE's job to collapse other
  1793			 * mappings - let khugepaged take care of them later.
  1794			 */
  1795			result = SCAN_PTE_MAPPED_HUGEPAGE;
  1796			if ((cc->is_khugepaged || is_target) &&
  1797			    mmap_write_trylock(mm)) {
  1798				/* trylock for the same lock inversion as above */
> 1799				if (!vma_try_start_write(vma))
  1800					goto unlock_next;
  1801	
  1802				/*
  1803				 * Re-check whether we have an ->anon_vma, because
  1804				 * collapse_and_free_pmd() requires that either no
  1805				 * ->anon_vma exists or the anon_vma is locked.
  1806				 * We already checked ->anon_vma above, but that check
  1807				 * is racy because ->anon_vma can be populated under the
  1808				 * mmap lock in read mode.
  1809				 */
  1810				if (vma->anon_vma) {
  1811					result = SCAN_PAGE_ANON;
  1812					goto unlock_next;
  1813				}
  1814				/*
  1815				 * When a vma is registered with uffd-wp, we can't
  1816				 * recycle the pmd pgtable because there can be pte
  1817				 * markers installed.  Skip it only, so the rest mm/vma
  1818				 * can still have the same file mapped hugely, however
  1819				 * it'll always mapped in small page size for uffd-wp
  1820				 * registered ranges.
  1821				 */
  1822				if (hpage_collapse_test_exit(mm)) {
  1823					result = SCAN_ANY_PROCESS;
  1824					goto unlock_next;
  1825				}
  1826				if (userfaultfd_wp(vma)) {
  1827					result = SCAN_PTE_UFFD_WP;
  1828					goto unlock_next;
  1829				}
  1830				collapse_and_free_pmd(mm, vma, addr, pmd);
  1831				if (!cc->is_khugepaged && is_target)
  1832					result = set_huge_pmd(vma, addr, pmd, hpage);
  1833				else
  1834					result = SCAN_SUCCEED;
  1835	
  1836	unlock_next:
  1837				mmap_write_unlock(mm);
  1838				goto next;
  1839			}
  1840			/*
  1841			 * Calling context will handle target mm/addr. Otherwise, let
  1842			 * khugepaged try again later.
  1843			 */
  1844			if (!is_target) {
  1845				khugepaged_add_pte_mapped_thp(mm, addr);
  1846				continue;
  1847			}
  1848	next:
  1849			if (is_target)
  1850				target_result = result;
  1851		}
  1852		i_mmap_unlock_write(mapping);
  1853		return target_result;
  1854	}
  1855	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
  2023-03-04 10:57 ` kernel test robot
@ 2023-03-04 23:24   ` Suren Baghdasaryan
  0 siblings, 0 replies; 6+ messages in thread
From: Suren Baghdasaryan @ 2023-03-04 23:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: llvm, oe-kbuild-all, kernel test robot

Hi Andrew,
I missed vma_try_start_write() definition for CONFIG_PER_VMA_LOCK=n
configuration. Could you please patch this with the following?:

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -740,6 +740,8 @@ static inline bool vma_start_read(struct
vm_area_struct *vma)
  { return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma) {
return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}

or should I send a separate patch?
Thanks,
Suren.



On Sat, Mar 4, 2023 at 2:57 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi Suren,
>
> I love your patch! Yet something to improve:
>
> [auto build test ERROR on akpm-mm/mm-everything]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link:    https://lore.kernel.org/r/20230303213250.3555716-1-surenb%40google.com
> patch subject: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
> config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20230304/202303041842.Cv8cTjlL-lkp@intel.com/config)
> compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # https://github.com/intel-lab-lkp/linux/commit/338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Suren-Baghdasaryan/mm-khugepaged-fix-vm_lock-i_mmap_rwsem-inversion-in-retract_page_tables/20230304-053401
>         git checkout 338d2ee433a0559b3c6c3966b8d4ffe55e8dc6c9
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
>
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Link: https://lore.kernel.org/oe-kbuild-all/202303041842.Cv8cTjlL-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> >> mm/khugepaged.c:1799:9: error: implicit declaration of function 'vma_try_start_write' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
>                            if (!vma_try_start_write(vma))
>                                 ^
>    mm/khugepaged.c:1799:9: note: did you mean 'vma_start_write'?
>    include/linux/mm.h:742:20: note: 'vma_start_write' declared here
>    static inline void vma_start_write(struct vm_area_struct *vma) {}
>                       ^
>    1 error generated.
>
>
> vim +/vma_try_start_write +1799 mm/khugepaged.c
>
>   1736
>   1737  static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>   1738                                 struct mm_struct *target_mm,
>   1739                                 unsigned long target_addr, struct page *hpage,
>   1740                                 struct collapse_control *cc)
>   1741  {
>   1742          struct vm_area_struct *vma;
>   1743          int target_result = SCAN_FAIL;
>   1744
>   1745          i_mmap_lock_write(mapping);
>   1746          vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>   1747                  int result = SCAN_FAIL;
>   1748                  struct mm_struct *mm = NULL;
>   1749                  unsigned long addr = 0;
>   1750                  pmd_t *pmd;
>   1751                  bool is_target = false;
>   1752
>   1753                  /*
>   1754                   * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
>   1755                   * got written to. These VMAs are likely not worth investing
>   1756                   * mmap_write_lock(mm) as PMD-mapping is likely to be split
>   1757                   * later.
>   1758                   *
>   1759                   * Note that vma->anon_vma check is racy: it can be set up after
>   1760                   * the check but before we took mmap_lock by the fault path.
>   1761                   * But page lock would prevent establishing any new ptes of the
>   1762                   * page, so we are safe.
>   1763                   *
>   1764                   * An alternative would be drop the check, but check that page
>   1765                   * table is clear before calling pmdp_collapse_flush() under
>   1766                   * ptl. It has higher chance to recover THP for the VMA, but
>   1767                   * has higher cost too. It would also probably require locking
>   1768                   * the anon_vma.
>   1769                   */
>   1770                  if (READ_ONCE(vma->anon_vma)) {
>   1771                          result = SCAN_PAGE_ANON;
>   1772                          goto next;
>   1773                  }
>   1774                  addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>   1775                  if (addr & ~HPAGE_PMD_MASK ||
>   1776                      vma->vm_end < addr + HPAGE_PMD_SIZE) {
>   1777                          result = SCAN_VMA_CHECK;
>   1778                          goto next;
>   1779                  }
>   1780                  mm = vma->vm_mm;
>   1781                  is_target = mm == target_mm && addr == target_addr;
>   1782                  result = find_pmd_or_thp_or_none(mm, addr, &pmd);
>   1783                  if (result != SCAN_SUCCEED)
>   1784                          goto next;
>   1785                  /*
>   1786                   * We need exclusive mmap_lock to retract page table.
>   1787                   *
>   1788                   * We use trylock due to lock inversion: we need to acquire
>   1789                   * mmap_lock while holding page lock. Fault path does it in
>   1790                   * reverse order. Trylock is a way to avoid deadlock.
>   1791                   *
>   1792                   * Also, it's not MADV_COLLAPSE's job to collapse other
>   1793                   * mappings - let khugepaged take care of them later.
>   1794                   */
>   1795                  result = SCAN_PTE_MAPPED_HUGEPAGE;
>   1796                  if ((cc->is_khugepaged || is_target) &&
>   1797                      mmap_write_trylock(mm)) {
>   1798                          /* trylock for the same lock inversion as above */
> > 1799                          if (!vma_try_start_write(vma))
>   1800                                  goto unlock_next;
>   1801
>   1802                          /*
>   1803                           * Re-check whether we have an ->anon_vma, because
>   1804                           * collapse_and_free_pmd() requires that either no
>   1805                           * ->anon_vma exists or the anon_vma is locked.
>   1806                           * We already checked ->anon_vma above, but that check
>   1807                           * is racy because ->anon_vma can be populated under the
>   1808                           * mmap lock in read mode.
>   1809                           */
>   1810                          if (vma->anon_vma) {
>   1811                                  result = SCAN_PAGE_ANON;
>   1812                                  goto unlock_next;
>   1813                          }
>   1814                          /*
>   1815                           * When a vma is registered with uffd-wp, we can't
>   1816                           * recycle the pmd pgtable because there can be pte
>   1817                           * markers installed.  Skip it only, so the rest mm/vma
>   1818                           * can still have the same file mapped hugely, however
>   1819                           * it'll always mapped in small page size for uffd-wp
>   1820                           * registered ranges.
>   1821                           */
>   1822                          if (hpage_collapse_test_exit(mm)) {
>   1823                                  result = SCAN_ANY_PROCESS;
>   1824                                  goto unlock_next;
>   1825                          }
>   1826                          if (userfaultfd_wp(vma)) {
>   1827                                  result = SCAN_PTE_UFFD_WP;
>   1828                                  goto unlock_next;
>   1829                          }
>   1830                          collapse_and_free_pmd(mm, vma, addr, pmd);
>   1831                          if (!cc->is_khugepaged && is_target)
>   1832                                  result = set_huge_pmd(vma, addr, pmd, hpage);
>   1833                          else
>   1834                                  result = SCAN_SUCCEED;
>   1835
>   1836  unlock_next:
>   1837                          mmap_write_unlock(mm);
>   1838                          goto next;
>   1839                  }
>   1840                  /*
>   1841                   * Calling context will handle target mm/addr. Otherwise, let
>   1842                   * khugepaged try again later.
>   1843                   */
>   1844                  if (!is_target) {
>   1845                          khugepaged_add_pte_mapped_thp(mm, addr);
>   1846                          continue;
>   1847                  }
>   1848  next:
>   1849                  if (is_target)
>   1850                          target_result = result;
>   1851          }
>   1852          i_mmap_unlock_write(mapping);
>   1853          return target_result;
>   1854  }
>   1855
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables
  2023-03-03 21:32 [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables Suren Baghdasaryan
                   ` (2 preceding siblings ...)
  2023-03-04 10:57 ` kernel test robot
@ 2023-03-04 23:25 ` Suren Baghdasaryan
  3 siblings, 0 replies; 6+ messages in thread
From: Suren Baghdasaryan @ 2023-03-04 23:25 UTC (permalink / raw)
  To: akpm
  Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
	liam.howlett, songliubraving, peterx, david, dhowells, hughd,
	bigeasy, punit.agrawal, lstoakes, rppt, jannh, shakeelb, gthelen,
	gurua, arjunroy, posk, linux-mm, linux-kernel, kernel-team,
	syzbot+8955a9646d1a48b8be92

On Fri, Mar 3, 2023 at 1:32 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> Internal syzkaller on linux-next reported a lock inversion caused by
> vm_lock being taken after i_mmap_rwsem:
>
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.2.0-next-20230301-syzkaller #0 Not tainted
> ------------------------------------------------------
> syz-executor115/5084 is trying to acquire lock:
> ffff888078307a90 (&vma->vm_lock->lock){++++}-{3:3}, at: vma_start_write include/linux/mm.h:678 [inline]
> ffff888078307a90 (&vma->vm_lock->lock){++++}-{3:3}, at: retract_page_tables mm/khugepaged.c:1826 [inline]
> ffff888078307a90 (&vma->vm_lock->lock){++++}-{3:3}, at: collapse_file+0x4fa5/0x5980 mm/khugepaged.c:2204
>
> but task is already holding lock:
> ffff88801f93efa8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: i_mmap_lock_write include/linux/fs.h:468 [inline]
> ffff88801f93efa8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: retract_page_tables mm/khugepaged.c:1745 [inline]
> ffff88801f93efa8 (&mapping->i_mmap_rwsem){++++}-{3:3}, at: collapse_file+0x3da6/0x5980 mm/khugepaged.c:2204
>
> retract_page_tables takes i_mmap_rwsem before exclusive mmap_lock, which
> is inverse to normal order. Deadlock is avoided by try-locking mmap_lock
> and skipping on failure to obtain it. Locking the VMA should use the same
> locking pattern to avoid this lock inversion.
>
> Fixes: 44a83f2083bd ("mm/khugepaged: write-lock VMA while collapsing a huge page")
> Reported-by: syzbot+8955a9646d1a48b8be92@syzkaller.appspotmail.com
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/mm.h | 39 ++++++++++++++++++++++++++++-----------
>  mm/khugepaged.c    |  5 ++++-
>  2 files changed, 32 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1b9be34a24fb..5f16263d176d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -676,18 +676,23 @@ static inline void vma_end_read(struct vm_area_struct *vma)
>         rcu_read_unlock();
>  }
>
> -static inline void vma_start_write(struct vm_area_struct *vma)
> +static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
>  {
> -       int mm_lock_seq;
> -
>         mmap_assert_write_locked(vma->vm_mm);
>
>         /*
>          * current task is holding mmap_write_lock, both vma->vm_lock_seq and
>          * mm->mm_lock_seq can't be concurrently modified.
>          */
> -       mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> -       if (vma->vm_lock_seq == mm_lock_seq)
> +       *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> +       return (vma->vm_lock_seq == *mm_lock_seq);
> +}
> +
> +static inline void vma_start_write(struct vm_area_struct *vma)
> +{
> +       int mm_lock_seq;
> +
> +       if (__is_vma_write_locked(vma, &mm_lock_seq))
>                 return;
>
>         down_write(&vma->vm_lock->lock);
> @@ -695,14 +700,26 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>         up_write(&vma->vm_lock->lock);
>  }
>
> +static inline bool vma_try_start_write(struct vm_area_struct *vma)
> +{
> +       int mm_lock_seq;
> +
> +       if (__is_vma_write_locked(vma, &mm_lock_seq))
> +               return true;
> +
> +       if (!down_write_trylock(&vma->vm_lock->lock))
> +               return false;
> +
> +       vma->vm_lock_seq = mm_lock_seq;
> +       up_write(&vma->vm_lock->lock);
> +       return true;
> +}
> +
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>  {
> -       mmap_assert_write_locked(vma->vm_mm);
> -       /*
> -        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> -        * mm->mm_lock_seq can't be concurrently modified.
> -        */
> -       VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
> +       int mm_lock_seq;
> +
> +       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
>  }

Hi Andrew,
I missed vma_try_start_write() definition for CONFIG_PER_VMA_LOCK=n
configuration. Could you please patch this with the following?:

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -740,6 +740,8 @@ static inline bool vma_start_read(struct
vm_area_struct *vma)
  { return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma) {
return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}

or should I send a separate patch?
Thanks,
Suren.


>
>  static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c64e01f03f27..408fed42c9f5 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1795,6 +1795,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>                 result = SCAN_PTE_MAPPED_HUGEPAGE;
>                 if ((cc->is_khugepaged || is_target) &&
>                     mmap_write_trylock(mm)) {
> +                       /* trylock for the same lock inversion as above */
> +                       if (!vma_try_start_write(vma))
> +                               goto unlock_next;
> +
>                         /*
>                          * Re-check whether we have an ->anon_vma, because
>                          * collapse_and_free_pmd() requires that either no
> @@ -1823,7 +1827,6 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>                                 result = SCAN_PTE_UFFD_WP;
>                                 goto unlock_next;
>                         }
> -                       vma_start_write(vma);
>                         collapse_and_free_pmd(mm, vma, addr, pmd);
>                         if (!cc->is_khugepaged && is_target)
>                                 result = set_huge_pmd(vma, addr, pmd, hpage);
> --
> 2.40.0.rc0.216.gc4246ad0f0-goog
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-03-04 23:26 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-03 21:32 [PATCH 1/1] mm/khugepaged: fix vm_lock/i_mmap_rwsem inversion in retract_page_tables Suren Baghdasaryan
2023-03-04  9:14 ` kernel test robot
2023-03-04  9:14 ` kernel test robot
2023-03-04 10:57 ` kernel test robot
2023-03-04 23:24   ` Suren Baghdasaryan
2023-03-04 23:25 ` Suren Baghdasaryan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.