From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: Peter Xu <peterx@redhat.com>
Cc: Nathan Chancellor <nathan@kernel.org>,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
	kaleshsingh@google.com, npiggin@gmail.com,
	joel@joelfernandes.org,
	Christophe Leroy <christophe.leroy@csgroup.eu>
Subject: Re: [PATCH v5 3/9] mm/mremap: Use pmd/pud_poplulate to update page table entries
Date: Thu, 20 May 2021 19:07:57 +0530	[thread overview]
Message-ID: <87o8d5le4q.fsf@linux.ibm.com> (raw)
In-Reply-To: <e6525655-2e51-a0c0-fe54-596cfae9ce21@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> On 5/20/21 6:16 PM, Peter Xu wrote:
>> On Thu, May 20, 2021 at 01:56:54PM +0530, Aneesh Kumar K.V wrote:
>>>> This seems to work at least for my userfaultfd test on shmem; however, I
>>>> don't fully understand the commit message [1] on: How do we guarantee we're
>>>> not moving a thp pte?
>>>>
>>>
>>> move_page_tables() checks for pmd_trans_huge() and ends up calling
>>> move_huge_pmd() if it is a THP entry (sketched after the diff below).
>> 
>> Sorry to be unclear: what about a huge pud thp?
>> 
>
> I am still checking. Looking at the code before commit
> c49dd340180260c6239e453263a9a244da9a7c85, I don't see the kernel
> handling huge pud thp. I haven't studied huge pud thp enough to
> understand whether c49dd340180260c6239e453263a9a244da9a7c85 intended
> to add that support.
>
> We could add a move_huge_pud() like we do for huge pmd thp, but I am not
> sure whether we already handle those VMAs earlier and restrict mremap on them.

Something like this? (Not even compile tested.) I am still not sure
whether this is really needed, or whether we handle DAX VMAs in some other way.

diff --git a/mm/mremap.c b/mm/mremap.c
index 47c255b60150..037a7bd311f1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -324,10 +324,51 @@ static inline bool move_normal_pud(struct vm_area_struct *vma,
 }
 #endif
 
+
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t pud;
+
+	/*
+	 * The destination pud shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON_ONCE(!pud_none(*new_pud)))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	new_ptl = pud_lockptr(mm, new_pud);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pud */
+	pud = *old_pud;
+	pud_clear(old_pud);
+
+	VM_BUG_ON(!pud_none(*new_pud));
+
+	/* Set the new pud */
+	set_pud_at(mm, new_addr, new_pud, pud);
+	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	return true;
+}
+
 enum pgt_entry {
 	NORMAL_PMD,
 	HPAGE_PMD,
 	NORMAL_PUD,
+	HPAGE_PUD,
 };
 
 /*
@@ -347,6 +388,7 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
 		mask = PMD_MASK;
 		size = PMD_SIZE;
 		break;
+	case HPAGE_PUD:
 	case NORMAL_PUD:
 		mask = PUD_MASK;
 		size = PUD_SIZE;
@@ -395,6 +437,12 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
 				      new_entry);
 		break;
+	case HPAGE_PUD:
+		moved = IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+			move_huge_pud(vma, old_addr, new_addr, old_entry,
+				      new_entry);
+		break;
+
 	default:
 		WARN_ON_ONCE(1);
 		break;
@@ -429,15 +477,21 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		 * PUD level if possible.
 		 */
 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
-		if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-			pud_t *old_pud, *new_pud;
 
-			old_pud = get_old_pud(vma->vm_mm, old_addr);
-			if (!old_pud)
+		old_pud = get_old_pud(vma->vm_mm, old_addr);
+		if (!old_pud)
+			continue;
+		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+		if (!new_pud)
+			break;
+		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+			if (extent == HPAGE_PUD_SIZE) {
+				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
+					       old_pud, new_pud, need_rmap_locks);
+				/* We ignore and continue on error? */
 				continue;
-			new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
-			if (!new_pud)
-				break;
+			}
+		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
 					   old_pud, new_pud, need_rmap_locks))
 				continue;
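
For reference, the PMD-level handling in move_page_tables() that the
above mirrors looks roughly like the sketch below. This is paraphrased
from the v5.12-era mm/mremap.c and simplified; treat it as an
illustration of the copy-loop body, not verbatim upstream code:

	/*
	 * Sketch of the existing PMD-level path: a huge pmd is either
	 * moved whole via move_huge_pmd(), or split so the normal
	 * PTE-copy path can handle it. old_pmd/new_pmd are looked up
	 * earlier in the loop (not shown).
	 */
	extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
	if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
	    pmd_devmap(*old_pmd)) {
		if (extent == HPAGE_PMD_SIZE &&
		    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
				   old_pmd, new_pmd, need_rmap_locks))
			continue;
		split_huge_pmd(vma, old_pmd, old_addr);
		if (pmd_trans_unstable(old_pmd))
			continue;
	} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && extent == PMD_SIZE) {
		if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
				   old_pmd, new_pmd, need_rmap_locks))
			continue;
	}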


>
> Are huge pud thp only allowed with DAX VMAs?
>
>
> -aneesh
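
On the DAX question above: as far as I can tell from the v5.12-era
tree, anonymous memory never gets PUD-sized THP; the only code that
establishes a huge pud entry appears to be vmf_insert_pfn_pud() in
mm/huge_memory.c, reached from the device-dax fault handler on
architectures that select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD. A rough
sketch of that call chain (from memory, so only indicative):

	/*
	 * Indicative call chain for huge pud creation (v5.12-era):
	 * only device-dax faults install huge pud entries.
	 *
	 * drivers/dax/device.c:
	 *   dev_dax_huge_fault()
	 *     -> __dev_dax_pud_fault()                  // PE_SIZE_PUD case
	 *        -> vmf_insert_pfn_pud(vmf, pfn, write) // mm/huge_memory.c
	 */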


Thread overview: 106+ messages

2021-04-22  5:43 [PATCH v5 0/9] Speedup mremap on ppc64 Aneesh Kumar K.V
2021-04-22  5:43 ` [PATCH v5 1/9] selftest/mremap_test: Update the test to handle pagesize other than 4K Aneesh Kumar K.V
2021-04-22  5:43 ` [PATCH v5 2/9] selftest/mremap_test: Avoid crash with static build Aneesh Kumar K.V
2021-04-22  5:43 ` [PATCH v5 3/9] mm/mremap: Use pmd/pud_poplulate to update page table entries Aneesh Kumar K.V
2021-05-18 20:04   ` Nathan Chancellor
2021-05-19  4:46     ` Aneesh Kumar K.V
2021-05-19 18:02       ` Nathan Chancellor
2021-05-20  2:18       ` Peter Xu
2021-05-20  8:26         ` Aneesh Kumar K.V
2021-05-20 12:46           ` Peter Xu
2021-05-20 13:23             ` Aneesh Kumar K.V
2021-05-20 13:37               ` Aneesh Kumar K.V [this message]
2021-05-20 14:57                 ` Peter Xu
2021-05-20 19:06                   ` Zi Yan
2021-05-20 20:01                     ` Peter Xu
2021-05-20 20:25                       ` Kalesh Singh
2021-04-22  5:43 ` [PATCH v5 4/9] powerpc/mm/book3s64: Fix possible build error Aneesh Kumar K.V
2021-04-22  5:43 ` [PATCH v5 5/9] powerpc/mm/book3s64: Update tlb flush routines to take a page walk cache flush argument Aneesh Kumar K.V
2021-05-15 16:35   ` Guenter Roeck
2021-05-15 20:41     ` Andrew Morton
2021-05-15 23:05       ` Guenter Roeck
2021-05-17  8:40     ` Aneesh Kumar K.V
2021-05-17 13:38       ` Guenter Roeck
2021-05-17 13:55         ` Aneesh Kumar K.V
2021-05-17 14:18           ` Guenter Roeck
2021-05-19  0:26             ` Michael Ellerman
2021-05-19  0:45               ` Segher Boessenkool
2021-05-19 12:03                 ` Segher Boessenkool
2021-05-19 13:37                   ` Guenter Roeck
2021-05-19 14:20                     ` Segher Boessenkool
2021-05-19 15:28                       ` Guenter Roeck
2021-05-20  7:37                   ` Michael Ellerman
2021-05-20 12:17                     ` Segher Boessenkool
2021-05-19  1:08               ` Guenter Roeck
2021-05-20 11:38                 ` Michael Ellerman
2021-05-20 11:56                   ` Guenter Roeck
2021-04-22  5:43 ` [PATCH v5 6/9] mm/mremap: Use range flush that does TLB and page walk cache flush Aneesh Kumar K.V
2021-04-22  5:43 ` [PATCH v5 7/9] mm/mremap: Move TLB flush outside page table lock Aneesh Kumar K.V
2021-05-20 15:26   ` Aneesh Kumar K.V
2021-05-20 16:57     ` Aneesh Kumar K.V
2021-05-21  2:40       ` Linus Torvalds
2021-05-21  3:03         ` Aneesh Kumar K.V
2021-05-21  3:28           ` Aneesh Kumar K.V
2021-05-21  6:13           ` Linus Torvalds
2021-05-21 12:50             ` Aneesh Kumar K.V
2021-05-21 13:03               ` Aneesh Kumar K.V
2021-05-21 16:03                 ` Linus Torvalds
2021-05-21 16:29                   ` Aneesh Kumar K.V
2021-05-24 14:24                   ` Aneesh Kumar K.V
2021-05-21 15:24               ` Liam Howlett
2021-05-21 16:02                 ` Aneesh Kumar K.V
2021-05-21 16:05                   ` Linus Torvalds
2021-04-22  5:43 ` [PATCH v5 8/9] mm/mremap: Allow arch runtime override Aneesh Kumar K.V
2021-04-22  5:43 ` [PATCH v5 9/9] powerpc/mm: Enable move pmd/pud Aneesh Kumar K.V
2021-05-11 22:19   ` Andrew Morton