All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Nesterov <oleg@redhat.com>
To: Peter Xu <peterx@redhat.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Michal Hocko <mhocko@suse.com>,
	Kirill Shutemov <kirill@shutemov.name>,
	Jann Horn <jannh@google.com>, Kirill Tkhai <ktkhai@virtuozzo.com>,
	Hugh Dickins <hughd@google.com>,
	Leon Romanovsky <leonro@nvidia.com>, Jan Kara <jack@suse.cz>,
	John Hubbard <jhubbard@nvidia.com>,
	Christoph Hellwig <hch@lst.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Jason Gunthorpe <jgg@ziepe.ca>,
	Andrea Arcangeli <aarcange@redhat.com>
Subject: Re: [PATCH 4/5] mm: Do early cow for pinned pages during fork() for ptes
Date: Tue, 22 Sep 2020 14:40:14 +0200	[thread overview]
Message-ID: <20200922124013.GD11679@redhat.com> (raw)
In-Reply-To: <20200922114839.GC11679@redhat.com>

On 09/22, Oleg Nesterov wrote:
>
> On 09/21, Peter Xu wrote:
> >
> > @@ -859,6 +989,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> >  			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
> >  				break;
> >  		}
> > +
> > +		if (unlikely(data.cow_new_page)) {
> > +			/*
> > +			 * If cow_new_page set, we must be at the 2nd round of
> > +			 * a previous COPY_MM_BREAK_COW.  Try to arm the new
> > +			 * page now.  Note that in all cases page_break_cow()
> > +			 * will properly release the objects in copy_mm_data.
> > +			 */
> > +			WARN_ON_ONCE(copy_ret != COPY_MM_BREAK_COW);
> > +			if (pte_install_copied_page(dst_mm, new, src_pte,
> > +						    dst_pte, addr, rss,
> > +						    &data)) {
> > +				/* We installed the pte successfully; move on */
> > +				progress++;
> > +				continue;
>
> I'm afraid I misread this patch too ;)
>
> But it seems to me in this case the main loop can really "leak"
> COPY_MM_BREAK_COW. Suppose the next 31 pte's are pte_none() and
> need_resched() is true.
>
> No?

If yes, perhaps we can simplify the copy_ret/cow_new_page logic and make
it more explicit?

Something like below, on top of this patch...

Oleg.


--- x/mm/memory.c
+++ x/mm/memory.c
@@ -704,17 +704,6 @@
 	};
 };
 
-static inline void page_release_cow(struct copy_mm_data *data)
-{
-	/* The old page should only be released in page_duplicate() */
-	WARN_ON_ONCE(data->cow_old_page);
-
-	if (data->cow_new_page) {
-		put_page(data->cow_new_page);
-		data->cow_new_page = NULL;
-	}
-}
-
 /*
  * Duplicate the page for this PTE.  Returns zero if page copied (so we need to
  * retry on the same PTE again to arm the copied page very soon), or negative
@@ -925,7 +914,7 @@
 
 	if (!pte_same(*src_pte, data->cow_oldpte)) {
 		/* PTE has changed under us.  Release the page and retry */
-		page_release_cow(data);
+		put_page(data->cow_new_page);
 		return false;
 	}
 
@@ -936,12 +925,6 @@
 	set_pte_at(dst_mm, addr, dst_pte, entry);
 	rss[mm_counter(new_page)]++;
 
-	/*
-	 * Manually clear the new page pointer since we've moved ownership to
-	 * the newly armed PTE.
-	 */
-	data->cow_new_page = NULL;
-
 	return true;
 }
 
@@ -958,16 +941,12 @@
 	struct copy_mm_data data;
 
 again:
-	/* We don't reset this for COPY_MM_BREAK_COW */
-	memset(&data, 0, sizeof(data));
-
-again_break_cow:
 	init_rss_vec(rss);
 
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte) {
-		/* Guarantee that the new page is released if there is */
-		page_release_cow(&data);
+		if (unlikely(copy_ret == COPY_MM_BREAK_COW))
+			put_page(data.cow_new_page);
 		return -ENOMEM;
 	}
 	src_pte = pte_offset_map(src_pmd, addr);
@@ -978,6 +957,22 @@
 	arch_enter_lazy_mmu_mode();
 
 	progress = 0;
+	if (unlikely(copy_ret == COPY_MM_BREAK_COW)) {
+		/*
+		 * Note that in all cases pte_install_copied_page()
+		 * will properly release the objects in copy_mm_data.
+		 */
+		copy_ret = COPY_MM_DONE;
+		if (pte_install_copied_page(dst_mm, new, src_pte,
+					    dst_pte, addr, rss,
+					    &data)) {
+			/* We installed the pte successfully; move on */
+			progress++;
+			goto next;
+		}
+		/* PTE changed.  Retry this pte (falls through) */
+	}
+
 	do {
 		/*
 		 * We are holding two locks at this point - either of them
@@ -990,24 +985,6 @@
 				break;
 		}
 
-		if (unlikely(data.cow_new_page)) {
-			/*
-			 * If cow_new_page set, we must be at the 2nd round of
-			 * a previous COPY_MM_BREAK_COW.  Try to arm the new
-			 * page now.  Note that in all cases page_break_cow()
-			 * will properly release the objects in copy_mm_data.
-			 */
-			WARN_ON_ONCE(copy_ret != COPY_MM_BREAK_COW);
-			if (pte_install_copied_page(dst_mm, new, src_pte,
-						    dst_pte, addr, rss,
-						    &data)) {
-				/* We installed the pte successfully; move on */
-				progress++;
-				continue;
-			}
-			/* PTE changed.  Retry this pte (falls through) */
-		}
-
 		if (pte_none(*src_pte)) {
 			progress++;
 			continue;
@@ -1017,6 +994,7 @@
 		if (copy_ret != COPY_MM_DONE)
 			break;
 		progress += 8;
+next:
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
@@ -1030,13 +1008,14 @@
 	case COPY_MM_SWAP_CONT:
 		if (add_swap_count_continuation(data.entry, GFP_KERNEL) < 0)
 			return -ENOMEM;
-		break;
+		copy_ret = COPY_MM_DONE;
+		goto again;
 	case COPY_MM_BREAK_COW:
 		/* Do accounting onto parent mm directly */
 		ret = page_duplicate(src_mm, vma, addr, &data);
 		if (ret)
 			return ret;
-		goto again_break_cow;
+		goto again;
 	case COPY_MM_DONE:
 		/* This means we're all good. */
 		break;


  reply	other threads:[~2020-09-22 12:40 UTC|newest]

Thread overview: 133+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-21 21:17 [PATCH 0/5] mm: Break COW for pinned pages during fork() Peter Xu
2020-09-21 21:17 ` [PATCH 1/5] mm: Introduce mm_struct.has_pinned Peter Xu
2020-09-21 21:43   ` Jann Horn
2020-09-21 21:43     ` Jann Horn
2020-09-21 22:30     ` Peter Xu
2020-09-21 22:47       ` Jann Horn
2020-09-21 22:47         ` Jann Horn
2020-09-22 11:54         ` Jason Gunthorpe
2020-09-22 14:28           ` Peter Xu
2020-09-22 15:56             ` Jason Gunthorpe
2020-09-22 16:25               ` Linus Torvalds
2020-09-22 16:25                 ` Linus Torvalds
2020-09-21 23:53   ` John Hubbard
2020-09-22  0:01     ` John Hubbard
2020-09-22 15:17     ` Peter Xu
2020-09-22 16:10       ` Jason Gunthorpe
2020-09-22 17:54         ` Peter Xu
2020-09-22 19:11           ` Jason Gunthorpe
2020-09-23  0:27             ` Peter Xu
2020-09-23 13:10               ` Peter Xu
2020-09-23 14:20                 ` Jan Kara
2020-09-23 17:12                   ` Jason Gunthorpe
2020-09-24  7:44                     ` Jan Kara
2020-09-24 14:02                       ` Jason Gunthorpe
2020-09-24 14:45                         ` Jan Kara
2020-09-23 17:07               ` Jason Gunthorpe
2020-09-24 14:35                 ` Peter Xu
2020-09-24 16:51                   ` Jason Gunthorpe
2020-09-24 17:55                     ` Peter Xu
2020-09-24 18:15                       ` Jason Gunthorpe
2020-09-24 18:34                         ` Peter Xu
2020-09-24 18:39                           ` Jason Gunthorpe
2020-09-24 21:30                             ` Peter Xu
2020-09-25 19:56                               ` Linus Torvalds
2020-09-25 19:56                                 ` Linus Torvalds
2020-09-25 21:06                                 ` Linus Torvalds
2020-09-25 21:06                                   ` Linus Torvalds
2020-09-26  0:41                                   ` Jason Gunthorpe
2020-09-26  1:15                                     ` Linus Torvalds
2020-09-26  1:15                                       ` Linus Torvalds
2020-09-26 22:28                                       ` Linus Torvalds
2020-09-26 22:28                                         ` Linus Torvalds
2020-09-27  6:23                                         ` Leon Romanovsky
2020-09-27 18:16                                           ` Linus Torvalds
2020-09-27 18:16                                             ` Linus Torvalds
2020-09-27 18:45                                             ` Linus Torvalds
2020-09-27 18:45                                               ` Linus Torvalds
2020-09-28 12:49                                               ` Jason Gunthorpe
2020-09-28 16:17                                                 ` Linus Torvalds
2020-09-28 16:17                                                   ` Linus Torvalds
2020-09-28 17:22                                                   ` Peter Xu
2020-09-28 17:54                                                     ` Linus Torvalds
2020-09-28 17:54                                                       ` Linus Torvalds
2020-09-28 18:39                                                       ` Jason Gunthorpe
2020-09-28 19:29                                                         ` Linus Torvalds
2020-09-28 19:29                                                           ` Linus Torvalds
2020-09-28 23:57                                                           ` Jason Gunthorpe
2020-09-29  0:18                                                             ` John Hubbard
2020-09-28 19:36                                                         ` Linus Torvalds
2020-09-28 19:36                                                           ` Linus Torvalds
2020-09-28 19:50                                                           ` Linus Torvalds
2020-09-28 19:50                                                             ` Linus Torvalds
2020-09-28 22:51                                                             ` Jason Gunthorpe
2020-09-29  0:30                                                               ` Peter Xu
2020-10-08  5:49                                                             ` Leon Romanovsky
2020-09-28 17:13                                             ` Peter Xu
2020-09-25 21:13                                 ` Peter Xu
2020-09-25 22:08                                   ` Linus Torvalds
2020-09-25 22:08                                     ` Linus Torvalds
2020-09-22 18:02       ` John Hubbard
2020-09-22 18:15         ` Peter Xu
2020-09-22 19:11       ` John Hubbard
2020-09-27  0:41   ` [mm] 698ac7610f: will-it-scale.per_thread_ops 8.2% improvement kernel test robot
2020-09-27  0:41     ` kernel test robot
2020-09-21 21:17 ` [PATCH 2/5] mm/fork: Pass new vma pointer into copy_page_range() Peter Xu
2020-09-21 21:17 ` [PATCH 3/5] mm: Rework return value for copy_one_pte() Peter Xu
2020-09-22  7:11   ` John Hubbard
2020-09-22 15:29     ` Peter Xu
2020-09-22 10:08   ` Oleg Nesterov
2020-09-22 10:18     ` Oleg Nesterov
2020-09-22 15:36       ` Peter Xu
2020-09-22 15:48         ` Oleg Nesterov
2020-09-22 16:03           ` Peter Xu
2020-09-22 16:53             ` Oleg Nesterov
2020-09-22 18:13               ` Peter Xu
2020-09-22 18:23                 ` Oleg Nesterov
2020-09-22 18:49                   ` Peter Xu
2020-09-23  6:52                     ` Oleg Nesterov
2020-09-23 17:16   ` Linus Torvalds
2020-09-23 17:16     ` Linus Torvalds
2020-09-23 21:24     ` Linus Torvalds
2020-09-23 21:24       ` Linus Torvalds
2020-09-21 21:20 ` [PATCH 4/5] mm: Do early cow for pinned pages during fork() for ptes Peter Xu
2020-09-21 21:55   ` Jann Horn
2020-09-21 21:55     ` Jann Horn
2020-09-21 22:18     ` John Hubbard
2020-09-21 22:27       ` Jann Horn
2020-09-21 22:27         ` Jann Horn
2020-09-22  0:08         ` John Hubbard
2020-09-21 22:27     ` Peter Xu
2020-09-22 11:48   ` Oleg Nesterov
2020-09-22 12:40     ` Oleg Nesterov [this message]
2020-09-22 15:58       ` Peter Xu
2020-09-22 16:52         ` Oleg Nesterov
2020-09-22 18:34           ` Peter Xu
2020-09-22 18:44             ` Oleg Nesterov
2020-09-23  1:03               ` Peter Xu
2020-09-23 20:25                 ` Linus Torvalds
2020-09-23 20:25                   ` Linus Torvalds
2020-09-24 15:08                   ` Peter Xu
2020-09-24 11:48   ` Kirill Tkhai
2020-09-24 15:16     ` Peter Xu
2020-09-21 21:20 ` [PATCH 5/5] mm/thp: Split huge pmds/puds if they're pinned when fork() Peter Xu
2020-09-22  6:41   ` John Hubbard
2020-09-22 10:33     ` Jan Kara
2020-09-22 20:01       ` John Hubbard
2020-09-23  9:22         ` Jan Kara
2020-09-23 13:50           ` Peter Xu
2020-09-23 14:01             ` Jan Kara
2020-09-23 15:44               ` Peter Xu
2020-09-23 20:19                 ` John Hubbard
2020-09-24 18:49                   ` Peter Xu
2020-09-23 16:06     ` Peter Xu
2020-09-22 12:05   ` Jason Gunthorpe
2020-09-23 15:24     ` Peter Xu
2020-09-23 16:07       ` Yang Shi
2020-09-23 16:07         ` Yang Shi
2020-09-24 15:47         ` Peter Xu
2020-09-24 17:29           ` Yang Shi
2020-09-24 17:29             ` Yang Shi
2020-09-23 17:17       ` Jason Gunthorpe
2020-09-23 10:21 ` [PATCH 0/5] mm: Break COW for pinned pages during fork() Leon Romanovsky
2020-09-23 15:37   ` Peter Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200922124013.GD11679@redhat.com \
    --to=oleg@redhat.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=hch@lst.de \
    --cc=hughd@google.com \
    --cc=jack@suse.cz \
    --cc=jannh@google.com \
    --cc=jgg@ziepe.ca \
    --cc=jhubbard@nvidia.com \
    --cc=kirill@shutemov.name \
    --cc=ktkhai@virtuozzo.com \
    --cc=leonro@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.com \
    --cc=peterx@redhat.com \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.