All of lore.kernel.org
 help / color / mirror / Atom feed
From: Suren Baghdasaryan <surenb@google.com>
To: Jann Horn <jannh@google.com>
Cc: akpm@linux-foundation.org, viro@zeniv.linux.org.uk,
	brauner@kernel.org, shuah@kernel.org, aarcange@redhat.com,
	lokeshgidra@google.com, peterx@redhat.com, david@redhat.com,
	hughd@google.com, mhocko@suse.com, axelrasmussen@google.com,
	rppt@kernel.org, willy@infradead.org, Liam.Howlett@oracle.com,
	zhangpeng362@huawei.com, bgeffon@google.com,
	kaleshsingh@google.com, ngeoffray@google.com, jdduke@google.com,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org,
	kernel-team@android.com
Subject: Re: [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI
Date: Fri, 15 Sep 2023 16:39:18 -0700	[thread overview]
Message-ID: <CAJuCfpF3Hz9p6zQotM0aMG3WLT=FtYCu31jpFbU_VqE6BAP8NA@mail.gmail.com> (raw)
In-Reply-To: <CAG48ez3E1t=equPeL5doT+RPaPZ1BmkFi8zkZRKqhOZHbXVkzg@mail.gmail.com>

On Fri, Sep 15, 2023 at 4:34 PM Jann Horn <jannh@google.com> wrote:
>
> On Thu, Sep 14, 2023 at 5:26 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > +/*
> > + * The mmap_lock for reading is held by the caller. Just move the page
> > + * from src_pmd to dst_pmd if possible, and return true if succeeded
> > + * in moving the page.
> > + */
> > +static int remap_pages_pte(struct mm_struct *dst_mm,
> > +                          struct mm_struct *src_mm,
> > +                          pmd_t *dst_pmd,
> > +                          pmd_t *src_pmd,
> > +                          struct vm_area_struct *dst_vma,
> > +                          struct vm_area_struct *src_vma,
> > +                          unsigned long dst_addr,
> > +                          unsigned long src_addr,
> > +                          __u64 mode)
> > +{
> > +       swp_entry_t entry;
> > +       pte_t orig_src_pte, orig_dst_pte;
> > +       spinlock_t *src_ptl, *dst_ptl;
> > +       pte_t *src_pte = NULL;
> > +       pte_t *dst_pte = NULL;
> > +
> > +       struct folio *src_folio = NULL;
> > +       struct anon_vma *src_anon_vma = NULL;
> > +       struct anon_vma *dst_anon_vma;
> > +       struct mmu_notifier_range range;
> > +       int err = 0;
> > +
> > +retry:
> > +       dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
> > +
> > +       /* If an huge pmd materialized from under us fail */
> > +       if (unlikely(!dst_pte)) {
> > +               err = -EFAULT;
> > +               goto out;
> > +       }
> > +
> > +       src_pte = pte_offset_map_nolock(src_mm, src_pmd, src_addr, &src_ptl);
> > +
> > +       /*
> > +        * We held the mmap_lock for reading so MADV_DONTNEED
> > +        * can zap transparent huge pages under us, or the
> > +        * transparent huge page fault can establish new
> > +        * transparent huge pages under us.
> > +        */
> > +       if (unlikely(!src_pte)) {
> > +               err = -EFAULT;
> > +               goto out;
> > +       }
> > +
> > +       BUG_ON(pmd_none(*dst_pmd));
> > +       BUG_ON(pmd_none(*src_pmd));
> > +       BUG_ON(pmd_trans_huge(*dst_pmd));
> > +       BUG_ON(pmd_trans_huge(*src_pmd));
> > +
> > +       spin_lock(dst_ptl);
> > +       orig_dst_pte = *dst_pte;
> > +       spin_unlock(dst_ptl);
> > +       if (!pte_none(orig_dst_pte)) {
> > +               err = -EEXIST;
> > +               goto out;
> > +       }
> > +
> > +       spin_lock(src_ptl);
> > +       orig_src_pte = *src_pte;
> > +       spin_unlock(src_ptl);
> > +       if (pte_none(orig_src_pte)) {
> > +               if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES))
> > +                       err = -ENOENT;
> > +               else /* nothing to do to remap a hole */
> > +                       err = 0;
> > +               goto out;
> > +       }
> > +
> > +       if (pte_present(orig_src_pte)) {
> > +               if (!src_folio) {
> > +                       struct folio *folio;
> > +
> > +                       /*
> > +                        * Pin the page while holding the lock to be sure the
> > +                        * page isn't freed under us
> > +                        */
> > +                       spin_lock(src_ptl);
> > +                       if (!pte_same(orig_src_pte, *src_pte)) {
> > +                               spin_unlock(src_ptl);
> > +                               err = -EAGAIN;
> > +                               goto out;
> > +                       }
> > +
> > +                       folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> > +                       if (!folio || !folio_test_anon(folio) ||
> > +                           folio_estimated_sharers(folio) != 1) {
> > +                               spin_unlock(src_ptl);
> > +                               err = -EBUSY;
> > +                               goto out;
> > +                       }
> > +
> > +                       src_folio = folio;
> > +                       folio_get(src_folio);
> > +                       spin_unlock(src_ptl);
> > +
> > +                       /* try to block all concurrent rmap walks */
> > +                       if (!folio_trylock(src_folio)) {
> > +                               pte_unmap(&orig_src_pte);
> > +                               pte_unmap(&orig_dst_pte);
> > +                               src_pte = dst_pte = NULL;
> > +                               folio_lock(src_folio);
> > +                               goto retry;
> > +                       }
> > +               }
> > +
> > +               if (!src_anon_vma) {
> > +                       /*
> > +                        * folio_referenced walks the anon_vma chain
> > +                        * without the folio lock. Serialize against it with
> > +                        * the anon_vma lock, the folio lock is not enough.
> > +                        */
> > +                       src_anon_vma = folio_get_anon_vma(src_folio);
> > +                       if (!src_anon_vma) {
> > +                               /* page was unmapped from under us */
> > +                               err = -EAGAIN;
> > +                               goto out;
> > +                       }
> > +                       if (!anon_vma_trylock_write(src_anon_vma)) {
> > +                               pte_unmap(&orig_src_pte);
> > +                               pte_unmap(&orig_dst_pte);
> > +                               src_pte = dst_pte = NULL;
> > +                               anon_vma_lock_write(src_anon_vma);
> > +                               goto retry;
> > +                       }
> > +               }
> > +
> > +               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm,
> > +                                       src_addr, src_addr + PAGE_SIZE);
> > +               mmu_notifier_invalidate_range_start_nonblock(&range);
>
> I'm pretty sure you're not allowed to use
> mmu_notifier_invalidate_range_start_nonblock(). Use
> mmu_notifier_invalidate_range_start() at a point where it's fine to
> block; perhaps all the way up in remap_pages() around the whole loop
> or something like that. mmu_notifier_invalidate_range_start_nonblock()
> is special magic sauce for OOM reaping (with deceptively inviting
> naming and no documentation that explains that this is evil hazmat
> code), and if you chase down what various users of MMU notifiers do
> when you just hand them a random notification where
> mmu_notifier_range_blockable() is false, you end up in fun paths like
> in KVM's kvm_mmu_notifier_invalidate_range_start(), which calls
> gfn_to_pfn_cache_invalidate_start(), which does this:
>
>     /*
>      * If the OOM reaper is active, then all vCPUs should have
>      * been stopped already, so perform the request without
>      * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd.
>      */
>     if (!may_block)
>        req &= ~KVM_REQUEST_WAIT;
>
>     called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
>
>     WARN_ON_ONCE(called && !may_block);
>
> Which means if you hit this codepath from a place like userfaultfd
> where the process is probably still running (and not being OOM reaped
> in a context where it's guaranteed to have been killed and stopped),
> you could probably get KVM into a situation where it needs to wait for
> vCPUs to acknowledge that they've processed a message before it's safe
> to continue, but it can't wait because we're in nonsleepable context,
> and then it just continues without waiting and presumably the KVM TLB
> gets out of sync or something?

Yeah, I was sceptical about this function too but it was very tempting :)
Thanks for the warning, I'll move
mmu_notifier_invalidate_range_start() to before we map the ptes
because that's where we start the RCU read section.

>
>
>
> > +
> > +               double_pt_lock(dst_ptl, src_ptl);
> > +
> > +               if (!pte_same(*src_pte, orig_src_pte) ||
> > +                   !pte_same(*dst_pte, orig_dst_pte) ||
> > +                   folio_estimated_sharers(src_folio) != 1) {
> > +                       double_pt_unlock(dst_ptl, src_ptl);
> > +                       err = -EAGAIN;
> > +                       goto out;
> > +               }
> > +
> > +               BUG_ON(!folio_test_anon(src_folio));
> > +               /* the PT lock is enough to keep the page pinned now */
> > +               folio_put(src_folio);
> > +
> > +               dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;
> > +               WRITE_ONCE(src_folio->mapping,
> > +                          (struct address_space *) dst_anon_vma);
> > +               WRITE_ONCE(src_folio->index, linear_page_index(dst_vma,
> > +                                                             dst_addr));
> > +
> > +               if (!pte_same(ptep_clear_flush(src_vma, src_addr, src_pte),
> > +                             orig_src_pte))
> > +                       BUG_ON(1);
> > +
> > +               orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
> > +               orig_dst_pte = maybe_mkwrite(pte_mkdirty(orig_dst_pte),
> > +                                            dst_vma);
> > +
> > +               set_pte_at(dst_mm, dst_addr, dst_pte, orig_dst_pte);
> > +
> > +               if (dst_mm != src_mm) {
> > +                       inc_mm_counter(dst_mm, MM_ANONPAGES);
> > +                       dec_mm_counter(src_mm, MM_ANONPAGES);
> > +               }
> > +
> > +               double_pt_unlock(dst_ptl, src_ptl);
> > +
> > +               anon_vma_unlock_write(src_anon_vma);
> > +               mmu_notifier_invalidate_range_end(&range);
> > +               put_anon_vma(src_anon_vma);
> > +               src_anon_vma = NULL;
> > +
> > +               /* unblock rmap walks */
> > +               folio_unlock(src_folio);
> > +               src_folio = NULL;
> > +
> > +       } else {
> > +               struct swap_info_struct *si;
> > +               int swap_count;
> > +
> > +               entry = pte_to_swp_entry(orig_src_pte);
> > +               if (non_swap_entry(entry)) {
> > +                       if (is_migration_entry(entry)) {
> > +                               pte_unmap(&orig_src_pte);
> > +                               pte_unmap(&orig_dst_pte);
> > +                               src_pte = dst_pte = NULL;
> > +                               migration_entry_wait(src_mm, src_pmd,
> > +                                                    src_addr);
> > +                               err = -EAGAIN;
> > +                       } else
> > +                               err = -EFAULT;
> > +                       goto out;
> > +               }
> > +
> > +               /*
> > +                * COUNT_CONTINUE to be returned is fine here, no need
> > +                * of follow all swap continuation to check against
> > +                * number 1.
> > +                */
> > +               si = get_swap_device(entry);
> > +               if (!si) {
> > +                       err = -EBUSY;
> > +                       goto out;
> > +               }
> > +
> > +               swap_count = swap_swapcount(si, entry);
> > +               put_swap_device(si);
> > +               if (swap_count != 1) {
> > +                       err = -EBUSY;
> > +                       goto out;
> > +               }
> > +
> > +               double_pt_lock(dst_ptl, src_ptl);
> > +
> > +               if (!pte_same(*src_pte, orig_src_pte) ||
> > +                   !pte_same(*dst_pte, orig_dst_pte) ||
> > +                   swp_swapcount(entry) != 1) {
> > +                       double_pt_unlock(dst_ptl, src_ptl);
> > +                       err = -EAGAIN;
> > +                       goto out;
> > +               }
> > +
> > +               if (pte_val(ptep_get_and_clear(src_mm, src_addr, src_pte)) !=
> > +                   pte_val(orig_src_pte))
> > +                       BUG_ON(1);
> > +               set_pte_at(dst_mm, dst_addr, dst_pte, orig_src_pte);
> > +
> > +               if (dst_mm != src_mm) {
> > +                       inc_mm_counter(dst_mm, MM_ANONPAGES);
> > +                       dec_mm_counter(src_mm, MM_ANONPAGES);
> > +               }
> > +
> > +               double_pt_unlock(dst_ptl, src_ptl);
> > +       }
> > +
> > +out:
> > +       if (src_anon_vma) {
> > +               anon_vma_unlock_write(src_anon_vma);
> > +               put_anon_vma(src_anon_vma);
> > +       }
> > +       if (src_folio) {
> > +               folio_unlock(src_folio);
> > +               folio_put(src_folio);
> > +       }
> > +       if (dst_pte)
> > +               pte_unmap(dst_pte);
> > +       if (src_pte)
> > +               pte_unmap(src_pte);
> > +
> > +       return err;
> > +}

  reply	other threads:[~2023-09-15 23:40 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-14 15:26 [PATCH 0/3] userfaultfd remap option Suren Baghdasaryan
2023-09-14 15:26 ` [PATCH 1/3] userfaultfd: UFFDIO_REMAP: rmap preparation Suren Baghdasaryan
2023-09-14 17:56   ` Matthew Wilcox
2023-09-14 18:34     ` Suren Baghdasaryan
2023-09-14 15:26 ` [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI Suren Baghdasaryan
2023-09-14 18:11   ` Matthew Wilcox
2023-09-14 18:43     ` David Hildenbrand
2023-09-14 18:45       ` David Hildenbrand
2023-09-21 18:04         ` Suren Baghdasaryan
2023-09-21 18:17           ` David Hildenbrand
2023-09-22  1:57             ` Suren Baghdasaryan
2023-09-14 18:47   ` David Hildenbrand
2023-09-14 18:54     ` Suren Baghdasaryan
2023-09-14 19:28   ` Jann Horn
2023-09-14 20:57     ` Suren Baghdasaryan
2023-09-19 23:08     ` Suren Baghdasaryan
2023-09-19 23:40       ` Suren Baghdasaryan
2023-09-19 23:50       ` Jann Horn
2023-09-20  1:49         ` Suren Baghdasaryan
2023-09-20 16:11           ` Jann Horn
2023-09-21 16:59     ` Jann Horn
2023-09-14 21:57   ` Nadav Amit
2023-09-15  3:28     ` Suren Baghdasaryan
2023-09-15  4:03       ` Nadav Amit
2023-09-15  4:15         ` Suren Baghdasaryan
2023-09-15 23:33   ` Jann Horn
2023-09-15 23:39     ` Suren Baghdasaryan [this message]
2023-09-14 15:26 ` [PATCH 3/3] selftests/mm: add UFFDIO_REMAP ioctl test Suren Baghdasaryan
2023-09-14 16:00 [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAJuCfpF3Hz9p6zQotM0aMG3WLT=FtYCu31jpFbU_VqE6BAP8NA@mail.gmail.com' \
    --to=surenb@google.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=bgeffon@google.com \
    --cc=brauner@kernel.org \
    --cc=david@redhat.com \
    --cc=hughd@google.com \
    --cc=jannh@google.com \
    --cc=jdduke@google.com \
    --cc=kaleshsingh@google.com \
    --cc=kernel-team@android.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lokeshgidra@google.com \
    --cc=mhocko@suse.com \
    --cc=ngeoffray@google.com \
    --cc=peterx@redhat.com \
    --cc=rppt@kernel.org \
    --cc=shuah@kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=zhangpeng362@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.