Re: [PATCH 41/41] mm: replace rw_semaphore with atomic_t in vma_lock

From: Suren Baghdasaryan <surenb@google.com>
To: Hillf Danton <hdanton@sina.com>
Cc: vbabka@suse.cz, hannes@cmpxchg.org, mgorman@techsingularity.net,
	peterz@infradead.org, hughd@google.com,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: Re: [PATCH 41/41] mm: replace rw_semaphore with atomic_t in vma_lock
Date: Mon, 16 Jan 2023 15:11:00 -0800	[thread overview]
Message-ID: <CAJuCfpEx4tLNaVy_VpUKbrTkHJ7uPg5pPostNHD++6hD1dfzrQ@mail.gmail.com> (raw)
In-Reply-To: <CAJuCfpHoHcZxQZgt4Ki1kiBu9O+sANZQambOa+1gSQu2brPoyA@mail.gmail.com>

On Mon, Jan 16, 2023 at 3:08 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jan 16, 2023 at 6:07 AM Hillf Danton <hdanton@sina.com> wrote:
> >
> > On Mon, 9 Jan 2023 12:53:36 -0800 Suren Baghdasaryan <surenb@google.com>
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -627,12 +627,16 @@ static inline void vma_write_lock(struct vm_area_struct *vma)
> > >        * mm->mm_lock_seq can't be concurrently modified.
> > >        */
> > >       mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> > > -     if (vma->vm_lock_seq == mm_lock_seq)
> > > +     if (vma->vm_lock->lock_seq == mm_lock_seq)
> > >               return;
> >
> >         lock acquire for write to info lockdep.
>
> Thanks for the review Hillf!
>
> Good idea. Will add in the next version.
>
> > >
> > > -     down_write(&vma->vm_lock->lock);
> > > -     vma->vm_lock_seq = mm_lock_seq;
> > > -     up_write(&vma->vm_lock->lock);
> > > +     if (atomic_cmpxchg(&vma->vm_lock->count, 0, -1))
> > > +             wait_event(vma->vm_mm->vma_writer_wait,
> > > +                        atomic_cmpxchg(&vma->vm_lock->count, 0, -1) == 0);
> > > +     vma->vm_lock->lock_seq = mm_lock_seq;
> > > +     /* Write barrier to ensure lock_seq change is visible before count */
> > > +     smp_wmb();
> > > +     atomic_set(&vma->vm_lock->count, 0);
> > >  }
> > >
> > >  /*
> > > @@ -643,20 +647,28 @@ static inline void vma_write_lock(struct vm_area_struct *vma)
> > >  static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > >  {
> > >       /* Check before locking. A race might cause false locked result. */
> > > -     if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > > +     if (vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > >               return false;
> >
> > Add mb to pair with the above wmb like
>
> The wmb above is to ensure the ordering between updates of lock_seq
> and vm_lock->count (lock_seq is updated first and vm_lock->count only
> after that). The first access to vm_lock->count in this function is
> atomic_inc_unless_negative() and it's an atomic RMW operation with a
> return value. According to documentation such functions are fully
> ordered, therefore I think we already have an implicit full memory
> barrier between reads of lock_seq and vm_lock->count here. Am I wrong?
>
> >
> >         if (READ_ONCE(vma->vm_lock->lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
> >                 smp_acquire__after_ctrl_dep();
> >                 return false;
> >         }
> > >
> > > -     if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > > +     if (unlikely(!atomic_inc_unless_negative(&vma->vm_lock->count)))
> > >               return false;
> > >
> > > +     /* If atomic_t overflows, restore and fail to lock. */
> > > +     if (unlikely(atomic_read(&vma->vm_lock->count) < 0)) {
> > > +             if (atomic_dec_and_test(&vma->vm_lock->count))
> > > +                     wake_up(&vma->vm_mm->vma_writer_wait);
> > > +             return false;
> > > +     }
> > > +
> > >       /*
> > >        * Overflow might produce false locked result.
> > >        * False unlocked result is impossible because we modify and check
> > >        * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
> > >        * modification invalidates all existing locks.
> > >        */
> > > -     if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
> > > -             up_read(&vma->vm_lock->lock);
> > > +     if (unlikely(vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
> > > +             if (atomic_dec_and_test(&vma->vm_lock->count))
> > > +                     wake_up(&vma->vm_mm->vma_writer_wait);
> > >               return false;
> > >       }
> >
> > Simpler way to detect write lock owner and count overflow like
> >
> >         int count = atomic_read(&vma->vm_lock->count);
> >         for (;;) {
> >                 int new = count + 1;
> >
> >                 if (count < 0 || new < 0)
> >                         return false;
> >
> >                 new = atomic_cmpxchg(&vma->vm_lock->count, count, new);
> >                 if (new == count)
> >                         break;
> >                 count = new;
> >                 cpu_relax();
> >         }
> >
> >         (wake up waiting readers after taking the lock;
> >         but the write lock owner before this read trylock should be
> >         responsible for waking waiters up.)
> >
> >         lock acquire for read.
>
> This schema might cause readers to wait, which is not an exact
> replacement for down_read_trylock(). The requirement to wake up
> waiting readers also complicates things and since we can always fall
> back to mmap_lock, that complication is unnecessary IMHO. I could use
> part of your suggestion like this:
>
>                  int new = count + 1;
>
>                  if (count < 0 || new < 0)
>                          return false;
>
>                  new = atomic_cmpxchg(&vma->vm_lock->count, count, new);
>                  if (new == count)
>                          return false;

Made a mistake above. It should have been:
                  if (new != count)
                          return false;


>
> Compared to doing atomic_inc_unless_negative() first, like I did
> originally, this schema opens a bit wider window for the writer to get
> in the middle and cause the reader to fail locking but I don't think
> it would result in any visible regression.
>
> >
> > >       return true;
> > > @@ -664,7 +676,8 @@ static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > >
> > >  static inline void vma_read_unlock(struct vm_area_struct *vma)
> > >  {
> >         lock release for read.
>
> Ack.
>
> >
> > > -     up_read(&vma->vm_lock->lock);
> > > +     if (atomic_dec_and_test(&vma->vm_lock->count))
> > > +             wake_up(&vma->vm_mm->vma_writer_wait);
> > >  }
> >