linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jann Horn <jannh@google.com>
To: Florian Weimer <fweimer@redhat.com>
Cc: Andrei Vagin <avagin@gmail.com>,
	kernel list <linux-kernel@vger.kernel.org>,
	Linux API <linux-api@vger.kernel.org>,
	linux-um@lists.infradead.org, criu@openvz.org,
	Andrei Vagin <avagin@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andy Lutomirski <luto@kernel.org>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Christian Brauner <christian.brauner@ubuntu.com>,
	Dmitry Safonov <0x7f454c46@gmail.com>,
	Ingo Molnar <mingo@redhat.com>, Jeff Dike <jdike@addtoit.com>,
	Mike Rapoport <rppt@linux.ibm.com>,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Oleg Nesterov <oleg@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Richard Weinberger <richard@nod.at>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [PATCH 0/4 POC] Allow executing code and syscalls in another address space
Date: Wed, 14 Apr 2021 15:58:25 +0200	[thread overview]
Message-ID: <CAG48ez02UDn_yeLuLF4c=kX0=h2Qq8Fdb0cer1yN8atbXSNjkQ@mail.gmail.com> (raw)
In-Reply-To: <874kg99hwf.fsf@oldenburg.str.redhat.com>

 On Wed, Apr 14, 2021 at 2:20 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Jann Horn:
>
> > On Wed, Apr 14, 2021 at 12:27 PM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Andrei Vagin:
> >>
> >> > We already have process_vm_readv and process_vm_writev to read and write
> >> > to a process memory faster than we can do this with ptrace. And now it
> >> > is time for process_vm_exec that allows executing code in an address
> >> > space of another process. We can do this with ptrace but it is much
> >> > slower.
> >> >
> >> > = Use-cases =
> >>
> >> We also have some vaguely related use cases within the same address space: running
> >> code on another thread, without modifying its stack, while it has signal
> >> handlers blocked, and without causing system calls to fail with EINTR.
> >> This can be used to implement certain kinds of memory barriers.
> >
> > That's what the membarrier() syscall is for, right? Unless you don't
> > want to register all threads for expedited membarrier use?
>
> membarrier is not sufficiently powerful for revoking biased locks, for
> example.

But on Linux >=5.10, together with rseq, it is, right? Then lock
acquisition could look roughly like this, in pseudo-C (yes, I know,
real rseq doesn't quite look like that, you'd need inline asm for that
unless the compiler adds special support for this):


/* Values stored in biased_lock.local_state. */
enum local_state {
  /* Not held right now: the lock is free, or only biased to its owner. */
  STATE_FREE_OR_BIASED = 0,
  /* Held: written by lock() on acquisition, reset by unlock(). */
  STATE_LOCKED = 1
};
/* Flag bits OR'ed into biased_lock.owner_with_lockbit next to the tid. */
#define OWNER_LOCKBIT    (1U << 31)
/* Notify the futex when OWNER_LOCKBIT is cleared. */
#define OWNER_WAITER_BIT (1U << 30)
/*
 * State of one biased lock, as manipulated by lock() and unlock().
 */
struct biased_lock {
  /* tid of the owning thread, possibly OR'ed with OWNER_LOCKBIT and
   * OWNER_WAITER_BIT; also used as a futex word by lock(). */
  unsigned int     owner_with_lockbit;
  /* STATE_LOCKED while held, STATE_FREE_OR_BIASED otherwise; also used
   * as a futex word by lock() and unlock(). */
  enum local_state local_state;
};

/*
 * Acquire the biased lock L for the calling thread.
 *
 * This is a pseudo-C sketch: RSEQ_SEQUENCE_START / RSEQ_COMMIT /
 * RSEQ_SEQUENCE_END stand in for a real rseq critical section, which
 * would need inline asm.
 *
 * Fast path: if the owner word is exactly our tid (no flag bits set), the
 * lock is biased to us and we take it by committing STATE_LOCKED inside
 * the rseq sequence.
 *
 * Slow path: take OWNER_LOCKBIT on the owner word, issue an rseq
 * membarrier against the previous owner, wait for local_state to become
 * free, mark it locked, then drop OWNER_LOCKBIT leaving our bare tid as
 * the new owner.
 *
 * Calls fatal_error() if the calling thread already appears to hold the
 * lock or if membarrier() fails. Returns nothing.
 */
void lock(struct biased_lock *L) {
  unsigned int my_tid = THREAD_SELF->tid;
  RSEQ_SEQUENCE_START(); // restart here on failure
  /* The owner lives in owner_with_lockbit; equality with the bare tid
   * also implies that neither flag bit is set. */
  if (READ_ONCE(L->owner_with_lockbit) == my_tid) {
    if (READ_ONCE(L->local_state) == STATE_LOCKED) {
      RSEQ_SEQUENCE_END();
      /*
       * Deadlock, abort execution.
       * Note that we are not necessarily actually *holding* the lock;
       * this can also happen if we entered a signal handler while we
       * were in the process of acquiring the lock.
       * But in that case it could just as well have happened that we
       * already grabbed the lock, so the caller is wrong anyway.
       */
      fatal_error();
    }
    RSEQ_COMMIT(L->local_state = STATE_LOCKED);
    return; /* fastpath success */
  }
  RSEQ_SEQUENCE_END();

  /* slowpath */
  /* acquire and lock owner field */
  unsigned int old_owner_with_lockbit;
  while (1) {
    old_owner_with_lockbit = READ_ONCE(L->owner_with_lockbit);
    if (old_owner_with_lockbit & OWNER_LOCKBIT) {
      /*
       * Someone else holds the lockbit: flag that there is a waiter, then
       * sleep until the word changes.
       * NOTE(review): the CAS stores my_tid rather than the previous tid,
       * and the wait compares against the pre-CAS value; presumably the
       * first FUTEX_WAIT returns immediately on the mismatch and the next
       * loop iteration actually sleeps -- confirm.
       */
      if (!__sync_bool_compare_and_swap(&L->owner_with_lockbit,
                                        old_owner_with_lockbit,
                                        my_tid | OWNER_LOCKBIT | OWNER_WAITER_BIT))
        continue;
      futex(&L->owner_with_lockbit, FUTEX_WAIT,
            old_owner_with_lockbit, NULL, NULL, 0);
      continue;
    } else {
      if (__sync_bool_compare_and_swap(&L->owner_with_lockbit,
                                       old_owner_with_lockbit,
                                       my_tid | OWNER_LOCKBIT))
        break;
    }
  }

  /*
   * ensure old owner won't lock local_state anymore.
   * we only have to worry about the owner that directly preceded us here;
   * it will have done this step for the owners that preceded it before clearing
   * the LOCKBIT; so if we were the old owner, we don't have to sync.
   */
  if (old_owner_with_lockbit != my_tid) {
    if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0))
      fatal_error();
  }

  /*
   * As soon as the lock becomes STATE_FREE_OR_BIASED, we own it; but
   * at this point it might still be locked.
   */
  while (READ_ONCE(L->local_state) == STATE_LOCKED) {
    futex(&L->local_state, FUTEX_WAIT, STATE_LOCKED, NULL, NULL, 0);
  }

  /* OK, now the lock is biased to us and we can grab it. */
  WRITE_ONCE(L->local_state, STATE_LOCKED);

  /*
   * drop lockbit: replace whatever is in the owner word with our bare tid.
   * A separate local is used so the pre-acquisition value read above is
   * not redeclared or clobbered.
   */
  unsigned int locked_word;
  while (1) {
    locked_word = READ_ONCE(L->owner_with_lockbit);
    if (__sync_bool_compare_and_swap(&L->owner_with_lockbit,
                                     locked_word, my_tid))
      break;
  }
  /* Somebody set the waiter bit while we held the lockbit: wake them. */
  if (locked_word & OWNER_WAITER_BIT)
    futex(&L->owner_with_lockbit, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}

/*
 * Release the biased lock L held by the calling thread.
 *
 * Pseudo-C sketch: RSEQ_SEQUENCE_START / RSEQ_COMMIT stand in for a real
 * rseq critical section. Inside the sequence the owner word is sampled
 * and local_state is committed back to STATE_FREE_OR_BIASED; afterwards
 * waiters on local_state are woken unless the sampled owner word was
 * still exactly our tid.
 */
void unlock(struct biased_lock *L) {
  unsigned int self_tid = THREAD_SELF->tid;
  unsigned int owner_word;

  /*
   * Interaction with the membarrier() in lock():
   *  - running before it, the lock() path will immediately see the lock
   *    as uncontended, so no futex() call is needed here;
   *  - running after it, the read of ->owner_with_lockbit below observes
   *    the new owner and we wake the futex.
   */
  RSEQ_SEQUENCE_START();
  owner_word = READ_ONCE(L->owner_with_lockbit);
  RSEQ_COMMIT(WRITE_ONCE(L->local_state, STATE_FREE_OR_BIASED));

  /* Owner word was still our bare tid: nothing to wake. */
  if (owner_word == self_tid)
    return;

  futex(&L->local_state, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}

  reply	other threads:[~2021-04-14 13:59 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-14  5:52 [PATCH 0/4 POC] Allow executing code and syscalls in another address space Andrei Vagin
2021-04-14  5:52 ` [PATCH 1/4] signal: add a helper to restore a process state from sigcontex Andrei Vagin
2021-04-14  5:52 ` [PATCH 2/4] arch/x86: implement the process_vm_exec syscall Andrei Vagin
2021-04-14 17:09   ` Oleg Nesterov
2021-04-23  6:59     ` Andrei Vagin
2021-06-28 16:13   ` Jann Horn
2021-06-28 16:30     ` Andy Lutomirski
2021-06-28 17:14       ` Jann Horn
2021-06-28 18:18         ` Eric W. Biederman
2021-06-29  1:01           ` Andrei Vagin
2021-07-02  6:22     ` Andrei Vagin
2021-07-02 11:51       ` Jann Horn
2021-07-02 20:40         ` Andy Lutomirski
2021-07-02  8:51   ` Peter Zijlstra
2021-07-02 22:21     ` Andrei Vagin
2021-07-02 20:56   ` Jann Horn
2021-07-02 22:48     ` Andrei Vagin
2021-04-14  5:52 ` [PATCH 3/4] arch/x86: allow to execute syscalls via process_vm_exec Andrei Vagin
2021-04-14  5:52 ` [PATCH 4/4] selftests: add tests for process_vm_exec Andrei Vagin
2021-04-14  6:46 ` [PATCH 0/4 POC] Allow executing code and syscalls in another address space Jann Horn
2021-04-14 22:10   ` Andrei Vagin
2021-07-02  6:57   ` Andrei Vagin
2021-07-02 15:12     ` Jann Horn
2021-07-18  0:38       ` Andrei Vagin
2021-04-14  7:22 ` Anton Ivanov
2021-04-14  7:34   ` Johannes Berg
2021-04-14  9:24     ` Benjamin Berg
2021-04-14 10:27 ` Florian Weimer
2021-04-14 11:24   ` Jann Horn
2021-04-14 12:20     ` Florian Weimer
2021-04-14 13:58       ` Jann Horn [this message]
2021-04-16 19:29 ` Kirill Smelkov
2021-04-17 16:28 ` sbaugh
2021-07-02 22:44 ` Andy Lutomirski
2021-07-18  1:34   ` Andrei Vagin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAG48ez02UDn_yeLuLF4c=kX0=h2Qq8Fdb0cer1yN8atbXSNjkQ@mail.gmail.com' \
    --to=jannh@google.com \
    --cc=0x7f454c46@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=anton.ivanov@cambridgegreys.com \
    --cc=avagin@gmail.com \
    --cc=avagin@google.com \
    --cc=christian.brauner@ubuntu.com \
    --cc=criu@openvz.org \
    --cc=fweimer@redhat.com \
    --cc=jdike@addtoit.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-um@lists.infradead.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=mtk.manpages@gmail.com \
    --cc=oleg@redhat.com \
    --cc=peterz@infradead.org \
    --cc=richard@nod.at \
    --cc=rppt@linux.ibm.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).