On Mon, Oct 03, 2016 at 12:27:01PM -0700, Dave Hansen wrote:
> On 10/02/2016 11:41 PM, Elena Reshetova wrote:
> >  static __always_inline void atomic_add(int i, atomic_t *v)
> >  {
> > -	asm volatile(LOCK_PREFIX "addl %1,%0"
> > +	asm volatile(LOCK_PREFIX "addl %1,%0\n"
> > +
> > +#ifdef CONFIG_HARDENED_ATOMIC
> > +		     "jno 0f\n"
> > +		     LOCK_PREFIX "subl %1,%0\n"
> > +		     "int $4\n0:\n"
> > +		     _ASM_EXTABLE(0b, 0b)
> > +#endif
> > +
> > +		     : "+m" (v->counter)
> > +		     : "ir" (i));
> > +}
>
> Rather than doing all this assembly and exception stuff, could we just do:
>
> static __always_inline void atomic_add(int i, atomic_t *v)
> {
> 	if (atomic_add_unless(v, a, INT_MAX))
> 		BUG_ON_OVERFLOW_FOO()...
> }
>
> That way, there's also no transient state where somebody can have
> observed the overflow before it is fixed up.  Granted, this
> cmpxchg-based operation _is_ more expensive than the fast-path locked addl.

I think we need some numbers, so I copypasted a bunch of kernel code
together so that I can benchmark this stuff in userspace without having a
full kernel implementation of refcounting protection. My code is at the
bottom of the mail; please test it on other CPUs, these are just numbers
from my machine.

The following numbers are from tests on an "Intel(R) Core(TM) i7-4790
CPU @ 3.60GHz" CPU.

First, I'm testing the single-threaded (uncontended) case:

$ gcc -o atomic_user_test atomic_user_test.c -std=gnu99 -Wall -pthread -ggdb
$ time ./atomic_user_test 1 1 1000000000 # single-threaded, no protection
real	0m9.281s
user	0m9.251s
sys	0m0.004s
$ time ./atomic_user_test 1 2 1000000000 # single-threaded, racy protection
real	0m9.385s
user	0m9.365s
sys	0m0.003s
$ time ./atomic_user_test 1 3 1000000000 # single-threaded, cmpxchg protection
real	0m12.399s
user	0m12.375s
sys	0m0.000s

The cmpxchg protection is something like 30% slower than the racy one. The
cmpxchg protection needs something like 12.4ns per operation, or around 45
cycles per operation. (Well, probably actually a bit less, considering that
the loop also costs some time.) My guess is that this wouldn't be noticeable.

Now, I'm testing the multi-threaded (contended) case, with two threads that
only try to increment the same counter over and over again - so this is a
pretty extreme worst-case microbenchmark:

$ time ./atomic_user_test 2 1 1000000000 # multi-threaded, no protection
real	0m9.550s
user	0m18.988s
sys	0m0.000s
$ time ./atomic_user_test 2 2 1000000000 # multi-threaded, racy protection
real	0m9.249s
user	0m18.430s
sys	0m0.004s
$ time ./atomic_user_test 2 3 1000000000 # multi-threaded, cmpxchg protection
real	1m47.331s
user	3m34.390s
sys	0m0.024s

Here, the cmpxchg-protected counter is around 11 times as slow as the
unprotected counter, with, on average, around 107ns per increment. That's
around 380 cycles per increment. (There is a quick sanity check of these
per-operation numbers after the test code below.)

I guess we probably don't care all that much about the few extra cycles in
the uncontended case. So I think the big question now is how important the
performance of the high-contention case is.
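For clarity, here is roughly what Dave's suggestion looks like once written
out, in the style of the userspace test code below. This is just a sketch:
atomic_add_cmpxchg() is my own name for it, __atomic_add_unless_() is the
helper from the test code, and abort() stands in for whatever the real
overflow handling would be.

static __always_inline void atomic_add_cmpxchg(int i, int *v)
{
	/* __atomic_add_unless_() adds i to *v unless *v is already INT_MAX
	 * and returns the old value, so seeing INT_MAX back means the add
	 * was refused. */
	if (__atomic_add_unless_(v, i, INT_MAX) == INT_MAX)
		abort(); /* placeholder for the real overflow handler */
}

Note that the benchmark below only exercises the increment case (i == 1);
for bigger increments a plain check against INT_MAX wouldn't be sufficient
on its own, but that doesn't change the timing question.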
My test code, cobbled together from the kernel sources and the suggested
mitigations:

===============================================
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <stdbool.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>
#include <sched.h>

#define LOCK_PREFIX "lock; "

/* plain atomic increment, no overflow protection */
static __always_inline void atomic_inc_raw(int *v) {
	asm volatile(LOCK_PREFIX "incl %0"
		     : "+m" (*v));
}

/* racy protection: detect the overflow after the fact and undo it */
static __always_inline void atomic_inc_racy(int *v) {
	asm volatile(LOCK_PREFIX "incl %0\n"
		     "jno 0f\n"
		     LOCK_PREFIX "decl %0\n"
		     "call abort\n0:\n"
		     : "+m" (*v));
}

#define READ_ONCE_32(v) (*(volatile int *)(v))

static __always_inline int cmpxchg(int *v, int old, int new) {
	volatile unsigned int *ptr = (volatile unsigned int *)(v);
	int ret;
	asm volatile(LOCK_PREFIX "cmpxchgl %2,%1"
		     : "=a" (ret), "+m" (*ptr)
		     : "r" (new), "0" (old)
		     : "memory");
	return ret;
}

/* add a to *v unless *v == u; returns the old value */
static __always_inline int __atomic_add_unless_(int *v, int a, int u) {
	int c, old;
	c = READ_ONCE_32(v);
	for (;;) {
		if (__builtin_expect(c == (u), 0))
			break;
		old = cmpxchg((v), c, c + (a));
		if (__builtin_expect(old == c, 1))
			break;
		/* someone else changed the counter in the meantime; retry */
		c = old;
	}
	return c;
}

/* cmpxchg protection: never increment past INT_MAX */
static __always_inline void atomic_inc_cmpxchg(int *v) {
	if (__atomic_add_unless_(v, 1, INT_MAX) == INT_MAX)
		abort();
}

int mode, type, count;

#define TYPE_RAW 1
#define TYPE_RACY 2
#define TYPE_CMPXCHG 3

int test_atomic;

void *childfn(void *arg) {
	switch (type) {
	case TYPE_RAW:
		for (int i=0; i<count; i++)
			atomic_inc_raw(&test_atomic);
		break;
	case TYPE_RACY:
		for (int i=0; i<count; i++)
			atomic_inc_racy(&test_atomic);
		break;
	case TYPE_CMPXCHG:
		for (int i=0; i<count; i++)
			atomic_inc_cmpxchg(&test_atomic);
		break;
	default:
		errx(1, "bad type");
	}
	return NULL;
}

int main(int argc, char **argv) {
	pthread_t thread;

	if (argc != 4)
		errx(1, "invocation: %s <mode> <type> <count>", argv[0]);
	mode = atoi(argv[1]);
	if (mode < 1 || mode > 2)
		errx(1, "bad mode");
	type = atoi(argv[2]);
	if (type < 1 || type > 3)
		errx(1, "bad type");
	count = atoi(argv[3]);
	if (count <= 0)
		errx(1, "bad count");

	if (mode == 2) {
		if (pthread_create(&thread, NULL, childfn, NULL))
			err(1, "pthread_create");
	}
	childfn(NULL);
	if (mode == 2) {
		if (pthread_join(thread, NULL))
			err(1, "join");
	}
	return 0;
}
===============================================
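And in case anyone wants to double-check the per-operation estimates above:
they are just the wall-clock times divided by the iteration count, converted
with the nominal 3.6GHz clock (turbo and frequency scaling ignored) - roughly
like this:

#include <stdio.h>

int main(void)
{
	/* wall-clock times (seconds) for 10^9 cmpxchg-protected increments,
	 * taken from the measurements above */
	double uncontended = 12.399, contended = 107.331;
	double iters = 1e9, ghz = 3.6; /* nominal clock, ignoring turbo */

	printf("uncontended: %.1f ns/op, ~%.0f cycles/op\n",
	       uncontended * 1e9 / iters, uncontended * 1e9 / iters * ghz);
	printf("contended:   %.1f ns/op, ~%.0f cycles/op\n",
	       contended * 1e9 / iters, contended * 1e9 / iters * ghz);
	return 0;
}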