From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933372AbdABRSl (ORCPT ); Mon, 2 Jan 2017 12:18:41 -0500 Received: from mail.kernel.org ([198.145.29.136]:60338 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756223AbdABRQs (ORCPT ); Mon, 2 Jan 2017 12:16:48 -0500 Date: Mon, 2 Jan 2017 14:16:40 -0300 From: Arnaldo Carvalho de Melo To: Waiman Long Cc: Thomas Gleixner , Ingo Molnar , Peter Zijlstra , Jonathan Corbet , linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org, Davidlohr Bueso , Mike Galbraith , Scott J Norton Subject: Re: [PATCH v4 13/20] perf bench: New microbenchmark for userspace mutex performance Message-ID: <20170102171640.GA21566@kernel.org> References: <1483028026-10305-1-git-send-email-longman@redhat.com> <1483028026-10305-14-git-send-email-longman@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1483028026-10305-14-git-send-email-longman@redhat.com> X-Url: http://acmel.wordpress.com User-Agent: Mutt/1.7.1 (2016-10-04) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Em Thu, Dec 29, 2016 at 11:13:39AM -0500, Waiman Long escreveu: > This microbenchmark simulates how the use of different futex types > can affect the actual performanace of userspace mutex locks. The > usage is: > > perf bench futex mutex Showing the tool output is preferred, trim it if needed or state why that is not possible. > Three sets of simple mutex lock and unlock functions are implemented > using the wait-wake, PI and TP futexes respectively. This > microbenchmark then runs the locking rate measurement tests using > either one of those mutexes or all of them consecutively. 
> > Signed-off-by: Waiman Long > --- > tools/perf/bench/Build | 1 + > tools/perf/bench/bench.h | 1 + > tools/perf/bench/futex-locks.c | 844 +++++++++++++++++++++++++++++++++++++++++ > tools/perf/bench/futex.h | 23 ++ > tools/perf/builtin-bench.c | 10 + > tools/perf/check-headers.sh | 4 + You forgot to document it in tools/perf/Documentation/perf-bench.txt Also I would suggest you submit this first without supporting TP futexes, i.e. supporting just what is in the kernel already, this way I could cherry-pick this, which would be useful already for benchmarking the existing types of futexes. Then, after the new futex type is accepted into the kernel, in a separate patch you would add support for it in 'perf bench futex mutex'. Thanks, - Arnaldo > 6 files changed, 883 insertions(+) > create mode 100644 tools/perf/bench/futex-locks.c > > diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build > index 60bf119..7ce1cd7 100644 > --- a/tools/perf/bench/Build > +++ b/tools/perf/bench/Build > @@ -6,6 +6,7 @@ perf-y += futex-wake.o > perf-y += futex-wake-parallel.o > perf-y += futex-requeue.o > perf-y += futex-lock-pi.o > +perf-y += futex-locks.o > > perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o > perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o > diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h > index 579a592..b0632df 100644 > --- a/tools/perf/bench/bench.h > +++ b/tools/perf/bench/bench.h > @@ -36,6 +36,7 @@ > int bench_futex_requeue(int argc, const char **argv, const char *prefix); > /* pi futexes */ > int bench_futex_lock_pi(int argc, const char **argv, const char *prefix); > +int bench_futex_mutex(int argc, const char **argv, const char *prefix); > > #define BENCH_FORMAT_DEFAULT_STR "default" > #define BENCH_FORMAT_DEFAULT 0 > diff --git a/tools/perf/bench/futex-locks.c b/tools/perf/bench/futex-locks.c > new file mode 100644 > index 0000000..ec1e627 > --- /dev/null > +++ b/tools/perf/bench/futex-locks.c > @@ -0,0 +1,844 @@ > +/* > + * Copyright 
(C) 2016 Waiman Long > + * > + * This microbenchmark simulates how the use of different futex types can > + * affect the actual performanace of userspace locking primitives like mutex. > + * > + * The raw throughput of the futex lock and unlock calls is not a good > + * indication of actual throughput of the mutex code as it may not really > + * need to call into the kernel. Therefore, 3 sets of simple mutex lock and > + * unlock functions are written to implenment a mutex lock using the > + * wait-wake, PI and TP futexes respectively. These functions serve as the > + * basis for measuring the locking throughput. > + */ > + > +#include > + > +#include > +#include > +#include "../util/stat.h" > +#include "../perf-sys.h" > +#include > +#include > +#include > +#include > +#include "bench.h" > +#include "futex.h" > + > +#include > +#include > +#include > + > +#define CACHELINE_SIZE 64 > +#define gettid() syscall(SYS_gettid) > +#define __cacheline_aligned __attribute__((__aligned__(CACHELINE_SIZE))) > + > +typedef u32 futex_t; > +typedef void (*lock_fn_t)(futex_t *futex, int tid); > +typedef void (*unlock_fn_t)(futex_t *futex, int tid); > + > +/* > + * Statistical count list > + */ > +enum { > + STAT_OPS, /* # of exclusive locking operations */ > + STAT_LOCKS, /* # of exclusive lock futex calls */ > + STAT_UNLOCKS, /* # of exclusive unlock futex calls */ > + STAT_SLEEPS, /* # of exclusive lock sleeps */ > + STAT_EAGAINS, /* # of EAGAIN errors */ > + STAT_WAKEUPS, /* # of wakeups (unlock return) */ > + STAT_HANDOFFS, /* # of lock handoff (TP only) */ > + STAT_STEALS, /* # of lock steals (TP only) */ > + STAT_LOCKERRS, /* # of exclusive lock errors */ > + STAT_UNLKERRS, /* # of exclusive unlock errors */ > + STAT_NUM /* Total # of statistical count */ > +}; > + > +/* > + * Syscall time list > + */ > +enum { > + TIME_LOCK, /* Total exclusive lock syscall time */ > + TIME_UNLK, /* Total exclusive unlock syscall time */ > + TIME_NUM, > +}; > + > +struct worker { > + futex_t 
*futex; > + pthread_t thread; > + > + /* > + * Per-thread operation statistics > + */ > + u32 stats[STAT_NUM]; > + > + /* > + * Lock/unlock times > + */ > + u64 times[TIME_NUM]; > +} __cacheline_aligned; > + > +/* > + * Global cache-aligned futex > + */ > +static futex_t __cacheline_aligned global_futex; > +static futex_t *pfutex = &global_futex; > + > +static __thread futex_t thread_id; /* Thread ID */ > +static __thread int counter; /* Sleep counter */ > + > +static struct worker *worker, *worker_alloc; > +static unsigned int nsecs = 10; > +static bool verbose, done, fshared, exit_now, timestat; > +static unsigned int ncpus, nthreads; > +static int flags; > +static const char *ftype; > +static int loadlat = 1; > +static int locklat = 1; > +static int wratio; > +struct timeval start, end, runtime; > +static unsigned int worker_start; > +static unsigned int threads_starting; > +static unsigned int threads_stopping; > +static struct stats throughput_stats; > +static lock_fn_t mutex_lock_fn; > +static unlock_fn_t mutex_unlock_fn; > + > +/* > + * Lock/unlock syscall time macro > + */ > +static inline void systime_add(int tid, int item, struct timespec *begin, > + struct timespec *_end) > +{ > + worker[tid].times[item] += (_end->tv_sec - begin->tv_sec)*1000000000 + > + _end->tv_nsec - begin->tv_nsec; > +} > + > +static inline double stat_percent(struct worker *w, int top, int bottom) > +{ > + return (double)w->stats[top] * 100 / w->stats[bottom]; > +} > + > +/* > + * Inline functions to update the statistical counts > + * > + * Enable statistics collection may sometimes impact the locking rates > + * to be measured. So we can specify the DISABLE_STAT macro to disable > + * statistic counts collection for all except the core locking rate counts. 
> + * > + * #define DISABLE_STAT > + */ > +#ifndef DISABLE_STAT > +static inline void stat_add(int tid, int item, int num) > +{ > + worker[tid].stats[item] += num; > +} > + > +static inline void stat_inc(int tid, int item) > +{ > + stat_add(tid, item, 1); > +} > +#else > +static inline void stat_add(int tid __maybe_unused, int item __maybe_unused, > + int num __maybe_unused) > +{ > +} > + > +static inline void stat_inc(int tid __maybe_unused, int item __maybe_unused) > +{ > +} > +#endif > + > +/* > + * The latency value within a lock critical section (load) and between locking > + * operations is in term of the number of cpu_relax() calls that are being > + * issued. > + */ > +static const struct option mutex_options[] = { > + OPT_INTEGER ('d', "locklat", &locklat, "Specify inter-locking latency (default = 1)"), > + OPT_STRING ('f', "ftype", &ftype, "type", "Specify futex type: WW, PI, TP, all (default)"), > + OPT_INTEGER ('L', "loadlat", &loadlat, "Specify load latency (default = 1)"), > + OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds, default = 10s)"), > + OPT_BOOLEAN ('S', "shared", &fshared, "Use shared futexes instead of private ones"), > + OPT_BOOLEAN ('T', "timestat", &timestat, "Track lock/unlock syscall times"), > + OPT_UINTEGER('t', "threads", &nthreads, "Specify number of threads, default = # of CPUs"), > + OPT_BOOLEAN ('v', "verbose", &verbose, "Verbose mode: display thread-level details"), > + OPT_INTEGER ('w', "wait-ratio", &wratio, "Specify /1024 of load is 1us sleep, default = 0"), > + OPT_END() > +}; > + > +static const char * const bench_futex_mutex_usage[] = { > + "perf bench futex mutex ", > + NULL > +}; > + > +/** > + * futex_cmpxchg() - atomic compare and exchange > + * @uaddr: The address of the futex to be modified > + * @oldval: The expected value of the futex > + * @newval: The new value to try and assign to the futex > + * > + * Implement cmpxchg using gcc atomic builtins. > + * > + * Return: the old futex value. 
> + */ > +static inline futex_t futex_cmpxchg(futex_t *uaddr, futex_t old, futex_t new) > +{ > + return __sync_val_compare_and_swap(uaddr, old, new); > +} > + > +/** > + * futex_xchg() - atomic exchange > + * @uaddr: The address of the futex to be modified > + * @newval: The new value to assign to the futex > + * > + * Implement cmpxchg using gcc atomic builtins. > + * > + * Return: the old futex value. > + */ > +static inline futex_t futex_xchg(futex_t *uaddr, futex_t new) > +{ > + return __sync_lock_test_and_set(uaddr, new); > +} > + > +/** > + * atomic_dec_return - atomically decrement & return the new value > + * @uaddr: The address of the futex to be decremented > + * Return: The new value > + */ > +static inline int atomic_dec_return(futex_t *uaddr) > +{ > + return __sync_sub_and_fetch(uaddr, 1); > +} > + > +/** > + * atomic_inc_return - atomically increment & return the new value > + * @uaddr: The address of the futex to be incremented > + * Return: The new value > + */ > +static inline int atomic_inc_return(futex_t *uaddr) > +{ > + return __sync_add_and_fetch(uaddr, 1); > +} > + > +/**********************[ MUTEX lock/unlock functions ]*********************/ > + > +/* > + * Wait-wake futex lock/unlock functions (Glibc implementation) > + * futex value: 0 - unlocked > + * 1 - locked > + * 2 - locked with waiters (contended) > + */ > +static void ww_mutex_lock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val = *futex; > + int ret; > + > + if (!val) { > + val = futex_cmpxchg(futex, 0, 1); > + if (val == 0) > + return; > + } > + > + for (;;) { > + if (val != 2) { > + /* > + * Force value to 2 to indicate waiter > + */ > + val = futex_xchg(futex, 2); > + if (val == 0) > + return; > + } > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_wait(futex, 2, NULL, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_LOCK, &stime, &etime); > + } else { > + ret = futex_wait(futex, 2, NULL, 
flags); > + } > + > + stat_inc(tid, STAT_LOCKS); > + if (ret < 0) { > + if (errno == EAGAIN) > + stat_inc(tid, STAT_EAGAINS); > + else > + stat_inc(tid, STAT_LOCKERRS); > + } > + > + val = *futex; > + } > +} > + > +static void ww_mutex_unlock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val; > + int ret; > + > + val = futex_xchg(futex, 0); > + > + if (val == 2) { > + stat_inc(tid, STAT_UNLOCKS); > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_wake(futex, 1, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_UNLK, &stime, &etime); > + } else { > + ret = futex_wake(futex, 1, flags); > + } > + if (ret < 0) > + stat_inc(tid, STAT_UNLKERRS); > + else > + stat_add(tid, STAT_WAKEUPS, ret); > + } > +} > + > +/* > + * Alternate wait-wake futex lock/unlock functions with thread_id lock word > + */ > +static void ww2_mutex_lock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val = *futex; > + int ret; > + > + if (!val) { > + val = futex_cmpxchg(futex, 0, thread_id); > + if (val == 0) > + return; > + } > + > + for (;;) { > + /* > + * Set the FUTEX_WAITERS bit, if not set yet. 
> + */ > + while (!(val & FUTEX_WAITERS)) { > + futex_t old; > + > + if (!val) { > + val = futex_cmpxchg(futex, 0, thread_id); > + if (val == 0) > + return; > + continue; > + } > + old = futex_cmpxchg(futex, val, val | FUTEX_WAITERS); > + if (old == val) { > + val |= FUTEX_WAITERS; > + break; > + } > + val = old; > + } > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_wait(futex, val, NULL, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_LOCK, &stime, &etime); > + } else { > + ret = futex_wait(futex, val, NULL, flags); > + } > + stat_inc(tid, STAT_LOCKS); > + if (ret < 0) { > + if (errno == EAGAIN) > + stat_inc(tid, STAT_EAGAINS); > + else > + stat_inc(tid, STAT_LOCKERRS); > + } > + > + val = *futex; > + } > +} > + > +static void ww2_mutex_unlock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val; > + int ret; > + > + val = futex_xchg(futex, 0); > + > + if ((val & FUTEX_TID_MASK) != thread_id) > + stat_inc(tid, STAT_UNLKERRS); > + > + if (val & FUTEX_WAITERS) { > + stat_inc(tid, STAT_UNLOCKS); > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_wake(futex, 1, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_UNLK, &stime, &etime); > + } else { > + ret = futex_wake(futex, 1, flags); > + } > + if (ret < 0) > + stat_inc(tid, STAT_UNLKERRS); > + else > + stat_add(tid, STAT_WAKEUPS, ret); > + } > +} > + > +/* > + * PI futex lock/unlock functions > + */ > +static void pi_mutex_lock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val; > + int ret; > + > + val = futex_cmpxchg(futex, 0, thread_id); > + if (val == 0) > + return; > + > + /* > + * Retry if an error happens > + */ > + for (;;) { > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_lock_pi(futex, NULL, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_LOCK, &stime, &etime); > + } else { > + ret 
= futex_lock_pi(futex, NULL, flags); > + } > + stat_inc(tid, STAT_LOCKS); > + if (ret >= 0) > + break; > + stat_inc(tid, STAT_LOCKERRS); > + } > +} > + > +static void pi_mutex_unlock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val; > + int ret; > + > + val = futex_cmpxchg(futex, thread_id, 0); > + if (val == thread_id) > + return; > + > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_unlock_pi(futex, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_UNLK, &stime, &etime); > + } else { > + ret = futex_unlock_pi(futex, flags); > + } > + if (ret < 0) > + stat_inc(tid, STAT_UNLKERRS); > + else > + stat_add(tid, STAT_WAKEUPS, ret); > + stat_inc(tid, STAT_UNLOCKS); > +} > + > +/* > + * TP futex lock/unlock functions > + */ > +static void tp_mutex_lock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val; > + int ret; > + > + val = futex_cmpxchg(futex, 0, thread_id); > + if (val == 0) > + return; > + > + /* > + * Retry if an error happens > + */ > + for (;;) { > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_lock(futex, NULL, flags); > + clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_LOCK, &stime, &etime); > + } else { > + ret = futex_lock(futex, NULL, flags); > + } > + stat_inc(tid, STAT_LOCKS); > + if (ret >= 0) > + break; > + stat_inc(tid, STAT_LOCKERRS); > + } > + /* > + * Get # of sleeps & locking method > + */ > + stat_add(tid, STAT_SLEEPS, ret >> 16); > + ret &= 0xff; > + if (!ret) > + stat_inc(tid, STAT_STEALS); > + else if (ret == 2) > + stat_inc(tid, STAT_HANDOFFS); > +} > + > +static void tp_mutex_unlock(futex_t *futex, int tid) > +{ > + struct timespec stime, etime; > + futex_t val; > + int ret; > + > + val = futex_cmpxchg(futex, thread_id, 0); > + if (val == thread_id) > + return; > + > + if (timestat) { > + clock_gettime(CLOCK_REALTIME, &stime); > + ret = futex_unlock(futex, flags); > + 
clock_gettime(CLOCK_REALTIME, &etime); > + systime_add(tid, TIME_UNLK, &stime, &etime); > + } else { > + ret = futex_unlock(futex, flags); > + } > + stat_inc(tid, STAT_UNLOCKS); > + if (ret < 0) > + stat_inc(tid, STAT_UNLKERRS); > + else > + stat_add(tid, STAT_WAKEUPS, ret); > +} > + > +/**************************************************************************/ > + > +/* > + * Load function > + */ > +static inline void load(int tid) > +{ > + int n = loadlat; > + > + /* > + * Optionally does a 1us sleep instead if wratio is defined and > + * is within bound. > + */ > + if (wratio && (((counter++ + tid) & 0x3ff) < wratio)) { > + usleep(1); > + return; > + } > + > + while (n-- > 0) > + cpu_relax(); > +} > + > +static inline void csdelay(void) > +{ > + int n = locklat; > + > + while (n-- > 0) > + cpu_relax(); > +} > + > +static void toggle_done(int sig __maybe_unused, > + siginfo_t *info __maybe_unused, > + void *uc __maybe_unused) > +{ > + /* inform all threads that we're done for the day */ > + done = true; > + gettimeofday(&end, NULL); > + timersub(&end, &start, &runtime); > + if (sig) > + exit_now = true; > +} > + > +static void *mutex_workerfn(void *arg) > +{ > + long tid = (long)arg; > + struct worker *w = &worker[tid]; > + lock_fn_t lock_fn = mutex_lock_fn; > + unlock_fn_t unlock_fn = mutex_unlock_fn; > + > + thread_id = gettid(); > + counter = 0; > + > + atomic_dec_return(&threads_starting); > + > + /* > + * Busy wait until asked to start > + */ > + while (!worker_start) > + cpu_relax(); > + > + do { > + lock_fn(w->futex, tid); > + load(tid); > + unlock_fn(w->futex, tid); > + w->stats[STAT_OPS]++; /* One more locking operation */ > + csdelay(); > + } while (!done); > + > + if (verbose) > + printf("[thread %3ld (%d)] exited.\n", tid, thread_id); > + atomic_inc_return(&threads_stopping); > + return NULL; > +} > + > +static void create_threads(struct worker *w, pthread_attr_t *thread_attr, > + void *(*workerfn)(void *arg), long tid) > +{ > + cpu_set_t cpu; > + > 
+ /* > + * Bind each thread to a CPU > + */ > + CPU_ZERO(&cpu); > + CPU_SET(tid % ncpus, &cpu); > + w->futex = pfutex; > + > + if (pthread_attr_setaffinity_np(thread_attr, sizeof(cpu_set_t), &cpu)) > + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); > + > + if (pthread_create(&w->thread, thread_attr, workerfn, (void *)tid)) > + err(EXIT_FAILURE, "pthread_create"); > +} > + > +static int futex_mutex_type(const char **ptype) > +{ > + const char *type = *ptype; > + > + if (!strcasecmp(type, "WW")) { > + *ptype = "WW"; > + mutex_lock_fn = ww_mutex_lock; > + mutex_unlock_fn = ww_mutex_unlock; > + } else if (!strcasecmp(type, "WW2")) { > + *ptype = "WW2"; > + mutex_lock_fn = ww2_mutex_lock; > + mutex_unlock_fn = ww2_mutex_unlock; > + } else if (!strcasecmp(type, "PI")) { > + *ptype = "PI"; > + mutex_lock_fn = pi_mutex_lock; > + mutex_unlock_fn = pi_mutex_unlock; > + } else if (!strcasecmp(type, "TP")) { > + *ptype = "TP"; > + mutex_lock_fn = tp_mutex_lock; > + mutex_unlock_fn = tp_mutex_unlock; > + > + /* > + * Check if TP futex is supported. > + */ > + futex_unlock(&global_futex, 0); > + if (errno == ENOSYS) { > + fprintf(stderr, > + "\nTP futexes are not supported by the kernel!\n"); > + return -1; > + } > + } else { > + return -1; > + } > + return 0; > +} > + > +static void futex_test_driver(const char *futex_type, > + int (*proc_type)(const char **ptype), > + void *(*workerfn)(void *arg)) > +{ > + u64 us; > + int i, j; > + struct worker total; > + double avg, stddev; > + pthread_attr_t thread_attr; > + > + /* > + * There is an extra blank line before the error counts to highlight > + * them. 
> + */ > + const char *desc[STAT_NUM] = { > + [STAT_OPS] = "Total exclusive locking ops", > + [STAT_LOCKS] = "Exclusive lock futex calls", > + [STAT_UNLOCKS] = "Exclusive unlock futex calls", > + [STAT_SLEEPS] = "Exclusive lock sleeps", > + [STAT_WAKEUPS] = "Process wakeups", > + [STAT_EAGAINS] = "EAGAIN lock errors", > + [STAT_HANDOFFS] = "Lock handoffs", > + [STAT_STEALS] = "Lock stealings", > + [STAT_LOCKERRS] = "\nExclusive lock errors", > + [STAT_UNLKERRS] = "\nExclusive unlock errors", > + }; > + > + if (exit_now) > + return; > + > + if (proc_type(&futex_type) < 0) { > + fprintf(stderr, "Unknown futex type '%s'!\n", futex_type); > + exit(1); > + } > + > + printf("\n=====================================\n"); > + printf("[PID %d]: %d threads doing %s futex lockings (load=%d) for %d secs.\n\n", > + getpid(), nthreads, futex_type, loadlat, nsecs); > + > + init_stats(&throughput_stats); > + > + *pfutex = 0; > + done = false; > + threads_starting = nthreads; > + pthread_attr_init(&thread_attr); > + > + for (i = 0; i < (int)nthreads; i++) > + create_threads(&worker[i], &thread_attr, workerfn, i); > + > + while (threads_starting) > + usleep(1); > + > + gettimeofday(&start, NULL); > + > + /* > + * Start the test > + * > + * Unlike the other futex benchmarks, this one uses busy waiting > + * instead of pthread APIs to make sure that all the threads (except > + * the one that shares CPU with the parent) will start more or less > + * simultaineously. > + */ > + atomic_inc_return(&worker_start); > + sleep(nsecs); > + toggle_done(0, NULL, NULL); > + > + /* > + * In verbose mode, we check if all the threads have been stopped > + * after 1ms and report the status if some are still running. 
> + */ > + if (verbose) { > + usleep(1000); > + if (threads_stopping != nthreads) { > + printf("%d threads still running 1ms after timeout" > + " - futex = 0x%x\n", > + nthreads - threads_stopping, *pfutex); > + /* > + * If the threads are still running after 10s, > + * go directly to statistics printing and exit. > + */ > + for (i = 10; i > 0; i--) { > + sleep(1); > + if (threads_stopping == nthreads) > + break; > + } > + if (!i) { > + printf("*** Threads waiting ABORTED!! ***\n\n"); > + goto print_stat; > + } > + } > + } > + > + for (i = 0; i < (int)nthreads; i++) { > + int ret = pthread_join(worker[i].thread, NULL); > + > + if (ret) > + err(EXIT_FAILURE, "pthread_join"); > + } > + > +print_stat: > + pthread_attr_destroy(&thread_attr); > + > + us = runtime.tv_sec * 1000000 + runtime.tv_usec; > + memset(&total, 0, sizeof(total)); > + for (i = 0; i < (int)nthreads; i++) { > + /* > + * Get a rounded estimate of the # of locking ops/sec. > + */ > + u64 tp = (u64)worker[i].stats[STAT_OPS] * 1000000 / us; > + > + for (j = 0; j < STAT_NUM; j++) > + total.stats[j] += worker[i].stats[j]; > + > + update_stats(&throughput_stats, tp); > + if (verbose) > + printf("[thread %3d] futex: %p [ %'ld ops/sec ]\n", > + i, worker[i].futex, (long)tp); > + } > + > + avg = avg_stats(&throughput_stats); > + stddev = stddev_stats(&throughput_stats); > + > + printf("Locking statistics:\n"); > + printf("%-28s = %'.2fs\n", "Test run time", (double)us/1000000); > + for (i = 0; i < STAT_NUM; i++) > + if (total.stats[i]) > + printf("%-28s = %'d\n", desc[i], total.stats[i]); > + > + if (timestat && total.times[TIME_LOCK]) { > + printf("\nSyscall times:\n"); > + if (total.stats[STAT_LOCKS]) > + printf("Avg exclusive lock syscall = %'ldns\n", > + total.times[TIME_LOCK]/total.stats[STAT_LOCKS]); > + if (total.stats[STAT_UNLOCKS]) > + printf("Avg exclusive unlock syscall = %'ldns\n", > + total.times[TIME_UNLK]/total.stats[STAT_UNLOCKS]); > + } > + > + printf("\nPercentages:\n"); > + if 
(total.stats[STAT_LOCKS]) > + printf("Exclusive lock futex calls = %.1f%%\n", > + stat_percent(&total, STAT_LOCKS, STAT_OPS)); > + if (total.stats[STAT_UNLOCKS]) > + printf("Exclusive unlock futex calls = %.1f%%\n", > + stat_percent(&total, STAT_UNLOCKS, STAT_OPS)); > + if (total.stats[STAT_EAGAINS]) > + printf("EAGAIN lock errors = %.1f%%\n", > + stat_percent(&total, STAT_EAGAINS, STAT_LOCKS)); > + if (total.stats[STAT_WAKEUPS]) > + printf("Process wakeups = %.1f%%\n", > + stat_percent(&total, STAT_WAKEUPS, STAT_UNLOCKS)); > + > + printf("\nPer-thread Locking Rates:\n"); > + printf("Avg = %'d ops/sec (+- %.2f%%)\n", (int)(avg + 0.5), > + rel_stddev_stats(stddev, avg)); > + printf("Min = %'d ops/sec\n", (int)throughput_stats.min); > + printf("Max = %'d ops/sec\n", (int)throughput_stats.max); > + > + if (*pfutex != 0) > + printf("\nResidual futex value = 0x%x\n", *pfutex); > + > + /* Clear the workers area */ > + memset(worker, 0, sizeof(*worker) * nthreads); > +} > + > +int bench_futex_mutex(int argc, const char **argv, > + const char *prefix __maybe_unused) > +{ > + struct sigaction act; > + > + argc = parse_options(argc, argv, mutex_options, > + bench_futex_mutex_usage, 0); > + if (argc) > + goto err; > + > + ncpus = sysconf(_SC_NPROCESSORS_ONLN); > + > + sigfillset(&act.sa_mask); > + act.sa_sigaction = toggle_done; > + sigaction(SIGINT, &act, NULL); > + > + if (!nthreads) > + nthreads = ncpus; > + > + /* > + * Since the allocated memory buffer may not be properly cacheline > + * aligned, we need to allocate one more than needed and manually > + * adjust array boundary to be cacheline aligned. 
> + */ > + worker_alloc = calloc(nthreads + 1, sizeof(*worker)); > + if (!worker_alloc) > + err(EXIT_FAILURE, "calloc"); > + worker = (void *)((unsigned long)&worker_alloc[1] & > + ~(CACHELINE_SIZE - 1)); > + > + if (!fshared) > + flags = FUTEX_PRIVATE_FLAG; > + > + if (!ftype || !strcmp(ftype, "all")) { > + futex_test_driver("WW", futex_mutex_type, mutex_workerfn); > + futex_test_driver("PI", futex_mutex_type, mutex_workerfn); > + futex_test_driver("TP", futex_mutex_type, mutex_workerfn); > + } else { > + futex_test_driver(ftype, futex_mutex_type, mutex_workerfn); > + } > + free(worker_alloc); > + return 0; > +err: > + usage_with_options(bench_futex_mutex_usage, mutex_options); > + exit(EXIT_FAILURE); > +} > diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h > index ba7c735..be1eb7a 100644 > --- a/tools/perf/bench/futex.h > +++ b/tools/perf/bench/futex.h > @@ -87,6 +87,29 @@ > val, opflags); > } > > +#ifndef FUTEX_LOCK > +#define FUTEX_LOCK 0xff /* Disable the use of TP futexes */ > +#define FUTEX_UNLOCK 0xff > +#endif /* FUTEX_LOCK */ > + > +/** > + * futex_lock() - lock the TP futex > + */ > +static inline int > +futex_lock(u_int32_t *uaddr, struct timespec *timeout, int opflags) > +{ > + return futex(uaddr, FUTEX_LOCK, 0, timeout, NULL, 0, opflags); > +} > + > +/** > + * futex_unlock() - unlock the TP futex > + */ > +static inline int > +futex_unlock(u_int32_t *uaddr, int opflags) > +{ > + return futex(uaddr, FUTEX_UNLOCK, 0, NULL, NULL, 0, opflags); > +} > + > #ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP > #include > static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr, > diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c > index a1cddc6..bf4418d 100644 > --- a/tools/perf/builtin-bench.c > +++ b/tools/perf/builtin-bench.c > @@ -20,6 +20,7 @@ > #include "builtin.h" > #include "bench/bench.h" > > +#include > #include > #include > #include > @@ -62,6 +63,7 @@ struct bench { > { "requeue", "Benchmark for futex requeue 
calls", bench_futex_requeue }, > /* pi-futexes */ > { "lock-pi", "Benchmark for futex lock_pi calls", bench_futex_lock_pi }, > + { "mutex", "Benchmark for mutex locks using futexes", bench_futex_mutex }, > { "all", "Run all futex benchmarks", NULL }, > { NULL, NULL, NULL } > }; > @@ -215,6 +217,7 @@ int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused) > { > struct collection *coll; > int ret = 0; > + char *locale; > > if (argc < 2) { > /* No collection specified. */ > @@ -222,6 +225,13 @@ int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused) > goto end; > } > > + /* > + * Enable better number formatting. > + */ > + locale = setlocale(LC_NUMERIC, ""); > + if (!strcmp(locale, "C")) > + setlocale(LC_NUMERIC, "en_US"); > + > argc = parse_options(argc, argv, bench_options, bench_usage, > PARSE_OPT_STOP_AT_NON_OPTION); > > diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh > index c747bfd..6ec5f2d 100755 > --- a/tools/perf/check-headers.sh > +++ b/tools/perf/check-headers.sh > @@ -57,3 +57,7 @@ check arch/x86/lib/memcpy_64.S -B -I "^EXPORT_SYMBOL" -I "^#include check arch/x86/lib/memset_64.S -B -I "^EXPORT_SYMBOL" -I "^#include " > check include/uapi/asm-generic/mman.h -B -I "^#include <\(uapi/\)*asm-generic/mman-common.h>" > check include/uapi/linux/mman.h -B -I "^#include <\(uapi/\)*asm/mman.h>" > + > +# need to link to the latest futex.h header > +[[ -f ../../include/uapi/linux/futex.h ]] && > + ln -sf ../../../../../include/uapi/linux/futex.h util/include/linux > -- > 1.8.3.1