From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752763AbbC0MOo (ORCPT ); Fri, 27 Mar 2015 08:14:44 -0400 Received: from mx1.redhat.com ([209.132.183.28]:37253 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751779AbbC0MOm (ORCPT ); Fri, 27 Mar 2015 08:14:42 -0400 Message-ID: <551549AF.50808@redhat.com> Date: Fri, 27 Mar 2015 13:14:39 +0100 From: Denys Vlasenko User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.2.0 MIME-Version: 1.0 To: Ingo Molnar , Brian Gerst CC: Andy Lutomirski , Borislav Petkov , the arch/x86 maintainers , Linux Kernel Mailing List Subject: Re: [PATCH] x86/asm/entry/64: better check for canonical address References: <1427373731-13056-1-git-send-email-dvlasenk@redhat.com> <20150327081141.GA9526@gmail.com> <551534B1.6090908@redhat.com> <20150327111738.GA8749@gmail.com> <20150327113430.GC14778@gmail.com> In-Reply-To: <20150327113430.GC14778@gmail.com> Content-Type: multipart/mixed; boundary="------------000902090606010704010903" Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org This is a multi-part message in MIME format. --------------000902090606010704010903 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit On 03/27/2015 12:34 PM, Ingo Molnar wrote: > > * Brian Gerst wrote: > >>> Btw., there's a neat trick we could do: in the HLT, MWAIT and >>> ACPI-idle code we could attempt to set up RCX to match RIP, to >>> trigger this optimization in the common 'irq interrupted the idle >>> task' case? >> >> sysret only returns to CPL3. > > Indeed, an IRET ought to be pretty cheap for same-ring interrupt > returns in any case. Unfortunately, it is not. Try attached program. On this CPU, 1 ns ~= 3 cycles. $ ./timing_test64 callret 10000 loops in 0.00008s = 7.87 nsec/loop for callret 100000 loops in 0.00076s = 7.56 nsec/loop for callret 1000000 loops in 0.00548s = 5.48 nsec/loop for callret 10000000 loops in 0.02882s = 2.88 nsec/loop for callret 100000000 loops in 0.18334s = 1.83 nsec/loop for callret 200000000 loops in 0.36051s = 1.80 nsec/loop for callret 400000000 loops in 0.71632s = 1.79 nsec/loop for callret Near call + near ret = 5 cycles $ ./timing_test64 lret 10000 loops in 0.00034s = 33.95 nsec/loop for lret 100000 loops in 0.00328s = 32.83 nsec/loop for lret 1000000 loops in 0.04541s = 45.41 nsec/loop for lret 10000000 loops in 0.32130s = 32.13 nsec/loop for lret 20000000 loops in 0.64191s = 32.10 nsec/loop for lret push my_cs + push next_label + far ret = ~90 cycles $ ./timing_test64 iret 10000 loops in 0.00344s = 343.90 nsec/loop for iret 100000 loops in 0.01890s = 188.97 nsec/loop for iret 1000000 loops in 0.08228s = 82.28 nsec/loop for iret 10000000 loops in 0.77910s = 77.91 nsec/loop for iret This is the "same-ring interrupt return". ~230 cycles! :( --------------000902090606010704010903 Content-Type: text/x-csrc; name="timing_test.c" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="timing_test.c" // To be unaffected by random cacheline placement, use generous "align": // // i686-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static // x86_64-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static #include #include #include #include #include #include #include #include #if !defined(__i386__) #define get_sysenter_addr() 0 #else #include long sysenter_addr; long get_sysenter_addr(char **envp) { Elf32_auxv_t *auxv; while (*envp++ != NULL) continue; for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++) if( auxv->a_type == AT_SYSINFO) return (sysenter_addr = auxv->a_un.a_val); fprintf(stderr, "AT_SYSINFO not supplied, can't test\n"); exit(0); /* this is not a failure */ } void sysenter_getpid(void) { asm volatile( "\n" " mov $20,%eax" // GETPID "\n" " call *sysenter_addr" ); } #endif #if defined(__i386__) #define L_or_Q "l" #define E_or_R "e" #else #define L_or_Q "q" #define E_or_R "r" #endif asm ( "\n" " .text" "\n" "ret__: ret" ); int main(int argc, char **argv, char **envp) { struct timespec start, end; unsigned long long duration; size_t loops, i; const char *mode; if (argc < 2) { printf("Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE\n"); return 1; } mode = argv[2]; if (!mode) { mode = argv[1]; loops = 10*1000; } else { loops = (size_t)atol(argv[1]) * 1000000; } again: if (!strcmp(mode, "nothing")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("# nothing"); } } else if (!strcmp(mode, "nop")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("nop"); } } else if (!strcmp(mode, "rdtsc")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile ("rdtsc" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "lfence_rdtsc")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile ("lfence;rdtsc" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "lfence_rdtsc_lfence")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile (""); asm volatile ("lfence;rdtsc;lfence" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "mfence_rdtsc_mfence")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile ("mfence;rdtsc;mfence" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "rdtscp")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, c, d; asm volatile ("rdtscp" : "=a" (a), "=c" (c), "=d" (d)); } } else if (!strcmp(mode, "gettimeofday")) { struct timeval tv; clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) gettimeofday(&tv, 0); } else if (!strcmp(mode, "getpid")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) syscall(SYS_getpid); #if defined(__i386__) } else if (!strcmp(mode, "sysenter_getpid")) { get_sysenter_addr(envp); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) sysenter_getpid(); } else if (!strcmp(mode, "iret")) { /* "push cs" is itself a bit expensive, moving it out of loop */ long saved_cs; asm volatile ("mov %%cs,%0" : "=r" (saved_cs)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push $0" // flags "\n" " push %0" // cs "\n" " push $1f" // ip "\n" " iret" "\n" "1:" : : "r" (saved_cs) ); } #endif #if defined(__x86_64__) } else if (!strcmp(mode, "iret")) { long saved_cs; long saved_ss; asm volatile ("mov %%cs,%0" : "=r" (saved_cs)); asm volatile ("mov %%ss,%0" : "=r" (saved_ss)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " mov %%rsp,%%rax" "\n" " push %0" // ss "\n" " push %%rax" // sp "\n" " push $0" // flags "\n" " push %1" // cs "\n" " push $1f" // ip "\n" " iretq" "\n" "1:" : : "r" (saved_ss), "r" (saved_cs) : "ax" ); } #endif } else if (!strcmp(mode, "lret")) { /* "push cs" is itself a bit expensive, moving it out of loop */ long saved_cs; asm volatile ("mov %%cs,%0" : "=r" (saved_cs)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push %0" "\n" " push $1f" "\n" " lret"L_or_Q "\n" "1:" : : "r" (saved_cs) ); } } else if (!strcmp(mode, "callret")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("call ret__"); } } else if (!strcmp(mode, "ret")) { /* This is useful to measure delays due to * return stack branch prediction not working * (we aren't using paired call/rets here, as CPU expects). * I observed "callret" test above being 4 times faster than this: */ clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push $1f" "\n" " ret" "\n" "1:" ); } } else if (!strcmp(mode, "loadss")) { long saved_ss; asm volatile ("mov %%ss,%0" : "=r" (saved_ss)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("mov %0,%%ss" : : "r" (saved_ss)); } } else if (!strcmp(mode, "pushf")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " pushf" "\n" " pop %%"E_or_R"ax" : : : "ax" ); } } else if (!strcmp(mode, "popf")) { long flags; asm volatile ( "\n" " pushf" "\n" " pop %0" : "=r" (flags) ); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push %0" "\n" " popf" : : "r" (flags) : "ax" ); } } else if (!strcmp(mode, "rdpmc")) { // Unlikely to work. unsigned int eax, edx; unsigned int ecx = 0; clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx)); } else { printf("Unknown mode %s\n", mode); return 1; } clock_gettime(CLOCK_MONOTONIC, &end); duration = (1000*1000*1000ULL * end.tv_sec + end.tv_nsec) - (1000*1000*1000ULL * start.tv_sec + start.tv_nsec); printf("%lu loops in %.5fs = %.2f nsec/loop for %s\n", (unsigned long)loops, (double)duration * 1e-9, (double)duration / loops, mode ); if (!argv[2]) { if (duration < 90*1000*1000) { loops *= 10; goto again; } if (duration < 490*1000*1000) { loops *= 2; goto again; } } return 0; } --------------000902090606010704010903--