TODO: remove (or split out) //temp-s

Note: Ankur indicates that for ~L3-size or larger regions MOVNT/CLZERO is
better even latency-wise

What this patch does:
 * clear_page.S: adds page-clearing variants next to clear_page_sse2:
   clear_page_stosb/stosl/stosq (REP STOS), clear_page_avx and
   clear_page_avx512 (VMOVNTDQ followed by SFENCE), and clear_page_clzero.
   The AVX-512 and CLZERO variants are only assembled when __GNUC__ is
   greater than 4 and 5 respectively.
 * cpu/common.c, cpu/intel_cacheinfo.c: record the detected L1D and L2
   sizes in the temporary globals l1d_size / l2_size.  Both are in KiB.
 * mm.c: adds a temporary boot-time benchmark to arch_init_memory().  It
   runs four passes (L1D- or L2-sized working set, with or without a
   WBINVD between the two memset()s), three rounds each.  For every
   variant it reports rdtsc_ordered() deltas for: probing the "fill"
   buffer, clearing half as many pages in "buf", and probing "fill" again.
   probe() sums one word per 64-byte chunk of every page; CHK() calls
   BUG() if it returns non-zero.

Review notes:
 * l1d_size used to default to KB(16).  Every other writer stores KiB and
   the benchmark shifts the value left by 10, so the default is now 16.
 * The buffers are now sized from the larger of l1d_size and l2_size.
   They used to be sized from l2_size alone, which the L1D-sized passes
   could overrun when l2_size was 0 or smaller than l1d_size.
 * NOTE(review): this copy was rebuilt from a whitespace-collapsed
   original.  Added lines and hunk line counts match the hunk headers,
   but indentation and the position of blank context lines are inferred.
   Verify against the tree before applying.

--- unstable.orig/xen/arch/x86/clear_page.S	2021-02-25 09:28:14.175636881 +0100
+++ unstable/xen/arch/x86/clear_page.S	2021-02-25 10:04:04.315325973 +0100
@@ -16,3 +16,66 @@ ENTRY(clear_page_sse2)
 
         sfence
         ret
+
+ENTRY(clear_page_stosb)
+        mov     $PAGE_SIZE, %ecx
+        xor     %eax,%eax
+        rep stosb
+        ret
+
+ENTRY(clear_page_stosl)
+        mov     $PAGE_SIZE/4, %ecx
+        xor     %eax, %eax
+        rep stosl
+        ret
+
+ENTRY(clear_page_stosq)
+        mov     $PAGE_SIZE/8, %ecx
+        xor     %eax, %eax
+        rep stosq
+        ret
+
+ENTRY(clear_page_avx)
+        mov     $PAGE_SIZE/128, %ecx
+        vpxor   %xmm0, %xmm0, %xmm0
+0:      vmovntdq %ymm0,   (%rdi)
+        vmovntdq %ymm0, 32(%rdi)
+        vmovntdq %ymm0, 64(%rdi)
+        vmovntdq %ymm0, 96(%rdi)
+        sub     $-128, %rdi
+        sub     $1, %ecx
+        jnz     0b
+        sfence
+        ret
+
+#if __GNUC__ > 4
+ENTRY(clear_page_avx512)
+        mov     $PAGE_SIZE/256, %ecx
+        vpxor   %xmm0, %xmm0, %xmm0
+0:      vmovntdq %zmm0,    (%rdi)
+        vmovntdq %zmm0,  64(%rdi)
+        vmovntdq %zmm0, 128(%rdi)
+        vmovntdq %zmm0, 192(%rdi)
+        add     $256, %rdi
+        sub     $1, %ecx
+        jnz     0b
+        sfence
+        ret
+#endif
+
+#if __GNUC__ > 5
+ENTRY(clear_page_clzero)
+        mov     %rdi, %rax
+        mov     $PAGE_SIZE/256, %ecx
+0:      clzero
+        add     $64, %rax
+        clzero
+        add     $64, %rax
+        clzero
+        add     $64, %rax
+        clzero
+        add     $64, %rax
+        sub     $1, %ecx
+        jnz     0b
+        ret
+#endif
--- unstable.orig/xen/arch/x86/cpu/common.c	2021-02-09 16:20:45.000000000 +0100
+++ unstable/xen/arch/x86/cpu/common.c	2021-02-09 16:20:45.000000000 +0100
@@ -238,6 +238,7 @@ int get_model_name(struct cpuinfo_x86 *c
 }
 
 
+extern unsigned l1d_size, l2_size;//temp
 void display_cacheinfo(struct cpuinfo_x86 *c)
 {
 	unsigned int dummy, ecx, edx, size;
@@ -250,6 +251,7 @@ void display_cacheinfo(struct cpuinfo_x8
 				              " D cache %uK (%u bytes/line)\n",
 				       edx >> 24, edx & 0xFF, ecx >> 24, ecx & 0xFF);
 			c->x86_cache_size = (ecx >> 24) + (edx >> 24);
+if(ecx >>= 24) l1d_size = ecx;//temp
 		}
 	}
 
@@ -260,6 +262,7 @@ void display_cacheinfo(struct cpuinfo_x8
 
 	size = ecx >> 16;
 	if (size) {
+l2_size =//temp
 		c->x86_cache_size = size;
 
 		if (opt_cpu_info)
--- unstable.orig/xen/arch/x86/cpu/intel_cacheinfo.c	2021-02-25 09:28:14.175636881 +0100
+++ unstable/xen/arch/x86/cpu/intel_cacheinfo.c	2021-02-09 16:20:23.000000000 +0100
@@ -116,6 +116,7 @@ static int find_num_cache_leaves(void)
 	return i;
 }
 
+extern unsigned l1d_size, l2_size;//temp
 void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 {
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
@@ -230,12 +231,14 @@ void init_intel_cacheinfo(struct cpuinfo
 	}
 
 	if (new_l1d)
+l1d_size =//temp
 		l1d = new_l1d;
 
 	if (new_l1i)
 		l1i = new_l1i;
 
 	if (new_l2) {
+l2_size =//temp
 		l2 = new_l2;
 	}
 
--- unstable.orig/xen/arch/x86/mm.c	2021-02-25 09:28:41.215745784 +0100
+++ unstable/xen/arch/x86/mm.c	2021-04-06 15:44:32.478099453 +0200
@@ -284,6 +284,22 @@ static void __init assign_io_page(struct
     page->count_info |= PGC_allocated | 1;
 }
 
+static unsigned __init noinline probe(const unsigned*spc, unsigned nr) {//temp
+#define PAGE_ENTS (PAGE_SIZE / sizeof(*spc))
+    unsigned i, j, acc;
+    for(acc = i = 0; i < PAGE_SIZE / 64; ++i)
+        for(j = 0; j < nr; ++j)
+            acc += spc[j * PAGE_ENTS + ((i * (64 / sizeof(*spc)) * 7) & (PAGE_ENTS - 1))];
+    return acc & (i * nr - 1);
+#undef PAGE_ENTS
+}
+extern void clear_page_stosb(void*);//temp
+extern void clear_page_stosl(void*);//temp
+extern void clear_page_stosq(void*);//temp
+extern void clear_page_avx(void*);//temp
+extern void clear_page_avx512(void*);//temp
+extern void clear_page_clzero(void*);//temp
+unsigned l1d_size = 16 /* KiB, same unit every other writer and reader uses */, l2_size;//temp
 void __init arch_init_memory(void)
 {
     unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
@@ -392,6 +408,67 @@ void __init arch_init_memory(void)
     }
 #endif
 
+{//temp
+    unsigned order = get_order_from_pages(PFN_DOWN(max(l1d_size, l2_size) << 10)) ?: 1; /* both working-set sizes must fit */
+    void*fill = alloc_xenheap_pages(order, 0);
+    void*buf = alloc_xenheap_pages(order - 1, 0);
+    unsigned long cr0 = read_cr0();
+    printk("erms=%d fsrm=%d fzrm=%d fsrs=%d fsrcs=%d l1d=%uk l2=%uk\n",
+           !!boot_cpu_has(X86_FEATURE_ERMS), !!boot_cpu_has(X86_FEATURE_FSRM),
+           !!boot_cpu_has(X86_FEATURE_FZRM), !!boot_cpu_has(X86_FEATURE_FSRS),
+           !!boot_cpu_has(X86_FEATURE_FSRCS), l1d_size, l2_size);
+    clts();
+    for(unsigned pass = 0; pass < 4; ++pass) {
+        printk("L%d w/%s flush:\n", 2 - !(pass & 2), pass & 1 ? "" : "o");
+        wbinvd();
+        for(i = 0; fill && buf && i < 3; ++i) {
+            unsigned nr = PFN_DOWN((pass & 2 ? l2_size : l1d_size) << 10);
+            uint64_t start, pre, clr, post;
+
+#define CHK(kind) do { \
+    /* local_irq_disable(); */ \
+\
+    memset(buf, __LINE__ | (__LINE__ >> 8), nr * PAGE_SIZE / 2); \
+    if(pass & 1) wbinvd(); else mb(); \
+    memset(fill, __LINE__ | (__LINE__ >> 8), nr * PAGE_SIZE); \
+    mb(); \
+\
+    if(boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBPB)) \
+        wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); \
+    start = rdtsc_ordered(); \
+    if(probe(fill, nr)) BUG(); \
+    pre = rdtsc_ordered() - start; \
+\
+    start = rdtsc_ordered(); \
+    for(pfn = 0; pfn < nr / 2; ++pfn) \
+        clear_page_##kind(buf + pfn * PAGE_SIZE); \
+    clr = rdtsc_ordered() - start; \
+\
+    if(boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBPB)) \
+        wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); \
+    start = rdtsc_ordered(); \
+    if(probe(fill, nr)) BUG(); \
+    post = rdtsc_ordered() - start; \
+\
+    /* local_irq_enable(); */ \
+    printk(" pre=%lx " #kind "=%lx post=%lx\n", pre, clr, post); \
+} while(0)
+
+            CHK(sse2);
+            CHK(stosb);
+            CHK(stosl);
+            CHK(stosq);
+            if(boot_cpu_has(X86_FEATURE_AVX)) CHK(avx);
+            if(__GNUC__ > 4 && boot_cpu_has(X86_FEATURE_AVX512F)) CHK(avx512);
+            if(__GNUC__ > 5 && boot_cpu_has(X86_FEATURE_CLZERO)) CHK(clzero);
+
+#undef CHK
+        }
+    }
+    write_cr0(cr0);
+    free_xenheap_pages(buf, order - 1);
+    free_xenheap_pages(fill, order);
+}
     /* Generate a symbol to be used in linker script */
     ASM_CONSTANT(FIXADDR_X_SIZE, FIXADDR_X_SIZE);
 }