On 29.03.23 14:51, Borislav Petkov wrote:
> On Mon, Mar 06, 2023 at 05:34:22PM +0100, Juergen Gross wrote:
>> +struct cache_map {
>> +	u64 start;
>> +	u64 end;
>> +	u8 type;
>> +	bool fixed;
> 
> Can those last two be a single
> 
> 	u64 flags;
> 
> which contains the type and fixed and later perhaps other things so that we can
> have those elements aligned and we don't waste space unnecessarily by having
> a bool for a single bit of information?

Yes, of course.

> 
>> +};
>> +
>> +/*
>> + * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where
>> + * no 2 adjacent ranges have the same cache mode (those would be merged).
>> + * The number is based on the worst case:
>> + * - no two adjacent fixed MTRRs share the same cache mode
>> + * - one variable MTRR is spanning a huge area with mode WB
>> + * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2
>> + *   additional ranges each (result like "ababababa...aba" with a = WB, b = UC),
>> + *   accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries
>> + * - a TOM2 area (even with overlapping an UC MTRR can't add 2 range entries
>> + *   to the possible maximum, as it always starts at 4GB, thus it can't be in
>> + *   the middle of that MTRR, unless that MTRR starts at 0, which would remove
>> + *   the initial "a" from the "abababa" pattern above)
>> + * The map won't contain ranges with no matching MTRR (those fall back to the
>> + * default cache mode).
>> + */
> 
> Nice.
> 
>> +#define CACHE_MAP_MAX	(MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2)
>> +
>> +static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata;
>> +static struct cache_map *cache_map __refdata = init_cache_map;
>> +static unsigned int cache_map_size = CACHE_MAP_MAX;
>> +static unsigned int cache_map_n;
>> +static unsigned int cache_map_fixed;
>> +
>>  static unsigned long smp_changes_mask;
>>  static int mtrr_state_set;
>>  u64 mtrr_tom2;
>> @@ -78,6 +109,20 @@ static u64 get_mtrr_size(u64 mask)
>>  	return size;
>>  }
>>  
>> +static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size)
>> +{
>> +	struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg;
>> +
>> +	if (!(mtrr->mask_lo & (1 << 11)))
> 
> I'm guessing that's the
> 
> 	MTRR Pair Enable
> 
> bit?
> 
> Use a macro with a proper name pls.

Okay.

> 
>> +		return MTRR_TYPE_INVALID;
>> +
>> +	*start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK);
>> +	*size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) +
>> +			      (mtrr->mask_lo & PAGE_MASK));
>> +
>> +	return mtrr->base_lo & 0xff;
>> +}
>> +
>>  static u8 get_effective_type(u8 type1, u8 type2)
>>  {
>>  	if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
>> @@ -241,6 +286,211 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
>>  	return mtrr_state.def_type;
>>  }
>>  
>> +static void rm_map_entry_at(int idx)
>> +{
>> +	int i;
>> +
>> +	for (i = idx; i < cache_map_n - 1; i++)
>> +		cache_map[i] = cache_map[i + 1];
> 
> That can be a memmove, I think. Something like
> 
> 	memmove((void *)&cache_map[i],
> 		(void *)&cache_map[i + 1],
> 		(cache_map_n - idx) * sizeof(struct cache_map));

Okay.

> 
> 
>> +
>> +	cache_map_n--;
>> +}
>> +
>> +/*
>> + * Add an entry into cache_map at a specific index.
>> + * Merges adjacent entries if appropriate.
>> + * Return the number of merges for correcting the scan index.
> 
> Make that a block comment:
> 
>  * Add an entry into cache_map at a specific index. Merges adjacent entries if
>  * appropriate. Return the number of merges for correcting the scan index.

Okay.
> 
> vim, for example, has the cool "gq}" command for that.
> 
>> + */
>> +static int add_map_entry_at(u64 start, u64 end, u8 type, int idx)
>> +{
>> +	bool merge_prev, merge_next;
>> +	int i;
>> +
>> +	if (start >= end)
>> +		return 0;
>> +
>> +	merge_prev = (idx > 0 && !cache_map[idx - 1].fixed &&
>> +		      start == cache_map[idx - 1].end &&
>> +		      type == cache_map[idx - 1].type);
>> +	merge_next = (idx < cache_map_n && !cache_map[idx].fixed &&
>> +		      end == cache_map[idx].start &&
>> +		      type == cache_map[idx].type);
> 
> Uuh, head hurts. How about:
> 
> 	bool merge_prev = false, merge_next = false;
> 
> 	...
> 
> 	if (idx > 0) {
> 		struct cache_map *prev = &cache_map[idx - 1];
> 
> 		if (!prev->fixed &&
> 		    prev->end == start &&
> 		    prev->type == type)
> 			merge_prev = true;
> 	}
> 
> Untested ofc, but you get the idea. It is a lot more readable this way. And the
> same with merge_next.

Fine with me.

> 
>> +
>> +	if (merge_prev && merge_next) {
>> +		cache_map[idx - 1].end = cache_map[idx].end;
>> +		rm_map_entry_at(idx);
>> +		return 2;
>> +	}
>> +	if (merge_prev) {
>> +		cache_map[idx - 1].end = end;
>> +		return 1;
>> +	}
>> +	if (merge_next) {
>> +		cache_map[idx].start = start;
>> +		return 1;
>> +	}
>> +
>> +	/* Sanity check: the array should NEVER be too small! */
>> +	if (cache_map_n == cache_map_size) {
>> +		WARN(1, "MTRR cache mode memory map exhausted!\n");
>> +		cache_map_n = cache_map_fixed;
>> +		return 0;
>> +	}
>> +
>> +	for (i = cache_map_n; i > idx; i--)
>> +		cache_map[i] = cache_map[i - 1];
> 
> memmove as above.

Okay.

> 
>> +
>> +	cache_map[idx].start = start;
>> +	cache_map[idx].end = end;
>> +	cache_map[idx].type = type;
>> +	cache_map[idx].fixed = false;
>> +	cache_map_n++;
>> +
>> +	return 0;
>> +}
>> +
>> +/* Clear a part of an entry. Return 1 if start of entry is still valid. */
>> +static int clr_map_range_at(u64 start, u64 end, int idx)
>> +{
>> +	int ret = start != cache_map[idx].start;
>> +	u64 tmp;
>> +
>> +	if (start == cache_map[idx].start && end == cache_map[idx].end) {
>> +		rm_map_entry_at(idx);
>> +	} else if (start == cache_map[idx].start) {
>> +		cache_map[idx].start = end;
>> +	} else if (end == cache_map[idx].end) {
>> +		cache_map[idx].end = start;
>> +	} else {
>> +		tmp = cache_map[idx].end;
>> +		cache_map[idx].end = start;
>> +		add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1);
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static void add_map_entry(u64 start, u64 end, u8 type)
>> +{
>> +	int i;
>> +	u8 new_type, old_type;
>> +	u64 tmp;
> 
> "int i;" goes here.

Okay.

> 
>> +
>> +	for (i = 0; i < cache_map_n && start < end; i++) {
>> +		if (start >= cache_map[i].end)
>> +			continue;
>> +
>> +		if (start < cache_map[i].start) {
>> +			/* Region start has no overlap. */
>> +			tmp = min(end, cache_map[i].start);
>> +			i -= add_map_entry_at(start, tmp, type, i);
> 
> Uff, what happens if i == 0 and becomes negative here?
> 
> Can that even happen?

No. :-)

> 
> This feels weird: using function retvals as index var decrements. I have
> only a vague idea what you're doing in this function but it feels weird.
> Maybe I need to play through an example to grok it better...

The final form of the code is the result of an iterative process. :-) It
looked much worse in the beginning.

> 
>> +			start = tmp;
>> +			continue;
>> +		}
>> +
>> +		new_type = get_effective_type(type, cache_map[i].type);
>> +		old_type = cache_map[i].type;
>> +
>> +		if (cache_map[i].fixed || new_type == old_type) {
>> +			/* Cut off start of new entry. */
>> +			start = cache_map[i].end;
>> +			continue;
>> +		}
>> +
>> +		tmp = min(end, cache_map[i].end);
>> +		i += clr_map_range_at(start, tmp, i);
>> +		i -= add_map_entry_at(start, tmp, new_type, i);
>> +		start = tmp;
>> +	}
>> +
>> +	add_map_entry_at(start, end, type, i);
>> +}
>> +
>> +/* Add variable MTRRs to cache map. */
>> +static void map_add_var(void)
>> +{
>> +	unsigned int i;
>> +	u64 start, size;
>> +	u8 type;
>> +
>> +	/* Add AMD magic MTRR. */
> 
> Why magic?

I've reused the wording from cleanup.c (just above amd_special_default_mtrr()).

> 
>> +	if (mtrr_tom2) {
>> +		add_map_entry(1ULL << 32, mtrr_tom2 - 1, MTRR_TYPE_WRBACK);
> 
> BIT_ULL(32)

Okay.

> 
>> +		cache_map[cache_map_n - 1].fixed = true;
>> +	}
>> +
>> +	for (i = 0; i < num_var_ranges; i++) {
>> +		type = get_var_mtrr_state(i, &start, &size);
>> +		if (type != MTRR_TYPE_INVALID)
>> +			add_map_entry(start, start + size, type);
>> +	}
>> +}
>> +
>> +/* Rebuild map by replacing variable entries. */
>> +static void rebuild_map(void)
>> +{
>> +	cache_map_n = cache_map_fixed;
>> +
>> +	map_add_var();
>> +}
>> +
>> +/* Build the cache_map containing the cache modes per memory range. */
>> +void mtrr_build_map(void)
>> +{
>> +	unsigned int i;
>> +	u64 start, end, size;
>> +	u8 type;
>> +
>> +	if (!mtrr_state.enabled)
>> +		return;
>> +
>> +	/* Add fixed MTRRs, optimize for adjacent entries with same type. */
>> +	if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) {
>> +		start = 0;
>> +		end = size = 0x10000;
>> +		type = mtrr_state.fixed_ranges[0];
>> +
>> +		for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) {
>> +			if (i == 8 || i == 24)
>> +				size >>= 2;
>> +
>> +			if (mtrr_state.fixed_ranges[i] != type) {
>> +				add_map_entry(start, end, type);
>> +				start = end;
>> +				type = mtrr_state.fixed_ranges[i];
>> +			}
>> +			end += size;
>> +		}
>> +		add_map_entry(start, end, type);
>> +	}
>> +
>> +	/* Mark fixed and magic MTRR as fixed, they take precedence. */
>> +	for (i = 0; i < cache_map_n; i++)
>> +		cache_map[i].fixed = true;
>> +	cache_map_fixed = cache_map_n;
>> +
>> +	map_add_var();
> 
> I think it would be useful to issue some stats here:
> 
> 	pr_info("MTRR map: ... size: ..., ... entries: ..., memory used: ....");
> 
> and so on so that we can get some feedback when that happens. We can always
> drop it later but for the initial runs it would be prudent to have that.

Fine with me.

> 
>> +/* Copy the cache_map from __initdata memory to dynamically allocated one. */
>> +void __init mtrr_copy_map(void)
>> +{
>> +	unsigned int new_size = cache_map_fixed + 2 * num_var_ranges;
>> +
>> +	if (!mtrr_state.enabled || !new_size) {
>> +		cache_map = NULL;
>> +		return;
>> +	}
>> +
>> +	mutex_lock(&mtrr_mutex);
>> +
>> +	cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL);
>> +	memmove(cache_map, init_cache_map, cache_map_n * sizeof(*cache_map));
>> +	cache_map_size = new_size;
>> +
>> +	mutex_unlock(&mtrr_mutex);
>> +}
>> +
>>  /**
>>   * mtrr_overwrite_state - set static MTRR state
>>   *
>> @@ -815,6 +1065,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
>>  
>>  	cache_enable();
>>  	local_irq_restore(flags);
>> +
>> +	/* On the first cpu rebuild the cache mode memory map. */
> 
> s/cpu/CPU/
> 
>> +	if (smp_processor_id() == cpumask_first(cpu_online_mask))
> 
> Why not in mtrr_bp_init()? That is the first CPU.

Yeah, but generic_set_mtrr() can be called after boot, too.
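E.g. a driver adding a write-combining mapping for a framebuffer BAR at
runtime goes through mtrr_add() and thus set_mtrr() on all CPUs. A rough,
hypothetical sketch of such a caller (names made up, not part of this patch):

	#include <linux/io.h>

	/* Hypothetical example: enable WC for a framebuffer BAR at runtime. */
	static int wc_cookie;

	static int example_enable_wc(unsigned long base, unsigned long size)
	{
		/* Uses an MTRR only when PAT isn't available for WC. */
		wc_cookie = arch_phys_wc_add(base, size);
		return wc_cookie < 0 ? wc_cookie : 0;
	}

	static void example_disable_wc(void)
	{
		arch_phys_wc_del(wc_cookie);
	}

So the map has to be rebuilt on each MTRR update, not only at boot.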
> 
>> +		rebuild_map();
>>  }
>>  
>>  int generic_validate_add_page(unsigned long base, unsigned long size,
>> diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
>> index 50cd2287b6e1..1dbb9fdfd87b 100644
>> --- a/arch/x86/kernel/cpu/mtrr/mtrr.c
>> +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
>> @@ -65,7 +65,7 @@ static bool mtrr_enabled(void)
>>  }
>>  
>>  unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
>> -static DEFINE_MUTEX(mtrr_mutex);
>> +DEFINE_MUTEX(mtrr_mutex);
>>  
>>  u64 size_or_mask, size_and_mask;
>>  
>> @@ -668,6 +668,7 @@ void __init mtrr_bp_init(void)
>>  		/* Software overwrite of MTRR state, only for generic case. */
>>  		mtrr_calc_physbits(true);
>>  		init_table();
>> +		mtrr_build_map();
>>  		pr_info("MTRRs set to read-only\n");
>>  
>>  		return;
>> @@ -705,6 +706,7 @@ void __init mtrr_bp_init(void)
>>  	if (get_mtrr_state()) {
>>  		memory_caching_control |= CACHE_MTRR;
>>  		changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr);
>> +		mtrr_build_map();
>>  	} else {
>>  		mtrr_if = NULL;
>>  		why = "by BIOS";
>> @@ -733,6 +735,8 @@ void mtrr_save_state(void)
>>  
>>  static int __init mtrr_init_finialize(void)
>>  {
>> +	mtrr_copy_map();
> 
> Move that...
> 
>> +
>>  	if (!mtrr_enabled())
>>  		return 0;
> 
> ... here.

Umm, not really. I want to do the copy even in the Xen PV case.

> 
> So yeah, I like the general direction but this is a complex patch. Lemme
> add some printks to it in order to get a better idea of what happens.

Yeah, those were part of my iterative process mentioned above.


Juergen
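For illustration, the stats line suggested further up could take a shape like
the following, reusing the counters introduced by the patch (a sketch only;
the exact fields and wording are assumptions, not necessarily what a later
revision prints):

	pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n",
		cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed,
		cache_map_size, num_var_ranges);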