// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#include "cpu.h"

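/*
 * Per-CPU sample state: the raw APERF/MPERF values read at the last
 * scheduler tick plus the deltas (acnt/mcnt) accumulated over that tick
 * interval. The seqcount lets arch_freq_get_on_cpu() read a consistent
 * snapshot without blocking the tick-time writer.
 */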
struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

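/*
 * Establish the baseline APERF/MPERF snapshot for this CPU. Also used as
 * the syscore resume hook, since the counters may be reset across
 * suspend/resume.
 */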
static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
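/*
 * Worked example (illustrative numbers, not from any particular CPU): with
 * freq_base = 2.0 GHz and the 4C turbo level at 3.0 GHz, the ratio below
 * becomes 3000 * 1024 / 2000 = 1536. A tick interval observing
 * delta_APERF / delta_MPERF = 1.2 then scales to
 * 1.2 * 1024 * 1024 / 1536 = 819, i.e. ~80% of the assumed freq_max.
 */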

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

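/*
 * Silvermont-based Atoms report the base and 1C turbo ratios in dedicated
 * MSRs (MSR_ATOM_CORE_RATIOS / MSR_ATOM_CORE_TURBO_RATIOS) rather than in
 * MSR_PLATFORM_INFO / MSR_TURBO_RATIO_LIMIT.
 */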
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
	*turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */

	return true;
}

#define X86_MATCH(vfm)						\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

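/*
 * Xeon Phi (KNL/KNM) encodes MSR_TURBO_RATIO_LIMIT as a group 0 max ratio in
 * bits 15:8 followed by 8-bit groups whose top 3 bits hold a ratio delta to
 * subtract from the previous group. Walk the groups until num_delta_fratio
 * non-zero deltas have been consumed.
 */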
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	    /* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

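/*
 * Skylake-X and Goldmont publish up to eight turbo ratio bytes in
 * MSR_TURBO_RATIO_LIMIT with the matching active-core group sizes in
 * MSR_TURBO_RATIO_LIMIT1; pick the ratio of the first group covering at
 * least @size active cores.
 */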
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;    /* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;         /* 4C turbo    */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;         /* 1C turbo    */

	return true;
}

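/*
 * Probe the model specific turbo ratio layouts first and fall back to the
 * common core layout, which reports the 4C turbo level in bits 31:24 of
 * MSR_TURBO_RATIO_LIMIT.
 */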
static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

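/* Enable the static key; callers must hold the CPU hotplug read lock. */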
static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter.  Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

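/*
 * freq_scale = (acnt / mcnt) / (freq_ratio / SCHED_CAPACITY_SCALE)
 *
 * Both the counter ratio and freq_ratio are SCHED_CAPACITY_SHIFT fixed
 * point, so acnt is pre-shifted by 2 * SCHED_CAPACITY_SHIFT to keep the
 * result in SCHED_CAPACITY_SCALE units.
 */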
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

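/*
 * Called from the scheduler tick: snapshot APERF/MPERF, publish the deltas
 * for arch_freq_get_on_cpu() and refresh the frequency-invariance scale.
 */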
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

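	/* Effective kHz over the last tick: cpu_khz * delta_APERF / delta_MPERF */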
	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}
553