// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency-invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */
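/*
 * Worked example with made-up numbers: on a part with a 2000 MHz base
 * frequency and a 3000 MHz 4C turbo ratio, arch_turbo_freq_ratio becomes
 * 3000 * SCHED_CAPACITY_SCALE / 2000 = 1536. Running flat out at 2500 MHz
 * then yields freq_curr / freq_max = 2500 / 3000 ~= 0.83, i.e. a scale
 * factor of roughly 853 out of 1024.
 */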

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

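/*
 * Silvermont Atom: the base ratio lives in MSR_ATOM_CORE_RATIOS[21:16] and
 * the 1C turbo ratio in MSR_ATOM_CORE_TURBO_RATIOS[5:0], matching the Atom
 * exception described above.
 */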
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
	*turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */

	return true;
}

#define X86_MATCH(vfm)						\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

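/*
 * Knights Landing/Mill: bits 15:8 of MSR_TURBO_RATIO_LIMIT hold the starting
 * turbo ratio; each subsequent byte carries a 3-bit ratio delta in its upper
 * bits. Walk the bytes, subtracting every non-zero delta, and stop once
 * num_delta_fratio deltas have been applied.
 */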
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	    /* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

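/*
 * Skylake-X (also used for Goldmont with size == 1): MSR_TURBO_RATIO_LIMIT1
 * holds the number of active cores per turbo bucket and MSR_TURBO_RATIO_LIMIT
 * the matching ratios; return the ratio of the first bucket covering at least
 * 'size' cores.
 */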
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

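/*
 * Generic Core path: base ratio from MSR_PLATFORM_INFO[15:8], turbo ratio
 * from the 4-core group in MSR_TURBO_RATIO_LIMIT[31:24], with the 1C group
 * as fallback when the 4C field is zero.
 */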
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;    /* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;         /* 4C turbo    */

	/* The CPU may have fewer than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;         /* 1C turbo    */

	return true;
}

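/*
 * Try the model-specific decoders in order: Silvermont Atom first, then the
 * Goldmont, Knights Landing and Skylake-X quirks, and finally the generic
 * Core layout. Any hit falls through to the common ratio computation below.
 */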
static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

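/*
 * With acnt/mcnt being the APERF/MPERF deltas of the last tick and
 * arch_max_freq_ratio = freq_max / freq_base * SCHED_CAPACITY_SCALE, the
 * computation below boils down to:
 *
 *   arch_freq_scale = acnt / mcnt * freq_base / freq_max * SCHED_CAPACITY_SCALE
 *
 * i.e. freq_curr / freq_max in fixed point, clamped to SCHED_CAPACITY_SCALE.
 */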
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

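/*
 * Scheduler tick hook: sample APERF/MPERF, publish the per-tick deltas under
 * the seqcount for arch_freq_get_on_cpu() and feed them into the frequency
 * invariance scale factor.
 */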
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

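/*
 * Lockless read of the last tick's sample; the effective frequency in kHz is
 * cpu_khz * acnt / mcnt. Stale or missing samples fall back to
 * cpufreq_quick_get(), and finally to cpu_khz itself.
 */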
unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}