// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */
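/*
 * Worked example with purely illustrative numbers, not taken from any real
 * SKU: a base (max non-turbo) ratio of 20 (2.0 GHz) and a 4C turbo ratio of
 * 30 (3.0 GHz) give
 *
 *            arch_turbo_freq_ratio = 30 * SCHED_CAPACITY_SCALE / 20 = 1536
 *
 * A tick interval spent entirely at 3.0 GHz then scales to ~1024 (full
 * capacity), one spent at the 2.0 GHz base to ~682, and anything above the
 * assumed 4C turbo level is clipped to SCHED_CAPACITY_SCALE further down in
 * scale_freq_tick().
 */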
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo */

	return true;
}

#define X86_MATCH(vfm)						\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo */

	/* The CPU may have fewer than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo */

	return true;
}
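/*
 * Decoding example with a made-up register value, purely for illustration:
 * if the low 32 bits of MSR_TURBO_RATIO_LIMIT read 0x23252729,
 * core_set_max_freq_ratio() above picks (msr >> 24) & 0xFF = 0x23 (a 4C
 * turbo ratio of 35) and would fall back to msr & 0xFF = 0x29 (the 1C ratio
 * of 41) only if the 4C field were zero. The value is not taken from any
 * real part.
 */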
static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
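/*
 * Illustrative usage sketch, not lifted from any particular driver: a
 * hybrid-aware cpufreq driver could plausibly wire up the two functions
 * below as
 *
 *	if (arch_enable_hybrid_capacity_scale()) {
 *		for_each_online_cpu(cpu)
 *			arch_set_cpu_capacity(cpu, perf_of(cpu), max_perf,
 *					      cap_freq_of(cpu), base_freq_of(cpu));
 *		arch_rebuild_sched_domains();
 *	}
 *
 * where perf_of(), max_perf, cap_freq_of() and base_freq_of() are
 * placeholders for whatever per-CPU performance data the driver has; they
 * are not defined here.
 */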
/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
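/*
 * arch_scale_freq_tick() below updates the per-CPU sample under the
 * seqcount; arch_freq_get_on_cpu() reads it locklessly. A worked example
 * with made-up numbers: assuming cpu_khz = 2000000 (a 2.0 GHz base clock)
 * and a tick interval in which APERF advanced 1.5 times as much as MPERF,
 * the reader reports
 *
 *            cpu_khz * acnt / mcnt = 2000000 * 3 / 2 = 3000000 kHz,
 *
 * i.e. an effective 3.0 GHz.
 */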
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}
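/*
 * As noted at the top of this file, the value computed by
 * arch_freq_get_on_cpu() is what cpufreq exposes through
 * /sys/.../cpufreq/scaling_cur_freq. Purely as an illustration (the number
 * shown is made up and depends on load and the platform's turbo range):
 *
 *	$ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq
 *	2899999
 */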