// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					       arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
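
/*
 * Illustrative example of the ratio units (made-up numbers, not tied to any
 * specific SKU): with freq_base = 2000 MHz and a 4C turbo of 3000 MHz, the
 * value kept in arch_turbo_freq_ratio above is
 * 3000 * SCHED_CAPACITY_SCALE / 2000 = 1536. When turbo is disabled,
 * arch_max_freq_ratio falls back to SCHED_CAPACITY_SCALE (1024), i.e.
 * freq_curr is then measured against freq_base alone.
 */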

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */

	return true;
}

#define X86_MATCH(vfm)						\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo    */

	return true;
}
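
/*
 * Illustrative decode of the fields used by core_set_max_freq_ratio() above
 * (example values only): MSR_PLATFORM_INFO[15:8] holds the maximum non-turbo
 * ratio and MSR_TURBO_RATIO_LIMIT[31:24] the 4-core turbo ratio, e.g. 24 and
 * 36. Since only the quotient turbo_freq / base_freq feeds
 * arch_turbo_freq_ratio, the bus clock that would convert a ratio into MHz
 * cancels out and never needs to be read.
 */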

static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
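
/*
 * Illustrative example of the per-CPU values stored above (made-up numbers):
 * if a caller passes cap = 40, max_cap = 53, cap_freq = 3800 MHz and
 * base_freq = 2000 MHz, arch_set_cpu_capacity() below records
 * capacity = 40 * 1024 / 53 ~= 772 and freq_ratio = 3800 * 1024 / 2000 = 1945;
 * the latter replaces the package-wide arch_max_freq_ratio in
 * scale_freq_tick() for that CPU.
 */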

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
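
/*
 * Worked example of the scale_freq_tick() math above (illustrative numbers):
 * with arch_max_freq_ratio = 1536 (4C turbo at 1.5x base), a tick that sees
 * acnt = 1400 and mcnt = 1000 computes
 * freq_scale = (1400 << 20) / (1000 * 1536) ~= 955, i.e. the CPU ran at
 * roughly 955/1024 of the assumed maximum frequency during that tick.
 */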

void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}
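
/*
 * Worked example for arch_freq_get_on_cpu() above (illustrative numbers):
 * with cpu_khz = 2400000 and a fresh sample where acnt = 1250 and
 * mcnt = 1000, scaling_cur_freq reports 2400000 * 1250 / 1000 = 3000000 kHz.
 * A sample older than MAX_SAMPLE_AGE (20ms) falls back to
 * cpufreq_quick_get(), or to cpu_khz if that returns 0.
 */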