// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *	BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */
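/*
 * Illustrative example (the numbers below are made up, not tied to any
 * particular CPU): with freq_base = 2000 MHz and a tick interval in which
 * delta_APERF = 3 and delta_MPERF = 2, BusyMHz = 3/2 * 2000 = 3000 MHz.
 * With freq_max set to a 3500 MHz 4C turbo ratio, the resulting scale is
 * 3000 / 3500 ~= 0.857, i.e. ~877 out of SCHED_CAPACITY_SCALE (1024).
 */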
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					       arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo */

	return true;
}

#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo */

	return true;
}
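/*
 * Probe base and turbo ratios using the decoders above, most specific
 * first: the Silvermont Atom core ratio MSRs, Goldmont (1-core turbo
 * group), Xeon Phi (KNL/KNM), Skylake-X (4-core turbo group) and finally
 * the generic Core layout of MSR_TURBO_RATIO_LIMIT.
 */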
static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
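/**
 * struct arch_hybrid_cpu_scale - Per-CPU data for hybrid CPU capacity scaling
 * @capacity: Capacity of the CPU relative to the most capable CPU in the
 *	system, on the SCHED_CAPACITY_SCALE scale.
 * @freq_ratio: Per-CPU counterpart of arch_max_freq_ratio, consumed by
 *	scale_freq_tick().
 *
 * Both fields are written by arch_set_cpu_capacity().
 */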
struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
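/*
 * Called from the scheduler tick: snapshot APERF/MPERF, publish the deltas
 * since the previous tick under the seqcount for arch_freq_get_on_cpu(),
 * and update the frequency-invariance scale factor.
 */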
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}