1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_pstate.c: Native P state management for Intel processors 4 * 5 * (C) Copyright 2012 Intel Corporation 6 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/kernel_stat.h> 13 #include <linux/module.h> 14 #include <linux/ktime.h> 15 #include <linux/hrtimer.h> 16 #include <linux/tick.h> 17 #include <linux/slab.h> 18 #include <linux/sched/cpufreq.h> 19 #include <linux/sched/smt.h> 20 #include <linux/list.h> 21 #include <linux/cpu.h> 22 #include <linux/cpufreq.h> 23 #include <linux/sysfs.h> 24 #include <linux/types.h> 25 #include <linux/fs.h> 26 #include <linux/acpi.h> 27 #include <linux/vmalloc.h> 28 #include <linux/pm_qos.h> 29 #include <linux/bitfield.h> 30 #include <trace/events/power.h> 31 #include <linux/units.h> 32 33 #include <asm/cpu.h> 34 #include <asm/div64.h> 35 #include <asm/msr.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/cpufeature.h> 38 #include <asm/intel-family.h> 39 #include "../drivers/thermal/intel/thermal_interrupt.h" 40 41 #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) 42 43 #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 44 #define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 45 #define INTEL_CPUFREQ_TRANSITION_DELAY 500 46 47 #ifdef CONFIG_ACPI 48 #include <acpi/processor.h> 49 #include <acpi/cppc_acpi.h> 50 #endif 51 52 #define FRAC_BITS 8 53 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) 54 #define fp_toint(X) ((X) >> FRAC_BITS) 55 56 #define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3)) 57 58 #define EXT_BITS 6 59 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) 60 #define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS) 61 #define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS) 62 63 static inline int32_t mul_fp(int32_t x, int32_t y) 64 { 65 return ((int64_t)x * (int64_t)y) >> FRAC_BITS; 66 } 67 68 static inline int32_t div_fp(s64 x, s64 y) 69 { 70 return div64_s64((int64_t)x << FRAC_BITS, y); 71 } 72 73 static inline int ceiling_fp(int32_t x) 74 { 75 int mask, ret; 76 77 ret = fp_toint(x); 78 mask = (1 << FRAC_BITS) - 1; 79 if (x & mask) 80 ret += 1; 81 return ret; 82 } 83 84 static inline u64 mul_ext_fp(u64 x, u64 y) 85 { 86 return (x * y) >> EXT_FRAC_BITS; 87 } 88 89 static inline u64 div_ext_fp(u64 x, u64 y) 90 { 91 return div64_u64(x << EXT_FRAC_BITS, y); 92 } 93 94 /** 95 * struct sample - Store performance sample 96 * @core_avg_perf: Ratio of APERF/MPERF which is the actual average 97 * performance during last sample period 98 * @busy_scaled: Scaled busy value which is used to calculate next 99 * P state. This can be different than core_avg_perf 100 * to account for cpu idle period 101 * @aperf: Difference of actual performance frequency clock count 102 * read from APERF MSR between last and current sample 103 * @mperf: Difference of maximum performance frequency clock count 104 * read from MPERF MSR between last and current sample 105 * @tsc: Difference of time stamp counter between last and 106 * current sample 107 * @time: Current time from scheduler 108 * 109 * This structure is used in the cpudata structure to store performance sample 110 * data for choosing next P State. 
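 *
 * Note on the fixed-point helpers defined above: with FRAC_BITS = 8, a
 * fraction is stored scaled by 256.  Purely as an illustration (made-up
 * numbers, not from any real trace), div_fp(3, 4) = (3 << 8) / 4 = 192,
 * which represents 0.75, and fp_toint(192) truncates it back to 0.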
111 */ 112 struct sample { 113 int32_t core_avg_perf; 114 int32_t busy_scaled; 115 u64 aperf; 116 u64 mperf; 117 u64 tsc; 118 u64 time; 119 }; 120 121 /** 122 * struct pstate_data - Store P state data 123 * @current_pstate: Current requested P state 124 * @min_pstate: Min P state possible for this platform 125 * @max_pstate: Max P state possible for this platform 126 * @max_pstate_physical:This is physical Max P state for a processor 127 * This can be higher than the max_pstate which can 128 * be limited by platform thermal design power limits 129 * @perf_ctl_scaling: PERF_CTL P-state to frequency scaling factor 130 * @scaling: Scaling factor between performance and frequency 131 * @turbo_pstate: Max Turbo P state possible for this platform 132 * @min_freq: @min_pstate frequency in cpufreq units 133 * @max_freq: @max_pstate frequency in cpufreq units 134 * @turbo_freq: @turbo_pstate frequency in cpufreq units 135 * 136 * Stores the per cpu model P state limits and current P state. 137 */ 138 struct pstate_data { 139 int current_pstate; 140 int min_pstate; 141 int max_pstate; 142 int max_pstate_physical; 143 int perf_ctl_scaling; 144 int scaling; 145 int turbo_pstate; 146 unsigned int min_freq; 147 unsigned int max_freq; 148 unsigned int turbo_freq; 149 }; 150 151 /** 152 * struct vid_data - Stores voltage information data 153 * @min: VID data for this platform corresponding to 154 * the lowest P state 155 * @max: VID data corresponding to the highest P State. 156 * @turbo: VID data for turbo P state 157 * @ratio: Ratio of (vid max - vid min) / 158 * (max P state - Min P State) 159 * 160 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) 161 * This data is used in Atom platforms, where in addition to target P state, 162 * the voltage data needs to be specified to select next P State. 163 */ 164 struct vid_data { 165 int min; 166 int max; 167 int turbo; 168 int32_t ratio; 169 }; 170 171 /** 172 * struct global_params - Global parameters, mostly tunable via sysfs. 173 * @no_turbo: Whether or not to use turbo P-states. 174 * @turbo_disabled: Whether or not turbo P-states are available at all, 175 * based on the MSR_IA32_MISC_ENABLE value and whether or 176 * not the maximum reported turbo P-state is different from 177 * the maximum reported non-turbo one. 178 * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo 179 * P-state capacity. 180 * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo 181 * P-state capacity. 182 */ 183 struct global_params { 184 bool no_turbo; 185 bool turbo_disabled; 186 int max_perf_pct; 187 int min_perf_pct; 188 }; 189 190 /** 191 * struct cpudata - Per CPU instance data storage 192 * @cpu: CPU number for this instance data 193 * @policy: CPUFreq policy value 194 * @update_util: CPUFreq utility callback information 195 * @update_util_set: CPUFreq utility callback is set 196 * @iowait_boost: iowait-related boost fraction 197 * @last_update: Time of the last update. 
198 * @pstate: Stores P state limits for this CPU 199 * @vid: Stores VID limits for this CPU 200 * @last_sample_time: Last Sample time 201 * @aperf_mperf_shift: APERF vs MPERF counting frequency difference 202 * @prev_aperf: Last APERF value read from APERF MSR 203 * @prev_mperf: Last MPERF value read from MPERF MSR 204 * @prev_tsc: Last timestamp counter (TSC) value 205 * @sample: Storage for storing last Sample data 206 * @min_perf_ratio: Minimum capacity in terms of PERF or HWP ratios 207 * @max_perf_ratio: Maximum capacity in terms of PERF or HWP ratios 208 * @acpi_perf_data: Stores ACPI perf information read from _PSS 209 * @valid_pss_table: Set to true for valid ACPI _PSS entries found 210 * @epp_powersave: Last saved HWP energy performance preference 211 * (EPP) or energy performance bias (EPB), 212 * when policy switched to performance 213 * @epp_policy: Last saved policy used to set EPP/EPB 214 * @epp_default: Power on default HWP energy performance 215 * preference/bias 216 * @epp_cached: Cached HWP energy-performance preference value 217 * @hwp_req_cached: Cached value of the last HWP Request MSR 218 * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR 219 * @last_io_update: Last time when IO wake flag was set 220 * @capacity_perf: Highest perf used for scale invariance 221 * @sched_flags: Store scheduler flags for possible cross CPU update 222 * @hwp_boost_min: Last HWP boosted min performance 223 * @suspended: Whether or not the driver has been suspended. 224 * @pd_registered: Set when a perf domain is registered for this CPU. 225 * @hwp_notify_work: workqueue for HWP notifications. 226 * 227 * This structure stores per CPU instance data for all CPUs. 228 */ 229 struct cpudata { 230 int cpu; 231 232 unsigned int policy; 233 struct update_util_data update_util; 234 bool update_util_set; 235 236 struct pstate_data pstate; 237 struct vid_data vid; 238 239 u64 last_update; 240 u64 last_sample_time; 241 u64 aperf_mperf_shift; 242 u64 prev_aperf; 243 u64 prev_mperf; 244 u64 prev_tsc; 245 struct sample sample; 246 int32_t min_perf_ratio; 247 int32_t max_perf_ratio; 248 #ifdef CONFIG_ACPI 249 struct acpi_processor_performance acpi_perf_data; 250 bool valid_pss_table; 251 #endif 252 unsigned int iowait_boost; 253 s16 epp_powersave; 254 s16 epp_policy; 255 s16 epp_default; 256 s16 epp_cached; 257 u64 hwp_req_cached; 258 u64 hwp_cap_cached; 259 u64 last_io_update; 260 unsigned int capacity_perf; 261 unsigned int sched_flags; 262 u32 hwp_boost_min; 263 bool suspended; 264 #ifdef CONFIG_ENERGY_MODEL 265 bool pd_registered; 266 #endif 267 struct delayed_work hwp_notify_work; 268 }; 269 270 static struct cpudata **all_cpu_data; 271 272 /** 273 * struct pstate_funcs - Per CPU model specific callbacks 274 * @get_max: Callback to get maximum non turbo effective P state 275 * @get_max_physical: Callback to get maximum non turbo physical P state 276 * @get_min: Callback to get minimum P state 277 * @get_turbo: Callback to get turbo P state 278 * @get_scaling: Callback to get frequency scaling factor 279 * @get_cpu_scaling: Get frequency scaling factor for a given cpu 280 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference 281 * @get_val: Callback to convert P state to actual MSR write value 282 * @get_vid: Callback to get VID data for Atom platforms 283 * 284 * Core and Atom CPU models have different way to get P State limits. This 285 * structure is used to store those callbacks. 
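 *
 * For example, on most Core platforms these callbacks are expected to be
 * wired up to core_get_max_pstate(), core_get_turbo_pstate(),
 * core_get_scaling() and core_get_val() defined below, while Atom
 * platforms additionally supply a get_vid() callback (atom_get_vid()).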
 */
struct pstate_funcs {
	int (*get_max)(int cpu);
	int (*get_max_physical)(int cpu);
	int (*get_min)(int cpu);
	int (*get_turbo)(int cpu);
	int (*get_scaling)(void);
	int (*get_cpu_scaling)(int cpu);
	int (*get_aperf_mperf_shift)(void);
	u64 (*get_val)(struct cpudata*, int pstate);
	void (*get_vid)(struct cpudata *);
};

static struct pstate_funcs pstate_funcs __read_mostly;

static bool hwp_active __ro_after_init;
static int hwp_mode_bdw __ro_after_init;
static bool per_cpu_limits __ro_after_init;
static bool hwp_forced __ro_after_init;
static bool hwp_boost __read_mostly;
static bool hwp_is_hybrid;

static struct cpufreq_driver *intel_pstate_driver __read_mostly;

#define INTEL_PSTATE_CORE_SCALING	100000
#define HYBRID_SCALING_FACTOR_ADL	78741
#define HYBRID_SCALING_FACTOR_MTL	80000
#define HYBRID_SCALING_FACTOR_LNL	86957

static int hybrid_scaling_factor;

static inline int core_get_scaling(void)
{
	return INTEL_PSTATE_CORE_SCALING;
}

#ifdef CONFIG_ACPI
static bool acpi_ppc;
#endif

static struct global_params global;

static DEFINE_MUTEX(intel_pstate_driver_lock);
static DEFINE_MUTEX(intel_pstate_limits_lock);

#ifdef CONFIG_ACPI

static bool intel_pstate_acpi_pm_profile_server(void)
{
	if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
	    acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
		return true;

	return false;
}

static bool intel_pstate_get_ppc_enable_status(void)
{
	if (intel_pstate_acpi_pm_profile_server())
		return true;

	return acpi_ppc;
}

#ifdef CONFIG_ACPI_CPPC_LIB

/* The work item is needed to avoid CPU hotplug locking issues */
static void intel_pstate_sched_itmt_work_fn(struct work_struct *work)
{
	sched_set_itmt_support();
}

static DECLARE_WORK(sched_itmt_work, intel_pstate_sched_itmt_work_fn);

#define CPPC_MAX_PERF	U8_MAX

static void intel_pstate_set_itmt_prio(int cpu)
{
	struct cppc_perf_caps cppc_perf;
	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
	int ret;

	ret = cppc_get_perf_caps(cpu, &cppc_perf);
	/*
	 * If CPPC is not available, fall back to MSR_HWP_CAPABILITIES bits [8:0].
	 *
	 * Also, on some systems with overclocking enabled, CPPC.highest_perf is
	 * hardcoded to 0xff, so CPPC.highest_perf cannot be used to enable ITMT.
	 * Fall back to MSR_HWP_CAPABILITIES then too.
	 */
	if (ret || cppc_perf.highest_perf == CPPC_MAX_PERF)
		cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached));

	/*
	 * The priorities can be set regardless of whether or not
	 * sched_set_itmt_support(true) has been called and it is valid to
	 * update them at any time after it has been called.
	 */
	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);

	if (max_highest_perf <= min_highest_perf) {
		if (cppc_perf.highest_perf > max_highest_perf)
			max_highest_perf = cppc_perf.highest_perf;

		if (cppc_perf.highest_perf < min_highest_perf)
			min_highest_perf = cppc_perf.highest_perf;

		if (max_highest_perf > min_highest_perf) {
			/*
			 * This code can be run during CPU online under the
			 * CPU hotplug locks, so sched_set_itmt_support()
			 * cannot be called from here.  Queue up a work item
			 * to invoke it.
399 */ 400 schedule_work(&sched_itmt_work); 401 } 402 } 403 } 404 405 static int intel_pstate_get_cppc_guaranteed(int cpu) 406 { 407 struct cppc_perf_caps cppc_perf; 408 int ret; 409 410 ret = cppc_get_perf_caps(cpu, &cppc_perf); 411 if (ret) 412 return ret; 413 414 if (cppc_perf.guaranteed_perf) 415 return cppc_perf.guaranteed_perf; 416 417 return cppc_perf.nominal_perf; 418 } 419 420 static int intel_pstate_cppc_get_scaling(int cpu) 421 { 422 struct cppc_perf_caps cppc_perf; 423 424 /* 425 * Compute the perf-to-frequency scaling factor for the given CPU if 426 * possible, unless it would be 0. 427 */ 428 if (!cppc_get_perf_caps(cpu, &cppc_perf) && 429 cppc_perf.nominal_perf && cppc_perf.nominal_freq) 430 return div_u64(cppc_perf.nominal_freq * KHZ_PER_MHZ, 431 cppc_perf.nominal_perf); 432 433 return core_get_scaling(); 434 } 435 436 #else /* CONFIG_ACPI_CPPC_LIB */ 437 static inline void intel_pstate_set_itmt_prio(int cpu) 438 { 439 } 440 #endif /* CONFIG_ACPI_CPPC_LIB */ 441 442 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 443 { 444 struct cpudata *cpu; 445 int ret; 446 int i; 447 448 if (hwp_active) { 449 intel_pstate_set_itmt_prio(policy->cpu); 450 return; 451 } 452 453 if (!intel_pstate_get_ppc_enable_status()) 454 return; 455 456 cpu = all_cpu_data[policy->cpu]; 457 458 ret = acpi_processor_register_performance(&cpu->acpi_perf_data, 459 policy->cpu); 460 if (ret) 461 return; 462 463 /* 464 * Check if the control value in _PSS is for PERF_CTL MSR, which should 465 * guarantee that the states returned by it map to the states in our 466 * list directly. 467 */ 468 if (cpu->acpi_perf_data.control_register.space_id != 469 ACPI_ADR_SPACE_FIXED_HARDWARE) 470 goto err; 471 472 /* 473 * If there is only one entry _PSS, simply ignore _PSS and continue as 474 * usual without taking _PSS into account 475 */ 476 if (cpu->acpi_perf_data.state_count < 2) 477 goto err; 478 479 pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu); 480 for (i = 0; i < cpu->acpi_perf_data.state_count; i++) { 481 pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n", 482 (i == cpu->acpi_perf_data.state ? 
'*' : ' '), i, 483 (u32) cpu->acpi_perf_data.states[i].core_frequency, 484 (u32) cpu->acpi_perf_data.states[i].power, 485 (u32) cpu->acpi_perf_data.states[i].control); 486 } 487 488 cpu->valid_pss_table = true; 489 pr_debug("_PPC limits will be enforced\n"); 490 491 return; 492 493 err: 494 cpu->valid_pss_table = false; 495 acpi_processor_unregister_performance(policy->cpu); 496 } 497 498 static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 499 { 500 struct cpudata *cpu; 501 502 cpu = all_cpu_data[policy->cpu]; 503 if (!cpu->valid_pss_table) 504 return; 505 506 acpi_processor_unregister_performance(policy->cpu); 507 } 508 #else /* CONFIG_ACPI */ 509 static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 510 { 511 } 512 513 static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 514 { 515 } 516 517 static inline bool intel_pstate_acpi_pm_profile_server(void) 518 { 519 return false; 520 } 521 #endif /* CONFIG_ACPI */ 522 523 #ifndef CONFIG_ACPI_CPPC_LIB 524 static inline int intel_pstate_get_cppc_guaranteed(int cpu) 525 { 526 return -ENOTSUPP; 527 } 528 529 static int intel_pstate_cppc_get_scaling(int cpu) 530 { 531 return core_get_scaling(); 532 } 533 #endif /* CONFIG_ACPI_CPPC_LIB */ 534 535 static int intel_pstate_freq_to_hwp_rel(struct cpudata *cpu, int freq, 536 unsigned int relation) 537 { 538 if (freq == cpu->pstate.turbo_freq) 539 return cpu->pstate.turbo_pstate; 540 541 if (freq == cpu->pstate.max_freq) 542 return cpu->pstate.max_pstate; 543 544 switch (relation) { 545 case CPUFREQ_RELATION_H: 546 return freq / cpu->pstate.scaling; 547 case CPUFREQ_RELATION_C: 548 return DIV_ROUND_CLOSEST(freq, cpu->pstate.scaling); 549 } 550 551 return DIV_ROUND_UP(freq, cpu->pstate.scaling); 552 } 553 554 static int intel_pstate_freq_to_hwp(struct cpudata *cpu, int freq) 555 { 556 return intel_pstate_freq_to_hwp_rel(cpu, freq, CPUFREQ_RELATION_L); 557 } 558 559 /** 560 * intel_pstate_hybrid_hwp_adjust - Calibrate HWP performance levels. 561 * @cpu: Target CPU. 562 * 563 * On hybrid processors, HWP may expose more performance levels than there are 564 * P-states accessible through the PERF_CTL interface. If that happens, the 565 * scaling factor between HWP performance levels and CPU frequency will be less 566 * than the scaling factor between P-state values and CPU frequency. 567 * 568 * In that case, adjust the CPU parameters used in computations accordingly. 
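 *
 * As a purely illustrative example (hypothetical values, not taken from any
 * specific SKU): with a PERF_CTL scaling factor of 100000 kHz per P-state
 * and an HWP scaling factor of 78741 kHz per performance level, an HWP
 * level of 57 corresponds to 57 * 78741 = 4488237 kHz, which is rounded
 * down below to 4400000 kHz so that it maps onto PERF_CTL ratio 44.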
 */
static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu)
{
	int perf_ctl_max_phys = cpu->pstate.max_pstate_physical;
	int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
	int perf_ctl_turbo = pstate_funcs.get_turbo(cpu->cpu);
	int scaling = cpu->pstate.scaling;
	int freq;

	pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys);
	pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo);
	pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling);
	pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate);
	pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate);
	pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);

	cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_pstate * scaling,
					   perf_ctl_scaling);
	cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling,
					 perf_ctl_scaling);

	freq = perf_ctl_max_phys * perf_ctl_scaling;
	cpu->pstate.max_pstate_physical = intel_pstate_freq_to_hwp(cpu, freq);

	freq = cpu->pstate.min_pstate * perf_ctl_scaling;
	cpu->pstate.min_freq = freq;
	/*
	 * Convert the min P-state value retrieved via pstate_funcs.get_min()
	 * to the effective range of HWP performance levels.
	 */
	cpu->pstate.min_pstate = intel_pstate_freq_to_hwp(cpu, freq);
}

static bool turbo_is_disabled(void)
{
	u64 misc_en;

	if (!cpu_feature_enabled(X86_FEATURE_IDA))
		return true;

	rdmsrq(MSR_IA32_MISC_ENABLE, misc_en);

	return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static int min_perf_pct_min(void)
{
	struct cpudata *cpu = all_cpu_data[0];
	int turbo_pstate = cpu->pstate.turbo_pstate;

	return turbo_pstate ?
		(cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
}

static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
{
	s16 epp = -EOPNOTSUPP;

	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
		/*
		 * When hwp_req_data is 0, the caller has not read
		 * MSR_HWP_REQUEST, so read it here to get the EPP.
631 */ 632 if (!hwp_req_data) { 633 epp = rdmsrq_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, 634 &hwp_req_data); 635 if (epp) 636 return epp; 637 } 638 epp = (hwp_req_data >> 24) & 0xff; 639 } 640 641 return epp; 642 } 643 644 /* 645 * EPP display strings corresponding to EPP index in the 646 * energy_perf_strings[] 647 * index String 648 *------------------------------------- 649 * 0 default 650 * 1 performance 651 * 2 balance_performance 652 * 3 balance_power 653 * 4 power 654 */ 655 656 enum energy_perf_value_index { 657 EPP_INDEX_DEFAULT = 0, 658 EPP_INDEX_PERFORMANCE, 659 EPP_INDEX_BALANCE_PERFORMANCE, 660 EPP_INDEX_BALANCE_POWERSAVE, 661 EPP_INDEX_POWERSAVE, 662 }; 663 664 static const char * const energy_perf_strings[] = { 665 [EPP_INDEX_DEFAULT] = "default", 666 [EPP_INDEX_PERFORMANCE] = "performance", 667 [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", 668 [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", 669 [EPP_INDEX_POWERSAVE] = "power", 670 NULL 671 }; 672 static unsigned int epp_values[] = { 673 [EPP_INDEX_DEFAULT] = 0, /* Unused index */ 674 [EPP_INDEX_PERFORMANCE] = HWP_EPP_PERFORMANCE, 675 [EPP_INDEX_BALANCE_PERFORMANCE] = HWP_EPP_BALANCE_PERFORMANCE, 676 [EPP_INDEX_BALANCE_POWERSAVE] = HWP_EPP_BALANCE_POWERSAVE, 677 [EPP_INDEX_POWERSAVE] = HWP_EPP_POWERSAVE, 678 }; 679 680 static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp) 681 { 682 s16 epp; 683 int index = -EINVAL; 684 685 *raw_epp = 0; 686 epp = intel_pstate_get_epp(cpu_data, 0); 687 if (epp < 0) 688 return epp; 689 690 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 691 if (epp == epp_values[EPP_INDEX_PERFORMANCE]) 692 return EPP_INDEX_PERFORMANCE; 693 if (epp == epp_values[EPP_INDEX_BALANCE_PERFORMANCE]) 694 return EPP_INDEX_BALANCE_PERFORMANCE; 695 if (epp == epp_values[EPP_INDEX_BALANCE_POWERSAVE]) 696 return EPP_INDEX_BALANCE_POWERSAVE; 697 if (epp == epp_values[EPP_INDEX_POWERSAVE]) 698 return EPP_INDEX_POWERSAVE; 699 *raw_epp = epp; 700 return 0; 701 } else if (boot_cpu_has(X86_FEATURE_EPB)) { 702 /* 703 * Range: 704 * 0x00-0x03 : Performance 705 * 0x04-0x07 : Balance performance 706 * 0x08-0x0B : Balance power 707 * 0x0C-0x0F : Power 708 * The EPB is a 4 bit value, but our ranges restrict the 709 * value which can be set. Here only using top two bits 710 * effectively. 711 */ 712 index = (epp >> 2) + 1; 713 } 714 715 return index; 716 } 717 718 static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp) 719 { 720 int ret; 721 722 /* 723 * Use the cached HWP Request MSR value, because in the active mode the 724 * register itself may be updated by intel_pstate_hwp_boost_up() or 725 * intel_pstate_hwp_boost_down() at any time. 726 */ 727 u64 value = READ_ONCE(cpu->hwp_req_cached); 728 729 value &= ~GENMASK_ULL(31, 24); 730 value |= (u64)epp << 24; 731 /* 732 * The only other updater of hwp_req_cached in the active mode, 733 * intel_pstate_hwp_set(), is called under the same lock as this 734 * function, so it cannot run in parallel with the update below. 
735 */ 736 WRITE_ONCE(cpu->hwp_req_cached, value); 737 ret = wrmsrq_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 738 if (!ret) 739 cpu->epp_cached = epp; 740 741 return ret; 742 } 743 744 static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, 745 int pref_index, bool use_raw, 746 u32 raw_epp) 747 { 748 int epp = -EINVAL; 749 int ret = -EOPNOTSUPP; 750 751 if (!pref_index) 752 epp = cpu_data->epp_default; 753 754 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { 755 if (use_raw) 756 epp = raw_epp; 757 else if (epp == -EINVAL) 758 epp = epp_values[pref_index]; 759 760 /* 761 * To avoid confusion, refuse to set EPP to any values different 762 * from 0 (performance) if the current policy is "performance", 763 * because those values would be overridden. 764 */ 765 if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) 766 return -EBUSY; 767 768 ret = intel_pstate_set_epp(cpu_data, epp); 769 } 770 771 return ret; 772 } 773 774 static ssize_t show_energy_performance_available_preferences( 775 struct cpufreq_policy *policy, char *buf) 776 { 777 int i = 0; 778 int ret = 0; 779 780 while (energy_perf_strings[i] != NULL) 781 ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]); 782 783 ret += sprintf(&buf[ret], "\n"); 784 785 return ret; 786 } 787 788 cpufreq_freq_attr_ro(energy_performance_available_preferences); 789 790 static struct cpufreq_driver intel_pstate; 791 792 static ssize_t store_energy_performance_preference( 793 struct cpufreq_policy *policy, const char *buf, size_t count) 794 { 795 struct cpudata *cpu = all_cpu_data[policy->cpu]; 796 char str_preference[21]; 797 bool raw = false; 798 ssize_t ret; 799 u32 epp = 0; 800 801 ret = sscanf(buf, "%20s", str_preference); 802 if (ret != 1) 803 return -EINVAL; 804 805 ret = match_string(energy_perf_strings, -1, str_preference); 806 if (ret < 0) { 807 if (!boot_cpu_has(X86_FEATURE_HWP_EPP)) 808 return ret; 809 810 ret = kstrtouint(buf, 10, &epp); 811 if (ret) 812 return ret; 813 814 if (epp > 255) 815 return -EINVAL; 816 817 raw = true; 818 } 819 820 /* 821 * This function runs with the policy R/W semaphore held, which 822 * guarantees that the driver pointer will not change while it is 823 * running. 824 */ 825 if (!intel_pstate_driver) 826 return -EAGAIN; 827 828 mutex_lock(&intel_pstate_limits_lock); 829 830 if (intel_pstate_driver == &intel_pstate) { 831 ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp); 832 } else { 833 /* 834 * In the passive mode the governor needs to be stopped on the 835 * target CPU before the EPP update and restarted after it, 836 * which is super-heavy-weight, so make sure it is worth doing 837 * upfront. 838 */ 839 if (!raw) 840 epp = ret ? 
epp_values[ret] : cpu->epp_default; 841 842 if (cpu->epp_cached != epp) { 843 int err; 844 845 cpufreq_stop_governor(policy); 846 ret = intel_pstate_set_epp(cpu, epp); 847 err = cpufreq_start_governor(policy); 848 if (!ret) 849 ret = err; 850 } else { 851 ret = 0; 852 } 853 } 854 855 mutex_unlock(&intel_pstate_limits_lock); 856 857 return ret ?: count; 858 } 859 860 static ssize_t show_energy_performance_preference( 861 struct cpufreq_policy *policy, char *buf) 862 { 863 struct cpudata *cpu_data = all_cpu_data[policy->cpu]; 864 int preference, raw_epp; 865 866 preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp); 867 if (preference < 0) 868 return preference; 869 870 if (raw_epp) 871 return sprintf(buf, "%d\n", raw_epp); 872 else 873 return sprintf(buf, "%s\n", energy_perf_strings[preference]); 874 } 875 876 cpufreq_freq_attr_rw(energy_performance_preference); 877 878 static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf) 879 { 880 struct cpudata *cpu = all_cpu_data[policy->cpu]; 881 int ratio, freq; 882 883 ratio = intel_pstate_get_cppc_guaranteed(policy->cpu); 884 if (ratio <= 0) { 885 u64 cap; 886 887 rdmsrq_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap); 888 ratio = HWP_GUARANTEED_PERF(cap); 889 } 890 891 freq = ratio * cpu->pstate.scaling; 892 if (cpu->pstate.scaling != cpu->pstate.perf_ctl_scaling) 893 freq = rounddown(freq, cpu->pstate.perf_ctl_scaling); 894 895 return sprintf(buf, "%d\n", freq); 896 } 897 898 cpufreq_freq_attr_ro(base_frequency); 899 900 enum hwp_cpufreq_attr_index { 901 HWP_BASE_FREQUENCY_INDEX = 0, 902 HWP_PERFORMANCE_PREFERENCE_INDEX, 903 HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX, 904 HWP_CPUFREQ_ATTR_COUNT, 905 }; 906 907 static struct freq_attr *hwp_cpufreq_attrs[] = { 908 [HWP_BASE_FREQUENCY_INDEX] = &base_frequency, 909 [HWP_PERFORMANCE_PREFERENCE_INDEX] = &energy_performance_preference, 910 [HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX] = 911 &energy_performance_available_preferences, 912 [HWP_CPUFREQ_ATTR_COUNT] = NULL, 913 }; 914 915 static bool no_cas __ro_after_init; 916 917 static struct cpudata *hybrid_max_perf_cpu __read_mostly; 918 /* 919 * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata, 920 * and the x86 arch scale-invariance information from concurrent updates. 921 */ 922 static DEFINE_MUTEX(hybrid_capacity_lock); 923 924 #ifdef CONFIG_ENERGY_MODEL 925 #define HYBRID_EM_STATE_COUNT 4 926 927 static int hybrid_active_power(struct device *dev, unsigned long *power, 928 unsigned long *freq) 929 { 930 /* 931 * Create "utilization bins" of 0-40%, 40%-60%, 60%-80%, and 80%-100% 932 * of the maximum capacity such that two CPUs of the same type will be 933 * regarded as equally attractive if the utilization of each of them 934 * falls into the same bin, which should prevent tasks from being 935 * migrated between them too often. 936 * 937 * For this purpose, return the "frequency" of 2 for the first 938 * performance level and otherwise leave the value set by the caller. 939 */ 940 if (!*freq) 941 *freq = 2; 942 943 /* No power information. */ 944 *power = EM_MAX_POWER; 945 946 return 0; 947 } 948 949 static int hybrid_get_cost(struct device *dev, unsigned long freq, 950 unsigned long *cost) 951 { 952 struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; 953 struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(dev->id); 954 955 /* 956 * The smaller the perf-to-frequency scaling factor, the larger the IPC 957 * ratio between the given CPU and the least capable CPU in the system. 
958 * Regard that IPC ratio as the primary cost component and assume that 959 * the scaling factors for different CPU types will differ by at least 960 * 5% and they will not be above INTEL_PSTATE_CORE_SCALING. 961 * 962 * Add the freq value to the cost, so that the cost of running on CPUs 963 * of the same type in different "utilization bins" is different. 964 */ 965 *cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq; 966 /* 967 * Increase the cost slightly for CPUs able to access L3 to avoid 968 * touching it in case some other CPUs of the same type can do the work 969 * without it. 970 */ 971 if (cacheinfo) { 972 unsigned int i; 973 974 /* Check if L3 cache is there. */ 975 for (i = 0; i < cacheinfo->num_leaves; i++) { 976 if (cacheinfo->info_list[i].level == 3) { 977 *cost += 2; 978 break; 979 } 980 } 981 } 982 983 return 0; 984 } 985 986 static bool hybrid_register_perf_domain(unsigned int cpu) 987 { 988 static const struct em_data_callback cb 989 = EM_ADV_DATA_CB(hybrid_active_power, hybrid_get_cost); 990 struct cpudata *cpudata = all_cpu_data[cpu]; 991 struct device *cpu_dev; 992 993 /* 994 * Registering EM perf domains without enabling asymmetric CPU capacity 995 * support is not really useful and one domain should not be registered 996 * more than once. 997 */ 998 if (!hybrid_max_perf_cpu || cpudata->pd_registered) 999 return false; 1000 1001 cpu_dev = get_cpu_device(cpu); 1002 if (!cpu_dev) 1003 return false; 1004 1005 if (em_dev_register_pd_no_update(cpu_dev, HYBRID_EM_STATE_COUNT, &cb, 1006 cpumask_of(cpu), false)) 1007 return false; 1008 1009 cpudata->pd_registered = true; 1010 1011 return true; 1012 } 1013 1014 static void hybrid_register_all_perf_domains(void) 1015 { 1016 unsigned int cpu; 1017 1018 for_each_online_cpu(cpu) 1019 hybrid_register_perf_domain(cpu); 1020 } 1021 1022 static void hybrid_update_perf_domain(struct cpudata *cpu) 1023 { 1024 if (cpu->pd_registered) 1025 em_adjust_cpu_capacity(cpu->cpu); 1026 } 1027 #else /* !CONFIG_ENERGY_MODEL */ 1028 static inline bool hybrid_register_perf_domain(unsigned int cpu) { return false; } 1029 static inline void hybrid_register_all_perf_domains(void) {} 1030 static inline void hybrid_update_perf_domain(struct cpudata *cpu) {} 1031 #endif /* CONFIG_ENERGY_MODEL */ 1032 1033 static void hybrid_set_cpu_capacity(struct cpudata *cpu) 1034 { 1035 arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf, 1036 hybrid_max_perf_cpu->capacity_perf, 1037 cpu->capacity_perf, 1038 cpu->pstate.max_pstate_physical); 1039 hybrid_update_perf_domain(cpu); 1040 1041 topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu)); 1042 1043 pr_debug("CPU%d: perf = %u, max. 
perf = %u, base perf = %d\n", cpu->cpu, 1044 cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, 1045 cpu->pstate.max_pstate_physical); 1046 } 1047 1048 static void hybrid_clear_cpu_capacity(unsigned int cpunum) 1049 { 1050 arch_set_cpu_capacity(cpunum, 1, 1, 1, 1); 1051 } 1052 1053 static void hybrid_get_capacity_perf(struct cpudata *cpu) 1054 { 1055 if (READ_ONCE(global.no_turbo)) { 1056 cpu->capacity_perf = cpu->pstate.max_pstate_physical; 1057 return; 1058 } 1059 1060 cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached)); 1061 } 1062 1063 static void hybrid_set_capacity_of_cpus(void) 1064 { 1065 int cpunum; 1066 1067 for_each_online_cpu(cpunum) { 1068 struct cpudata *cpu = all_cpu_data[cpunum]; 1069 1070 if (cpu) 1071 hybrid_set_cpu_capacity(cpu); 1072 } 1073 } 1074 1075 static void hybrid_update_cpu_capacity_scaling(void) 1076 { 1077 struct cpudata *max_perf_cpu = NULL; 1078 unsigned int max_cap_perf = 0; 1079 int cpunum; 1080 1081 for_each_online_cpu(cpunum) { 1082 struct cpudata *cpu = all_cpu_data[cpunum]; 1083 1084 if (!cpu) 1085 continue; 1086 1087 /* 1088 * During initialization, CPU performance at full capacity needs 1089 * to be determined. 1090 */ 1091 if (!hybrid_max_perf_cpu) 1092 hybrid_get_capacity_perf(cpu); 1093 1094 /* 1095 * If hybrid_max_perf_cpu is not NULL at this point, it is 1096 * being replaced, so don't take it into account when looking 1097 * for the new one. 1098 */ 1099 if (cpu == hybrid_max_perf_cpu) 1100 continue; 1101 1102 if (cpu->capacity_perf > max_cap_perf) { 1103 max_cap_perf = cpu->capacity_perf; 1104 max_perf_cpu = cpu; 1105 } 1106 } 1107 1108 if (max_perf_cpu) { 1109 hybrid_max_perf_cpu = max_perf_cpu; 1110 hybrid_set_capacity_of_cpus(); 1111 } else { 1112 pr_info("Found no CPUs with nonzero maximum performance\n"); 1113 /* Revert to the flat CPU capacity structure. */ 1114 for_each_online_cpu(cpunum) 1115 hybrid_clear_cpu_capacity(cpunum); 1116 } 1117 } 1118 1119 static void __hybrid_refresh_cpu_capacity_scaling(void) 1120 { 1121 hybrid_max_perf_cpu = NULL; 1122 hybrid_update_cpu_capacity_scaling(); 1123 } 1124 1125 static void hybrid_refresh_cpu_capacity_scaling(void) 1126 { 1127 guard(mutex)(&hybrid_capacity_lock); 1128 1129 __hybrid_refresh_cpu_capacity_scaling(); 1130 /* 1131 * Perf domains are not registered before setting hybrid_max_perf_cpu, 1132 * so register them all after setting up CPU capacity scaling. 1133 */ 1134 hybrid_register_all_perf_domains(); 1135 } 1136 1137 static void hybrid_init_cpu_capacity_scaling(bool refresh) 1138 { 1139 /* Bail out if enabling capacity-aware scheduling is prohibited. */ 1140 if (no_cas) 1141 return; 1142 1143 /* 1144 * If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity 1145 * scaling has been enabled already and the driver is just changing the 1146 * operation mode. 1147 */ 1148 if (refresh) { 1149 hybrid_refresh_cpu_capacity_scaling(); 1150 return; 1151 } 1152 1153 /* 1154 * On hybrid systems, use asym capacity instead of ITMT, but because 1155 * the capacity of SMT threads is not deterministic even approximately, 1156 * do not do that when SMT is in use. 1157 */ 1158 if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) { 1159 hybrid_refresh_cpu_capacity_scaling(); 1160 /* 1161 * Disabling ITMT causes sched domains to be rebuilt to disable asym 1162 * packing and enable asym capacity and EAS. 
1163 */ 1164 sched_clear_itmt_support(); 1165 } 1166 } 1167 1168 static bool hybrid_clear_max_perf_cpu(void) 1169 { 1170 bool ret; 1171 1172 guard(mutex)(&hybrid_capacity_lock); 1173 1174 ret = !!hybrid_max_perf_cpu; 1175 hybrid_max_perf_cpu = NULL; 1176 1177 return ret; 1178 } 1179 1180 static void __intel_pstate_get_hwp_cap(struct cpudata *cpu) 1181 { 1182 u64 cap; 1183 1184 rdmsrq_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap); 1185 WRITE_ONCE(cpu->hwp_cap_cached, cap); 1186 cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap); 1187 cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap); 1188 } 1189 1190 static void intel_pstate_get_hwp_cap(struct cpudata *cpu) 1191 { 1192 int scaling = cpu->pstate.scaling; 1193 1194 __intel_pstate_get_hwp_cap(cpu); 1195 1196 cpu->pstate.max_freq = cpu->pstate.max_pstate * scaling; 1197 cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling; 1198 if (scaling != cpu->pstate.perf_ctl_scaling) { 1199 int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling; 1200 1201 cpu->pstate.max_freq = rounddown(cpu->pstate.max_freq, 1202 perf_ctl_scaling); 1203 cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_freq, 1204 perf_ctl_scaling); 1205 } 1206 } 1207 1208 static void hybrid_update_capacity(struct cpudata *cpu) 1209 { 1210 unsigned int max_cap_perf; 1211 1212 mutex_lock(&hybrid_capacity_lock); 1213 1214 if (!hybrid_max_perf_cpu) 1215 goto unlock; 1216 1217 /* 1218 * The maximum performance of the CPU may have changed, but assume 1219 * that the performance of the other CPUs has not changed. 1220 */ 1221 max_cap_perf = hybrid_max_perf_cpu->capacity_perf; 1222 1223 intel_pstate_get_hwp_cap(cpu); 1224 1225 hybrid_get_capacity_perf(cpu); 1226 /* Should hybrid_max_perf_cpu be replaced by this CPU? */ 1227 if (cpu->capacity_perf > max_cap_perf) { 1228 hybrid_max_perf_cpu = cpu; 1229 hybrid_set_capacity_of_cpus(); 1230 goto unlock; 1231 } 1232 1233 /* If this CPU is hybrid_max_perf_cpu, should it be replaced? */ 1234 if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) { 1235 hybrid_update_cpu_capacity_scaling(); 1236 goto unlock; 1237 } 1238 1239 hybrid_set_cpu_capacity(cpu); 1240 /* 1241 * If the CPU was offline to start with and it is going online for the 1242 * first time, a perf domain needs to be registered for it if hybrid 1243 * capacity scaling has been enabled already. In that case, sched 1244 * domains need to be rebuilt to take the new perf domain into account. 
 */
	if (hybrid_register_perf_domain(cpu->cpu))
		em_rebuild_sched_domains();

unlock:
	mutex_unlock(&hybrid_capacity_lock);
}

static void intel_pstate_hwp_set(unsigned int cpu)
{
	struct cpudata *cpu_data = all_cpu_data[cpu];
	int max, min;
	u64 value;
	s16 epp;

	max = cpu_data->max_perf_ratio;
	min = cpu_data->min_perf_ratio;

	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
		min = max;

	rdmsrq_on_cpu(cpu, MSR_HWP_REQUEST, &value);

	value &= ~HWP_MIN_PERF(~0L);
	value |= HWP_MIN_PERF(min);

	value &= ~HWP_MAX_PERF(~0L);
	value |= HWP_MAX_PERF(max);

	if (cpu_data->epp_policy == cpu_data->policy)
		goto skip_epp;

	cpu_data->epp_policy = cpu_data->policy;

	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
		epp = intel_pstate_get_epp(cpu_data, value);
		cpu_data->epp_powersave = epp;
		/* If the EPP read failed, don't try to write it back */
		if (epp < 0)
			goto skip_epp;

		epp = 0;
	} else {
		/* Skip setting the EPP when the saved value is invalid */
		if (cpu_data->epp_powersave < 0)
			goto skip_epp;

		/*
		 * No need to restore the EPP when it is not zero, which
		 * means one of the following:
		 *  - the policy has not changed
		 *  - the user has changed it manually
		 *  - reading the EPB failed
		 */
		epp = intel_pstate_get_epp(cpu_data, value);
		if (epp)
			goto skip_epp;

		epp = cpu_data->epp_powersave;
	}
	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
		value &= ~GENMASK_ULL(31, 24);
		value |= (u64)epp << 24;
	}

skip_epp:
	WRITE_ONCE(cpu_data->hwp_req_cached, value);
	wrmsrq_on_cpu(cpu, MSR_HWP_REQUEST, value);
}

static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata);

static void intel_pstate_hwp_offline(struct cpudata *cpu)
{
	u64 value = READ_ONCE(cpu->hwp_req_cached);
	int min_perf;

	intel_pstate_disable_hwp_interrupt(cpu);

	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
		/*
		 * In case the EPP has been set to "performance" by the
		 * active mode "performance" scaling algorithm, replace that
		 * temporary value with the cached EPP one.
		 */
		value &= ~GENMASK_ULL(31, 24);
		value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached);
		/*
		 * However, make sure that EPP will be set to "performance" when
		 * the CPU is brought back online again and the "performance"
		 * scaling algorithm is still in effect.
		 */
		cpu->epp_policy = CPUFREQ_POLICY_UNKNOWN;
	}

	/*
	 * Clear the desired perf field in the cached HWP request value to
	 * prevent nonzero desired values from being leaked into the active
	 * mode.
1344 */ 1345 value &= ~HWP_DESIRED_PERF(~0L); 1346 WRITE_ONCE(cpu->hwp_req_cached, value); 1347 1348 value &= ~GENMASK_ULL(31, 0); 1349 min_perf = HWP_LOWEST_PERF(READ_ONCE(cpu->hwp_cap_cached)); 1350 1351 /* Set hwp_max = hwp_min */ 1352 value |= HWP_MAX_PERF(min_perf); 1353 value |= HWP_MIN_PERF(min_perf); 1354 1355 /* Set EPP to min */ 1356 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) 1357 value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); 1358 1359 wrmsrq_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 1360 1361 mutex_lock(&hybrid_capacity_lock); 1362 1363 if (!hybrid_max_perf_cpu) { 1364 mutex_unlock(&hybrid_capacity_lock); 1365 1366 return; 1367 } 1368 1369 if (hybrid_max_perf_cpu == cpu) 1370 hybrid_update_cpu_capacity_scaling(); 1371 1372 mutex_unlock(&hybrid_capacity_lock); 1373 1374 /* Reset the capacity of the CPU going offline to the initial value. */ 1375 hybrid_clear_cpu_capacity(cpu->cpu); 1376 } 1377 1378 #define POWER_CTL_EE_ENABLE 1 1379 #define POWER_CTL_EE_DISABLE 2 1380 1381 /* Enable bit for Dynamic Efficiency Control (DEC) */ 1382 #define POWER_CTL_DEC_ENABLE 27 1383 1384 static int power_ctl_ee_state; 1385 1386 static void set_power_ctl_ee_state(bool input) 1387 { 1388 u64 power_ctl; 1389 1390 mutex_lock(&intel_pstate_driver_lock); 1391 rdmsrq(MSR_IA32_POWER_CTL, power_ctl); 1392 if (input) { 1393 power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); 1394 power_ctl_ee_state = POWER_CTL_EE_ENABLE; 1395 } else { 1396 power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); 1397 power_ctl_ee_state = POWER_CTL_EE_DISABLE; 1398 } 1399 wrmsrq(MSR_IA32_POWER_CTL, power_ctl); 1400 mutex_unlock(&intel_pstate_driver_lock); 1401 } 1402 1403 static void intel_pstate_hwp_enable(struct cpudata *cpudata); 1404 1405 static void intel_pstate_hwp_reenable(struct cpudata *cpu) 1406 { 1407 intel_pstate_hwp_enable(cpu); 1408 wrmsrq_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached)); 1409 } 1410 1411 static int intel_pstate_suspend(struct cpufreq_policy *policy) 1412 { 1413 struct cpudata *cpu = all_cpu_data[policy->cpu]; 1414 1415 pr_debug("CPU %d suspending\n", cpu->cpu); 1416 1417 cpu->suspended = true; 1418 1419 /* disable HWP interrupt and cancel any pending work */ 1420 intel_pstate_disable_hwp_interrupt(cpu); 1421 1422 return 0; 1423 } 1424 1425 static int intel_pstate_resume(struct cpufreq_policy *policy) 1426 { 1427 struct cpudata *cpu = all_cpu_data[policy->cpu]; 1428 1429 pr_debug("CPU %d resuming\n", cpu->cpu); 1430 1431 /* Only restore if the system default is changed */ 1432 if (power_ctl_ee_state == POWER_CTL_EE_ENABLE) 1433 set_power_ctl_ee_state(true); 1434 else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE) 1435 set_power_ctl_ee_state(false); 1436 1437 if (cpu->suspended && hwp_active) { 1438 mutex_lock(&intel_pstate_limits_lock); 1439 1440 /* Re-enable HWP, because "online" has not done that. */ 1441 intel_pstate_hwp_reenable(cpu); 1442 1443 mutex_unlock(&intel_pstate_limits_lock); 1444 } 1445 1446 cpu->suspended = false; 1447 1448 return 0; 1449 } 1450 1451 static void intel_pstate_update_policies(void) 1452 { 1453 int cpu; 1454 1455 for_each_possible_cpu(cpu) 1456 cpufreq_update_policy(cpu); 1457 } 1458 1459 static void __intel_pstate_update_max_freq(struct cpufreq_policy *policy, 1460 struct cpudata *cpudata) 1461 { 1462 guard(cpufreq_policy_write)(policy); 1463 1464 if (hwp_active) 1465 intel_pstate_get_hwp_cap(cpudata); 1466 1467 policy->cpuinfo.max_freq = READ_ONCE(global.no_turbo) ? 
1468 cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; 1469 1470 refresh_frequency_limits(policy); 1471 } 1472 1473 static bool intel_pstate_update_max_freq(struct cpudata *cpudata) 1474 { 1475 struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); 1476 if (!policy) 1477 return false; 1478 1479 __intel_pstate_update_max_freq(policy, cpudata); 1480 1481 return true; 1482 } 1483 1484 static void intel_pstate_update_limits(struct cpufreq_policy *policy) 1485 { 1486 struct cpudata *cpudata = all_cpu_data[policy->cpu]; 1487 1488 __intel_pstate_update_max_freq(policy, cpudata); 1489 1490 hybrid_update_capacity(cpudata); 1491 } 1492 1493 static void intel_pstate_update_limits_for_all(void) 1494 { 1495 int cpu; 1496 1497 for_each_possible_cpu(cpu) 1498 intel_pstate_update_max_freq(all_cpu_data[cpu]); 1499 1500 mutex_lock(&hybrid_capacity_lock); 1501 1502 if (hybrid_max_perf_cpu) 1503 __hybrid_refresh_cpu_capacity_scaling(); 1504 1505 mutex_unlock(&hybrid_capacity_lock); 1506 } 1507 1508 /************************** sysfs begin ************************/ 1509 #define show_one(file_name, object) \ 1510 static ssize_t show_##file_name \ 1511 (struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ 1512 { \ 1513 return sprintf(buf, "%u\n", global.object); \ 1514 } 1515 1516 static ssize_t intel_pstate_show_status(char *buf); 1517 static int intel_pstate_update_status(const char *buf, size_t size); 1518 1519 static ssize_t show_status(struct kobject *kobj, 1520 struct kobj_attribute *attr, char *buf) 1521 { 1522 ssize_t ret; 1523 1524 mutex_lock(&intel_pstate_driver_lock); 1525 ret = intel_pstate_show_status(buf); 1526 mutex_unlock(&intel_pstate_driver_lock); 1527 1528 return ret; 1529 } 1530 1531 static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, 1532 const char *buf, size_t count) 1533 { 1534 char *p = memchr(buf, '\n', count); 1535 int ret; 1536 1537 mutex_lock(&intel_pstate_driver_lock); 1538 ret = intel_pstate_update_status(buf, p ? p - buf : count); 1539 mutex_unlock(&intel_pstate_driver_lock); 1540 1541 return ret < 0 ? 
ret : count; 1542 } 1543 1544 static ssize_t show_turbo_pct(struct kobject *kobj, 1545 struct kobj_attribute *attr, char *buf) 1546 { 1547 struct cpudata *cpu; 1548 int total, no_turbo, turbo_pct; 1549 uint32_t turbo_fp; 1550 1551 mutex_lock(&intel_pstate_driver_lock); 1552 1553 if (!intel_pstate_driver) { 1554 mutex_unlock(&intel_pstate_driver_lock); 1555 return -EAGAIN; 1556 } 1557 1558 cpu = all_cpu_data[0]; 1559 1560 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1561 no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; 1562 turbo_fp = div_fp(no_turbo, total); 1563 turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); 1564 1565 mutex_unlock(&intel_pstate_driver_lock); 1566 1567 return sprintf(buf, "%u\n", turbo_pct); 1568 } 1569 1570 static ssize_t show_num_pstates(struct kobject *kobj, 1571 struct kobj_attribute *attr, char *buf) 1572 { 1573 struct cpudata *cpu; 1574 int total; 1575 1576 mutex_lock(&intel_pstate_driver_lock); 1577 1578 if (!intel_pstate_driver) { 1579 mutex_unlock(&intel_pstate_driver_lock); 1580 return -EAGAIN; 1581 } 1582 1583 cpu = all_cpu_data[0]; 1584 total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; 1585 1586 mutex_unlock(&intel_pstate_driver_lock); 1587 1588 return sprintf(buf, "%u\n", total); 1589 } 1590 1591 static ssize_t show_no_turbo(struct kobject *kobj, 1592 struct kobj_attribute *attr, char *buf) 1593 { 1594 ssize_t ret; 1595 1596 mutex_lock(&intel_pstate_driver_lock); 1597 1598 if (!intel_pstate_driver) { 1599 mutex_unlock(&intel_pstate_driver_lock); 1600 return -EAGAIN; 1601 } 1602 1603 ret = sprintf(buf, "%u\n", global.no_turbo); 1604 1605 mutex_unlock(&intel_pstate_driver_lock); 1606 1607 return ret; 1608 } 1609 1610 static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, 1611 const char *buf, size_t count) 1612 { 1613 unsigned int input; 1614 bool no_turbo; 1615 1616 if (sscanf(buf, "%u", &input) != 1) 1617 return -EINVAL; 1618 1619 mutex_lock(&intel_pstate_driver_lock); 1620 1621 if (!intel_pstate_driver) { 1622 count = -EAGAIN; 1623 goto unlock_driver; 1624 } 1625 1626 no_turbo = !!clamp_t(int, input, 0, 1); 1627 1628 WRITE_ONCE(global.turbo_disabled, turbo_is_disabled()); 1629 if (global.turbo_disabled && !no_turbo) { 1630 pr_notice("Turbo disabled by BIOS or unavailable on processor\n"); 1631 count = -EPERM; 1632 if (global.no_turbo) 1633 goto unlock_driver; 1634 else 1635 no_turbo = 1; 1636 } 1637 1638 if (no_turbo == global.no_turbo) { 1639 goto unlock_driver; 1640 } 1641 1642 WRITE_ONCE(global.no_turbo, no_turbo); 1643 1644 mutex_lock(&intel_pstate_limits_lock); 1645 1646 if (no_turbo) { 1647 struct cpudata *cpu = all_cpu_data[0]; 1648 int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate; 1649 1650 /* Squash the global minimum into the permitted range. 
*/ 1651 if (global.min_perf_pct > pct) 1652 global.min_perf_pct = pct; 1653 } 1654 1655 mutex_unlock(&intel_pstate_limits_lock); 1656 1657 intel_pstate_update_limits_for_all(); 1658 arch_set_max_freq_ratio(no_turbo); 1659 1660 unlock_driver: 1661 mutex_unlock(&intel_pstate_driver_lock); 1662 1663 return count; 1664 } 1665 1666 static void update_cpu_qos_request(int cpu, enum freq_qos_req_type type) 1667 { 1668 struct cpudata *cpudata = all_cpu_data[cpu]; 1669 unsigned int freq = cpudata->pstate.turbo_freq; 1670 struct freq_qos_request *req; 1671 1672 struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); 1673 if (!policy) 1674 return; 1675 1676 req = policy->driver_data; 1677 if (!req) 1678 return; 1679 1680 if (hwp_active) 1681 intel_pstate_get_hwp_cap(cpudata); 1682 1683 if (type == FREQ_QOS_MIN) { 1684 freq = DIV_ROUND_UP(freq * global.min_perf_pct, 100); 1685 } else { 1686 req++; 1687 freq = (freq * global.max_perf_pct) / 100; 1688 } 1689 1690 if (freq_qos_update_request(req, freq) < 0) 1691 pr_warn("Failed to update freq constraint: CPU%d\n", cpu); 1692 } 1693 1694 static void update_qos_requests(enum freq_qos_req_type type) 1695 { 1696 int i; 1697 1698 for_each_possible_cpu(i) 1699 update_cpu_qos_request(i, type); 1700 } 1701 1702 static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, 1703 const char *buf, size_t count) 1704 { 1705 unsigned int input; 1706 int ret; 1707 1708 ret = sscanf(buf, "%u", &input); 1709 if (ret != 1) 1710 return -EINVAL; 1711 1712 mutex_lock(&intel_pstate_driver_lock); 1713 1714 if (!intel_pstate_driver) { 1715 mutex_unlock(&intel_pstate_driver_lock); 1716 return -EAGAIN; 1717 } 1718 1719 mutex_lock(&intel_pstate_limits_lock); 1720 1721 global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100); 1722 1723 mutex_unlock(&intel_pstate_limits_lock); 1724 1725 if (intel_pstate_driver == &intel_pstate) 1726 intel_pstate_update_policies(); 1727 else 1728 update_qos_requests(FREQ_QOS_MAX); 1729 1730 mutex_unlock(&intel_pstate_driver_lock); 1731 1732 return count; 1733 } 1734 1735 static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b, 1736 const char *buf, size_t count) 1737 { 1738 unsigned int input; 1739 int ret; 1740 1741 ret = sscanf(buf, "%u", &input); 1742 if (ret != 1) 1743 return -EINVAL; 1744 1745 mutex_lock(&intel_pstate_driver_lock); 1746 1747 if (!intel_pstate_driver) { 1748 mutex_unlock(&intel_pstate_driver_lock); 1749 return -EAGAIN; 1750 } 1751 1752 mutex_lock(&intel_pstate_limits_lock); 1753 1754 global.min_perf_pct = clamp_t(int, input, 1755 min_perf_pct_min(), global.max_perf_pct); 1756 1757 mutex_unlock(&intel_pstate_limits_lock); 1758 1759 if (intel_pstate_driver == &intel_pstate) 1760 intel_pstate_update_policies(); 1761 else 1762 update_qos_requests(FREQ_QOS_MIN); 1763 1764 mutex_unlock(&intel_pstate_driver_lock); 1765 1766 return count; 1767 } 1768 1769 static ssize_t show_hwp_dynamic_boost(struct kobject *kobj, 1770 struct kobj_attribute *attr, char *buf) 1771 { 1772 return sprintf(buf, "%u\n", hwp_boost); 1773 } 1774 1775 static ssize_t store_hwp_dynamic_boost(struct kobject *a, 1776 struct kobj_attribute *b, 1777 const char *buf, size_t count) 1778 { 1779 unsigned int input; 1780 int ret; 1781 1782 ret = kstrtouint(buf, 10, &input); 1783 if (ret) 1784 return ret; 1785 1786 mutex_lock(&intel_pstate_driver_lock); 1787 hwp_boost = !!input; 1788 intel_pstate_update_policies(); 1789 mutex_unlock(&intel_pstate_driver_lock); 1790 1791 return count; 1792 } 1793 1794 
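/*
 * Example (illustrative) usage of the global sysfs knobs defined in this
 * section, which are exposed under /sys/devices/system/cpu/intel_pstate/
 * on a typical system:
 *
 *   # cat /sys/devices/system/cpu/intel_pstate/status
 *   # echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
 *   # echo 80 > /sys/devices/system/cpu/intel_pstate/max_perf_pct
 *   # echo 1 > /sys/devices/system/cpu/intel_pstate/hwp_dynamic_boost
 *
 * See Documentation/admin-guide/pm/intel_pstate.rst for the authoritative
 * description of each attribute.
 */
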
static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr, 1795 char *buf) 1796 { 1797 u64 power_ctl; 1798 int enable; 1799 1800 rdmsrq(MSR_IA32_POWER_CTL, power_ctl); 1801 enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE)); 1802 return sprintf(buf, "%d\n", !enable); 1803 } 1804 1805 static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b, 1806 const char *buf, size_t count) 1807 { 1808 bool input; 1809 int ret; 1810 1811 ret = kstrtobool(buf, &input); 1812 if (ret) 1813 return ret; 1814 1815 set_power_ctl_ee_state(input); 1816 1817 return count; 1818 } 1819 1820 show_one(max_perf_pct, max_perf_pct); 1821 show_one(min_perf_pct, min_perf_pct); 1822 1823 define_one_global_rw(status); 1824 define_one_global_rw(no_turbo); 1825 define_one_global_rw(max_perf_pct); 1826 define_one_global_rw(min_perf_pct); 1827 define_one_global_ro(turbo_pct); 1828 define_one_global_ro(num_pstates); 1829 define_one_global_rw(hwp_dynamic_boost); 1830 define_one_global_rw(energy_efficiency); 1831 1832 static struct attribute *intel_pstate_attributes[] = { 1833 &status.attr, 1834 &no_turbo.attr, 1835 NULL 1836 }; 1837 1838 static const struct attribute_group intel_pstate_attr_group = { 1839 .attrs = intel_pstate_attributes, 1840 }; 1841 1842 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[]; 1843 1844 static struct kobject *intel_pstate_kobject; 1845 1846 static void __init intel_pstate_sysfs_expose_params(void) 1847 { 1848 struct device *dev_root = bus_get_dev_root(&cpu_subsys); 1849 int rc; 1850 1851 if (dev_root) { 1852 intel_pstate_kobject = kobject_create_and_add("intel_pstate", &dev_root->kobj); 1853 put_device(dev_root); 1854 } 1855 if (WARN_ON(!intel_pstate_kobject)) 1856 return; 1857 1858 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); 1859 if (WARN_ON(rc)) 1860 return; 1861 1862 if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) { 1863 rc = sysfs_create_file(intel_pstate_kobject, &turbo_pct.attr); 1864 WARN_ON(rc); 1865 1866 rc = sysfs_create_file(intel_pstate_kobject, &num_pstates.attr); 1867 WARN_ON(rc); 1868 } 1869 1870 /* 1871 * If per cpu limits are enforced there are no global limits, so 1872 * return without creating max/min_perf_pct attributes 1873 */ 1874 if (per_cpu_limits) 1875 return; 1876 1877 rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr); 1878 WARN_ON(rc); 1879 1880 rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); 1881 WARN_ON(rc); 1882 1883 if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) { 1884 rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr); 1885 WARN_ON(rc); 1886 } 1887 } 1888 1889 static void __init intel_pstate_sysfs_remove(void) 1890 { 1891 if (!intel_pstate_kobject) 1892 return; 1893 1894 sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group); 1895 1896 if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) { 1897 sysfs_remove_file(intel_pstate_kobject, &num_pstates.attr); 1898 sysfs_remove_file(intel_pstate_kobject, &turbo_pct.attr); 1899 } 1900 1901 if (!per_cpu_limits) { 1902 sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr); 1903 sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr); 1904 1905 if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) 1906 sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr); 1907 } 1908 1909 kobject_put(intel_pstate_kobject); 1910 } 1911 1912 static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void) 1913 { 1914 int rc; 1915 1916 if (!hwp_active) 1917 return; 1918 1919 rc 
= sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); 1920 WARN_ON_ONCE(rc); 1921 } 1922 1923 static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void) 1924 { 1925 if (!hwp_active) 1926 return; 1927 1928 sysfs_remove_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); 1929 } 1930 1931 /************************** sysfs end ************************/ 1932 1933 static void intel_pstate_notify_work(struct work_struct *work) 1934 { 1935 struct cpudata *cpudata = 1936 container_of(to_delayed_work(work), struct cpudata, hwp_notify_work); 1937 1938 if (intel_pstate_update_max_freq(cpudata)) { 1939 /* 1940 * The driver will not be unregistered while this function is 1941 * running, so update the capacity without acquiring the driver 1942 * lock. 1943 */ 1944 hybrid_update_capacity(cpudata); 1945 } 1946 1947 wrmsrq_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0); 1948 } 1949 1950 static DEFINE_RAW_SPINLOCK(hwp_notify_lock); 1951 static cpumask_t hwp_intr_enable_mask; 1952 1953 #define HWP_GUARANTEED_PERF_CHANGE_STATUS BIT(0) 1954 #define HWP_HIGHEST_PERF_CHANGE_STATUS BIT(3) 1955 1956 void notify_hwp_interrupt(void) 1957 { 1958 unsigned int this_cpu = smp_processor_id(); 1959 u64 value, status_mask; 1960 unsigned long flags; 1961 1962 if (!hwp_active || !cpu_feature_enabled(X86_FEATURE_HWP_NOTIFY)) 1963 return; 1964 1965 status_mask = HWP_GUARANTEED_PERF_CHANGE_STATUS; 1966 if (cpu_feature_enabled(X86_FEATURE_HWP_HIGHEST_PERF_CHANGE)) 1967 status_mask |= HWP_HIGHEST_PERF_CHANGE_STATUS; 1968 1969 rdmsrq_safe(MSR_HWP_STATUS, &value); 1970 if (!(value & status_mask)) 1971 return; 1972 1973 raw_spin_lock_irqsave(&hwp_notify_lock, flags); 1974 1975 if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) 1976 goto ack_intr; 1977 1978 schedule_delayed_work(&all_cpu_data[this_cpu]->hwp_notify_work, 1979 msecs_to_jiffies(10)); 1980 1981 raw_spin_unlock_irqrestore(&hwp_notify_lock, flags); 1982 1983 return; 1984 1985 ack_intr: 1986 wrmsrq_safe(MSR_HWP_STATUS, 0); 1987 raw_spin_unlock_irqrestore(&hwp_notify_lock, flags); 1988 } 1989 1990 static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata) 1991 { 1992 bool cancel_work; 1993 1994 if (!cpu_feature_enabled(X86_FEATURE_HWP_NOTIFY)) 1995 return; 1996 1997 /* wrmsrq_on_cpu has to be outside spinlock as this can result in IPC */ 1998 wrmsrq_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 1999 2000 raw_spin_lock_irq(&hwp_notify_lock); 2001 cancel_work = cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask); 2002 raw_spin_unlock_irq(&hwp_notify_lock); 2003 2004 if (cancel_work) 2005 cancel_delayed_work_sync(&cpudata->hwp_notify_work); 2006 } 2007 2008 #define HWP_GUARANTEED_PERF_CHANGE_REQ BIT(0) 2009 #define HWP_HIGHEST_PERF_CHANGE_REQ BIT(2) 2010 2011 static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) 2012 { 2013 /* Enable HWP notification interrupt for performance change */ 2014 if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) { 2015 u64 interrupt_mask = HWP_GUARANTEED_PERF_CHANGE_REQ; 2016 2017 raw_spin_lock_irq(&hwp_notify_lock); 2018 INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work); 2019 cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask); 2020 raw_spin_unlock_irq(&hwp_notify_lock); 2021 2022 if (cpu_feature_enabled(X86_FEATURE_HWP_HIGHEST_PERF_CHANGE)) 2023 interrupt_mask |= HWP_HIGHEST_PERF_CHANGE_REQ; 2024 2025 /* wrmsrq_on_cpu has to be outside spinlock as this can result in IPC */ 2026 wrmsrq_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, interrupt_mask); 2027 wrmsrq_on_cpu(cpudata->cpu, 
MSR_HWP_STATUS, 0); 2028 } 2029 } 2030 2031 static void intel_pstate_update_epp_defaults(struct cpudata *cpudata) 2032 { 2033 cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); 2034 2035 /* 2036 * If the EPP is set by firmware, which means that firmware enabled HWP 2037 * - Is equal or less than 0x80 (default balance_perf EPP) 2038 * - But less performance oriented than performance EPP 2039 * then use this as new balance_perf EPP. 2040 */ 2041 if (hwp_forced && cpudata->epp_default <= HWP_EPP_BALANCE_PERFORMANCE && 2042 cpudata->epp_default > HWP_EPP_PERFORMANCE) { 2043 epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = cpudata->epp_default; 2044 return; 2045 } 2046 2047 /* 2048 * If this CPU gen doesn't call for change in balance_perf 2049 * EPP return. 2050 */ 2051 if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) 2052 return; 2053 2054 /* 2055 * Use hard coded value per gen to update the balance_perf 2056 * and default EPP. 2057 */ 2058 cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE]; 2059 intel_pstate_set_epp(cpudata, cpudata->epp_default); 2060 } 2061 2062 static void intel_pstate_hwp_enable(struct cpudata *cpudata) 2063 { 2064 /* First disable HWP notification interrupt till we activate again */ 2065 if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) 2066 wrmsrq_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 2067 2068 wrmsrq_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 2069 2070 intel_pstate_enable_hwp_interrupt(cpudata); 2071 2072 if (cpudata->epp_default >= 0) 2073 return; 2074 2075 intel_pstate_update_epp_defaults(cpudata); 2076 } 2077 2078 static int atom_get_min_pstate(int not_used) 2079 { 2080 u64 value; 2081 2082 rdmsrq(MSR_ATOM_CORE_RATIOS, value); 2083 return (value >> 8) & 0x7F; 2084 } 2085 2086 static int atom_get_max_pstate(int not_used) 2087 { 2088 u64 value; 2089 2090 rdmsrq(MSR_ATOM_CORE_RATIOS, value); 2091 return (value >> 16) & 0x7F; 2092 } 2093 2094 static int atom_get_turbo_pstate(int not_used) 2095 { 2096 u64 value; 2097 2098 rdmsrq(MSR_ATOM_CORE_TURBO_RATIOS, value); 2099 return value & 0x7F; 2100 } 2101 2102 static u64 atom_get_val(struct cpudata *cpudata, int pstate) 2103 { 2104 u64 val; 2105 int32_t vid_fp; 2106 u32 vid; 2107 2108 val = (u64)pstate << 8; 2109 if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) 2110 val |= (u64)1 << 32; 2111 2112 vid_fp = cpudata->vid.min + mul_fp( 2113 int_tofp(pstate - cpudata->pstate.min_pstate), 2114 cpudata->vid.ratio); 2115 2116 vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); 2117 vid = ceiling_fp(vid_fp); 2118 2119 if (pstate > cpudata->pstate.max_pstate) 2120 vid = cpudata->vid.turbo; 2121 2122 return val | vid; 2123 } 2124 2125 static int silvermont_get_scaling(void) 2126 { 2127 u64 value; 2128 int i; 2129 /* Defined in Table 35-6 from SDM (Sept 2015) */ 2130 static int silvermont_freq_table[] = { 2131 83300, 100000, 133300, 116700, 80000}; 2132 2133 rdmsrq(MSR_FSB_FREQ, value); 2134 i = value & 0x7; 2135 WARN_ON(i > 4); 2136 2137 return silvermont_freq_table[i]; 2138 } 2139 2140 static int airmont_get_scaling(void) 2141 { 2142 u64 value; 2143 int i; 2144 /* Defined in Table 35-10 from SDM (Sept 2015) */ 2145 static int airmont_freq_table[] = { 2146 83300, 100000, 133300, 116700, 80000, 2147 93300, 90000, 88900, 87500}; 2148 2149 rdmsrq(MSR_FSB_FREQ, value); 2150 i = value & 0xF; 2151 WARN_ON(i > 8); 2152 2153 return airmont_freq_table[i]; 2154 } 2155 2156 static void atom_get_vid(struct cpudata *cpudata) 2157 { 2158 u64 value; 2159 2160 
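	/*
	 * As decoded below, bits [14:8] and [22:16] of MSR_ATOM_CORE_VIDS
	 * hold the VID values for the lowest and highest non-turbo P-states,
	 * and the turbo VID is read separately from MSR_ATOM_CORE_TURBO_VIDS.
	 */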
rdmsrq(MSR_ATOM_CORE_VIDS, value); 2161 cpudata->vid.min = int_tofp((value >> 8) & 0x7f); 2162 cpudata->vid.max = int_tofp((value >> 16) & 0x7f); 2163 cpudata->vid.ratio = div_fp( 2164 cpudata->vid.max - cpudata->vid.min, 2165 int_tofp(cpudata->pstate.max_pstate - 2166 cpudata->pstate.min_pstate)); 2167 2168 rdmsrq(MSR_ATOM_CORE_TURBO_VIDS, value); 2169 cpudata->vid.turbo = value & 0x7f; 2170 } 2171 2172 static int core_get_min_pstate(int cpu) 2173 { 2174 u64 value; 2175 2176 rdmsrq_on_cpu(cpu, MSR_PLATFORM_INFO, &value); 2177 return (value >> 40) & 0xFF; 2178 } 2179 2180 static int core_get_max_pstate_physical(int cpu) 2181 { 2182 u64 value; 2183 2184 rdmsrq_on_cpu(cpu, MSR_PLATFORM_INFO, &value); 2185 return (value >> 8) & 0xFF; 2186 } 2187 2188 static int core_get_tdp_ratio(int cpu, u64 plat_info) 2189 { 2190 /* Check how many TDP levels present */ 2191 if (plat_info & 0x600000000) { 2192 u64 tdp_ctrl; 2193 u64 tdp_ratio; 2194 int tdp_msr; 2195 int err; 2196 2197 /* Get the TDP level (0, 1, 2) to get ratios */ 2198 err = rdmsrq_safe_on_cpu(cpu, MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); 2199 if (err) 2200 return err; 2201 2202 /* TDP MSR are continuous starting at 0x648 */ 2203 tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03); 2204 err = rdmsrq_safe_on_cpu(cpu, tdp_msr, &tdp_ratio); 2205 if (err) 2206 return err; 2207 2208 /* For level 1 and 2, bits[23:16] contain the ratio */ 2209 if (tdp_ctrl & 0x03) 2210 tdp_ratio >>= 16; 2211 2212 tdp_ratio &= 0xff; /* ratios are only 8 bits long */ 2213 pr_debug("tdp_ratio %x\n", (int)tdp_ratio); 2214 2215 return (int)tdp_ratio; 2216 } 2217 2218 return -ENXIO; 2219 } 2220 2221 static int core_get_max_pstate(int cpu) 2222 { 2223 u64 tar; 2224 u64 plat_info; 2225 int max_pstate; 2226 int tdp_ratio; 2227 int err; 2228 2229 rdmsrq_on_cpu(cpu, MSR_PLATFORM_INFO, &plat_info); 2230 max_pstate = (plat_info >> 8) & 0xFF; 2231 2232 tdp_ratio = core_get_tdp_ratio(cpu, plat_info); 2233 if (tdp_ratio <= 0) 2234 return max_pstate; 2235 2236 if (hwp_active) { 2237 /* Turbo activation ratio is not used on HWP platforms */ 2238 return tdp_ratio; 2239 } 2240 2241 err = rdmsrq_safe_on_cpu(cpu, MSR_TURBO_ACTIVATION_RATIO, &tar); 2242 if (!err) { 2243 int tar_levels; 2244 2245 /* Do some sanity checking for safety */ 2246 tar_levels = tar & 0xff; 2247 if (tdp_ratio - 1 == tar_levels) { 2248 max_pstate = tar_levels; 2249 pr_debug("max_pstate=TAC %x\n", max_pstate); 2250 } 2251 } 2252 2253 return max_pstate; 2254 } 2255 2256 static int core_get_turbo_pstate(int cpu) 2257 { 2258 u64 value; 2259 int nont, ret; 2260 2261 rdmsrq_on_cpu(cpu, MSR_TURBO_RATIO_LIMIT, &value); 2262 nont = core_get_max_pstate(cpu); 2263 ret = (value) & 255; 2264 if (ret <= nont) 2265 ret = nont; 2266 return ret; 2267 } 2268 2269 static u64 core_get_val(struct cpudata *cpudata, int pstate) 2270 { 2271 u64 val; 2272 2273 val = (u64)pstate << 8; 2274 if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) 2275 val |= (u64)1 << 32; 2276 2277 return val; 2278 } 2279 2280 static int knl_get_aperf_mperf_shift(void) 2281 { 2282 return 10; 2283 } 2284 2285 static int knl_get_turbo_pstate(int cpu) 2286 { 2287 u64 value; 2288 int nont, ret; 2289 2290 rdmsrq_on_cpu(cpu, MSR_TURBO_RATIO_LIMIT, &value); 2291 nont = core_get_max_pstate(cpu); 2292 ret = (((value) >> 8) & 0xFF); 2293 if (ret <= nont) 2294 ret = nont; 2295 return ret; 2296 } 2297 2298 static int hwp_get_cpu_scaling(int cpu) 2299 { 2300 if (hybrid_scaling_factor) { 2301 struct cpuinfo_x86 *c = &cpu_data(cpu); 2302 u8 cpu_type = 
c->topo.intel_type;

		/*
		 * Return the hybrid scaling factor for P-cores and use the
		 * default core scaling for E-cores.
		 */
		if (cpu_type == INTEL_CPU_TYPE_CORE)
			return hybrid_scaling_factor;

		if (cpu_type == INTEL_CPU_TYPE_ATOM)
			return core_get_scaling();
	}

	/* Use core scaling on non-hybrid systems. */
	if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
		return core_get_scaling();

	/*
	 * The system is hybrid, but the hybrid scaling factor is not known or
	 * the CPU type is not one of the above, so use CPPC to compute the
	 * scaling factor for this CPU.
	 */
	return intel_pstate_cppc_get_scaling(cpu);
}

static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
{
	trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
	cpu->pstate.current_pstate = pstate;
	/*
	 * Generally, there is no guarantee that this code will always run on
	 * the CPU being updated, so force the register update to run on the
	 * right CPU.
	 */
	wrmsrq_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
		      pstate_funcs.get_val(cpu, pstate));
}

static void intel_pstate_set_min_pstate(struct cpudata *cpu)
{
	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
}

static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
{
	int perf_ctl_max_phys = pstate_funcs.get_max_physical(cpu->cpu);
	int perf_ctl_scaling = pstate_funcs.get_scaling();

	cpu->pstate.min_pstate = pstate_funcs.get_min(cpu->cpu);
	cpu->pstate.max_pstate_physical = perf_ctl_max_phys;
	cpu->pstate.perf_ctl_scaling = perf_ctl_scaling;

	if (hwp_active && !hwp_mode_bdw) {
		__intel_pstate_get_hwp_cap(cpu);

		if (pstate_funcs.get_cpu_scaling) {
			cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
			if (cpu->pstate.scaling != perf_ctl_scaling) {
				intel_pstate_hybrid_hwp_adjust(cpu);
				hwp_is_hybrid = true;
			}
		} else {
			cpu->pstate.scaling = perf_ctl_scaling;
		}
		/*
		 * If the CPU is going online for the first time and it was
		 * offline initially, asym capacity scaling needs to be updated.
		 */
		hybrid_update_capacity(cpu);
	} else {
		cpu->pstate.scaling = perf_ctl_scaling;
		cpu->pstate.max_pstate = pstate_funcs.get_max(cpu->cpu);
		cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(cpu->cpu);
	}

	if (cpu->pstate.scaling == perf_ctl_scaling) {
		cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
		cpu->pstate.max_freq = cpu->pstate.max_pstate * perf_ctl_scaling;
		cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * perf_ctl_scaling;
	}

	if (pstate_funcs.get_aperf_mperf_shift)
		cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();

	if (pstate_funcs.get_vid)
		pstate_funcs.get_vid(cpu);

	intel_pstate_set_min_pstate(cpu);
}

/*
 * A long hold time keeps the high performance limits in place for a long
 * time, which negatively impacts perf/watt for some workloads, like
 * specpower. 3 ms is based on experiments on some workloads.
 */
static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;

static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
{
	u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
	u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
	u32 max_limit = (hwp_req & 0xff00) >> 8;
	u32 min_limit = (hwp_req & 0xff);
	u32 boost_level1;

	/*
	 * Cases to consider (User changes via sysfs or boot time):
	 * If, P0 (Turbo max) = P1 (Guaranteed max) = min:
	 *	No boost, return.
	 * If, P0 (Turbo max) > P1 (Guaranteed max) = min:
	 *	Should result in one level boost only for P0.
	 * If, P0 (Turbo max) = P1 (Guaranteed max) > min:
	 *	Should result in two level boost:
	 *		(min + p1)/2 and P1.
	 * If, P0 (Turbo max) > P1 (Guaranteed max) > min:
	 *	Should result in three level boost:
	 *		(min + p1)/2, P1 and P0.
	 */

	/* If max and min are equal or already at max, nothing to boost */
	if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit)
		return;

	if (!cpu->hwp_boost_min)
		cpu->hwp_boost_min = min_limit;

	/* Boost level at the halfway mark between min and guaranteed */
	boost_level1 = (HWP_GUARANTEED_PERF(hwp_cap) + min_limit) >> 1;

	if (cpu->hwp_boost_min < boost_level1)
		cpu->hwp_boost_min = boost_level1;
	else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(hwp_cap))
		cpu->hwp_boost_min = HWP_GUARANTEED_PERF(hwp_cap);
	else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(hwp_cap) &&
		 max_limit != HWP_GUARANTEED_PERF(hwp_cap))
		cpu->hwp_boost_min = max_limit;
	else
		return;

	hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min;
	wrmsrq(MSR_HWP_REQUEST, hwp_req);
	cpu->last_update = cpu->sample.time;
}

static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu)
{
	if (cpu->hwp_boost_min) {
		bool expired;

		/* Check if we are idle for hold time to boost down */
		expired = time_after64(cpu->sample.time, cpu->last_update +
				       hwp_boost_hold_time_ns);
		if (expired) {
			wrmsrq(MSR_HWP_REQUEST, cpu->hwp_req_cached);
			cpu->hwp_boost_min = 0;
		}
	}
	cpu->last_update = cpu->sample.time;
}

static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
						      u64 time)
{
	cpu->sample.time = time;

	if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
		bool do_io = false;

		cpu->sched_flags = 0;
		/*
		 * Set iowait_boost flag and update time. Since the IO WAIT
		 * flag is set all the time, a single occurrence is not enough
		 * to conclude that some IO-bound activity is scheduled on
		 * this CPU. If we receive at least two in two consecutive
		 * ticks, then we treat the CPU as a boost candidate.
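		 * For illustration (assuming HZ=1000, so TICK_NSEC is one
		 * millisecond): two SCHED_CPUFREQ_IOWAIT updates arriving
		 * within 2 ms of each other make do_io true below and cause
		 * intel_pstate_hwp_boost_up() to be called.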
2478 */ 2479 if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) 2480 do_io = true; 2481 2482 cpu->last_io_update = time; 2483 2484 if (do_io) 2485 intel_pstate_hwp_boost_up(cpu); 2486 2487 } else { 2488 intel_pstate_hwp_boost_down(cpu); 2489 } 2490 } 2491 2492 static inline void intel_pstate_update_util_hwp(struct update_util_data *data, 2493 u64 time, unsigned int flags) 2494 { 2495 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 2496 2497 cpu->sched_flags |= flags; 2498 2499 if (smp_processor_id() == cpu->cpu) 2500 intel_pstate_update_util_hwp_local(cpu, time); 2501 } 2502 2503 static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) 2504 { 2505 struct sample *sample = &cpu->sample; 2506 2507 sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf); 2508 } 2509 2510 static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) 2511 { 2512 u64 aperf, mperf; 2513 unsigned long flags; 2514 u64 tsc; 2515 2516 local_irq_save(flags); 2517 rdmsrq(MSR_IA32_APERF, aperf); 2518 rdmsrq(MSR_IA32_MPERF, mperf); 2519 tsc = rdtsc(); 2520 if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) { 2521 local_irq_restore(flags); 2522 return false; 2523 } 2524 local_irq_restore(flags); 2525 2526 cpu->last_sample_time = cpu->sample.time; 2527 cpu->sample.time = time; 2528 cpu->sample.aperf = aperf; 2529 cpu->sample.mperf = mperf; 2530 cpu->sample.tsc = tsc; 2531 cpu->sample.aperf -= cpu->prev_aperf; 2532 cpu->sample.mperf -= cpu->prev_mperf; 2533 cpu->sample.tsc -= cpu->prev_tsc; 2534 2535 cpu->prev_aperf = aperf; 2536 cpu->prev_mperf = mperf; 2537 cpu->prev_tsc = tsc; 2538 /* 2539 * First time this function is invoked in a given cycle, all of the 2540 * previous sample data fields are equal to zero or stale and they must 2541 * be populated with meaningful numbers for things to work, so assume 2542 * that sample.time will always be reset before setting the utilization 2543 * update hook and make the caller skip the sample then. 2544 */ 2545 if (likely(cpu->last_sample_time)) { 2546 intel_pstate_calc_avg_perf(cpu); 2547 return true; 2548 } 2549 return false; 2550 } 2551 2552 static inline int32_t get_avg_frequency(struct cpudata *cpu) 2553 { 2554 return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz); 2555 } 2556 2557 static inline int32_t get_avg_pstate(struct cpudata *cpu) 2558 { 2559 return mul_ext_fp(cpu->pstate.max_pstate_physical, 2560 cpu->sample.core_avg_perf); 2561 } 2562 2563 static inline int32_t get_target_pstate(struct cpudata *cpu) 2564 { 2565 struct sample *sample = &cpu->sample; 2566 int32_t busy_frac; 2567 int target, avg_pstate; 2568 2569 busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, 2570 sample->tsc); 2571 2572 if (busy_frac < cpu->iowait_boost) 2573 busy_frac = cpu->iowait_boost; 2574 2575 sample->busy_scaled = busy_frac * 100; 2576 2577 target = READ_ONCE(global.no_turbo) ? 2578 cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; 2579 target += target >> 2; 2580 target = mul_fp(target, busy_frac); 2581 if (target < cpu->pstate.min_pstate) 2582 target = cpu->pstate.min_pstate; 2583 2584 /* 2585 * If the average P-state during the previous cycle was higher than the 2586 * current target, add 50% of the difference to the target to reduce 2587 * possible performance oscillations and offset possible performance 2588 * loss related to moving the workload from one CPU to another within 2589 * a package/module. 
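	 * For example (hypothetical numbers): if the computed target is 20
	 * and the average P-state over the previous cycle was 30, the
	 * request below becomes 20 + (30 - 20) / 2 = 25.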
2590 */ 2591 avg_pstate = get_avg_pstate(cpu); 2592 if (avg_pstate > target) 2593 target += (avg_pstate - target) >> 1; 2594 2595 return target; 2596 } 2597 2598 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) 2599 { 2600 int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio); 2601 int max_pstate = max(min_pstate, cpu->max_perf_ratio); 2602 2603 return clamp_t(int, pstate, min_pstate, max_pstate); 2604 } 2605 2606 static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) 2607 { 2608 if (pstate == cpu->pstate.current_pstate) 2609 return; 2610 2611 cpu->pstate.current_pstate = pstate; 2612 wrmsrq(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); 2613 } 2614 2615 static void intel_pstate_adjust_pstate(struct cpudata *cpu) 2616 { 2617 int from = cpu->pstate.current_pstate; 2618 struct sample *sample; 2619 int target_pstate; 2620 2621 target_pstate = get_target_pstate(cpu); 2622 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 2623 trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu); 2624 intel_pstate_update_pstate(cpu, target_pstate); 2625 2626 sample = &cpu->sample; 2627 trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf), 2628 fp_toint(sample->busy_scaled), 2629 from, 2630 cpu->pstate.current_pstate, 2631 sample->mperf, 2632 sample->aperf, 2633 sample->tsc, 2634 get_avg_frequency(cpu), 2635 fp_toint(cpu->iowait_boost * 100)); 2636 } 2637 2638 static void intel_pstate_update_util(struct update_util_data *data, u64 time, 2639 unsigned int flags) 2640 { 2641 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 2642 u64 delta_ns; 2643 2644 /* Don't allow remote callbacks */ 2645 if (smp_processor_id() != cpu->cpu) 2646 return; 2647 2648 delta_ns = time - cpu->last_update; 2649 if (flags & SCHED_CPUFREQ_IOWAIT) { 2650 /* Start over if the CPU may have been idle. */ 2651 if (delta_ns > TICK_NSEC) { 2652 cpu->iowait_boost = ONE_EIGHTH_FP; 2653 } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) { 2654 cpu->iowait_boost <<= 1; 2655 if (cpu->iowait_boost > int_tofp(1)) 2656 cpu->iowait_boost = int_tofp(1); 2657 } else { 2658 cpu->iowait_boost = ONE_EIGHTH_FP; 2659 } 2660 } else if (cpu->iowait_boost) { 2661 /* Clear iowait_boost if the CPU may have been idle. 
*/ 2662 if (delta_ns > TICK_NSEC) 2663 cpu->iowait_boost = 0; 2664 else 2665 cpu->iowait_boost >>= 1; 2666 } 2667 cpu->last_update = time; 2668 delta_ns = time - cpu->sample.time; 2669 if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL) 2670 return; 2671 2672 if (intel_pstate_sample(cpu, time)) 2673 intel_pstate_adjust_pstate(cpu); 2674 } 2675 2676 static struct pstate_funcs core_funcs = { 2677 .get_max = core_get_max_pstate, 2678 .get_max_physical = core_get_max_pstate_physical, 2679 .get_min = core_get_min_pstate, 2680 .get_turbo = core_get_turbo_pstate, 2681 .get_scaling = core_get_scaling, 2682 .get_val = core_get_val, 2683 }; 2684 2685 static const struct pstate_funcs silvermont_funcs = { 2686 .get_max = atom_get_max_pstate, 2687 .get_max_physical = atom_get_max_pstate, 2688 .get_min = atom_get_min_pstate, 2689 .get_turbo = atom_get_turbo_pstate, 2690 .get_val = atom_get_val, 2691 .get_scaling = silvermont_get_scaling, 2692 .get_vid = atom_get_vid, 2693 }; 2694 2695 static const struct pstate_funcs airmont_funcs = { 2696 .get_max = atom_get_max_pstate, 2697 .get_max_physical = atom_get_max_pstate, 2698 .get_min = atom_get_min_pstate, 2699 .get_turbo = atom_get_turbo_pstate, 2700 .get_val = atom_get_val, 2701 .get_scaling = airmont_get_scaling, 2702 .get_vid = atom_get_vid, 2703 }; 2704 2705 static const struct pstate_funcs knl_funcs = { 2706 .get_max = core_get_max_pstate, 2707 .get_max_physical = core_get_max_pstate_physical, 2708 .get_min = core_get_min_pstate, 2709 .get_turbo = knl_get_turbo_pstate, 2710 .get_aperf_mperf_shift = knl_get_aperf_mperf_shift, 2711 .get_scaling = core_get_scaling, 2712 .get_val = core_get_val, 2713 }; 2714 2715 #define X86_MATCH(vfm, policy) \ 2716 X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, &policy) 2717 2718 static const struct x86_cpu_id intel_pstate_cpu_ids[] = { 2719 X86_MATCH(INTEL_SANDYBRIDGE, core_funcs), 2720 X86_MATCH(INTEL_SANDYBRIDGE_X, core_funcs), 2721 X86_MATCH(INTEL_ATOM_SILVERMONT, silvermont_funcs), 2722 X86_MATCH(INTEL_IVYBRIDGE, core_funcs), 2723 X86_MATCH(INTEL_HASWELL, core_funcs), 2724 X86_MATCH(INTEL_BROADWELL, core_funcs), 2725 X86_MATCH(INTEL_IVYBRIDGE_X, core_funcs), 2726 X86_MATCH(INTEL_HASWELL_X, core_funcs), 2727 X86_MATCH(INTEL_HASWELL_L, core_funcs), 2728 X86_MATCH(INTEL_HASWELL_G, core_funcs), 2729 X86_MATCH(INTEL_BROADWELL_G, core_funcs), 2730 X86_MATCH(INTEL_ATOM_AIRMONT, airmont_funcs), 2731 X86_MATCH(INTEL_SKYLAKE_L, core_funcs), 2732 X86_MATCH(INTEL_BROADWELL_X, core_funcs), 2733 X86_MATCH(INTEL_SKYLAKE, core_funcs), 2734 X86_MATCH(INTEL_BROADWELL_D, core_funcs), 2735 X86_MATCH(INTEL_XEON_PHI_KNL, knl_funcs), 2736 X86_MATCH(INTEL_XEON_PHI_KNM, knl_funcs), 2737 X86_MATCH(INTEL_ATOM_GOLDMONT, core_funcs), 2738 X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS, core_funcs), 2739 X86_MATCH(INTEL_SKYLAKE_X, core_funcs), 2740 X86_MATCH(INTEL_COMETLAKE, core_funcs), 2741 X86_MATCH(INTEL_ICELAKE_X, core_funcs), 2742 X86_MATCH(INTEL_TIGERLAKE, core_funcs), 2743 X86_MATCH(INTEL_SAPPHIRERAPIDS_X, core_funcs), 2744 X86_MATCH(INTEL_EMERALDRAPIDS_X, core_funcs), 2745 X86_MATCH(INTEL_GRANITERAPIDS_D, core_funcs), 2746 X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs), 2747 {} 2748 }; 2749 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); 2750 2751 #ifdef CONFIG_ACPI 2752 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { 2753 X86_MATCH(INTEL_BROADWELL_D, core_funcs), 2754 X86_MATCH(INTEL_BROADWELL_X, core_funcs), 2755 X86_MATCH(INTEL_SKYLAKE_X, core_funcs), 2756 X86_MATCH(INTEL_ICELAKE_X, core_funcs), 2757 
X86_MATCH(INTEL_SAPPHIRERAPIDS_X, core_funcs), 2758 X86_MATCH(INTEL_EMERALDRAPIDS_X, core_funcs), 2759 X86_MATCH(INTEL_GRANITERAPIDS_D, core_funcs), 2760 X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs), 2761 X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs), 2762 X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs), 2763 X86_MATCH(INTEL_ATOM_DARKMONT_X, core_funcs), 2764 {} 2765 }; 2766 #endif 2767 2768 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { 2769 X86_MATCH(INTEL_KABYLAKE, core_funcs), 2770 {} 2771 }; 2772 2773 static int intel_pstate_init_cpu(unsigned int cpunum) 2774 { 2775 struct cpudata *cpu; 2776 2777 cpu = all_cpu_data[cpunum]; 2778 2779 if (!cpu) { 2780 cpu = kzalloc(sizeof(*cpu), GFP_KERNEL); 2781 if (!cpu) 2782 return -ENOMEM; 2783 2784 WRITE_ONCE(all_cpu_data[cpunum], cpu); 2785 2786 cpu->cpu = cpunum; 2787 2788 cpu->epp_default = -EINVAL; 2789 2790 if (hwp_active) { 2791 intel_pstate_hwp_enable(cpu); 2792 2793 if (intel_pstate_acpi_pm_profile_server()) 2794 hwp_boost = true; 2795 } 2796 } else if (hwp_active) { 2797 /* 2798 * Re-enable HWP in case this happens after a resume from ACPI 2799 * S3 if the CPU was offline during the whole system/resume 2800 * cycle. 2801 */ 2802 intel_pstate_hwp_reenable(cpu); 2803 } 2804 2805 cpu->epp_powersave = -EINVAL; 2806 cpu->epp_policy = CPUFREQ_POLICY_UNKNOWN; 2807 2808 intel_pstate_get_cpu_pstates(cpu); 2809 2810 pr_debug("controlling: cpu %d\n", cpunum); 2811 2812 return 0; 2813 } 2814 2815 static void intel_pstate_set_update_util_hook(unsigned int cpu_num) 2816 { 2817 struct cpudata *cpu = all_cpu_data[cpu_num]; 2818 2819 if (hwp_active && !hwp_boost) 2820 return; 2821 2822 if (cpu->update_util_set) 2823 return; 2824 2825 /* Prevent intel_pstate_update_util() from using stale data. */ 2826 cpu->sample.time = 0; 2827 cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, 2828 (hwp_active ? 2829 intel_pstate_update_util_hwp : 2830 intel_pstate_update_util)); 2831 cpu->update_util_set = true; 2832 } 2833 2834 static void intel_pstate_clear_update_util_hook(unsigned int cpu) 2835 { 2836 struct cpudata *cpu_data = all_cpu_data[cpu]; 2837 2838 if (!cpu_data->update_util_set) 2839 return; 2840 2841 cpufreq_remove_update_util_hook(cpu); 2842 cpu_data->update_util_set = false; 2843 synchronize_rcu(); 2844 } 2845 2846 static int intel_pstate_get_max_freq(struct cpudata *cpu) 2847 { 2848 return READ_ONCE(global.no_turbo) ? 2849 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2850 } 2851 2852 static void intel_pstate_update_perf_limits(struct cpudata *cpu, 2853 unsigned int policy_min, 2854 unsigned int policy_max) 2855 { 2856 int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling; 2857 int32_t max_policy_perf, min_policy_perf; 2858 2859 max_policy_perf = policy_max / perf_ctl_scaling; 2860 if (policy_max == policy_min) { 2861 min_policy_perf = max_policy_perf; 2862 } else { 2863 min_policy_perf = policy_min / perf_ctl_scaling; 2864 min_policy_perf = clamp_t(int32_t, min_policy_perf, 2865 0, max_policy_perf); 2866 } 2867 2868 /* 2869 * HWP needs some special consideration, because HWP_REQUEST uses 2870 * abstract values to represent performance rather than pure ratios. 
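	 * The policy limits are therefore converted back to frequencies with
	 * the PERF_CTL scaling factor and then mapped to HWP performance
	 * levels via intel_pstate_freq_to_hwp() below.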
2871 */ 2872 if (hwp_active && cpu->pstate.scaling != perf_ctl_scaling) { 2873 int freq; 2874 2875 freq = max_policy_perf * perf_ctl_scaling; 2876 max_policy_perf = intel_pstate_freq_to_hwp(cpu, freq); 2877 freq = min_policy_perf * perf_ctl_scaling; 2878 min_policy_perf = intel_pstate_freq_to_hwp(cpu, freq); 2879 } 2880 2881 pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n", 2882 cpu->cpu, min_policy_perf, max_policy_perf); 2883 2884 /* Normalize user input to [min_perf, max_perf] */ 2885 if (per_cpu_limits) { 2886 cpu->min_perf_ratio = min_policy_perf; 2887 cpu->max_perf_ratio = max_policy_perf; 2888 } else { 2889 int turbo_max = cpu->pstate.turbo_pstate; 2890 int32_t global_min, global_max; 2891 2892 /* Global limits are in percent of the maximum turbo P-state. */ 2893 global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100); 2894 global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); 2895 global_min = clamp_t(int32_t, global_min, 0, global_max); 2896 2897 pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu, 2898 global_min, global_max); 2899 2900 cpu->min_perf_ratio = max(min_policy_perf, global_min); 2901 cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf); 2902 cpu->max_perf_ratio = min(max_policy_perf, global_max); 2903 cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio); 2904 2905 /* Make sure min_perf <= max_perf */ 2906 cpu->min_perf_ratio = min(cpu->min_perf_ratio, 2907 cpu->max_perf_ratio); 2908 2909 } 2910 pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu, 2911 cpu->max_perf_ratio, 2912 cpu->min_perf_ratio); 2913 } 2914 2915 static int intel_pstate_set_policy(struct cpufreq_policy *policy) 2916 { 2917 struct cpudata *cpu; 2918 2919 if (!policy->cpuinfo.max_freq) 2920 return -ENODEV; 2921 2922 pr_debug("set_policy cpuinfo.max %u policy->max %u\n", 2923 policy->cpuinfo.max_freq, policy->max); 2924 2925 cpu = all_cpu_data[policy->cpu]; 2926 cpu->policy = policy->policy; 2927 2928 mutex_lock(&intel_pstate_limits_lock); 2929 2930 intel_pstate_update_perf_limits(cpu, policy->min, policy->max); 2931 2932 if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { 2933 int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); 2934 2935 /* 2936 * NOHZ_FULL CPUs need this as the governor callback may not 2937 * be invoked on them. 2938 */ 2939 intel_pstate_clear_update_util_hook(policy->cpu); 2940 intel_pstate_set_pstate(cpu, pstate); 2941 } else { 2942 intel_pstate_set_update_util_hook(policy->cpu); 2943 } 2944 2945 if (hwp_active) { 2946 /* 2947 * When hwp_boost was active before and dynamically it 2948 * was turned off, in that case we need to clear the 2949 * update util hook. 2950 */ 2951 if (!hwp_boost) 2952 intel_pstate_clear_update_util_hook(policy->cpu); 2953 intel_pstate_hwp_set(policy->cpu); 2954 } 2955 /* 2956 * policy->cur is never updated with the intel_pstate driver, but it 2957 * is used as a stale frequency value. So, keep it within limits. 
2958 */ 2959 policy->cur = policy->min; 2960 2961 mutex_unlock(&intel_pstate_limits_lock); 2962 2963 return 0; 2964 } 2965 2966 static void intel_pstate_adjust_policy_max(struct cpudata *cpu, 2967 struct cpufreq_policy_data *policy) 2968 { 2969 if (!hwp_active && 2970 cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && 2971 policy->max < policy->cpuinfo.max_freq && 2972 policy->max > cpu->pstate.max_freq) { 2973 pr_debug("policy->max > max non turbo frequency\n"); 2974 policy->max = policy->cpuinfo.max_freq; 2975 } 2976 } 2977 2978 static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, 2979 struct cpufreq_policy_data *policy) 2980 { 2981 int max_freq; 2982 2983 if (hwp_active) { 2984 intel_pstate_get_hwp_cap(cpu); 2985 max_freq = READ_ONCE(global.no_turbo) ? 2986 cpu->pstate.max_freq : cpu->pstate.turbo_freq; 2987 } else { 2988 max_freq = intel_pstate_get_max_freq(cpu); 2989 } 2990 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq); 2991 2992 intel_pstate_adjust_policy_max(cpu, policy); 2993 } 2994 2995 static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) 2996 { 2997 intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy); 2998 2999 return 0; 3000 } 3001 3002 static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy) 3003 { 3004 struct cpudata *cpu = all_cpu_data[policy->cpu]; 3005 3006 pr_debug("CPU %d going offline\n", cpu->cpu); 3007 3008 if (cpu->suspended) 3009 return 0; 3010 3011 /* 3012 * If the CPU is an SMT thread and it goes offline with the performance 3013 * settings different from the minimum, it will prevent its sibling 3014 * from getting to lower performance levels, so force the minimum 3015 * performance on CPU offline to prevent that from happening. 3016 */ 3017 if (hwp_active) 3018 intel_pstate_hwp_offline(cpu); 3019 else 3020 intel_pstate_set_min_pstate(cpu); 3021 3022 intel_pstate_exit_perf_limits(policy); 3023 3024 return 0; 3025 } 3026 3027 static int intel_pstate_cpu_online(struct cpufreq_policy *policy) 3028 { 3029 struct cpudata *cpu = all_cpu_data[policy->cpu]; 3030 3031 pr_debug("CPU %d going online\n", cpu->cpu); 3032 3033 intel_pstate_init_acpi_perf_limits(policy); 3034 3035 if (hwp_active) { 3036 /* 3037 * Re-enable HWP and clear the "suspended" flag to let "resume" 3038 * know that it need not do that. 3039 */ 3040 intel_pstate_hwp_reenable(cpu); 3041 cpu->suspended = false; 3042 3043 hybrid_update_capacity(cpu); 3044 } 3045 3046 return 0; 3047 } 3048 3049 static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) 3050 { 3051 intel_pstate_clear_update_util_hook(policy->cpu); 3052 3053 return intel_cpufreq_cpu_offline(policy); 3054 } 3055 3056 static void intel_pstate_cpu_exit(struct cpufreq_policy *policy) 3057 { 3058 pr_debug("CPU %d exiting\n", policy->cpu); 3059 3060 policy->fast_switch_possible = false; 3061 } 3062 3063 static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) 3064 { 3065 struct cpudata *cpu; 3066 int rc; 3067 3068 rc = intel_pstate_init_cpu(policy->cpu); 3069 if (rc) 3070 return rc; 3071 3072 cpu = all_cpu_data[policy->cpu]; 3073 3074 cpu->max_perf_ratio = 0xFF; 3075 cpu->min_perf_ratio = 0; 3076 3077 /* cpuinfo and default policy values */ 3078 policy->cpuinfo.min_freq = cpu->pstate.min_freq; 3079 policy->cpuinfo.max_freq = READ_ONCE(global.no_turbo) ? 
			cpu->pstate.max_freq : cpu->pstate.turbo_freq;

	policy->min = policy->cpuinfo.min_freq;
	policy->max = policy->cpuinfo.max_freq;

	intel_pstate_init_acpi_perf_limits(policy);

	policy->fast_switch_possible = true;

	return 0;
}

static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
{
	int ret = __intel_pstate_cpu_init(policy);

	if (ret)
		return ret;

	/*
	 * Set the policy to powersave to provide a valid fallback value in case
	 * the default cpufreq governor is neither powersave nor performance.
	 */
	policy->policy = CPUFREQ_POLICY_POWERSAVE;

	if (hwp_active) {
		struct cpudata *cpu = all_cpu_data[policy->cpu];

		cpu->epp_cached = intel_pstate_get_epp(cpu, 0);
	}

	return 0;
}

static struct cpufreq_driver intel_pstate = {
	.flags		= CPUFREQ_CONST_LOOPS,
	.verify		= intel_pstate_verify_policy,
	.setpolicy	= intel_pstate_set_policy,
	.suspend	= intel_pstate_suspend,
	.resume		= intel_pstate_resume,
	.init		= intel_pstate_cpu_init,
	.exit		= intel_pstate_cpu_exit,
	.offline	= intel_pstate_cpu_offline,
	.online		= intel_pstate_cpu_online,
	.update_limits	= intel_pstate_update_limits,
	.name		= "intel_pstate",
};

static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
{
	struct cpudata *cpu = all_cpu_data[policy->cpu];

	intel_pstate_verify_cpu_policy(cpu, policy);
	intel_pstate_update_perf_limits(cpu, policy->min, policy->max);

	return 0;
}

/* Use of trace in passive mode:
 *
 * In passive mode the trace core_busy field (also known as the
 * performance field, and labelled as such on the graphs; also known as
 * core_avg_perf) is not needed and so is re-assigned to indicate if the
 * driver call was via the normal or fast switch path. Various graphs
 * output from the intel_pstate_tracer.py utility that include core_busy
 * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
 * so we use 10 to indicate the normal path through the driver, and
 * 90 to indicate the fast switch path through the driver.
 * The scaled_busy field is not used, and is set to 0.
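 * The two values correspond to INTEL_PSTATE_TRACE_TARGET and
 * INTEL_PSTATE_TRACE_FAST_SWITCH defined right below.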
3149 */ 3150 3151 #define INTEL_PSTATE_TRACE_TARGET 10 3152 #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 3153 3154 static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) 3155 { 3156 struct sample *sample; 3157 3158 if (!trace_pstate_sample_enabled()) 3159 return; 3160 3161 if (!intel_pstate_sample(cpu, ktime_get())) 3162 return; 3163 3164 sample = &cpu->sample; 3165 trace_pstate_sample(trace_type, 3166 0, 3167 old_pstate, 3168 cpu->pstate.current_pstate, 3169 sample->mperf, 3170 sample->aperf, 3171 sample->tsc, 3172 get_avg_frequency(cpu), 3173 fp_toint(cpu->iowait_boost * 100)); 3174 } 3175 3176 static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max, 3177 u32 desired, bool fast_switch) 3178 { 3179 u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev; 3180 3181 value &= ~HWP_MIN_PERF(~0L); 3182 value |= HWP_MIN_PERF(min); 3183 3184 value &= ~HWP_MAX_PERF(~0L); 3185 value |= HWP_MAX_PERF(max); 3186 3187 value &= ~HWP_DESIRED_PERF(~0L); 3188 value |= HWP_DESIRED_PERF(desired); 3189 3190 if (value == prev) 3191 return; 3192 3193 WRITE_ONCE(cpu->hwp_req_cached, value); 3194 if (fast_switch) 3195 wrmsrq(MSR_HWP_REQUEST, value); 3196 else 3197 wrmsrq_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 3198 } 3199 3200 static void intel_cpufreq_perf_ctl_update(struct cpudata *cpu, 3201 u32 target_pstate, bool fast_switch) 3202 { 3203 if (fast_switch) 3204 wrmsrq(MSR_IA32_PERF_CTL, 3205 pstate_funcs.get_val(cpu, target_pstate)); 3206 else 3207 wrmsrq_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, 3208 pstate_funcs.get_val(cpu, target_pstate)); 3209 } 3210 3211 static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, 3212 int target_pstate, bool fast_switch) 3213 { 3214 struct cpudata *cpu = all_cpu_data[policy->cpu]; 3215 int old_pstate = cpu->pstate.current_pstate; 3216 3217 target_pstate = intel_pstate_prepare_request(cpu, target_pstate); 3218 if (hwp_active) { 3219 int max_pstate = policy->strict_target ? 3220 target_pstate : cpu->max_perf_ratio; 3221 3222 intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 3223 target_pstate, fast_switch); 3224 } else if (target_pstate != old_pstate) { 3225 intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch); 3226 } 3227 3228 cpu->pstate.current_pstate = target_pstate; 3229 3230 intel_cpufreq_trace(cpu, fast_switch ? 
INTEL_PSTATE_TRACE_FAST_SWITCH : 3231 INTEL_PSTATE_TRACE_TARGET, old_pstate); 3232 3233 return target_pstate; 3234 } 3235 3236 static int intel_cpufreq_target(struct cpufreq_policy *policy, 3237 unsigned int target_freq, 3238 unsigned int relation) 3239 { 3240 struct cpudata *cpu = all_cpu_data[policy->cpu]; 3241 struct cpufreq_freqs freqs; 3242 int target_pstate; 3243 3244 freqs.old = policy->cur; 3245 freqs.new = target_freq; 3246 3247 cpufreq_freq_transition_begin(policy, &freqs); 3248 3249 target_pstate = intel_pstate_freq_to_hwp_rel(cpu, freqs.new, relation); 3250 target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false); 3251 3252 freqs.new = target_pstate * cpu->pstate.scaling; 3253 3254 cpufreq_freq_transition_end(policy, &freqs, false); 3255 3256 return 0; 3257 } 3258 3259 static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, 3260 unsigned int target_freq) 3261 { 3262 struct cpudata *cpu = all_cpu_data[policy->cpu]; 3263 int target_pstate; 3264 3265 target_pstate = intel_pstate_freq_to_hwp(cpu, target_freq); 3266 3267 target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true); 3268 3269 return target_pstate * cpu->pstate.scaling; 3270 } 3271 3272 static void intel_cpufreq_adjust_perf(unsigned int cpunum, 3273 unsigned long min_perf, 3274 unsigned long target_perf, 3275 unsigned long capacity) 3276 { 3277 struct cpudata *cpu = all_cpu_data[cpunum]; 3278 u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); 3279 int old_pstate = cpu->pstate.current_pstate; 3280 int cap_pstate, min_pstate, max_pstate, target_pstate; 3281 3282 cap_pstate = READ_ONCE(global.no_turbo) ? 3283 HWP_GUARANTEED_PERF(hwp_cap) : 3284 HWP_HIGHEST_PERF(hwp_cap); 3285 3286 /* Optimization: Avoid unnecessary divisions. */ 3287 3288 target_pstate = cap_pstate; 3289 if (target_perf < capacity) 3290 target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity); 3291 3292 min_pstate = cap_pstate; 3293 if (min_perf < capacity) 3294 min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity); 3295 3296 if (min_pstate < cpu->pstate.min_pstate) 3297 min_pstate = cpu->pstate.min_pstate; 3298 3299 if (min_pstate < cpu->min_perf_ratio) 3300 min_pstate = cpu->min_perf_ratio; 3301 3302 if (min_pstate > cpu->max_perf_ratio) 3303 min_pstate = cpu->max_perf_ratio; 3304 3305 max_pstate = min(cap_pstate, cpu->max_perf_ratio); 3306 if (max_pstate < min_pstate) 3307 max_pstate = min_pstate; 3308 3309 target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate); 3310 3311 intel_cpufreq_hwp_update(cpu, min_pstate, max_pstate, target_pstate, true); 3312 3313 cpu->pstate.current_pstate = target_pstate; 3314 intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); 3315 } 3316 3317 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) 3318 { 3319 struct freq_qos_request *req; 3320 struct cpudata *cpu; 3321 struct device *dev; 3322 int ret, freq; 3323 3324 dev = get_cpu_device(policy->cpu); 3325 if (!dev) 3326 return -ENODEV; 3327 3328 ret = __intel_pstate_cpu_init(policy); 3329 if (ret) 3330 return ret; 3331 3332 policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; 3333 /* This reflects the intel_pstate_get_cpu_pstates() setting. 
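	 * (intel_pstate_get_cpu_pstates() leaves the CPU at its minimum
	 * P-state via intel_pstate_set_min_pstate(), so cpuinfo.min_freq is
	 * the appropriate initial value here)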
*/ 3334 policy->cur = policy->cpuinfo.min_freq; 3335 3336 req = kcalloc(2, sizeof(*req), GFP_KERNEL); 3337 if (!req) { 3338 ret = -ENOMEM; 3339 goto pstate_exit; 3340 } 3341 3342 cpu = all_cpu_data[policy->cpu]; 3343 3344 if (hwp_active) { 3345 u64 value; 3346 3347 policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; 3348 3349 intel_pstate_get_hwp_cap(cpu); 3350 3351 rdmsrq_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value); 3352 WRITE_ONCE(cpu->hwp_req_cached, value); 3353 3354 cpu->epp_cached = intel_pstate_get_epp(cpu, value); 3355 } else { 3356 policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; 3357 } 3358 3359 freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.min_perf_pct, 100); 3360 3361 ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN, 3362 freq); 3363 if (ret < 0) { 3364 dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); 3365 goto free_req; 3366 } 3367 3368 freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.max_perf_pct, 100); 3369 3370 ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX, 3371 freq); 3372 if (ret < 0) { 3373 dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); 3374 goto remove_min_req; 3375 } 3376 3377 policy->driver_data = req; 3378 3379 return 0; 3380 3381 remove_min_req: 3382 freq_qos_remove_request(req); 3383 free_req: 3384 kfree(req); 3385 pstate_exit: 3386 intel_pstate_exit_perf_limits(policy); 3387 3388 return ret; 3389 } 3390 3391 static void intel_cpufreq_cpu_exit(struct cpufreq_policy *policy) 3392 { 3393 struct freq_qos_request *req; 3394 3395 req = policy->driver_data; 3396 3397 freq_qos_remove_request(req + 1); 3398 freq_qos_remove_request(req); 3399 kfree(req); 3400 3401 intel_pstate_cpu_exit(policy); 3402 } 3403 3404 static int intel_cpufreq_suspend(struct cpufreq_policy *policy) 3405 { 3406 intel_pstate_suspend(policy); 3407 3408 if (hwp_active) { 3409 struct cpudata *cpu = all_cpu_data[policy->cpu]; 3410 u64 value = READ_ONCE(cpu->hwp_req_cached); 3411 3412 /* 3413 * Clear the desired perf field in MSR_HWP_REQUEST in case 3414 * intel_cpufreq_adjust_perf() is in use and the last value 3415 * written by it may not be suitable. 
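		 * With the desired performance field cleared, HWP is left to
		 * choose the operating point autonomously within the min/max
		 * limits that remain in the register.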
3416 */ 3417 value &= ~HWP_DESIRED_PERF(~0L); 3418 wrmsrq_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); 3419 WRITE_ONCE(cpu->hwp_req_cached, value); 3420 } 3421 3422 return 0; 3423 } 3424 3425 static struct cpufreq_driver intel_cpufreq = { 3426 .flags = CPUFREQ_CONST_LOOPS, 3427 .verify = intel_cpufreq_verify_policy, 3428 .target = intel_cpufreq_target, 3429 .fast_switch = intel_cpufreq_fast_switch, 3430 .init = intel_cpufreq_cpu_init, 3431 .exit = intel_cpufreq_cpu_exit, 3432 .offline = intel_cpufreq_cpu_offline, 3433 .online = intel_pstate_cpu_online, 3434 .suspend = intel_cpufreq_suspend, 3435 .resume = intel_pstate_resume, 3436 .update_limits = intel_pstate_update_limits, 3437 .name = "intel_cpufreq", 3438 }; 3439 3440 static struct cpufreq_driver *default_driver; 3441 3442 static void intel_pstate_driver_cleanup(void) 3443 { 3444 unsigned int cpu; 3445 3446 cpus_read_lock(); 3447 for_each_online_cpu(cpu) { 3448 if (all_cpu_data[cpu]) { 3449 if (intel_pstate_driver == &intel_pstate) 3450 intel_pstate_clear_update_util_hook(cpu); 3451 3452 kfree(all_cpu_data[cpu]); 3453 WRITE_ONCE(all_cpu_data[cpu], NULL); 3454 } 3455 } 3456 cpus_read_unlock(); 3457 3458 intel_pstate_driver = NULL; 3459 } 3460 3461 static int intel_pstate_register_driver(struct cpufreq_driver *driver) 3462 { 3463 bool refresh_cpu_cap_scaling; 3464 int ret; 3465 3466 if (driver == &intel_pstate) 3467 intel_pstate_sysfs_expose_hwp_dynamic_boost(); 3468 3469 memset(&global, 0, sizeof(global)); 3470 global.max_perf_pct = 100; 3471 global.turbo_disabled = turbo_is_disabled(); 3472 global.no_turbo = global.turbo_disabled; 3473 3474 arch_set_max_freq_ratio(global.turbo_disabled); 3475 3476 refresh_cpu_cap_scaling = hybrid_clear_max_perf_cpu(); 3477 3478 intel_pstate_driver = driver; 3479 ret = cpufreq_register_driver(intel_pstate_driver); 3480 if (ret) { 3481 intel_pstate_driver_cleanup(); 3482 return ret; 3483 } 3484 3485 global.min_perf_pct = min_perf_pct_min(); 3486 3487 hybrid_init_cpu_capacity_scaling(refresh_cpu_cap_scaling); 3488 3489 return 0; 3490 } 3491 3492 static ssize_t intel_pstate_show_status(char *buf) 3493 { 3494 if (!intel_pstate_driver) 3495 return sprintf(buf, "off\n"); 3496 3497 return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ? 
3498 "active" : "passive"); 3499 } 3500 3501 static int intel_pstate_update_status(const char *buf, size_t size) 3502 { 3503 if (size == 3 && !strncmp(buf, "off", size)) { 3504 if (!intel_pstate_driver) 3505 return -EINVAL; 3506 3507 if (hwp_active) 3508 return -EBUSY; 3509 3510 cpufreq_unregister_driver(intel_pstate_driver); 3511 intel_pstate_driver_cleanup(); 3512 return 0; 3513 } 3514 3515 if (size == 6 && !strncmp(buf, "active", size)) { 3516 if (intel_pstate_driver) { 3517 if (intel_pstate_driver == &intel_pstate) 3518 return 0; 3519 3520 cpufreq_unregister_driver(intel_pstate_driver); 3521 } 3522 3523 return intel_pstate_register_driver(&intel_pstate); 3524 } 3525 3526 if (size == 7 && !strncmp(buf, "passive", size)) { 3527 if (intel_pstate_driver) { 3528 if (intel_pstate_driver == &intel_cpufreq) 3529 return 0; 3530 3531 cpufreq_unregister_driver(intel_pstate_driver); 3532 intel_pstate_sysfs_hide_hwp_dynamic_boost(); 3533 } 3534 3535 return intel_pstate_register_driver(&intel_cpufreq); 3536 } 3537 3538 return -EINVAL; 3539 } 3540 3541 static int no_load __initdata; 3542 static int no_hwp __initdata; 3543 static int hwp_only __initdata; 3544 static unsigned int force_load __initdata; 3545 3546 static int __init intel_pstate_msrs_not_valid(void) 3547 { 3548 if (!pstate_funcs.get_max(0) || 3549 !pstate_funcs.get_min(0) || 3550 !pstate_funcs.get_turbo(0)) 3551 return -ENODEV; 3552 3553 return 0; 3554 } 3555 3556 static void __init copy_cpu_funcs(struct pstate_funcs *funcs) 3557 { 3558 pstate_funcs.get_max = funcs->get_max; 3559 pstate_funcs.get_max_physical = funcs->get_max_physical; 3560 pstate_funcs.get_min = funcs->get_min; 3561 pstate_funcs.get_turbo = funcs->get_turbo; 3562 pstate_funcs.get_scaling = funcs->get_scaling; 3563 pstate_funcs.get_val = funcs->get_val; 3564 pstate_funcs.get_vid = funcs->get_vid; 3565 pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift; 3566 } 3567 3568 #ifdef CONFIG_ACPI 3569 3570 static bool __init intel_pstate_no_acpi_pss(void) 3571 { 3572 int i; 3573 3574 for_each_possible_cpu(i) { 3575 acpi_status status; 3576 union acpi_object *pss; 3577 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 3578 struct acpi_processor *pr = per_cpu(processors, i); 3579 3580 if (!pr) 3581 continue; 3582 3583 status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); 3584 if (ACPI_FAILURE(status)) 3585 continue; 3586 3587 pss = buffer.pointer; 3588 if (pss && pss->type == ACPI_TYPE_PACKAGE) { 3589 kfree(pss); 3590 return false; 3591 } 3592 3593 kfree(pss); 3594 } 3595 3596 pr_debug("ACPI _PSS not found\n"); 3597 return true; 3598 } 3599 3600 static bool __init intel_pstate_no_acpi_pcch(void) 3601 { 3602 acpi_status status; 3603 acpi_handle handle; 3604 3605 status = acpi_get_handle(NULL, "\\_SB", &handle); 3606 if (ACPI_FAILURE(status)) 3607 goto not_found; 3608 3609 if (acpi_has_method(handle, "PCCH")) 3610 return false; 3611 3612 not_found: 3613 pr_debug("ACPI PCCH not found\n"); 3614 return true; 3615 } 3616 3617 static bool __init intel_pstate_has_acpi_ppc(void) 3618 { 3619 int i; 3620 3621 for_each_possible_cpu(i) { 3622 struct acpi_processor *pr = per_cpu(processors, i); 3623 3624 if (!pr) 3625 continue; 3626 if (acpi_has_method(pr->handle, "_PPC")) 3627 return true; 3628 } 3629 pr_debug("ACPI _PPC not found\n"); 3630 return false; 3631 } 3632 3633 enum { 3634 PSS, 3635 PPC, 3636 }; 3637 3638 /* Hardware vendor-specific info that has its own power management modes */ 3639 static struct acpi_platform_list plat_info[] __initdata = { 3640 
{"HP ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS}, 3641 {"ORACLE", "X4-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3642 {"ORACLE", "X4-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3643 {"ORACLE", "X4-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3644 {"ORACLE", "X3-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3645 {"ORACLE", "X3-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3646 {"ORACLE", "X3-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3647 {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3648 {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3649 {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3650 {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3651 {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3652 {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3653 {"ORACLE", "X6-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3654 {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, 3655 { } /* End */ 3656 }; 3657 3658 #define BITMASK_OOB (BIT(8) | BIT(18)) 3659 3660 static bool __init intel_pstate_platform_pwr_mgmt_exists(void) 3661 { 3662 const struct x86_cpu_id *id; 3663 u64 misc_pwr; 3664 int idx; 3665 3666 id = x86_match_cpu(intel_pstate_cpu_oob_ids); 3667 if (id) { 3668 rdmsrq(MSR_MISC_PWR_MGMT, misc_pwr); 3669 if (misc_pwr & BITMASK_OOB) { 3670 pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n"); 3671 pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n"); 3672 return true; 3673 } 3674 } 3675 3676 idx = acpi_match_platform_list(plat_info); 3677 if (idx < 0) 3678 return false; 3679 3680 switch (plat_info[idx].data) { 3681 case PSS: 3682 if (!intel_pstate_no_acpi_pss()) 3683 return false; 3684 3685 return intel_pstate_no_acpi_pcch(); 3686 case PPC: 3687 return intel_pstate_has_acpi_ppc() && !force_load; 3688 } 3689 3690 return false; 3691 } 3692 3693 static void intel_pstate_request_control_from_smm(void) 3694 { 3695 /* 3696 * It may be unsafe to request P-states control from SMM if _PPC support 3697 * has not been enabled. 
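	 * (acpi_ppc is set when the driver is loaded with the
	 * "support_acpi_ppc" argument, see intel_pstate_setup() near the end
	 * of this file.)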
3698 */ 3699 if (acpi_ppc) 3700 acpi_processor_pstate_control(); 3701 } 3702 #else /* CONFIG_ACPI not enabled */ 3703 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } 3704 static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 3705 static inline void intel_pstate_request_control_from_smm(void) {} 3706 #endif /* CONFIG_ACPI */ 3707 3708 #define INTEL_PSTATE_HWP_BROADWELL 0x01 3709 3710 #define X86_MATCH_HWP(vfm, hwp_mode) \ 3711 X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_HWP, hwp_mode) 3712 3713 static const struct x86_cpu_id hwp_support_ids[] __initconst = { 3714 X86_MATCH_HWP(INTEL_BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), 3715 X86_MATCH_HWP(INTEL_BROADWELL_D, INTEL_PSTATE_HWP_BROADWELL), 3716 X86_MATCH_HWP(INTEL_ANY, 0), 3717 {} 3718 }; 3719 3720 static bool intel_pstate_hwp_is_enabled(void) 3721 { 3722 u64 value; 3723 3724 rdmsrq(MSR_PM_ENABLE, value); 3725 return !!(value & 0x1); 3726 } 3727 3728 #define POWERSAVE_MASK GENMASK(7, 0) 3729 #define BALANCE_POWER_MASK GENMASK(15, 8) 3730 #define BALANCE_PERFORMANCE_MASK GENMASK(23, 16) 3731 #define PERFORMANCE_MASK GENMASK(31, 24) 3732 3733 #define HWP_SET_EPP_VALUES(powersave, balance_power, balance_perf, performance) \ 3734 (FIELD_PREP_CONST(POWERSAVE_MASK, powersave) |\ 3735 FIELD_PREP_CONST(BALANCE_POWER_MASK, balance_power) |\ 3736 FIELD_PREP_CONST(BALANCE_PERFORMANCE_MASK, balance_perf) |\ 3737 FIELD_PREP_CONST(PERFORMANCE_MASK, performance)) 3738 3739 #define HWP_SET_DEF_BALANCE_PERF_EPP(balance_perf) \ 3740 (HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, HWP_EPP_BALANCE_POWERSAVE,\ 3741 balance_perf, HWP_EPP_PERFORMANCE)) 3742 3743 static const struct x86_cpu_id intel_epp_default[] = { 3744 /* 3745 * Set EPP value as 102, this is the max suggested EPP 3746 * which can result in one core turbo frequency for 3747 * AlderLake Mobile CPUs. 3748 */ 3749 X86_MATCH_VFM(INTEL_ALDERLAKE_L, HWP_SET_DEF_BALANCE_PERF_EPP(102)), 3750 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), 3751 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), 3752 X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), 3753 X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, HWP_SET_DEF_BALANCE_PERF_EPP(32)), 3754 X86_MATCH_VFM(INTEL_METEORLAKE_L, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, 3755 179, 64, 16)), 3756 X86_MATCH_VFM(INTEL_ARROWLAKE, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, 3757 179, 64, 16)), 3758 {} 3759 }; 3760 3761 static const struct x86_cpu_id intel_hybrid_scaling_factor[] = { 3762 X86_MATCH_VFM(INTEL_ALDERLAKE, HYBRID_SCALING_FACTOR_ADL), 3763 X86_MATCH_VFM(INTEL_ALDERLAKE_L, HYBRID_SCALING_FACTOR_ADL), 3764 X86_MATCH_VFM(INTEL_RAPTORLAKE, HYBRID_SCALING_FACTOR_ADL), 3765 X86_MATCH_VFM(INTEL_RAPTORLAKE_P, HYBRID_SCALING_FACTOR_ADL), 3766 X86_MATCH_VFM(INTEL_RAPTORLAKE_S, HYBRID_SCALING_FACTOR_ADL), 3767 X86_MATCH_VFM(INTEL_METEORLAKE_L, HYBRID_SCALING_FACTOR_MTL), 3768 X86_MATCH_VFM(INTEL_LUNARLAKE_M, HYBRID_SCALING_FACTOR_LNL), 3769 {} 3770 }; 3771 3772 static bool hwp_check_epp(void) 3773 { 3774 if (boot_cpu_has(X86_FEATURE_HWP_EPP)) 3775 return true; 3776 3777 /* Without EPP support, don't expose EPP-related sysfs attributes. 
*/ 3778 hwp_cpufreq_attrs[HWP_PERFORMANCE_PREFERENCE_INDEX] = NULL; 3779 hwp_cpufreq_attrs[HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX] = NULL; 3780 3781 return false; 3782 } 3783 3784 static bool hwp_check_dec(void) 3785 { 3786 u64 power_ctl; 3787 3788 rdmsrq(MSR_IA32_POWER_CTL, power_ctl); 3789 return !!(power_ctl & BIT(POWER_CTL_DEC_ENABLE)); 3790 } 3791 3792 static int __init intel_pstate_init(void) 3793 { 3794 static struct cpudata **_all_cpu_data; 3795 const struct x86_cpu_id *id; 3796 int rc; 3797 3798 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 3799 return -ENODEV; 3800 3801 /* 3802 * The Intel pstate driver will be ignored if the platform 3803 * firmware has its own power management modes. 3804 */ 3805 if (intel_pstate_platform_pwr_mgmt_exists()) { 3806 pr_info("P-states controlled by the platform\n"); 3807 return -ENODEV; 3808 } 3809 3810 id = x86_match_cpu(hwp_support_ids); 3811 if (id) { 3812 bool epp_present = hwp_check_epp(); 3813 3814 /* 3815 * If HWP is enabled already, there is no choice but to deal 3816 * with it. 3817 */ 3818 hwp_forced = intel_pstate_hwp_is_enabled(); 3819 if (hwp_forced) { 3820 pr_info("HWP enabled by BIOS\n"); 3821 no_hwp = 0; 3822 } else if (no_load) { 3823 return -ENODEV; 3824 } else if (!epp_present && !hwp_check_dec()) { 3825 /* 3826 * Avoid enabling HWP for processors without EPP support 3827 * unless the Dynamic Efficiency Control (DEC) enable 3828 * bit (MSR_IA32_POWER_CTL, bit 27) is set because that 3829 * means incomplete HWP implementation which is a corner 3830 * case and supporting it is generally problematic. 3831 */ 3832 no_hwp = 1; 3833 } 3834 3835 copy_cpu_funcs(&core_funcs); 3836 3837 if (!no_hwp) { 3838 hwp_active = true; 3839 hwp_mode_bdw = id->driver_data; 3840 intel_pstate.attr = hwp_cpufreq_attrs; 3841 intel_cpufreq.attr = hwp_cpufreq_attrs; 3842 intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS; 3843 intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf; 3844 if (!default_driver) 3845 default_driver = &intel_pstate; 3846 3847 pstate_funcs.get_cpu_scaling = hwp_get_cpu_scaling; 3848 3849 goto hwp_cpu_matched; 3850 } 3851 pr_info("HWP not enabled\n"); 3852 } else { 3853 if (no_load) 3854 return -ENODEV; 3855 3856 id = x86_match_cpu(intel_pstate_cpu_ids); 3857 if (!id) { 3858 pr_info("CPU model not supported\n"); 3859 return -ENODEV; 3860 } 3861 3862 copy_cpu_funcs((struct pstate_funcs *)id->driver_data); 3863 } 3864 3865 if (intel_pstate_msrs_not_valid()) { 3866 pr_info("Invalid MSRs\n"); 3867 return -ENODEV; 3868 } 3869 /* Without HWP start in the passive mode. 
*/ 3870 if (!default_driver) 3871 default_driver = &intel_cpufreq; 3872 3873 hwp_cpu_matched: 3874 if (!hwp_active && hwp_only) 3875 return -ENOTSUPP; 3876 3877 pr_info("Intel P-state driver initializing\n"); 3878 3879 _all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus())); 3880 if (!_all_cpu_data) 3881 return -ENOMEM; 3882 3883 WRITE_ONCE(all_cpu_data, _all_cpu_data); 3884 3885 intel_pstate_request_control_from_smm(); 3886 3887 intel_pstate_sysfs_expose_params(); 3888 3889 if (hwp_active) { 3890 const struct x86_cpu_id *id = x86_match_cpu(intel_epp_default); 3891 const struct x86_cpu_id *hybrid_id = x86_match_cpu(intel_hybrid_scaling_factor); 3892 3893 if (id) { 3894 epp_values[EPP_INDEX_POWERSAVE] = 3895 FIELD_GET(POWERSAVE_MASK, id->driver_data); 3896 epp_values[EPP_INDEX_BALANCE_POWERSAVE] = 3897 FIELD_GET(BALANCE_POWER_MASK, id->driver_data); 3898 epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = 3899 FIELD_GET(BALANCE_PERFORMANCE_MASK, id->driver_data); 3900 epp_values[EPP_INDEX_PERFORMANCE] = 3901 FIELD_GET(PERFORMANCE_MASK, id->driver_data); 3902 pr_debug("Updated EPPs powersave:%x balanced power:%x balanced perf:%x performance:%x\n", 3903 epp_values[EPP_INDEX_POWERSAVE], 3904 epp_values[EPP_INDEX_BALANCE_POWERSAVE], 3905 epp_values[EPP_INDEX_BALANCE_PERFORMANCE], 3906 epp_values[EPP_INDEX_PERFORMANCE]); 3907 } 3908 3909 if (hybrid_id) { 3910 hybrid_scaling_factor = hybrid_id->driver_data; 3911 pr_debug("hybrid scaling factor: %d\n", hybrid_scaling_factor); 3912 } 3913 3914 } 3915 3916 mutex_lock(&intel_pstate_driver_lock); 3917 rc = intel_pstate_register_driver(default_driver); 3918 mutex_unlock(&intel_pstate_driver_lock); 3919 if (rc) { 3920 intel_pstate_sysfs_remove(); 3921 return rc; 3922 } 3923 3924 if (hwp_active) { 3925 const struct x86_cpu_id *id; 3926 3927 id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); 3928 if (id) { 3929 set_power_ctl_ee_state(false); 3930 pr_info("Disabling energy efficiency optimization\n"); 3931 } 3932 3933 pr_info("HWP enabled\n"); 3934 } else if (boot_cpu_has(X86_FEATURE_HYBRID_CPU)) { 3935 pr_warn("Problematic setup: Hybrid processor with disabled HWP\n"); 3936 } 3937 3938 return 0; 3939 } 3940 device_initcall(intel_pstate_init); 3941 3942 static int __init intel_pstate_setup(char *str) 3943 { 3944 if (!str) 3945 return -EINVAL; 3946 3947 if (!strcmp(str, "disable")) 3948 no_load = 1; 3949 else if (!strcmp(str, "active")) 3950 default_driver = &intel_pstate; 3951 else if (!strcmp(str, "passive")) 3952 default_driver = &intel_cpufreq; 3953 3954 if (!strcmp(str, "no_hwp")) 3955 no_hwp = 1; 3956 3957 if (!strcmp(str, "no_cas")) 3958 no_cas = true; 3959 3960 if (!strcmp(str, "force")) 3961 force_load = 1; 3962 if (!strcmp(str, "hwp_only")) 3963 hwp_only = 1; 3964 if (!strcmp(str, "per_cpu_perf_limits")) 3965 per_cpu_limits = true; 3966 3967 #ifdef CONFIG_ACPI 3968 if (!strcmp(str, "support_acpi_ppc")) 3969 acpi_ppc = true; 3970 #endif 3971 3972 return 0; 3973 } 3974 early_param("intel_pstate", intel_pstate_setup); 3975 3976 MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>"); 3977 MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors"); 3978
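/*
 * Quick reference, derived from intel_pstate_setup() above, for the values
 * accepted by the "intel_pstate=" early parameter:
 *
 *   disable             - do not load the driver
 *   active              - default to the active-mode driver (intel_pstate)
 *   passive             - default to the passive-mode driver (intel_cpufreq)
 *   no_hwp              - do not enable HWP (unless already enabled by BIOS)
 *   no_cas              - set no_cas (skip capacity-aware scheduling support)
 *   force               - register even on platforms where ACPI _PPC would
 *                         otherwise keep the driver from loading
 *   hwp_only            - load only if HWP is supported
 *   per_cpu_perf_limits - use per-CPU performance limits instead of global ones
 *   support_acpi_ppc    - set acpi_ppc so P-state control is requested from SMM
 */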