1 /* 2 * intel_idle.c - native hardware idle loop for modern Intel processors 3 * 4 * Copyright (c) 2010, Intel Corporation. 5 * Len Brown <len.brown@intel.com> 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms and conditions of the GNU General Public License, 9 * version 2, as published by the Free Software Foundation. 10 * 11 * This program is distributed in the hope it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 * more details. 15 * 16 * You should have received a copy of the GNU General Public License along with 17 * this program; if not, write to the Free Software Foundation, Inc., 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 19 */ 20 21 /* 22 * intel_idle is a cpuidle driver that loads on specific Intel processors 23 * in lieu of the legacy ACPI processor_idle driver. The intent is to 24 * make Linux more efficient on these processors, as intel_idle knows 25 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. 26 */ 27 28 /* 29 * Design Assumptions 30 * 31 * All CPUs have same idle states as boot CPU 32 * 33 * Chipset BM_STS (bus master status) bit is a NOP 34 * for preventing entry into deep C-stats 35 */ 36 37 /* 38 * Known limitations 39 * 40 * The driver currently initializes for_each_online_cpu() upon modprobe. 41 * It it unaware of subsequent processors hot-added to the system. 42 * This means that if you boot with maxcpus=n and later online 43 * processors above n, those processors will use C1 only. 44 * 45 * ACPI has a .suspend hack to turn off deep c-statees during suspend 46 * to avoid complications with the lapic timer workaround. 47 * Have not seen issues with suspend, but may need same workaround here. 48 * 49 * There is currently no kernel-based automatic probing/loading mechanism 50 * if the driver is built as a module. 51 */ 52 53 /* un-comment DEBUG to enable pr_debug() statements */ 54 #define DEBUG 55 56 #include <linux/kernel.h> 57 #include <linux/cpuidle.h> 58 #include <linux/clockchips.h> 59 #include <linux/hrtimer.h> /* ktime_get_real() */ 60 #include <trace/events/power.h> 61 #include <linux/sched.h> 62 #include <linux/notifier.h> 63 #include <linux/cpu.h> 64 #include <linux/module.h> 65 #include <asm/mwait.h> 66 #include <asm/msr.h> 67 68 #define INTEL_IDLE_VERSION "0.4" 69 #define PREFIX "intel_idle: " 70 71 static struct cpuidle_driver intel_idle_driver = { 72 .name = "intel_idle", 73 .owner = THIS_MODULE, 74 }; 75 /* intel_idle.max_cstate=0 disables driver */ 76 static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; 77 78 static unsigned int mwait_substates; 79 80 #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF 81 /* Reliable LAPIC Timer States, bit 1 for C1 etc. */ 82 static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */ 83 84 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; 85 static int intel_idle(struct cpuidle_device *dev, 86 struct cpuidle_driver *drv, int index); 87 88 static struct cpuidle_state *cpuidle_state_table; 89 90 /* 91 * Hardware C-state auto-demotion may not always be optimal. 92 * Indicate which enable bits to clear here. 93 */ 94 static unsigned long long auto_demotion_disable_flags; 95 96 /* 97 * Set this flag for states where the HW flushes the TLB for us 98 * and so we don't need cross-calls to keep it consistent. 99 * If this flag is set, SW flushes the TLB, so even if the 100 * HW doesn't do the flushing, this flag is safe to use. 101 */ 102 #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 103 104 /* 105 * States are indexed by the cstate number, 106 * which is also the index into the MWAIT hint array. 107 * Thus C0 is a dummy. 108 */ 109 static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { 110 { /* MWAIT C0 */ }, 111 { /* MWAIT C1 */ 112 .name = "C1-NHM", 113 .desc = "MWAIT 0x00", 114 .flags = CPUIDLE_FLAG_TIME_VALID, 115 .exit_latency = 3, 116 .target_residency = 6, 117 .enter = &intel_idle }, 118 { /* MWAIT C2 */ 119 .name = "C3-NHM", 120 .desc = "MWAIT 0x10", 121 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 122 .exit_latency = 20, 123 .target_residency = 80, 124 .enter = &intel_idle }, 125 { /* MWAIT C3 */ 126 .name = "C6-NHM", 127 .desc = "MWAIT 0x20", 128 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 129 .exit_latency = 200, 130 .target_residency = 800, 131 .enter = &intel_idle }, 132 }; 133 134 static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { 135 { /* MWAIT C0 */ }, 136 { /* MWAIT C1 */ 137 .name = "C1-SNB", 138 .desc = "MWAIT 0x00", 139 .flags = CPUIDLE_FLAG_TIME_VALID, 140 .exit_latency = 1, 141 .target_residency = 1, 142 .enter = &intel_idle }, 143 { /* MWAIT C2 */ 144 .name = "C3-SNB", 145 .desc = "MWAIT 0x10", 146 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 147 .exit_latency = 80, 148 .target_residency = 211, 149 .enter = &intel_idle }, 150 { /* MWAIT C3 */ 151 .name = "C6-SNB", 152 .desc = "MWAIT 0x20", 153 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 154 .exit_latency = 104, 155 .target_residency = 345, 156 .enter = &intel_idle }, 157 { /* MWAIT C4 */ 158 .name = "C7-SNB", 159 .desc = "MWAIT 0x30", 160 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 161 .exit_latency = 109, 162 .target_residency = 345, 163 .enter = &intel_idle }, 164 }; 165 166 static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { 167 { /* MWAIT C0 */ }, 168 { /* MWAIT C1 */ 169 .name = "C1-ATM", 170 .desc = "MWAIT 0x00", 171 .flags = CPUIDLE_FLAG_TIME_VALID, 172 .exit_latency = 1, 173 .target_residency = 4, 174 .enter = &intel_idle }, 175 { /* MWAIT C2 */ 176 .name = "C2-ATM", 177 .desc = "MWAIT 0x10", 178 .flags = CPUIDLE_FLAG_TIME_VALID, 179 .exit_latency = 20, 180 .target_residency = 80, 181 .enter = &intel_idle }, 182 { /* MWAIT C3 */ }, 183 { /* MWAIT C4 */ 184 .name = "C4-ATM", 185 .desc = "MWAIT 0x30", 186 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 187 .exit_latency = 100, 188 .target_residency = 400, 189 .enter = &intel_idle }, 190 { /* MWAIT C5 */ }, 191 { /* MWAIT C6 */ 192 .name = "C6-ATM", 193 .desc = "MWAIT 0x52", 194 .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, 195 .exit_latency = 140, 196 .target_residency = 560, 197 .enter = &intel_idle }, 198 }; 199 200 static long get_driver_data(int cstate) 201 { 202 int driver_data; 203 switch (cstate) { 204 205 case 1: /* MWAIT C1 */ 206 driver_data = 0x00; 207 break; 208 case 2: /* MWAIT C2 */ 209 driver_data = 0x10; 210 break; 211 case 3: /* MWAIT C3 */ 212 driver_data = 0x20; 213 break; 214 case 4: /* MWAIT C4 */ 215 driver_data = 0x30; 216 break; 217 case 5: /* MWAIT C5 */ 218 driver_data = 0x40; 219 break; 220 case 6: /* MWAIT C6 */ 221 driver_data = 0x52; 222 break; 223 default: 224 driver_data = 0x00; 225 } 226 return driver_data; 227 } 228 229 /** 230 * intel_idle 231 * @dev: cpuidle_device 232 * @drv: cpuidle driver 233 * @index: index of cpuidle state 234 * 235 * Must be called under local_irq_disable(). 236 */ 237 static int intel_idle(struct cpuidle_device *dev, 238 struct cpuidle_driver *drv, int index) 239 { 240 unsigned long ecx = 1; /* break on interrupt flag */ 241 struct cpuidle_state *state = &drv->states[index]; 242 struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; 243 unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage); 244 unsigned int cstate; 245 ktime_t kt_before, kt_after; 246 s64 usec_delta; 247 int cpu = smp_processor_id(); 248 249 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 250 251 /* 252 * leave_mm() to avoid costly and often unnecessary wakeups 253 * for flushing the user TLB's associated with the active mm. 254 */ 255 if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) 256 leave_mm(cpu); 257 258 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 259 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 260 261 kt_before = ktime_get_real(); 262 263 stop_critical_timings(); 264 if (!need_resched()) { 265 266 __monitor((void *)¤t_thread_info()->flags, 0, 0); 267 smp_mb(); 268 if (!need_resched()) 269 __mwait(eax, ecx); 270 } 271 272 start_critical_timings(); 273 274 kt_after = ktime_get_real(); 275 usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before)); 276 277 local_irq_enable(); 278 279 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 280 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 281 282 /* Update cpuidle counters */ 283 dev->last_residency = (int)usec_delta; 284 285 return index; 286 } 287 288 static void __setup_broadcast_timer(void *arg) 289 { 290 unsigned long reason = (unsigned long)arg; 291 int cpu = smp_processor_id(); 292 293 reason = reason ? 294 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF; 295 296 clockevents_notify(reason, &cpu); 297 } 298 299 static int setup_broadcast_cpuhp_notify(struct notifier_block *n, 300 unsigned long action, void *hcpu) 301 { 302 int hotcpu = (unsigned long)hcpu; 303 304 switch (action & 0xf) { 305 case CPU_ONLINE: 306 smp_call_function_single(hotcpu, __setup_broadcast_timer, 307 (void *)true, 1); 308 break; 309 } 310 return NOTIFY_OK; 311 } 312 313 static struct notifier_block setup_broadcast_notifier = { 314 .notifier_call = setup_broadcast_cpuhp_notify, 315 }; 316 317 static void auto_demotion_disable(void *dummy) 318 { 319 unsigned long long msr_bits; 320 321 rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); 322 msr_bits &= ~auto_demotion_disable_flags; 323 wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); 324 } 325 326 /* 327 * intel_idle_probe() 328 */ 329 static int intel_idle_probe(void) 330 { 331 unsigned int eax, ebx, ecx; 332 333 if (max_cstate == 0) { 334 pr_debug(PREFIX "disabled\n"); 335 return -EPERM; 336 } 337 338 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 339 return -ENODEV; 340 341 if (!boot_cpu_has(X86_FEATURE_MWAIT)) 342 return -ENODEV; 343 344 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) 345 return -ENODEV; 346 347 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); 348 349 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || 350 !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || 351 !mwait_substates) 352 return -ENODEV; 353 354 pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates); 355 356 357 if (boot_cpu_data.x86 != 6) /* family 6 */ 358 return -ENODEV; 359 360 switch (boot_cpu_data.x86_model) { 361 362 case 0x1A: /* Core i7, Xeon 5500 series */ 363 case 0x1E: /* Core i7 and i5 Processor - Lynnfield Jasper Forest */ 364 case 0x1F: /* Core i7 and i5 Processor - Nehalem */ 365 case 0x2E: /* Nehalem-EX Xeon */ 366 case 0x2F: /* Westmere-EX Xeon */ 367 case 0x25: /* Westmere */ 368 case 0x2C: /* Westmere */ 369 cpuidle_state_table = nehalem_cstates; 370 auto_demotion_disable_flags = 371 (NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE); 372 break; 373 374 case 0x1C: /* 28 - Atom Processor */ 375 cpuidle_state_table = atom_cstates; 376 break; 377 378 case 0x26: /* 38 - Lincroft Atom Processor */ 379 cpuidle_state_table = atom_cstates; 380 auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE; 381 break; 382 383 case 0x2A: /* SNB */ 384 case 0x2D: /* SNB Xeon */ 385 cpuidle_state_table = snb_cstates; 386 break; 387 388 default: 389 pr_debug(PREFIX "does not run on family %d model %d\n", 390 boot_cpu_data.x86, boot_cpu_data.x86_model); 391 return -ENODEV; 392 } 393 394 if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ 395 lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; 396 else { 397 on_each_cpu(__setup_broadcast_timer, (void *)true, 1); 398 register_cpu_notifier(&setup_broadcast_notifier); 399 } 400 401 pr_debug(PREFIX "v" INTEL_IDLE_VERSION 402 " model 0x%X\n", boot_cpu_data.x86_model); 403 404 pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", 405 lapic_timer_reliable_states); 406 return 0; 407 } 408 409 /* 410 * intel_idle_cpuidle_devices_uninit() 411 * unregister, free cpuidle_devices 412 */ 413 static void intel_idle_cpuidle_devices_uninit(void) 414 { 415 int i; 416 struct cpuidle_device *dev; 417 418 for_each_online_cpu(i) { 419 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); 420 cpuidle_unregister_device(dev); 421 } 422 423 free_percpu(intel_idle_cpuidle_devices); 424 return; 425 } 426 /* 427 * intel_idle_cpuidle_driver_init() 428 * allocate, initialize cpuidle_states 429 */ 430 static int intel_idle_cpuidle_driver_init(void) 431 { 432 int cstate; 433 struct cpuidle_driver *drv = &intel_idle_driver; 434 435 drv->state_count = 1; 436 437 for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { 438 int num_substates; 439 440 if (cstate > max_cstate) { 441 printk(PREFIX "max_cstate %d reached\n", 442 max_cstate); 443 break; 444 } 445 446 /* does the state exist in CPUID.MWAIT? */ 447 num_substates = (mwait_substates >> ((cstate) * 4)) 448 & MWAIT_SUBSTATE_MASK; 449 if (num_substates == 0) 450 continue; 451 /* is the state not enabled? */ 452 if (cpuidle_state_table[cstate].enter == NULL) { 453 /* does the driver not know about the state? */ 454 if (*cpuidle_state_table[cstate].name == '\0') 455 pr_debug(PREFIX "unaware of model 0x%x" 456 " MWAIT %d please" 457 " contact lenb@kernel.org", 458 boot_cpu_data.x86_model, cstate); 459 continue; 460 } 461 462 if ((cstate > 2) && 463 !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 464 mark_tsc_unstable("TSC halts in idle" 465 " states deeper than C2"); 466 467 drv->states[drv->state_count] = /* structure copy */ 468 cpuidle_state_table[cstate]; 469 470 drv->state_count += 1; 471 } 472 473 if (auto_demotion_disable_flags) 474 on_each_cpu(auto_demotion_disable, NULL, 1); 475 476 return 0; 477 } 478 479 480 /* 481 * intel_idle_cpu_init() 482 * allocate, initialize, register cpuidle_devices 483 * @cpu: cpu/core to initialize 484 */ 485 int intel_idle_cpu_init(int cpu) 486 { 487 int cstate; 488 struct cpuidle_device *dev; 489 490 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); 491 492 dev->state_count = 1; 493 494 for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { 495 int num_substates; 496 497 if (cstate > max_cstate) { 498 printk(PREFIX "max_cstate %d reached\n", 499 max_cstate); 500 break; 501 } 502 503 /* does the state exist in CPUID.MWAIT? */ 504 num_substates = (mwait_substates >> ((cstate) * 4)) 505 & MWAIT_SUBSTATE_MASK; 506 if (num_substates == 0) 507 continue; 508 /* is the state not enabled? */ 509 if (cpuidle_state_table[cstate].enter == NULL) 510 continue; 511 512 dev->states_usage[dev->state_count].driver_data = 513 (void *)get_driver_data(cstate); 514 515 dev->state_count += 1; 516 } 517 dev->cpu = cpu; 518 519 if (cpuidle_register_device(dev)) { 520 pr_debug(PREFIX "cpuidle_register_device %d failed!\n", cpu); 521 intel_idle_cpuidle_devices_uninit(); 522 return -EIO; 523 } 524 525 if (auto_demotion_disable_flags) 526 smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); 527 528 return 0; 529 } 530 531 532 static int __init intel_idle_init(void) 533 { 534 int retval, i; 535 536 /* Do not load intel_idle at all for now if idle= is passed */ 537 if (boot_option_idle_override != IDLE_NO_OVERRIDE) 538 return -ENODEV; 539 540 retval = intel_idle_probe(); 541 if (retval) 542 return retval; 543 544 intel_idle_cpuidle_driver_init(); 545 retval = cpuidle_register_driver(&intel_idle_driver); 546 if (retval) { 547 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", 548 cpuidle_get_driver()->name); 549 return retval; 550 } 551 552 intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); 553 if (intel_idle_cpuidle_devices == NULL) 554 return -ENOMEM; 555 556 for_each_online_cpu(i) { 557 retval = intel_idle_cpu_init(i); 558 if (retval) { 559 cpuidle_unregister_driver(&intel_idle_driver); 560 return retval; 561 } 562 } 563 564 return 0; 565 } 566 567 static void __exit intel_idle_exit(void) 568 { 569 intel_idle_cpuidle_devices_uninit(); 570 cpuidle_unregister_driver(&intel_idle_driver); 571 572 if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) { 573 on_each_cpu(__setup_broadcast_timer, (void *)false, 1); 574 unregister_cpu_notifier(&setup_broadcast_notifier); 575 } 576 577 return; 578 } 579 580 module_init(intel_idle_init); 581 module_exit(intel_idle_exit); 582 583 module_param(max_cstate, int, 0444); 584 585 MODULE_AUTHOR("Len Brown <len.brown@intel.com>"); 586 MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION); 587 MODULE_LICENSE("GPL"); 588