/*
 * intel_idle.c - native hardware idle loop for modern Intel processors
 *
 * Copyright (c) 2010, Intel Corporation.
 * Len Brown <len.brown@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

/*
 * intel_idle is a cpuidle driver that loads on specific Intel processors
 * in lieu of the legacy ACPI processor_idle driver.  The intent is to
 * make Linux more efficient on these processors, as intel_idle knows
 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
 */

/*
 * Design Assumptions
 *
 * All CPUs have the same idle states as the boot CPU
 *
 * Chipset BM_STS (bus master status) bit is a NOP
 * for preventing entry into deep C-states
 */

/*
 * Known limitations
 *
 * The driver currently initializes only the CPUs online at modprobe time
 * (for_each_online_cpu()).  It is unaware of processors subsequently
 * hot-added to the system.  This means that if you boot with maxcpus=n
 * and later online processors above n, those processors will use C1 only.
 *
 * ACPI has a .suspend hack to turn off deep c-states during suspend
 * to avoid complications with the lapic timer workaround.
 * We have not seen issues with suspend, but may need the same workaround here.
 *
 * There is currently no kernel-based automatic probing/loading mechanism
 * if the driver is built as a module.
 */

/* un-comment DEBUG to enable pr_debug() statements */
#define DEBUG

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/clockchips.h>
#include <linux/hrtimer.h>	/* ktime_get_real() */
#include <trace/events/power.h>
#include <linux/sched.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <asm/mwait.h>
#include <asm/msr.h>

#define INTEL_IDLE_VERSION "0.4"
#define PREFIX "intel_idle: "

static struct cpuidle_driver intel_idle_driver = {
	.name = "intel_idle",
	.owner = THIS_MODULE,
};
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1;

static unsigned int mwait_substates;

#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
/* Reliable LAPIC Timer States, bit 1 for C1 etc. */
static unsigned int lapic_timer_reliable_states = (1 << 1);	/* Default to only C1 */

static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
static int intel_idle(struct cpuidle_device *dev,
			struct cpuidle_driver *drv, int index);

static struct cpuidle_state *cpuidle_state_table;

/*
 * Hardware C-state auto-demotion may not always be optimal.
 * Indicate which enable bits to clear here.
 */
static unsigned long long auto_demotion_disable_flags;

/*
 * Set this flag for states where the HW flushes the TLB for us
 * and so we don't need cross-calls to keep it consistent.
 * If this flag is set, SW flushes the TLB, so even if the
 * HW doesn't do the flushing, this flag is safe to use.
 */
#define CPUIDLE_FLAG_TLB_FLUSHED	0x10000
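
/*
 * In the state tables below, exit_latency and target_residency are in
 * microseconds, as the cpuidle core expects.  Reading C6-NHM as an
 * example: waking from C6 is estimated to cost ~200us, so the governor
 * should only pick that state when it predicts at least 800us of
 * idleness.  These are driver-supplied estimates, not values measured
 * on each individual part.
 */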

/*
 * States are indexed by the cstate number,
 * which is also the index into the MWAIT hint array.
 * Thus C0 is a dummy.
 */
static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-NHM",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 3,
		.target_residency = 6,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C3-NHM",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "C6-NHM",
		.desc = "MWAIT 0x20",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 200,
		.target_residency = 800,
		.enter = &intel_idle },
};

static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-SNB",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C3-SNB",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 80,
		.target_residency = 211,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "C6-SNB",
		.desc = "MWAIT 0x20",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 104,
		.target_residency = 345,
		.enter = &intel_idle },
	{ /* MWAIT C4 */
		.name = "C7-SNB",
		.desc = "MWAIT 0x30",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 109,
		.target_residency = 345,
		.enter = &intel_idle },
};

static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "C1-ATM",
		.desc = "MWAIT 0x00",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.target_residency = 4,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "C2-ATM",
		.desc = "MWAIT 0x10",
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */ },
	{ /* MWAIT C4 */
		.name = "C4-ATM",
		.desc = "MWAIT 0x30",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 100,
		.target_residency = 400,
		.enter = &intel_idle },
	{ /* MWAIT C5 */ },
	{ /* MWAIT C6 */
		.name = "C6-ATM",
		.desc = "MWAIT 0x52",
		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 140,
		.target_residency = 560,
		.enter = &intel_idle },
};

static int get_driver_data(int cstate)
{
	int driver_data;
	switch (cstate) {

	case 1:	/* MWAIT C1 */
		driver_data = 0x00;
		break;
	case 2:	/* MWAIT C2 */
		driver_data = 0x10;
		break;
	case 3:	/* MWAIT C3 */
		driver_data = 0x20;
		break;
	case 4:	/* MWAIT C4 */
		driver_data = 0x30;
		break;
	case 5:	/* MWAIT C5 */
		driver_data = 0x40;
		break;
	case 6:	/* MWAIT C6 */
		driver_data = 0x52;
		break;
	default:
		driver_data = 0x00;
	}
	return driver_data;
}
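
/*
 * MWAIT hint layout in EAX, as decoded by intel_idle() below:
 * bits [7:4] hold (target C-state - 1) and bits [3:0] the sub-state.
 * Worked example: the Atom C6 hint 0x52 decodes as
 *	((0x52 >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1 == 6
 * i.e. C6, sub-state 2.  get_driver_data() above is simply the reverse
 * map from table index back to these hints.
 */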

/**
 * intel_idle
 * @dev: cpuidle_device
 * @drv: cpuidle driver
 * @index: index of cpuidle state
 *
 */
static int intel_idle(struct cpuidle_device *dev,
		struct cpuidle_driver *drv, int index)
{
	unsigned long ecx = 1; /* break on interrupt flag */
	struct cpuidle_state *state = &drv->states[index];
	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
	unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage);
	unsigned int cstate;
	ktime_t kt_before, kt_after;
	s64 usec_delta;
	int cpu = smp_processor_id();

	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;

	local_irq_disable();

	/*
	 * leave_mm() to avoid costly and often unnecessary wakeups
	 * for flushing the user TLBs associated with the active mm.
	 */
	if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
		leave_mm(cpu);

	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

	kt_before = ktime_get_real();

	stop_critical_timings();
	if (!need_resched()) {

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}

	start_critical_timings();

	kt_after = ktime_get_real();
	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));

	local_irq_enable();

	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

	/* Update cpuidle counters */
	dev->last_residency = (int)usec_delta;

	return index;
}

static void __setup_broadcast_timer(void *arg)
{
	unsigned long reason = (unsigned long)arg;
	int cpu = smp_processor_id();

	reason = reason ?
		CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;

	clockevents_notify(reason, &cpu);
}

static int setup_broadcast_cpuhp_notify(struct notifier_block *n,
		unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;

	switch (action & 0xf) {
	case CPU_ONLINE:
		smp_call_function_single(hotcpu, __setup_broadcast_timer,
			(void *)true, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block setup_broadcast_notifier = {
	.notifier_call = setup_broadcast_cpuhp_notify,
};

static void auto_demotion_disable(void *dummy)
{
	unsigned long long msr_bits;

	rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
	msr_bits &= ~auto_demotion_disable_flags;
	wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
}
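
/*
 * Background on auto_demotion_disable(): on Nehalem/Westmere and on
 * Lincroft Atom, the package C-state hardware can "auto-demote" a deep
 * C-state request (e.g. C6) to a shallower one based on its own
 * heuristics.  Clearing the per-model enable bits in
 * MSR_NHM_SNB_PKG_CST_CFG_CTL makes the OS request stand; the apparent
 * rationale is that the residency tables above already encode when a
 * deep state is worthwhile.  The model switch in intel_idle_probe()
 * below selects which bits, if any, go into
 * auto_demotion_disable_flags.
 */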

/*
 * intel_idle_probe()
 */
static int intel_idle_probe(void)
{
	unsigned int eax, ebx, ecx;

	if (max_cstate == 0) {
		pr_debug(PREFIX "disabled\n");
		return -EPERM;
	}

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return -ENODEV;

	if (!boot_cpu_has(X86_FEATURE_MWAIT))
		return -ENODEV;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return -ENODEV;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return -ENODEV;

	pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates);

	if (boot_cpu_data.x86 != 6)	/* family 6 */
		return -ENODEV;

	switch (boot_cpu_data.x86_model) {

	case 0x1A:	/* Core i7, Xeon 5500 series */
	case 0x1E:	/* Core i7 and i5 Processor - Lynnfield, Jasper Forest */
	case 0x1F:	/* Core i7 and i5 Processor - Nehalem */
	case 0x2E:	/* Nehalem-EX Xeon */
	case 0x2F:	/* Westmere-EX Xeon */
	case 0x25:	/* Westmere */
	case 0x2C:	/* Westmere */
		cpuidle_state_table = nehalem_cstates;
		auto_demotion_disable_flags =
			(NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE);
		break;

	case 0x1C:	/* 28 - Atom Processor */
		cpuidle_state_table = atom_cstates;
		break;

	case 0x26:	/* 38 - Lincroft Atom Processor */
		cpuidle_state_table = atom_cstates;
		auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE;
		break;

	case 0x2A:	/* SNB */
	case 0x2D:	/* SNB Xeon */
		cpuidle_state_table = snb_cstates;
		break;

	default:
		pr_debug(PREFIX "does not run on family %d model %d\n",
			boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}

	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
	else {
		/* on_each_cpu(), not smp_call_function(): include this CPU */
		on_each_cpu(__setup_broadcast_timer, (void *)true, 1);
		register_cpu_notifier(&setup_broadcast_notifier);
	}

	pr_debug(PREFIX "v" INTEL_IDLE_VERSION
		" model 0x%X\n", boot_cpu_data.x86_model);

	pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
		lapic_timer_reliable_states);
	return 0;
}

/*
 * intel_idle_cpuidle_devices_uninit()
 * unregister, free cpuidle_devices
 */
static void intel_idle_cpuidle_devices_uninit(void)
{
	int i;
	struct cpuidle_device *dev;

	for_each_online_cpu(i) {
		dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
		cpuidle_unregister_device(dev);
	}

	free_percpu(intel_idle_cpuidle_devices);
	return;
}
/*
 * intel_idle_cpuidle_driver_init()
 * allocate, initialize cpuidle_states
 */
static int intel_idle_cpuidle_driver_init(void)
{
	int cstate;
	struct cpuidle_driver *drv = &intel_idle_driver;

	drv->state_count = 1;

	for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
		int num_substates;

		if (cstate > max_cstate) {
			pr_debug(PREFIX "max_cstate %d reached\n",
				max_cstate);
			break;
		}

		/* does the state exist in CPUID.MWAIT? */
		num_substates = (mwait_substates >> ((cstate) * 4))
			& MWAIT_SUBSTATE_MASK;
		if (num_substates == 0)
			continue;
		/* is the state not enabled? */
		if (cpuidle_state_table[cstate].enter == NULL) {
			/* does the driver not know about the state? */
			if (*cpuidle_state_table[cstate].name == '\0')
				pr_debug(PREFIX "unaware of model 0x%x"
					" MWAIT %d please"
					" contact lenb@kernel.org\n",
					boot_cpu_data.x86_model, cstate);
			continue;
		}

		if ((cstate > 2) &&
			!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
			mark_tsc_unstable("TSC halts in idle"
					" states deeper than C2");

		drv->states[drv->state_count] =	/* structure copy */
			cpuidle_state_table[cstate];

		drv->state_count += 1;
	}

	if (auto_demotion_disable_flags)
		on_each_cpu(auto_demotion_disable, NULL, 1);

	return 0;
}
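
/*
 * Worked example for the CPUID.MWAIT sub-state decode used above and
 * below (illustrative value): if cpuid(5) reported
 * mwait_substates == 0x2220, then
 *	(0x2220 >> (1 * 4)) & MWAIT_SUBSTATE_MASK == 2	C1: 2 sub-states
 *	(0x2220 >> (2 * 4)) & MWAIT_SUBSTATE_MASK == 2	C2: 2 sub-states
 *	(0x2220 >> (3 * 4)) & MWAIT_SUBSTATE_MASK == 2	C3: 2 sub-states
 *	(0x2220 >> (4 * 4)) & MWAIT_SUBSTATE_MASK == 0	C4: absent, skipped
 * Both intel_idle_cpuidle_driver_init() above and
 * intel_idle_cpuidle_devices_init() below apply this check before
 * exposing a table entry.
 */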

/*
 * intel_idle_cpuidle_devices_init()
 * allocate, initialize, register cpuidle_devices
 */
static int intel_idle_cpuidle_devices_init(void)
{
	int i, cstate;
	struct cpuidle_device *dev;

	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (intel_idle_cpuidle_devices == NULL)
		return -ENOMEM;

	for_each_online_cpu(i) {
		dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);

		dev->state_count = 1;

		for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
			int num_substates;

			if (cstate > max_cstate) {
				pr_debug(PREFIX "max_cstate %d reached\n",
					max_cstate);
				break;
			}

			/* does the state exist in CPUID.MWAIT? */
			num_substates = (mwait_substates >> ((cstate) * 4))
				& MWAIT_SUBSTATE_MASK;
			if (num_substates == 0)
				continue;
			/* is the state not enabled? */
			if (cpuidle_state_table[cstate].enter == NULL)
				continue;

			dev->states_usage[dev->state_count].driver_data =
				(void *)get_driver_data(cstate);

			dev->state_count += 1;
		}

		dev->cpu = i;
		if (cpuidle_register_device(dev)) {
			pr_debug(PREFIX "cpuidle_register_device %d failed!\n",
				i);
			intel_idle_cpuidle_devices_uninit();
			return -EIO;
		}
	}

	return 0;
}

static int __init intel_idle_init(void)
{
	int retval;

	/* Do not load intel_idle at all for now if idle= is passed */
	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
		return -ENODEV;

	retval = intel_idle_probe();
	if (retval)
		return retval;

	intel_idle_cpuidle_driver_init();
	retval = cpuidle_register_driver(&intel_idle_driver);
	if (retval) {
		printk(KERN_DEBUG PREFIX "intel_idle yielding to %s\n",
			cpuidle_get_driver()->name);
		return retval;
	}

	retval = intel_idle_cpuidle_devices_init();
	if (retval) {
		cpuidle_unregister_driver(&intel_idle_driver);
		return retval;
	}

	return 0;
}

static void __exit intel_idle_exit(void)
{
	intel_idle_cpuidle_devices_uninit();
	cpuidle_unregister_driver(&intel_idle_driver);

	if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) {
		on_each_cpu(__setup_broadcast_timer, (void *)false, 1);
		unregister_cpu_notifier(&setup_broadcast_notifier);
	}

	return;
}

module_init(intel_idle_init);
module_exit(intel_idle_exit);

module_param(max_cstate, int, 0444);

MODULE_AUTHOR("Len Brown <len.brown@intel.com>");
MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION);
MODULE_LICENSE("GPL");
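
/*
 * Usage notes (illustrative):
 *	intel_idle.max_cstate=2 on the kernel command line limits the
 *	driver to MWAIT C-state indices <= 2 (e.g. C1-NHM and C3-NHM on
 *	Nehalem); intel_idle.max_cstate=0 disables the driver entirely,
 *	leaving idle handling to acpi_idle.
 * The parameter is mode 0444, i.e. read-only via sysfs at runtime, so
 * it must be set at boot (or on the modprobe command line when the
 * driver is built as a module).
 */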