1 /* 2 * intel_idle.c - native hardware idle loop for modern Intel processors 3 * 4 * Copyright (c) 2010, Intel Corporation. 5 * Len Brown <len.brown@intel.com> 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms and conditions of the GNU General Public License, 9 * version 2, as published by the Free Software Foundation. 10 * 11 * This program is distributed in the hope it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 * more details. 15 * 16 * You should have received a copy of the GNU General Public License along with 17 * this program; if not, write to the Free Software Foundation, Inc., 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 19 */ 20 21 /* 22 * intel_idle is a cpuidle driver that loads on specific Intel processors 23 * in lieu of the legacy ACPI processor_idle driver. The intent is to 24 * make Linux more efficient on these processors, as intel_idle knows 25 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. 26 */ 27 28 /* 29 * Design Assumptions 30 * 31 * All CPUs have same idle states as boot CPU 32 * 33 * Chipset BM_STS (bus master status) bit is a NOP 34 * for preventing entry into deep C-stats 35 */ 36 37 /* 38 * Known limitations 39 * 40 * The driver currently initializes for_each_online_cpu() upon modprobe. 41 * It it unaware of subsequent processors hot-added to the system. 42 * This means that if you boot with maxcpus=n and later online 43 * processors above n, those processors will use C1 only. 44 * 45 * ACPI has a .suspend hack to turn off deep c-statees during suspend 46 * to avoid complications with the lapic timer workaround. 47 * Have not seen issues with suspend, but may need same workaround here. 48 * 49 * There is currently no kernel-based automatic probing/loading mechanism 50 * if the driver is built as a module. 51 */ 52 53 /* un-comment DEBUG to enable pr_debug() statements */ 54 #define DEBUG 55 56 #include <linux/kernel.h> 57 #include <linux/cpuidle.h> 58 #include <linux/clockchips.h> 59 #include <linux/hrtimer.h> /* ktime_get_real() */ 60 #include <trace/events/power.h> 61 #include <linux/sched.h> 62 63 #define INTEL_IDLE_VERSION "0.4" 64 #define PREFIX "intel_idle: " 65 66 #define MWAIT_SUBSTATE_MASK (0xf) 67 #define MWAIT_CSTATE_MASK (0xf) 68 #define MWAIT_SUBSTATE_SIZE (4) 69 #define MWAIT_MAX_NUM_CSTATES 8 70 #define CPUID_MWAIT_LEAF (5) 71 #define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) 72 #define CPUID5_ECX_INTERRUPT_BREAK (0x2) 73 74 static struct cpuidle_driver intel_idle_driver = { 75 .name = "intel_idle", 76 .owner = THIS_MODULE, 77 }; 78 /* intel_idle.max_cstate=0 disables driver */ 79 static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; 80 static int power_policy = 7; /* 0 = max perf; 15 = max powersave */ 81 82 static unsigned int substates; 83 static int (*choose_substate)(int); 84 85 /* Reliable LAPIC Timer States, bit 1 for C1 etc. */ 86 static unsigned int lapic_timer_reliable_states; 87 88 static struct cpuidle_device *intel_idle_cpuidle_devices; 89 static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state); 90 91 static struct cpuidle_state *cpuidle_state_table; 92 93 /* 94 * States are indexed by the cstate number, 95 * which is also the index into the MWAIT hint array. 96 * Thus C0 is a dummy. 97 */ 98 static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { 99 { /* MWAIT C0 */ }, 100 { /* MWAIT C1 */ 101 .name = "NHM-C1", 102 .desc = "MWAIT 0x00", 103 .driver_data = (void *) 0x00, 104 .flags = CPUIDLE_FLAG_TIME_VALID, 105 .exit_latency = 3, 106 .power_usage = 1000, 107 .target_residency = 6, 108 .enter = &intel_idle }, 109 { /* MWAIT C2 */ 110 .name = "NHM-C3", 111 .desc = "MWAIT 0x10", 112 .driver_data = (void *) 0x10, 113 .flags = CPUIDLE_FLAG_TIME_VALID, 114 .exit_latency = 20, 115 .power_usage = 500, 116 .target_residency = 80, 117 .enter = &intel_idle }, 118 { /* MWAIT C3 */ 119 .name = "NHM-C6", 120 .desc = "MWAIT 0x20", 121 .driver_data = (void *) 0x20, 122 .flags = CPUIDLE_FLAG_TIME_VALID, 123 .exit_latency = 200, 124 .power_usage = 350, 125 .target_residency = 800, 126 .enter = &intel_idle }, 127 }; 128 129 static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { 130 { /* MWAIT C0 */ }, 131 { /* MWAIT C1 */ 132 .name = "ATM-C1", 133 .desc = "MWAIT 0x00", 134 .driver_data = (void *) 0x00, 135 .flags = CPUIDLE_FLAG_TIME_VALID, 136 .exit_latency = 1, 137 .power_usage = 1000, 138 .target_residency = 4, 139 .enter = &intel_idle }, 140 { /* MWAIT C2 */ 141 .name = "ATM-C2", 142 .desc = "MWAIT 0x10", 143 .driver_data = (void *) 0x10, 144 .flags = CPUIDLE_FLAG_TIME_VALID, 145 .exit_latency = 20, 146 .power_usage = 500, 147 .target_residency = 80, 148 .enter = &intel_idle }, 149 { /* MWAIT C3 */ }, 150 { /* MWAIT C4 */ 151 .name = "ATM-C4", 152 .desc = "MWAIT 0x30", 153 .driver_data = (void *) 0x30, 154 .flags = CPUIDLE_FLAG_TIME_VALID, 155 .exit_latency = 100, 156 .power_usage = 250, 157 .target_residency = 400, 158 .enter = &intel_idle }, 159 { /* MWAIT C5 */ }, 160 { /* MWAIT C6 */ 161 .name = "ATM-C6", 162 .desc = "MWAIT 0x40", 163 .driver_data = (void *) 0x40, 164 .flags = CPUIDLE_FLAG_TIME_VALID, 165 .exit_latency = 200, 166 .power_usage = 150, 167 .target_residency = 800, 168 .enter = NULL }, /* disabled */ 169 }; 170 171 /* 172 * choose_tunable_substate() 173 * 174 * Run-time decision on which C-state substate to invoke 175 * If power_policy = 0, choose shallowest substate (0) 176 * If power_policy = 15, choose deepest substate 177 * If power_policy = middle, choose middle substate etc. 178 */ 179 static int choose_tunable_substate(int cstate) 180 { 181 unsigned int num_substates; 182 unsigned int substate_choice; 183 184 power_policy &= 0xF; /* valid range: 0-15 */ 185 cstate &= 7; /* valid range: 0-7 */ 186 187 num_substates = (substates >> ((cstate) * 4)) & MWAIT_SUBSTATE_MASK; 188 189 if (num_substates <= 1) 190 return 0; 191 192 substate_choice = ((power_policy + (power_policy + 1) * 193 (num_substates - 1)) / 16); 194 195 return substate_choice; 196 } 197 198 /* 199 * choose_zero_substate() 200 */ 201 static int choose_zero_substate(int cstate) 202 { 203 return 0; 204 } 205 206 /** 207 * intel_idle 208 * @dev: cpuidle_device 209 * @state: cpuidle state 210 * 211 */ 212 static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state) 213 { 214 unsigned long ecx = 1; /* break on interrupt flag */ 215 unsigned long eax = (unsigned long)cpuidle_get_statedata(state); 216 unsigned int cstate; 217 ktime_t kt_before, kt_after; 218 s64 usec_delta; 219 int cpu = smp_processor_id(); 220 221 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 222 223 eax = eax + (choose_substate)(cstate); 224 225 local_irq_disable(); 226 227 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 228 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 229 230 kt_before = ktime_get_real(); 231 232 stop_critical_timings(); 233 #ifndef MODULE 234 trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu); 235 #endif 236 if (!need_resched()) { 237 238 __monitor((void *)¤t_thread_info()->flags, 0, 0); 239 smp_mb(); 240 if (!need_resched()) 241 __mwait(eax, ecx); 242 } 243 244 start_critical_timings(); 245 246 kt_after = ktime_get_real(); 247 usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before)); 248 249 local_irq_enable(); 250 251 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 252 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); 253 254 return usec_delta; 255 } 256 257 /* 258 * intel_idle_probe() 259 */ 260 static int intel_idle_probe(void) 261 { 262 unsigned int eax, ebx, ecx, edx; 263 264 if (max_cstate == 0) { 265 pr_debug(PREFIX "disabled\n"); 266 return -EPERM; 267 } 268 269 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 270 return -ENODEV; 271 272 if (!boot_cpu_has(X86_FEATURE_MWAIT)) 273 return -ENODEV; 274 275 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) 276 return -ENODEV; 277 278 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); 279 280 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || 281 !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) 282 return -ENODEV; 283 #ifdef DEBUG 284 if (substates == 0) /* can over-ride via modparam */ 285 #endif 286 substates = edx; 287 288 pr_debug(PREFIX "MWAIT substates: 0x%x\n", substates); 289 290 if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ 291 lapic_timer_reliable_states = 0xFFFFFFFF; 292 293 if (boot_cpu_data.x86 != 6) /* family 6 */ 294 return -ENODEV; 295 296 switch (boot_cpu_data.x86_model) { 297 298 case 0x1A: /* Core i7, Xeon 5500 series */ 299 case 0x1E: /* Core i7 and i5 Processor - Lynnfield Jasper Forest */ 300 case 0x1F: /* Core i7 and i5 Processor - Nehalem */ 301 case 0x2E: /* Nehalem-EX Xeon */ 302 lapic_timer_reliable_states = (1 << 1); /* C1 */ 303 304 case 0x25: /* Westmere */ 305 case 0x2C: /* Westmere */ 306 cpuidle_state_table = nehalem_cstates; 307 choose_substate = choose_tunable_substate; 308 break; 309 310 case 0x1C: /* 28 - Atom Processor */ 311 lapic_timer_reliable_states = (1 << 2) | (1 << 1); /* C2, C1 */ 312 cpuidle_state_table = atom_cstates; 313 choose_substate = choose_zero_substate; 314 break; 315 #ifdef FUTURE_USE 316 case 0x17: /* 23 - Core 2 Duo */ 317 lapic_timer_reliable_states = (1 << 2) | (1 << 1); /* C2, C1 */ 318 #endif 319 320 default: 321 pr_debug(PREFIX "does not run on family %d model %d\n", 322 boot_cpu_data.x86, boot_cpu_data.x86_model); 323 return -ENODEV; 324 } 325 326 pr_debug(PREFIX "v" INTEL_IDLE_VERSION 327 " model 0x%X\n", boot_cpu_data.x86_model); 328 329 pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", 330 lapic_timer_reliable_states); 331 return 0; 332 } 333 334 /* 335 * intel_idle_cpuidle_devices_uninit() 336 * unregister, free cpuidle_devices 337 */ 338 static void intel_idle_cpuidle_devices_uninit(void) 339 { 340 int i; 341 struct cpuidle_device *dev; 342 343 for_each_online_cpu(i) { 344 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); 345 cpuidle_unregister_device(dev); 346 } 347 348 free_percpu(intel_idle_cpuidle_devices); 349 return; 350 } 351 /* 352 * intel_idle_cpuidle_devices_init() 353 * allocate, initialize, register cpuidle_devices 354 */ 355 static int intel_idle_cpuidle_devices_init(void) 356 { 357 int i, cstate; 358 struct cpuidle_device *dev; 359 360 intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); 361 if (intel_idle_cpuidle_devices == NULL) 362 return -ENOMEM; 363 364 for_each_online_cpu(i) { 365 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); 366 367 dev->state_count = 1; 368 369 for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { 370 int num_substates; 371 372 if (cstate > max_cstate) { 373 printk(PREFIX "max_cstate %d reached\n", 374 max_cstate); 375 break; 376 } 377 378 /* does the state exist in CPUID.MWAIT? */ 379 num_substates = (substates >> ((cstate) * 4)) 380 & MWAIT_SUBSTATE_MASK; 381 if (num_substates == 0) 382 continue; 383 /* is the state not enabled? */ 384 if (cpuidle_state_table[cstate].enter == NULL) { 385 /* does the driver not know about the state? */ 386 if (*cpuidle_state_table[cstate].name == '\0') 387 pr_debug(PREFIX "unaware of model 0x%x" 388 " MWAIT %d please" 389 " contact lenb@kernel.org", 390 boot_cpu_data.x86_model, cstate); 391 continue; 392 } 393 394 if ((cstate > 2) && 395 !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 396 mark_tsc_unstable("TSC halts in idle" 397 " states deeper than C2"); 398 399 dev->states[dev->state_count] = /* structure copy */ 400 cpuidle_state_table[cstate]; 401 402 dev->state_count += 1; 403 } 404 405 dev->cpu = i; 406 if (cpuidle_register_device(dev)) { 407 pr_debug(PREFIX "cpuidle_register_device %d failed!\n", 408 i); 409 intel_idle_cpuidle_devices_uninit(); 410 return -EIO; 411 } 412 } 413 414 return 0; 415 } 416 417 418 static int __init intel_idle_init(void) 419 { 420 int retval; 421 422 retval = intel_idle_probe(); 423 if (retval) 424 return retval; 425 426 retval = cpuidle_register_driver(&intel_idle_driver); 427 if (retval) { 428 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", 429 cpuidle_get_driver()->name); 430 return retval; 431 } 432 433 retval = intel_idle_cpuidle_devices_init(); 434 if (retval) { 435 cpuidle_unregister_driver(&intel_idle_driver); 436 return retval; 437 } 438 439 return 0; 440 } 441 442 static void __exit intel_idle_exit(void) 443 { 444 intel_idle_cpuidle_devices_uninit(); 445 cpuidle_unregister_driver(&intel_idle_driver); 446 447 return; 448 } 449 450 module_init(intel_idle_init); 451 module_exit(intel_idle_exit); 452 453 module_param(power_policy, int, 0644); 454 module_param(max_cstate, int, 0444); 455 #ifdef DEBUG 456 module_param(substates, int, 0444); 457 #endif 458 459 MODULE_AUTHOR("Len Brown <len.brown@intel.com>"); 460 MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION); 461 MODULE_LICENSE("GPL"); 462