/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/x86_archext.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/stat.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpu_acpi.h>
#include <sys/cpu_idle.h>
#include <sys/cpupm.h>
#include <sys/cpu_event.h>
#include <sys/hpet.h>
#include <sys/archsystm.h>
#include <vm/hat_i86.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/callb.h>

#define CSTATE_USING_HPET       1
#define CSTATE_USING_LAT        2

#define CPU_IDLE_STOP_TIMEOUT   1000

extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * cpu_cstate_arat: the local APIC timer is always running (ARAT), even in
 * deep C-states.
 * cpu_cstate_hpet: the HPET is used as a proxy timer during deep C-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;
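
/*
 * At most one of the two flags above is set (by cpu_deep_cstates_supported()
 * below): ARAT is checked first and preferred, and the HPET is only
 * configured as a proxy wakeup timer when the local APIC timer is not
 * always-running.
 */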

/*
 * Interfaces for modules implementing Intel's deep c-state.
 */
cpupm_state_ops_t cpu_idle_ops = {
        "Generic ACPI C-state Support",
        cpu_idle_init,
        cpu_idle_fini,
        NULL,
        cpu_idle_stop
};

static kmutex_t cpu_idle_callb_mutex;
static callb_id_t cpu_deep_idle_callb_id;
static callb_id_t cpu_idle_cpr_callb_id;
static uint_t cpu_idle_cfg_state;

static kmutex_t cpu_idle_mutex;

cpu_idle_kstat_t cpu_idle_kstat = {
        { "address_space_id", KSTAT_DATA_STRING },
        { "latency", KSTAT_DATA_UINT32 },
        { "power", KSTAT_DATA_UINT32 },
};

/*
 * kstat update function of the c-state info
 */
static int
cpu_idle_kstat_update(kstat_t *ksp, int flag)
{
        cpu_acpi_cstate_t *cstate = ksp->ks_private;

        if (flag == KSTAT_WRITE) {
                return (EACCES);
        }

        if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "FFixedHW");
        } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "SystemIO");
        } else {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "Unsupported");
        }

        cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
        cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;

        return (0);
}

/*
 * Used during configuration callbacks to manage implementation-specific
 * details of the hardware timer used during Deep C-state.
 */
boolean_t
cstate_timer_callback(int code)
{
        if (cpu_cstate_arat) {
                return (B_TRUE);
        } else if (cpu_cstate_hpet) {
                return (hpet.callback(code));
        }
        return (B_FALSE);
}

/*
 * Some Local APIC Timers do not work during Deep C-states.
 * The Deep C-state idle function uses this function to ensure it is using a
 * hardware timer that works during Deep C-states. This function also
 * switches the timer back to the LAPIC Timer after Deep C-state.
 */
static boolean_t
cstate_use_timer(hrtime_t *lapic_expire, int timer)
{
        if (cpu_cstate_arat)
                return (B_TRUE);

        /*
         * We have to return B_FALSE if there is no ARAT or HPET support.
         */
        if (!cpu_cstate_hpet)
                return (B_FALSE);

        switch (timer) {
        case CSTATE_USING_HPET:
                return (hpet.use_hpet_timer(lapic_expire));
        case CSTATE_USING_LAT:
                hpet.use_lapic_timer(*lapic_expire);
                return (B_TRUE);
        default:
                return (B_FALSE);
        }
}

/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
        struct machcpu *mcpu = &(cp->cpu_m);
        volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
        cpupart_t *cpu_part;
        uint_t cpu_found;
        processorid_t cpu_sid;

        cpu_part = cp->cpu_part;
        cpu_sid = cp->cpu_seqid;
        /*
         * Clear the halted bit for that CPU since it will be woken up
         * in a moment.
         */
        if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
                /*
                 * Clear the halted bit for that CPU since it will be
                 * poked in a moment.
                 */
                bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

                /*
                 * We may find the current CPU present in the halted cpuset
                 * if we're in the context of an interrupt that occurred
                 * before we had a chance to clear our bit in cpu_idle().
                 * Waking ourself is obviously unnecessary, since if
                 * we're here, we're not halted.
                 */
                if (cp != CPU) {
                        /*
                         * Use correct wakeup mechanism
                         */
                        if ((mcpu_mwait != NULL) &&
                            (*mcpu_mwait == MWAIT_HALTED))
                                MWAIT_WAKEUP(cp);
                        else
                                poke_cpu(cp->cpu_id);
                }
                return;
        } else {
                /*
                 * This cpu isn't halted, but it's idle or undergoing a
                 * context switch. No need to awaken anyone else.
                 */
                if (cp->cpu_thread == cp->cpu_idle_thread ||
                    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
                        return;
        }

        /*
         * No need to wake up other CPUs if the thread we just enqueued
         * is bound.
         */
        if (bound)
                return;

        /*
         * See if there are any other halted CPUs. If there are, then
         * select one, and awaken it.
         * It's possible that after we find a CPU, somebody else
         * will awaken it before we get the chance.
         * In that case, look again.
         */
        do {
                cpu_found = bitset_find(&cpu_part->cp_haltset);
                if (cpu_found == (uint_t)-1)
                        return;

        } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
            cpu_found) < 0);

        /*
         * Must use correct wakeup mechanism to avoid lost wakeup of
         * alternate cpu.
         */
        if (cpu_found != CPU->cpu_seqid) {
                mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
                if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
                        MWAIT_WAKEUP(cpu_seq[cpu_found]);
                else
                        poke_cpu(cpu_seq[cpu_found]->cpu_id);
        }
}

/*
 * Function called by the CPU idle notification framework to check whether
 * the CPU has been awakened. It will be called with interrupts disabled.
 * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 * notification framework.
 */
static void
acpi_cpu_mwait_check_wakeup(void *arg)
{
        volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

        ASSERT(arg != NULL);
        if (*mcpu_mwait != MWAIT_HALTED) {
                /*
                 * CPU has been awakened, notify CPU idle notification system.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        } else {
                /*
                 * Toggle interrupt flag to detect pending interrupts.
                 * If interrupt happened, do_interrupt() will notify CPU idle
                 * notification framework so no need to call cpu_idle_exit()
                 * here.
                 */
                sti();
                SMT_PAUSE();
                cli();
        }
}

static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
        volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

        ASSERT(arg != NULL);
        if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
                /*
                 * CPU has been awakened, notify CPU idle notification system.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        } else {
                /*
                 * Toggle interrupt flag to detect pending interrupts.
                 * If interrupt happened, do_interrupt() will notify CPU idle
                 * notification framework so no need to call cpu_idle_exit()
                 * here.
                 */
                sti();
                SMT_PAUSE();
                cli();
        }
}

/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
        /*
         * Toggle interrupt flag to detect pending interrupts.
         * If interrupt happened, do_interrupt() will notify CPU idle
         * notification framework so no need to call cpu_idle_exit() here.
         */
        sti();
        SMT_PAUSE();
        cli();
}
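
/*
 * A note on the sti()/SMT_PAUSE()/cli() sequences above: STI has a one
 * instruction "interrupt shadow", so a back-to-back sti()/cli() pair would
 * never actually open an interrupt window. Executing PAUSE in between
 * guarantees that any pending interrupt is delivered (and do_interrupt()
 * then notifies the idle framework) before interrupts are disabled again.
 */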

/*
 * Idle the current CPU via an ACPI-defined System I/O read to an
 * ACPI-specified address.
 */
static void
acpi_io_idle(uint32_t address)
{
        uint32_t value;
        ACPI_TABLE_FADT *gbl_FADT;

        /*
         * Do we need to work around an ancient chipset bug in early ACPI
         * implementations that would result in a late STPCLK# assertion?
         *
         * Must be true when running on systems where the ACPI-indicated I/O
         * read to enter low-power states may resolve before actually stopping
         * the processor that initiated a low-power transition. On such
         * systems, it is possible the processor would proceed past the idle
         * point and *then* be stopped.
         *
         * An early workaround that has been carried forward is to read the
         * ACPI PM Timer after requesting a low-power transition. The timer
         * read will take long enough that we are certain the processor is
         * safe to be stopped.
         *
         * From some investigation, this was only ever necessary on older
         * Intel chipsets. Additionally, the timer read can take upwards of a
         * thousand CPU clocks, so for systems that work correctly, it's just
         * a tarpit for the CPU as it is woken back up.
         */
        boolean_t need_stpclk_workaround =
            cpuid_getvendor(CPU) == X86_VENDOR_Intel;

        /*
         * The following call will cause us to halt, which will cause the
         * store buffer to be repartitioned, potentially exposing us to the
         * Intel CPU vulnerability MDS. As such, we need to explicitly call
         * x86_md_clear() here. The other idle methods do this automatically
         * as part of the implementation of i86_mwait().
         */
        x86_md_clear();
        (void) cpu_acpi_read_port(address, &value, 8);
        if (need_stpclk_workaround) {
                acpica_get_global_FADT(&gbl_FADT);
                (void) cpu_acpi_read_port(
                    gbl_FADT->XPmTimerBlock.Address,
                    &value, 32);
        }
}

/*
 * Enter the deep c-state handler.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
        volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
        uint32_t mwait_idle_state;
        cpu_t *cpup = CPU;
        processorid_t cpu_sid = cpup->cpu_seqid;
        cpupart_t *cp = cpup->cpu_part;
        hrtime_t lapic_expire;
        uint8_t type = cstate->cs_addrspace_id;
        uint32_t cs_type = cstate->cs_type;
        int hset_update = 1;
        boolean_t using_timer;
        cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

        /*
         * Set our mcpu_mwait here, so we can tell if anyone tries to
         * wake us between now and when we call mwait. No other cpu will
         * attempt to set our mcpu_mwait until we add ourself to the haltset.
         */
        if (mcpu_mwait != NULL) {
                if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
                        mwait_idle_state = MWAIT_WAKEUP_IPI;
                        check_func = &acpi_cpu_mwait_ipi_check_wakeup;
                } else {
                        mwait_idle_state = MWAIT_HALTED;
                        check_func = &acpi_cpu_mwait_check_wakeup;
                }
                *mcpu_mwait = mwait_idle_state;
        } else {
                /*
                 * Initialize mwait_idle_state, but with mcpu_mwait NULL we'll
                 * never actually use it here. "MWAIT_RUNNING" just
                 * distinguishes from the "WAKEUP_IPI" and "HALTED" cases
                 * above.
                 */
                mwait_idle_state = MWAIT_RUNNING;
        }

        /*
         * If this CPU is online, and there are multiple CPUs
         * in the system, then we should note our halting
         * by adding ourselves to the partition's halted CPU
         * bitmap. This allows other CPUs to find/awaken us when
         * work becomes available.
         */
        if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
                hset_update = 0;

        /*
         * Add ourselves to the partition's halted CPUs bitmask
         * and set our HALTED flag, if necessary.
         *
         * When a thread becomes runnable, it is placed on the queue
         * and then the halted cpuset is checked to determine who
         * (if anyone) should be awakened. We therefore need to first
         * add ourselves to the halted cpuset, and then check if there
         * is any work available.
         *
         * Note that memory barriers after updating the HALTED flag
         * are not necessary since an atomic operation (updating the bitmap)
         * immediately follows. On x86 the atomic operation acts as a
         * memory barrier for the update of cpu_disp_flags.
         */
        if (hset_update) {
                cpup->cpu_disp_flags |= CPU_DISP_HALTED;
                bitset_atomic_add(&cp->cp_haltset, cpu_sid);
        }

        /*
         * Check to make sure there's really nothing to do. Work destined for
         * this CPU may become available after this check. If we're
         * mwait-halting we'll be notified through the clearing of our bit in
         * the halted CPU bitmask, and a write to our mcpu_mwait. Otherwise,
         * we're hlt-based halting, and we'll be immediately woken by the
         * pending interrupt.
         *
         * disp_anywork() checks disp_nrunnable, so we do not have to later.
         */
        if (disp_anywork()) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * We're on our way to being halted.
         *
         * The local APIC timer can stop in ACPI C2 and deeper c-states.
         * Try to program the HPET hardware to substitute for this CPU's
         * LAPIC timer.
         * cstate_use_timer() could disable the LAPIC Timer. Make sure
         * to start the LAPIC Timer again before leaving this function.
         *
         * Disable interrupts here so we will awaken immediately after halting
         * if someone tries to poke us between now and the time we actually
         * halt.
         */
        cli();
        using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

        /*
         * We check for the presence of our bit after disabling interrupts.
         * If it's cleared, we'll return. If the bit is cleared after
         * we check then the cstate_wakeup() will pop us out of the halted
         * state.
         *
         * This means that the ordering of the cstate_wakeup() and the clearing
         * of the bit by cpu_wakeup is important.
         * cpu_wakeup() must clear our mc_haltset bit, and then call
         * cstate_wakeup().
         * acpi_cpu_cstate() must disable interrupts, then check for the bit.
         */
        if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                return;
        }

        /*
         * The check for anything locally runnable is here for performance
         * and isn't needed for correctness. disp_nrunnable ought to be
         * in our cache still, so it's inexpensive to check, and if there
         * is anything runnable we won't have to wait for the poke.
         */
        if (cpup->cpu_disp->disp_nrunnable != 0) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        if (using_timer == B_FALSE) {

                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();

                /*
                 * We are currently unable to program the HPET to act as this
                 * CPU's proxy LAPIC timer. This CPU cannot enter C2 or deeper
                 * because no timer is set to wake it up while its LAPIC timer
                 * stalls in deep C-States.
                 * Enter C1 instead.
                 *
                 * cstate_wakeup() will wake this CPU with an IPI, which works
                 * with either MWAIT or HLT.
                 */
                if (mcpu_mwait != NULL) {
                        i86_monitor(mcpu_mwait, 0, 0);
                        if (*mcpu_mwait == MWAIT_HALTED) {
                                if (cpu_idle_enter(IDLE_STATE_C1, 0,
                                    check_func, (void *)mcpu_mwait) == 0) {
                                        if (*mcpu_mwait == MWAIT_HALTED) {
                                                i86_mwait(0, 0);
                                        }
                                        cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                                }
                        }
                } else {
                        if (cpu_idle_enter(cs_type, 0, check_func, NULL) == 0) {
                                mach_cpu_idle();
                                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                        }
                }

                /*
                 * We're no longer halted
                 */
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * Tell the cpu idle framework we're going to try idling.
         *
         * If cpu_idle_enter returns nonzero, we've found out at the last
         * minute that we don't actually want to idle.
         */
        boolean_t idle_ok = cpu_idle_enter(cs_type, 0, check_func,
            (void *)mcpu_mwait) == 0;

        if (idle_ok) {
                if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
                        if (mcpu_mwait != NULL) {
                                /*
                                 * We're on our way to being halted.
                                 * To avoid a lost wakeup, arm the monitor
                                 * before checking if another cpu wrote to
                                 * mcpu_mwait to wake us up.
                                 */
                                i86_monitor(mcpu_mwait, 0, 0);
                                if (*mcpu_mwait == mwait_idle_state) {
                                        i86_mwait(cstate->cs_address, 1);
                                }
                        } else {
                                mach_cpu_idle();
                        }
                } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
                        /*
                         * mcpu_mwait is not directly part of idling or wakeup
                         * in the ACPI System I/O case, but if available it can
                         * hint that we shouldn't actually try to idle because
                         * we're about to be woken up anyway.
                         *
                         * A trip through idle/wakeup can be upwards of a few
                         * microseconds, so avoiding that makes this a helpful
                         * optimization, but consulting mcpu_mwait is still not
                         * necessary for correctness here.
                         */
                        if (!mcpu_mwait || *mcpu_mwait == mwait_idle_state) {
                                acpi_io_idle(cstate->cs_address);
                        }
                }

                /*
                 * We've either idled and woken up, or decided not to idle.
                 * Either way, tell the cpu idle framework that we're not
                 * trying to idle anymore.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        }

        /*
         * The LAPIC timer may have stopped in deep c-state.
         * Reprogram this CPU's LAPIC here before enabling interrupts.
         */
        (void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
        sti();

        /*
         * We're no longer halted
         */
        if (hset_update) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
        }
}

/*
 * Idle the present CPU; deep c-states are supported.
 */
void
cpu_acpi_idle(void)
{
        cpu_t *cp = CPU;
        cpu_acpi_handle_t handle;
        cma_c_state_t *cs_data;
        cpu_acpi_cstate_t *cstates;
        hrtime_t start, end;
        int cpu_max_cstates;
        uint32_t cs_indx;
        uint16_t cs_type;

        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        handle = mach_state->ms_acpi_handle;
        ASSERT(CPU_ACPI_CSTATES(handle) != NULL);

        cs_data = mach_state->ms_cstate.cma_state.cstate;
        cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        ASSERT(cstates != NULL);
        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
        if (cpu_max_cstates > CPU_MAX_CSTATES)
                cpu_max_cstates = CPU_MAX_CSTATES;
        if (cpu_max_cstates == 1) {     /* no ACPI c-state data */
                (*non_deep_idle_cpu)();
                return;
        }

        start = gethrtime_unscaled();

        cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);

        cs_type = cstates[cs_indx].cs_type;

        switch (cs_type) {
        default:
                /* FALLTHROUGH */
        case CPU_ACPI_C1:
                (*non_deep_idle_cpu)();
                break;

        case CPU_ACPI_C2:
                acpi_cpu_cstate(&cstates[cs_indx]);
                break;

        case CPU_ACPI_C3:
                /*
                 * All supported Intel processors maintain cache coherency
                 * during C3. Currently, when entering C3, processors flush
                 * core caches to the higher-level shared cache. The shared
                 * cache maintains state and supports probes during C3.
                 * Consequently there is no need to handle cache coherency
                 * and Bus Master activity here with the cache flush, BM_RLD
                 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
                 * in section 8.1.4 of the ACPI Specification 4.0.
                 */
                acpi_cpu_cstate(&cstates[cs_indx]);
                break;
        }

        end = gethrtime_unscaled();

        /*
         * Update statistics
         */
        cpupm_wakeup_cstate_data(cs_data, end);
}

boolean_t
cpu_deep_cstates_supported(void)
{
        extern int idle_cpu_no_deep_c;

        if (idle_cpu_no_deep_c)
                return (B_FALSE);

        if (!cpuid_deep_cstates_supported())
                return (B_FALSE);

        if (cpuid_arat_supported()) {
                cpu_cstate_arat = B_TRUE;
                return (B_TRUE);
        }

        /*
         * In theory we can use the HPET as a proxy timer in case we can't rely
         * on the LAPIC in deep C-states. In practice on AMD it seems something
         * isn't quite right and we just don't get woken up, so the proxy timer
         * approach doesn't work. Only set up the HPET as proxy timer on Intel
         * systems for now.
         */
        if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
            (hpet.supported == HPET_FULL_SUPPORT) &&
            hpet.install_proxy()) {
                cpu_cstate_hpet = B_TRUE;
                return (B_TRUE);
        }

        return (B_FALSE);
}
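
/*
 * Note that cpu_deep_cstates_supported() can be forced to return B_FALSE by
 * setting the idle_cpu_no_deep_c tunable (for example, "set
 * idle_cpu_no_deep_c = 1" in /etc/system), which disables deep C-state
 * support regardless of what the hardware and ACPI advertise.
 */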

/*
 * Validate that this processor supports deep cstate and, if so,
 * get the c-state data from ACPI and cache it.
 */
static int
cpu_idle_init(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        char name[KSTAT_STRLEN];
        int cpu_max_cstates, i;
        int ret;

        /*
         * Cache the C-state specific ACPI data.
         */
        if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
                if (ret < 0)
                        cmn_err(CE_NOTE,
                            "!Support for CPU deep idle states is being "
                            "disabled due to errors parsing ACPI C-state "
                            "objects exported by BIOS.");
                cpu_idle_fini(cp);
                return (-1);
        }

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);

        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

        for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                (void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
                /*
                 * Allocate, initialize and install cstate kstat
                 */
                cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
                    name, "misc",
                    KSTAT_TYPE_NAMED,
                    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL);

                if (cstate->cs_ksp == NULL) {
                        cmn_err(CE_NOTE, "kstat_create(c_state) fail");
                } else {
                        cstate->cs_ksp->ks_data = &cpu_idle_kstat;
                        cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
                        cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
                        cstate->cs_ksp->ks_data_size += MAXNAMELEN;
                        cstate->cs_ksp->ks_private = cstate;
                        kstat_install(cstate->cs_ksp);
                }
                cstate++;
        }

        cpupm_alloc_domains(cp, CPUPM_C_STATES);
        cpupm_alloc_ms_cstate(cp);

        if (cpu_deep_cstates_supported()) {
                uint32_t value;

                mutex_enter(&cpu_idle_callb_mutex);
                if (cpu_deep_idle_callb_id == (callb_id_t)0)
                        cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
                            (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
                if (cpu_idle_cpr_callb_id == (callb_id_t)0)
                        cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
                            (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
                mutex_exit(&cpu_idle_callb_mutex);

                /*
                 * All supported CPUs (Nehalem and later) will remain in C3
                 * during Bus Master activity.
                 * Set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it is not
                 * already 0 before enabling deeper C-states.
                 */
                cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
                if (value & 1)
                        cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
        }

        return (0);
}
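
/*
 * The per-C-state kstats created above can be inspected from userland with,
 * for example, "kstat -m cstate -i <cpuid>"; each instance exports the
 * address_space_id, latency, and power values taken from the ACPI _CST
 * entry for that C-state.
 */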

/*
 * Free resources allocated by cpu_idle_init().
 */
static void
cpu_idle_fini(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        uint_t cpu_max_cstates, i;

        /*
         * The idle cpu routine points back to the generic one.
         */
        idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
        disp_enq_thread = non_deep_idle_disp_enq_thread;

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        if (cstate) {
                cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

                for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                        if (cstate->cs_ksp != NULL)
                                kstat_delete(cstate->cs_ksp);
                        cstate++;
                }
        }

        cpupm_free_ms_cstate(cp);
        cpupm_free_domains(&cpupm_cstate_domains);
        cpu_acpi_free_cstate_data(handle);

        mutex_enter(&cpu_idle_callb_mutex);
        if (cpu_deep_idle_callb_id != (callb_id_t)0) {
                (void) callb_delete(cpu_deep_idle_callb_id);
                cpu_deep_idle_callb_id = (callb_id_t)0;
        }
        if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
                (void) callb_delete(cpu_idle_cpr_callb_id);
                cpu_idle_cpr_callb_id = (callb_id_t)0;
        }
        mutex_exit(&cpu_idle_callb_mutex);
}

/*
 * This function exists to resolve a race between the master and the slave
 * CPU over access to the c-state data structures. Once the slave has called
 * this idle function and switched to the non-deep idle function, the master
 * can safely go on to reclaim the resources.
 */
static void
cpu_idle_stop_sync(void)
{
        /* switch to the non deep idle function */
        CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
}

static void
cpu_idle_stop(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        uint_t cpu_max_cstates, i = 0;

        mutex_enter(&cpu_idle_callb_mutex);
        if (idle_cpu == cpu_idle_adaptive) {
                /*
                 * Invoke the slave to call the synchronous idle function.
                 */
                cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
                poke_cpu(cp->cpu_id);

                /*
                 * Wait until the slave switches to the non-deep idle
                 * function, so that the master is safe to go on and reclaim
                 * the resources.
                 */
                while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
                        drv_usecwait(10);
                        if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
                                cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
                                    " idle stop timeout");
                }
        }
        mutex_exit(&cpu_idle_callb_mutex);

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        if (cstate) {
                cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

                for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                        if (cstate->cs_ksp != NULL)
                                kstat_delete(cstate->cs_ksp);
                        cstate++;
                }
        }
        cpupm_free_ms_cstate(cp);
        cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
        cpu_acpi_free_cstate_data(handle);
}

/*ARGSUSED*/
static boolean_t
cpu_deep_idle_callb(void *arg, int code)
{
        boolean_t rslt = B_TRUE;

        mutex_enter(&cpu_idle_callb_mutex);
        switch (code) {
        case PM_DEFAULT_CPU_DEEP_IDLE:
                /*
                 * The default policy is the same as enable.
                 */
                /*FALLTHROUGH*/
        case PM_ENABLE_CPU_DEEP_IDLE:
                if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
                        break;

                if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
                        disp_enq_thread = cstate_wakeup;
                        idle_cpu = cpu_idle_adaptive;
                        cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
                } else {
                        rslt = B_FALSE;
                }
                break;

        case PM_DISABLE_CPU_DEEP_IDLE:
                if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
                        break;

                idle_cpu = non_deep_idle_cpu;
                if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
                        disp_enq_thread = non_deep_idle_disp_enq_thread;
                        cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
                }
                break;

        default:
                cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
                    code);
                break;
        }
        mutex_exit(&cpu_idle_callb_mutex);
        return (rslt);
}

/*ARGSUSED*/
static boolean_t
cpu_idle_cpr_callb(void *arg, int code)
{
        boolean_t rslt = B_TRUE;

        mutex_enter(&cpu_idle_callb_mutex);
        switch (code) {
        case CB_CODE_CPR_RESUME:
                if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
                        /*
                         * Do not enable dispatcher hooks if disabled by user.
                         */
                        if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
                                break;

                        disp_enq_thread = cstate_wakeup;
                        idle_cpu = cpu_idle_adaptive;
                } else {
                        rslt = B_FALSE;
                }
                break;

        case CB_CODE_CPR_CHKPT:
                idle_cpu = non_deep_idle_cpu;
                disp_enq_thread = non_deep_idle_disp_enq_thread;
                (void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
                break;

        default:
                cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
                break;
        }
        mutex_exit(&cpu_idle_callb_mutex);
        return (rslt);
}

/*
 * Handle a _CST notification.
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef __xpv
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        cpu_acpi_handle_t handle;
        struct machcpu *mcpu;
        cpuset_t dom_cpu_set;
        kmutex_t *pm_lock;
        int result = 0;
        processorid_t cpu_id;

        if (mach_state == NULL) {
                return;
        }

        ASSERT(mach_state->ms_cstate.cma_domain != NULL);
        dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
        pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

        /*
         * Do this for all the CPUs in the domain.
         */
        mutex_enter(pm_lock);
        do {
                CPUSET_FIND(dom_cpu_set, cpu_id);
                if (cpu_id == CPUSET_NOTINSET)
                        break;

                ASSERT(cpu_id >= 0 && cpu_id < NCPU);
                cp = cpu[cpu_id];
                mach_state = (cpupm_mach_state_t *)
                    cp->cpu_m.mcpu_pm_mach_state;
                if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
                        mutex_exit(pm_lock);
                        return;
                }
                handle = mach_state->ms_acpi_handle;
                ASSERT(handle != NULL);

                /*
                 * Re-evaluate the cstate object.
                 */
                if (cpu_acpi_cache_cstate_data(handle) != 0) {
                        cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
                            " object Instance: %d", cpu_id);
                }
                mcpu = &(cp->cpu_m);
                mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
                if (mcpu->max_cstates > CPU_ACPI_C1) {
                        (void) cstate_timer_callback(
                            CST_EVENT_MULTIPLE_CSTATES);
                        disp_enq_thread = cstate_wakeup;
                        cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
                } else if (mcpu->max_cstates == CPU_ACPI_C1) {
                        disp_enq_thread = non_deep_idle_disp_enq_thread;
                        cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
                        (void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
                }

                CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
        } while (result < 0);
        mutex_exit(pm_lock);
#endif
}

/*
 * Handle a change in the number or type of available processor power states.
 */
void
cpuidle_manage_cstates(void *ctx)
{
        cpu_t *cp = ctx;
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        boolean_t is_ready;

        if (mach_state == NULL) {
                return;
        }

        /*
         * We currently refuse to power manage if the CPU is not ready to
         * take cross calls (cross calls fail silently if the CPU is not
         * ready for them).
         *
         * Additionally, for x86 platforms we cannot power manage an instance
         * until it has been initialized.
         */
        is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
        if (!is_ready)
                return;

        cpuidle_cstate_instance(cp);
}