/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/x86_archext.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/stat.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpu_acpi.h>
#include <sys/cpu_idle.h>
#include <sys/cpupm.h>
#include <sys/cpu_event.h>
#include <sys/hpet.h>
#include <sys/archsystm.h>
#include <vm/hat_i86.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/callb.h>

#define CSTATE_USING_HPET               1
#define CSTATE_USING_LAT                2

#define CPU_IDLE_STOP_TIMEOUT           1000

extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * Whether an always-running local APIC timer is available, and whether the
 * HPET timer is used in deep C-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;

/*
 * Interfaces for modules implementing Intel's deep c-state.
 */
cpupm_state_ops_t cpu_idle_ops = {
        "Generic ACPI C-state Support",
        cpu_idle_init,
        cpu_idle_fini,
        NULL,
        cpu_idle_stop
};

static kmutex_t cpu_idle_callb_mutex;
static callb_id_t cpu_deep_idle_callb_id;
static callb_id_t cpu_idle_cpr_callb_id;
static uint_t cpu_idle_cfg_state;

static kmutex_t cpu_idle_mutex;

cpu_idle_kstat_t cpu_idle_kstat = {
        { "address_space_id", KSTAT_DATA_STRING },
        { "latency", KSTAT_DATA_UINT32 },
        { "power", KSTAT_DATA_UINT32 },
};

/*
 * kstat update function of the c-state info
 */
static int
cpu_idle_kstat_update(kstat_t *ksp, int flag)
{
        cpu_acpi_cstate_t *cstate = ksp->ks_private;

        if (flag == KSTAT_WRITE) {
                return (EACCES);
        }

        if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "FFixedHW");
        } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "SystemIO");
        } else {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "Unsupported");
        }

        cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
        cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;

        return (0);
}

/*
 * Used during configuration callbacks to manage implementation-specific
 * details of the hardware timer used during Deep C-state.
 */
boolean_t
cstate_timer_callback(int code)
{
        if (cpu_cstate_arat) {
                return (B_TRUE);
        } else if (cpu_cstate_hpet) {
                return (hpet.callback(code));
        }
        return (B_FALSE);
}

/*
 * Some Local APIC Timers do not work during Deep C-states.
 * The Deep C-state idle function uses this function to ensure it is using a
 * hardware timer that works during Deep C-states. This function also
 * switches the timer back to the LAPIC Timer after Deep C-state.
 */
static boolean_t
cstate_use_timer(hrtime_t *lapic_expire, int timer)
{
        if (cpu_cstate_arat)
                return (B_TRUE);

        /*
         * We have to return B_FALSE if there is no ARAT or HPET support.
         */
        if (!cpu_cstate_hpet)
                return (B_FALSE);

        switch (timer) {
        case CSTATE_USING_HPET:
                return (hpet.use_hpet_timer(lapic_expire));
        case CSTATE_USING_LAT:
                hpet.use_lapic_timer(*lapic_expire);
                return (B_TRUE);
        default:
                return (B_FALSE);
        }
}

/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
        struct machcpu *mcpu = &(cp->cpu_m);
        volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
        cpupart_t *cpu_part;
        uint_t cpu_found;
        processorid_t cpu_sid;

        cpu_part = cp->cpu_part;
        cpu_sid = cp->cpu_seqid;
        /*
         * Clear the halted bit for that CPU since it will be woken up
         * in a moment.
         */
        if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
                /*
                 * Clear the halted bit for that CPU since it will be
                 * poked in a moment.
                 */
                bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

                /*
                 * We may find the current CPU present in the halted cpuset
                 * if we're in the context of an interrupt that occurred
                 * before we had a chance to clear our bit in cpu_idle().
                 * Waking ourself is obviously unnecessary, since if
                 * we're here, we're not halted.
                 */
                if (cp != CPU) {
                        /*
                         * Use the correct wakeup mechanism.
                         */
                        if ((mcpu_mwait != NULL) &&
                            (*mcpu_mwait == MWAIT_HALTED))
                                MWAIT_WAKEUP(cp);
                        else
                                poke_cpu(cp->cpu_id);
                }
                return;
        } else {
                /*
                 * This cpu isn't halted, but it's idle or undergoing a
                 * context switch. No need to awaken anyone else.
                 */
                if (cp->cpu_thread == cp->cpu_idle_thread ||
                    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
                        return;
        }

        /*
         * No need to wake up other CPUs if the thread we just enqueued
         * is bound.
         */
        if (bound)
                return;

        /*
         * See if there are any other halted CPUs. If there are, then
         * select one, and awaken it.
         * It's possible that after we find a CPU, somebody else
         * will awaken it before we get the chance.
         * In that case, look again.
         */
        do {
                cpu_found = bitset_find(&cpu_part->cp_haltset);
                if (cpu_found == (uint_t)-1)
                        return;

        } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
            cpu_found) < 0);

        /*
         * Must use the correct wakeup mechanism to avoid a lost wakeup of
         * the alternate cpu.
         */
        if (cpu_found != CPU->cpu_seqid) {
                mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
                if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
                        MWAIT_WAKEUP(cpu_seq[cpu_found]);
                else
                        poke_cpu(cpu_seq[cpu_found]->cpu_id);
        }
}

/*
 * Function called by the CPU idle notification framework to check whether
 * the CPU has been awakened. It will be called with interrupts disabled.
 * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 * notification framework.
 */
static void
acpi_cpu_mwait_check_wakeup(void *arg)
{
        volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

        ASSERT(arg != NULL);
        if (*mcpu_mwait != MWAIT_HALTED) {
                /*
                 * CPU has been awakened, notify CPU idle notification system.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        } else {
                /*
                 * Toggle interrupt flag to detect pending interrupts.
                 * If interrupt happened, do_interrupt() will notify CPU idle
                 * notification framework so no need to call cpu_idle_exit()
                 * here.
                 */
                sti();
                SMT_PAUSE();
                cli();
        }
}

static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
        volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

        ASSERT(arg != NULL);
        if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
                /*
                 * CPU has been awakened, notify CPU idle notification system.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        } else {
                /*
                 * Toggle interrupt flag to detect pending interrupts.
                 * If interrupt happened, do_interrupt() will notify CPU idle
                 * notification framework so no need to call cpu_idle_exit()
                 * here.
                 */
                sti();
                SMT_PAUSE();
                cli();
        }
}

/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
        /*
         * Toggle interrupt flag to detect pending interrupts.
         * If interrupt happened, do_interrupt() will notify CPU idle
         * notification framework so no need to call cpu_idle_exit() here.
         */
        sti();
        SMT_PAUSE();
        cli();
}

/*
 * Idle the current CPU via an ACPI-defined System I/O read to an
 * ACPI-specified address.
 */
static void
acpi_io_idle(uint32_t address)
{
        uint32_t value;
        ACPI_TABLE_FADT *gbl_FADT;

        /*
         * Do we need to work around an ancient chipset bug in early ACPI
         * implementations that would result in a late STPCLK# assertion?
         *
         * Must be true when running on systems where the ACPI-indicated I/O
         * read to enter low-power states may resolve before actually
         * stopping the processor that initiated a low-power transition. On
         * such systems, it is possible the processor would proceed past the
         * idle point and *then* be stopped.
         *
         * An early workaround that has been carried forward is to read the
         * ACPI PM Timer after requesting a low-power transition. The timer
         * read will take long enough that we are certain the processor is
         * safe to be stopped.
         *
         * From some investigation, this was only ever necessary on older
         * Intel chipsets. Additionally, the timer read can take upwards of
         * a thousand CPU clocks, so for systems that work correctly, it's
         * just a tarpit for the CPU as it is woken back up.
         */
        boolean_t need_stpclk_workaround =
            cpuid_getvendor(CPU) == X86_VENDOR_Intel;

        /*
         * The following call will cause us to halt which will cause the
         * store buffer to be repartitioned, potentially exposing us to the
         * Intel CPU vulnerability MDS. As such, we need to explicitly call
         * that here. The other idle methods do this automatically as part
         * of the implementation of i86_mwait().
         */
        x86_md_clear();
        (void) cpu_acpi_read_port(address, &value, 8);
        if (need_stpclk_workaround) {
                acpica_get_global_FADT(&gbl_FADT);
                (void) cpu_acpi_read_port(
                    gbl_FADT->XPmTimerBlock.Address,
                    &value, 32);
        }
}

/*
 * Handler to enter a deep c-state.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
        volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
        cpu_t *cpup = CPU;
        processorid_t cpu_sid = cpup->cpu_seqid;
        cpupart_t *cp = cpup->cpu_part;
        hrtime_t lapic_expire;
        uint8_t type = cstate->cs_addrspace_id;
        uint32_t cs_type = cstate->cs_type;
        int hset_update = 1;
        boolean_t using_timer;
        cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

        /*
         * Set our mcpu_mwait here, so we can tell if anyone tries to
         * wake us between now and when we call mwait. No other cpu will
         * attempt to set our mcpu_mwait until we add ourself to the haltset.
         */
        if (mcpu_mwait) {
                if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
                        *mcpu_mwait = MWAIT_WAKEUP_IPI;
                        check_func = &acpi_cpu_mwait_ipi_check_wakeup;
                } else {
                        *mcpu_mwait = MWAIT_HALTED;
                        check_func = &acpi_cpu_mwait_check_wakeup;
                }
        }

        /*
         * If this CPU is online, and there are multiple CPUs
         * in the system, then we should note our halting
         * by adding ourselves to the partition's halted CPU
         * bitmap. This allows other CPUs to find/awaken us when
         * work becomes available.
         */
        if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
                hset_update = 0;

        /*
         * Add ourselves to the partition's halted CPUs bitmask
         * and set our HALTED flag, if necessary.
         *
         * When a thread becomes runnable, it is placed on the queue
         * and then the halted cpuset is checked to determine who
         * (if anyone) should be awakened. We therefore need to first
         * add ourselves to the halted cpuset, and then check if there
         * is any work available.
         *
         * Note that memory barriers after updating the HALTED flag
         * are not necessary since an atomic operation (updating the bitmap)
         * immediately follows. On x86 the atomic operation acts as a
         * memory barrier for the update of cpu_disp_flags.
         */
        if (hset_update) {
                cpup->cpu_disp_flags |= CPU_DISP_HALTED;
                bitset_atomic_add(&cp->cp_haltset, cpu_sid);
        }

        /*
         * Check to make sure there's really nothing to do.
         * Work destined for this CPU may become available after
         * this check. We'll be notified through the clearing of our
         * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
         *
         * disp_anywork() checks disp_nrunnable, so we do not have to later.
         */
        if (disp_anywork()) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * We're on our way to being halted.
         *
         * The local APIC timer can stop in ACPI C2 and deeper c-states.
         * Try to program the HPET hardware to substitute for this CPU's
         * LAPIC timer.
         * cstate_use_timer() could disable the LAPIC Timer. Make sure
         * to start the LAPIC Timer again before leaving this function.
         *
         * Disable interrupts here so we will awaken immediately after
         * halting if someone tries to poke us between now and the time we
         * actually halt.
         */
        cli();
        using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

        /*
         * We check for the presence of our bit after disabling interrupts.
         * If it's cleared, we'll return. If the bit is cleared after
         * we check then the cstate_wakeup() will pop us out of the halted
         * state.
         *
         * This means that the ordering of the cstate_wakeup() and the
         * clearing of the bit by cpu_wakeup is important.
         * cpu_wakeup() must clear our mc_haltset bit, and then call
         * cstate_wakeup().
         * acpi_cpu_cstate() must disable interrupts, then check for the bit.
         */
        if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                return;
        }

        /*
         * The check for anything locally runnable is here for performance
         * and isn't needed for correctness. disp_nrunnable ought to be
         * in our cache still, so it's inexpensive to check, and if there
         * is anything runnable we won't have to wait for the poke.
         */
        if (cpup->cpu_disp->disp_nrunnable != 0) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        if (using_timer == B_FALSE) {

                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();

                /*
                 * We are currently unable to program the HPET to act as this
                 * CPU's proxy LAPIC timer. This CPU cannot enter C2 or deeper
                 * because no timer is set to wake it up while its LAPIC timer
                 * stalls in deep C-States.
                 * Enter C1 instead.
                 *
                 * cstate_wake_cpu() will wake this CPU with an IPI which
                 * works with MWAIT.
                 */
                i86_monitor(mcpu_mwait, 0, 0);
                if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
                        if (cpu_idle_enter(IDLE_STATE_C1, 0,
                            check_func, (void *)mcpu_mwait) == 0) {
                                if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
                                    MWAIT_HALTED) {
                                        i86_mwait(0, 0);
                                }
                                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                        }
                }

                /*
                 * We're no longer halted
                 */
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
                /*
                 * We're on our way to being halted.
                 * To avoid a lost wakeup, arm the monitor before checking
                 * if another cpu wrote to mcpu_mwait to wake us up.
                 */
                i86_monitor(mcpu_mwait, 0, 0);
                if (*mcpu_mwait == MWAIT_HALTED) {
                        if (cpu_idle_enter((uint_t)cs_type, 0,
                            check_func, (void *)mcpu_mwait) == 0) {
                                if (*mcpu_mwait == MWAIT_HALTED) {
                                        i86_mwait(cstate->cs_address, 1);
                                }
                                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                        }
                }
        } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
                if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
                        if (cpu_idle_enter((uint_t)cs_type, 0,
                            check_func, (void *)mcpu_mwait) == 0) {
                                if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
                                        acpi_io_idle(cstate->cs_address);
                                }
                                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                        }
                }
        }

        /*
         * The LAPIC timer may have stopped in deep c-state.
         * Reprogram this CPU's LAPIC here before enabling interrupts.
         */
        (void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
        sti();

        /*
         * We're no longer halted
         */
        if (hset_update) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
        }
}

/*
 * Idle the present CPU; deep c-state is supported.
 */
void
cpu_acpi_idle(void)
{
        cpu_t *cp = CPU;
        cpu_acpi_handle_t handle;
        cma_c_state_t *cs_data;
        cpu_acpi_cstate_t *cstates;
        hrtime_t start, end;
        int cpu_max_cstates;
        uint32_t cs_indx;
        uint16_t cs_type;

        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        handle = mach_state->ms_acpi_handle;
        ASSERT(CPU_ACPI_CSTATES(handle) != NULL);

        cs_data = mach_state->ms_cstate.cma_state.cstate;
        cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        ASSERT(cstates != NULL);
        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
        if (cpu_max_cstates > CPU_MAX_CSTATES)
                cpu_max_cstates = CPU_MAX_CSTATES;
        if (cpu_max_cstates == 1) {     /* no ACPI c-state data */
                (*non_deep_idle_cpu)();
                return;
        }

        start = gethrtime_unscaled();

        cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);

        cs_type = cstates[cs_indx].cs_type;

        switch (cs_type) {
        default:
                /* FALLTHROUGH */
        case CPU_ACPI_C1:
                (*non_deep_idle_cpu)();
                break;

        case CPU_ACPI_C2:
                acpi_cpu_cstate(&cstates[cs_indx]);
                break;

        case CPU_ACPI_C3:
                /*
                 * All supported Intel processors maintain cache coherency
                 * during C3. Currently when entering C3 processors flush
                 * core caches to higher level shared cache. The shared cache
                 * maintains state and supports probes during C3.
                 * Consequently there is no need to handle cache coherency
                 * and Bus Master activity here with the cache flush, BM_RLD
                 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
                 * in section 8.1.4 of the ACPI Specification 4.0.
                 */
                acpi_cpu_cstate(&cstates[cs_indx]);
                break;
        }

        end = gethrtime_unscaled();

        /*
         * Update statistics
         */
        cpupm_wakeup_cstate_data(cs_data, end);
}

boolean_t
cpu_deep_cstates_supported(void)
{
        extern int idle_cpu_no_deep_c;

        if (idle_cpu_no_deep_c)
                return (B_FALSE);

        if (!cpuid_deep_cstates_supported())
                return (B_FALSE);

        if (cpuid_arat_supported()) {
                cpu_cstate_arat = B_TRUE;
                return (B_TRUE);
        }

        /*
         * In theory we can use the HPET as a proxy timer in case we can't
         * rely on the LAPIC in deep C-states. In practice on AMD it seems
         * something isn't quite right and we just don't get woken up, so
         * the proxy timer approach doesn't work. Only set up the HPET as
         * proxy timer on Intel systems for now.
         */
        if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
            (hpet.supported == HPET_FULL_SUPPORT) &&
            hpet.install_proxy()) {
                cpu_cstate_hpet = B_TRUE;
                return (B_TRUE);
        }

        return (B_FALSE);
}

/*
 * Validate that this processor supports deep cstate and if so,
 * get the c-state data from ACPI and cache it.
 */
static int
cpu_idle_init(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        char name[KSTAT_STRLEN];
        int cpu_max_cstates, i;
        int ret;

        /*
         * Cache the C-state specific ACPI data.
         */
        if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
                if (ret < 0)
                        cmn_err(CE_NOTE,
                            "!Support for CPU deep idle states is being "
                            "disabled due to errors parsing ACPI C-state "
                            "objects exported by BIOS.");
                cpu_idle_fini(cp);
                return (-1);
        }

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);

        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

        for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                (void) snprintf(name, KSTAT_STRLEN - 1, "c%d",
                    cstate->cs_type);
                /*
                 * Allocate, initialize and install cstate kstat
                 */
                cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
                    name, "misc",
                    KSTAT_TYPE_NAMED,
                    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL);

                if (cstate->cs_ksp == NULL) {
                        cmn_err(CE_NOTE, "kstat_create(c_state) fail");
                } else {
                        cstate->cs_ksp->ks_data = &cpu_idle_kstat;
                        cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
                        cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
                        cstate->cs_ksp->ks_data_size += MAXNAMELEN;
                        cstate->cs_ksp->ks_private = cstate;
                        kstat_install(cstate->cs_ksp);
                }
                cstate++;
        }

        cpupm_alloc_domains(cp, CPUPM_C_STATES);
        cpupm_alloc_ms_cstate(cp);

        if (cpu_deep_cstates_supported()) {
                uint32_t value;

                mutex_enter(&cpu_idle_callb_mutex);
                if (cpu_deep_idle_callb_id == (callb_id_t)0)
                        cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
                            (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
                if (cpu_idle_cpr_callb_id == (callb_id_t)0)
                        cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
                            (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
                mutex_exit(&cpu_idle_callb_mutex);

                /*
                 * All supported CPUs (Nehalem and later) will remain in C3
                 * during Bus Master activity.
                 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
                 * is not already 0 before enabling Deeper C-states.
                 */
                cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
                if (value & 1)
                        cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
        }

        return (0);
}

/*
 * Free resources allocated by cpu_idle_init().
 */
static void
cpu_idle_fini(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        uint_t cpu_max_cstates, i;

        /*
         * idle cpu points back to the generic one
         */
        idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
        disp_enq_thread = non_deep_idle_disp_enq_thread;

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        if (cstate) {
                cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

                for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                        if (cstate->cs_ksp != NULL)
                                kstat_delete(cstate->cs_ksp);
                        cstate++;
                }
        }

        cpupm_free_ms_cstate(cp);
        cpupm_free_domains(&cpupm_cstate_domains);
        cpu_acpi_free_cstate_data(handle);

        mutex_enter(&cpu_idle_callb_mutex);
        if (cpu_deep_idle_callb_id != (callb_id_t)0) {
                (void) callb_delete(cpu_deep_idle_callb_id);
                cpu_deep_idle_callb_id = (callb_id_t)0;
        }
        if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
                (void) callb_delete(cpu_idle_cpr_callb_id);
                cpu_idle_cpr_callb_id = (callb_id_t)0;
        }
        mutex_exit(&cpu_idle_callb_mutex);
}

/*
 * This function is introduced here to solve a race condition between the
 * master and the slave touching the c-state data structure. After the slave
 * calls this idle function to switch to the non-deep idle function, the
 * master can go on to reclaim the resource.
 */
static void
cpu_idle_stop_sync(void)
{
        /* switch to the non-deep idle function */
        CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
}

static void
cpu_idle_stop(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        uint_t cpu_max_cstates, i = 0;

        mutex_enter(&cpu_idle_callb_mutex);
        if (idle_cpu == cpu_idle_adaptive) {
                /*
                 * Invoke the slave to call the synchronous idle function.
                 */
                cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
                poke_cpu(cp->cpu_id);

                /*
                 * Wait until the slave switches to the non-deep idle
                 * function, so that the master is safe to go on to reclaim
                 * the resource.
                 */
                while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
                        drv_usecwait(10);
                        if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
                                cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
                                    " idle stop timeout");
                }
        }
        mutex_exit(&cpu_idle_callb_mutex);

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        if (cstate) {
                cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

                for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                        if (cstate->cs_ksp != NULL)
                                kstat_delete(cstate->cs_ksp);
                        cstate++;
                }
        }
        cpupm_free_ms_cstate(cp);
        cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
        cpu_acpi_free_cstate_data(handle);
}

/*ARGSUSED*/
static boolean_t
cpu_deep_idle_callb(void *arg, int code)
{
        boolean_t rslt = B_TRUE;

        mutex_enter(&cpu_idle_callb_mutex);
        switch (code) {
        case PM_DEFAULT_CPU_DEEP_IDLE:
                /*
                 * Default policy is same as enable
                 */
                /*FALLTHROUGH*/
        case PM_ENABLE_CPU_DEEP_IDLE:
                if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
                        break;

                if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
                        disp_enq_thread = cstate_wakeup;
                        idle_cpu = cpu_idle_adaptive;
                        cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
                } else {
                        rslt = B_FALSE;
                }
                break;

        case PM_DISABLE_CPU_DEEP_IDLE:
                if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
                        break;

                idle_cpu = non_deep_idle_cpu;
                if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
                        disp_enq_thread = non_deep_idle_disp_enq_thread;
                        cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
                }
                break;

        default:
                cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
                    code);
                break;
        }
        mutex_exit(&cpu_idle_callb_mutex);
        return (rslt);
}

/*ARGSUSED*/
static boolean_t
cpu_idle_cpr_callb(void *arg, int code)
{
        boolean_t rslt = B_TRUE;

        mutex_enter(&cpu_idle_callb_mutex);
        switch (code) {
        case CB_CODE_CPR_RESUME:
                if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
                        /*
                         * Do not enable dispatcher hooks if disabled by user.
                         */
                        if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
                                break;

                        disp_enq_thread = cstate_wakeup;
                        idle_cpu = cpu_idle_adaptive;
                } else {
                        rslt = B_FALSE;
                }
                break;

        case CB_CODE_CPR_CHKPT:
                idle_cpu = non_deep_idle_cpu;
                disp_enq_thread = non_deep_idle_disp_enq_thread;
                (void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
                break;

        default:
                cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
                break;
        }
        mutex_exit(&cpu_idle_callb_mutex);
        return (rslt);
}

/*
 * handle _CST notification
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef __xpv
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        cpu_acpi_handle_t handle;
        struct machcpu *mcpu;
        cpuset_t dom_cpu_set;
        kmutex_t *pm_lock;
        int result = 0;
        processorid_t cpu_id;

        if (mach_state == NULL) {
                return;
        }

        ASSERT(mach_state->ms_cstate.cma_domain != NULL);
        dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
        pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

        /*
         * Do this for all the CPUs in the domain.
         */
        mutex_enter(pm_lock);
        do {
                CPUSET_FIND(dom_cpu_set, cpu_id);
                if (cpu_id == CPUSET_NOTINSET)
                        break;

                ASSERT(cpu_id >= 0 && cpu_id < NCPU);
                cp = cpu[cpu_id];
                mach_state = (cpupm_mach_state_t *)
                    cp->cpu_m.mcpu_pm_mach_state;
                if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
                        mutex_exit(pm_lock);
                        return;
                }
                handle = mach_state->ms_acpi_handle;
                ASSERT(handle != NULL);

                /*
                 * re-evaluate cstate object
                 */
                if (cpu_acpi_cache_cstate_data(handle) != 0) {
                        cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
                            " object Instance: %d", cpu_id);
                }
                mcpu = &(cp->cpu_m);
                mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
                if (mcpu->max_cstates > CPU_ACPI_C1) {
                        (void) cstate_timer_callback(
                            CST_EVENT_MULTIPLE_CSTATES);
                        disp_enq_thread = cstate_wakeup;
                        cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
                } else if (mcpu->max_cstates == CPU_ACPI_C1) {
                        disp_enq_thread = non_deep_idle_disp_enq_thread;
                        cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
                        (void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
                }

                CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
        } while (result < 0);
        mutex_exit(pm_lock);
#endif
}

/*
 * Handle a change in the number or type of available processor power states.
 */
void
cpuidle_manage_cstates(void *ctx)
{
        cpu_t *cp = ctx;
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        boolean_t is_ready;

        if (mach_state == NULL) {
                return;
        }

        /*
         * We currently refuse to power manage if the CPU is not ready to
         * take cross calls (cross calls fail silently if CPU is not ready
         * for it).
         *
         * Additionally, for x86 platforms we cannot power manage an
         * instance until it has been initialized.
         */
        is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
        if (!is_ready)
                return;

        cpuidle_cstate_instance(cp);
}