1 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* 26 * Copyright (c) 2009-2010, Intel Corporation. 27 * All rights reserved. 28 * Copyright 2018 Joyent, Inc. 29 * Copyright 2020 Oxide Computer Company 30 */ 31 32 #define PSMI_1_7 33 #include <sys/smp_impldefs.h> 34 #include <sys/psm.h> 35 #include <sys/psm_modctl.h> 36 #include <sys/pit.h> 37 #include <sys/cmn_err.h> 38 #include <sys/strlog.h> 39 #include <sys/clock.h> 40 #include <sys/debug.h> 41 #include <sys/rtc.h> 42 #include <sys/x86_archext.h> 43 #include <sys/cpupart.h> 44 #include <sys/cpuvar.h> 45 #include <sys/cpu_event.h> 46 #include <sys/cmt.h> 47 #include <sys/cpu.h> 48 #include <sys/disp.h> 49 #include <sys/archsystm.h> 50 #include <sys/machsystm.h> 51 #include <sys/sysmacros.h> 52 #include <sys/memlist.h> 53 #include <sys/param.h> 54 #include <sys/promif.h> 55 #include <sys/cpu_pm.h> 56 #if defined(__xpv) 57 #include <sys/hypervisor.h> 58 #endif 59 #include <sys/mach_intr.h> 60 #include <vm/hat_i86.h> 61 #include <sys/kdi_machimpl.h> 62 #include <sys/sdt.h> 63 #include <sys/hpet.h> 64 #include <sys/sunddi.h> 65 #include <sys/sunndi.h> 66 #include <sys/cpc_pcbe.h> 67 #include <sys/prom_debug.h> 68 69 70 #define OFFSETOF(s, m) (size_t)(&(((s *)0)->m)) 71 72 /* 73 * Local function prototypes 74 */ 75 static int mp_disable_intr(processorid_t cpun); 76 static void mp_enable_intr(processorid_t cpun); 77 static void mach_init(); 78 static void mach_picinit(); 79 static int machhztomhz(uint64_t cpu_freq_hz); 80 static uint64_t mach_getcpufreq(void); 81 static void mach_fixcpufreq(void); 82 static int mach_clkinit(int, int *); 83 static void mach_smpinit(void); 84 static int mach_softlvl_to_vect(int ipl); 85 static void mach_get_platform(int owner); 86 static void mach_construct_info(); 87 static int mach_translate_irq(dev_info_t *dip, int irqno); 88 static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *, 89 psm_intr_op_t, int *); 90 static void mach_notify_error(int level, char *errmsg); 91 static hrtime_t dummy_hrtime(void); 92 static void dummy_scalehrtime(hrtime_t *); 93 static uint64_t dummy_unscalehrtime(hrtime_t); 94 void cpu_idle(void); 95 static void cpu_wakeup(cpu_t *, int); 96 #ifndef __xpv 97 void cpu_idle_mwait(void); 98 static void cpu_wakeup_mwait(cpu_t *, int); 99 #endif 100 static int mach_cpu_create_devinfo(cpu_t *cp, dev_info_t **dipp); 101 102 /* 103 * External reference functions 104 */ 105 extern void return_instr(); 106 extern uint64_t freq_tsc(uint32_t *); 107 #if defined(__i386) 108 extern uint64_t freq_notsc(uint32_t *); 109 #endif 110 extern void pc_gethrestime(timestruc_t *); 111 extern int 
cpuid_get_coreid(cpu_t *); 112 extern int cpuid_get_chipid(cpu_t *); 113 114 /* 115 * PSM functions initialization 116 */ 117 void (*psm_shutdownf)(int, int) = (void (*)(int, int))return_instr; 118 void (*psm_preshutdownf)(int, int) = (void (*)(int, int))return_instr; 119 void (*psm_notifyf)(int) = (void (*)(int))return_instr; 120 void (*psm_set_idle_cpuf)(int) = (void (*)(int))return_instr; 121 void (*psm_unset_idle_cpuf)(int) = (void (*)(int))return_instr; 122 void (*psminitf)() = mach_init; 123 void (*picinitf)() = return_instr; 124 int (*clkinitf)(int, int *) = (int (*)(int, int *))return_instr; 125 int (*ap_mlsetup)() = (int (*)(void))return_instr; 126 void (*send_dirintf)() = return_instr; 127 void (*setspl)(int) = (void (*)(int))return_instr; 128 int (*addspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr; 129 int (*delspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr; 130 int (*get_pending_spl)(void) = (int (*)(void))return_instr; 131 int (*addintr)(void *, int, avfunc, char *, int, caddr_t, caddr_t, 132 uint64_t *, dev_info_t *) = NULL; 133 void (*remintr)(void *, int, avfunc, int) = NULL; 134 void (*kdisetsoftint)(int, struct av_softinfo *)= 135 (void (*)(int, struct av_softinfo *))return_instr; 136 void (*setsoftint)(int, struct av_softinfo *)= 137 (void (*)(int, struct av_softinfo *))return_instr; 138 int (*slvltovect)(int) = (int (*)(int))return_instr; 139 int (*setlvl)(int, int *) = (int (*)(int, int *))return_instr; 140 void (*setlvlx)(int, int) = (void (*)(int, int))return_instr; 141 int (*psm_disable_intr)(int) = mp_disable_intr; 142 void (*psm_enable_intr)(int) = mp_enable_intr; 143 hrtime_t (*gethrtimef)(void) = dummy_hrtime; 144 hrtime_t (*gethrtimeunscaledf)(void) = dummy_hrtime; 145 void (*scalehrtimef)(hrtime_t *) = dummy_scalehrtime; 146 uint64_t (*unscalehrtimef)(hrtime_t) = dummy_unscalehrtime; 147 int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq; 148 void (*gethrestimef)(timestruc_t *) = pc_gethrestime; 149 void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL; 150 int (*psm_get_clockirq)(int) = NULL; 151 int (*psm_get_ipivect)(int, int) = NULL; 152 uchar_t (*psm_get_ioapicid)(uchar_t) = NULL; 153 uint32_t (*psm_get_localapicid)(uint32_t) = NULL; 154 uchar_t (*psm_xlate_vector_by_irq)(uchar_t) = NULL; 155 int (*psm_get_pir_ipivect)(void) = NULL; 156 void (*psm_send_pir_ipi)(processorid_t) = NULL; 157 void (*psm_cmci_setup)(processorid_t, boolean_t) = NULL; 158 159 int (*psm_clkinit)(int) = NULL; 160 void (*psm_timer_reprogram)(hrtime_t) = NULL; 161 void (*psm_timer_enable)(void) = NULL; 162 void (*psm_timer_disable)(void) = NULL; 163 void (*psm_post_cyclic_setup)(void *arg) = NULL; 164 int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t, 165 int *) = mach_intr_ops; 166 int (*psm_state)(psm_state_request_t *) = (int (*)(psm_state_request_t *)) 167 return_instr; 168 169 void (*notify_error)(int, char *) = (void (*)(int, char *))return_instr; 170 void (*hrtime_tick)(void) = return_instr; 171 172 int (*psm_cpu_create_devinfo)(cpu_t *, dev_info_t **) = mach_cpu_create_devinfo; 173 int (*psm_cpu_get_devinfo)(cpu_t *, dev_info_t **) = NULL; 174 175 /* global IRM pool for APIX (PSM) module */ 176 ddi_irm_pool_t *apix_irm_pool_p = NULL; 177 178 /* 179 * True if the generic TSC code is our source of hrtime, rather than whatever 180 * the PSM can provide. 
181 */ 182 #ifdef __xpv 183 int tsc_gethrtime_enable = 0; 184 #else 185 int tsc_gethrtime_enable = 1; 186 #endif 187 int tsc_gethrtime_initted = 0; 188 189 /* 190 * True if the hrtime implementation is "hires"; namely, better than microdata. 191 */ 192 int gethrtime_hires = 0; 193 194 /* 195 * Local Static Data 196 */ 197 static struct psm_ops mach_ops; 198 static struct psm_ops *mach_set[4] = {&mach_ops, NULL, NULL, NULL}; 199 static ushort_t mach_ver[4] = {0, 0, 0, 0}; 200 201 /* 202 * virtualization support for psm 203 */ 204 void *psm_vt_ops = NULL; 205 /* 206 * If non-zero, idle cpus will become "halted" when there's 207 * no work to do. 208 */ 209 int idle_cpu_use_hlt = 1; 210 211 #ifndef __xpv 212 /* 213 * If non-zero, idle cpus will use mwait if available to halt instead of hlt. 214 */ 215 int idle_cpu_prefer_mwait = 1; 216 /* 217 * Set to 0 to avoid MONITOR+CLFLUSH assertion. 218 */ 219 int idle_cpu_assert_cflush_monitor = 1; 220 221 /* 222 * If non-zero, idle cpus will not use power saving Deep C-States idle loop. 223 */ 224 int idle_cpu_no_deep_c = 0; 225 /* 226 * Non-power saving idle loop and wakeup pointers. 227 * Allows user to toggle Deep Idle power saving feature on/off. 228 */ 229 void (*non_deep_idle_cpu)() = cpu_idle; 230 void (*non_deep_idle_disp_enq_thread)(cpu_t *, int); 231 232 /* 233 * Object for the kernel to access the HPET. 234 */ 235 hpet_t hpet; 236 237 #endif /* ifndef __xpv */ 238 239 uint_t cp_haltset_fanout = 0; 240 241 /*ARGSUSED*/ 242 int 243 pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw) 244 { 245 switch (hw) { 246 case PGHW_IPIPE: 247 if (is_x86_feature(x86_featureset, X86FSET_HTT)) { 248 /* 249 * Hyper-threading is SMT 250 */ 251 return (1); 252 } else { 253 return (0); 254 } 255 case PGHW_FPU: 256 if (cpuid_get_cores_per_compunit(cp) > 1) 257 return (1); 258 else 259 return (0); 260 case PGHW_PROCNODE: 261 if (cpuid_get_procnodes_per_pkg(cp) > 1) 262 return (1); 263 else 264 return (0); 265 case PGHW_CHIP: 266 if (is_x86_feature(x86_featureset, X86FSET_CMP) || 267 is_x86_feature(x86_featureset, X86FSET_HTT)) 268 return (1); 269 else 270 return (0); 271 case PGHW_CACHE: 272 if (cpuid_get_ncpu_sharing_last_cache(cp) > 1) 273 return (1); 274 else 275 return (0); 276 case PGHW_POW_ACTIVE: 277 if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1) 278 return (1); 279 else 280 return (0); 281 case PGHW_POW_IDLE: 282 if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1) 283 return (1); 284 else 285 return (0); 286 default: 287 return (0); 288 } 289 } 290 291 /* 292 * Compare two CPUs and see if they have a pghw_type_t sharing relationship 293 * If pghw_type_t is an unsupported hardware type, then return -1 294 */ 295 int 296 pg_plat_cpus_share(cpu_t *cpu_a, cpu_t *cpu_b, pghw_type_t hw) 297 { 298 id_t pgp_a, pgp_b; 299 300 pgp_a = pg_plat_hw_instance_id(cpu_a, hw); 301 pgp_b = pg_plat_hw_instance_id(cpu_b, hw); 302 303 if (pgp_a == -1 || pgp_b == -1) 304 return (-1); 305 306 return (pgp_a == pgp_b); 307 } 308 309 /* 310 * Return a physical instance identifier for known hardware sharing 311 * relationships 312 */ 313 id_t 314 pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw) 315 { 316 switch (hw) { 317 case PGHW_IPIPE: 318 return (cpuid_get_coreid(cpu)); 319 case PGHW_CACHE: 320 return (cpuid_get_last_lvl_cacheid(cpu)); 321 case PGHW_FPU: 322 return (cpuid_get_compunitid(cpu)); 323 case PGHW_PROCNODE: 324 return (cpuid_get_procnodeid(cpu)); 325 case PGHW_CHIP: 326 return (cpuid_get_chipid(cpu)); 327 case PGHW_POW_ACTIVE: 328 return (cpupm_domain_id(cpu, 
CPUPM_DTYPE_ACTIVE)); 329 case PGHW_POW_IDLE: 330 return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE)); 331 default: 332 return (-1); 333 } 334 } 335 336 /* 337 * Express preference for optimizing for sharing relationship 338 * hw1 vs hw2 339 */ 340 pghw_type_t 341 pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2) 342 { 343 int i, rank1, rank2; 344 345 static pghw_type_t hw_hier[] = { 346 PGHW_IPIPE, 347 PGHW_CACHE, 348 PGHW_FPU, 349 PGHW_PROCNODE, 350 PGHW_CHIP, 351 PGHW_POW_IDLE, 352 PGHW_POW_ACTIVE, 353 PGHW_NUM_COMPONENTS 354 }; 355 356 rank1 = 0; 357 rank2 = 0; 358 359 for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) { 360 if (hw_hier[i] == hw1) 361 rank1 = i; 362 if (hw_hier[i] == hw2) 363 rank2 = i; 364 } 365 366 if (rank1 > rank2) 367 return (hw1); 368 else 369 return (hw2); 370 } 371 372 /* 373 * Override the default CMT dispatcher policy for the specified 374 * hardware sharing relationship 375 */ 376 pg_cmt_policy_t 377 pg_plat_cmt_policy(pghw_type_t hw) 378 { 379 /* 380 * For shared caches, also load balance across them to 381 * maximize aggregate cache capacity 382 * 383 * On AMD family 0x15 CPUs, cores come in pairs called 384 * compute units, sharing the FPU and the I$ and L2 385 * caches. Use balancing and cache affinity. 386 */ 387 switch (hw) { 388 case PGHW_FPU: 389 case PGHW_CACHE: 390 return (CMT_BALANCE|CMT_AFFINITY); 391 default: 392 return (CMT_NO_POLICY); 393 } 394 } 395 396 id_t 397 pg_plat_get_core_id(cpu_t *cpu) 398 { 399 return ((id_t)cpuid_get_coreid(cpu)); 400 } 401 402 void 403 cmp_set_nosteal_interval(void) 404 { 405 /* Set the nosteal interval (used by disp_getbest()) to 100us */ 406 nosteal_nsec = 100000UL; 407 } 408 409 /* 410 * Routine to ensure initial callers to hrtime gets 0 as return 411 */ 412 static hrtime_t 413 dummy_hrtime(void) 414 { 415 return (0); 416 } 417 418 /* ARGSUSED */ 419 static void 420 dummy_scalehrtime(hrtime_t *ticks) 421 {} 422 423 static uint64_t 424 dummy_unscalehrtime(hrtime_t nsecs) 425 { 426 return ((uint64_t)nsecs); 427 } 428 429 /* 430 * Supports Deep C-State power saving idle loop. 431 */ 432 void 433 cpu_idle_adaptive(void) 434 { 435 (*CPU->cpu_m.mcpu_idle_cpu)(); 436 } 437 438 /* 439 * Function called by CPU idle notification framework to check whether CPU 440 * has been awakened. It will be called with interrupt disabled. 441 * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle 442 * notification framework. 443 */ 444 /*ARGSUSED*/ 445 static void 446 cpu_idle_check_wakeup(void *arg) 447 { 448 /* 449 * Toggle interrupt flag to detect pending interrupts. 450 * If interrupt happened, do_interrupt() will notify CPU idle 451 * notification framework so no need to call cpu_idle_exit() here. 452 */ 453 sti(); 454 SMT_PAUSE(); 455 cli(); 456 } 457 458 /* 459 * Idle the present CPU until wakened via an interrupt 460 */ 461 void 462 cpu_idle(void) 463 { 464 cpu_t *cpup = CPU; 465 processorid_t cpu_sid = cpup->cpu_seqid; 466 cpupart_t *cp = cpup->cpu_part; 467 int hset_update = 1; 468 469 /* 470 * If this CPU is online, and there's multiple CPUs 471 * in the system, then we should notate our halting 472 * by adding ourselves to the partition's halted CPU 473 * bitmap. This allows other CPUs to find/awaken us when 474 * work becomes available. 475 */ 476 if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1) 477 hset_update = 0; 478 479 /* 480 * Add ourselves to the partition's halted CPUs bitmap 481 * and set our HALTED flag, if necessary. 
         *
         * When a thread becomes runnable, it is placed on the queue
         * and then the halted CPU bitmap is checked to determine who
         * (if anyone) should be awakened. We therefore need to first
         * add ourselves to the bitmap, and then check if there
         * is any work available. The order is important to prevent a race
         * that can lead to work languishing on a run queue somewhere while
         * this CPU remains halted.
         *
         * Either the producing CPU will see we're halted and will awaken us,
         * or this CPU will see the work available in disp_anywork().
         *
         * Note that memory barriers after updating the HALTED flag
         * are not necessary since an atomic operation (updating the bitset)
         * immediately follows. On x86 the atomic operation acts as a
         * memory barrier for the update of cpu_disp_flags.
         */
        if (hset_update) {
                cpup->cpu_disp_flags |= CPU_DISP_HALTED;
                bitset_atomic_add(&cp->cp_haltset, cpu_sid);
        }

        /*
         * Check to make sure there's really nothing to do.
         * Work destined for this CPU may become available after
         * this check. We'll be notified through the clearing of our
         * bit in the halted CPU bitmap, and a poke.
         */
        if (disp_anywork()) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * We're on our way to being halted.
         *
         * Disable interrupts now, so that we'll awaken immediately
         * after halting if someone tries to poke us between now and
         * the time we actually halt.
         *
         * We check for the presence of our bit after disabling interrupts.
         * If it's cleared, we'll return. If the bit is cleared after
         * we check then the poke will pop us out of the halted state.
         *
         * This means that the ordering of the poke and the clearing
         * of the bit by cpu_wakeup is important.
         * cpu_wakeup() must clear, then poke.
         * cpu_idle() must disable interrupts, then check for the bit.
         */
        cli();

        if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                sti();
                return;
        }

        /*
         * The check for anything locally runnable is here for performance
         * and isn't needed for correctness. disp_nrunnable ought to be
         * in our cache still, so it's inexpensive to check, and if there
         * is anything runnable we won't have to wait for the poke.
         */
        if (cpup->cpu_disp->disp_nrunnable != 0) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                sti();
                return;
        }

        if (cpu_idle_enter(IDLE_STATE_C1, 0,
            cpu_idle_check_wakeup, NULL) == 0) {
                mach_cpu_idle();
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        }

        /*
         * We're no longer halted
         */
        if (hset_update) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
        }
}


/*
 * If "cpu" is halted, then wake it up clearing its halted bit in advance.
 * Otherwise, see if other CPUs in the cpu partition are halted and need to
 * be woken up so that they can steal the thread we placed on this CPU.
 * This function is only used on MP systems.
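 *
 * Note the ordering requirement spelled out in cpu_idle() above: the
 * haltset bit must be cleared before the poke is sent, because cpu_idle()
 * disables interrupts and then re-checks its bit before halting. Clearing
 * first and poking second is what keeps a wakeup from being lost in that
 * window.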
578 */ 579 static void 580 cpu_wakeup(cpu_t *cpu, int bound) 581 { 582 uint_t cpu_found; 583 processorid_t cpu_sid; 584 cpupart_t *cp; 585 586 cp = cpu->cpu_part; 587 cpu_sid = cpu->cpu_seqid; 588 if (bitset_in_set(&cp->cp_haltset, cpu_sid)) { 589 /* 590 * Clear the halted bit for that CPU since it will be 591 * poked in a moment. 592 */ 593 bitset_atomic_del(&cp->cp_haltset, cpu_sid); 594 /* 595 * We may find the current CPU present in the halted cpuset 596 * if we're in the context of an interrupt that occurred 597 * before we had a chance to clear our bit in cpu_idle(). 598 * Poking ourself is obviously unnecessary, since if 599 * we're here, we're not halted. 600 */ 601 if (cpu != CPU) 602 poke_cpu(cpu->cpu_id); 603 return; 604 } else { 605 /* 606 * This cpu isn't halted, but it's idle or undergoing a 607 * context switch. No need to awaken anyone else. 608 */ 609 if (cpu->cpu_thread == cpu->cpu_idle_thread || 610 cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL) 611 return; 612 } 613 614 /* 615 * No need to wake up other CPUs if this is for a bound thread. 616 */ 617 if (bound) 618 return; 619 620 /* 621 * The CPU specified for wakeup isn't currently halted, so check 622 * to see if there are any other halted CPUs in the partition, 623 * and if there are then awaken one. 624 */ 625 do { 626 cpu_found = bitset_find(&cp->cp_haltset); 627 if (cpu_found == (uint_t)-1) 628 return; 629 } while (bitset_atomic_test_and_del(&cp->cp_haltset, cpu_found) < 0); 630 631 if (cpu_found != CPU->cpu_seqid) { 632 poke_cpu(cpu_seq[cpu_found]->cpu_id); 633 } 634 } 635 636 #ifndef __xpv 637 /* 638 * Function called by CPU idle notification framework to check whether CPU 639 * has been awakened. It will be called with interrupt disabled. 640 * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle 641 * notification framework. 642 */ 643 static void 644 cpu_idle_mwait_check_wakeup(void *arg) 645 { 646 volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg; 647 648 ASSERT(arg != NULL); 649 if (*mcpu_mwait != MWAIT_HALTED) { 650 /* 651 * CPU has been awakened, notify CPU idle notification system. 652 */ 653 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE); 654 } else { 655 /* 656 * Toggle interrupt flag to detect pending interrupts. 657 * If interrupt happened, do_interrupt() will notify CPU idle 658 * notification framework so no need to call cpu_idle_exit() 659 * here. 660 */ 661 sti(); 662 SMT_PAUSE(); 663 cli(); 664 } 665 } 666 667 /* 668 * Idle the present CPU until awakened via touching its monitored line 669 */ 670 void 671 cpu_idle_mwait(void) 672 { 673 volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait; 674 cpu_t *cpup = CPU; 675 processorid_t cpu_sid = cpup->cpu_seqid; 676 cpupart_t *cp = cpup->cpu_part; 677 int hset_update = 1; 678 679 /* 680 * Set our mcpu_mwait here, so we can tell if anyone tries to 681 * wake us between now and when we call mwait. No other cpu will 682 * attempt to set our mcpu_mwait until we add ourself to the halted 683 * CPU bitmap. 684 */ 685 *mcpu_mwait = MWAIT_HALTED; 686 687 /* 688 * If this CPU is online, and there's multiple CPUs 689 * in the system, then we should note our halting 690 * by adding ourselves to the partition's halted CPU 691 * bitmap. This allows other CPUs to find/awaken us when 692 * work becomes available. 693 */ 694 if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1) 695 hset_update = 0; 696 697 /* 698 * Add ourselves to the partition's halted CPUs bitmap 699 * and set our HALTED flag, if necessary. 
         *
         * When a thread becomes runnable, it is placed on the queue
         * and then the halted CPU bitmap is checked to determine who
         * (if anyone) should be awakened. We therefore need to first
         * add ourselves to the bitmap, and then check if there
         * is any work available.
         *
         * Note that memory barriers after updating the HALTED flag
         * are not necessary since an atomic operation (updating the bitmap)
         * immediately follows. On x86 the atomic operation acts as a
         * memory barrier for the update of cpu_disp_flags.
         */
        if (hset_update) {
                cpup->cpu_disp_flags |= CPU_DISP_HALTED;
                bitset_atomic_add(&cp->cp_haltset, cpu_sid);
        }

        /*
         * Check to make sure there's really nothing to do.
         * Work destined for this CPU may become available after
         * this check. We'll be notified through the clearing of our
         * bit in the halted CPU bitmap, and a write to our mcpu_mwait.
         *
         * disp_anywork() checks disp_nrunnable, so we do not have to later.
         */
        if (disp_anywork()) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * We're on our way to being halted.
         * To avoid a lost wakeup, arm the monitor before checking if another
         * cpu wrote to mcpu_mwait to wake us up.
         */
        i86_monitor(mcpu_mwait, 0, 0);
        if (*mcpu_mwait == MWAIT_HALTED) {
                if (cpu_idle_enter(IDLE_STATE_C1, 0,
                    cpu_idle_mwait_check_wakeup, (void *)mcpu_mwait) == 0) {
                        if (*mcpu_mwait == MWAIT_HALTED) {
                                i86_mwait(0, 0);
                        }
                        cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                }
        }

        /*
         * We're no longer halted
         */
        if (hset_update) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
        }
}

/*
 * If "cpu" is halted in mwait, then wake it up clearing its halted bit in
 * advance. Otherwise, see if other CPUs in the cpu partition are halted and
 * need to be woken up so that they can steal the thread we placed on this CPU.
 * This function is only used on MP systems.
 */
static void
cpu_wakeup_mwait(cpu_t *cp, int bound)
{
        cpupart_t *cpu_part;
        uint_t cpu_found;
        processorid_t cpu_sid;

        cpu_part = cp->cpu_part;
        cpu_sid = cp->cpu_seqid;

        /*
         * Clear the halted bit for that CPU since it will be woken up
         * in a moment.
         */
        if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
                /*
                 * Clear the halted bit for that CPU since it will be
                 * poked in a moment.
                 */
                bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
                /*
                 * We may find the current CPU present in the halted cpuset
                 * if we're in the context of an interrupt that occurred
                 * before we had a chance to clear our bit in cpu_idle().
                 * Waking ourself is obviously unnecessary, since if
                 * we're here, we're not halted.
                 *
                 * monitor/mwait wakeup via writing to our cache line is
                 * harmless and less expensive than always checking if we
                 * are waking ourself which is an uncommon case.
                 */
                MWAIT_WAKEUP(cp);       /* write to monitored line */
                return;
        } else {
                /*
                 * This cpu isn't halted, but it's idle or undergoing a
                 * context switch. No need to awaken anyone else.
                 */
                if (cp->cpu_thread == cp->cpu_idle_thread ||
                    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
                        return;
        }

        /*
         * No need to wake up other CPUs if the thread we just enqueued
         * is bound.
810 */ 811 if (bound || ncpus == 1) 812 return; 813 814 /* 815 * See if there's any other halted CPUs. If there are, then 816 * select one, and awaken it. 817 * It's possible that after we find a CPU, somebody else 818 * will awaken it before we get the chance. 819 * In that case, look again. 820 */ 821 do { 822 cpu_found = bitset_find(&cpu_part->cp_haltset); 823 if (cpu_found == (uint_t)-1) 824 return; 825 } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset, 826 cpu_found) < 0); 827 828 /* 829 * Do not check if cpu_found is ourself as monitor/mwait 830 * wakeup is cheap. 831 */ 832 MWAIT_WAKEUP(cpu_seq[cpu_found]); /* write to monitored line */ 833 } 834 835 #endif 836 837 void (*cpu_pause_handler)(volatile char *) = NULL; 838 839 static int 840 mp_disable_intr(int cpun) 841 { 842 /* 843 * switch to the offline cpu 844 */ 845 affinity_set(cpun); 846 /* 847 * raise ipl to just below cross call 848 */ 849 splx(XC_SYS_PIL - 1); 850 /* 851 * set base spl to prevent the next swtch to idle from 852 * lowering back to ipl 0 853 */ 854 CPU->cpu_intr_actv |= (1 << (XC_SYS_PIL - 1)); 855 set_base_spl(); 856 affinity_clear(); 857 return (DDI_SUCCESS); 858 } 859 860 static void 861 mp_enable_intr(int cpun) 862 { 863 /* 864 * switch to the online cpu 865 */ 866 affinity_set(cpun); 867 /* 868 * clear the interrupt active mask 869 */ 870 CPU->cpu_intr_actv &= ~(1 << (XC_SYS_PIL - 1)); 871 set_base_spl(); 872 (void) spl0(); 873 affinity_clear(); 874 } 875 876 static void 877 mach_get_platform(int owner) 878 { 879 void **srv_opsp; 880 void **clt_opsp; 881 int i; 882 int total_ops; 883 884 /* fix up psm ops */ 885 srv_opsp = (void **)mach_set[0]; 886 clt_opsp = (void **)mach_set[owner]; 887 if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01) 888 total_ops = sizeof (struct psm_ops_ver01) / 889 sizeof (void (*)(void)); 890 else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_1) 891 /* no psm_notify_func */ 892 total_ops = OFFSETOF(struct psm_ops, psm_notify_func) / 893 sizeof (void (*)(void)); 894 else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_2) 895 /* no psm_timer funcs */ 896 total_ops = OFFSETOF(struct psm_ops, psm_timer_reprogram) / 897 sizeof (void (*)(void)); 898 else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_3) 899 /* no psm_preshutdown function */ 900 total_ops = OFFSETOF(struct psm_ops, psm_preshutdown) / 901 sizeof (void (*)(void)); 902 else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_4) 903 /* no psm_intr_ops function */ 904 total_ops = OFFSETOF(struct psm_ops, psm_intr_ops) / 905 sizeof (void (*)(void)); 906 else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_5) 907 /* no psm_state function */ 908 total_ops = OFFSETOF(struct psm_ops, psm_state) / 909 sizeof (void (*)(void)); 910 else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_6) 911 /* no psm_cpu_ops function */ 912 total_ops = OFFSETOF(struct psm_ops, psm_cpu_ops) / 913 sizeof (void (*)(void)); 914 else 915 total_ops = sizeof (struct psm_ops) / sizeof (void (*)(void)); 916 917 /* 918 * Save the version of the PSM module, in case we need to 919 * behave differently based on version. 
         */
        mach_ver[0] = mach_ver[owner];

        for (i = 0; i < total_ops; i++)
                if (clt_opsp[i] != NULL)
                        srv_opsp[i] = clt_opsp[i];
}

static void
mach_construct_info()
{
        struct psm_sw *swp;
        int mach_cnt[PSM_OWN_OVERRIDE+1] = {0};
        int conflict_owner = 0;

        if (psmsw->psw_forw == psmsw)
                panic("No valid PSM modules found");
        mutex_enter(&psmsw_lock);
        for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
                if (!(swp->psw_flag & PSM_MOD_IDENTIFY))
                        continue;
                mach_set[swp->psw_infop->p_owner] = swp->psw_infop->p_ops;
                mach_ver[swp->psw_infop->p_owner] = swp->psw_infop->p_version;
                mach_cnt[swp->psw_infop->p_owner]++;
        }
        mutex_exit(&psmsw_lock);

        mach_get_platform(PSM_OWN_SYS_DEFAULT);

        /* check to see if there are any conflicts */
        if (mach_cnt[PSM_OWN_EXCLUSIVE] > 1)
                conflict_owner = PSM_OWN_EXCLUSIVE;
        if (mach_cnt[PSM_OWN_OVERRIDE] > 1)
                conflict_owner = PSM_OWN_OVERRIDE;
        if (conflict_owner) {
                /* remove all psm modules except uppc */
                cmn_err(CE_WARN,
                    "Conflicts detected on the following PSM modules:");
                mutex_enter(&psmsw_lock);
                for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
                        if (swp->psw_infop->p_owner == conflict_owner)
                                cmn_err(CE_WARN, "%s ",
                                    swp->psw_infop->p_mach_idstring);
                }
                mutex_exit(&psmsw_lock);
                cmn_err(CE_WARN,
                    "Setting the system back to SINGLE processor mode!");
                cmn_err(CE_WARN,
                    "Please edit /etc/mach to remove the invalid PSM module.");
                return;
        }

        if (mach_set[PSM_OWN_EXCLUSIVE])
                mach_get_platform(PSM_OWN_EXCLUSIVE);

        if (mach_set[PSM_OWN_OVERRIDE])
                mach_get_platform(PSM_OWN_OVERRIDE);
}

static void
mach_init()
{
        struct psm_ops *pops;

        PRM_POINT("mach_construct_info()");
        mach_construct_info();

        pops = mach_set[0];

        /* register the interrupt and clock initialization routines */
        picinitf = mach_picinit;
        clkinitf = mach_clkinit;
        psm_get_clockirq = pops->psm_get_clockirq;

        /* register the interrupt setup code */
        slvltovect = mach_softlvl_to_vect;
        addspl = pops->psm_addspl;
        delspl = pops->psm_delspl;

        if (pops->psm_translate_irq)
                psm_translate_irq = pops->psm_translate_irq;
        if (pops->psm_intr_ops)
                psm_intr_ops = pops->psm_intr_ops;

#if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4)
        /*
         * Time-of-day functionality now handled in TOD modules.
         * (Warn about PSM modules that think that we're going to use
         * their ops vectors.)
         */
        if (pops->psm_tod_get)
                cmn_err(CE_WARN, "obsolete psm_tod_get op %p",
                    (void *)pops->psm_tod_get);

        if (pops->psm_tod_set)
                cmn_err(CE_WARN, "obsolete psm_tod_set op %p",
                    (void *)pops->psm_tod_set);
#endif

        if (pops->psm_notify_error) {
                psm_notify_error = mach_notify_error;
                notify_error = pops->psm_notify_error;
        }

        PRM_POINT("psm_softinit()");
        (*pops->psm_softinit)();

        /*
         * Initialize the dispatcher's function hooks to enable CPU halting
         * when idle. Set both the deep-idle and non-deep-idle hooks.
         *
         * Assume we can use power saving deep-idle loop cpu_idle_adaptive.
         * Platform deep-idle driver will reset our idle loop to
         * non_deep_idle_cpu if power saving deep-idle feature is not available.
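         *
         * Roughly, the selection below resolves to: a spinning idle loop if
         * idle_cpu_use_hlt is clear, cpu_idle() (hlt) otherwise, or
         * cpu_idle_mwait() when the CPU supports MWAIT and
         * idle_cpu_prefer_mwait is set. cpu_idle_adaptive() simply calls
         * through mcpu_idle_cpu, which is what allows a deep C-state driver
         * to substitute its own loop later.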
1034 * 1035 * Do not use monitor/mwait if idle_cpu_use_hlt is not set(spin idle) 1036 * or idle_cpu_prefer_mwait is not set. 1037 * Allocate monitor/mwait buffer for cpu0. 1038 */ 1039 #ifndef __xpv 1040 non_deep_idle_disp_enq_thread = disp_enq_thread; 1041 #endif 1042 PRM_DEBUG(idle_cpu_use_hlt); 1043 if (idle_cpu_use_hlt) { 1044 idle_cpu = cpu_idle_adaptive; 1045 CPU->cpu_m.mcpu_idle_cpu = cpu_idle; 1046 #ifndef __xpv 1047 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) && 1048 idle_cpu_prefer_mwait) { 1049 CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); 1050 /* 1051 * Protect ourself from insane mwait size. 1052 */ 1053 if (CPU->cpu_m.mcpu_mwait == NULL) { 1054 #ifdef DEBUG 1055 cmn_err(CE_NOTE, "Using hlt idle. Cannot " 1056 "handle cpu 0 mwait size."); 1057 #endif 1058 idle_cpu_prefer_mwait = 0; 1059 CPU->cpu_m.mcpu_idle_cpu = cpu_idle; 1060 } else { 1061 CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; 1062 } 1063 } else { 1064 CPU->cpu_m.mcpu_idle_cpu = cpu_idle; 1065 } 1066 non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu; 1067 1068 /* 1069 * Disable power saving deep idle loop? 1070 */ 1071 if (idle_cpu_no_deep_c) { 1072 idle_cpu = non_deep_idle_cpu; 1073 } 1074 #endif 1075 } 1076 1077 PRM_POINT("mach_smpinit()"); 1078 mach_smpinit(); 1079 } 1080 1081 static void 1082 mach_smpinit(void) 1083 { 1084 struct psm_ops *pops; 1085 processorid_t cpu_id; 1086 int cnt; 1087 cpuset_t cpumask; 1088 1089 pops = mach_set[0]; 1090 CPUSET_ZERO(cpumask); 1091 1092 cpu_id = -1; 1093 cpu_id = (*pops->psm_get_next_processorid)(cpu_id); 1094 /* 1095 * Only add boot_ncpus CPUs to mp_cpus. Other CPUs will be handled 1096 * by CPU DR driver at runtime. 1097 */ 1098 for (cnt = 0; cpu_id != -1 && cnt < boot_ncpus; cnt++) { 1099 CPUSET_ADD(cpumask, cpu_id); 1100 cpu_id = (*pops->psm_get_next_processorid)(cpu_id); 1101 } 1102 1103 mp_cpus = cpumask; 1104 1105 /* MP related routines */ 1106 ap_mlsetup = pops->psm_post_cpu_start; 1107 send_dirintf = pops->psm_send_ipi; 1108 1109 /* optional MP related routines */ 1110 if (pops->psm_shutdown) 1111 psm_shutdownf = pops->psm_shutdown; 1112 if (pops->psm_preshutdown) 1113 psm_preshutdownf = pops->psm_preshutdown; 1114 if (pops->psm_notify_func) 1115 psm_notifyf = pops->psm_notify_func; 1116 if (pops->psm_set_idlecpu) 1117 psm_set_idle_cpuf = pops->psm_set_idlecpu; 1118 if (pops->psm_unset_idlecpu) 1119 psm_unset_idle_cpuf = pops->psm_unset_idlecpu; 1120 1121 psm_clkinit = pops->psm_clkinit; 1122 1123 if (pops->psm_timer_reprogram) 1124 psm_timer_reprogram = pops->psm_timer_reprogram; 1125 1126 if (pops->psm_timer_enable) 1127 psm_timer_enable = pops->psm_timer_enable; 1128 1129 if (pops->psm_timer_disable) 1130 psm_timer_disable = pops->psm_timer_disable; 1131 1132 if (pops->psm_post_cyclic_setup) 1133 psm_post_cyclic_setup = pops->psm_post_cyclic_setup; 1134 1135 if (pops->psm_state) 1136 psm_state = pops->psm_state; 1137 1138 /* 1139 * Set these vectors here so they can be used by Suspend/Resume 1140 * on UP machines. 1141 */ 1142 if (pops->psm_disable_intr) 1143 psm_disable_intr = pops->psm_disable_intr; 1144 if (pops->psm_enable_intr) 1145 psm_enable_intr = pops->psm_enable_intr; 1146 1147 /* 1148 * Set this vector so it can be used by vmbus (for Hyper-V) 1149 * Need this even for single-CPU systems. This works for 1150 * "pcplusmp" and "apix" platforms, but not "uppc" (because 1151 * "Uni-processor PC" does not provide a _get_ipivect). 
1152 */ 1153 psm_get_ipivect = pops->psm_get_ipivect; 1154 1155 /* check for multiple CPUs */ 1156 if (cnt < 2 && plat_dr_support_cpu() == B_FALSE) 1157 return; 1158 1159 /* check for MP platforms */ 1160 if (pops->psm_cpu_start == NULL) 1161 return; 1162 1163 /* 1164 * Set the dispatcher hook to enable cpu "wake up" 1165 * when a thread becomes runnable. 1166 */ 1167 if (idle_cpu_use_hlt) { 1168 disp_enq_thread = cpu_wakeup; 1169 #ifndef __xpv 1170 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) && 1171 idle_cpu_prefer_mwait) 1172 disp_enq_thread = cpu_wakeup_mwait; 1173 non_deep_idle_disp_enq_thread = disp_enq_thread; 1174 #endif 1175 } 1176 1177 psm_get_pir_ipivect = pops->psm_get_pir_ipivect; 1178 psm_send_pir_ipi = pops->psm_send_pir_ipi; 1179 psm_cmci_setup = pops->psm_cmci_setup; 1180 1181 1182 (void) add_avintr((void *)NULL, XC_HI_PIL, xc_serv, "xc_intr", 1183 (*pops->psm_get_ipivect)(XC_HI_PIL, PSM_INTR_IPI_HI), 1184 NULL, NULL, NULL, NULL); 1185 1186 (void) (*pops->psm_get_ipivect)(XC_CPUPOKE_PIL, PSM_INTR_POKE); 1187 } 1188 1189 static void 1190 mach_picinit() 1191 { 1192 struct psm_ops *pops; 1193 1194 pops = mach_set[0]; 1195 1196 /* register the interrupt handlers */ 1197 setlvl = pops->psm_intr_enter; 1198 setlvlx = pops->psm_intr_exit; 1199 1200 /* initialize the interrupt hardware */ 1201 (*pops->psm_picinit)(); 1202 1203 /* set interrupt mask for current ipl */ 1204 setspl = pops->psm_setspl; 1205 cli(); 1206 setspl(CPU->cpu_pri); 1207 } 1208 1209 uint_t cpu_freq; /* MHz */ 1210 uint64_t cpu_freq_hz; /* measured (in hertz) */ 1211 1212 #define MEGA_HZ 1000000 1213 1214 #ifdef __xpv 1215 1216 int xpv_cpufreq_workaround = 1; 1217 int xpv_cpufreq_verbose = 0; 1218 1219 #else /* __xpv */ 1220 1221 static uint64_t 1222 mach_calchz(uint32_t pit_counter, uint64_t *processor_clks) 1223 { 1224 uint64_t cpu_hz; 1225 1226 if ((pit_counter == 0) || (*processor_clks == 0) || 1227 (*processor_clks > (((uint64_t)-1) / PIT_HZ))) 1228 return (0); 1229 1230 cpu_hz = ((uint64_t)PIT_HZ * *processor_clks) / pit_counter; 1231 1232 return (cpu_hz); 1233 } 1234 1235 #endif /* __xpv */ 1236 1237 static uint64_t 1238 mach_getcpufreq(void) 1239 { 1240 #if defined(__xpv) 1241 vcpu_time_info_t *vti = &CPU->cpu_m.mcpu_vcpu_info->time; 1242 uint64_t cpu_hz; 1243 1244 /* 1245 * During dom0 bringup, it was noted that on at least one older 1246 * Intel HT machine, the hypervisor initially gives a tsc_to_system_mul 1247 * value that is quite wrong (the 3.06GHz clock was reported 1248 * as 4.77GHz) 1249 * 1250 * The curious thing is, that if you stop the kernel at entry, 1251 * breakpoint here and inspect the value with kmdb, the value 1252 * is correct - but if you don't stop and simply enable the 1253 * printf statement (below), you can see the bad value printed 1254 * here. Almost as if something kmdb did caused the hypervisor to 1255 * figure it out correctly. And, note that the hypervisor 1256 * eventually -does- figure it out correctly ... if you look at 1257 * the field later in the life of dom0, it is correct. 1258 * 1259 * For now, on dom0, we employ a slightly cheesy workaround of 1260 * using the DOM0_PHYSINFO hypercall. 
1261 */ 1262 if (DOMAIN_IS_INITDOMAIN(xen_info) && xpv_cpufreq_workaround) { 1263 cpu_hz = 1000 * xpv_cpu_khz(); 1264 } else { 1265 cpu_hz = (UINT64_C(1000000000) << 32) / vti->tsc_to_system_mul; 1266 1267 if (vti->tsc_shift < 0) 1268 cpu_hz <<= -vti->tsc_shift; 1269 else 1270 cpu_hz >>= vti->tsc_shift; 1271 } 1272 1273 if (xpv_cpufreq_verbose) 1274 printf("mach_getcpufreq: system_mul 0x%x, shift %d, " 1275 "cpu_hz %" PRId64 "Hz\n", 1276 vti->tsc_to_system_mul, vti->tsc_shift, cpu_hz); 1277 1278 return (cpu_hz); 1279 #else /* __xpv */ 1280 uint32_t pit_counter; 1281 uint64_t processor_clks; 1282 1283 if (is_x86_feature(x86_featureset, X86FSET_TSC)) { 1284 /* 1285 * We have a TSC. freq_tsc() knows how to measure the number 1286 * of clock cycles sampled against the PIT. 1287 */ 1288 ulong_t flags = clear_int_flag(); 1289 processor_clks = freq_tsc(&pit_counter); 1290 restore_int_flag(flags); 1291 return (mach_calchz(pit_counter, &processor_clks)); 1292 } else if (x86_vendor == X86_VENDOR_Cyrix || x86_type == X86_TYPE_P5) { 1293 #if defined(__amd64) 1294 panic("mach_getcpufreq: no TSC!"); 1295 #elif defined(__i386) 1296 /* 1297 * We are a Cyrix based on a 6x86 core or an Intel Pentium 1298 * for which freq_notsc() knows how to measure the number of 1299 * elapsed clock cycles sampled against the PIT 1300 */ 1301 ulong_t flags = clear_int_flag(); 1302 processor_clks = freq_notsc(&pit_counter); 1303 restore_int_flag(flags); 1304 return (mach_calchz(pit_counter, &processor_clks)); 1305 #endif /* __i386 */ 1306 } 1307 1308 /* We do not know how to calculate cpu frequency for this cpu. */ 1309 return (0); 1310 #endif /* __xpv */ 1311 } 1312 1313 /* 1314 * If the clock speed of a cpu is found to be reported incorrectly, do not add 1315 * to this array, instead improve the accuracy of the algorithm that determines 1316 * the clock speed of the processor or extend the implementation to support the 1317 * vendor as appropriate. This is here only to support adjusting the speed on 1318 * older slower processors that mach_fixcpufreq() would not be able to account 1319 * for otherwise. 1320 */ 1321 static int x86_cpu_freq[] = { 60, 75, 80, 90, 120, 160, 166, 175, 180, 233 }; 1322 1323 /* 1324 * On fast processors the clock frequency that is measured may be off by 1325 * a few MHz from the value printed on the part. This is a combination of 1326 * the factors that for such fast parts being off by this much is within 1327 * the tolerances for manufacture and because of the difficulties in the 1328 * measurement that can lead to small error. This function uses some 1329 * heuristics in order to tweak the value that was measured to match what 1330 * is most likely printed on the part. 1331 * 1332 * Some examples: 1333 * AMD Athlon 1000 mhz measured as 998 mhz 1334 * Intel Pentium III Xeon 733 mhz measured as 731 mhz 1335 * Intel Pentium IV 1500 mhz measured as 1495mhz 1336 * 1337 * If in the future this function is no longer sufficient to correct 1338 * for the error in the measurement, then the algorithm used to perform 1339 * the measurement will have to be improved in order to increase accuracy 1340 * rather than adding horrible and questionable kludges here. 1341 * 1342 * This is called after the cyclics subsystem because of the potential 1343 * that the heuristics within may give a worse estimate of the clock 1344 * frequency than the value that was measured. 
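 *
 * As a rough worked example of the arithmetic below: a part measured at
 * 731 MHz yields mul = (3 * 731 + 100) / 200 = 11 and near66 =
 * (200 * 11 + 1) / 3 = 733 with delta66 = 2, versus near50 = 750 with
 * delta50 = 19; 733 is closer, so cpu_freq is corrected from 731 to 733.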
 */
static void
mach_fixcpufreq(void)
{
        uint32_t freq, mul, near66, delta66, near50, delta50, fixed, delta, i;

        freq = (uint32_t)cpu_freq;

        /*
         * Find the nearest integer multiple of 200/3 (about 66) MHz to the
         * measured speed taking into account that the 667 MHz parts were
         * the first to round-up.
         */
        mul = (uint32_t)((3 * (uint64_t)freq + 100) / 200);
        near66 = (uint32_t)((200 * (uint64_t)mul + ((mul >= 10) ? 1 : 0)) / 3);
        delta66 = (near66 > freq) ? (near66 - freq) : (freq - near66);

        /* Find the nearest integer multiple of 50 MHz to the measured speed */
        mul = (freq + 25) / 50;
        near50 = mul * 50;
        delta50 = (near50 > freq) ? (near50 - freq) : (freq - near50);

        /* Find the closer of the two */
        if (delta66 < delta50) {
                fixed = near66;
                delta = delta66;
        } else {
                fixed = near50;
                delta = delta50;
        }

        if (fixed > INT_MAX)
                return;

        /*
         * Some older parts have a core clock frequency that is not an
         * integral multiple of 50 or 66 MHz. Check if one of the old
         * clock frequencies is closer to the measured value than any
         * of the integral multiples of 50 and 66, and if so set fixed
         * and delta appropriately to represent the closest value.
         */
        i = sizeof (x86_cpu_freq) / sizeof (int);
        while (i > 0) {
                i--;

                if (x86_cpu_freq[i] <= freq) {
                        mul = freq - x86_cpu_freq[i];

                        if (mul < delta) {
                                fixed = x86_cpu_freq[i];
                                delta = mul;
                        }

                        break;
                }

                mul = x86_cpu_freq[i] - freq;

                if (mul < delta) {
                        fixed = x86_cpu_freq[i];
                        delta = mul;
                }
        }

        /*
         * Set a reasonable maximum for how much to correct the measured
         * result by. This check is here to prevent the adjustment made
         * by this function from being more harm than good. It is entirely
         * possible that in the future parts will be made that are not
         * integral multiples of 66 or 50 in clock frequency or that
         * someone may overclock a part to some odd frequency. If the
         * measured value is farther from the corrected value than
         * allowed, then assume the corrected value is in error and use
         * the measured value.
1419 */ 1420 if (6 < delta) 1421 return; 1422 1423 cpu_freq = (int)fixed; 1424 } 1425 1426 1427 static int 1428 machhztomhz(uint64_t cpu_freq_hz) 1429 { 1430 uint64_t cpu_mhz; 1431 1432 /* Round to nearest MHZ */ 1433 cpu_mhz = (cpu_freq_hz + (MEGA_HZ / 2)) / MEGA_HZ; 1434 1435 if (cpu_mhz > INT_MAX) 1436 return (0); 1437 1438 return ((int)cpu_mhz); 1439 1440 } 1441 1442 1443 static int 1444 mach_clkinit(int preferred_mode, int *set_mode) 1445 { 1446 struct psm_ops *pops; 1447 int resolution; 1448 1449 pops = mach_set[0]; 1450 1451 cpu_freq_hz = mach_getcpufreq(); 1452 1453 cpu_freq = machhztomhz(cpu_freq_hz); 1454 1455 if (!is_x86_feature(x86_featureset, X86FSET_TSC) || (cpu_freq == 0)) 1456 tsc_gethrtime_enable = 0; 1457 1458 #ifndef __xpv 1459 if (tsc_gethrtime_enable) { 1460 tsc_hrtimeinit(cpu_freq_hz); 1461 } else 1462 #endif 1463 { 1464 if (pops->psm_hrtimeinit) 1465 (*pops->psm_hrtimeinit)(); 1466 gethrtimef = pops->psm_gethrtime; 1467 gethrtimeunscaledf = gethrtimef; 1468 /* scalehrtimef will remain dummy */ 1469 } 1470 1471 mach_fixcpufreq(); 1472 1473 if (mach_ver[0] >= PSM_INFO_VER01_3) { 1474 if (preferred_mode == TIMER_ONESHOT) { 1475 1476 resolution = (*pops->psm_clkinit)(0); 1477 if (resolution != 0) { 1478 *set_mode = TIMER_ONESHOT; 1479 return (resolution); 1480 } 1481 } 1482 1483 /* 1484 * either periodic mode was requested or could not set to 1485 * one-shot mode 1486 */ 1487 resolution = (*pops->psm_clkinit)(hz); 1488 /* 1489 * psm should be able to do periodic, so we do not check 1490 * for return value of psm_clkinit here. 1491 */ 1492 *set_mode = TIMER_PERIODIC; 1493 return (resolution); 1494 } else { 1495 /* 1496 * PSMI interface prior to PSMI_3 does not define a return 1497 * value for psm_clkinit, so the return value is ignored. 1498 */ 1499 (void) (*pops->psm_clkinit)(hz); 1500 *set_mode = TIMER_PERIODIC; 1501 return (nsec_per_tick); 1502 } 1503 } 1504 1505 1506 /*ARGSUSED*/ 1507 static int 1508 mach_softlvl_to_vect(int ipl) 1509 { 1510 setsoftint = av_set_softint_pending; 1511 kdisetsoftint = kdi_av_set_softint_pending; 1512 1513 return (PSM_SV_SOFTWARE); 1514 } 1515 1516 #ifdef DEBUG 1517 /* 1518 * This is here to allow us to simulate cpus that refuse to start. 
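 *
 * For example, setting a CPU's bit in cpufailset (e.g. from kmdb before
 * that CPU is started) makes mach_cpu_start() and mach_cpuid_start()
 * below return 0 for that CPU without calling into the PSM.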
1519 */ 1520 cpuset_t cpufailset; 1521 #endif 1522 1523 int 1524 mach_cpu_start(struct cpu *cp, void *ctx) 1525 { 1526 struct psm_ops *pops = mach_set[0]; 1527 processorid_t id = cp->cpu_id; 1528 1529 #ifdef DEBUG 1530 if (CPU_IN_SET(cpufailset, id)) 1531 return (0); 1532 #endif 1533 return ((*pops->psm_cpu_start)(id, ctx)); 1534 } 1535 1536 int 1537 mach_cpuid_start(processorid_t id, void *ctx) 1538 { 1539 struct psm_ops *pops = mach_set[0]; 1540 1541 #ifdef DEBUG 1542 if (CPU_IN_SET(cpufailset, id)) 1543 return (0); 1544 #endif 1545 return ((*pops->psm_cpu_start)(id, ctx)); 1546 } 1547 1548 int 1549 mach_cpu_stop(cpu_t *cp, void *ctx) 1550 { 1551 struct psm_ops *pops = mach_set[0]; 1552 psm_cpu_request_t request; 1553 1554 if (pops->psm_cpu_ops == NULL) { 1555 return (ENOTSUP); 1556 } 1557 1558 ASSERT(cp->cpu_id != -1); 1559 request.pcr_cmd = PSM_CPU_STOP; 1560 request.req.cpu_stop.cpuid = cp->cpu_id; 1561 request.req.cpu_stop.ctx = ctx; 1562 1563 return ((*pops->psm_cpu_ops)(&request)); 1564 } 1565 1566 int 1567 mach_cpu_add(mach_cpu_add_arg_t *argp, processorid_t *cpuidp) 1568 { 1569 int rc; 1570 struct psm_ops *pops = mach_set[0]; 1571 psm_cpu_request_t request; 1572 1573 if (pops->psm_cpu_ops == NULL) { 1574 return (ENOTSUP); 1575 } 1576 1577 request.pcr_cmd = PSM_CPU_ADD; 1578 request.req.cpu_add.argp = argp; 1579 request.req.cpu_add.cpuid = -1; 1580 rc = (*pops->psm_cpu_ops)(&request); 1581 if (rc == 0) { 1582 ASSERT(request.req.cpu_add.cpuid != -1); 1583 *cpuidp = request.req.cpu_add.cpuid; 1584 } 1585 1586 return (rc); 1587 } 1588 1589 int 1590 mach_cpu_remove(processorid_t cpuid) 1591 { 1592 struct psm_ops *pops = mach_set[0]; 1593 psm_cpu_request_t request; 1594 1595 if (pops->psm_cpu_ops == NULL) { 1596 return (ENOTSUP); 1597 } 1598 1599 request.pcr_cmd = PSM_CPU_REMOVE; 1600 request.req.cpu_remove.cpuid = cpuid; 1601 1602 return ((*pops->psm_cpu_ops)(&request)); 1603 } 1604 1605 /* 1606 * Default handler to create device node for CPU. 1607 * One reference count will be held on created device node. 1608 */ 1609 static int 1610 mach_cpu_create_devinfo(cpu_t *cp, dev_info_t **dipp) 1611 { 1612 int rv, circ; 1613 dev_info_t *dip; 1614 static kmutex_t cpu_node_lock; 1615 static dev_info_t *cpu_nex_devi = NULL; 1616 1617 ASSERT(cp != NULL); 1618 ASSERT(dipp != NULL); 1619 *dipp = NULL; 1620 1621 if (cpu_nex_devi == NULL) { 1622 mutex_enter(&cpu_node_lock); 1623 /* First check whether cpus exists. */ 1624 cpu_nex_devi = ddi_find_devinfo("cpus", -1, 0); 1625 /* Create cpus if it doesn't exist. 
 */
                if (cpu_nex_devi == NULL) {
                        ndi_devi_enter(ddi_root_node(), &circ);
                        rv = ndi_devi_alloc(ddi_root_node(), "cpus",
                            (pnode_t)DEVI_SID_NODEID, &dip);
                        if (rv != NDI_SUCCESS) {
                                mutex_exit(&cpu_node_lock);
                                cmn_err(CE_CONT,
                                    "?failed to create cpu nexus device.\n");
                                return (PSM_FAILURE);
                        }
                        ASSERT(dip != NULL);
                        (void) ndi_devi_online(dip, 0);
                        ndi_devi_exit(ddi_root_node(), circ);
                        cpu_nex_devi = dip;
                }
                mutex_exit(&cpu_node_lock);
        }

        /*
         * create a child node for cpu identified as 'cpu_id'
         */
        ndi_devi_enter(cpu_nex_devi, &circ);
        dip = ddi_add_child(cpu_nex_devi, "cpu", DEVI_SID_NODEID, -1);
        if (dip == NULL) {
                cmn_err(CE_CONT,
                    "?failed to create device node for cpu%d.\n", cp->cpu_id);
                rv = PSM_FAILURE;
        } else {
                *dipp = dip;
                (void) ndi_hold_devi(dip);
                rv = PSM_SUCCESS;
        }
        ndi_devi_exit(cpu_nex_devi, circ);

        return (rv);
}

/*
 * Create cpu device node in device tree and online it.
 * Return created dip with reference count held if requested.
 */
int
mach_cpu_create_device_node(struct cpu *cp, dev_info_t **dipp)
{
        int rv;
        dev_info_t *dip = NULL;

        ASSERT(psm_cpu_create_devinfo != NULL);
        rv = psm_cpu_create_devinfo(cp, &dip);
        if (rv == PSM_SUCCESS) {
                cpuid_set_cpu_properties(dip, cp->cpu_id, cp->cpu_m.mcpu_cpi);
                /* Recursively attach driver for parent nexus device. */
                if (i_ddi_attach_node_hierarchy(ddi_get_parent(dip)) ==
                    DDI_SUCCESS) {
                        /* Configure cpu itself and descendants. */
                        (void) ndi_devi_online(dip,
                            NDI_ONLINE_ATTACH | NDI_CONFIG);
                }
                if (dipp != NULL) {
                        *dipp = dip;
                } else {
                        (void) ndi_rele_devi(dip);
                }
        }

        return (rv);
}

/*
 * The dipp contains one of the following values on return:
 * - NULL if no device node found
 * - pointer to device node if found
 */
int
mach_cpu_get_device_node(struct cpu *cp, dev_info_t **dipp)
{
        *dipp = NULL;
        if (psm_cpu_get_devinfo != NULL) {
                if (psm_cpu_get_devinfo(cp, dipp) == PSM_SUCCESS) {
                        return (PSM_SUCCESS);
                }
        }

        return (PSM_FAILURE);
}

/*ARGSUSED*/
static int
mach_translate_irq(dev_info_t *dip, int irqno)
{
        return (irqno); /* default to NO translation */
}

static void
mach_notify_error(int level, char *errmsg)
{
        /*
         * SL_FATAL is passed in once panicstr is set; deliver it
         * as CE_PANIC. Also, translate SL_ codes back to CE_
         * codes for the psmi handler
         */
        if (level & SL_FATAL)
                (*notify_error)(CE_PANIC, errmsg);
        else if (level & SL_WARN)
                (*notify_error)(CE_WARN, errmsg);
        else if (level & SL_NOTE)
                (*notify_error)(CE_NOTE, errmsg);
        else if (level & SL_CONSOLE)
                (*notify_error)(CE_CONT, errmsg);
}

/*
 * It provides the default basic intr_ops interface for the new DDI
 * interrupt framework if the PSM doesn't have one.
 *
 * Input:
 * dip     - pointer to the dev_info structure of the requested device
 * hdlp    - pointer to the internal interrupt handle structure for the
 *           requested interrupt
 * intr_op - opcode for this call
 * result  - pointer to the integer that will hold the result to be
 *           passed back if return value is PSM_SUCCESS
 *
 * Output:
 * return value is either PSM_SUCCESS or PSM_FAILURE
 */
static int
mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
    psm_intr_op_t intr_op, int *result)
{
        struct intrspec *ispec;

        switch (intr_op) {
        case PSM_INTR_OP_CHECK_MSI:
                *result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
                    DDI_INTR_TYPE_MSIX);
                break;
        case PSM_INTR_OP_ALLOC_VECTORS:
                if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
                        *result = 1;
                else
                        *result = 0;
                break;
        case PSM_INTR_OP_FREE_VECTORS:
                break;
        case PSM_INTR_OP_NAVAIL_VECTORS:
                if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
                        *result = 1;
                else
                        *result = 0;
                break;
        case PSM_INTR_OP_XLATE_VECTOR:
                ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
                *result = psm_translate_irq(dip, ispec->intrspec_vec);
                break;
        case PSM_INTR_OP_GET_CAP:
                *result = 0;
                break;
        case PSM_INTR_OP_GET_PENDING:
        case PSM_INTR_OP_CLEAR_MASK:
        case PSM_INTR_OP_SET_MASK:
        case PSM_INTR_OP_GET_SHARED:
        case PSM_INTR_OP_SET_PRI:
        case PSM_INTR_OP_SET_CAP:
        case PSM_INTR_OP_SET_CPU:
        case PSM_INTR_OP_GET_INTR:
        default:
                return (PSM_FAILURE);
        }
        return (PSM_SUCCESS);
}
/*
 * Return 1 if CMT load balancing policies should be
 * implemented across instances of the specified hardware
 * sharing relationship.
 */
int
pg_cmt_load_bal_hw(pghw_type_t hw)
{
        if (hw == PGHW_IPIPE ||
            hw == PGHW_FPU ||
            hw == PGHW_PROCNODE ||
            hw == PGHW_CHIP)
                return (1);
        else
                return (0);
}
/*
 * Return 1 if thread affinity policies should be implemented
 * for instances of the specified hardware sharing relationship.
 */
int
pg_cmt_affinity_hw(pghw_type_t hw)
{
        if (hw == PGHW_CACHE)
                return (1);
        else
                return (0);
}

/*
 * Return number of counter events requested to measure hardware capacity and
 * utilization and set up CPC requests for the specified CPU as needed
 *
 * May return 0 when platform or processor specific code knows that no CPC
 * events should be programmed on this CPU or -1 when platform or processor
 * specific code doesn't know which counter events are best to use and common
 * code should decide for itself
 */
int
/* LINTED E_FUNC_ARG_UNUSED */
cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
{
        const char *impl_name;

        /*
         * Return error if pcbe_ops not set
         */
        if (pcbe_ops == NULL)
                return (-1);

        /*
         * Return that no CPC events should be programmed on hyperthreaded
         * Pentium 4 and return error for all other x86 processors to tell
         * common code to decide what counter events to program on those CPUs
         * for measuring hardware capacity and utilization
         */
        impl_name = pcbe_ops->pcbe_impl_name();
        if (impl_name != NULL && strcmp(impl_name, PCBE_IMPL_NAME_P4HT) == 0)
                return (0);
        else
                return (-1);
}