1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* 26 * Copyright 2021 Joyent, Inc. 27 * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 28 * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org> 29 */ 30 31 /* 32 * PSMI 1.1 extensions are supported only in 2.6 and later versions. 33 * PSMI 1.2 extensions are supported only in 2.7 and later versions. 34 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. 35 * PSMI 1.5 extensions are supported in Solaris Nevada. 36 * PSMI 1.6 extensions are supported in Solaris Nevada. 37 * PSMI 1.7 extensions are supported in Solaris Nevada. 
 */

/* This module implements PSMI 1.7 (see the version notes above). */
#define	PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/cram.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/spl.h>
#include <sys/clock.h>
#include <sys/dditypes.h>
#include <sys/sunddi.h>	/* NOTE(review): duplicate of the include above */
#include <sys/x_call.h>
#include <sys/reboot.h>
#include <sys/hpet.h>
#include <sys/apic_common.h>
#include <sys/apic_timer.h>
#include <sys/tsc.h>

/* Default recorders installed in apic_nointrmap_ops (defined below). */
static void apic_record_ioapic_rdt(void *intrmap_private,
    ioapic_rdt_t *irdt);
static void apic_record_msi(void *intrmap_private, msi_regs_t *mregs);

/*
 * Common routines between pcplusmp & apix (taken from apic.c).
 */

int	apic_clkinit(int);
hrtime_t apic_gethrtime(void);
void	apic_send_ipi(int, int);
void	apic_set_idlecpu(processorid_t);
void	apic_unset_idlecpu(processorid_t);
void	apic_shutdown(int, int);
void	apic_preshutdown(int, int);
processorid_t	apic_get_next_processorid(processorid_t);

hrtime_t apic_gettime();

enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;

/* Now the ones for Dynamic Interrupt distribution */
int	apic_enable_dynamic_migration = 0;

/* maximum loop count when sending Start IPIs. */
int apic_sipi_max_loop_count = 0x1000;

/*
 * These variables are frequently accessed in apic_intr_enter(),
 * apic_intr_exit and apic_setspl, so group them together
 */
volatile uint32_t *apicadr = NULL;	/* virtual addr of local APIC	*/
int apic_setspl_delay = 1;		/* apic_setspl - delay enable	*/
int apic_clkvect;

/* vector at which error interrupts come in */
int apic_errvect;
int apic_enable_error_intr = 1;
int apic_error_display_delay = 100;

/* vector at which performance counter overflow interrupts come in */
int apic_cpcovf_vect;
int apic_enable_cpcovf_intr = 1;

/* vector at which CMCI interrupts come in */
int apic_cmci_vect;
extern void cmi_cmci_trap(void);

/* serializes transitions between LOCAL_APIC and LOCAL_X2APIC mode */
lock_t apic_mode_switch_lock;

int apic_pir_vect;

/*
 * Patchable global variables.
 */
int	apic_forceload = 0;

int	apic_coarse_hrtime = 1;	/* 0 - use accurate slow gethrtime() */

int apic_flat_model = 0;	/* 0 - clustered. 1 - flat */
int apic_panic_on_nmi = 0;
int apic_panic_on_apic_error = 0;

int apic_verbose = 0;	/* 0x1ff */

/* If set, force APIC calibration to use the PIT instead of the TSC */
int apic_calibrate_use_pit = 0;

/*
 * It was found empirically that 5 measurements seem sufficient to give a good
 * accuracy. Most spurious measurements are higher than the target value thus
 * we eliminate up to 2/5 spurious measurements.
 */
#define	APIC_CALIBRATE_MEASUREMENTS		5

#define	APIC_CALIBRATE_PERCENT_OFF_WARNING	10

extern int pit_is_broken; /* from tscc_pit.c */

/* raw per-iteration calibration results, kept for post-boot inspection */
uint64_t apic_info_tsc[APIC_CALIBRATE_MEASUREMENTS];
uint64_t apic_info_pit[APIC_CALIBRATE_MEASUREMENTS];

#ifdef DEBUG
int	apic_debug = 0;
int	apic_restrict_vector = 0;

int	apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
int	apic_debug_msgbufindex = 0;

#endif /* DEBUG */

uint_t apic_nticks = 0;
uint_t apic_skipped_redistribute = 0;

uint_t last_count_read = 0;
lock_t	apic_gethrtime_lock;
volatile int	apic_hrtime_stamp = 0;
volatile hrtime_t apic_nsec_since_boot = 0;

static	hrtime_t	apic_last_hrtime = 0;
int		apic_hrtime_error = 0;
int		apic_remote_hrterr = 0;
int		apic_num_nmis = 0;
int		apic_apic_error = 0;
int		apic_num_apic_errors = 0;
int		apic_num_cksum_errors = 0;

int	apic_error = 0;

static	int	apic_cmos_ssb_set = 0;

/* use to make sure only one cpu handles the nmi */
lock_t	apic_nmi_lock;
/* use to make sure only one cpu handles the error interrupt */
lock_t	apic_error_lock;

/*
 * Scripted BMC watchdog command sequence (written one entry at a time
 * through the KCS/SMS control and data registers): arm a power-down
 * watchdog, then reset (re-arm) it.
 * NOTE(review): the consumer of these tables is not visible in this part
 * of the file; presumably the shutdown path -- confirm against
 * apic_shutdown().
 */
static	struct {
	uchar_t	cntl;
	uchar_t	data;
} aspen_bmc[] = {
	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_NEXT,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ CC_SMS_WR_NEXT,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ CC_SMS_WR_NEXT,	0x2 },		/* DataByte 2: Power Down */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 3: no pre-timeout */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 4: timer expir. */
	{ CC_SMS_WR_NEXT,	0xa },		/* DataByte 5: init countdown */
	{ CC_SMS_WR_END,	0x0 },		/* DataByte 6: init countdown */

	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_END,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};

/* Same watchdog script for Sitka-class BMCs, addressed by I/O port. */
static struct {
	int	port;
	uchar_t	data;
} sitka_bmc[] = {
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_DATA_REGISTER,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ SMS_DATA_REGISTER,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ SMS_DATA_REGISTER,	0x2 },		/* DataByte 2: Power Down */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 3: no pre-timeout */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 4: timer expir. */
	{ SMS_DATA_REGISTER,	0xa },		/* DataByte 5: init countdown */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 6: init countdown */

	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};

/* Patchable global variables.
 */
int	apic_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
uint32_t apic_divide_reg_init = 0;	/* 0 - divide by 2 */

/* default apic ops without interrupt remapping */
static apic_intrmap_ops_t apic_nointrmap_ops = {
	(int (*)(int))return_instr,
	(void (*)(int))return_instr,
	(void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
	(void (*)(void *, void *, uint16_t, int))return_instr,
	(void (*)(void **))return_instr,
	apic_record_ioapic_rdt,
	apic_record_msi,
};

apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
apic_cpus_info_t *apic_cpus = NULL;
cpuset_t apic_cpumask;
uint_t apic_picinit_called;

/* Flag to indicate that we need to shut down all processors */
static uint_t apic_shutdown_processors;

/*
 * Probe the ioapic method for apix module. Called in apic_probe_common()
 *
 * Returns PSM_SUCCESS when apix should be used (or is disabled entirely),
 * PSM_FAILURE to fall back to pcplusmp.
 */
int
apic_ioapic_method_probe()
{
	if (apix_enable == 0)
		return (PSM_SUCCESS);

	/*
	 * Set IOAPIC EOI handling method. The priority from low to high is:
	 *	1. IOxAPIC: with EOI register
	 *	2. IOMMU interrupt mapping
	 *	3. Mask-Before-EOI method for systems without boot
	 *	interrupt routing, such as systems with only one IOAPIC;
	 *	NVIDIA CK8-04/MCP55 systems; systems with bridge solution
	 *	which disables the boot interrupt routing already.
	 *	4. Directed EOI
	 */
	if (apic_io_ver[0] >= 0x20)
		apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
	if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
		apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
	if (apic_directed_EOI_supported())
		apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;

	/* fall back to pcplusmp */
	if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
		/* make sure apix is after pcplusmp in /etc/mach */
		apix_enable = 0; /* go ahead with pcplusmp install next */
		return (PSM_FAILURE);
	}

	return (PSM_SUCCESS);
}

/*
 * handler for APIC Error interrupt. Just print a warning and continue
 *
 * Returns DDI_INTR_CLAIMED when an error bit was latched (or when another
 * CPU already holds apic_error_lock for a real error); DDI_INTR_UNCLAIMED
 * otherwise.
 */
int
apic_error_intr()
{
	uint_t	error0, error1, error;
	uint_t	i;

	/*
	 * We need to write before read as per 7.4.17 of system prog manual.
	 * We do both and or the results to be safe
	 */
	error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
	error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
	error = error0 | error1;

	/*
	 * Clear the APIC error status (do this on all cpus that enter here)
	 * (two writes are required due to the semantics of accessing the
	 * error status register.)
	 */
	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);

	/*
	 * Prevent more than 1 CPU from handling error interrupt causing
	 * double printing (interleave of characters from multiple
	 * CPU's when using prom_printf)
	 */
	if (lock_try(&apic_error_lock) == 0)
		return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
	if (error) {
#if DEBUG	/* NOTE(review): '#if' (not '#ifdef'); inactive unless DEBUG expands non-zero */
		if (apic_debug)
			debug_enter("pcplusmp: APIC Error interrupt received");
#endif /* DEBUG */
		if (apic_panic_on_apic_error)
			cmn_err(CE_PANIC,
			    "APIC Error interrupt on CPU %d. Status = %x",
			    psm_get_cpu_id(), error);
		else {
			if ((error & ~APIC_CS_ERRORS) == 0) {
				/* cksum error only */
				apic_error |= APIC_ERR_APIC_ERROR;
				apic_apic_error |= error;
				apic_num_apic_errors++;
				apic_num_cksum_errors++;
			} else {
				/*
				 * prom_printf is the best shot we have of
				 * something which is problem free from
				 * high level/NMI type of interrupts
				 */
				prom_printf("APIC Error interrupt on CPU %d. "
				    "Status 0 = %x, Status 1 = %x\n",
				    psm_get_cpu_id(), error0, error1);
				apic_error |= APIC_ERR_APIC_ERROR;
				apic_apic_error |= error;
				apic_num_apic_errors++;
				for (i = 0; i < apic_error_display_delay; i++) {
					tenmicrosec();
				}
				/*
				 * provide more delay next time limited to
				 * roughly 1 clock tick time
				 */
				if (apic_error_display_delay < 500)
					apic_error_display_delay *= 2;
			}
		}
		lock_clear(&apic_error_lock);
		return (DDI_INTR_CLAIMED);
	} else {
		lock_clear(&apic_error_lock);
		return (DDI_INTR_UNCLAIMED);
	}
}

/*
 * Turn off the mask bit in the performance counter Local Vector Table entry.
 */
void
apic_cpcovf_mask_clear(void)
{
	apic_reg_ops->apic_write(APIC_PCINT_VECT,
	    (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
}

/* cross-call target: program (unmask) the CMCI LVT on the current CPU */
static int
apic_cmci_enable(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
    xc_arg_t arg3 __unused)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
	return (0);
}

/* cross-call target: mask the CMCI LVT on the current CPU */
static int
apic_cmci_disable(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
    xc_arg_t arg3 __unused)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
	return (0);
}

/*
 * Enable or disable CMCI delivery on the given CPU by cross-calling it,
 * since the LVT must be written from that CPU's own local APIC.
 */
void
apic_cmci_setup(processorid_t cpuid, boolean_t enable)
{
	cpuset_t cpu_set;

	CPUSET_ONLY(cpu_set, cpuid);

	if (enable) {
		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
		    (xc_func_t)apic_cmci_enable);
	} else {
		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
		    (xc_func_t)apic_cmci_disable);
	}
}

/*
 * Quiesce the current CPU's local APIC: mask all LVT entries (timer,
 * LINT0/LINT1, error, perf counter) and raise the task priority to mask
 * all interrupt classes.
 */
static void
apic_disable_local_apic(void)
{
	apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
	apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);

	/* local intr reg 0 */
	apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);

	/* disable NMI */
	apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);

	/* and error interrupt */
	apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);

	/* and perf counter intr */
	apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);

	apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
}

/*
 * Drive the classic INIT/SIPI protocol at CPU 'cpun': assert INIT, wait
 * for delivery, de-assert INIT, then (for integrated APICs) send two
 * STARTUP IPIs whose vector points at the real-mode platter page.  When
 * 'start' is set, the CMOS shutdown-status byte is armed first so the
 * BIOS performs a warm start.
 */
static void
apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
{
	int		loop_count;
	uint32_t	vector;
	uint_t		apicid;
	ulong_t		iflag;

	apicid = apic_cpus[cpun].aci_local_id;

	/*
	 * Interrupts on current CPU will be disabled during the
	 * steps in order to avoid unwanted side effects from
	 * executing interrupt handlers on a problematic BIOS.
	 */
	iflag = intr_clear();

	if (start) {
		outb(CMOS_ADDR, SSB);
		outb(CMOS_DATA, BIOS_SHUTDOWN);
	}

	/*
	 * According to X2APIC specification in section '2.3.5.1' of
	 * Interrupt Command Register Semantics, the semantics of
	 * programming the Interrupt Command Register to dispatch an interrupt
	 * is simplified. A single MSR write to the 64-bit ICR is required
	 * for dispatching an interrupt. Specifically, with the 64-bit MSR
	 * interface to ICR, system software is not required to check the
	 * status of the delivery status bit prior to writing to the ICR
	 * to send an IPI. With the removal of the Delivery Status bit,
	 * system software no longer has a reason to read the ICR. It remains
	 * readable only to aid in debugging.
	 */
#ifdef	DEBUG
	APIC_AV_PENDING_SET();
#else
	if (apic_mode == LOCAL_APIC) {
		APIC_AV_PENDING_SET();
	}
#endif /* DEBUG */

	/* for integrated - make sure there is one INIT IPI in buffer */
	/* for external - it will wake up the cpu */
	apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

	/* If only 1 CPU is installed, PENDING bit will not go low */
	for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
		if (apic_mode == LOCAL_APIC &&
		    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
			apic_ret();
		else
			break;
	}

	apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
	drv_usecwait(20000);		/* 20 milli sec */

	if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
		/* integrated apic */

		vector = (rm_platter_pa >> MMU_PAGESHIFT) &
		    (APIC_VECTOR_MASK | APIC_IPL_MASK);

		/* to offset the INIT IPI queue up in the buffer */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);		/* 20 micro sec */

		/*
		 * send the second SIPI (Startup IPI) as recommended by Intel
		 * software development manual.
		 */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);	/* 20 micro sec */
	}

	intr_restore(iflag);
}

/*
 * Start CPU 'cpun' via INIT/SIPI.  Returns 0 on success, EINVAL for an
 * out-of-range CPU id.  Caller must hold cpu_lock.
 */
/*ARGSUSED1*/
int
apic_cpu_start(processorid_t cpun, caddr_t arg __unused)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!apic_cpu_in_range(cpun)) {
		return (EINVAL);
	}

	/*
	 * Switch to apic_common_send_ipi for safety during starting other CPUs.
	 */
	if (apic_mode == LOCAL_X2APIC) {
		apic_switch_ipi_callback(B_TRUE);
	}

	apic_cmos_ssb_set = 1;
	apic_cpu_send_SIPI(cpun, B_TRUE);

	return (0);
}

/*
 * Put CPU into halted state with interrupts disabled.
 */
/*ARGSUSED1*/
int
apic_cpu_stop(processorid_t cpun, caddr_t arg __unused)
{
	int		rc;
	cpu_t		*cp;
	extern cpuset_t cpu_ready_set;
	extern void cpu_idle_intercept_cpu(cpu_t *cp);

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!apic_cpu_in_range(cpun)) {
		return (EINVAL);
	}
	/* only integrated APICs can be re-INITed safely */
	if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
		return (ENOTSUP);
	}

	cp = cpu_get(cpun);
	ASSERT(cp != NULL);
	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
	ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);

	/* Clear CPU_READY flag to disable cross calls. */
	cp->cpu_flags &= ~CPU_READY;
	CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
	rc = xc_flush_cpu(cp);
	if (rc != 0) {
		/* roll back readiness state on failure */
		CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
		cp->cpu_flags |= CPU_READY;
		return (rc);
	}

	/* Intercept target CPU at a safe point before powering it off.
 */
	cpu_idle_intercept_cpu(cp);

	apic_cpu_send_SIPI(cpun, B_FALSE);
	cp->cpu_flags &= ~CPU_RUNNING;

	return (0);
}

/*
 * PSM entry point dispatching CPU hotplug requests to the add/remove/stop
 * handlers.  Returns EINVAL for a NULL request, ENOTSUP for unknown
 * commands, otherwise the handler's result.
 */
int
apic_cpu_ops(psm_cpu_request_t *reqp)
{
	if (reqp == NULL) {
		return (EINVAL);
	}

	switch (reqp->pcr_cmd) {
	case PSM_CPU_ADD:
		return (apic_cpu_add(reqp));

	case PSM_CPU_REMOVE:
		return (apic_cpu_remove(reqp));

	case PSM_CPU_STOP:
		return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
		    reqp->req.cpu_stop.ctx));

	default:
		return (ENOTSUP);
	}
}

#ifdef	DEBUG
int	apic_break_on_cpu = 9;
int	apic_stretch_interrupts = 0;
int	apic_stretch_ISR = 1 << 3;	/* IPL of 3 matches nothing now */
#endif /* DEBUG */

/*
 * generates an interprocessor interrupt to another CPU. Any changes made to
 * this routine must be accompanied by similar changes to
 * apic_common_send_ipi().
 */
void
apic_send_ipi(int cpun, int ipl)
{
	int vector;
	ulong_t flag;

	vector = apic_resv_vector[ipl];

	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));

	flag = intr_clear();

	APIC_AV_PENDING_SET();

	apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
	    vector);

	intr_restore(flag);
}

/*
 * Send the posted-interrupt notification vector to 'cpun'; no-op when
 * the target is the calling CPU.
 */
void
apic_send_pir_ipi(processorid_t cpun)
{
	const int vector = apic_pir_vect;
	ulong_t flag;

	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));

	flag = intr_clear();

	/* Self-IPI for inducing PIR makes no sense.
 */
	if ((cpun != psm_get_cpu_id())) {
		APIC_AV_PENDING_SET();
		apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
		    vector);
	}

	intr_restore(flag);
}

int
apic_get_pir_ipivect(void)
{
	return (apic_pir_vect);
}

/* PSM idle-cpu hooks: intentionally empty for this implementation. */
void
apic_set_idlecpu(processorid_t cpun __unused)
{
}

void
apic_unset_idlecpu(processorid_t cpun __unused)
{
}


/*
 * Empty function invoked from the busy-wait loops in this file.
 * NOTE(review): presumably exists so those loops contain a call rather
 * than spinning empty -- confirm before changing.
 */
void
apic_ret()
{
}

/*
 * If apic_coarse_time == 1, then apic_gettime() is used instead of
 * apic_gethrtime(). This is used for performance instead of accuracy.
 */

hrtime_t
apic_gettime()
{
	int old_hrtime_stamp;
	hrtime_t temp;

	/*
	 * In one-shot mode, we do not keep time, so if anyone
	 * calls psm_gettime() directly, we vector over to
	 * gethrtime().
	 * one-shot mode MUST NOT be enabled if this psm is the source of
	 * hrtime.
	 */

	if (apic_oneshot)
		return (gethrtime());


gettime_again:
	/* odd stamp means a clock interrupt is updating the time */
	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
		apic_ret();

	temp = apic_nsec_since_boot;

	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
		goto gettime_again;
	}
	return (temp);
}

/*
 * Here we return the number of nanoseconds since booting.  Note every
 * clock interrupt increments apic_nsec_since_boot by the appropriate
 * amount.
 */
hrtime_t
apic_gethrtime(void)
{
	int curr_timeval, countval, elapsed_ticks;
	int old_hrtime_stamp, status;
	hrtime_t temp;
	uint32_t cpun;
	ulong_t oflags;

	/*
	 * In one-shot mode, we do not keep time, so if anyone
	 * calls psm_gethrtime() directly, we vector over to
	 * gethrtime().
	 * one-shot mode MUST NOT be enabled if this psm is the source of
	 * hrtime.
	 */

	if (apic_oneshot)
		return (gethrtime());

	oflags = intr_clear();	/* prevent migration */

	cpun = apic_reg_ops->apic_read(APIC_LID_REG);
	if (apic_mode == LOCAL_APIC)
		cpun >>= APIC_ID_BIT_OFFSET;

	lock_set(&apic_gethrtime_lock);

gethrtime_again:
	/* odd stamp means a clock interrupt is updating the time */
	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
		apic_ret();

	/*
	 * Check to see which CPU we are on.  Note the time is kept on
	 * the local APIC of CPU 0.  If on CPU 0, simply read the current
	 * counter.  If on another CPU, issue a remote read command to CPU 0.
	 */
	if (cpun == apic_cpus[0].aci_local_id) {
		countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
	} else {
#ifdef	DEBUG
		APIC_AV_PENDING_SET();
#else
		if (apic_mode == LOCAL_APIC)
			APIC_AV_PENDING_SET();
#endif /* DEBUG */

		apic_reg_ops->apic_write_int_cmd(
		    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);

		/* spin until the remote read completes */
		while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
		    & AV_READ_PENDING) {
			apic_ret();
		}

		if (status & AV_REMOTE_STATUS)	/* 1 = valid */
			countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
		else {	/* 0 = invalid */
			apic_remote_hrterr++;
			/*
			 * return last hrtime right now, will need more
			 * testing if change to retry
			 */
			temp = apic_last_hrtime;

			lock_clear(&apic_gethrtime_lock);

			intr_restore(oflags);

			return (temp);
		}
	}
	/*
	 * NOTE(review): a count above the last one read implies the timer
	 * reloaded (wrapped); treat the period as fully elapsed -- confirm.
	 */
	if (countval > last_count_read)
		countval = 0;
	else
		last_count_read = countval;

	elapsed_ticks = apic_hertz_count - countval;

	curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
	temp = apic_nsec_since_boot + curr_timeval;

	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
		/* we might have clobbered last_count_read. Restore it
 */
		last_count_read = apic_hertz_count;
		goto gethrtime_again;
	}

	if (temp < apic_last_hrtime) {
		/* return last hrtime if error occurs */
		apic_hrtime_error++;
		temp = apic_last_hrtime;
	}
	else
		apic_last_hrtime = temp;

	lock_clear(&apic_gethrtime_lock);
	intr_restore(oflags);

	return (temp);
}

/* apic NMI handler */
uint_t
apic_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused)
{
	nmi_action_t action = nmi_action;

	/* during shutdown, the NMI is used to quiesce this CPU's APIC */
	if (apic_shutdown_processors) {
		apic_disable_local_apic();
		return (DDI_INTR_CLAIMED);
	}

	apic_error |= APIC_ERR_NMI;

	if (!lock_try(&apic_nmi_lock))
		return (DDI_INTR_CLAIMED);
	apic_num_nmis++;

	/*
	 * "nmi_action" always over-rides the older way of doing this, unless we
	 * can't actually drop into kmdb when requested.
	 */
	if (action == NMI_ACTION_KMDB && !psm_debugger())
		action = NMI_ACTION_UNSET;

	if (action == NMI_ACTION_UNSET) {
		/* legacy patchable-variable policy */
		if (apic_kmdb_on_nmi && psm_debugger())
			action = NMI_ACTION_KMDB;
		else if (apic_panic_on_nmi)
			action = NMI_ACTION_PANIC;
		else
			action = NMI_ACTION_IGNORE;
	}

	switch (action) {
	case NMI_ACTION_IGNORE:
		/*
		 * prom_printf is the best shot we have of something which is
		 * problem free from high level/NMI type of interrupts
		 */
		prom_printf("NMI received\n");
		break;

	case NMI_ACTION_PANIC:
		/* Keep panic from entering kmdb.
 */
		nopanicdebug = 1;
		panic("NMI received\n");
		break;

	case NMI_ACTION_KMDB:
	default:
		debug_enter("NMI received: entering kmdb\n");
		break;
	}

	lock_clear(&apic_nmi_lock);
	return (DDI_INTR_CLAIMED);
}

/*
 * Return the next configured processor id strictly greater than 'cpu_id',
 * or -1 when there is none.  A 'cpu_id' of -1 restarts the scan at 0.
 */
processorid_t
apic_get_next_processorid(processorid_t cpu_id)
{

	int i;

	if (cpu_id == -1)
		return ((processorid_t)0);

	for (i = cpu_id + 1; i < NCPU; i++) {
		if (apic_cpu_in_range(i))
			return (i);
	}

	return ((processorid_t)-1);
}

/*
 * Hotplug a CPU: validate the APIC/x2APIC ids, pick a cpuid slot
 * (preferring a matching "dirty" slot, then a fresh one, then any free
 * one), map it with ACPI and publish it in apic_cpus/apic_cpumask.
 * Returns 0 with the chosen cpuid in reqp->req.cpu_add.cpuid, or an errno.
 */
int
apic_cpu_add(psm_cpu_request_t *reqp)
{
	int i, rv = 0;
	ulong_t iflag;
	boolean_t first = B_TRUE;
	uchar_t localver = 0;
	uint32_t localid, procid;
	processorid_t cpuid = (processorid_t)-1;
	mach_cpu_add_arg_t *ap;

	ASSERT(reqp != NULL);
	reqp->req.cpu_add.cpuid = (processorid_t)-1;

	/* Check whether CPU hotplug is supported. */
	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
		return (ENOTSUP);
	}

	ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
	switch (ap->type) {
	case MACH_CPU_ARG_LOCAL_APIC:
		localid = ap->arg.apic.apic_id;
		procid = ap->arg.apic.proc_id;
		if (localid >= 255 || procid > 255) {
			cmn_err(CE_WARN,
			    "!apic: apicid(%u) or procid(%u) is invalid.",
			    localid, procid);
			return (EINVAL);
		}
		break;

	case MACH_CPU_ARG_LOCAL_X2APIC:
		localid = ap->arg.apic.apic_id;
		procid = ap->arg.apic.proc_id;
		if (localid >= UINT32_MAX) {
			cmn_err(CE_WARN,
			    "!apic: x2apicid(%u) is invalid.", localid);
			return (EINVAL);
		} else if (localid >= 255 && apic_mode == LOCAL_APIC) {
			cmn_err(CE_WARN, "!apic: system is in APIC mode, "
			    "can't support x2APIC processor.");
			return (ENOTSUP);
		}
		break;

	default:
		cmn_err(CE_WARN,
		    "!apic: unknown argument type %d to apic_cpu_add().",
		    ap->type);
		return (EINVAL);
	}

	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	/* Check whether local APIC id already exists. */
	for (i = 0; i < apic_nproc; i++) {
		if (!CPU_IN_SET(apic_cpumask, i))
			continue;
		if (apic_cpus[i].aci_local_id == localid) {
			lock_clear(&apic_ioapic_lock);
			intr_restore(iflag);
			cmn_err(CE_WARN,
			    "!apic: local apic id %u already exists.",
			    localid);
			return (EEXIST);
		} else if (apic_cpus[i].aci_processor_id == procid) {
			lock_clear(&apic_ioapic_lock);
			intr_restore(iflag);
			cmn_err(CE_WARN,
			    "!apic: processor id %u already exists.",
			    (int)procid);	/* NOTE(review): %u fed an (int) cast */
			return (EEXIST);
		}

		/*
		 * There's no local APIC version number available in MADT table,
		 * so assume that all CPUs are homogeneous and use local APIC
		 * version number of the first existing CPU.
		 */
		if (first) {
			first = B_FALSE;
			localver = apic_cpus[i].aci_local_ver;
		}
	}
	ASSERT(first == B_FALSE);

	/*
	 * Try to assign the same cpuid if APIC id exists in the dirty cache.
	 */
	for (i = 0; i < apic_max_nproc; i++) {
		if (CPU_IN_SET(apic_cpumask, i)) {
			ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
			continue;
		}
		ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
		if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
		    apic_cpus[i].aci_local_id == localid &&
		    apic_cpus[i].aci_processor_id == procid) {
			cpuid = i;
			break;
		}
	}

	/* Avoid the dirty cache and allocate fresh slot if possible. */
	if (cpuid == (processorid_t)-1) {
		for (i = 0; i < apic_max_nproc; i++) {
			if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
			    (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
				cpuid = i;
				break;
			}
		}
	}

	/* Try to find any free slot as last resort. */
	if (cpuid == (processorid_t)-1) {
		for (i = 0; i < apic_max_nproc; i++) {
			if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
				cpuid = i;
				break;
			}
		}
	}

	if (cpuid == (processorid_t)-1) {
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		cmn_err(CE_NOTE,
		    "!apic: failed to allocate cpu id for processor %u.",
		    procid);
		rv = EAGAIN;
	} else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		cmn_err(CE_NOTE,
		    "!apic: failed to build mapping for processor %u.",
		    procid);
		rv = EBUSY;
	} else {
		ASSERT(cpuid >= 0 && cpuid < NCPU);
		ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
		bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
		apic_cpus[cpuid].aci_processor_id = procid;
		apic_cpus[cpuid].aci_local_id = localid;
		apic_cpus[cpuid].aci_local_ver = localver;
		CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
		if (cpuid >= apic_nproc) {
			apic_nproc = cpuid + 1;
		}
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		reqp->req.cpu_add.cpuid = cpuid;
	}

	return (rv);
}

/*
 * Unplug a CPU: unmap it from ACPI, shrink apic_nproc if it was the
 * highest cpuid, and park the slot in the "dirty" cache so the same CPU
 * can reclaim its cpuid on re-add.  Returns 0 or an errno.
 */
int
apic_cpu_remove(psm_cpu_request_t *reqp)
{
	int i;
	ulong_t iflag;
	processorid_t cpuid;

	/* Check whether CPU hotplug is supported. */
	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
		return (ENOTSUP);
	}

	cpuid = reqp->req.cpu_remove.cpuid;

	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu.
 */
	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	if (!apic_cpu_in_range(cpuid)) {
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		cmn_err(CE_WARN,
		    "!apic: cpuid %d doesn't exist in apic_cpus array.",
		    cpuid);
		return (ENODEV);
	}
	ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);

	if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		return (ENOENT);
	}

	if (cpuid == apic_nproc - 1) {
		/*
		 * We are removing the highest numbered cpuid so we need to
		 * find the next highest cpuid as the new value for apic_nproc.
		 */
		for (i = apic_nproc; i > 0; i--) {
			if (CPU_IN_SET(apic_cpumask, i - 1)) {
				apic_nproc = i;
				break;
			}
		}
		/* at least one CPU left */
		ASSERT(i > 0);
	}
	CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
	/* mark slot as free and keep it in the dirty cache */
	apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;

	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	return (0);
}

/*
 * Return the number of ticks the APIC decrements in SF nanoseconds.
 * The fixed-frequency PIT (aka 8254) is used for the measurement.
 */
static uint64_t
apic_calibrate_pit(void)
{
	uint8_t		pit_tick_lo;
	uint16_t	pit_tick, target_pit_tick, pit_ticks_adj;
	uint32_t	pit_ticks;
	uint32_t	start_apic_tick, end_apic_tick, apic_ticks;
	ulong_t		iflag;

	if (pit_is_broken)
		return (0);	/* callers treat 0 as "no PIT measurement" */

	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);

	iflag = intr_clear();

	/*
	 * Put the PIT in mode 0, "Interrupt On Terminal Count":
	 */
	outb(PITCTL_PORT, PIT_C0 | PIT_LOADMODE | PIT_ENDSIGMODE);

	/*
	 * The PIT counts down and then the counter value wraps around.  Load
	 * the maximum counter value:
	 */
	outb(PITCTR0_PORT, 0xFF);
	outb(PITCTR0_PORT, 0xFF);

	/* read low then high byte; loop until both bytes are in-range */
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick < APIC_TIME_MIN ||
	    pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);

	/*
	 * Wait for the PIT to decrement by 5 ticks to ensure
	 * we didn't start in the middle of a tick.
	 * Compare with 0x10 for the wrap around case.
	 */
	target_pit_tick = pit_tick - 5;
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	/*
	 * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
	 */
	target_pit_tick = pit_tick - APIC_TIME_COUNT;
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	intr_restore(iflag);

	apic_ticks = start_apic_tick - end_apic_tick;

	/* The PIT might have decremented by more ticks than planned */
	pit_ticks_adj = target_pit_tick - pit_tick;
	/* total number of PIT ticks corresponding to apic_ticks */
	pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;

	/*
	 * Determine the number of nanoseconds per APIC clock tick
	 * and then determine how many APIC ticks to interrupt at the
	 * desired frequency
	 * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
	 * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
	 * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
	 * apic_ticks_per_SFns =
	 *   (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
	 */
	return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
}

/*
 * Return the number of ticks the APIC decrements in SF nanoseconds.
 * The TSC is used for the measurement.
 */
static uint64_t
apic_calibrate_tsc(void)
{
	uint64_t tsc_now, tsc_end, tsc_amt, tsc_hz;
	uint64_t apic_ticks;
	uint32_t start_apic_tick, end_apic_tick;
	ulong_t iflag;

	tsc_hz = tsc_get_freq();

	/*
	 * APIC_TIME_COUNT is in i8254 PIT ticks, which have a period
	 * slightly under 1us. We can just treat the value as the number of
	 * microseconds for our sampling period -- that is we wait
	 * APIC_TIME_COUNT microseconds (corresponding to 'tsc_amt' of TSC
	 * ticks).
	 */
	tsc_amt = tsc_hz * APIC_TIME_COUNT / MICROSEC;

	/* Program the APIC timer to count down from its maximum value. */
	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);

	/*
	 * Disable interrupts for the sampling window so the TSC spin and
	 * the APIC counter reads are not perturbed.
	 */
	iflag = intr_clear();

	tsc_now = tsc_read();
	tsc_end = tsc_now + tsc_amt;
	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	/* Busy-wait on the TSC for (approximately) tsc_amt ticks. */
	while (tsc_now < tsc_end)
		tsc_now = tsc_read();

	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	intr_restore(iflag);

	/* The APIC timer counts down, so start >= end over the window. */
	apic_ticks = start_apic_tick - end_apic_tick;

	/*
	 * We likely did not wait exactly APIC_TIME_COUNT microseconds, but
	 * slightly longer. Add the additional amount to tsc_amt.
	 */
	tsc_amt += tsc_now - tsc_end;

	/*
	 * This calculation is analogous to the one used with the PIT.
	 * However, due to the typically _much_ higher precision of the
	 * TSC compared to the PIT, we have to be careful we do not overflow.
	 *
	 * Since contemporary APIC timers have frequencies on the order of
	 * tens of MHz (i.e. 66MHz), we calculate that first. Then we
	 * scale the result by SF (because the caller wants it scaled by
	 * that amount), then convert the result to scaled (SF) ticks per ns.
	 *
	 */
	uint64_t apic_freq = apic_ticks * tsc_hz / tsc_amt;

	return (apic_freq * SF / NANOSEC);
}

/*
 * Return the number of ticks the APIC decrements in SF nanoseconds.
 * Several measurements are taken to filter out outliers.
 */
uint64_t
apic_calibrate()
{
	uint64_t measurements[APIC_CALIBRATE_MEASUREMENTS];
	int median_idx;
	uint64_t median;

	/*
	 * When running under a virtual machine, the emulated PIT and APIC
	 * counters do not always return the right values and can roll over.
	 * Those spurious measurements are relatively rare but could
	 * significantly affect the calibration.
	 * Therefore we take several measurements and then keep the median.
	 * The median is preferred to the average here as we only want to
	 * discard outliers.
	 *
	 * Traditionally, only the PIT was used to calibrate the APIC as the
	 * the TSC was not calibrated at this point in the boot process (or
	 * on even (much, much) older systems, possibly not present). On
	 * newer systems, the PIT is not always present. We now default to
	 * using the TSC (since it's now calibrated early enough in the boot
	 * process to be usable), but for debugging purposes as we transition,
	 * we still try to use the PIT and record those values. On systems
	 * without a functioning PIT, the PIT measurements will always be 0.
	 */
	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
		apic_info_tsc[i] = apic_calibrate_tsc();
		apic_info_pit[i] = apic_calibrate_pit();

		if (apic_calibrate_use_pit) {
			if (pit_is_broken) {
				panic("Failed to calibrate APIC due to broken "
				    "PIT");
			}
			measurements[i] = apic_info_pit[i];
		} else {
			measurements[i] = apic_info_tsc[i];
		}
	}

	/*
	 * sort results and retrieve median.
	 * (A simple selection-style sort; the sample count is small and
	 * fixed, so O(n^2) is irrelevant here.)
	 */
	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
		for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
			if (measurements[j] < measurements[i]) {
				uint64_t tmp = measurements[i];
				measurements[i] = measurements[j];
				measurements[j] = tmp;
			}
		}
	}
	median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
	median = measurements[median_idx];

#if (APIC_CALIBRATE_MEASUREMENTS >= 3)
	/*
	 * Check that measurements are consistent. Post a warning
	 * if the three middle values are not close to each other.
	 */
	uint64_t delta_warn = median *
	    APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
	if ((median - measurements[median_idx - 1]) > delta_warn ||
	    (measurements[median_idx + 1] - median) > delta_warn) {
		cmn_err(CE_WARN, "apic_calibrate measurements lack "
		    "precision: %llu, %llu, %llu.",
		    (u_longlong_t)measurements[median_idx - 1],
		    (u_longlong_t)median,
		    (u_longlong_t)measurements[median_idx + 1]);
	}
#endif

	return (median);
}

/*
 * Initialise the APIC timer on the local APIC of CPU 0 to the desired
 * frequency. Note at this stage in the boot sequence, the boot processor
 * is the only active processor.
 * hertz value of 0 indicates a one-shot mode request. In this case
 * the function returns the resolution (in nanoseconds) for the hardware
 * timer interrupt. If one-shot mode capability is not available,
 * the return value will be 0. apic_enable_oneshot is a global switch
 * for disabling the functionality.
 * A non-zero positive value for hertz indicates a periodic mode request.
 * In this case the hardware will be programmed to generate clock interrupts
 * at hertz frequency and returns the resolution of interrupts in
 * nanosecond.
1356 */ 1357 1358 int 1359 apic_clkinit(int hertz) 1360 { 1361 int ret; 1362 1363 apic_int_busy_mark = (apic_int_busy_mark * 1364 apic_sample_factor_redistribution) / 100; 1365 apic_int_free_mark = (apic_int_free_mark * 1366 apic_sample_factor_redistribution) / 100; 1367 apic_diff_for_redistribution = (apic_diff_for_redistribution * 1368 apic_sample_factor_redistribution) / 100; 1369 1370 ret = apic_timer_init(hertz); 1371 return (ret); 1372 1373 } 1374 1375 /* 1376 * apic_preshutdown: 1377 * Called early in shutdown whilst we can still access filesystems to do 1378 * things like loading modules which will be required to complete shutdown 1379 * after filesystems are all unmounted. 1380 */ 1381 void 1382 apic_preshutdown(int cmd __unused, int fcn __unused) 1383 { 1384 APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n", 1385 cmd, fcn, apic_poweroff_method, apic_enable_acpi)); 1386 } 1387 1388 void 1389 apic_shutdown(int cmd, int fcn) 1390 { 1391 int restarts, attempts; 1392 int i; 1393 uchar_t byte; 1394 ulong_t iflag; 1395 1396 hpet_acpi_fini(); 1397 1398 /* Send NMI to all CPUs except self to do per processor shutdown */ 1399 iflag = intr_clear(); 1400 #ifdef DEBUG 1401 APIC_AV_PENDING_SET(); 1402 #else 1403 if (apic_mode == LOCAL_APIC) 1404 APIC_AV_PENDING_SET(); 1405 #endif /* DEBUG */ 1406 apic_shutdown_processors = 1; 1407 apic_reg_ops->apic_write(APIC_INT_CMD1, 1408 AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF); 1409 1410 /* restore cmos shutdown byte before reboot */ 1411 if (apic_cmos_ssb_set) { 1412 outb(CMOS_ADDR, SSB); 1413 outb(CMOS_DATA, 0); 1414 } 1415 1416 ioapic_disable_redirection(); 1417 1418 /* disable apic mode if imcr present */ 1419 if (apic_imcrp) { 1420 outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT); 1421 outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC); 1422 } 1423 1424 apic_disable_local_apic(); 1425 1426 intr_restore(iflag); 1427 1428 /* remainder of function is for shutdown cases only */ 1429 if (cmd != A_SHUTDOWN) 1430 return; 1431 1432 /* 
1433 * Switch system back into Legacy-Mode if using ACPI and 1434 * not powering-off. Some BIOSes need to remain in ACPI-mode 1435 * for power-off to succeed (Dell Dimension 4600) 1436 * Do not disable ACPI while doing fastreboot 1437 */ 1438 if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT) 1439 (void) AcpiDisable(); 1440 1441 if (fcn == AD_FASTREBOOT) { 1442 apic_reg_ops->apic_write(APIC_INT_CMD1, 1443 AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF); 1444 } 1445 1446 /* remainder of function is for shutdown+poweroff case only */ 1447 if (fcn != AD_POWEROFF) 1448 return; 1449 1450 switch (apic_poweroff_method) { 1451 case APIC_POWEROFF_VIA_RTC: 1452 1453 /* select the extended NVRAM bank in the RTC */ 1454 outb(CMOS_ADDR, RTC_REGA); 1455 byte = inb(CMOS_DATA); 1456 outb(CMOS_DATA, (byte | EXT_BANK)); 1457 1458 outb(CMOS_ADDR, PFR_REG); 1459 1460 /* for Predator must toggle the PAB bit */ 1461 byte = inb(CMOS_DATA); 1462 1463 /* 1464 * clear power active bar, wakeup alarm and 1465 * kickstart 1466 */ 1467 byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG); 1468 outb(CMOS_DATA, byte); 1469 1470 /* delay before next write */ 1471 drv_usecwait(1000); 1472 1473 /* for S40 the following would suffice */ 1474 byte = inb(CMOS_DATA); 1475 1476 /* power active bar control bit */ 1477 byte |= PAB_CBIT; 1478 outb(CMOS_DATA, byte); 1479 1480 break; 1481 1482 case APIC_POWEROFF_VIA_ASPEN_BMC: 1483 restarts = 0; 1484 restart_aspen_bmc: 1485 if (++restarts == 3) 1486 break; 1487 attempts = 0; 1488 do { 1489 byte = inb(MISMIC_FLAG_REGISTER); 1490 byte &= MISMIC_BUSY_MASK; 1491 if (byte != 0) { 1492 drv_usecwait(1000); 1493 if (attempts >= 3) 1494 goto restart_aspen_bmc; 1495 ++attempts; 1496 } 1497 } while (byte != 0); 1498 outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS); 1499 byte = inb(MISMIC_FLAG_REGISTER); 1500 byte |= 0x1; 1501 outb(MISMIC_FLAG_REGISTER, byte); 1502 i = 0; 1503 for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0])); 1504 i++) { 1505 attempts = 0; 1506 do { 1507 
byte = inb(MISMIC_FLAG_REGISTER); 1508 byte &= MISMIC_BUSY_MASK; 1509 if (byte != 0) { 1510 drv_usecwait(1000); 1511 if (attempts >= 3) 1512 goto restart_aspen_bmc; 1513 ++attempts; 1514 } 1515 } while (byte != 0); 1516 outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl); 1517 outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data); 1518 byte = inb(MISMIC_FLAG_REGISTER); 1519 byte |= 0x1; 1520 outb(MISMIC_FLAG_REGISTER, byte); 1521 } 1522 break; 1523 1524 case APIC_POWEROFF_VIA_SITKA_BMC: 1525 restarts = 0; 1526 restart_sitka_bmc: 1527 if (++restarts == 3) 1528 break; 1529 attempts = 0; 1530 do { 1531 byte = inb(SMS_STATUS_REGISTER); 1532 byte &= SMS_STATE_MASK; 1533 if ((byte == SMS_READ_STATE) || 1534 (byte == SMS_WRITE_STATE)) { 1535 drv_usecwait(1000); 1536 if (attempts >= 3) 1537 goto restart_sitka_bmc; 1538 ++attempts; 1539 } 1540 } while ((byte == SMS_READ_STATE) || 1541 (byte == SMS_WRITE_STATE)); 1542 outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS); 1543 i = 0; 1544 for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0])); 1545 i++) { 1546 attempts = 0; 1547 do { 1548 byte = inb(SMS_STATUS_REGISTER); 1549 byte &= SMS_IBF_MASK; 1550 if (byte != 0) { 1551 drv_usecwait(1000); 1552 if (attempts >= 3) 1553 goto restart_sitka_bmc; 1554 ++attempts; 1555 } 1556 } while (byte != 0); 1557 outb(sitka_bmc[i].port, sitka_bmc[i].data); 1558 } 1559 break; 1560 1561 case APIC_POWEROFF_NONE: 1562 1563 /* If no APIC direct method, we will try using ACPI */ 1564 if (apic_enable_acpi) { 1565 if (acpi_poweroff() == 1) 1566 return; 1567 } else 1568 return; 1569 1570 break; 1571 } 1572 /* 1573 * Wait a limited time here for power to go off. 1574 * If the power does not go off, then there was a 1575 * problem and we should continue to the halt which 1576 * prints a message for the user to press a key to 1577 * reboot. 
1578 */ 1579 drv_usecwait(7000000); /* wait seven seconds */ 1580 1581 } 1582 1583 cyclic_id_t apic_cyclic_id; 1584 1585 /* 1586 * The following functions are in the platform specific file so that they 1587 * can be different functions depending on whether we are running on 1588 * bare metal or a hypervisor. 1589 */ 1590 1591 /* 1592 * map an apic for memory-mapped access 1593 */ 1594 uint32_t * 1595 mapin_apic(uint32_t addr, size_t len, int flags) 1596 { 1597 return ((void *)psm_map_phys(addr, len, flags)); 1598 } 1599 1600 uint32_t * 1601 mapin_ioapic(uint32_t addr, size_t len, int flags) 1602 { 1603 return (mapin_apic(addr, len, flags)); 1604 } 1605 1606 /* 1607 * unmap an apic 1608 */ 1609 void 1610 mapout_apic(caddr_t addr, size_t len) 1611 { 1612 psm_unmap_phys(addr, len); 1613 } 1614 1615 void 1616 mapout_ioapic(caddr_t addr, size_t len) 1617 { 1618 mapout_apic(addr, len); 1619 } 1620 1621 uint32_t 1622 ioapic_read(int ioapic_ix, uint32_t reg) 1623 { 1624 volatile uint32_t *ioapic; 1625 1626 ioapic = apicioadr[ioapic_ix]; 1627 ioapic[APIC_IO_REG] = reg; 1628 return (ioapic[APIC_IO_DATA]); 1629 } 1630 1631 void 1632 ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value) 1633 { 1634 volatile uint32_t *ioapic; 1635 1636 ioapic = apicioadr[ioapic_ix]; 1637 ioapic[APIC_IO_REG] = reg; 1638 ioapic[APIC_IO_DATA] = value; 1639 } 1640 1641 void 1642 ioapic_write_eoi(int ioapic_ix, uint32_t value) 1643 { 1644 volatile uint32_t *ioapic; 1645 1646 ioapic = apicioadr[ioapic_ix]; 1647 ioapic[APIC_IO_EOI] = value; 1648 } 1649 1650 /* 1651 * Round-robin algorithm to find the next CPU with interrupts enabled. 1652 * It can't share the same static variable apic_next_bind_cpu with 1653 * apic_get_next_bind_cpu(), since that will cause all interrupts to be 1654 * bound to CPU1 at boot time. During boot, only CPU0 is online with 1655 * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu() 1656 * are called. 
However, the pcplusmp driver assumes that there will be
 * boot_ncpus CPUs configured eventually so it tries to distribute all
 * interrupts among CPU0 - CPU[boot_ncpus - 1]. Thus to prevent all
 * interrupts being targeted at CPU1, we need to use a dedicated static
 * variable for find_next_cpu() instead of sharing apic_next_bind_cpu.
 */

processorid_t
apic_find_cpu(int flag)
{
	int i;
	/* Persistent cursor so successive calls rotate through the CPUs. */
	static processorid_t acid = 0;

	/* Find the first CPU with the passed-in flag set */
	for (i = 0; i < apic_nproc; i++) {
		if (++acid >= apic_nproc) {
			acid = 0;
		}
		if (apic_cpu_in_range(acid) &&
		    (apic_cpus[acid].aci_status & flag)) {
			break;
		}
	}

	/* The caller is expected to pass a flag some CPU satisfies. */
	ASSERT((apic_cpus[acid].aci_status & flag) != 0);
	return (acid);
}

/*
 * Hook up the interrupt-remapping (VT-d) ops if the platform provides
 * them, then enable x2APIC mode when available.  The ordering below
 * (enable remapping, then x2APIC, then the directed-EOI handler) matters.
 */
void
apic_intrmap_init(int apic_mode)
{
	int suppress_brdcst_eoi = 0;

	/*
	 * Intel Software Developer's Manual 3A, 10.12.7:
	 *
	 * Routing of device interrupts to local APIC units operating in
	 * x2APIC mode requires use of the interrupt-remapping architecture
	 * specified in the Intel Virtualization Technology for Directed
	 * I/O, Revision 1.3. Because of this, BIOS must enumerate support
	 * for and software must enable this interrupt remapping with
	 * Extended Interrupt Mode Enabled before enabling x2APIC mode in
	 * the local APIC units.
	 *
	 *
	 * In other words, to use the APIC in x2APIC mode, we need interrupt
	 * remapping. Since we don't start up the IOMMU by default, we
	 * won't be able to do any interrupt remapping and therefore have to
	 * use the APIC in traditional 'local APIC' mode with memory mapped
	 * I/O.
	 */

	if (psm_vt_ops != NULL) {
		if (((apic_intrmap_ops_t *)psm_vt_ops)->
		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {

			apic_vt_ops = psm_vt_ops;

			/*
			 * We leverage the interrupt remapping engine to
			 * suppress broadcast EOI; thus we must send the
			 * directed EOI with the directed-EOI handler.
			 */
			if (apic_directed_EOI_supported() == 0) {
				suppress_brdcst_eoi = 1;
			}

			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);

			if (apic_detect_x2apic()) {
				apic_enable_x2apic();
			}

			if (apic_directed_EOI_supported() == 0) {
				apic_set_directed_EOI_handler();
			}
		}
	}
}

/* Shift the destination APIC ID into its field in the RDT high word. */
static void
apic_record_ioapic_rdt(void *intrmap_private __unused, ioapic_rdt_t *irdt)
{
	irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
}

/*
 * Compose the MSI address/data registers: fixed redirection hint,
 * physical destination mode, edge trigger.
 */
static void
apic_record_msi(void *intrmap_private __unused, msi_regs_t *mregs)
{
	mregs->mr_addr = MSI_ADDR_HDR |
	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
	    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
	mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
	    mregs->mr_data;
}

/*
 * Functions from apic_introp.c
 *
 * Those functions are used by apic_intr_ops().
 */

/*
 * MSI support flag:
 * reflects whether MSI is supported at APIC level
 * it can also be patched through /etc/system
 *
 *	0 = default value - don't know and need to call apic_check_msi_support()
 *	    to find out then set it accordingly
 *	1 = supported
 *	-1 = not supported
 */
int	apic_support_msi = 0;

/* Multiple vector support for MSI-X */
int	apic_msix_enable = 1;

/* Multiple vector support for MSI */
int	apic_multi_msi_enable = 1;

/*
 * Check whether the system supports MSI.
1779 * 1780 * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find 1781 * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we 1782 * return PSM_SUCCESS to indicate this system supports MSI. 1783 * 1784 * (Currently the only way we check whether a given PCI bus supports >= 2.2 is 1785 * by detecting if we are running inside the KVM hypervisor, which guarantees 1786 * this version number.) 1787 */ 1788 int 1789 apic_check_msi_support() 1790 { 1791 dev_info_t *cdip; 1792 char dev_type[16]; 1793 int dev_len; 1794 int hwenv = get_hwenv(); 1795 1796 DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n")); 1797 1798 /* 1799 * check whether the first level children of root_node have 1800 * PCI-E or PCI capability. 1801 */ 1802 for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL; 1803 cdip = ddi_get_next_sibling(cdip)) { 1804 1805 DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p," 1806 " driver: %s, binding: %s, nodename: %s\n", (void *)cdip, 1807 ddi_driver_name(cdip), ddi_binding_name(cdip), 1808 ddi_node_name(cdip))); 1809 dev_len = sizeof (dev_type); 1810 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS, 1811 "device_type", (caddr_t)dev_type, &dev_len) 1812 != DDI_PROP_SUCCESS) 1813 continue; 1814 if (strcmp(dev_type, "pciex") == 0) 1815 return (PSM_SUCCESS); 1816 if (strcmp(dev_type, "pci") == 0 && 1817 (hwenv == HW_KVM || hwenv == HW_BHYVE)) 1818 return (PSM_SUCCESS); 1819 } 1820 1821 /* MSI is not supported on this system */ 1822 DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' " 1823 "device_type found\n")); 1824 return (PSM_FAILURE); 1825 } 1826 1827 /* 1828 * apic_pci_msi_unconfigure: 1829 * 1830 * This and next two interfaces are copied from pci_intr_lib.c 1831 * Do ensure that these two files stay in sync. 1832 * These needed to be copied over here to avoid a deadlock situation on 1833 * certain mp systems that use MSI interrupts. 
1834 * 1835 * IMPORTANT regards next three interfaces: 1836 * i) are called only for MSI/X interrupts. 1837 * ii) called with interrupts disabled, and must not block 1838 */ 1839 void 1840 apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum) 1841 { 1842 ushort_t msi_ctrl; 1843 int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); 1844 ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); 1845 1846 ASSERT((handle != NULL) && (cap_ptr != 0)); 1847 1848 if (type == DDI_INTR_TYPE_MSI) { 1849 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); 1850 msi_ctrl &= (~PCI_MSI_MME_MASK); 1851 pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); 1852 pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0); 1853 1854 if (msi_ctrl & PCI_MSI_64BIT_MASK) { 1855 pci_config_put16(handle, 1856 cap_ptr + PCI_MSI_64BIT_DATA, 0); 1857 pci_config_put32(handle, 1858 cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0); 1859 } else { 1860 pci_config_put16(handle, 1861 cap_ptr + PCI_MSI_32BIT_DATA, 0); 1862 } 1863 1864 } else if (type == DDI_INTR_TYPE_MSIX) { 1865 uintptr_t off; 1866 uint32_t mask; 1867 ddi_intr_msix_t *msix_p = i_ddi_get_msix(rdip); 1868 1869 ASSERT(msix_p != NULL); 1870 1871 /* Offset into "inum"th entry in the MSI-X table & mask it */ 1872 off = (uintptr_t)msix_p->msix_tbl_addr + (inum * 1873 PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET; 1874 1875 mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off); 1876 1877 ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1)); 1878 1879 /* Offset into the "inum"th entry in the MSI-X table */ 1880 off = (uintptr_t)msix_p->msix_tbl_addr + 1881 (inum * PCI_MSIX_VECTOR_SIZE); 1882 1883 /* Reset the "data" and "addr" bits */ 1884 ddi_put32(msix_p->msix_tbl_hdl, 1885 (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0); 1886 ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0); 1887 } 1888 } 1889 1890 /* 1891 * apic_pci_msi_disable_mode: 1892 */ 1893 void 1894 apic_pci_msi_disable_mode(dev_info_t *rdip, int type) 1895 
{ 1896 ushort_t msi_ctrl; 1897 int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); 1898 ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); 1899 1900 ASSERT((handle != NULL) && (cap_ptr != 0)); 1901 1902 if (type == DDI_INTR_TYPE_MSI) { 1903 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); 1904 if (!(msi_ctrl & PCI_MSI_ENABLE_BIT)) 1905 return; 1906 1907 msi_ctrl &= ~PCI_MSI_ENABLE_BIT; /* MSI disable */ 1908 pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); 1909 1910 } else if (type == DDI_INTR_TYPE_MSIX) { 1911 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL); 1912 if (msi_ctrl & PCI_MSIX_ENABLE_BIT) { 1913 msi_ctrl &= ~PCI_MSIX_ENABLE_BIT; 1914 pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL, 1915 msi_ctrl); 1916 } 1917 } 1918 } 1919 1920 uint32_t 1921 apic_get_localapicid(uint32_t cpuid) 1922 { 1923 ASSERT(cpuid < apic_nproc && apic_cpus != NULL); 1924 1925 return (apic_cpus[cpuid].aci_local_id); 1926 } 1927 1928 uchar_t 1929 apic_get_ioapicid(uchar_t ioapicindex) 1930 { 1931 ASSERT(ioapicindex < MAX_IO_APIC); 1932 1933 return (apic_io_id[ioapicindex]); 1934 } 1935