1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2017 Joyent, Inc. 24 */ 25 /* 26 * Copyright (c) 2010, Intel Corporation. 27 * All rights reserved. 28 */ 29 30 /* 31 * Copyright (c) 2018, Joyent, Inc. 32 */ 33 34 /* 35 * PSMI 1.1 extensions are supported only in 2.6 and later versions. 36 * PSMI 1.2 extensions are supported only in 2.7 and later versions. 37 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. 38 * PSMI 1.5 extensions are supported in Solaris Nevada. 39 * PSMI 1.6 extensions are supported in Solaris Nevada. 40 * PSMI 1.7 extensions are supported in Solaris Nevada. 41 */ 42 #define PSMI_1_7 43 44 #include <sys/processor.h> 45 #include <sys/time.h> 46 #include <sys/psm.h> 47 #include <sys/smp_impldefs.h> 48 #include <sys/inttypes.h> 49 #include <sys/cram.h> 50 #include <sys/acpi/acpi.h> 51 #include <sys/acpica.h> 52 #include <sys/psm_common.h> 53 #include <sys/apic.h> 54 #include <sys/apic_common.h> 55 #include <sys/pit.h> 56 #include <sys/ddi.h> 57 #include <sys/sunddi.h> 58 #include <sys/ddi_impldefs.h> 59 #include <sys/pci.h> 60 #include <sys/promif.h> 61 #include <sys/x86_archext.h> 62 #include <sys/cpc_impl.h> 63 #include <sys/uadmin.h> 64 #include <sys/panic.h> 65 #include <sys/debug.h> 66 #include <sys/archsystm.h> 67 #include <sys/trap.h> 68 #include <sys/machsystm.h> 69 #include <sys/cpuvar.h> 70 #include <sys/rm_platter.h> 71 #include <sys/privregs.h> 72 #include <sys/cyclic.h> 73 #include <sys/note.h> 74 #include <sys/pci_intr_lib.h> 75 #include <sys/sunndi.h> 76 #include <sys/hpet.h> 77 #include <sys/clock.h> 78 79 /* 80 * Part of mp_platform_common.c that's used only by pcplusmp & xpv_psm 81 * but not apix.
82 * These functions may be moved to xpv_psm later when apix and pcplusmp 83 * are merged together 84 */ 85 86 /* 87 * Local Function Prototypes 88 */ 89 static void apic_mark_vector(uchar_t oldvector, uchar_t newvector); 90 static void apic_xlate_vector_free_timeout_handler(void *arg); 91 static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, 92 int new_bind_cpu, int apicindex, int intin_no, int which_irq, 93 struct ioapic_reprogram_data *drep); 94 static int apic_setup_irq_table(dev_info_t *dip, int irqno, 95 struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp, 96 int type); 97 static void apic_try_deferred_reprogram(int ipl, int vect); 98 static void delete_defer_repro_ent(int which_irq); 99 static void apic_ioapic_wait_pending_clear(int ioapicindex, 100 int intin_no); 101 102 extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, 103 int ipin, int *pci_irqp, iflag_t *intr_flagp); 104 extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, 105 int child_ipin, struct apic_io_intr **intrp); 106 extern uchar_t acpi_find_ioapic(int irq); 107 extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); 108 extern int apic_find_bus_id(int bustype); 109 extern int apic_find_intin(uchar_t ioapic, uchar_t intin); 110 extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); 111 112 extern int apic_sci_vect; 113 extern iflag_t apic_sci_flags; 114 /* ACPI HPET interrupt configuration; -1 if HPET not used */ 115 extern int apic_hpet_vect; 116 extern iflag_t apic_hpet_flags; 117 extern int apic_intr_policy; 118 extern char *psm_name; 119 120 /* Max wait time (in repetitions) for flags to clear in an RDT entry. */ 121 extern int apic_max_reps_clear_pending; 122 123 /* The irq # is implicit in the array index: */ 124 struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1]; 125 /* 126 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info 127 * is indexed by IRQ number, NOT by vector number. 128 */ 129 130 extern int apic_int_busy_mark; 131 extern int apic_int_free_mark; 132 extern int apic_diff_for_redistribution; 133 extern int apic_sample_factor_redistribution; 134 extern int apic_redist_cpu_skip; 135 extern int apic_num_imbalance; 136 extern int apic_num_rebind; 137 138 /* timeout for xlate_vector, mark_vector */ 139 int apic_revector_timeout = 16 * 10000; /* 160 millisec */ 140 141 extern int apic_defconf; 142 extern int apic_irq_translate; 143 144 extern int apic_use_acpi_madt_only; /* 1=ONLY use MADT from ACPI */ 145 146 extern uchar_t apic_io_vectbase[MAX_IO_APIC]; 147 148 extern boolean_t ioapic_mask_workaround[MAX_IO_APIC]; 149 150 /* 151 * First available slot to be used as IRQ index into the apic_irq_table 152 * for those interrupts (like MSI/X) that don't have a physical IRQ. 153 */ 154 extern int apic_first_avail_irq; 155 156 /* 157 * apic_defer_reprogram_lock ensures that only one processor is handling 158 * deferred interrupt programming at *_intr_exit time. 
159 */ 160 static lock_t apic_defer_reprogram_lock; 161 162 /* 163 * The current number of deferred reprogrammings outstanding 164 */ 165 uint_t apic_reprogram_outstanding = 0; 166 167 #ifdef DEBUG 168 /* 169 * Counters that keep track of deferred reprogramming stats 170 */ 171 uint_t apic_intr_deferrals = 0; 172 uint_t apic_intr_deliver_timeouts = 0; 173 uint_t apic_last_ditch_reprogram_failures = 0; 174 uint_t apic_deferred_setup_failures = 0; 175 uint_t apic_defer_repro_total_retries = 0; 176 uint_t apic_defer_repro_successes = 0; 177 uint_t apic_deferred_spurious_enters = 0; 178 #endif 179 180 extern int apic_io_max; 181 extern struct apic_io_intr *apic_io_intrp; 182 183 uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; 184 185 extern uint32_t eisa_level_intr_mask; 186 /* At least MSB will be set if EISA bus */ 187 188 extern int apic_pci_bus_total; 189 extern uchar_t apic_single_pci_busid; 190 191 /* 192 * Following declarations are for revectoring; used when ISRs at different 193 * IPLs share an irq. 194 */ 195 static lock_t apic_revector_lock; 196 int apic_revector_pending = 0; 197 static uchar_t *apic_oldvec_to_newvec; 198 static uchar_t *apic_newvec_to_oldvec; 199 200 /* ACPI Interrupt Source Override Structure ptr */ 201 extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; 202 extern int acpi_iso_cnt; 203 204 /* 205 * Auto-configuration routines 206 */ 207 208 /* 209 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable 210 * are also set to NULL. vector->irq is set to a value which cannot map 211 * to a real irq to show that it is free. 212 */ 213 void 214 apic_init_common(void) 215 { 216 int i, j, indx; 217 int *iptr; 218 219 /* 220 * Initialize apic_ipls from apic_vectortoipl. This array is 221 * used in apic_intr_enter to determine the IPL to use for the 222 * corresponding vector. On some systems, due to hardware errata 223 * and interrupt sharing, the IPL may not correspond to the IPL listed 224 * in apic_vectortoipl (see apic_addspl and apic_delspl). 225 */ 226 for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { 227 indx = i * APIC_VECTOR_PER_IPL; 228 229 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) 230 apic_ipls[indx] = apic_vectortoipl[i]; 231 } 232 233 /* cpu 0 is always up (for now) */ 234 apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; 235 236 iptr = (int *)&apic_irq_table[0]; 237 for (i = 0; i <= APIC_MAX_VECTOR; i++) { 238 apic_level_intr[i] = 0; 239 *iptr++ = 0; 240 apic_vector_to_irq[i] = APIC_RESV_IRQ; 241 242 /* These *must* be initted to B_TRUE! */ 243 apic_reprogram_info[i].done = B_TRUE; 244 apic_reprogram_info[i].irqp = NULL; 245 apic_reprogram_info[i].tries = 0; 246 apic_reprogram_info[i].bindcpu = 0; 247 } 248 249 /* 250 * Allocate a dummy irq table entry for the reserved entry. 251 * This takes care of the race between removing an irq and 252 * clock detecting a CPU in that irq during interrupt load 253 * sampling. 
254 */ 255 apic_irq_table[APIC_RESV_IRQ] = 256 kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 257 258 mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); 259 } 260 261 void 262 ioapic_init_intr(int mask_apic) 263 { 264 int ioapic_ix; 265 struct intrspec ispec; 266 apic_irq_t *irqptr; 267 int i, j; 268 ulong_t iflag; 269 270 LOCK_INIT_CLEAR(&apic_revector_lock); 271 LOCK_INIT_CLEAR(&apic_defer_reprogram_lock); 272 273 /* mask interrupt vectors */ 274 for (j = 0; j < apic_io_max && mask_apic; j++) { 275 int intin_max; 276 277 ioapic_ix = j; 278 /* Bits 23-16 define the maximum redirection entries */ 279 intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16) 280 & 0xff; 281 for (i = 0; i <= intin_max; i++) 282 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK); 283 } 284 285 /* 286 * Hack alert: deal with ACPI SCI interrupt chicken/egg here 287 */ 288 if (apic_sci_vect > 0) { 289 /* 290 * acpica has already done add_avintr(); we just need 291 * to finish the job by mimicking translate_irq() 292 * 293 * Fake up an intrspec and setup the tables 294 */ 295 ispec.intrspec_vec = apic_sci_vect; 296 ispec.intrspec_pri = SCI_IPL; 297 298 if (apic_setup_irq_table(NULL, apic_sci_vect, NULL, 299 &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) { 300 cmn_err(CE_WARN, "!apic: SCI setup failed"); 301 return; 302 } 303 irqptr = apic_irq_table[apic_sci_vect]; 304 305 iflag = intr_clear(); 306 lock_set(&apic_ioapic_lock); 307 308 /* Program I/O APIC */ 309 (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE); 310 311 lock_clear(&apic_ioapic_lock); 312 intr_restore(iflag); 313 314 irqptr->airq_share++; 315 } 316 317 /* 318 * Hack alert: deal with ACPI HPET interrupt chicken/egg here. 319 */ 320 if (apic_hpet_vect > 0) { 321 /* 322 * hpet has already done add_avintr(); we just need 323 * to finish the job by mimicking translate_irq() 324 * 325 * Fake up an intrspec and setup the tables 326 */ 327 ispec.intrspec_vec = apic_hpet_vect; 328 ispec.intrspec_pri = CBE_HIGH_PIL; 329 330 if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL, 331 &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) { 332 cmn_err(CE_WARN, "!apic: HPET setup failed"); 333 return; 334 } 335 irqptr = apic_irq_table[apic_hpet_vect]; 336 337 iflag = intr_clear(); 338 lock_set(&apic_ioapic_lock); 339 340 /* Program I/O APIC */ 341 (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE); 342 343 lock_clear(&apic_ioapic_lock); 344 intr_restore(iflag); 345 346 irqptr->airq_share++; 347 } 348 } 349 350 /* 351 * Add mask bits to disable interrupt vector from happening 352 * at or above IPL. In addition, it should remove mask bits 353 * to enable interrupt vectors below the given IPL. 354 * 355 * Both add and delspl are complicated by the fact that different interrupts 356 * may share IRQs. This can happen in two ways. 357 * 1. The same H/W line is shared by more than 1 device 358 * 1a. with interrupts at different IPLs 359 * 1b. with interrupts at same IPL 360 * 2. We ran out of vectors at a given IPL and started sharing vectors. 361 * 1b and 2 should be handled gracefully, except for the fact that some ISRs 362 * will get called often when no interrupt is pending for the device. 363 * For 1a, we handle it at the higher IPL.
364 */ 365 /*ARGSUSED*/ 366 int 367 apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl) 368 { 369 uchar_t vector; 370 ulong_t iflag; 371 apic_irq_t *irqptr, *irqheadptr; 372 int irqindex; 373 374 ASSERT(max_ipl <= UCHAR_MAX); 375 irqindex = IRQINDEX(irqno); 376 377 if ((irqindex == -1) || (!apic_irq_table[irqindex])) 378 return (PSM_FAILURE); 379 380 mutex_enter(&airq_mutex); 381 irqptr = irqheadptr = apic_irq_table[irqindex]; 382 383 DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x " 384 "vector=0x%x\n", (void *)irqptr->airq_dip, 385 irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); 386 387 while (irqptr) { 388 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) 389 break; 390 irqptr = irqptr->airq_next; 391 } 392 irqptr->airq_share++; 393 394 mutex_exit(&airq_mutex); 395 396 /* return if it is not a hardware interrupt */ 397 if (irqptr->airq_mps_intr_index == RESERVE_INDEX) 398 return (PSM_SUCCESS); 399 400 /* Or if there are more interrupts at a higher IPL */ 401 if (ipl != max_ipl) 402 return (PSM_SUCCESS); 403 404 /* 405 * if apic_picinit() has not been called yet, just return. 406 * At the end of apic_picinit(), we will call setup_io_intr(). 407 */ 408 409 if (!apic_picinit_called) 410 return (PSM_SUCCESS); 411 412 /* 413 * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate, 414 * return failure. 415 */ 416 if (irqptr->airq_ipl != max_ipl && 417 !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 418 419 vector = apic_allocate_vector(max_ipl, irqindex, 1); 420 if (vector == 0) { 421 irqptr->airq_share--; 422 return (PSM_FAILURE); 423 } 424 irqptr = irqheadptr; 425 apic_mark_vector(irqptr->airq_vector, vector); 426 while (irqptr) { 427 irqptr->airq_vector = vector; 428 irqptr->airq_ipl = (uchar_t)max_ipl; 429 /* 430 * reprogram irq being added and every one else 431 * who is not in the UNINIT state 432 */ 433 if ((VIRTIRQ(irqindex, irqptr->airq_share_id) == 434 irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) { 435 apic_record_rdt_entry(irqptr, irqindex); 436 437 iflag = intr_clear(); 438 lock_set(&apic_ioapic_lock); 439 440 (void) apic_setup_io_intr(irqptr, irqindex, 441 B_FALSE); 442 443 lock_clear(&apic_ioapic_lock); 444 intr_restore(iflag); 445 } 446 irqptr = irqptr->airq_next; 447 } 448 return (PSM_SUCCESS); 449 450 } else if (irqptr->airq_ipl != max_ipl && 451 ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 452 /* 453 * We cannot upgrade the vector, but we can change 454 * the IPL that this vector induces. 455 * 456 * Note that we subtract APIC_BASE_VECT from the vector 457 * here because this array is used in apic_intr_enter 458 * (no need to add APIC_BASE_VECT in that hot code 459 * path since we can do it in the rarely-executed path 460 * here). 461 */ 462 apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] = 463 (uchar_t)max_ipl; 464 465 irqptr = irqheadptr; 466 while (irqptr) { 467 irqptr->airq_ipl = (uchar_t)max_ipl; 468 irqptr = irqptr->airq_next; 469 } 470 471 return (PSM_SUCCESS); 472 } 473 474 ASSERT(irqptr); 475 476 iflag = intr_clear(); 477 lock_set(&apic_ioapic_lock); 478 479 (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE); 480 481 lock_clear(&apic_ioapic_lock); 482 intr_restore(iflag); 483 484 return (PSM_SUCCESS); 485 } 486 487 /* 488 * Recompute mask bits for the given interrupt vector. 489 * If there is no interrupt servicing routine for this 490 * vector, this function should disable interrupt vector 491 * from happening at all IPLs.
If there are still 492 * handlers using the given vector, this function should 493 * disable the given vector from happening below the lowest 494 * IPL of the remaining handlers. 495 */ 496 /*ARGSUSED*/ 497 int 498 apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl) 499 { 500 uchar_t vector; 501 uint32_t bind_cpu; 502 int intin, irqindex; 503 int ioapic_ix; 504 apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp; 505 ulong_t iflag; 506 507 mutex_enter(&airq_mutex); 508 irqindex = IRQINDEX(irqno); 509 irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex]; 510 511 DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x " 512 "vector=0x%x\n", (void *)irqptr->airq_dip, 513 irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); 514 515 while (irqptr) { 516 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) 517 break; 518 preirqptr = irqptr; 519 irqptr = irqptr->airq_next; 520 } 521 ASSERT(irqptr); 522 523 irqptr->airq_share--; 524 525 mutex_exit(&airq_mutex); 526 527 /* 528 * If there are more interrupts at a higher IPL, we don't need 529 * to disable anything. 530 */ 531 if (ipl < max_ipl) 532 return (PSM_SUCCESS); 533 534 /* return if it is not a hardware interrupt */ 535 if (irqptr->airq_mps_intr_index == RESERVE_INDEX) 536 return (PSM_SUCCESS); 537 538 if (!apic_picinit_called) { 539 /* 540 * Clear irq_struct. If two devices shared an intpt 541 * line & 1 unloaded before picinit, we are hosed. But, then 542 * we hope the machine survives. 543 */ 544 irqptr->airq_mps_intr_index = FREE_INDEX; 545 irqptr->airq_temp_cpu = IRQ_UNINIT; 546 apic_free_vector(irqptr->airq_vector); 547 return (PSM_SUCCESS); 548 } 549 /* 550 * Downgrade vector to new max_ipl if needed. If we cannot allocate, 551 * use old IPL. Not very elegant, but it should work. 552 */ 553 if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) && 554 !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 555 apic_irq_t *irqp; 556 if ((vector = apic_allocate_vector(max_ipl, irqno, 1))) { 557 apic_mark_vector(irqheadptr->airq_vector, vector); 558 irqp = irqheadptr; 559 while (irqp) { 560 irqp->airq_vector = vector; 561 irqp->airq_ipl = (uchar_t)max_ipl; 562 if (irqp->airq_temp_cpu != IRQ_UNINIT) { 563 apic_record_rdt_entry(irqp, irqindex); 564 565 iflag = intr_clear(); 566 lock_set(&apic_ioapic_lock); 567 568 (void) apic_setup_io_intr(irqp, 569 irqindex, B_FALSE); 570 571 lock_clear(&apic_ioapic_lock); 572 intr_restore(iflag); 573 } 574 irqp = irqp->airq_next; 575 } 576 } 577 578 } else if (irqptr->airq_ipl != max_ipl && 579 max_ipl != PSM_INVALID_IPL && 580 ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 581 582 /* 583 * We cannot downgrade the IPL of the vector below the vector's 584 * hardware priority. If we did, it would be possible for a 585 * higher-priority hardware vector to interrupt a CPU running at an IPL 586 * lower than the hardware priority of the interrupting vector (but 587 * higher than the soft IPL of this IRQ). When this happens, we would 588 * then try to drop the IPL BELOW what it was (effectively dropping 589 * below base_spl) which would be potentially catastrophic. 590 * 591 * (e.g. Suppose the hardware vector associated with this IRQ is 0x40 592 * (hardware IPL of 4). Further assume that the old IPL of this IRQ 593 * was 4, but the new IPL is 1.
If we forced vector 0x40 to result in 594 * an IPL of 1, it would be possible for the processor to be executing 595 * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting 596 * the currently-executing ISR. When apic_intr_enter consults 597 * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1 598 * so even though the processor was running at IPL 4, an IPL 1 599 * interrupt will have interrupted it, which must not happen)). 600 * 601 * Effectively, this means that the hardware priority corresponding to 602 * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's 603 * hardware priority. 604 * 605 * (In the above example, then, after removal of the IPL 4 device's 606 * interrupt handler, the new IPL will continue to be 4 because the 607 * hardware priority that IPL 1 implies is lower than the hardware 608 * priority of the vector used.) 609 */ 610 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */ 611 const int apic_ipls_index = irqptr->airq_vector - 612 APIC_BASE_VECT; 613 const int vect_inherent_hwpri = irqptr->airq_vector >> 614 APIC_IPL_SHIFT; 615 616 /* 617 * If there are still devices using this IRQ, determine the 618 * new ipl to use. 619 */ 620 if (irqptr->airq_share) { 621 int vect_desired_hwpri, hwpri; 622 623 ASSERT(max_ipl < MAXIPL); 624 vect_desired_hwpri = apic_ipltopri[max_ipl] >> 625 APIC_IPL_SHIFT; 626 627 /* 628 * If the desired IPL's hardware priority is lower 629 * than that of the vector, use the hardware priority 630 * of the vector to determine the new IPL. 631 */ 632 hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ? 633 vect_inherent_hwpri : vect_desired_hwpri; 634 635 /* 636 * Now, to get the right index for apic_vectortoipl, 637 * we need to subtract APIC_BASE_VECT from the 638 * hardware-vector-equivalent (in hwpri). Since hwpri 639 * is already shifted, we shift APIC_BASE_VECT before 640 * doing the subtraction. 641 */ 642 hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT); 643 644 ASSERT(hwpri >= 0); 645 ASSERT(hwpri < MAXIPL); 646 max_ipl = apic_vectortoipl[hwpri]; 647 apic_ipls[apic_ipls_index] = (uchar_t)max_ipl; 648 649 irqp = irqheadptr; 650 while (irqp) { 651 irqp->airq_ipl = (uchar_t)max_ipl; 652 irqp = irqp->airq_next; 653 } 654 } else { 655 /* 656 * No more devices on this IRQ, so reset this vector's 657 * element in apic_ipls to the original IPL for this 658 * vector 659 */ 660 apic_ipls[apic_ipls_index] = 661 apic_vectortoipl[vect_inherent_hwpri]; 662 } 663 } 664 665 /* 666 * If there are still active interrupts, we are done. 
667 */ 668 if (irqptr->airq_share) 669 return (PSM_SUCCESS); 670 671 iflag = intr_clear(); 672 lock_set(&apic_ioapic_lock); 673 674 if (irqptr->airq_mps_intr_index == MSI_INDEX) { 675 /* 676 * Disable the MSI vector 677 * Make sure we only disable on the last 678 * of the multi-MSI support 679 */ 680 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { 681 apic_pci_msi_disable_mode(irqptr->airq_dip, 682 DDI_INTR_TYPE_MSI); 683 } 684 } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) { 685 /* 686 * Disable the MSI-X vector 687 * needs to clear its mask and addr/data for each MSI-X 688 */ 689 apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX, 690 irqptr->airq_origirq); 691 /* 692 * Make sure we only disable on the last MSI-X 693 */ 694 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { 695 apic_pci_msi_disable_mode(irqptr->airq_dip, 696 DDI_INTR_TYPE_MSIX); 697 } 698 } else { 699 /* 700 * The assumption here is that this is safe, even for 701 * systems with IOAPICs that suffer from the hardware 702 * erratum because all devices have been quiesced before 703 * they unregister their interrupt handlers. If that 704 * assumption turns out to be false, this mask operation 705 * can induce the same erratum result we're trying to 706 * avoid. 707 */ 708 ioapic_ix = irqptr->airq_ioapicindex; 709 intin = irqptr->airq_intin_no; 710 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK); 711 } 712 713 apic_vt_ops->apic_intrmap_free_entry(&irqptr->airq_intrmap_private); 714 715 /* 716 * This irq entry is the only one in the chain. 717 */ 718 if (irqheadptr->airq_next == NULL) { 719 ASSERT(irqheadptr == irqptr); 720 bind_cpu = irqptr->airq_temp_cpu; 721 if (((uint32_t)bind_cpu != IRQ_UNBOUND) && 722 ((uint32_t)bind_cpu != IRQ_UNINIT)) { 723 ASSERT(apic_cpu_in_range(bind_cpu)); 724 if (bind_cpu & IRQ_USER_BOUND) { 725 /* If hardbound, temp_cpu == cpu */ 726 bind_cpu &= ~IRQ_USER_BOUND; 727 apic_cpus[bind_cpu].aci_bound--; 728 } else 729 apic_cpus[bind_cpu].aci_temp_bound--; 730 } 731 irqptr->airq_temp_cpu = IRQ_UNINIT; 732 irqptr->airq_mps_intr_index = FREE_INDEX; 733 lock_clear(&apic_ioapic_lock); 734 intr_restore(iflag); 735 apic_free_vector(irqptr->airq_vector); 736 return (PSM_SUCCESS); 737 } 738 739 /* 740 * If we get here, we are sharing the vector and there are more than 741 * one active irq entries in the chain. 742 */ 743 lock_clear(&apic_ioapic_lock); 744 intr_restore(iflag); 745 746 mutex_enter(&airq_mutex); 747 /* Remove the irq entry from the chain */ 748 if (irqptr == irqheadptr) { /* The irq entry is at the head */ 749 apic_irq_table[irqindex] = irqptr->airq_next; 750 } else { 751 preirqptr->airq_next = irqptr->airq_next; 752 } 753 /* Free the irq entry */ 754 kmem_free(irqptr, sizeof (apic_irq_t)); 755 mutex_exit(&airq_mutex); 756 757 return (PSM_SUCCESS); 758 } 759 760 /* 761 * apic_introp_xlate() replaces apic_translate_irq() and is 762 * called only from apic_intr_ops(). With the new ADII framework, 763 * the priority can no longer be retrieved through i_ddi_get_intrspec(). 764 * It has to be passed in from the caller. 
765 * 766 * Return value: 767 * Success: irqno for the given device 768 * Failure: -1 769 */ 770 int 771 apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type) 772 { 773 char dev_type[16]; 774 int dev_len, pci_irq, newirq, bustype, devid, busid, i; 775 int irqno = ispec->intrspec_vec; 776 ddi_acc_handle_t cfg_handle; 777 uchar_t ipin; 778 struct apic_io_intr *intrp; 779 iflag_t intr_flag; 780 ACPI_SUBTABLE_HEADER *hp; 781 ACPI_MADT_INTERRUPT_OVERRIDE *isop; 782 apic_irq_t *airqp; 783 int parent_is_pci_or_pciex = 0; 784 int child_is_pciex = 0; 785 786 DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s " 787 "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type, 788 irqno)); 789 790 dev_len = sizeof (dev_type); 791 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip), 792 DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type, 793 &dev_len) == DDI_PROP_SUCCESS) { 794 if ((strcmp(dev_type, "pci") == 0) || 795 (strcmp(dev_type, "pciex") == 0)) 796 parent_is_pci_or_pciex = 1; 797 } 798 799 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, 800 DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type, 801 &dev_len) == DDI_PROP_SUCCESS) { 802 if (strstr(dev_type, "pciex")) 803 child_is_pciex = 1; 804 } 805 806 if (DDI_INTR_IS_MSI_OR_MSIX(type)) { 807 if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) { 808 airqp->airq_iflag.bustype = 809 child_is_pciex ? BUS_PCIE : BUS_PCI; 810 return (apic_vector_to_irq[airqp->airq_vector]); 811 } 812 return (apic_setup_irq_table(dip, irqno, NULL, ispec, 813 NULL, type)); 814 } 815 816 bustype = 0; 817 818 /* check if we have already translated this irq */ 819 mutex_enter(&airq_mutex); 820 newirq = apic_min_device_irq; 821 for (; newirq <= apic_max_device_irq; newirq++) { 822 airqp = apic_irq_table[newirq]; 823 while (airqp) { 824 if ((airqp->airq_dip == dip) && 825 (airqp->airq_origirq == irqno) && 826 (airqp->airq_mps_intr_index != FREE_INDEX)) { 827 828 mutex_exit(&airq_mutex); 829 return (VIRTIRQ(newirq, airqp->airq_share_id)); 830 } 831 airqp = airqp->airq_next; 832 } 833 } 834 mutex_exit(&airq_mutex); 835 836 if (apic_defconf) 837 goto defconf; 838 839 if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) 840 goto nonpci; 841 842 if (parent_is_pci_or_pciex) { 843 /* pci device */ 844 if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) 845 goto nonpci; 846 if (busid == 0 && apic_pci_bus_total == 1) 847 busid = (int)apic_single_pci_busid; 848 849 if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) 850 return (-1); 851 ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; 852 pci_config_teardown(&cfg_handle); 853 if (apic_enable_acpi && !apic_use_acpi_madt_only) { 854 if (apic_acpi_translate_pci_irq(dip, busid, devid, 855 ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) 856 return (-1); 857 858 intr_flag.bustype = child_is_pciex ? 
BUS_PCIE : BUS_PCI; 859 return (apic_setup_irq_table(dip, pci_irq, NULL, ispec, 860 &intr_flag, type)); 861 } else { 862 pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); 863 if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) 864 == NULL) { 865 if ((pci_irq = apic_handle_pci_pci_bridge(dip, 866 devid, ipin, &intrp)) == -1) 867 return (-1); 868 } 869 return (apic_setup_irq_table(dip, pci_irq, intrp, ispec, 870 NULL, type)); 871 } 872 } else if (strcmp(dev_type, "isa") == 0) 873 bustype = BUS_ISA; 874 else if (strcmp(dev_type, "eisa") == 0) 875 bustype = BUS_EISA; 876 877 nonpci: 878 if (apic_enable_acpi && !apic_use_acpi_madt_only) { 879 /* search iso entries first */ 880 if (acpi_iso_cnt != 0) { 881 hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; 882 i = 0; 883 while (i < acpi_iso_cnt) { 884 if (hp->Type == 885 ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { 886 isop = 887 (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; 888 if (isop->Bus == 0 && 889 isop->SourceIrq == irqno) { 890 newirq = isop->GlobalIrq; 891 intr_flag.intr_po = 892 isop->IntiFlags & 893 ACPI_MADT_POLARITY_MASK; 894 intr_flag.intr_el = 895 (isop->IntiFlags & 896 ACPI_MADT_TRIGGER_MASK) 897 >> 2; 898 intr_flag.bustype = BUS_ISA; 899 900 return (apic_setup_irq_table( 901 dip, newirq, NULL, ispec, 902 &intr_flag, type)); 903 904 } 905 i++; 906 } 907 hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + 908 hp->Length); 909 } 910 } 911 intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; 912 intr_flag.intr_el = INTR_EL_EDGE; 913 intr_flag.bustype = BUS_ISA; 914 return (apic_setup_irq_table(dip, irqno, NULL, ispec, 915 &intr_flag, type)); 916 } else { 917 if (bustype == 0) /* not initialized */ 918 bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA; 919 for (i = 0; i < 2; i++) { 920 if (((busid = apic_find_bus_id(bustype)) != -1) && 921 ((intrp = apic_find_io_intr_w_busid(irqno, busid)) 922 != NULL)) { 923 if ((newirq = apic_setup_irq_table(dip, irqno, 924 intrp, ispec, NULL, type)) != -1) { 925 return (newirq); 926 } 927 goto defconf; 928 } 929 bustype = (bustype == BUS_EISA) ? 
BUS_ISA : BUS_EISA; 930 } 931 } 932 933 /* MPS default configuration */ 934 defconf: 935 newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type); 936 if (newirq == -1) 937 return (-1); 938 ASSERT(IRQINDEX(newirq) == irqno); 939 ASSERT(apic_irq_table[irqno]); 940 return (newirq); 941 } 942 943 /* 944 * Attempt to share vector with someone else 945 */ 946 static int 947 apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl, 948 uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp) 949 { 950 #ifdef DEBUG 951 apic_irq_t *tmpirqp = NULL; 952 #endif /* DEBUG */ 953 apic_irq_t *irqptr, dummyirq; 954 int newirq, chosen_irq = -1, share = 127; 955 int lowest, highest, i; 956 uchar_t share_id; 957 958 DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x " 959 "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl)); 960 961 highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK; 962 lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL; 963 964 if (highest < lowest) /* Both ipl and ipl-1 map to same pri */ 965 lowest -= APIC_VECTOR_PER_IPL; 966 dummyirq.airq_mps_intr_index = intr_index; 967 dummyirq.airq_ioapicindex = ioapicindex; 968 dummyirq.airq_intin_no = ipin; 969 if (intr_flagp) 970 dummyirq.airq_iflag = *intr_flagp; 971 apic_record_rdt_entry(&dummyirq, irqno); 972 for (i = lowest; i <= highest; i++) { 973 newirq = apic_vector_to_irq[i]; 974 if (newirq == APIC_RESV_IRQ) 975 continue; 976 irqptr = apic_irq_table[newirq]; 977 978 if ((dummyirq.airq_rdt_entry & 0xFF00) != 979 (irqptr->airq_rdt_entry & 0xFF00)) 980 /* not compatible */ 981 continue; 982 983 if (irqptr->airq_share < share) { 984 share = irqptr->airq_share; 985 chosen_irq = newirq; 986 } 987 } 988 if (chosen_irq != -1) { 989 /* 990 * Assign a share id which is free or which is larger 991 * than the largest one. 992 */ 993 share_id = 1; 994 mutex_enter(&airq_mutex); 995 irqptr = apic_irq_table[chosen_irq]; 996 while (irqptr) { 997 if (irqptr->airq_mps_intr_index == FREE_INDEX) { 998 share_id = irqptr->airq_share_id; 999 break; 1000 } 1001 if (share_id <= irqptr->airq_share_id) 1002 share_id = irqptr->airq_share_id + 1; 1003 #ifdef DEBUG 1004 tmpirqp = irqptr; 1005 #endif /* DEBUG */ 1006 irqptr = irqptr->airq_next; 1007 } 1008 if (!irqptr) { 1009 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 1010 irqptr->airq_temp_cpu = IRQ_UNINIT; 1011 irqptr->airq_next = 1012 apic_irq_table[chosen_irq]->airq_next; 1013 apic_irq_table[chosen_irq]->airq_next = irqptr; 1014 #ifdef DEBUG 1015 tmpirqp = apic_irq_table[chosen_irq]; 1016 #endif /* DEBUG */ 1017 } 1018 irqptr->airq_mps_intr_index = intr_index; 1019 irqptr->airq_ioapicindex = ioapicindex; 1020 irqptr->airq_intin_no = ipin; 1021 if (intr_flagp) 1022 irqptr->airq_iflag = *intr_flagp; 1023 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; 1024 irqptr->airq_share_id = share_id; 1025 apic_record_rdt_entry(irqptr, irqno); 1026 *irqptrp = irqptr; 1027 #ifdef DEBUG 1028 /* shuffle the pointers to test apic_delspl path */ 1029 if (tmpirqp) { 1030 tmpirqp->airq_next = irqptr->airq_next; 1031 irqptr->airq_next = apic_irq_table[chosen_irq]; 1032 apic_irq_table[chosen_irq] = irqptr; 1033 } 1034 #endif /* DEBUG */ 1035 mutex_exit(&airq_mutex); 1036 return (VIRTIRQ(chosen_irq, share_id)); 1037 } 1038 return (-1); 1039 } 1040 1041 /* 1042 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry 1043 * is used already, we will try to allocate a new irqno. 
1044 * 1045 * Return value: 1046 * Success: irqno 1047 * Failure: -1 1048 */ 1049 static int 1050 apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp, 1051 struct intrspec *ispec, iflag_t *intr_flagp, int type) 1052 { 1053 int origirq; 1054 uchar_t ipl; 1055 int newirq, intr_index; 1056 uchar_t ipin, ioapic, ioapicindex, vector; 1057 apic_irq_t *irqptr; 1058 major_t major; 1059 dev_info_t *sdip; 1060 1061 ASSERT(ispec != NULL); 1062 1063 origirq = ispec->intrspec_vec; 1064 ipl = ispec->intrspec_pri; 1065 1066 DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d " 1067 "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq)); 1068 1069 major = (dip != NULL) ? ddi_driver_major(dip) : 0; 1070 1071 if (DDI_INTR_IS_MSI_OR_MSIX(type)) { 1072 /* MSI/X doesn't need to setup ioapic stuffs */ 1073 ioapicindex = 0xff; 1074 ioapic = 0xff; 1075 ipin = (uchar_t)0xff; 1076 intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX : 1077 MSIX_INDEX; 1078 mutex_enter(&airq_mutex); 1079 if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) { 1080 mutex_exit(&airq_mutex); 1081 /* need an irq for MSI/X to index into autovect[] */ 1082 cmn_err(CE_WARN, "No interrupt irq: %s instance %d", 1083 ddi_get_name(dip), ddi_get_instance(dip)); 1084 return (-1); 1085 } 1086 mutex_exit(&airq_mutex); 1087 1088 } else if (intrp != NULL) { 1089 intr_index = (int)(intrp - apic_io_intrp); 1090 ioapic = intrp->intr_destid; 1091 ipin = intrp->intr_destintin; 1092 /* Find ioapicindex. If destid was ALL, we will exit with 0. */ 1093 for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--) 1094 if (apic_io_id[ioapicindex] == ioapic) 1095 break; 1096 ASSERT((ioapic == apic_io_id[ioapicindex]) || 1097 (ioapic == INTR_ALL_APIC)); 1098 1099 /* check whether this intin# has been used by another irqno */ 1100 if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) { 1101 return (newirq); 1102 } 1103 1104 } else if (intr_flagp != NULL) { 1105 /* ACPI case */ 1106 intr_index = ACPI_INDEX; 1107 ioapicindex = acpi_find_ioapic(irqno); 1108 ASSERT(ioapicindex != 0xFF); 1109 ioapic = apic_io_id[ioapicindex]; 1110 ipin = irqno - apic_io_vectbase[ioapicindex]; 1111 if (apic_irq_table[irqno] && 1112 apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) { 1113 ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin && 1114 apic_irq_table[irqno]->airq_ioapicindex == 1115 ioapicindex); 1116 return (irqno); 1117 } 1118 1119 } else { 1120 /* default configuration */ 1121 ioapicindex = 0; 1122 ioapic = apic_io_id[ioapicindex]; 1123 ipin = (uchar_t)irqno; 1124 intr_index = DEFAULT_INDEX; 1125 } 1126 1127 if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) { 1128 if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index, 1129 ipl, ioapicindex, ipin, &irqptr)) != -1) { 1130 irqptr->airq_ipl = ipl; 1131 irqptr->airq_origirq = (uchar_t)origirq; 1132 irqptr->airq_dip = dip; 1133 irqptr->airq_major = major; 1134 sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip; 1135 /* This is OK to do really */ 1136 if (sdip == NULL) { 1137 cmn_err(CE_WARN, "Sharing vectors: %s" 1138 " instance %d and SCI", 1139 ddi_get_name(dip), ddi_get_instance(dip)); 1140 } else { 1141 cmn_err(CE_WARN, "Sharing vectors: %s" 1142 " instance %d and %s instance %d", 1143 ddi_get_name(sdip), ddi_get_instance(sdip), 1144 ddi_get_name(dip), ddi_get_instance(dip)); 1145 } 1146 return (newirq); 1147 } 1148 /* try high priority allocation now that share has failed */ 1149 if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) { 
1150 cmn_err(CE_WARN, "No interrupt vector: %s instance %d", 1151 ddi_get_name(dip), ddi_get_instance(dip)); 1152 return (-1); 1153 } 1154 } 1155 1156 mutex_enter(&airq_mutex); 1157 if (apic_irq_table[irqno] == NULL) { 1158 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 1159 irqptr->airq_temp_cpu = IRQ_UNINIT; 1160 apic_irq_table[irqno] = irqptr; 1161 } else { 1162 irqptr = apic_irq_table[irqno]; 1163 if (irqptr->airq_mps_intr_index != FREE_INDEX) { 1164 /* 1165 * The slot is used by another irqno, so allocate 1166 * a free irqno for this interrupt 1167 */ 1168 newirq = apic_allocate_irq(apic_first_avail_irq); 1169 if (newirq == -1) { 1170 mutex_exit(&airq_mutex); 1171 return (-1); 1172 } 1173 irqno = newirq; 1174 irqptr = apic_irq_table[irqno]; 1175 if (irqptr == NULL) { 1176 irqptr = kmem_zalloc(sizeof (apic_irq_t), 1177 KM_SLEEP); 1178 irqptr->airq_temp_cpu = IRQ_UNINIT; 1179 apic_irq_table[irqno] = irqptr; 1180 } 1181 vector = apic_modify_vector(vector, newirq); 1182 } 1183 } 1184 apic_max_device_irq = max(irqno, apic_max_device_irq); 1185 apic_min_device_irq = min(irqno, apic_min_device_irq); 1186 mutex_exit(&airq_mutex); 1187 irqptr->airq_ioapicindex = ioapicindex; 1188 irqptr->airq_intin_no = ipin; 1189 irqptr->airq_ipl = ipl; 1190 irqptr->airq_vector = vector; 1191 irqptr->airq_origirq = (uchar_t)origirq; 1192 irqptr->airq_share_id = 0; 1193 irqptr->airq_mps_intr_index = (short)intr_index; 1194 irqptr->airq_dip = dip; 1195 irqptr->airq_major = major; 1196 irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin); 1197 if (intr_flagp) 1198 irqptr->airq_iflag = *intr_flagp; 1199 1200 if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { 1201 /* setup I/O APIC entry for non-MSI/X interrupts */ 1202 apic_record_rdt_entry(irqptr, irqno); 1203 } 1204 return (irqno); 1205 } 1206 1207 /* 1208 * return the cpu to which this intr should be bound. 1209 * Check properties or any other mechanism to see if user wants it 1210 * bound to a specific CPU. If so, return the cpu id with high bit set. 1211 * If not, use the policy to choose a cpu and return the id. 1212 */ 1213 uint32_t 1214 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) 1215 { 1216 int instance, instno, prop_len, bind_cpu, count; 1217 uint_t i, rc; 1218 uint32_t cpu; 1219 major_t major; 1220 char *name, *drv_name, *prop_val, *cptr; 1221 char prop_name[32]; 1222 ulong_t iflag; 1223 1224 1225 if (apic_intr_policy == INTR_LOWEST_PRIORITY) 1226 return (IRQ_UNBOUND); 1227 1228 if (apic_nproc == 1) 1229 return (0); 1230 1231 /* 1232 * dip may be NULL for interrupts not associated with a device driver, 1233 * such as the ACPI SCI or HPET interrupts. In that case just use the 1234 * next CPU and return. 
1235 */ 1236 if (dip == NULL) { 1237 iflag = intr_clear(); 1238 lock_set(&apic_ioapic_lock); 1239 bind_cpu = apic_get_next_bind_cpu(); 1240 lock_clear(&apic_ioapic_lock); 1241 intr_restore(iflag); 1242 1243 cmn_err(CE_CONT, "!%s: irq 0x%x " 1244 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1245 psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, 1246 intin, bind_cpu & ~IRQ_USER_BOUND); 1247 1248 return ((uint32_t)bind_cpu); 1249 } 1250 1251 name = ddi_get_name(dip); 1252 major = ddi_name_to_major(name); 1253 drv_name = ddi_major_to_name(major); 1254 instance = ddi_get_instance(dip); 1255 if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { 1256 i = apic_min_device_irq; 1257 for (; i <= apic_max_device_irq; i++) { 1258 if ((i == irq) || (apic_irq_table[i] == NULL) || 1259 (apic_irq_table[i]->airq_mps_intr_index 1260 == FREE_INDEX)) 1261 continue; 1262 1263 if ((apic_irq_table[i]->airq_major == major) && 1264 (!(apic_irq_table[i]->airq_cpu & IRQ_USER_BOUND))) { 1265 cpu = apic_irq_table[i]->airq_cpu; 1266 1267 cmn_err(CE_CONT, 1268 "!%s: %s (%s) instance #%d " 1269 "irq 0x%x vector 0x%x ioapic 0x%x " 1270 "intin 0x%x is bound to cpu %d\n", 1271 psm_name, 1272 name, drv_name, instance, irq, 1273 apic_irq_table[irq]->airq_vector, 1274 ioapicid, intin, cpu); 1275 return (cpu); 1276 } 1277 } 1278 } 1279 /* 1280 * search for "drvname"_intpt_bind_cpus property first, the 1281 * syntax of the property should be "a[,b,c,...]" where 1282 * instance 0 binds to cpu a, instance 1 binds to cpu b, 1283 * instance 3 binds to cpu c... 1284 * ddi_getlongprop() will search /option first, then / 1285 * if "drvname"_intpt_bind_cpus doesn't exist, then find 1286 * intpt_bind_cpus property. The syntax is the same, and 1287 * it applies to all the devices if its "drvname" specific 1288 * property doesn't exist 1289 */ 1290 (void) strcpy(prop_name, drv_name); 1291 (void) strcat(prop_name, "_intpt_bind_cpus"); 1292 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, 1293 (caddr_t)&prop_val, &prop_len); 1294 if (rc != DDI_PROP_SUCCESS) { 1295 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, 1296 "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); 1297 } 1298 if (rc == DDI_PROP_SUCCESS) { 1299 for (i = count = 0; i < (prop_len - 1); i++) 1300 if (prop_val[i] == ',') 1301 count++; 1302 if (prop_val[i-1] != ',') 1303 count++; 1304 /* 1305 * if somehow the binding instances defined in the 1306 * property are not enough for this instno., then 1307 * reuse the pattern for the next instance until 1308 * it reaches the requested instno 1309 */ 1310 instno = instance % count; 1311 i = 0; 1312 cptr = prop_val; 1313 while (i < instno) 1314 if (*cptr++ == ',') 1315 i++; 1316 bind_cpu = stoi(&cptr); 1317 kmem_free(prop_val, prop_len); 1318 /* if specific CPU is bogus, then default to next cpu */ 1319 if (!apic_cpu_in_range(bind_cpu)) { 1320 cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", 1321 psm_name, prop_name, prop_val, bind_cpu); 1322 rc = DDI_PROP_NOT_FOUND; 1323 } else { 1324 /* indicate that we are bound at user request */ 1325 bind_cpu |= IRQ_USER_BOUND; 1326 } 1327 /* 1328 * no need to check apic_cpus[].aci_status, if specific CPU is 1329 * not up, then post_cpu_start will handle it. 
1330 */ 1331 } 1332 1333 if (rc != DDI_PROP_SUCCESS) { 1334 iflag = intr_clear(); 1335 lock_set(&apic_ioapic_lock); 1336 bind_cpu = apic_get_next_bind_cpu(); 1337 lock_clear(&apic_ioapic_lock); 1338 intr_restore(iflag); 1339 } 1340 1341 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " 1342 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1343 psm_name, name, drv_name, instance, irq, 1344 apic_irq_table[irq]->airq_vector, ioapicid, intin, 1345 bind_cpu & ~IRQ_USER_BOUND); 1346 1347 return ((uint32_t)bind_cpu); 1348 } 1349 1350 /* 1351 * Mark vector as being in the process of being deleted. Interrupts 1352 * may still come in on some CPU. The moment an interrupt comes with 1353 * the new vector, we know we can free the old one. Called only from 1354 * addspl and delspl with interrupts disabled. Because an interrupt 1355 * can be shared, but no interrupt from either device may come in, 1356 * we also use a timeout mechanism, which we arbitrarily set to 1357 * apic_revector_timeout microseconds. 1358 */ 1359 static void 1360 apic_mark_vector(uchar_t oldvector, uchar_t newvector) 1361 { 1362 ulong_t iflag; 1363 1364 iflag = intr_clear(); 1365 lock_set(&apic_revector_lock); 1366 if (!apic_oldvec_to_newvec) { 1367 apic_oldvec_to_newvec = 1368 kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, 1369 KM_NOSLEEP); 1370 1371 if (!apic_oldvec_to_newvec) { 1372 /* 1373 * This failure is not catastrophic. 1374 * But, the oldvec will never be freed. 1375 */ 1376 apic_error |= APIC_ERR_MARK_VECTOR_FAIL; 1377 lock_clear(&apic_revector_lock); 1378 intr_restore(iflag); 1379 return; 1380 } 1381 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR]; 1382 } 1383 1384 /* See if we already did this for drivers which do double addintrs */ 1385 if (apic_oldvec_to_newvec[oldvector] != newvector) { 1386 apic_oldvec_to_newvec[oldvector] = newvector; 1387 apic_newvec_to_oldvec[newvector] = oldvector; 1388 apic_revector_pending++; 1389 } 1390 lock_clear(&apic_revector_lock); 1391 intr_restore(iflag); 1392 (void) timeout(apic_xlate_vector_free_timeout_handler, 1393 (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout)); 1394 } 1395 1396 /* 1397 * xlate_vector is called from intr_enter if revector_pending is set. 1398 * It will xlate it if needed and mark the old vector as free. 1399 */ 1400 uchar_t 1401 apic_xlate_vector(uchar_t vector) 1402 { 1403 uchar_t newvector, oldvector = 0; 1404 1405 lock_set(&apic_revector_lock); 1406 /* Do we really need to do this ? */ 1407 if (!apic_revector_pending) { 1408 lock_clear(&apic_revector_lock); 1409 return (vector); 1410 } 1411 if ((newvector = apic_oldvec_to_newvec[vector]) != 0) 1412 oldvector = vector; 1413 else { 1414 /* 1415 * The incoming vector is new . See if a stale entry is 1416 * remaining 1417 */ 1418 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0) 1419 newvector = vector; 1420 } 1421 1422 if (oldvector) { 1423 apic_revector_pending--; 1424 apic_oldvec_to_newvec[oldvector] = 0; 1425 apic_newvec_to_oldvec[newvector] = 0; 1426 apic_free_vector(oldvector); 1427 lock_clear(&apic_revector_lock); 1428 /* There could have been more than one reprogramming! 
*/ 1429 return (apic_xlate_vector(newvector)); 1430 } 1431 lock_clear(&apic_revector_lock); 1432 return (vector); 1433 } 1434 1435 void 1436 apic_xlate_vector_free_timeout_handler(void *arg) 1437 { 1438 ulong_t iflag; 1439 uchar_t oldvector, newvector; 1440 1441 oldvector = (uchar_t)(uintptr_t)arg; 1442 iflag = intr_clear(); 1443 lock_set(&apic_revector_lock); 1444 if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) { 1445 apic_free_vector(oldvector); 1446 apic_oldvec_to_newvec[oldvector] = 0; 1447 apic_newvec_to_oldvec[newvector] = 0; 1448 apic_revector_pending--; 1449 } 1450 1451 lock_clear(&apic_revector_lock); 1452 intr_restore(iflag); 1453 } 1454 1455 /* 1456 * Bind interrupt corresponding to irq_ptr to bind_cpu. 1457 * Must be called with interrupts disabled and apic_ioapic_lock held 1458 */ 1459 int 1460 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, 1461 struct ioapic_reprogram_data *drep) 1462 { 1463 int ioapicindex, intin_no; 1464 uint32_t airq_temp_cpu; 1465 apic_cpus_info_t *cpu_infop; 1466 uint32_t rdt_entry; 1467 int which_irq; 1468 ioapic_rdt_t irdt; 1469 1470 which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; 1471 1472 intin_no = irq_ptr->airq_intin_no; 1473 ioapicindex = irq_ptr->airq_ioapicindex; 1474 airq_temp_cpu = irq_ptr->airq_temp_cpu; 1475 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) { 1476 if (airq_temp_cpu & IRQ_USER_BOUND) 1477 /* Mask off high bit so it can be used as array index */ 1478 airq_temp_cpu &= ~IRQ_USER_BOUND; 1479 1480 ASSERT(apic_cpu_in_range(airq_temp_cpu)); 1481 } 1482 1483 /* 1484 * Can't bind to a CPU that's not accepting interrupts: 1485 */ 1486 cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND]; 1487 if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) 1488 return (1); 1489 1490 /* 1491 * If we are about to change the interrupt vector for this interrupt, 1492 * and this interrupt is level-triggered, attached to an IOAPIC, 1493 * has been delivered to a CPU and that CPU has not handled it 1494 * yet, we cannot reprogram the IOAPIC now. 1495 */ 1496 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { 1497 1498 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, 1499 intin_no); 1500 1501 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && 1502 apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, 1503 bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) { 1504 1505 return (0); 1506 } 1507 1508 /* 1509 * NOTE: We do not unmask the RDT here, as an interrupt MAY 1510 * still come in before we have a chance to reprogram it below. 1511 * The reprogramming below will simultaneously change and 1512 * unmask the RDT entry. 
1513 */ 1514 1515 if ((uint32_t)bind_cpu == IRQ_UNBOUND) { 1516 irdt.ir_lo = AV_LDEST | AV_LOPRI | 1517 irq_ptr->airq_rdt_entry; 1518 1519 irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET; 1520 1521 apic_vt_ops->apic_intrmap_alloc_entry( 1522 &irq_ptr->airq_intrmap_private, NULL, 1523 DDI_INTR_TYPE_FIXED, 1, ioapicindex); 1524 apic_vt_ops->apic_intrmap_map_entry( 1525 irq_ptr->airq_intrmap_private, (void *)&irdt, 1526 DDI_INTR_TYPE_FIXED, 1); 1527 apic_vt_ops->apic_intrmap_record_rdt( 1528 irq_ptr->airq_intrmap_private, &irdt); 1529 1530 /* Write the RDT entry -- no specific CPU binding */ 1531 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, 1532 irdt.ir_hi | AV_TOALL); 1533 1534 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != 1535 IRQ_UNBOUND) 1536 apic_cpus[airq_temp_cpu].aci_temp_bound--; 1537 1538 /* 1539 * Write the vector, trigger, and polarity portion of 1540 * the RDT 1541 */ 1542 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, 1543 irdt.ir_lo); 1544 1545 irq_ptr->airq_temp_cpu = IRQ_UNBOUND; 1546 return (0); 1547 } 1548 } 1549 1550 if (bind_cpu & IRQ_USER_BOUND) { 1551 cpu_infop->aci_bound++; 1552 } else { 1553 cpu_infop->aci_temp_bound++; 1554 } 1555 ASSERT(apic_cpu_in_range(bind_cpu)); 1556 1557 if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { 1558 apic_cpus[airq_temp_cpu].aci_temp_bound--; 1559 } 1560 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { 1561 1562 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; 1563 irdt.ir_hi = cpu_infop->aci_local_id; 1564 1565 apic_vt_ops->apic_intrmap_alloc_entry( 1566 &irq_ptr->airq_intrmap_private, NULL, DDI_INTR_TYPE_FIXED, 1567 1, ioapicindex); 1568 apic_vt_ops->apic_intrmap_map_entry( 1569 irq_ptr->airq_intrmap_private, 1570 (void *)&irdt, DDI_INTR_TYPE_FIXED, 1); 1571 apic_vt_ops->apic_intrmap_record_rdt( 1572 irq_ptr->airq_intrmap_private, &irdt); 1573 1574 /* Write the RDT entry -- bind to a specific CPU: */ 1575 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, 1576 irdt.ir_hi); 1577 1578 /* Write the vector, trigger, and polarity portion of the RDT */ 1579 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, 1580 irdt.ir_lo); 1581 1582 } else { 1583 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? 
1584 DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; 1585 if (type == DDI_INTR_TYPE_MSI) { 1586 if (irq_ptr->airq_ioapicindex == 1587 irq_ptr->airq_origirq) { 1588 /* first one */ 1589 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " 1590 "apic_pci_msi_enable_vector\n")); 1591 apic_pci_msi_enable_vector(irq_ptr, 1592 type, which_irq, irq_ptr->airq_vector, 1593 irq_ptr->airq_intin_no, 1594 cpu_infop->aci_local_id); 1595 } 1596 if ((irq_ptr->airq_ioapicindex + 1597 irq_ptr->airq_intin_no - 1) == 1598 irq_ptr->airq_origirq) { /* last one */ 1599 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " 1600 "apic_pci_msi_enable_mode\n")); 1601 apic_pci_msi_enable_mode(irq_ptr->airq_dip, 1602 type, which_irq); 1603 } 1604 } else { /* MSI-X */ 1605 apic_pci_msi_enable_vector(irq_ptr, type, 1606 irq_ptr->airq_origirq, irq_ptr->airq_vector, 1, 1607 cpu_infop->aci_local_id); 1608 apic_pci_msi_enable_mode(irq_ptr->airq_dip, type, 1609 irq_ptr->airq_origirq); 1610 } 1611 } 1612 irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu; 1613 apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND)); 1614 return (0); 1615 } 1616 1617 static void 1618 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no) 1619 { 1620 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) 1621 & AV_REMOTE_IRR) != 0) { 1622 /* 1623 * Trying to clear the bit through normal 1624 * channels has failed. So as a last-ditch 1625 * effort, try to set the trigger mode to 1626 * edge, then to level. This has been 1627 * observed to work on many systems. 1628 */ 1629 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1630 intin_no, 1631 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1632 intin_no) & ~AV_LEVEL); 1633 1634 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1635 intin_no, 1636 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1637 intin_no) | AV_LEVEL); 1638 1639 /* 1640 * If the bit's STILL set, this interrupt may 1641 * be hosed. 1642 */ 1643 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1644 intin_no) & AV_REMOTE_IRR) != 0) { 1645 1646 prom_printf("%s: Remote IRR still " 1647 "not clear for IOAPIC %d intin %d.\n" 1648 "\tInterrupts to this pin may cease " 1649 "functioning.\n", psm_name, ioapic_ix, 1650 intin_no); 1651 #ifdef DEBUG 1652 apic_last_ditch_reprogram_failures++; 1653 #endif 1654 } 1655 } 1656 } 1657 1658 /* 1659 * This function is protected by apic_ioapic_lock coupled with the 1660 * fact that interrupts are disabled. 1661 */ 1662 static void 1663 delete_defer_repro_ent(int which_irq) 1664 { 1665 ASSERT(which_irq >= 0); 1666 ASSERT(which_irq <= 255); 1667 ASSERT(LOCK_HELD(&apic_ioapic_lock)); 1668 1669 if (apic_reprogram_info[which_irq].done) 1670 return; 1671 1672 apic_reprogram_info[which_irq].done = B_TRUE; 1673 1674 #ifdef DEBUG 1675 apic_defer_repro_total_retries += 1676 apic_reprogram_info[which_irq].tries; 1677 1678 apic_defer_repro_successes++; 1679 #endif 1680 1681 if (--apic_reprogram_outstanding == 0) { 1682 1683 setlvlx = psm_intr_exit_fn(); 1684 } 1685 } 1686 1687 1688 /* 1689 * Interrupts must be disabled during this function to prevent 1690 * self-deadlock. Interrupts are disabled because this function 1691 * is called from apic_check_stuck_interrupt(), which is called 1692 * from apic_rebind(), which requires its caller to disable interrupts. 
1693 */ 1694 static void 1695 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) 1696 { 1697 ASSERT(which_irq >= 0); 1698 ASSERT(which_irq <= 255); 1699 ASSERT(!interrupts_enabled()); 1700 1701 /* 1702 * On the off-chance that there's already a deferred 1703 * reprogramming on this irq, check, and if so, just update the 1704 * CPU and irq pointer to which the interrupt is targeted, then return. 1705 */ 1706 if (!apic_reprogram_info[which_irq].done) { 1707 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; 1708 apic_reprogram_info[which_irq].irqp = irq_ptr; 1709 return; 1710 } 1711 1712 apic_reprogram_info[which_irq].irqp = irq_ptr; 1713 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; 1714 apic_reprogram_info[which_irq].tries = 0; 1715 /* 1716 * This must be the last thing set, since we're not 1717 * grabbing any locks, apic_try_deferred_reprogram() will 1718 * make its decision about using this entry iff done 1719 * is false. 1720 */ 1721 apic_reprogram_info[which_irq].done = B_FALSE; 1722 1723 /* 1724 * If there were previously no deferred reprogrammings, change 1725 * setlvlx to call apic_try_deferred_reprogram() 1726 */ 1727 if (++apic_reprogram_outstanding == 1) { 1728 1729 setlvlx = apic_try_deferred_reprogram; 1730 } 1731 } 1732 1733 static void 1734 apic_try_deferred_reprogram(int prev_ipl, int irq) 1735 { 1736 int reproirq; 1737 ulong_t iflag; 1738 struct ioapic_reprogram_data *drep; 1739 1740 (*psm_intr_exit_fn())(prev_ipl, irq); 1741 1742 if (!lock_try(&apic_defer_reprogram_lock)) { 1743 return; 1744 } 1745 1746 /* 1747 * Acquire the apic_ioapic_lock so that any other operations that 1748 * may affect the apic_reprogram_info state are serialized. 1749 * It's still possible for the last deferred reprogramming to clear 1750 * between the time we entered this function and the time we get to 1751 * the for loop below. In that case, *setlvlx will have been set 1752 * back to *_intr_exit and drep will be NULL. (There's no way to 1753 * stop that from happening -- we would need to grab a lock before 1754 * calling *setlvlx, which is neither realistic nor prudent). 1755 */ 1756 iflag = intr_clear(); 1757 lock_set(&apic_ioapic_lock); 1758 1759 /* 1760 * For each deferred RDT entry, try to reprogram it now. Note that 1761 * there is no lock acquisition to read apic_reprogram_info because 1762 * '.done' is set only after the other fields in the structure are set. 1763 */ 1764 1765 drep = NULL; 1766 for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { 1767 if (apic_reprogram_info[reproirq].done == B_FALSE) { 1768 drep = &apic_reprogram_info[reproirq]; 1769 break; 1770 } 1771 } 1772 1773 /* 1774 * Either we found a deferred action to perform, or 1775 * we entered this function spuriously, after *setlvlx 1776 * was restored to point to *_intr_exit. Any other 1777 * permutation is invalid. 1778 */ 1779 ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); 1780 1781 /* 1782 * Though we can't really do anything about errors 1783 * at this point, keep track of them for reporting. 1784 * Note that it is very possible for apic_setup_io_intr 1785 * to re-register this very timeout if the Remote IRR bit 1786 * has not yet cleared. 
1787 */ 1788 1789 #ifdef DEBUG 1790 if (drep != NULL) { 1791 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { 1792 apic_deferred_setup_failures++; 1793 } 1794 } else { 1795 apic_deferred_spurious_enters++; 1796 } 1797 #else 1798 if (drep != NULL) 1799 (void) apic_setup_io_intr(drep, reproirq, B_TRUE); 1800 #endif 1801 1802 lock_clear(&apic_ioapic_lock); 1803 intr_restore(iflag); 1804 1805 lock_clear(&apic_defer_reprogram_lock); 1806 } 1807 1808 static void 1809 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) 1810 { 1811 int waited; 1812 1813 /* 1814 * Wait for the delivery pending bit to clear. 1815 */ 1816 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & 1817 (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { 1818 1819 /* 1820 * If we're still waiting on the delivery of this interrupt, 1821 * continue to wait here until it is delivered (this should be 1822 * a very small amount of time, but include a timeout just in 1823 * case). 1824 */ 1825 for (waited = 0; waited < apic_max_reps_clear_pending; 1826 waited++) { 1827 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1828 intin_no) & AV_PENDING) == 0) { 1829 break; 1830 } 1831 } 1832 } 1833 } 1834 1835 1836 /* 1837 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR 1838 * bit set. Calls functions that modify the function that setlvlx points to, 1839 * so that the reprogramming can be retried very shortly. 1840 * 1841 * This function will mask the RDT entry if the interrupt is level-triggered. 1842 * (The caller is responsible for unmasking the RDT entry.) 1843 * 1844 * Returns non-zero if the caller should defer IOAPIC reprogramming. 1845 */ 1846 static int 1847 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, 1848 int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, 1849 struct ioapic_reprogram_data *drep) 1850 { 1851 int32_t rdt_entry; 1852 int waited; 1853 int reps = 0; 1854 1855 /* 1856 * Wait for the delivery pending bit to clear. 1857 */ 1858 do { 1859 ++reps; 1860 1861 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); 1862 1863 /* 1864 * Mask the RDT entry, but only if it's a level-triggered 1865 * interrupt 1866 */ 1867 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1868 intin_no); 1869 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { 1870 1871 /* Mask it */ 1872 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, 1873 AV_MASK | rdt_entry); 1874 } 1875 1876 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { 1877 /* 1878 * If there was a race and an interrupt was injected 1879 * just before we masked, check for that case here. 1880 * Then, unmask the RDT entry and try again. If we're 1881 * on our last try, don't unmask (because we want the 1882 * RDT entry to remain masked for the rest of the 1883 * function). 1884 */ 1885 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1886 intin_no); 1887 if ((rdt_entry & AV_PENDING) && 1888 (reps < apic_max_reps_clear_pending)) { 1889 /* Unmask it */ 1890 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1891 intin_no, rdt_entry & ~AV_MASK); 1892 } 1893 } 1894 1895 } while ((rdt_entry & AV_PENDING) && 1896 (reps < apic_max_reps_clear_pending)); 1897 1898 #ifdef DEBUG 1899 if (rdt_entry & AV_PENDING) 1900 apic_intr_deliver_timeouts++; 1901 #endif 1902 1903 /* 1904 * If the remote IRR bit is set, then the interrupt has been sent 1905 * to a CPU for processing. We have no choice but to wait for 1906 * that CPU to process the interrupt, at which point the remote IRR 1907 * bit will be cleared. 

/*
 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
 * bit set.  Calls functions that modify the function that setlvlx points to,
 * so that the reprogramming can be retried very shortly.
 *
 * This function will mask the RDT entry if the interrupt is level-triggered.
 * (The caller is responsible for unmasking the RDT entry.)
 *
 * Returns non-zero if the caller should defer IOAPIC reprogramming.
 */
static int
apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep)
{
	int32_t rdt_entry;
	int waited;
	int reps = 0;

	/*
	 * Wait for the delivery pending bit to clear.
	 */
	do {
		++reps;

		apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);

		/*
		 * Mask the RDT entry, but only if it's a level-triggered
		 * interrupt
		 */
		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no);
		if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {

			/* Mask it */
			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
			    AV_MASK | rdt_entry);
		}

		if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
			/*
			 * If there was a race and an interrupt was injected
			 * just before we masked, check for that case here.
			 * Then, unmask the RDT entry and try again.  If we're
			 * on our last try, don't unmask (because we want the
			 * RDT entry to remain masked for the rest of the
			 * function).
			 */
			rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
			    intin_no);
			if ((rdt_entry & AV_PENDING) &&
			    (reps < apic_max_reps_clear_pending)) {
				/* Unmask it */
				WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no, rdt_entry & ~AV_MASK);
			}
		}

	} while ((rdt_entry & AV_PENDING) &&
	    (reps < apic_max_reps_clear_pending));

#ifdef DEBUG
	if (rdt_entry & AV_PENDING)
		apic_intr_deliver_timeouts++;
#endif

	/*
	 * If the remote IRR bit is set, then the interrupt has been sent
	 * to a CPU for processing.  We have no choice but to wait for
	 * that CPU to process the interrupt, at which point the remote IRR
	 * bit will be cleared.
	 */
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
	    (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {

		/*
		 * If the CPU that this RDT is bound to is NOT the current
		 * CPU, wait until that CPU handles the interrupt and ACKs
		 * it.  If this interrupt is not bound to any CPU (that is,
		 * if it's bound to the logical destination of "anyone"), it
		 * may have been delivered to the current CPU so handle that
		 * case by deferring the reprogramming (below).
		 */
		if ((old_bind_cpu != IRQ_UNBOUND) &&
		    (old_bind_cpu != IRQ_UNINIT) &&
		    (old_bind_cpu != psm_get_cpu_id())) {
			for (waited = 0; waited < apic_max_reps_clear_pending;
			    waited++) {
				if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no) & AV_REMOTE_IRR) == 0) {

					delete_defer_repro_ent(which_irq);

					/* Remote IRR has cleared! */
					return (0);
				}
			}
		}

		/*
		 * If we waited and the Remote IRR bit is still not cleared,
		 * AND if we've already retried this interrupt at least
		 * APIC_REPROGRAM_MAX_TRIES times, try the last-ditch
		 * workaround:
		 */
		if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {

			apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);

			/* Mark this one as reprogrammed: */
			delete_defer_repro_ent(which_irq);

			return (0);
		} else {
#ifdef DEBUG
			apic_intr_deferrals++;
#endif

			/*
			 * If waiting for the Remote IRR bit (above) didn't
			 * allow it to clear, defer the reprogramming.
			 * Add a new deferred-programming entry if the
			 * caller passed a NULL one (and update the existing
			 * one in case anything changed).
			 */
			add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
			if (drep)
				drep->tries++;

			/* Inform caller to defer IOAPIC programming: */
			return (1);
		}
	}

	/* Remote IRR is clear */
	delete_defer_repro_ent(which_irq);

	return (0);
}
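
/*
 * Illustrative usage (a sketch only; the actual call site is in the
 * rebind/reprogramming path elsewhere in the PSM code): a caller about to
 * rewrite an RDT entry is expected to honor the return value and postpone
 * its write when deferral is requested:
 *
 *	if (apic_check_stuck_interrupt(irq_ptr, old_cpu, new_cpu,
 *	    ioapic_ix, intin_no, which_irq, drep) != 0)
 *		return;		(the write is retried later via setlvlx)
 *
 *	... rewrite and unmask the RDT entry here ...
 *
 * The names old_cpu and new_cpu above are placeholders for the caller's
 * own bookkeeping.
 */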

/*
 * Called to migrate all interrupts at an irq to another cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held
 */
int
apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
{
	apic_irq_t	*irqptr = irq_ptr;
	int		retval = 0;

	while (irqptr) {
		if (irqptr->airq_temp_cpu != IRQ_UNINIT)
			retval |= apic_rebind(irqptr, bind_cpu, NULL);
		irqptr = irqptr->airq_next;
	}

	return (retval);
}

/*
 * apic_intr_redistribute does all the messy computations for identifying
 * which interrupt to move to which CPU.  Currently we do just one interrupt
 * at a time.  This reduces the time we spend doing all this within the
 * clock interrupt; when it is done in idle, we could do more than one.
 * First we find the most busy and the most free CPU (time in ISR only),
 * skipping those CPUs that have been identified as ineligible (cpu_skip).
 * Then we look for IRQs which are closest to the difference between the
 * most busy CPU and the average ISR load.  We try to find one whose load
 * is less than that difference.  If none exists, we choose one larger than
 * the difference, provided it does not make the most idle CPU worse than
 * the most busy one.  In the end, we clear all the busy fields for CPUs;
 * for IRQs, they are cleared as they are scanned.
 */
void
apic_intr_redistribute(void)
{
	int busiest_cpu, most_free_cpu;
	int cpu_free, cpu_busy, max_busy, min_busy;
	int min_free, diff;
	int average_busy, cpus_online;
	int i, busy;
	ulong_t iflag;
	apic_cpus_info_t *cpu_infop;
	apic_irq_t *min_busy_irq = NULL;
	apic_irq_t *max_busy_irq = NULL;

	busiest_cpu = most_free_cpu = -1;
	cpu_free = cpu_busy = max_busy = average_busy = 0;
	min_free = apic_sample_factor_redistribution;
	cpus_online = 0;
	/*
	 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
	 * without ioapic_lock.  That is OK as we are just doing statistical
	 * sampling anyway and any inaccuracy now will get corrected next time.
	 * The call to rebind which actually changes things will make sure
	 * we are consistent.
	 */
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i) &&
		    !(apic_redist_cpu_skip & (1 << i)) &&
		    (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {

			cpu_infop = &apic_cpus[i];
			/*
			 * If no unbound interrupts or only 1 total on this
			 * CPU, skip
			 */
			if (!cpu_infop->aci_temp_bound ||
			    (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
			    == 1) {
				apic_redist_cpu_skip |= 1 << i;
				continue;
			}

			busy = cpu_infop->aci_busy;
			average_busy += busy;
			cpus_online++;
			if (max_busy < busy) {
				max_busy = busy;
				busiest_cpu = i;
			}
			if (min_free > busy) {
				min_free = busy;
				most_free_cpu = i;
			}
			if (busy > apic_int_busy_mark) {
				cpu_busy |= 1 << i;
			} else {
				if (busy < apic_int_free_mark)
					cpu_free |= 1 << i;
			}
		}
	}
	if ((cpu_busy && cpu_free) ||
	    (max_busy >= (min_free + apic_diff_for_redistribution))) {

		apic_num_imbalance++;
#ifdef DEBUG
		if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
			prom_printf(
			    "redistribute busy=%x free=%x max=%x min=%x",
			    cpu_busy, cpu_free, max_busy, min_free);
		}
#endif /* DEBUG */

		average_busy /= cpus_online;

		diff = max_busy - average_busy;
		min_busy = max_busy; /* start with the max possible value */
		max_busy = 0;
		min_busy_irq = max_busy_irq = NULL;
		i = apic_min_device_irq;
		for (; i <= apic_max_device_irq; i++) {
			apic_irq_t *irq_ptr;
			/* Change to linked list per CPU ? */
			if ((irq_ptr = apic_irq_table[i]) == NULL)
				continue;
			/* Check for irq_busy & decide which one to move */
			/* Also zero them for next round */
			if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
			    irq_ptr->airq_busy) {
				if (irq_ptr->airq_busy < diff) {
					/*
					 * Check for least busy CPU,
					 * best fit or what ?
					 */
					if (max_busy < irq_ptr->airq_busy) {
						/*
						 * Most busy within the
						 * required differential
						 */
						max_busy = irq_ptr->airq_busy;
						max_busy_irq = irq_ptr;
					}
				} else {
					if (min_busy > irq_ptr->airq_busy) {
						/*
						 * least busy, but more than
						 * the required diff
						 */
						if (min_busy <
						    (diff + average_busy -
						    min_free)) {
							/*
							 * Making sure new cpu
							 * will not end up
							 * worse
							 */
							min_busy =
							    irq_ptr->airq_busy;

							min_busy_irq = irq_ptr;
						}
					}
				}
			}
			irq_ptr->airq_busy = 0;
		}

		if (max_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    max_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */
			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(max_busy_irq,
				    most_free_cpu) == 0) {
					/* Make the change permanent */
					max_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else if (min_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    min_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */

			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(min_busy_irq,
				    most_free_cpu) == 0) {
					/* Make the change permanent */
					min_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else {
			if (cpu_busy != (1 << busiest_cpu)) {
				apic_redist_cpu_skip |= 1 << busiest_cpu;
				/*
				 * We leave cpu_skip set so that next time we
				 * can choose another cpu
				 */
			}
		}
		apic_num_rebind++;
	} else {
		/*
		 * Found nothing.  Could be that we skipped over valid CPUs
		 * or we have balanced everything.  If we had a variable
		 * ticks_for_redistribution, it could be increased here.
		 * apic_int_busy, int_free etc. would also need to be
		 * changed.
		 */
		if (apic_redist_cpu_skip)
			apic_redist_cpu_skip = 0;
	}
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}
}
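
/*
 * Worked example (illustrative numbers, not measured data): with three
 * eligible CPUs whose aci_busy samples are 60, 30 and 10, the scan above
 * yields max_busy = 60 on busiest_cpu, min_free = 10 on most_free_cpu and
 * average_busy = 100 / 3 = 33, so diff = 27.  An IRQ on the busiest CPU
 * with airq_busy = 20 falls under diff and becomes the max_busy_irq
 * candidate; an IRQ with airq_busy = 40 exceeds diff and is only eligible
 * for the min_busy_irq fallback, whose guard is meant to ensure that
 * min_free + airq_busy stays below the old maximum (10 + 40 < 60 here).
 */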

void
apic_cleanup_busy(void)
{
	int i;
	apic_irq_t *irq_ptr;

	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}

	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
		if ((irq_ptr = apic_irq_table[i]) != NULL)
			irq_ptr->airq_busy = 0;
	}
}
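
/*
 * Illustrative usage (a hypothetical sketch; the real policy lives in the
 * clock/redistribution code, not in this file): a periodic sampler could
 * either redistribute or simply discard the accumulated statistics, e.g.
 *
 *	if (++redistribute_ticks >= apic_sample_factor_redistribution) {
 *		if (dynamic_migration_enabled)
 *			apic_intr_redistribute();
 *		else
 *			apic_cleanup_busy();
 *		redistribute_ticks = 0;
 *	}
 *
 * redistribute_ticks and dynamic_migration_enabled are placeholder names,
 * not variables defined by this driver.
 */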