1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* 25 * Copyright (c) 2010, Intel Corporation. 26 * All rights reserved. 27 */ 28 29 /* 30 * PSMI 1.1 extensions are supported only in 2.6 and later versions. 31 * PSMI 1.2 extensions are supported only in 2.7 and later versions. 32 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. 33 * PSMI 1.5 extensions are supported in Solaris Nevada. 34 * PSMI 1.6 extensions are supported in Solaris Nevada. 35 * PSMI 1.7 extensions are supported in Solaris Nevada. 
36 */ 37 #define PSMI_1_7 38 39 #include <sys/processor.h> 40 #include <sys/time.h> 41 #include <sys/psm.h> 42 #include <sys/smp_impldefs.h> 43 #include <sys/inttypes.h> 44 #include <sys/cram.h> 45 #include <sys/acpi/acpi.h> 46 #include <sys/acpica.h> 47 #include <sys/psm_common.h> 48 #include <sys/apic.h> 49 #include <sys/apic_common.h> 50 #include <sys/pit.h> 51 #include <sys/ddi.h> 52 #include <sys/sunddi.h> 53 #include <sys/ddi_impldefs.h> 54 #include <sys/pci.h> 55 #include <sys/promif.h> 56 #include <sys/x86_archext.h> 57 #include <sys/cpc_impl.h> 58 #include <sys/uadmin.h> 59 #include <sys/panic.h> 60 #include <sys/debug.h> 61 #include <sys/archsystm.h> 62 #include <sys/trap.h> 63 #include <sys/machsystm.h> 64 #include <sys/cpuvar.h> 65 #include <sys/rm_platter.h> 66 #include <sys/privregs.h> 67 #include <sys/cyclic.h> 68 #include <sys/note.h> 69 #include <sys/pci_intr_lib.h> 70 #include <sys/sunndi.h> 71 #include <sys/hpet.h> 72 #include <sys/clock.h> 73 74 /* 75 * Part of mp_platfrom_common.c that's used only by pcplusmp & xpv_psm 76 * but not apix. 
77 * These functions may be moved to xpv_psm later when apix and pcplusmp 78 * are merged together 79 */ 80 81 /* 82 * Local Function Prototypes 83 */ 84 static void apic_mark_vector(uchar_t oldvector, uchar_t newvector); 85 static void apic_xlate_vector_free_timeout_handler(void *arg); 86 static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, 87 int new_bind_cpu, int apicindex, int intin_no, int which_irq, 88 struct ioapic_reprogram_data *drep); 89 static int apic_setup_irq_table(dev_info_t *dip, int irqno, 90 struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp, 91 int type); 92 static void apic_try_deferred_reprogram(int ipl, int vect); 93 static void delete_defer_repro_ent(int which_irq); 94 static void apic_ioapic_wait_pending_clear(int ioapicindex, 95 int intin_no); 96 97 extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, 98 int ipin, int *pci_irqp, iflag_t *intr_flagp); 99 extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, 100 int child_ipin, struct apic_io_intr **intrp); 101 extern uchar_t acpi_find_ioapic(int irq); 102 extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); 103 extern int apic_find_bus_id(int bustype); 104 extern int apic_find_intin(uchar_t ioapic, uchar_t intin); 105 extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); 106 107 extern int apic_sci_vect; 108 extern iflag_t apic_sci_flags; 109 /* ACPI HPET interrupt configuration; -1 if HPET not used */ 110 extern int apic_hpet_vect; 111 extern iflag_t apic_hpet_flags; 112 extern int apic_intr_policy; 113 extern char *psm_name; 114 115 /* 116 * largest value representable in a uchar_t 117 */ 118 #define UCHAR_MAX UINT8_MAX 119 120 /* Max wait time (in repetitions) for flags to clear in an RDT entry. 
*/ 121 extern int apic_max_reps_clear_pending; 122 123 /* The irq # is implicit in the array index: */ 124 struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1]; 125 /* 126 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. apic_reprogram_info 127 * is indexed by IRQ number, NOT by vector number. 128 */ 129 130 extern int apic_int_busy_mark; 131 extern int apic_int_free_mark; 132 extern int apic_diff_for_redistribution; 133 extern int apic_sample_factor_redistribution; 134 extern int apic_redist_cpu_skip; 135 extern int apic_num_imbalance; 136 extern int apic_num_rebind; 137 138 /* timeout for xlate_vector, mark_vector */ 139 int apic_revector_timeout = 16 * 10000; /* 160 millisec */ 140 141 extern int apic_defconf; 142 extern int apic_irq_translate; 143 144 extern int apic_use_acpi_madt_only; /* 1=ONLY use MADT from ACPI */ 145 146 extern uchar_t apic_io_vectbase[MAX_IO_APIC]; 147 148 extern boolean_t ioapic_mask_workaround[MAX_IO_APIC]; 149 150 /* 151 * First available slot to be used as IRQ index into the apic_irq_table 152 * for those interrupts (like MSI/X) that don't have a physical IRQ. 153 */ 154 extern int apic_first_avail_irq; 155 156 /* 157 * apic_defer_reprogram_lock ensures that only one processor is handling 158 * deferred interrupt programming at *_intr_exit time. 
159 */ 160 static lock_t apic_defer_reprogram_lock; 161 162 /* 163 * The current number of deferred reprogrammings outstanding 164 */ 165 uint_t apic_reprogram_outstanding = 0; 166 167 #ifdef DEBUG 168 /* 169 * Counters that keep track of deferred reprogramming stats 170 */ 171 uint_t apic_intr_deferrals = 0; 172 uint_t apic_intr_deliver_timeouts = 0; 173 uint_t apic_last_ditch_reprogram_failures = 0; 174 uint_t apic_deferred_setup_failures = 0; 175 uint_t apic_defer_repro_total_retries = 0; 176 uint_t apic_defer_repro_successes = 0; 177 uint_t apic_deferred_spurious_enters = 0; 178 #endif 179 180 extern int apic_io_max; 181 extern struct apic_io_intr *apic_io_intrp; 182 183 uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; 184 185 extern uint32_t eisa_level_intr_mask; 186 /* At least MSB will be set if EISA bus */ 187 188 extern int apic_pci_bus_total; 189 extern uchar_t apic_single_pci_busid; 190 191 /* 192 * Following declarations are for revectoring; used when ISRs at different 193 * IPLs share an irq. 194 */ 195 static lock_t apic_revector_lock; 196 int apic_revector_pending = 0; 197 static uchar_t *apic_oldvec_to_newvec; 198 static uchar_t *apic_newvec_to_oldvec; 199 200 /* ACPI Interrupt Source Override Structure ptr */ 201 extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; 202 extern int acpi_iso_cnt; 203 204 /* 205 * Auto-configuration routines 206 */ 207 208 /* 209 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable 210 * are also set to NULL. vector->irq is set to a value which cannot map 211 * to a real irq to show that it is free. 212 */ 213 void 214 apic_init_common(void) 215 { 216 int i, j, indx; 217 int *iptr; 218 219 /* 220 * Initialize apic_ipls from apic_vectortoipl. This array is 221 * used in apic_intr_enter to determine the IPL to use for the 222 * corresponding vector. 
On some systems, due to hardware errata 223 * and interrupt sharing, the IPL may not correspond to the IPL listed 224 * in apic_vectortoipl (see apic_addspl and apic_delspl). 225 */ 226 for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { 227 indx = i * APIC_VECTOR_PER_IPL; 228 229 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) 230 apic_ipls[indx] = apic_vectortoipl[i]; 231 } 232 233 /* cpu 0 is always up (for now) */ 234 apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; 235 236 iptr = (int *)&apic_irq_table[0]; 237 for (i = 0; i <= APIC_MAX_VECTOR; i++) { 238 apic_level_intr[i] = 0; 239 *iptr++ = NULL; 240 apic_vector_to_irq[i] = APIC_RESV_IRQ; 241 242 /* These *must* be initted to B_TRUE! */ 243 apic_reprogram_info[i].done = B_TRUE; 244 apic_reprogram_info[i].irqp = NULL; 245 apic_reprogram_info[i].tries = 0; 246 apic_reprogram_info[i].bindcpu = 0; 247 } 248 249 /* 250 * Allocate a dummy irq table entry for the reserved entry. 251 * This takes care of the race between removing an irq and 252 * clock detecting a CPU in that irq during interrupt load 253 * sampling. 
254 */ 255 apic_irq_table[APIC_RESV_IRQ] = 256 kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 257 258 mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); 259 } 260 261 void 262 ioapic_init_intr(int mask_apic) 263 { 264 int ioapic_ix; 265 struct intrspec ispec; 266 apic_irq_t *irqptr; 267 int i, j; 268 ulong_t iflag; 269 270 LOCK_INIT_CLEAR(&apic_revector_lock); 271 LOCK_INIT_CLEAR(&apic_defer_reprogram_lock); 272 273 /* mask interrupt vectors */ 274 for (j = 0; j < apic_io_max && mask_apic; j++) { 275 int intin_max; 276 277 ioapic_ix = j; 278 /* Bits 23-16 define the maximum redirection entries */ 279 intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16) 280 & 0xff; 281 for (i = 0; i <= intin_max; i++) 282 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK); 283 } 284 285 /* 286 * Hack alert: deal with ACPI SCI interrupt chicken/egg here 287 */ 288 if (apic_sci_vect > 0) { 289 /* 290 * acpica has already done add_avintr(); we just 291 * to finish the job by mimicing translate_irq() 292 * 293 * Fake up an intrspec and setup the tables 294 */ 295 ispec.intrspec_vec = apic_sci_vect; 296 ispec.intrspec_pri = SCI_IPL; 297 298 if (apic_setup_irq_table(NULL, apic_sci_vect, NULL, 299 &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) { 300 cmn_err(CE_WARN, "!apic: SCI setup failed"); 301 return; 302 } 303 irqptr = apic_irq_table[apic_sci_vect]; 304 305 iflag = intr_clear(); 306 lock_set(&apic_ioapic_lock); 307 308 /* Program I/O APIC */ 309 (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE); 310 311 lock_clear(&apic_ioapic_lock); 312 intr_restore(iflag); 313 314 irqptr->airq_share++; 315 } 316 317 /* 318 * Hack alert: deal with ACPI HPET interrupt chicken/egg here. 
319 */ 320 if (apic_hpet_vect > 0) { 321 /* 322 * hpet has already done add_avintr(); we just need 323 * to finish the job by mimicing translate_irq() 324 * 325 * Fake up an intrspec and setup the tables 326 */ 327 ispec.intrspec_vec = apic_hpet_vect; 328 ispec.intrspec_pri = CBE_HIGH_PIL; 329 330 if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL, 331 &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) { 332 cmn_err(CE_WARN, "!apic: HPET setup failed"); 333 return; 334 } 335 irqptr = apic_irq_table[apic_hpet_vect]; 336 337 iflag = intr_clear(); 338 lock_set(&apic_ioapic_lock); 339 340 /* Program I/O APIC */ 341 (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE); 342 343 lock_clear(&apic_ioapic_lock); 344 intr_restore(iflag); 345 346 irqptr->airq_share++; 347 } 348 } 349 350 /* 351 * Add mask bits to disable interrupt vector from happening 352 * at or above IPL. In addition, it should remove mask bits 353 * to enable interrupt vectors below the given IPL. 354 * 355 * Both add and delspl are complicated by the fact that different interrupts 356 * may share IRQs. This can happen in two ways. 357 * 1. The same H/W line is shared by more than 1 device 358 * 1a. with interrupts at different IPLs 359 * 1b. with interrupts at same IPL 360 * 2. We ran out of vectors at a given IPL and started sharing vectors. 361 * 1b and 2 should be handled gracefully, except for the fact some ISRs 362 * will get called often when no interrupt is pending for the device. 363 * For 1a, we handle it at the higher IPL. 
364 */ 365 /*ARGSUSED*/ 366 int 367 apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl) 368 { 369 uchar_t vector; 370 ulong_t iflag; 371 apic_irq_t *irqptr, *irqheadptr; 372 int irqindex; 373 374 ASSERT(max_ipl <= UCHAR_MAX); 375 irqindex = IRQINDEX(irqno); 376 377 if ((irqindex == -1) || (!apic_irq_table[irqindex])) 378 return (PSM_FAILURE); 379 380 mutex_enter(&airq_mutex); 381 irqptr = irqheadptr = apic_irq_table[irqindex]; 382 383 DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x " 384 "vector=0x%x\n", (void *)irqptr->airq_dip, 385 irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); 386 387 while (irqptr) { 388 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) 389 break; 390 irqptr = irqptr->airq_next; 391 } 392 irqptr->airq_share++; 393 394 mutex_exit(&airq_mutex); 395 396 /* return if it is not hardware interrupt */ 397 if (irqptr->airq_mps_intr_index == RESERVE_INDEX) 398 return (PSM_SUCCESS); 399 400 /* Or if there are more interupts at a higher IPL */ 401 if (ipl != max_ipl) 402 return (PSM_SUCCESS); 403 404 /* 405 * if apic_picinit() has not been called yet, just return. 406 * At the end of apic_picinit(), we will call setup_io_intr(). 407 */ 408 409 if (!apic_picinit_called) 410 return (PSM_SUCCESS); 411 412 /* 413 * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate, 414 * return failure. 
415 */ 416 if (irqptr->airq_ipl != max_ipl && 417 !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 418 419 vector = apic_allocate_vector(max_ipl, irqindex, 1); 420 if (vector == 0) { 421 irqptr->airq_share--; 422 return (PSM_FAILURE); 423 } 424 irqptr = irqheadptr; 425 apic_mark_vector(irqptr->airq_vector, vector); 426 while (irqptr) { 427 irqptr->airq_vector = vector; 428 irqptr->airq_ipl = (uchar_t)max_ipl; 429 /* 430 * reprogram irq being added and every one else 431 * who is not in the UNINIT state 432 */ 433 if ((VIRTIRQ(irqindex, irqptr->airq_share_id) == 434 irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) { 435 apic_record_rdt_entry(irqptr, irqindex); 436 437 iflag = intr_clear(); 438 lock_set(&apic_ioapic_lock); 439 440 (void) apic_setup_io_intr(irqptr, irqindex, 441 B_FALSE); 442 443 lock_clear(&apic_ioapic_lock); 444 intr_restore(iflag); 445 } 446 irqptr = irqptr->airq_next; 447 } 448 return (PSM_SUCCESS); 449 450 } else if (irqptr->airq_ipl != max_ipl && 451 ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 452 /* 453 * We cannot upgrade the vector, but we can change 454 * the IPL that this vector induces. 455 * 456 * Note that we subtract APIC_BASE_VECT from the vector 457 * here because this array is used in apic_intr_enter 458 * (no need to add APIC_BASE_VECT in that hot code 459 * path since we can do it in the rarely-executed path 460 * here). 461 */ 462 apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] = 463 (uchar_t)max_ipl; 464 465 irqptr = irqheadptr; 466 while (irqptr) { 467 irqptr->airq_ipl = (uchar_t)max_ipl; 468 irqptr = irqptr->airq_next; 469 } 470 471 return (PSM_SUCCESS); 472 } 473 474 ASSERT(irqptr); 475 476 iflag = intr_clear(); 477 lock_set(&apic_ioapic_lock); 478 479 (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE); 480 481 lock_clear(&apic_ioapic_lock); 482 intr_restore(iflag); 483 484 return (PSM_SUCCESS); 485 } 486 487 /* 488 * Recompute mask bits for the given interrupt vector. 
489 * If there is no interrupt servicing routine for this 490 * vector, this function should disable interrupt vector 491 * from happening at all IPLs. If there are still 492 * handlers using the given vector, this function should 493 * disable the given vector from happening below the lowest 494 * IPL of the remaining hadlers. 495 */ 496 /*ARGSUSED*/ 497 int 498 apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl) 499 { 500 uchar_t vector; 501 uint32_t bind_cpu; 502 int intin, irqindex; 503 int ioapic_ix; 504 apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp; 505 ulong_t iflag; 506 507 mutex_enter(&airq_mutex); 508 irqindex = IRQINDEX(irqno); 509 irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex]; 510 511 DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x " 512 "vector=0x%x\n", (void *)irqptr->airq_dip, 513 irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); 514 515 while (irqptr) { 516 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) 517 break; 518 preirqptr = irqptr; 519 irqptr = irqptr->airq_next; 520 } 521 ASSERT(irqptr); 522 523 irqptr->airq_share--; 524 525 mutex_exit(&airq_mutex); 526 527 /* 528 * If there are more interrupts at a higher IPL, we don't need 529 * to disable anything. 530 */ 531 if (ipl < max_ipl) 532 return (PSM_SUCCESS); 533 534 /* return if it is not hardware interrupt */ 535 if (irqptr->airq_mps_intr_index == RESERVE_INDEX) 536 return (PSM_SUCCESS); 537 538 if (!apic_picinit_called) { 539 /* 540 * Clear irq_struct. If two devices shared an intpt 541 * line & 1 unloaded before picinit, we are hosed. But, then 542 * we hope the machine survive. 543 */ 544 irqptr->airq_mps_intr_index = FREE_INDEX; 545 irqptr->airq_temp_cpu = IRQ_UNINIT; 546 apic_free_vector(irqptr->airq_vector); 547 return (PSM_SUCCESS); 548 } 549 /* 550 * Downgrade vector to new max_ipl if needed. If we cannot allocate, 551 * use old IPL. Not very elegant, but it should work. 
552 */ 553 if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) && 554 !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 555 apic_irq_t *irqp; 556 if (vector = apic_allocate_vector(max_ipl, irqno, 1)) { 557 apic_mark_vector(irqheadptr->airq_vector, vector); 558 irqp = irqheadptr; 559 while (irqp) { 560 irqp->airq_vector = vector; 561 irqp->airq_ipl = (uchar_t)max_ipl; 562 if (irqp->airq_temp_cpu != IRQ_UNINIT) { 563 apic_record_rdt_entry(irqp, irqindex); 564 565 iflag = intr_clear(); 566 lock_set(&apic_ioapic_lock); 567 568 (void) apic_setup_io_intr(irqp, 569 irqindex, B_FALSE); 570 571 lock_clear(&apic_ioapic_lock); 572 intr_restore(iflag); 573 } 574 irqp = irqp->airq_next; 575 } 576 } 577 578 } else if (irqptr->airq_ipl != max_ipl && 579 max_ipl != PSM_INVALID_IPL && 580 ioapic_mask_workaround[irqptr->airq_ioapicindex]) { 581 582 /* 583 * We cannot downgrade the IPL of the vector below the vector's 584 * hardware priority. If we did, it would be possible for a 585 * higher-priority hardware vector to interrupt a CPU running at an IPL 586 * lower than the hardware priority of the interrupting vector (but 587 * higher than the soft IPL of this IRQ). When this happens, we would 588 * then try to drop the IPL BELOW what it was (effectively dropping 589 * below base_spl) which would be potentially catastrophic. 590 * 591 * (e.g. Suppose the hardware vector associated with this IRQ is 0x40 592 * (hardware IPL of 4). Further assume that the old IPL of this IRQ 593 * was 4, but the new IPL is 1. If we forced vector 0x40 to result in 594 * an IPL of 1, it would be possible for the processor to be executing 595 * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting 596 * the currently-executing ISR. When apic_intr_enter consults 597 * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1 598 * so even though the processor was running at IPL 4, an IPL 1 599 * interrupt will have interrupted it, which must not happen)). 
600 * 601 * Effectively, this means that the hardware priority corresponding to 602 * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's 603 * hardware priority. 604 * 605 * (In the above example, then, after removal of the IPL 4 device's 606 * interrupt handler, the new IPL will continue to be 4 because the 607 * hardware priority that IPL 1 implies is lower than the hardware 608 * priority of the vector used.) 609 */ 610 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */ 611 const int apic_ipls_index = irqptr->airq_vector - 612 APIC_BASE_VECT; 613 const int vect_inherent_hwpri = irqptr->airq_vector >> 614 APIC_IPL_SHIFT; 615 616 /* 617 * If there are still devices using this IRQ, determine the 618 * new ipl to use. 619 */ 620 if (irqptr->airq_share) { 621 int vect_desired_hwpri, hwpri; 622 623 ASSERT(max_ipl < MAXIPL); 624 vect_desired_hwpri = apic_ipltopri[max_ipl] >> 625 APIC_IPL_SHIFT; 626 627 /* 628 * If the desired IPL's hardware priority is lower 629 * than that of the vector, use the hardware priority 630 * of the vector to determine the new IPL. 631 */ 632 hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ? 633 vect_inherent_hwpri : vect_desired_hwpri; 634 635 /* 636 * Now, to get the right index for apic_vectortoipl, 637 * we need to subtract APIC_BASE_VECT from the 638 * hardware-vector-equivalent (in hwpri). Since hwpri 639 * is already shifted, we shift APIC_BASE_VECT before 640 * doing the subtraction. 
641 */ 642 hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT); 643 644 ASSERT(hwpri >= 0); 645 ASSERT(hwpri < MAXIPL); 646 max_ipl = apic_vectortoipl[hwpri]; 647 apic_ipls[apic_ipls_index] = max_ipl; 648 649 irqp = irqheadptr; 650 while (irqp) { 651 irqp->airq_ipl = (uchar_t)max_ipl; 652 irqp = irqp->airq_next; 653 } 654 } else { 655 /* 656 * No more devices on this IRQ, so reset this vector's 657 * element in apic_ipls to the original IPL for this 658 * vector 659 */ 660 apic_ipls[apic_ipls_index] = 661 apic_vectortoipl[vect_inherent_hwpri]; 662 } 663 } 664 665 /* 666 * If there are still active interrupts, we are done. 667 */ 668 if (irqptr->airq_share) 669 return (PSM_SUCCESS); 670 671 iflag = intr_clear(); 672 lock_set(&apic_ioapic_lock); 673 674 if (irqptr->airq_mps_intr_index == MSI_INDEX) { 675 /* 676 * Disable the MSI vector 677 * Make sure we only disable on the last 678 * of the multi-MSI support 679 */ 680 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { 681 apic_pci_msi_disable_mode(irqptr->airq_dip, 682 DDI_INTR_TYPE_MSI); 683 } 684 } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) { 685 /* 686 * Disable the MSI-X vector 687 * needs to clear its mask and addr/data for each MSI-X 688 */ 689 apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX, 690 irqptr->airq_origirq); 691 /* 692 * Make sure we only disable on the last MSI-X 693 */ 694 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { 695 apic_pci_msi_disable_mode(irqptr->airq_dip, 696 DDI_INTR_TYPE_MSIX); 697 } 698 } else { 699 /* 700 * The assumption here is that this is safe, even for 701 * systems with IOAPICs that suffer from the hardware 702 * erratum because all devices have been quiesced before 703 * they unregister their interrupt handlers. If that 704 * assumption turns out to be false, this mask operation 705 * can induce the same erratum result we're trying to 706 * avoid. 
707 */ 708 ioapic_ix = irqptr->airq_ioapicindex; 709 intin = irqptr->airq_intin_no; 710 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK); 711 } 712 713 apic_vt_ops->apic_intrmap_free_entry(&irqptr->airq_intrmap_private); 714 715 /* 716 * This irq entry is the only one in the chain. 717 */ 718 if (irqheadptr->airq_next == NULL) { 719 ASSERT(irqheadptr == irqptr); 720 bind_cpu = irqptr->airq_temp_cpu; 721 if (((uint32_t)bind_cpu != IRQ_UNBOUND) && 722 ((uint32_t)bind_cpu != IRQ_UNINIT)) { 723 ASSERT(apic_cpu_in_range(bind_cpu)); 724 if (bind_cpu & IRQ_USER_BOUND) { 725 /* If hardbound, temp_cpu == cpu */ 726 bind_cpu &= ~IRQ_USER_BOUND; 727 apic_cpus[bind_cpu].aci_bound--; 728 } else 729 apic_cpus[bind_cpu].aci_temp_bound--; 730 } 731 irqptr->airq_temp_cpu = IRQ_UNINIT; 732 irqptr->airq_mps_intr_index = FREE_INDEX; 733 lock_clear(&apic_ioapic_lock); 734 intr_restore(iflag); 735 apic_free_vector(irqptr->airq_vector); 736 return (PSM_SUCCESS); 737 } 738 739 /* 740 * If we get here, we are sharing the vector and there are more than 741 * one active irq entries in the chain. 742 */ 743 lock_clear(&apic_ioapic_lock); 744 intr_restore(iflag); 745 746 mutex_enter(&airq_mutex); 747 /* Remove the irq entry from the chain */ 748 if (irqptr == irqheadptr) { /* The irq entry is at the head */ 749 apic_irq_table[irqindex] = irqptr->airq_next; 750 } else { 751 preirqptr->airq_next = irqptr->airq_next; 752 } 753 /* Free the irq entry */ 754 kmem_free(irqptr, sizeof (apic_irq_t)); 755 mutex_exit(&airq_mutex); 756 757 return (PSM_SUCCESS); 758 } 759 760 /* 761 * apic_introp_xlate() replaces apic_translate_irq() and is 762 * called only from apic_intr_ops(). With the new ADII framework, 763 * the priority can no longer be retrieved through i_ddi_get_intrspec(). 764 * It has to be passed in from the caller. 
765 * 766 * Return value: 767 * Success: irqno for the given device 768 * Failure: -1 769 */ 770 int 771 apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type) 772 { 773 char dev_type[16]; 774 int dev_len, pci_irq, newirq, bustype, devid, busid, i; 775 int irqno = ispec->intrspec_vec; 776 ddi_acc_handle_t cfg_handle; 777 uchar_t ipin; 778 struct apic_io_intr *intrp; 779 iflag_t intr_flag; 780 ACPI_SUBTABLE_HEADER *hp; 781 ACPI_MADT_INTERRUPT_OVERRIDE *isop; 782 apic_irq_t *airqp; 783 int parent_is_pci_or_pciex = 0; 784 int child_is_pciex = 0; 785 786 DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s " 787 "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type, 788 irqno)); 789 790 dev_len = sizeof (dev_type); 791 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip), 792 DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type, 793 &dev_len) == DDI_PROP_SUCCESS) { 794 if ((strcmp(dev_type, "pci") == 0) || 795 (strcmp(dev_type, "pciex") == 0)) 796 parent_is_pci_or_pciex = 1; 797 } 798 799 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, 800 DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type, 801 &dev_len) == DDI_PROP_SUCCESS) { 802 if (strstr(dev_type, "pciex")) 803 child_is_pciex = 1; 804 } 805 806 if (DDI_INTR_IS_MSI_OR_MSIX(type)) { 807 if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) { 808 airqp->airq_iflag.bustype = 809 child_is_pciex ? 
BUS_PCIE : BUS_PCI; 810 return (apic_vector_to_irq[airqp->airq_vector]); 811 } 812 return (apic_setup_irq_table(dip, irqno, NULL, ispec, 813 NULL, type)); 814 } 815 816 bustype = 0; 817 818 /* check if we have already translated this irq */ 819 mutex_enter(&airq_mutex); 820 newirq = apic_min_device_irq; 821 for (; newirq <= apic_max_device_irq; newirq++) { 822 airqp = apic_irq_table[newirq]; 823 while (airqp) { 824 if ((airqp->airq_dip == dip) && 825 (airqp->airq_origirq == irqno) && 826 (airqp->airq_mps_intr_index != FREE_INDEX)) { 827 828 mutex_exit(&airq_mutex); 829 return (VIRTIRQ(newirq, airqp->airq_share_id)); 830 } 831 airqp = airqp->airq_next; 832 } 833 } 834 mutex_exit(&airq_mutex); 835 836 if (apic_defconf) 837 goto defconf; 838 839 if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) 840 goto nonpci; 841 842 if (parent_is_pci_or_pciex) { 843 /* pci device */ 844 if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) 845 goto nonpci; 846 if (busid == 0 && apic_pci_bus_total == 1) 847 busid = (int)apic_single_pci_busid; 848 849 if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) 850 return (-1); 851 ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; 852 pci_config_teardown(&cfg_handle); 853 if (apic_enable_acpi && !apic_use_acpi_madt_only) { 854 if (apic_acpi_translate_pci_irq(dip, busid, devid, 855 ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) 856 return (-1); 857 858 intr_flag.bustype = child_is_pciex ? 
BUS_PCIE : BUS_PCI; 859 return (apic_setup_irq_table(dip, pci_irq, NULL, ispec, 860 &intr_flag, type)); 861 } else { 862 pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); 863 if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) 864 == NULL) { 865 if ((pci_irq = apic_handle_pci_pci_bridge(dip, 866 devid, ipin, &intrp)) == -1) 867 return (-1); 868 } 869 return (apic_setup_irq_table(dip, pci_irq, intrp, ispec, 870 NULL, type)); 871 } 872 } else if (strcmp(dev_type, "isa") == 0) 873 bustype = BUS_ISA; 874 else if (strcmp(dev_type, "eisa") == 0) 875 bustype = BUS_EISA; 876 877 nonpci: 878 if (apic_enable_acpi && !apic_use_acpi_madt_only) { 879 /* search iso entries first */ 880 if (acpi_iso_cnt != 0) { 881 hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; 882 i = 0; 883 while (i < acpi_iso_cnt) { 884 if (hp->Type == 885 ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { 886 isop = 887 (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; 888 if (isop->Bus == 0 && 889 isop->SourceIrq == irqno) { 890 newirq = isop->GlobalIrq; 891 intr_flag.intr_po = 892 isop->IntiFlags & 893 ACPI_MADT_POLARITY_MASK; 894 intr_flag.intr_el = 895 (isop->IntiFlags & 896 ACPI_MADT_TRIGGER_MASK) 897 >> 2; 898 intr_flag.bustype = BUS_ISA; 899 900 return (apic_setup_irq_table( 901 dip, newirq, NULL, ispec, 902 &intr_flag, type)); 903 904 } 905 i++; 906 } 907 hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + 908 hp->Length); 909 } 910 } 911 intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; 912 intr_flag.intr_el = INTR_EL_EDGE; 913 intr_flag.bustype = BUS_ISA; 914 return (apic_setup_irq_table(dip, irqno, NULL, ispec, 915 &intr_flag, type)); 916 } else { 917 if (bustype == 0) /* not initialized */ 918 bustype = eisa_level_intr_mask ? 
BUS_EISA : BUS_ISA; 919 for (i = 0; i < 2; i++) { 920 if (((busid = apic_find_bus_id(bustype)) != -1) && 921 ((intrp = apic_find_io_intr_w_busid(irqno, busid)) 922 != NULL)) { 923 if ((newirq = apic_setup_irq_table(dip, irqno, 924 intrp, ispec, NULL, type)) != -1) { 925 return (newirq); 926 } 927 goto defconf; 928 } 929 bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA; 930 } 931 } 932 933 /* MPS default configuration */ 934 defconf: 935 newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type); 936 if (newirq == -1) 937 return (-1); 938 ASSERT(IRQINDEX(newirq) == irqno); 939 ASSERT(apic_irq_table[irqno]); 940 return (newirq); 941 } 942 943 /* 944 * Attempt to share vector with someone else 945 */ 946 static int 947 apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl, 948 uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp) 949 { 950 #ifdef DEBUG 951 apic_irq_t *tmpirqp = NULL; 952 #endif /* DEBUG */ 953 apic_irq_t *irqptr, dummyirq; 954 int newirq, chosen_irq = -1, share = 127; 955 int lowest, highest, i; 956 uchar_t share_id; 957 958 DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x " 959 "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl)); 960 961 highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK; 962 lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL; 963 964 if (highest < lowest) /* Both ipl and ipl-1 map to same pri */ 965 lowest -= APIC_VECTOR_PER_IPL; 966 dummyirq.airq_mps_intr_index = intr_index; 967 dummyirq.airq_ioapicindex = ioapicindex; 968 dummyirq.airq_intin_no = ipin; 969 if (intr_flagp) 970 dummyirq.airq_iflag = *intr_flagp; 971 apic_record_rdt_entry(&dummyirq, irqno); 972 for (i = lowest; i <= highest; i++) { 973 newirq = apic_vector_to_irq[i]; 974 if (newirq == APIC_RESV_IRQ) 975 continue; 976 irqptr = apic_irq_table[newirq]; 977 978 if ((dummyirq.airq_rdt_entry & 0xFF00) != 979 (irqptr->airq_rdt_entry & 0xFF00)) 980 /* not compatible */ 981 continue; 982 983 if (irqptr->airq_share < share) { 984 
share = irqptr->airq_share; 985 chosen_irq = newirq; 986 } 987 } 988 if (chosen_irq != -1) { 989 /* 990 * Assign a share id which is free or which is larger 991 * than the largest one. 992 */ 993 share_id = 1; 994 mutex_enter(&airq_mutex); 995 irqptr = apic_irq_table[chosen_irq]; 996 while (irqptr) { 997 if (irqptr->airq_mps_intr_index == FREE_INDEX) { 998 share_id = irqptr->airq_share_id; 999 break; 1000 } 1001 if (share_id <= irqptr->airq_share_id) 1002 share_id = irqptr->airq_share_id + 1; 1003 #ifdef DEBUG 1004 tmpirqp = irqptr; 1005 #endif /* DEBUG */ 1006 irqptr = irqptr->airq_next; 1007 } 1008 if (!irqptr) { 1009 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 1010 irqptr->airq_temp_cpu = IRQ_UNINIT; 1011 irqptr->airq_next = 1012 apic_irq_table[chosen_irq]->airq_next; 1013 apic_irq_table[chosen_irq]->airq_next = irqptr; 1014 #ifdef DEBUG 1015 tmpirqp = apic_irq_table[chosen_irq]; 1016 #endif /* DEBUG */ 1017 } 1018 irqptr->airq_mps_intr_index = intr_index; 1019 irqptr->airq_ioapicindex = ioapicindex; 1020 irqptr->airq_intin_no = ipin; 1021 if (intr_flagp) 1022 irqptr->airq_iflag = *intr_flagp; 1023 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; 1024 irqptr->airq_share_id = share_id; 1025 apic_record_rdt_entry(irqptr, irqno); 1026 *irqptrp = irqptr; 1027 #ifdef DEBUG 1028 /* shuffle the pointers to test apic_delspl path */ 1029 if (tmpirqp) { 1030 tmpirqp->airq_next = irqptr->airq_next; 1031 irqptr->airq_next = apic_irq_table[chosen_irq]; 1032 apic_irq_table[chosen_irq] = irqptr; 1033 } 1034 #endif /* DEBUG */ 1035 mutex_exit(&airq_mutex); 1036 return (VIRTIRQ(chosen_irq, share_id)); 1037 } 1038 return (-1); 1039 } 1040 1041 /* 1042 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry 1043 * is used already, we will try to allocate a new irqno. 
1044 * 1045 * Return value: 1046 * Success: irqno 1047 * Failure: -1 1048 */ 1049 static int 1050 apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp, 1051 struct intrspec *ispec, iflag_t *intr_flagp, int type) 1052 { 1053 int origirq = ispec->intrspec_vec; 1054 uchar_t ipl = ispec->intrspec_pri; 1055 int newirq, intr_index; 1056 uchar_t ipin, ioapic, ioapicindex, vector; 1057 apic_irq_t *irqptr; 1058 major_t major; 1059 dev_info_t *sdip; 1060 1061 DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d " 1062 "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq)); 1063 1064 ASSERT(ispec != NULL); 1065 1066 major = (dip != NULL) ? ddi_driver_major(dip) : 0; 1067 1068 if (DDI_INTR_IS_MSI_OR_MSIX(type)) { 1069 /* MSI/X doesn't need to setup ioapic stuffs */ 1070 ioapicindex = 0xff; 1071 ioapic = 0xff; 1072 ipin = (uchar_t)0xff; 1073 intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX : 1074 MSIX_INDEX; 1075 mutex_enter(&airq_mutex); 1076 if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) { 1077 mutex_exit(&airq_mutex); 1078 /* need an irq for MSI/X to index into autovect[] */ 1079 cmn_err(CE_WARN, "No interrupt irq: %s instance %d", 1080 ddi_get_name(dip), ddi_get_instance(dip)); 1081 return (-1); 1082 } 1083 mutex_exit(&airq_mutex); 1084 1085 } else if (intrp != NULL) { 1086 intr_index = (int)(intrp - apic_io_intrp); 1087 ioapic = intrp->intr_destid; 1088 ipin = intrp->intr_destintin; 1089 /* Find ioapicindex. If destid was ALL, we will exit with 0. 
*/ 1090 for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--) 1091 if (apic_io_id[ioapicindex] == ioapic) 1092 break; 1093 ASSERT((ioapic == apic_io_id[ioapicindex]) || 1094 (ioapic == INTR_ALL_APIC)); 1095 1096 /* check whether this intin# has been used by another irqno */ 1097 if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) { 1098 return (newirq); 1099 } 1100 1101 } else if (intr_flagp != NULL) { 1102 /* ACPI case */ 1103 intr_index = ACPI_INDEX; 1104 ioapicindex = acpi_find_ioapic(irqno); 1105 ASSERT(ioapicindex != 0xFF); 1106 ioapic = apic_io_id[ioapicindex]; 1107 ipin = irqno - apic_io_vectbase[ioapicindex]; 1108 if (apic_irq_table[irqno] && 1109 apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) { 1110 ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin && 1111 apic_irq_table[irqno]->airq_ioapicindex == 1112 ioapicindex); 1113 return (irqno); 1114 } 1115 1116 } else { 1117 /* default configuration */ 1118 ioapicindex = 0; 1119 ioapic = apic_io_id[ioapicindex]; 1120 ipin = (uchar_t)irqno; 1121 intr_index = DEFAULT_INDEX; 1122 } 1123 1124 if (ispec == NULL) { 1125 APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n", 1126 irqno)); 1127 } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) { 1128 if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index, 1129 ipl, ioapicindex, ipin, &irqptr)) != -1) { 1130 irqptr->airq_ipl = ipl; 1131 irqptr->airq_origirq = (uchar_t)origirq; 1132 irqptr->airq_dip = dip; 1133 irqptr->airq_major = major; 1134 sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip; 1135 /* This is OK to do really */ 1136 if (sdip == NULL) { 1137 cmn_err(CE_WARN, "Sharing vectors: %s" 1138 " instance %d and SCI", 1139 ddi_get_name(dip), ddi_get_instance(dip)); 1140 } else { 1141 cmn_err(CE_WARN, "Sharing vectors: %s" 1142 " instance %d and %s instance %d", 1143 ddi_get_name(sdip), ddi_get_instance(sdip), 1144 ddi_get_name(dip), ddi_get_instance(dip)); 1145 } 1146 return (newirq); 1147 } 1148 /* try high 
priority allocation now that share has failed */ 1149 if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) { 1150 cmn_err(CE_WARN, "No interrupt vector: %s instance %d", 1151 ddi_get_name(dip), ddi_get_instance(dip)); 1152 return (-1); 1153 } 1154 } 1155 1156 mutex_enter(&airq_mutex); 1157 if (apic_irq_table[irqno] == NULL) { 1158 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 1159 irqptr->airq_temp_cpu = IRQ_UNINIT; 1160 apic_irq_table[irqno] = irqptr; 1161 } else { 1162 irqptr = apic_irq_table[irqno]; 1163 if (irqptr->airq_mps_intr_index != FREE_INDEX) { 1164 /* 1165 * The slot is used by another irqno, so allocate 1166 * a free irqno for this interrupt 1167 */ 1168 newirq = apic_allocate_irq(apic_first_avail_irq); 1169 if (newirq == -1) { 1170 mutex_exit(&airq_mutex); 1171 return (-1); 1172 } 1173 irqno = newirq; 1174 irqptr = apic_irq_table[irqno]; 1175 if (irqptr == NULL) { 1176 irqptr = kmem_zalloc(sizeof (apic_irq_t), 1177 KM_SLEEP); 1178 irqptr->airq_temp_cpu = IRQ_UNINIT; 1179 apic_irq_table[irqno] = irqptr; 1180 } 1181 vector = apic_modify_vector(vector, newirq); 1182 } 1183 } 1184 apic_max_device_irq = max(irqno, apic_max_device_irq); 1185 apic_min_device_irq = min(irqno, apic_min_device_irq); 1186 mutex_exit(&airq_mutex); 1187 irqptr->airq_ioapicindex = ioapicindex; 1188 irqptr->airq_intin_no = ipin; 1189 irqptr->airq_ipl = ipl; 1190 irqptr->airq_vector = vector; 1191 irqptr->airq_origirq = (uchar_t)origirq; 1192 irqptr->airq_share_id = 0; 1193 irqptr->airq_mps_intr_index = (short)intr_index; 1194 irqptr->airq_dip = dip; 1195 irqptr->airq_major = major; 1196 irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin); 1197 if (intr_flagp) 1198 irqptr->airq_iflag = *intr_flagp; 1199 1200 if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { 1201 /* setup I/O APIC entry for non-MSI/X interrupts */ 1202 apic_record_rdt_entry(irqptr, irqno); 1203 } 1204 return (irqno); 1205 } 1206 1207 /* 1208 * return the cpu to which this intr should be bound. 
1209 * Check properties or any other mechanism to see if user wants it 1210 * bound to a specific CPU. If so, return the cpu id with high bit set. 1211 * If not, use the policy to choose a cpu and return the id. 1212 */ 1213 uint32_t 1214 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) 1215 { 1216 int instance, instno, prop_len, bind_cpu, count; 1217 uint_t i, rc; 1218 uint32_t cpu; 1219 major_t major; 1220 char *name, *drv_name, *prop_val, *cptr; 1221 char prop_name[32]; 1222 ulong_t iflag; 1223 1224 1225 if (apic_intr_policy == INTR_LOWEST_PRIORITY) 1226 return (IRQ_UNBOUND); 1227 1228 if (apic_nproc == 1) 1229 return (0); 1230 1231 drv_name = NULL; 1232 rc = DDI_PROP_NOT_FOUND; 1233 major = (major_t)-1; 1234 if (dip != NULL) { 1235 name = ddi_get_name(dip); 1236 major = ddi_name_to_major(name); 1237 drv_name = ddi_major_to_name(major); 1238 instance = ddi_get_instance(dip); 1239 if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { 1240 i = apic_min_device_irq; 1241 for (; i <= apic_max_device_irq; i++) { 1242 1243 if ((i == irq) || (apic_irq_table[i] == NULL) || 1244 (apic_irq_table[i]->airq_mps_intr_index 1245 == FREE_INDEX)) 1246 continue; 1247 1248 if ((apic_irq_table[i]->airq_major == major) && 1249 (!(apic_irq_table[i]->airq_cpu & 1250 IRQ_USER_BOUND))) { 1251 1252 cpu = apic_irq_table[i]->airq_cpu; 1253 1254 cmn_err(CE_CONT, 1255 "!%s: %s (%s) instance #%d " 1256 "irq 0x%x vector 0x%x ioapic 0x%x " 1257 "intin 0x%x is bound to cpu %d\n", 1258 psm_name, 1259 name, drv_name, instance, irq, 1260 apic_irq_table[irq]->airq_vector, 1261 ioapicid, intin, cpu); 1262 return (cpu); 1263 } 1264 } 1265 } 1266 /* 1267 * search for "drvname"_intpt_bind_cpus property first, the 1268 * syntax of the property should be "a[,b,c,...]" where 1269 * instance 0 binds to cpu a, instance 1 binds to cpu b, 1270 * instance 3 binds to cpu c... 
1271 * ddi_getlongprop() will search /option first, then / 1272 * if "drvname"_intpt_bind_cpus doesn't exist, then find 1273 * intpt_bind_cpus property. The syntax is the same, and 1274 * it applies to all the devices if its "drvname" specific 1275 * property doesn't exist 1276 */ 1277 (void) strcpy(prop_name, drv_name); 1278 (void) strcat(prop_name, "_intpt_bind_cpus"); 1279 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, 1280 (caddr_t)&prop_val, &prop_len); 1281 if (rc != DDI_PROP_SUCCESS) { 1282 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, 1283 "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); 1284 } 1285 } 1286 if (rc == DDI_PROP_SUCCESS) { 1287 for (i = count = 0; i < (prop_len - 1); i++) 1288 if (prop_val[i] == ',') 1289 count++; 1290 if (prop_val[i-1] != ',') 1291 count++; 1292 /* 1293 * if somehow the binding instances defined in the 1294 * property are not enough for this instno., then 1295 * reuse the pattern for the next instance until 1296 * it reaches the requested instno 1297 */ 1298 instno = instance % count; 1299 i = 0; 1300 cptr = prop_val; 1301 while (i < instno) 1302 if (*cptr++ == ',') 1303 i++; 1304 bind_cpu = stoi(&cptr); 1305 kmem_free(prop_val, prop_len); 1306 /* if specific CPU is bogus, then default to next cpu */ 1307 if (!apic_cpu_in_range(bind_cpu)) { 1308 cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", 1309 psm_name, prop_name, prop_val, bind_cpu); 1310 rc = DDI_PROP_NOT_FOUND; 1311 } else { 1312 /* indicate that we are bound at user request */ 1313 bind_cpu |= IRQ_USER_BOUND; 1314 } 1315 /* 1316 * no need to check apic_cpus[].aci_status, if specific CPU is 1317 * not up, then post_cpu_start will handle it. 
1318 */ 1319 } 1320 if (rc != DDI_PROP_SUCCESS) { 1321 iflag = intr_clear(); 1322 lock_set(&apic_ioapic_lock); 1323 bind_cpu = apic_get_next_bind_cpu(); 1324 lock_clear(&apic_ioapic_lock); 1325 intr_restore(iflag); 1326 } 1327 1328 if (drv_name != NULL) 1329 cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " 1330 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1331 psm_name, name, drv_name, instance, irq, 1332 apic_irq_table[irq]->airq_vector, ioapicid, intin, 1333 bind_cpu & ~IRQ_USER_BOUND); 1334 else 1335 cmn_err(CE_CONT, "!%s: irq 0x%x " 1336 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", 1337 psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, 1338 intin, bind_cpu & ~IRQ_USER_BOUND); 1339 1340 return ((uint32_t)bind_cpu); 1341 } 1342 1343 /* 1344 * Mark vector as being in the process of being deleted. Interrupts 1345 * may still come in on some CPU. The moment an interrupt comes with 1346 * the new vector, we know we can free the old one. Called only from 1347 * addspl and delspl with interrupts disabled. Because an interrupt 1348 * can be shared, but no interrupt from either device may come in, 1349 * we also use a timeout mechanism, which we arbitrarily set to 1350 * apic_revector_timeout microseconds. 1351 */ 1352 static void 1353 apic_mark_vector(uchar_t oldvector, uchar_t newvector) 1354 { 1355 ulong_t iflag; 1356 1357 iflag = intr_clear(); 1358 lock_set(&apic_revector_lock); 1359 if (!apic_oldvec_to_newvec) { 1360 apic_oldvec_to_newvec = 1361 kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, 1362 KM_NOSLEEP); 1363 1364 if (!apic_oldvec_to_newvec) { 1365 /* 1366 * This failure is not catastrophic. 1367 * But, the oldvec will never be freed. 
1368 */ 1369 apic_error |= APIC_ERR_MARK_VECTOR_FAIL; 1370 lock_clear(&apic_revector_lock); 1371 intr_restore(iflag); 1372 return; 1373 } 1374 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR]; 1375 } 1376 1377 /* See if we already did this for drivers which do double addintrs */ 1378 if (apic_oldvec_to_newvec[oldvector] != newvector) { 1379 apic_oldvec_to_newvec[oldvector] = newvector; 1380 apic_newvec_to_oldvec[newvector] = oldvector; 1381 apic_revector_pending++; 1382 } 1383 lock_clear(&apic_revector_lock); 1384 intr_restore(iflag); 1385 (void) timeout(apic_xlate_vector_free_timeout_handler, 1386 (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout)); 1387 } 1388 1389 /* 1390 * xlate_vector is called from intr_enter if revector_pending is set. 1391 * It will xlate it if needed and mark the old vector as free. 1392 */ 1393 uchar_t 1394 apic_xlate_vector(uchar_t vector) 1395 { 1396 uchar_t newvector, oldvector = 0; 1397 1398 lock_set(&apic_revector_lock); 1399 /* Do we really need to do this ? */ 1400 if (!apic_revector_pending) { 1401 lock_clear(&apic_revector_lock); 1402 return (vector); 1403 } 1404 if ((newvector = apic_oldvec_to_newvec[vector]) != 0) 1405 oldvector = vector; 1406 else { 1407 /* 1408 * The incoming vector is new . See if a stale entry is 1409 * remaining 1410 */ 1411 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0) 1412 newvector = vector; 1413 } 1414 1415 if (oldvector) { 1416 apic_revector_pending--; 1417 apic_oldvec_to_newvec[oldvector] = 0; 1418 apic_newvec_to_oldvec[newvector] = 0; 1419 apic_free_vector(oldvector); 1420 lock_clear(&apic_revector_lock); 1421 /* There could have been more than one reprogramming! 
*/ 1422 return (apic_xlate_vector(newvector)); 1423 } 1424 lock_clear(&apic_revector_lock); 1425 return (vector); 1426 } 1427 1428 void 1429 apic_xlate_vector_free_timeout_handler(void *arg) 1430 { 1431 ulong_t iflag; 1432 uchar_t oldvector, newvector; 1433 1434 oldvector = (uchar_t)(uintptr_t)arg; 1435 iflag = intr_clear(); 1436 lock_set(&apic_revector_lock); 1437 if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) { 1438 apic_free_vector(oldvector); 1439 apic_oldvec_to_newvec[oldvector] = 0; 1440 apic_newvec_to_oldvec[newvector] = 0; 1441 apic_revector_pending--; 1442 } 1443 1444 lock_clear(&apic_revector_lock); 1445 intr_restore(iflag); 1446 } 1447 1448 /* 1449 * Bind interrupt corresponding to irq_ptr to bind_cpu. 1450 * Must be called with interrupts disabled and apic_ioapic_lock held 1451 */ 1452 int 1453 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, 1454 struct ioapic_reprogram_data *drep) 1455 { 1456 int ioapicindex, intin_no; 1457 uint32_t airq_temp_cpu; 1458 apic_cpus_info_t *cpu_infop; 1459 uint32_t rdt_entry; 1460 int which_irq; 1461 ioapic_rdt_t irdt; 1462 1463 which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; 1464 1465 intin_no = irq_ptr->airq_intin_no; 1466 ioapicindex = irq_ptr->airq_ioapicindex; 1467 airq_temp_cpu = irq_ptr->airq_temp_cpu; 1468 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) { 1469 if (airq_temp_cpu & IRQ_USER_BOUND) 1470 /* Mask off high bit so it can be used as array index */ 1471 airq_temp_cpu &= ~IRQ_USER_BOUND; 1472 1473 ASSERT(apic_cpu_in_range(airq_temp_cpu)); 1474 } 1475 1476 /* 1477 * Can't bind to a CPU that's not accepting interrupts: 1478 */ 1479 cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND]; 1480 if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) 1481 return (1); 1482 1483 /* 1484 * If we are about to change the interrupt vector for this interrupt, 1485 * and this interrupt is level-triggered, attached to an IOAPIC, 1486 * has been delivered to a CPU and that CPU has not handled it 
1487 * yet, we cannot reprogram the IOAPIC now. 1488 */ 1489 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { 1490 1491 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, 1492 intin_no); 1493 1494 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && 1495 apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, 1496 bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) { 1497 1498 return (0); 1499 } 1500 1501 /* 1502 * NOTE: We do not unmask the RDT here, as an interrupt MAY 1503 * still come in before we have a chance to reprogram it below. 1504 * The reprogramming below will simultaneously change and 1505 * unmask the RDT entry. 1506 */ 1507 1508 if ((uint32_t)bind_cpu == IRQ_UNBOUND) { 1509 irdt.ir_lo = AV_LDEST | AV_LOPRI | 1510 irq_ptr->airq_rdt_entry; 1511 1512 irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET; 1513 1514 apic_vt_ops->apic_intrmap_alloc_entry( 1515 &irq_ptr->airq_intrmap_private, NULL, 1516 DDI_INTR_TYPE_FIXED, 1, ioapicindex); 1517 apic_vt_ops->apic_intrmap_map_entry( 1518 irq_ptr->airq_intrmap_private, (void *)&irdt, 1519 DDI_INTR_TYPE_FIXED, 1); 1520 apic_vt_ops->apic_intrmap_record_rdt( 1521 irq_ptr->airq_intrmap_private, &irdt); 1522 1523 /* Write the RDT entry -- no specific CPU binding */ 1524 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, 1525 irdt.ir_hi | AV_TOALL); 1526 1527 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != 1528 IRQ_UNBOUND) 1529 apic_cpus[airq_temp_cpu].aci_temp_bound--; 1530 1531 /* 1532 * Write the vector, trigger, and polarity portion of 1533 * the RDT 1534 */ 1535 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, 1536 irdt.ir_lo); 1537 1538 irq_ptr->airq_temp_cpu = IRQ_UNBOUND; 1539 return (0); 1540 } 1541 } 1542 1543 if (bind_cpu & IRQ_USER_BOUND) { 1544 cpu_infop->aci_bound++; 1545 } else { 1546 cpu_infop->aci_temp_bound++; 1547 } 1548 ASSERT(apic_cpu_in_range(bind_cpu)); 1549 1550 if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { 1551 
apic_cpus[airq_temp_cpu].aci_temp_bound--; 1552 } 1553 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { 1554 1555 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; 1556 irdt.ir_hi = cpu_infop->aci_local_id; 1557 1558 apic_vt_ops->apic_intrmap_alloc_entry( 1559 &irq_ptr->airq_intrmap_private, NULL, DDI_INTR_TYPE_FIXED, 1560 1, ioapicindex); 1561 apic_vt_ops->apic_intrmap_map_entry( 1562 irq_ptr->airq_intrmap_private, 1563 (void *)&irdt, DDI_INTR_TYPE_FIXED, 1); 1564 apic_vt_ops->apic_intrmap_record_rdt( 1565 irq_ptr->airq_intrmap_private, &irdt); 1566 1567 /* Write the RDT entry -- bind to a specific CPU: */ 1568 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, 1569 irdt.ir_hi); 1570 1571 /* Write the vector, trigger, and polarity portion of the RDT */ 1572 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, 1573 irdt.ir_lo); 1574 1575 } else { 1576 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? 1577 DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; 1578 if (type == DDI_INTR_TYPE_MSI) { 1579 if (irq_ptr->airq_ioapicindex == 1580 irq_ptr->airq_origirq) { 1581 /* first one */ 1582 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " 1583 "apic_pci_msi_enable_vector\n")); 1584 apic_pci_msi_enable_vector(irq_ptr, 1585 type, which_irq, irq_ptr->airq_vector, 1586 irq_ptr->airq_intin_no, 1587 cpu_infop->aci_local_id); 1588 } 1589 if ((irq_ptr->airq_ioapicindex + 1590 irq_ptr->airq_intin_no - 1) == 1591 irq_ptr->airq_origirq) { /* last one */ 1592 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " 1593 "apic_pci_msi_enable_mode\n")); 1594 apic_pci_msi_enable_mode(irq_ptr->airq_dip, 1595 type, which_irq); 1596 } 1597 } else { /* MSI-X */ 1598 apic_pci_msi_enable_vector(irq_ptr, type, 1599 irq_ptr->airq_origirq, irq_ptr->airq_vector, 1, 1600 cpu_infop->aci_local_id); 1601 apic_pci_msi_enable_mode(irq_ptr->airq_dip, type, 1602 irq_ptr->airq_origirq); 1603 } 1604 } 1605 irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu; 1606 apic_redist_cpu_skip &= ~(1 << 
(bind_cpu & ~IRQ_USER_BOUND)); 1607 return (0); 1608 } 1609 1610 static void 1611 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no) 1612 { 1613 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) 1614 & AV_REMOTE_IRR) != 0) { 1615 /* 1616 * Trying to clear the bit through normal 1617 * channels has failed. So as a last-ditch 1618 * effort, try to set the trigger mode to 1619 * edge, then to level. This has been 1620 * observed to work on many systems. 1621 */ 1622 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1623 intin_no, 1624 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1625 intin_no) & ~AV_LEVEL); 1626 1627 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1628 intin_no, 1629 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1630 intin_no) | AV_LEVEL); 1631 1632 /* 1633 * If the bit's STILL set, this interrupt may 1634 * be hosed. 1635 */ 1636 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1637 intin_no) & AV_REMOTE_IRR) != 0) { 1638 1639 prom_printf("%s: Remote IRR still " 1640 "not clear for IOAPIC %d intin %d.\n" 1641 "\tInterrupts to this pin may cease " 1642 "functioning.\n", psm_name, ioapic_ix, 1643 intin_no); 1644 #ifdef DEBUG 1645 apic_last_ditch_reprogram_failures++; 1646 #endif 1647 } 1648 } 1649 } 1650 1651 /* 1652 * This function is protected by apic_ioapic_lock coupled with the 1653 * fact that interrupts are disabled. 
1654 */ 1655 static void 1656 delete_defer_repro_ent(int which_irq) 1657 { 1658 ASSERT(which_irq >= 0); 1659 ASSERT(which_irq <= 255); 1660 ASSERT(LOCK_HELD(&apic_ioapic_lock)); 1661 1662 if (apic_reprogram_info[which_irq].done) 1663 return; 1664 1665 apic_reprogram_info[which_irq].done = B_TRUE; 1666 1667 #ifdef DEBUG 1668 apic_defer_repro_total_retries += 1669 apic_reprogram_info[which_irq].tries; 1670 1671 apic_defer_repro_successes++; 1672 #endif 1673 1674 if (--apic_reprogram_outstanding == 0) { 1675 1676 setlvlx = psm_intr_exit_fn(); 1677 } 1678 } 1679 1680 1681 /* 1682 * Interrupts must be disabled during this function to prevent 1683 * self-deadlock. Interrupts are disabled because this function 1684 * is called from apic_check_stuck_interrupt(), which is called 1685 * from apic_rebind(), which requires its caller to disable interrupts. 1686 */ 1687 static void 1688 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) 1689 { 1690 ASSERT(which_irq >= 0); 1691 ASSERT(which_irq <= 255); 1692 ASSERT(!interrupts_enabled()); 1693 1694 /* 1695 * On the off-chance that there's already a deferred 1696 * reprogramming on this irq, check, and if so, just update the 1697 * CPU and irq pointer to which the interrupt is targeted, then return. 1698 */ 1699 if (!apic_reprogram_info[which_irq].done) { 1700 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; 1701 apic_reprogram_info[which_irq].irqp = irq_ptr; 1702 return; 1703 } 1704 1705 apic_reprogram_info[which_irq].irqp = irq_ptr; 1706 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; 1707 apic_reprogram_info[which_irq].tries = 0; 1708 /* 1709 * This must be the last thing set, since we're not 1710 * grabbing any locks, apic_try_deferred_reprogram() will 1711 * make its decision about using this entry iff done 1712 * is false. 
1713 */ 1714 apic_reprogram_info[which_irq].done = B_FALSE; 1715 1716 /* 1717 * If there were previously no deferred reprogrammings, change 1718 * setlvlx to call apic_try_deferred_reprogram() 1719 */ 1720 if (++apic_reprogram_outstanding == 1) { 1721 1722 setlvlx = apic_try_deferred_reprogram; 1723 } 1724 } 1725 1726 static void 1727 apic_try_deferred_reprogram(int prev_ipl, int irq) 1728 { 1729 int reproirq; 1730 ulong_t iflag; 1731 struct ioapic_reprogram_data *drep; 1732 1733 (*psm_intr_exit_fn())(prev_ipl, irq); 1734 1735 if (!lock_try(&apic_defer_reprogram_lock)) { 1736 return; 1737 } 1738 1739 /* 1740 * Acquire the apic_ioapic_lock so that any other operations that 1741 * may affect the apic_reprogram_info state are serialized. 1742 * It's still possible for the last deferred reprogramming to clear 1743 * between the time we entered this function and the time we get to 1744 * the for loop below. In that case, *setlvlx will have been set 1745 * back to *_intr_exit and drep will be NULL. (There's no way to 1746 * stop that from happening -- we would need to grab a lock before 1747 * calling *setlvlx, which is neither realistic nor prudent). 1748 */ 1749 iflag = intr_clear(); 1750 lock_set(&apic_ioapic_lock); 1751 1752 /* 1753 * For each deferred RDT entry, try to reprogram it now. Note that 1754 * there is no lock acquisition to read apic_reprogram_info because 1755 * '.done' is set only after the other fields in the structure are set. 1756 */ 1757 1758 drep = NULL; 1759 for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { 1760 if (apic_reprogram_info[reproirq].done == B_FALSE) { 1761 drep = &apic_reprogram_info[reproirq]; 1762 break; 1763 } 1764 } 1765 1766 /* 1767 * Either we found a deferred action to perform, or 1768 * we entered this function spuriously, after *setlvlx 1769 * was restored to point to *_intr_exit. Any other 1770 * permutation is invalid. 
1771 */ 1772 ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); 1773 1774 /* 1775 * Though we can't really do anything about errors 1776 * at this point, keep track of them for reporting. 1777 * Note that it is very possible for apic_setup_io_intr 1778 * to re-register this very timeout if the Remote IRR bit 1779 * has not yet cleared. 1780 */ 1781 1782 #ifdef DEBUG 1783 if (drep != NULL) { 1784 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { 1785 apic_deferred_setup_failures++; 1786 } 1787 } else { 1788 apic_deferred_spurious_enters++; 1789 } 1790 #else 1791 if (drep != NULL) 1792 (void) apic_setup_io_intr(drep, reproirq, B_TRUE); 1793 #endif 1794 1795 lock_clear(&apic_ioapic_lock); 1796 intr_restore(iflag); 1797 1798 lock_clear(&apic_defer_reprogram_lock); 1799 } 1800 1801 static void 1802 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) 1803 { 1804 int waited; 1805 1806 /* 1807 * Wait for the delivery pending bit to clear. 1808 */ 1809 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & 1810 (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { 1811 1812 /* 1813 * If we're still waiting on the delivery of this interrupt, 1814 * continue to wait here until it is delivered (this should be 1815 * a very small amount of time, but include a timeout just in 1816 * case). 1817 */ 1818 for (waited = 0; waited < apic_max_reps_clear_pending; 1819 waited++) { 1820 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1821 intin_no) & AV_PENDING) == 0) { 1822 break; 1823 } 1824 } 1825 } 1826 } 1827 1828 1829 /* 1830 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR 1831 * bit set. Calls functions that modify the function that setlvlx points to, 1832 * so that the reprogramming can be retried very shortly. 1833 * 1834 * This function will mask the RDT entry if the interrupt is level-triggered. 1835 * (The caller is responsible for unmasking the RDT entry.) 
1836 * 1837 * Returns non-zero if the caller should defer IOAPIC reprogramming. 1838 */ 1839 static int 1840 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, 1841 int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, 1842 struct ioapic_reprogram_data *drep) 1843 { 1844 int32_t rdt_entry; 1845 int waited; 1846 int reps = 0; 1847 1848 /* 1849 * Wait for the delivery pending bit to clear. 1850 */ 1851 do { 1852 ++reps; 1853 1854 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); 1855 1856 /* 1857 * Mask the RDT entry, but only if it's a level-triggered 1858 * interrupt 1859 */ 1860 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1861 intin_no); 1862 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { 1863 1864 /* Mask it */ 1865 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, 1866 AV_MASK | rdt_entry); 1867 } 1868 1869 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { 1870 /* 1871 * If there was a race and an interrupt was injected 1872 * just before we masked, check for that case here. 1873 * Then, unmask the RDT entry and try again. If we're 1874 * on our last try, don't unmask (because we want the 1875 * RDT entry to remain masked for the rest of the 1876 * function). 1877 */ 1878 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1879 intin_no); 1880 if ((rdt_entry & AV_PENDING) && 1881 (reps < apic_max_reps_clear_pending)) { 1882 /* Unmask it */ 1883 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1884 intin_no, rdt_entry & ~AV_MASK); 1885 } 1886 } 1887 1888 } while ((rdt_entry & AV_PENDING) && 1889 (reps < apic_max_reps_clear_pending)); 1890 1891 #ifdef DEBUG 1892 if (rdt_entry & AV_PENDING) 1893 apic_intr_deliver_timeouts++; 1894 #endif 1895 1896 /* 1897 * If the remote IRR bit is set, then the interrupt has been sent 1898 * to a CPU for processing. We have no choice but to wait for 1899 * that CPU to process the interrupt, at which point the remote IRR 1900 * bit will be cleared. 
1901 */ 1902 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & 1903 (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) { 1904 1905 /* 1906 * If the CPU that this RDT is bound to is NOT the current 1907 * CPU, wait until that CPU handles the interrupt and ACKs 1908 * it. If this interrupt is not bound to any CPU (that is, 1909 * if it's bound to the logical destination of "anyone"), it 1910 * may have been delivered to the current CPU so handle that 1911 * case by deferring the reprogramming (below). 1912 */ 1913 if ((old_bind_cpu != IRQ_UNBOUND) && 1914 (old_bind_cpu != IRQ_UNINIT) && 1915 (old_bind_cpu != psm_get_cpu_id())) { 1916 for (waited = 0; waited < apic_max_reps_clear_pending; 1917 waited++) { 1918 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1919 intin_no) & AV_REMOTE_IRR) == 0) { 1920 1921 delete_defer_repro_ent(which_irq); 1922 1923 /* Remote IRR has cleared! */ 1924 return (0); 1925 } 1926 } 1927 } 1928 1929 /* 1930 * If we waited and the Remote IRR bit is still not cleared, 1931 * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS 1932 * times for this interrupt, try the last-ditch workaround: 1933 */ 1934 if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) { 1935 1936 apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no); 1937 1938 /* Mark this one as reprogrammed: */ 1939 delete_defer_repro_ent(which_irq); 1940 1941 return (0); 1942 } else { 1943 #ifdef DEBUG 1944 apic_intr_deferrals++; 1945 #endif 1946 1947 /* 1948 * If waiting for the Remote IRR bit (above) didn't 1949 * allow it to clear, defer the reprogramming. 1950 * Add a new deferred-programming entry if the 1951 * caller passed a NULL one (and update the existing one 1952 * in case anything changed). 
1953 */ 1954 add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu); 1955 if (drep) 1956 drep->tries++; 1957 1958 /* Inform caller to defer IOAPIC programming: */ 1959 return (1); 1960 } 1961 1962 } 1963 1964 /* Remote IRR is clear */ 1965 delete_defer_repro_ent(which_irq); 1966 1967 return (0); 1968 } 1969 1970 /* 1971 * Called to migrate all interrupts at an irq to another cpu. 1972 * Must be called with interrupts disabled and apic_ioapic_lock held 1973 */ 1974 int 1975 apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu) 1976 { 1977 apic_irq_t *irqptr = irq_ptr; 1978 int retval = 0; 1979 1980 while (irqptr) { 1981 if (irqptr->airq_temp_cpu != IRQ_UNINIT) 1982 retval |= apic_rebind(irqptr, bind_cpu, NULL); 1983 irqptr = irqptr->airq_next; 1984 } 1985 1986 return (retval); 1987 } 1988 1989 /* 1990 * apic_intr_redistribute does all the messy computations for identifying 1991 * which interrupt to move to which CPU. Currently we do just one interrupt 1992 * at a time. This reduces the time we spent doing all this within clock 1993 * interrupt. When it is done in idle, we could do more than 1. 1994 * First we find the most busy and the most free CPU (time in ISR only) 1995 * skipping those CPUs that has been identified as being ineligible (cpu_skip) 1996 * Then we look for IRQs which are closest to the difference between the 1997 * most busy CPU and the average ISR load. We try to find one whose load 1998 * is less than difference.If none exists, then we chose one larger than the 1999 * difference, provided it does not make the most idle CPU worse than the 2000 * most busy one. In the end, we clear all the busy fields for CPUs. For 2001 * IRQs, they are cleared as they are scanned. 
2002 */ 2003 void 2004 apic_intr_redistribute(void) 2005 { 2006 int busiest_cpu, most_free_cpu; 2007 int cpu_free, cpu_busy, max_busy, min_busy; 2008 int min_free, diff; 2009 int average_busy, cpus_online; 2010 int i, busy; 2011 ulong_t iflag; 2012 apic_cpus_info_t *cpu_infop; 2013 apic_irq_t *min_busy_irq = NULL; 2014 apic_irq_t *max_busy_irq = NULL; 2015 2016 busiest_cpu = most_free_cpu = -1; 2017 cpu_free = cpu_busy = max_busy = average_busy = 0; 2018 min_free = apic_sample_factor_redistribution; 2019 cpus_online = 0; 2020 /* 2021 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu 2022 * without ioapic_lock. That is OK as we are just doing statistical 2023 * sampling anyway and any inaccuracy now will get corrected next time 2024 * The call to rebind which actually changes things will make sure 2025 * we are consistent. 2026 */ 2027 for (i = 0; i < apic_nproc; i++) { 2028 if (apic_cpu_in_range(i) && 2029 !(apic_redist_cpu_skip & (1 << i)) && 2030 (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) { 2031 2032 cpu_infop = &apic_cpus[i]; 2033 /* 2034 * If no unbound interrupts or only 1 total on this 2035 * CPU, skip 2036 */ 2037 if (!cpu_infop->aci_temp_bound || 2038 (cpu_infop->aci_bound + cpu_infop->aci_temp_bound) 2039 == 1) { 2040 apic_redist_cpu_skip |= 1 << i; 2041 continue; 2042 } 2043 2044 busy = cpu_infop->aci_busy; 2045 average_busy += busy; 2046 cpus_online++; 2047 if (max_busy < busy) { 2048 max_busy = busy; 2049 busiest_cpu = i; 2050 } 2051 if (min_free > busy) { 2052 min_free = busy; 2053 most_free_cpu = i; 2054 } 2055 if (busy > apic_int_busy_mark) { 2056 cpu_busy |= 1 << i; 2057 } else { 2058 if (busy < apic_int_free_mark) 2059 cpu_free |= 1 << i; 2060 } 2061 } 2062 } 2063 if ((cpu_busy && cpu_free) || 2064 (max_busy >= (min_free + apic_diff_for_redistribution))) { 2065 2066 apic_num_imbalance++; 2067 #ifdef DEBUG 2068 if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { 2069 prom_printf( 2070 "redistribute busy=%x free=%x max=%x 
min=%x", 2071 cpu_busy, cpu_free, max_busy, min_free); 2072 } 2073 #endif /* DEBUG */ 2074 2075 2076 average_busy /= cpus_online; 2077 2078 diff = max_busy - average_busy; 2079 min_busy = max_busy; /* start with the max possible value */ 2080 max_busy = 0; 2081 min_busy_irq = max_busy_irq = NULL; 2082 i = apic_min_device_irq; 2083 for (; i <= apic_max_device_irq; i++) { 2084 apic_irq_t *irq_ptr; 2085 /* Change to linked list per CPU ? */ 2086 if ((irq_ptr = apic_irq_table[i]) == NULL) 2087 continue; 2088 /* Check for irq_busy & decide which one to move */ 2089 /* Also zero them for next round */ 2090 if ((irq_ptr->airq_temp_cpu == busiest_cpu) && 2091 irq_ptr->airq_busy) { 2092 if (irq_ptr->airq_busy < diff) { 2093 /* 2094 * Check for least busy CPU, 2095 * best fit or what ? 2096 */ 2097 if (max_busy < irq_ptr->airq_busy) { 2098 /* 2099 * Most busy within the 2100 * required differential 2101 */ 2102 max_busy = irq_ptr->airq_busy; 2103 max_busy_irq = irq_ptr; 2104 } 2105 } else { 2106 if (min_busy > irq_ptr->airq_busy) { 2107 /* 2108 * least busy, but more than 2109 * the reqd diff 2110 */ 2111 if (min_busy < 2112 (diff + average_busy - 2113 min_free)) { 2114 /* 2115 * Making sure new cpu 2116 * will not end up 2117 * worse 2118 */ 2119 min_busy = 2120 irq_ptr->airq_busy; 2121 2122 min_busy_irq = irq_ptr; 2123 } 2124 } 2125 } 2126 } 2127 irq_ptr->airq_busy = 0; 2128 } 2129 2130 if (max_busy_irq != NULL) { 2131 #ifdef DEBUG 2132 if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { 2133 prom_printf("rebinding %x to %x", 2134 max_busy_irq->airq_vector, most_free_cpu); 2135 } 2136 #endif /* DEBUG */ 2137 iflag = intr_clear(); 2138 if (lock_try(&apic_ioapic_lock)) { 2139 if (apic_rebind_all(max_busy_irq, 2140 most_free_cpu) == 0) { 2141 /* Make change permenant */ 2142 max_busy_irq->airq_cpu = 2143 (uint32_t)most_free_cpu; 2144 } 2145 lock_clear(&apic_ioapic_lock); 2146 } 2147 intr_restore(iflag); 2148 2149 } else if (min_busy_irq != NULL) { 2150 #ifdef DEBUG 2151 if 
(apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { 2152 prom_printf("rebinding %x to %x", 2153 min_busy_irq->airq_vector, most_free_cpu); 2154 } 2155 #endif /* DEBUG */ 2156 2157 iflag = intr_clear(); 2158 if (lock_try(&apic_ioapic_lock)) { 2159 if (apic_rebind_all(min_busy_irq, 2160 most_free_cpu) == 0) { 2161 /* Make change permenant */ 2162 min_busy_irq->airq_cpu = 2163 (uint32_t)most_free_cpu; 2164 } 2165 lock_clear(&apic_ioapic_lock); 2166 } 2167 intr_restore(iflag); 2168 2169 } else { 2170 if (cpu_busy != (1 << busiest_cpu)) { 2171 apic_redist_cpu_skip |= 1 << busiest_cpu; 2172 /* 2173 * We leave cpu_skip set so that next time we 2174 * can choose another cpu 2175 */ 2176 } 2177 } 2178 apic_num_rebind++; 2179 } else { 2180 /* 2181 * found nothing. Could be that we skipped over valid CPUs 2182 * or we have balanced everything. If we had a variable 2183 * ticks_for_redistribution, it could be increased here. 2184 * apic_int_busy, int_free etc would also need to be 2185 * changed. 2186 */ 2187 if (apic_redist_cpu_skip) 2188 apic_redist_cpu_skip = 0; 2189 } 2190 for (i = 0; i < apic_nproc; i++) { 2191 if (apic_cpu_in_range(i)) { 2192 apic_cpus[i].aci_busy = 0; 2193 } 2194 } 2195 } 2196 2197 void 2198 apic_cleanup_busy(void) 2199 { 2200 int i; 2201 apic_irq_t *irq_ptr; 2202 2203 for (i = 0; i < apic_nproc; i++) { 2204 if (apic_cpu_in_range(i)) { 2205 apic_cpus[i].aci_busy = 0; 2206 } 2207 } 2208 2209 for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) { 2210 if ((irq_ptr = apic_irq_table[i]) != NULL) 2211 irq_ptr->airq_busy = 0; 2212 } 2213 } 2214