/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*
 * Copyright (c) 2018, Joyent, Inc.
 */

/*
 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
 * PSMI 1.5 extensions are supported in Solaris Nevada.
 * PSMI 1.6 extensions are supported in Solaris Nevada.
 * PSMI 1.7 extensions are supported in Solaris Nevada.
 */
#define	PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/inttypes.h>
#include <sys/cram.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/apic_common.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/cyclic.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>
#include <sys/hpet.h>
#include <sys/clock.h>

/*
 * Part of mp_platform_common.c that's used only by pcplusmp & xpv_psm
 * but not apix.
 * These functions may be moved to xpv_psm later when apix and pcplusmp
 * are merged together.
 */
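
/*
 * Locking summary (informal; see the individual routines below):
 * apic_ioapic_lock is always acquired with interrupts disabled
 * (intr_clear()) around I/O APIC RDT reprogramming, while airq_mutex
 * protects the apic_irq_table[] chains themselves.
 */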

/*
 * Local Function Prototypes
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

extern int apic_sci_vect;
extern iflag_t apic_sci_flags;
/* ACPI HPET interrupt configuration; -1 if HPET not used */
extern int apic_hpet_vect;
extern iflag_t apic_hpet_flags;
extern int apic_intr_policy;
extern char *psm_name;

/*
 * Maximum value of an unsigned char (uchar_t).
 */
#define	UCHAR_MAX	UINT8_MAX

/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/* The irq # is implicit in the array index: */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
/*
 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. apic_reprogram_info
 * is indexed by IRQ number, NOT by vector number.
 */

extern int apic_int_busy_mark;
extern int apic_int_free_mark;
extern int apic_diff_for_redistribution;
extern int apic_sample_factor_redistribution;
extern int apic_redist_cpu_skip;
extern int apic_num_imbalance;
extern int apic_num_rebind;

/* timeout for xlate_vector, mark_vector */
int apic_revector_timeout = 16 * 10000;	/* 160 millisec */

extern int apic_defconf;
extern int apic_irq_translate;

extern int apic_use_acpi_madt_only;	/* 1=ONLY use MADT from ACPI */

extern uchar_t apic_io_vectbase[MAX_IO_APIC];

extern boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
164 */ 165 static lock_t apic_defer_reprogram_lock; 166 167 /* 168 * The current number of deferred reprogrammings outstanding 169 */ 170 uint_t apic_reprogram_outstanding = 0; 171 172 #ifdef DEBUG 173 /* 174 * Counters that keep track of deferred reprogramming stats 175 */ 176 uint_t apic_intr_deferrals = 0; 177 uint_t apic_intr_deliver_timeouts = 0; 178 uint_t apic_last_ditch_reprogram_failures = 0; 179 uint_t apic_deferred_setup_failures = 0; 180 uint_t apic_defer_repro_total_retries = 0; 181 uint_t apic_defer_repro_successes = 0; 182 uint_t apic_deferred_spurious_enters = 0; 183 #endif 184 185 extern int apic_io_max; 186 extern struct apic_io_intr *apic_io_intrp; 187 188 uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; 189 190 extern uint32_t eisa_level_intr_mask; 191 /* At least MSB will be set if EISA bus */ 192 193 extern int apic_pci_bus_total; 194 extern uchar_t apic_single_pci_busid; 195 196 /* 197 * Following declarations are for revectoring; used when ISRs at different 198 * IPLs share an irq. 199 */ 200 static lock_t apic_revector_lock; 201 int apic_revector_pending = 0; 202 static uchar_t *apic_oldvec_to_newvec; 203 static uchar_t *apic_newvec_to_oldvec; 204 205 /* ACPI Interrupt Source Override Structure ptr */ 206 extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; 207 extern int acpi_iso_cnt; 208 209 /* 210 * Auto-configuration routines 211 */ 212 213 /* 214 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable 215 * are also set to NULL. vector->irq is set to a value which cannot map 216 * to a real irq to show that it is free. 217 */ 218 void 219 apic_init_common(void) 220 { 221 int i, j, indx; 222 int *iptr; 223 224 /* 225 * Initialize apic_ipls from apic_vectortoipl. This array is 226 * used in apic_intr_enter to determine the IPL to use for the 227 * corresponding vector. On some systems, due to hardware errata 228 * and interrupt sharing, the IPL may not correspond to the IPL listed 229 * in apic_vectortoipl (see apic_addspl and apic_delspl). 230 */ 231 for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { 232 indx = i * APIC_VECTOR_PER_IPL; 233 234 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) 235 apic_ipls[indx] = apic_vectortoipl[i]; 236 } 237 238 /* cpu 0 is always up (for now) */ 239 apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; 240 241 iptr = (int *)&apic_irq_table[0]; 242 for (i = 0; i <= APIC_MAX_VECTOR; i++) { 243 apic_level_intr[i] = 0; 244 *iptr++ = 0; 245 apic_vector_to_irq[i] = APIC_RESV_IRQ; 246 247 /* These *must* be initted to B_TRUE! */ 248 apic_reprogram_info[i].done = B_TRUE; 249 apic_reprogram_info[i].irqp = NULL; 250 apic_reprogram_info[i].tries = 0; 251 apic_reprogram_info[i].bindcpu = 0; 252 } 253 254 /* 255 * Allocate a dummy irq table entry for the reserved entry. 256 * This takes care of the race between removing an irq and 257 * clock detecting a CPU in that irq during interrupt load 258 * sampling. 
	 */
	apic_irq_table[APIC_RESV_IRQ] =
	    kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);

	mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
}

void
ioapic_init_intr(int mask_apic)
{
	int ioapic_ix;
	struct intrspec ispec;
	apic_irq_t *irqptr;
	int i, j;
	ulong_t iflag;

	LOCK_INIT_CLEAR(&apic_revector_lock);
	LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);

	/* mask interrupt vectors */
	for (j = 0; j < apic_io_max && mask_apic; j++) {
		int intin_max;

		ioapic_ix = j;
		/* Bits 23-16 define the maximum redirection entries */
		intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
		    & 0xff;
		for (i = 0; i <= intin_max; i++)
			ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
	}

	/*
	 * Hack alert: deal with ACPI SCI interrupt chicken/egg here
	 */
	if (apic_sci_vect > 0) {
		/*
		 * acpica has already done add_avintr(); we just need
		 * to finish the job by mimicking translate_irq().
		 *
		 * Fake up an intrspec and setup the tables
		 */
		ispec.intrspec_vec = apic_sci_vect;
		ispec.intrspec_pri = SCI_IPL;

		if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
		    &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
			cmn_err(CE_WARN, "!apic: SCI setup failed");
			return;
		}
		irqptr = apic_irq_table[apic_sci_vect];

		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);

		/* Program I/O APIC */
		(void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);

		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);

		irqptr->airq_share++;
	}

	/*
	 * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
	 */
	if (apic_hpet_vect > 0) {
		/*
		 * hpet has already done add_avintr(); we just need
		 * to finish the job by mimicking translate_irq().
		 *
		 * Fake up an intrspec and setup the tables
		 */
		ispec.intrspec_vec = apic_hpet_vect;
		ispec.intrspec_pri = CBE_HIGH_PIL;

		if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL,
		    &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) {
			cmn_err(CE_WARN, "!apic: HPET setup failed");
			return;
		}
		irqptr = apic_irq_table[apic_hpet_vect];

		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);

		/* Program I/O APIC */
		(void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE);

		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);

		irqptr->airq_share++;
	}
}

/*
 * Add mask bits to disable interrupt vector from happening
 * at or above IPL. In addition, it should remove mask bits
 * to enable interrupt vectors below the given IPL.
 *
 * Both add and delspl are complicated by the fact that different interrupts
 * may share IRQs. This can happen in two ways.
 * 1. The same H/W line is shared by more than 1 device
 * 1a. with interrupts at different IPLs
 * 1b. with interrupts at same IPL
 * 2. We ran out of vectors at a given IPL and started sharing vectors.
 * 1b and 2 should be handled gracefully, except for the fact that some ISRs
 * will get called often when no interrupt is pending for the device.
 * For 1a, we handle it at the higher IPL.
 */
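
/*
 * For example (illustrative device names only): if a disk controller
 * handler at IPL 5 and a NIC handler at IPL 6 end up sharing the same
 * I/O APIC pin (case 1a above), addspl upgrades the shared vector so
 * both are delivered at IPL 6, and delspl later downgrades it again
 * once the IPL 6 handler is removed.
 */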
/*ARGSUSED*/
int
apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	ulong_t iflag;
	apic_irq_t *irqptr, *irqheadptr;
	int irqindex;

	ASSERT(max_ipl <= UCHAR_MAX);
	irqindex = IRQINDEX(irqno);

	if ((irqindex == -1) || (!apic_irq_table[irqindex]))
		return (PSM_FAILURE);

	mutex_enter(&airq_mutex);
	irqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		irqptr = irqptr->airq_next;
	}
	irqptr->airq_share++;

	mutex_exit(&airq_mutex);

	/* return if it is not hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	/* Or if there are more interrupts at a higher IPL */
	if (ipl != max_ipl)
		return (PSM_SUCCESS);

	/*
	 * if apic_picinit() has not been called yet, just return.
	 * At the end of apic_picinit(), we will call setup_io_intr().
	 */

	if (!apic_picinit_called)
		return (PSM_SUCCESS);

	/*
	 * Upgrade the vector if max_ipl is not the ipl this irq is already
	 * at. If we cannot allocate, return failure.
	 */
	if (irqptr->airq_ipl != max_ipl &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		vector = apic_allocate_vector(max_ipl, irqindex, 1);
		if (vector == 0) {
			irqptr->airq_share--;
			return (PSM_FAILURE);
		}
		irqptr = irqheadptr;
		apic_mark_vector(irqptr->airq_vector, vector);
		while (irqptr) {
			irqptr->airq_vector = vector;
			irqptr->airq_ipl = (uchar_t)max_ipl;
			/*
			 * reprogram irq being added and every one else
			 * who is not in the UNINIT state
			 */
			if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
			    irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
				apic_record_rdt_entry(irqptr, irqindex);

				iflag = intr_clear();
				lock_set(&apic_ioapic_lock);

				(void) apic_setup_io_intr(irqptr, irqindex,
				    B_FALSE);

				lock_clear(&apic_ioapic_lock);
				intr_restore(iflag);
			}
			irqptr = irqptr->airq_next;
		}
		return (PSM_SUCCESS);

	} else if (irqptr->airq_ipl != max_ipl &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		/*
		 * We cannot upgrade the vector, but we can change
		 * the IPL that this vector induces.
		 *
		 * Note that we subtract APIC_BASE_VECT from the vector
		 * here because this array is used in apic_intr_enter
		 * (no need to add APIC_BASE_VECT in that hot code
		 * path since we can do it in the rarely-executed path
		 * here).
		 */
		apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
		    (uchar_t)max_ipl;

		irqptr = irqheadptr;
		while (irqptr) {
			irqptr->airq_ipl = (uchar_t)max_ipl;
			irqptr = irqptr->airq_next;
		}

		return (PSM_SUCCESS);
	}
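
	/*
	 * The vector and IPL are already what we need (neither branch above
	 * applied), so no revectoring is required; just (re)program the
	 * I/O APIC entry for this irq.
	 */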
	ASSERT(irqptr);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	(void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);

	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	return (PSM_SUCCESS);
}

/*
 * Recompute mask bits for the given interrupt vector.
 * If there is no interrupt servicing routine for this
 * vector, this function should disable the vector from
 * happening at all IPLs. If there are still handlers
 * using the given vector, this function should disable
 * the given vector from happening below the lowest IPL
 * of the remaining handlers.
 */
/*ARGSUSED*/
int
apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	uint32_t bind_cpu;
	int intin, irqindex;
	int ioapic_ix;
	apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp;
	ulong_t iflag;

	mutex_enter(&airq_mutex);
	irqindex = IRQINDEX(irqno);
	irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		preirqptr = irqptr;
		irqptr = irqptr->airq_next;
	}
	ASSERT(irqptr);

	irqptr->airq_share--;

	mutex_exit(&airq_mutex);

	/*
	 * If there are more interrupts at a higher IPL, we don't need
	 * to disable anything.
	 */
	if (ipl < max_ipl)
		return (PSM_SUCCESS);

	/* return if it is not hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	if (!apic_picinit_called) {
		/*
		 * Clear irq_struct. If two devices shared an intpt
		 * line & 1 unloaded before picinit, we are hosed. But, then
		 * we hope the machine survives.
		 */
		irqptr->airq_mps_intr_index = FREE_INDEX;
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}
	/*
	 * Downgrade vector to new max_ipl if needed. If we cannot allocate,
	 * use old IPL. Not very elegant, but it should work.
	 */
	if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		apic_irq_t *irqp;
		if ((vector = apic_allocate_vector(max_ipl, irqno, 1))) {
			apic_mark_vector(irqheadptr->airq_vector, vector);
			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_vector = vector;
				irqp->airq_ipl = (uchar_t)max_ipl;
				if (irqp->airq_temp_cpu != IRQ_UNINIT) {
					apic_record_rdt_entry(irqp, irqindex);

					iflag = intr_clear();
					lock_set(&apic_ioapic_lock);

					(void) apic_setup_io_intr(irqp,
					    irqindex, B_FALSE);

					lock_clear(&apic_ioapic_lock);
					intr_restore(iflag);
				}
				irqp = irqp->airq_next;
			}
		}

	} else if (irqptr->airq_ipl != max_ipl &&
	    max_ipl != PSM_INVALID_IPL &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		/*
		 * We cannot downgrade the IPL of the vector below the
		 * vector's hardware priority. If we did, it would be
		 * possible for a higher-priority hardware vector to
		 * interrupt a CPU running at an IPL lower than the hardware
		 * priority of the interrupting vector (but higher than the
		 * soft IPL of this IRQ). When this happens, we would then
		 * try to drop the IPL BELOW what it was (effectively
		 * dropping below base_spl), which would be potentially
		 * catastrophic.
		 *
		 * (e.g. Suppose the hardware vector associated with this
		 * IRQ is 0x40 (hardware IPL of 4). Further assume that the
		 * old IPL of this IRQ was 4, but the new IPL is 1.
If we forced vector 0x40 to result in 599 * an IPL of 1, it would be possible for the processor to be executing 600 * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting 601 * the currently-executing ISR. When apic_intr_enter consults 602 * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1 603 * so even though the processor was running at IPL 4, an IPL 1 604 * interrupt will have interrupted it, which must not happen)). 605 * 606 * Effectively, this means that the hardware priority corresponding to 607 * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's 608 * hardware priority. 609 * 610 * (In the above example, then, after removal of the IPL 4 device's 611 * interrupt handler, the new IPL will continue to be 4 because the 612 * hardware priority that IPL 1 implies is lower than the hardware 613 * priority of the vector used.) 614 */ 615 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */ 616 const int apic_ipls_index = irqptr->airq_vector - 617 APIC_BASE_VECT; 618 const int vect_inherent_hwpri = irqptr->airq_vector >> 619 APIC_IPL_SHIFT; 620 621 /* 622 * If there are still devices using this IRQ, determine the 623 * new ipl to use. 624 */ 625 if (irqptr->airq_share) { 626 int vect_desired_hwpri, hwpri; 627 628 ASSERT(max_ipl < MAXIPL); 629 vect_desired_hwpri = apic_ipltopri[max_ipl] >> 630 APIC_IPL_SHIFT; 631 632 /* 633 * If the desired IPL's hardware priority is lower 634 * than that of the vector, use the hardware priority 635 * of the vector to determine the new IPL. 636 */ 637 hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ? 638 vect_inherent_hwpri : vect_desired_hwpri; 639 640 /* 641 * Now, to get the right index for apic_vectortoipl, 642 * we need to subtract APIC_BASE_VECT from the 643 * hardware-vector-equivalent (in hwpri). Since hwpri 644 * is already shifted, we shift APIC_BASE_VECT before 645 * doing the subtraction. 646 */ 647 hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT); 648 649 ASSERT(hwpri >= 0); 650 ASSERT(hwpri < MAXIPL); 651 max_ipl = apic_vectortoipl[hwpri]; 652 apic_ipls[apic_ipls_index] = (uchar_t)max_ipl; 653 654 irqp = irqheadptr; 655 while (irqp) { 656 irqp->airq_ipl = (uchar_t)max_ipl; 657 irqp = irqp->airq_next; 658 } 659 } else { 660 /* 661 * No more devices on this IRQ, so reset this vector's 662 * element in apic_ipls to the original IPL for this 663 * vector 664 */ 665 apic_ipls[apic_ipls_index] = 666 apic_vectortoipl[vect_inherent_hwpri]; 667 } 668 } 669 670 /* 671 * If there are still active interrupts, we are done. 
672 */ 673 if (irqptr->airq_share) 674 return (PSM_SUCCESS); 675 676 iflag = intr_clear(); 677 lock_set(&apic_ioapic_lock); 678 679 if (irqptr->airq_mps_intr_index == MSI_INDEX) { 680 /* 681 * Disable the MSI vector 682 * Make sure we only disable on the last 683 * of the multi-MSI support 684 */ 685 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { 686 apic_pci_msi_disable_mode(irqptr->airq_dip, 687 DDI_INTR_TYPE_MSI); 688 } 689 } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) { 690 /* 691 * Disable the MSI-X vector 692 * needs to clear its mask and addr/data for each MSI-X 693 */ 694 apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX, 695 irqptr->airq_origirq); 696 /* 697 * Make sure we only disable on the last MSI-X 698 */ 699 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { 700 apic_pci_msi_disable_mode(irqptr->airq_dip, 701 DDI_INTR_TYPE_MSIX); 702 } 703 } else { 704 /* 705 * The assumption here is that this is safe, even for 706 * systems with IOAPICs that suffer from the hardware 707 * erratum because all devices have been quiesced before 708 * they unregister their interrupt handlers. If that 709 * assumption turns out to be false, this mask operation 710 * can induce the same erratum result we're trying to 711 * avoid. 712 */ 713 ioapic_ix = irqptr->airq_ioapicindex; 714 intin = irqptr->airq_intin_no; 715 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK); 716 } 717 718 apic_vt_ops->apic_intrmap_free_entry(&irqptr->airq_intrmap_private); 719 720 /* 721 * This irq entry is the only one in the chain. 722 */ 723 if (irqheadptr->airq_next == NULL) { 724 ASSERT(irqheadptr == irqptr); 725 bind_cpu = irqptr->airq_temp_cpu; 726 if (((uint32_t)bind_cpu != IRQ_UNBOUND) && 727 ((uint32_t)bind_cpu != IRQ_UNINIT)) { 728 ASSERT(apic_cpu_in_range(bind_cpu)); 729 if (bind_cpu & IRQ_USER_BOUND) { 730 /* If hardbound, temp_cpu == cpu */ 731 bind_cpu &= ~IRQ_USER_BOUND; 732 apic_cpus[bind_cpu].aci_bound--; 733 } else 734 apic_cpus[bind_cpu].aci_temp_bound--; 735 } 736 irqptr->airq_temp_cpu = IRQ_UNINIT; 737 irqptr->airq_mps_intr_index = FREE_INDEX; 738 lock_clear(&apic_ioapic_lock); 739 intr_restore(iflag); 740 apic_free_vector(irqptr->airq_vector); 741 return (PSM_SUCCESS); 742 } 743 744 /* 745 * If we get here, we are sharing the vector and there are more than 746 * one active irq entries in the chain. 747 */ 748 lock_clear(&apic_ioapic_lock); 749 intr_restore(iflag); 750 751 mutex_enter(&airq_mutex); 752 /* Remove the irq entry from the chain */ 753 if (irqptr == irqheadptr) { /* The irq entry is at the head */ 754 apic_irq_table[irqindex] = irqptr->airq_next; 755 } else { 756 preirqptr->airq_next = irqptr->airq_next; 757 } 758 /* Free the irq entry */ 759 kmem_free(irqptr, sizeof (apic_irq_t)); 760 mutex_exit(&airq_mutex); 761 762 return (PSM_SUCCESS); 763 } 764 765 /* 766 * apic_introp_xlate() replaces apic_translate_irq() and is 767 * called only from apic_intr_ops(). With the new ADII framework, 768 * the priority can no longer be retrieved through i_ddi_get_intrspec(). 769 * It has to be passed in from the caller. 
770 * 771 * Return value: 772 * Success: irqno for the given device 773 * Failure: -1 774 */ 775 int 776 apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type) 777 { 778 char dev_type[16]; 779 int dev_len, pci_irq, newirq, bustype, devid, busid, i; 780 int irqno = ispec->intrspec_vec; 781 ddi_acc_handle_t cfg_handle; 782 uchar_t ipin; 783 struct apic_io_intr *intrp; 784 iflag_t intr_flag; 785 ACPI_SUBTABLE_HEADER *hp; 786 ACPI_MADT_INTERRUPT_OVERRIDE *isop; 787 apic_irq_t *airqp; 788 int parent_is_pci_or_pciex = 0; 789 int child_is_pciex = 0; 790 791 DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s " 792 "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type, 793 irqno)); 794 795 dev_len = sizeof (dev_type); 796 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip), 797 DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type, 798 &dev_len) == DDI_PROP_SUCCESS) { 799 if ((strcmp(dev_type, "pci") == 0) || 800 (strcmp(dev_type, "pciex") == 0)) 801 parent_is_pci_or_pciex = 1; 802 } 803 804 if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, 805 DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type, 806 &dev_len) == DDI_PROP_SUCCESS) { 807 if (strstr(dev_type, "pciex")) 808 child_is_pciex = 1; 809 } 810 811 if (DDI_INTR_IS_MSI_OR_MSIX(type)) { 812 if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) { 813 airqp->airq_iflag.bustype = 814 child_is_pciex ? BUS_PCIE : BUS_PCI; 815 return (apic_vector_to_irq[airqp->airq_vector]); 816 } 817 return (apic_setup_irq_table(dip, irqno, NULL, ispec, 818 NULL, type)); 819 } 820 821 bustype = 0; 822 823 /* check if we have already translated this irq */ 824 mutex_enter(&airq_mutex); 825 newirq = apic_min_device_irq; 826 for (; newirq <= apic_max_device_irq; newirq++) { 827 airqp = apic_irq_table[newirq]; 828 while (airqp) { 829 if ((airqp->airq_dip == dip) && 830 (airqp->airq_origirq == irqno) && 831 (airqp->airq_mps_intr_index != FREE_INDEX)) { 832 833 mutex_exit(&airq_mutex); 834 return (VIRTIRQ(newirq, airqp->airq_share_id)); 835 } 836 airqp = airqp->airq_next; 837 } 838 } 839 mutex_exit(&airq_mutex); 840 841 if (apic_defconf) 842 goto defconf; 843 844 if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) 845 goto nonpci; 846 847 if (parent_is_pci_or_pciex) { 848 /* pci device */ 849 if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) 850 goto nonpci; 851 if (busid == 0 && apic_pci_bus_total == 1) 852 busid = (int)apic_single_pci_busid; 853 854 if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) 855 return (-1); 856 ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; 857 pci_config_teardown(&cfg_handle); 858 if (apic_enable_acpi && !apic_use_acpi_madt_only) { 859 if (apic_acpi_translate_pci_irq(dip, busid, devid, 860 ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) 861 return (-1); 862 863 intr_flag.bustype = child_is_pciex ? 
BUS_PCIE : BUS_PCI; 864 return (apic_setup_irq_table(dip, pci_irq, NULL, ispec, 865 &intr_flag, type)); 866 } else { 867 pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); 868 if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) 869 == NULL) { 870 if ((pci_irq = apic_handle_pci_pci_bridge(dip, 871 devid, ipin, &intrp)) == -1) 872 return (-1); 873 } 874 return (apic_setup_irq_table(dip, pci_irq, intrp, ispec, 875 NULL, type)); 876 } 877 } else if (strcmp(dev_type, "isa") == 0) 878 bustype = BUS_ISA; 879 else if (strcmp(dev_type, "eisa") == 0) 880 bustype = BUS_EISA; 881 882 nonpci: 883 if (apic_enable_acpi && !apic_use_acpi_madt_only) { 884 /* search iso entries first */ 885 if (acpi_iso_cnt != 0) { 886 hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; 887 i = 0; 888 while (i < acpi_iso_cnt) { 889 if (hp->Type == 890 ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { 891 isop = 892 (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; 893 if (isop->Bus == 0 && 894 isop->SourceIrq == irqno) { 895 newirq = isop->GlobalIrq; 896 intr_flag.intr_po = 897 isop->IntiFlags & 898 ACPI_MADT_POLARITY_MASK; 899 intr_flag.intr_el = 900 (isop->IntiFlags & 901 ACPI_MADT_TRIGGER_MASK) 902 >> 2; 903 intr_flag.bustype = BUS_ISA; 904 905 return (apic_setup_irq_table( 906 dip, newirq, NULL, ispec, 907 &intr_flag, type)); 908 909 } 910 i++; 911 } 912 hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + 913 hp->Length); 914 } 915 } 916 intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; 917 intr_flag.intr_el = INTR_EL_EDGE; 918 intr_flag.bustype = BUS_ISA; 919 return (apic_setup_irq_table(dip, irqno, NULL, ispec, 920 &intr_flag, type)); 921 } else { 922 if (bustype == 0) /* not initialized */ 923 bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA; 924 for (i = 0; i < 2; i++) { 925 if (((busid = apic_find_bus_id(bustype)) != -1) && 926 ((intrp = apic_find_io_intr_w_busid(irqno, busid)) 927 != NULL)) { 928 if ((newirq = apic_setup_irq_table(dip, irqno, 929 intrp, ispec, NULL, type)) != -1) { 930 return (newirq); 931 } 932 goto defconf; 933 } 934 bustype = (bustype == BUS_EISA) ? 
BUS_ISA : BUS_EISA; 935 } 936 } 937 938 /* MPS default configuration */ 939 defconf: 940 newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type); 941 if (newirq == -1) 942 return (-1); 943 ASSERT(IRQINDEX(newirq) == irqno); 944 ASSERT(apic_irq_table[irqno]); 945 return (newirq); 946 } 947 948 /* 949 * Attempt to share vector with someone else 950 */ 951 static int 952 apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl, 953 uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp) 954 { 955 #ifdef DEBUG 956 apic_irq_t *tmpirqp = NULL; 957 #endif /* DEBUG */ 958 apic_irq_t *irqptr, dummyirq; 959 int newirq, chosen_irq = -1, share = 127; 960 int lowest, highest, i; 961 uchar_t share_id; 962 963 DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x " 964 "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl)); 965 966 highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK; 967 lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL; 968 969 if (highest < lowest) /* Both ipl and ipl-1 map to same pri */ 970 lowest -= APIC_VECTOR_PER_IPL; 971 dummyirq.airq_mps_intr_index = intr_index; 972 dummyirq.airq_ioapicindex = ioapicindex; 973 dummyirq.airq_intin_no = ipin; 974 if (intr_flagp) 975 dummyirq.airq_iflag = *intr_flagp; 976 apic_record_rdt_entry(&dummyirq, irqno); 977 for (i = lowest; i <= highest; i++) { 978 newirq = apic_vector_to_irq[i]; 979 if (newirq == APIC_RESV_IRQ) 980 continue; 981 irqptr = apic_irq_table[newirq]; 982 983 if ((dummyirq.airq_rdt_entry & 0xFF00) != 984 (irqptr->airq_rdt_entry & 0xFF00)) 985 /* not compatible */ 986 continue; 987 988 if (irqptr->airq_share < share) { 989 share = irqptr->airq_share; 990 chosen_irq = newirq; 991 } 992 } 993 if (chosen_irq != -1) { 994 /* 995 * Assign a share id which is free or which is larger 996 * than the largest one. 997 */ 998 share_id = 1; 999 mutex_enter(&airq_mutex); 1000 irqptr = apic_irq_table[chosen_irq]; 1001 while (irqptr) { 1002 if (irqptr->airq_mps_intr_index == FREE_INDEX) { 1003 share_id = irqptr->airq_share_id; 1004 break; 1005 } 1006 if (share_id <= irqptr->airq_share_id) 1007 share_id = irqptr->airq_share_id + 1; 1008 #ifdef DEBUG 1009 tmpirqp = irqptr; 1010 #endif /* DEBUG */ 1011 irqptr = irqptr->airq_next; 1012 } 1013 if (!irqptr) { 1014 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 1015 irqptr->airq_temp_cpu = IRQ_UNINIT; 1016 irqptr->airq_next = 1017 apic_irq_table[chosen_irq]->airq_next; 1018 apic_irq_table[chosen_irq]->airq_next = irqptr; 1019 #ifdef DEBUG 1020 tmpirqp = apic_irq_table[chosen_irq]; 1021 #endif /* DEBUG */ 1022 } 1023 irqptr->airq_mps_intr_index = intr_index; 1024 irqptr->airq_ioapicindex = ioapicindex; 1025 irqptr->airq_intin_no = ipin; 1026 if (intr_flagp) 1027 irqptr->airq_iflag = *intr_flagp; 1028 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; 1029 irqptr->airq_share_id = share_id; 1030 apic_record_rdt_entry(irqptr, irqno); 1031 *irqptrp = irqptr; 1032 #ifdef DEBUG 1033 /* shuffle the pointers to test apic_delspl path */ 1034 if (tmpirqp) { 1035 tmpirqp->airq_next = irqptr->airq_next; 1036 irqptr->airq_next = apic_irq_table[chosen_irq]; 1037 apic_irq_table[chosen_irq] = irqptr; 1038 } 1039 #endif /* DEBUG */ 1040 mutex_exit(&airq_mutex); 1041 return (VIRTIRQ(chosen_irq, share_id)); 1042 } 1043 return (-1); 1044 } 1045 1046 /* 1047 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry 1048 * is used already, we will try to allocate a new irqno. 
 *
 * Return value:
 *	Success: irqno
 *	Failure: -1
 */
static int
apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
    struct intrspec *ispec, iflag_t *intr_flagp, int type)
{
	int origirq;
	uchar_t ipl;
	int newirq, intr_index;
	uchar_t ipin, ioapic, ioapicindex, vector;
	apic_irq_t *irqptr;
	major_t major;
	dev_info_t *sdip;

	ASSERT(ispec != NULL);

	origirq = ispec->intrspec_vec;
	ipl = ispec->intrspec_pri;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
	    "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));

	major = (dip != NULL) ? ddi_driver_major(dip) : 0;

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* MSI/X interrupts don't need any I/O APIC setup */
		ioapicindex = 0xff;
		ioapic = 0xff;
		ipin = (uchar_t)0xff;
		intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
		    MSIX_INDEX;
		mutex_enter(&airq_mutex);
		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
			mutex_exit(&airq_mutex);
			/* need an irq for MSI/X to index into autovect[] */
			cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
		mutex_exit(&airq_mutex);

	} else if (intrp != NULL) {
		intr_index = (int)(intrp - apic_io_intrp);
		ioapic = intrp->intr_destid;
		ipin = intrp->intr_destintin;
		/* Find ioapicindex. If destid was ALL, we will exit with 0. */
		for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
			if (apic_io_id[ioapicindex] == ioapic)
				break;
		ASSERT((ioapic == apic_io_id[ioapicindex]) ||
		    (ioapic == INTR_ALL_APIC));

		/* check whether this intin# has been used by another irqno */
		if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
			return (newirq);
		}

	} else if (intr_flagp != NULL) {
		/* ACPI case */
		intr_index = ACPI_INDEX;
		ioapicindex = acpi_find_ioapic(irqno);
		ASSERT(ioapicindex != 0xFF);
		ioapic = apic_io_id[ioapicindex];
		ipin = irqno - apic_io_vectbase[ioapicindex];
		if (apic_irq_table[irqno] &&
		    apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
			ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
			    apic_irq_table[irqno]->airq_ioapicindex ==
			    ioapicindex);
			return (irqno);
		}

	} else {
		/* default configuration */
		ioapicindex = 0;
		ioapic = apic_io_id[ioapicindex];
		ipin = (uchar_t)irqno;
		intr_index = DEFAULT_INDEX;
	}

	if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
		if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
		    ipl, ioapicindex, ipin, &irqptr)) != -1) {
			irqptr->airq_ipl = ipl;
			irqptr->airq_origirq = (uchar_t)origirq;
			irqptr->airq_dip = dip;
			irqptr->airq_major = major;
			sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
			/* This is OK to do really */
			if (sdip == NULL) {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and SCI",
				    ddi_get_name(dip), ddi_get_instance(dip));
			} else {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and %s instance %d",
				    ddi_get_name(sdip), ddi_get_instance(sdip),
				    ddi_get_name(dip), ddi_get_instance(dip));
			}
			return (newirq);
		}
		/* try high priority allocation now that share has failed */
		if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
1155 cmn_err(CE_WARN, "No interrupt vector: %s instance %d", 1156 ddi_get_name(dip), ddi_get_instance(dip)); 1157 return (-1); 1158 } 1159 } 1160 1161 mutex_enter(&airq_mutex); 1162 if (apic_irq_table[irqno] == NULL) { 1163 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); 1164 irqptr->airq_temp_cpu = IRQ_UNINIT; 1165 apic_irq_table[irqno] = irqptr; 1166 } else { 1167 irqptr = apic_irq_table[irqno]; 1168 if (irqptr->airq_mps_intr_index != FREE_INDEX) { 1169 /* 1170 * The slot is used by another irqno, so allocate 1171 * a free irqno for this interrupt 1172 */ 1173 newirq = apic_allocate_irq(apic_first_avail_irq); 1174 if (newirq == -1) { 1175 mutex_exit(&airq_mutex); 1176 return (-1); 1177 } 1178 irqno = newirq; 1179 irqptr = apic_irq_table[irqno]; 1180 if (irqptr == NULL) { 1181 irqptr = kmem_zalloc(sizeof (apic_irq_t), 1182 KM_SLEEP); 1183 irqptr->airq_temp_cpu = IRQ_UNINIT; 1184 apic_irq_table[irqno] = irqptr; 1185 } 1186 vector = apic_modify_vector(vector, newirq); 1187 } 1188 } 1189 apic_max_device_irq = max(irqno, apic_max_device_irq); 1190 apic_min_device_irq = min(irqno, apic_min_device_irq); 1191 mutex_exit(&airq_mutex); 1192 irqptr->airq_ioapicindex = ioapicindex; 1193 irqptr->airq_intin_no = ipin; 1194 irqptr->airq_ipl = ipl; 1195 irqptr->airq_vector = vector; 1196 irqptr->airq_origirq = (uchar_t)origirq; 1197 irqptr->airq_share_id = 0; 1198 irqptr->airq_mps_intr_index = (short)intr_index; 1199 irqptr->airq_dip = dip; 1200 irqptr->airq_major = major; 1201 irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin); 1202 if (intr_flagp) 1203 irqptr->airq_iflag = *intr_flagp; 1204 1205 if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { 1206 /* setup I/O APIC entry for non-MSI/X interrupts */ 1207 apic_record_rdt_entry(irqptr, irqno); 1208 } 1209 return (irqno); 1210 } 1211 1212 /* 1213 * return the cpu to which this intr should be bound. 1214 * Check properties or any other mechanism to see if user wants it 1215 * bound to a specific CPU. If so, return the cpu id with high bit set. 1216 * If not, use the policy to choose a cpu and return the id. 1217 */ 1218 uint32_t 1219 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) 1220 { 1221 int instance, instno, prop_len, bind_cpu, count; 1222 uint_t i, rc; 1223 uint32_t cpu; 1224 major_t major; 1225 char *name, *drv_name, *prop_val, *cptr; 1226 char prop_name[32]; 1227 ulong_t iflag; 1228 1229 1230 if (apic_intr_policy == INTR_LOWEST_PRIORITY) 1231 return (IRQ_UNBOUND); 1232 1233 if (apic_nproc == 1) 1234 return (0); 1235 1236 /* 1237 * dip may be NULL for interrupts not associated with a device driver, 1238 * such as the ACPI SCI or HPET interrupts. In that case just use the 1239 * next CPU and return. 
	 */
	if (dip == NULL) {
		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);
		bind_cpu = apic_get_next_bind_cpu();
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);

		cmn_err(CE_CONT, "!%s: irq 0x%x "
		    "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
		    psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
		    intin, bind_cpu & ~IRQ_USER_BOUND);

		return ((uint32_t)bind_cpu);
	}

	name = ddi_get_name(dip);
	major = ddi_name_to_major(name);
	drv_name = ddi_major_to_name(major);
	instance = ddi_get_instance(dip);
	if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
		i = apic_min_device_irq;
		for (; i <= apic_max_device_irq; i++) {
			if ((i == irq) || (apic_irq_table[i] == NULL) ||
			    (apic_irq_table[i]->airq_mps_intr_index
			    == FREE_INDEX))
				continue;

			if ((apic_irq_table[i]->airq_major == major) &&
			    (!(apic_irq_table[i]->airq_cpu & IRQ_USER_BOUND))) {
				cpu = apic_irq_table[i]->airq_cpu;

				cmn_err(CE_CONT,
				    "!%s: %s (%s) instance #%d "
				    "irq 0x%x vector 0x%x ioapic 0x%x "
				    "intin 0x%x is bound to cpu %d\n",
				    psm_name,
				    name, drv_name, instance, irq,
				    apic_irq_table[irq]->airq_vector,
				    ioapicid, intin, cpu);
				return (cpu);
			}
		}
	}
	/*
	 * search for the "drvname"_intpt_bind_cpus property first; the
	 * syntax of the property should be "a[,b,c,...]" where
	 * instance 0 binds to cpu a, instance 1 binds to cpu b,
	 * instance 2 binds to cpu c, and so on.
	 * ddi_getlongprop() will search /option first, then /.
	 * If the "drvname"_intpt_bind_cpus property doesn't exist, then
	 * look for the intpt_bind_cpus property. The syntax is the same,
	 * and it applies to all devices whose "drvname"-specific property
	 * doesn't exist.
	 */
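	/*
	 * For example (hypothetical driver name "xyz"): a property value of
	 * xyz_intpt_bind_cpus="2,5,6" binds instance 0 to CPU 2, instance 1
	 * to CPU 5, instance 2 to CPU 6, and instance 3 back to CPU 2 again,
	 * since the pattern is reused once the list is exhausted (see the
	 * instno computation below).
	 */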
	(void) strcpy(prop_name, drv_name);
	(void) strcat(prop_name, "_intpt_bind_cpus");
	rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
	    (caddr_t)&prop_val, &prop_len);
	if (rc != DDI_PROP_SUCCESS) {
		rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
		    "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
	}
	if (rc == DDI_PROP_SUCCESS) {
		for (i = count = 0; i < (prop_len - 1); i++)
			if (prop_val[i] == ',')
				count++;
		if (prop_val[i-1] != ',')
			count++;
		/*
		 * if somehow the binding instances defined in the
		 * property are not enough for this instno., then
		 * reuse the pattern for the next instance until
		 * it reaches the requested instno
		 */
		instno = instance % count;
		i = 0;
		cptr = prop_val;
		while (i < instno)
			if (*cptr++ == ',')
				i++;
		bind_cpu = stoi(&cptr);
		/* if specific CPU is bogus, then default to next cpu */
		if (!apic_cpu_in_range(bind_cpu)) {
			cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
			    psm_name, prop_name, prop_val, bind_cpu);
			rc = DDI_PROP_NOT_FOUND;
		} else {
			/* indicate that we are bound at user request */
			bind_cpu |= IRQ_USER_BOUND;
		}
		/* free prop_val only after it is no longer referenced */
		kmem_free(prop_val, prop_len);
		/*
		 * no need to check apic_cpus[].aci_status, if specific CPU is
		 * not up, then post_cpu_start will handle it.
		 */
	}

	if (rc != DDI_PROP_SUCCESS) {
		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);
		bind_cpu = apic_get_next_bind_cpu();
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
	}

	cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
	    "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
	    psm_name, name, drv_name, instance, irq,
	    apic_irq_table[irq]->airq_vector, ioapicid, intin,
	    bind_cpu & ~IRQ_USER_BOUND);

	return ((uint32_t)bind_cpu);
}

/*
 * Mark vector as being in the process of being deleted. Interrupts
 * may still come in on some CPU. The moment an interrupt comes with
 * the new vector, we know we can free the old one. Called only from
 * addspl and delspl with interrupts disabled. Because an interrupt
 * can be shared, but no interrupt from either device may come in,
 * we also use a timeout mechanism, which we arbitrarily set to
 * apic_revector_timeout microseconds.
 */
static void
apic_mark_vector(uchar_t oldvector, uchar_t newvector)
{
	ulong_t iflag;

	iflag = intr_clear();
	lock_set(&apic_revector_lock);
	if (!apic_oldvec_to_newvec) {
		apic_oldvec_to_newvec =
		    kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
		    KM_NOSLEEP);

		if (!apic_oldvec_to_newvec) {
			/*
			 * This failure is not catastrophic.
			 * But, the oldvec will never be freed.
			 */
			apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
			lock_clear(&apic_revector_lock);
			intr_restore(iflag);
			return;
		}
		apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
	}

	/* See if we already did this for drivers which do double addintrs */
	if (apic_oldvec_to_newvec[oldvector] != newvector) {
		apic_oldvec_to_newvec[oldvector] = newvector;
		apic_newvec_to_oldvec[newvector] = oldvector;
		apic_revector_pending++;
	}
	lock_clear(&apic_revector_lock);
	intr_restore(iflag);
	(void) timeout(apic_xlate_vector_free_timeout_handler,
	    (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
}

/*
 * xlate_vector is called from intr_enter if revector_pending is set.
 * It will xlate it if needed and mark the old vector as free.
 */
uchar_t
apic_xlate_vector(uchar_t vector)
{
	uchar_t newvector, oldvector = 0;

	lock_set(&apic_revector_lock);
	/* Do we really need to do this ? */
	if (!apic_revector_pending) {
		lock_clear(&apic_revector_lock);
		return (vector);
	}
	if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
		oldvector = vector;
	else {
		/*
		 * The incoming vector is new. See if a stale entry
		 * remains for it.
		 */
		if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
			newvector = vector;
	}

	if (oldvector) {
		apic_revector_pending--;
		apic_oldvec_to_newvec[oldvector] = 0;
		apic_newvec_to_oldvec[newvector] = 0;
		apic_free_vector(oldvector);
		lock_clear(&apic_revector_lock);
		/* There could have been more than one reprogramming!
*/ 1434 return (apic_xlate_vector(newvector)); 1435 } 1436 lock_clear(&apic_revector_lock); 1437 return (vector); 1438 } 1439 1440 void 1441 apic_xlate_vector_free_timeout_handler(void *arg) 1442 { 1443 ulong_t iflag; 1444 uchar_t oldvector, newvector; 1445 1446 oldvector = (uchar_t)(uintptr_t)arg; 1447 iflag = intr_clear(); 1448 lock_set(&apic_revector_lock); 1449 if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) { 1450 apic_free_vector(oldvector); 1451 apic_oldvec_to_newvec[oldvector] = 0; 1452 apic_newvec_to_oldvec[newvector] = 0; 1453 apic_revector_pending--; 1454 } 1455 1456 lock_clear(&apic_revector_lock); 1457 intr_restore(iflag); 1458 } 1459 1460 /* 1461 * Bind interrupt corresponding to irq_ptr to bind_cpu. 1462 * Must be called with interrupts disabled and apic_ioapic_lock held 1463 */ 1464 int 1465 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, 1466 struct ioapic_reprogram_data *drep) 1467 { 1468 int ioapicindex, intin_no; 1469 uint32_t airq_temp_cpu; 1470 apic_cpus_info_t *cpu_infop; 1471 uint32_t rdt_entry; 1472 int which_irq; 1473 ioapic_rdt_t irdt; 1474 1475 which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; 1476 1477 intin_no = irq_ptr->airq_intin_no; 1478 ioapicindex = irq_ptr->airq_ioapicindex; 1479 airq_temp_cpu = irq_ptr->airq_temp_cpu; 1480 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) { 1481 if (airq_temp_cpu & IRQ_USER_BOUND) 1482 /* Mask off high bit so it can be used as array index */ 1483 airq_temp_cpu &= ~IRQ_USER_BOUND; 1484 1485 ASSERT(apic_cpu_in_range(airq_temp_cpu)); 1486 } 1487 1488 /* 1489 * Can't bind to a CPU that's not accepting interrupts: 1490 */ 1491 cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND]; 1492 if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) 1493 return (1); 1494 1495 /* 1496 * If we are about to change the interrupt vector for this interrupt, 1497 * and this interrupt is level-triggered, attached to an IOAPIC, 1498 * has been delivered to a CPU and that CPU has not handled it 1499 * yet, we cannot reprogram the IOAPIC now. 1500 */ 1501 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { 1502 1503 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, 1504 intin_no); 1505 1506 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && 1507 apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, 1508 bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) { 1509 1510 return (0); 1511 } 1512 1513 /* 1514 * NOTE: We do not unmask the RDT here, as an interrupt MAY 1515 * still come in before we have a chance to reprogram it below. 1516 * The reprogramming below will simultaneously change and 1517 * unmask the RDT entry. 
1518 */ 1519 1520 if ((uint32_t)bind_cpu == IRQ_UNBOUND) { 1521 irdt.ir_lo = AV_LDEST | AV_LOPRI | 1522 irq_ptr->airq_rdt_entry; 1523 1524 irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET; 1525 1526 apic_vt_ops->apic_intrmap_alloc_entry( 1527 &irq_ptr->airq_intrmap_private, NULL, 1528 DDI_INTR_TYPE_FIXED, 1, ioapicindex); 1529 apic_vt_ops->apic_intrmap_map_entry( 1530 irq_ptr->airq_intrmap_private, (void *)&irdt, 1531 DDI_INTR_TYPE_FIXED, 1); 1532 apic_vt_ops->apic_intrmap_record_rdt( 1533 irq_ptr->airq_intrmap_private, &irdt); 1534 1535 /* Write the RDT entry -- no specific CPU binding */ 1536 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, 1537 irdt.ir_hi | AV_TOALL); 1538 1539 if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != 1540 IRQ_UNBOUND) 1541 apic_cpus[airq_temp_cpu].aci_temp_bound--; 1542 1543 /* 1544 * Write the vector, trigger, and polarity portion of 1545 * the RDT 1546 */ 1547 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, 1548 irdt.ir_lo); 1549 1550 irq_ptr->airq_temp_cpu = IRQ_UNBOUND; 1551 return (0); 1552 } 1553 } 1554 1555 if (bind_cpu & IRQ_USER_BOUND) { 1556 cpu_infop->aci_bound++; 1557 } else { 1558 cpu_infop->aci_temp_bound++; 1559 } 1560 ASSERT(apic_cpu_in_range(bind_cpu)); 1561 1562 if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { 1563 apic_cpus[airq_temp_cpu].aci_temp_bound--; 1564 } 1565 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { 1566 1567 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; 1568 irdt.ir_hi = cpu_infop->aci_local_id; 1569 1570 apic_vt_ops->apic_intrmap_alloc_entry( 1571 &irq_ptr->airq_intrmap_private, NULL, DDI_INTR_TYPE_FIXED, 1572 1, ioapicindex); 1573 apic_vt_ops->apic_intrmap_map_entry( 1574 irq_ptr->airq_intrmap_private, 1575 (void *)&irdt, DDI_INTR_TYPE_FIXED, 1); 1576 apic_vt_ops->apic_intrmap_record_rdt( 1577 irq_ptr->airq_intrmap_private, &irdt); 1578 1579 /* Write the RDT entry -- bind to a specific CPU: */ 1580 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, 1581 irdt.ir_hi); 1582 1583 /* Write the vector, trigger, and polarity portion of the RDT */ 1584 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, 1585 irdt.ir_lo); 1586 1587 } else { 1588 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? 
1589 DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; 1590 if (type == DDI_INTR_TYPE_MSI) { 1591 if (irq_ptr->airq_ioapicindex == 1592 irq_ptr->airq_origirq) { 1593 /* first one */ 1594 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " 1595 "apic_pci_msi_enable_vector\n")); 1596 apic_pci_msi_enable_vector(irq_ptr, 1597 type, which_irq, irq_ptr->airq_vector, 1598 irq_ptr->airq_intin_no, 1599 cpu_infop->aci_local_id); 1600 } 1601 if ((irq_ptr->airq_ioapicindex + 1602 irq_ptr->airq_intin_no - 1) == 1603 irq_ptr->airq_origirq) { /* last one */ 1604 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " 1605 "apic_pci_msi_enable_mode\n")); 1606 apic_pci_msi_enable_mode(irq_ptr->airq_dip, 1607 type, which_irq); 1608 } 1609 } else { /* MSI-X */ 1610 apic_pci_msi_enable_vector(irq_ptr, type, 1611 irq_ptr->airq_origirq, irq_ptr->airq_vector, 1, 1612 cpu_infop->aci_local_id); 1613 apic_pci_msi_enable_mode(irq_ptr->airq_dip, type, 1614 irq_ptr->airq_origirq); 1615 } 1616 } 1617 irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu; 1618 apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND)); 1619 return (0); 1620 } 1621 1622 static void 1623 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no) 1624 { 1625 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) 1626 & AV_REMOTE_IRR) != 0) { 1627 /* 1628 * Trying to clear the bit through normal 1629 * channels has failed. So as a last-ditch 1630 * effort, try to set the trigger mode to 1631 * edge, then to level. This has been 1632 * observed to work on many systems. 1633 */ 1634 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1635 intin_no, 1636 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1637 intin_no) & ~AV_LEVEL); 1638 1639 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1640 intin_no, 1641 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1642 intin_no) | AV_LEVEL); 1643 1644 /* 1645 * If the bit's STILL set, this interrupt may 1646 * be hosed. 1647 */ 1648 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1649 intin_no) & AV_REMOTE_IRR) != 0) { 1650 1651 prom_printf("%s: Remote IRR still " 1652 "not clear for IOAPIC %d intin %d.\n" 1653 "\tInterrupts to this pin may cease " 1654 "functioning.\n", psm_name, ioapic_ix, 1655 intin_no); 1656 #ifdef DEBUG 1657 apic_last_ditch_reprogram_failures++; 1658 #endif 1659 } 1660 } 1661 } 1662 1663 /* 1664 * This function is protected by apic_ioapic_lock coupled with the 1665 * fact that interrupts are disabled. 1666 */ 1667 static void 1668 delete_defer_repro_ent(int which_irq) 1669 { 1670 ASSERT(which_irq >= 0); 1671 ASSERT(which_irq <= 255); 1672 ASSERT(LOCK_HELD(&apic_ioapic_lock)); 1673 1674 if (apic_reprogram_info[which_irq].done) 1675 return; 1676 1677 apic_reprogram_info[which_irq].done = B_TRUE; 1678 1679 #ifdef DEBUG 1680 apic_defer_repro_total_retries += 1681 apic_reprogram_info[which_irq].tries; 1682 1683 apic_defer_repro_successes++; 1684 #endif 1685 1686 if (--apic_reprogram_outstanding == 0) { 1687 1688 setlvlx = psm_intr_exit_fn(); 1689 } 1690 } 1691 1692 1693 /* 1694 * Interrupts must be disabled during this function to prevent 1695 * self-deadlock. Interrupts are disabled because this function 1696 * is called from apic_check_stuck_interrupt(), which is called 1697 * from apic_rebind(), which requires its caller to disable interrupts. 
1698 */ 1699 static void 1700 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) 1701 { 1702 ASSERT(which_irq >= 0); 1703 ASSERT(which_irq <= 255); 1704 ASSERT(!interrupts_enabled()); 1705 1706 /* 1707 * On the off-chance that there's already a deferred 1708 * reprogramming on this irq, check, and if so, just update the 1709 * CPU and irq pointer to which the interrupt is targeted, then return. 1710 */ 1711 if (!apic_reprogram_info[which_irq].done) { 1712 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; 1713 apic_reprogram_info[which_irq].irqp = irq_ptr; 1714 return; 1715 } 1716 1717 apic_reprogram_info[which_irq].irqp = irq_ptr; 1718 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; 1719 apic_reprogram_info[which_irq].tries = 0; 1720 /* 1721 * This must be the last thing set, since we're not 1722 * grabbing any locks, apic_try_deferred_reprogram() will 1723 * make its decision about using this entry iff done 1724 * is false. 1725 */ 1726 apic_reprogram_info[which_irq].done = B_FALSE; 1727 1728 /* 1729 * If there were previously no deferred reprogrammings, change 1730 * setlvlx to call apic_try_deferred_reprogram() 1731 */ 1732 if (++apic_reprogram_outstanding == 1) { 1733 1734 setlvlx = apic_try_deferred_reprogram; 1735 } 1736 } 1737 1738 static void 1739 apic_try_deferred_reprogram(int prev_ipl, int irq) 1740 { 1741 int reproirq; 1742 ulong_t iflag; 1743 struct ioapic_reprogram_data *drep; 1744 1745 (*psm_intr_exit_fn())(prev_ipl, irq); 1746 1747 if (!lock_try(&apic_defer_reprogram_lock)) { 1748 return; 1749 } 1750 1751 /* 1752 * Acquire the apic_ioapic_lock so that any other operations that 1753 * may affect the apic_reprogram_info state are serialized. 1754 * It's still possible for the last deferred reprogramming to clear 1755 * between the time we entered this function and the time we get to 1756 * the for loop below. In that case, *setlvlx will have been set 1757 * back to *_intr_exit and drep will be NULL. (There's no way to 1758 * stop that from happening -- we would need to grab a lock before 1759 * calling *setlvlx, which is neither realistic nor prudent). 1760 */ 1761 iflag = intr_clear(); 1762 lock_set(&apic_ioapic_lock); 1763 1764 /* 1765 * For each deferred RDT entry, try to reprogram it now. Note that 1766 * there is no lock acquisition to read apic_reprogram_info because 1767 * '.done' is set only after the other fields in the structure are set. 1768 */ 1769 1770 drep = NULL; 1771 for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { 1772 if (apic_reprogram_info[reproirq].done == B_FALSE) { 1773 drep = &apic_reprogram_info[reproirq]; 1774 break; 1775 } 1776 } 1777 1778 /* 1779 * Either we found a deferred action to perform, or 1780 * we entered this function spuriously, after *setlvlx 1781 * was restored to point to *_intr_exit. Any other 1782 * permutation is invalid. 1783 */ 1784 ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); 1785 1786 /* 1787 * Though we can't really do anything about errors 1788 * at this point, keep track of them for reporting. 1789 * Note that it is very possible for apic_setup_io_intr 1790 * to re-register this very timeout if the Remote IRR bit 1791 * has not yet cleared. 
1792 */ 1793 1794 #ifdef DEBUG 1795 if (drep != NULL) { 1796 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { 1797 apic_deferred_setup_failures++; 1798 } 1799 } else { 1800 apic_deferred_spurious_enters++; 1801 } 1802 #else 1803 if (drep != NULL) 1804 (void) apic_setup_io_intr(drep, reproirq, B_TRUE); 1805 #endif 1806 1807 lock_clear(&apic_ioapic_lock); 1808 intr_restore(iflag); 1809 1810 lock_clear(&apic_defer_reprogram_lock); 1811 } 1812 1813 static void 1814 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) 1815 { 1816 int waited; 1817 1818 /* 1819 * Wait for the delivery pending bit to clear. 1820 */ 1821 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & 1822 (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { 1823 1824 /* 1825 * If we're still waiting on the delivery of this interrupt, 1826 * continue to wait here until it is delivered (this should be 1827 * a very small amount of time, but include a timeout just in 1828 * case). 1829 */ 1830 for (waited = 0; waited < apic_max_reps_clear_pending; 1831 waited++) { 1832 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1833 intin_no) & AV_PENDING) == 0) { 1834 break; 1835 } 1836 } 1837 } 1838 } 1839 1840 1841 /* 1842 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR 1843 * bit set. Calls functions that modify the function that setlvlx points to, 1844 * so that the reprogramming can be retried very shortly. 1845 * 1846 * This function will mask the RDT entry if the interrupt is level-triggered. 1847 * (The caller is responsible for unmasking the RDT entry.) 1848 * 1849 * Returns non-zero if the caller should defer IOAPIC reprogramming. 1850 */ 1851 static int 1852 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, 1853 int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, 1854 struct ioapic_reprogram_data *drep) 1855 { 1856 int32_t rdt_entry; 1857 int waited; 1858 int reps = 0; 1859 1860 /* 1861 * Wait for the delivery pending bit to clear. 1862 */ 1863 do { 1864 ++reps; 1865 1866 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); 1867 1868 /* 1869 * Mask the RDT entry, but only if it's a level-triggered 1870 * interrupt 1871 */ 1872 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1873 intin_no); 1874 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { 1875 1876 /* Mask it */ 1877 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, 1878 AV_MASK | rdt_entry); 1879 } 1880 1881 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { 1882 /* 1883 * If there was a race and an interrupt was injected 1884 * just before we masked, check for that case here. 1885 * Then, unmask the RDT entry and try again. If we're 1886 * on our last try, don't unmask (because we want the 1887 * RDT entry to remain masked for the rest of the 1888 * function). 1889 */ 1890 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1891 intin_no); 1892 if ((rdt_entry & AV_PENDING) && 1893 (reps < apic_max_reps_clear_pending)) { 1894 /* Unmask it */ 1895 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, 1896 intin_no, rdt_entry & ~AV_MASK); 1897 } 1898 } 1899 1900 } while ((rdt_entry & AV_PENDING) && 1901 (reps < apic_max_reps_clear_pending)); 1902 1903 #ifdef DEBUG 1904 if (rdt_entry & AV_PENDING) 1905 apic_intr_deliver_timeouts++; 1906 #endif 1907 1908 /* 1909 * If the remote IRR bit is set, then the interrupt has been sent 1910 * to a CPU for processing. We have no choice but to wait for 1911 * that CPU to process the interrupt, at which point the remote IRR 1912 * bit will be cleared. 
	 */
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
	    (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {

		/*
		 * If the CPU that this RDT is bound to is NOT the current
		 * CPU, wait until that CPU handles the interrupt and ACKs
		 * it.  If this interrupt is not bound to any CPU (that is,
		 * if it's bound to the logical destination of "anyone"), it
		 * may have been delivered to the current CPU, so handle that
		 * case by deferring the reprogramming (below).
		 */
		if ((old_bind_cpu != IRQ_UNBOUND) &&
		    (old_bind_cpu != IRQ_UNINIT) &&
		    (old_bind_cpu != psm_get_cpu_id())) {
			for (waited = 0; waited < apic_max_reps_clear_pending;
			    waited++) {
				if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
				    intin_no) & AV_REMOTE_IRR) == 0) {

					delete_defer_repro_ent(which_irq);

					/* Remote IRR has cleared! */
					return (0);
				}
			}
		}

		/*
		 * If we waited and the Remote IRR bit is still not cleared,
		 * AND if we've already retried the reprogramming
		 * APIC_REPROGRAM_MAX_TRIES times for this interrupt, try the
		 * last-ditch workaround:
		 */
		if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {

			apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);

			/* Mark this one as reprogrammed: */
			delete_defer_repro_ent(which_irq);

			return (0);
		} else {
#ifdef DEBUG
			apic_intr_deferrals++;
#endif

			/*
			 * If waiting for the Remote IRR bit (above) didn't
			 * allow it to clear, defer the reprogramming.
			 * Add a new deferred-programming entry if the caller
			 * passed a NULL one (and update the existing one in
			 * case anything changed).
			 */
			add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
			if (drep)
				drep->tries++;

			/* Inform caller to defer IOAPIC programming: */
			return (1);
		}
	}

	/* Remote IRR is clear */
	delete_defer_repro_ent(which_irq);

	return (0);
}

/*
 * Called to migrate all interrupts at an irq to another cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held.
 */
int
apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
{
	apic_irq_t *irqptr = irq_ptr;
	int retval = 0;

	while (irqptr) {
		if (irqptr->airq_temp_cpu != IRQ_UNINIT)
			retval |= apic_rebind(irqptr, bind_cpu, NULL);
		irqptr = irqptr->airq_next;
	}

	return (retval);
}

/*
 * apic_intr_redistribute does all the messy computations for identifying
 * which interrupt to move to which CPU.  Currently we move just one
 * interrupt at a time; this keeps the time spent inside the clock interrupt
 * small.  If this were done from the idle loop instead, we could move more
 * than one.
 * First we find the most busy and the most free CPU (based on time spent in
 * ISRs only), skipping CPUs that have been marked ineligible (cpu_skip).
 * Then we look for the IRQ whose load is closest to the difference between
 * the most busy CPU and the average ISR load.  We prefer one whose load is
 * less than that difference; if none exists, we choose one larger than the
 * difference, provided it does not leave the most idle CPU worse off than
 * the most busy one.  At the end, the busy fields for all CPUs are cleared;
 * for IRQs, they are cleared as they are scanned.
 */
void
apic_intr_redistribute(void)
{
	int busiest_cpu, most_free_cpu;
	int cpu_free, cpu_busy, max_busy, min_busy;
	int min_free, diff;
	int average_busy, cpus_online;
	int i, busy;
	ulong_t iflag;
	apic_cpus_info_t *cpu_infop;
	apic_irq_t *min_busy_irq = NULL;
	apic_irq_t *max_busy_irq = NULL;

	busiest_cpu = most_free_cpu = -1;
	cpu_free = cpu_busy = max_busy = average_busy = 0;
	min_free = apic_sample_factor_redistribution;
	cpus_online = 0;

	/*
	 * Below we check CPU_INTR_ENABLE, bound, temp_bound and temp_cpu
	 * without holding ioapic_lock.  That is OK because we are just doing
	 * statistical sampling, and any inaccuracy now will get corrected
	 * the next time around.  The call to rebind, which actually changes
	 * things, will make sure we are consistent.
	 */
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i) &&
		    !(apic_redist_cpu_skip & (1 << i)) &&
		    (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {

			cpu_infop = &apic_cpus[i];

			/*
			 * If this CPU has no unbound interrupts, or only one
			 * interrupt in total, skip it.
			 */
			if (!cpu_infop->aci_temp_bound ||
			    (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
			    == 1) {
				apic_redist_cpu_skip |= 1 << i;
				continue;
			}

			busy = cpu_infop->aci_busy;
			average_busy += busy;
			cpus_online++;
			if (max_busy < busy) {
				max_busy = busy;
				busiest_cpu = i;
			}
			if (min_free > busy) {
				min_free = busy;
				most_free_cpu = i;
			}
			if (busy > apic_int_busy_mark) {
				cpu_busy |= 1 << i;
			} else {
				if (busy < apic_int_free_mark)
					cpu_free |= 1 << i;
			}
		}
	}
	if ((cpu_busy && cpu_free) ||
	    (max_busy >= (min_free + apic_diff_for_redistribution))) {

		apic_num_imbalance++;
#ifdef DEBUG
		if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
			prom_printf(
			    "redistribute busy=%x free=%x max=%x min=%x",
			    cpu_busy, cpu_free, max_busy, min_free);
		}
#endif /* DEBUG */

		average_busy /= cpus_online;

		diff = max_busy - average_busy;
		min_busy = max_busy;	/* start with the max possible value */
		max_busy = 0;
		min_busy_irq = max_busy_irq = NULL;
		i = apic_min_device_irq;
		for (; i <= apic_max_device_irq; i++) {
			apic_irq_t *irq_ptr;
			/* Change to a linked list per CPU? */
			if ((irq_ptr = apic_irq_table[i]) == NULL)
				continue;
			/*
			 * Check irq_busy to decide which IRQ to move, and
			 * zero it for the next round.
			 */
			if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
			    irq_ptr->airq_busy) {
				if (irq_ptr->airq_busy < diff) {
					/*
					 * This IRQ's load fits within the
					 * required differential; remember
					 * the busiest such candidate.
					 */
					if (max_busy < irq_ptr->airq_busy) {
						/*
						 * Most busy within the
						 * required differential
						 */
						max_busy = irq_ptr->airq_busy;
						max_busy_irq = irq_ptr;
					}
				} else {
					if (min_busy > irq_ptr->airq_busy) {
						/*
						 * Least busy, but more than
						 * the required diff
						 */
						if (min_busy <
						    (diff + average_busy -
						    min_free)) {
							/*
							 * Make sure the new
							 * CPU will not end
							 * up worse off.
							 */
							min_busy =
							    irq_ptr->airq_busy;

							min_busy_irq = irq_ptr;
						}
					}
				}
			}
			irq_ptr->airq_busy = 0;
		}

		if (max_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    max_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */
			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(max_busy_irq,
				    most_free_cpu) == 0) {
					/* Make the change permanent */
					max_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else if (min_busy_irq != NULL) {
#ifdef DEBUG
			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
				prom_printf("rebinding %x to %x",
				    min_busy_irq->airq_vector, most_free_cpu);
			}
#endif /* DEBUG */

			iflag = intr_clear();
			if (lock_try(&apic_ioapic_lock)) {
				if (apic_rebind_all(min_busy_irq,
				    most_free_cpu) == 0) {
					/* Make the change permanent */
					min_busy_irq->airq_cpu =
					    (uint32_t)most_free_cpu;
				}
				lock_clear(&apic_ioapic_lock);
			}
			intr_restore(iflag);

		} else {
			if (cpu_busy != (1 << busiest_cpu)) {
				apic_redist_cpu_skip |= 1 << busiest_cpu;
				/*
				 * We leave cpu_skip set so that next time we
				 * can choose another cpu.
				 */
			}
		}
		apic_num_rebind++;
	} else {
		/*
		 * Found nothing to move.  Either we skipped over valid CPUs
		 * or everything is already balanced.  If we had a variable
		 * ticks_for_redistribution, it could be increased here;
		 * apic_int_busy, int_free etc. would also need to change.
		 */
		if (apic_redist_cpu_skip)
			apic_redist_cpu_skip = 0;
	}
	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}
}

void
apic_cleanup_busy(void)
{
	int i;
	apic_irq_t *irq_ptr;

	for (i = 0; i < apic_nproc; i++) {
		if (apic_cpu_in_range(i)) {
			apic_cpus[i].aci_busy = 0;
		}
	}

	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
		if ((irq_ptr = apic_irq_table[i]) != NULL)
			irq_ptr->airq_busy = 0;
	}
}
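
/*
 * Illustrative sketch only (never compiled, not part of the driver): the
 * IRQ-selection heuristic described above for apic_intr_redistribute(),
 * reduced to a small stand-alone routine.  The irq_busy[] array, nirqs and
 * the index return convention are hypothetical stand-ins for
 * apic_irq_table[] and friends; only the selection logic is meant to mirror,
 * in simplified form, the loop above.
 */
#if 0
static int
redistribute_pick_irq(const uint_t *irq_busy, int nirqs, uint_t diff,
    uint_t average_busy, uint_t min_free)
{
	int best_under = -1;	/* busiest IRQ whose load fits under diff */
	int best_over = -1;	/* least busy IRQ whose load exceeds diff */
	uint_t max_under = 0;
	uint_t min_over = (uint_t)-1;
	int i;

	for (i = 0; i < nirqs; i++) {
		if (irq_busy[i] == 0)
			continue;
		if (irq_busy[i] < diff) {
			/* Busiest candidate within the differential. */
			if (irq_busy[i] > max_under) {
				max_under = irq_busy[i];
				best_under = i;
			}
		} else if (irq_busy[i] < min_over &&
		    irq_busy[i] < diff + average_busy - min_free) {
			/*
			 * Least busy candidate above the differential that
			 * still won't leave the most idle CPU worse off
			 * than the busiest one.
			 */
			min_over = irq_busy[i];
			best_over = i;
		}
	}

	/* Prefer a candidate that fits within the differential. */
	return (best_under != -1 ? best_under : best_over);
}
#endif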