/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2020, the University of Queensland
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/disp.h>
#include <sys/sdt.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);

/*
 * Disable interrupts.
 * The act of calling ddi_intr_disable() does not guarantee an interrupt
 * routine is not running, so flag the vector as quiescing and wait
 * for anything active to finish.
 */
void
mlxcx_intr_disable(mlxcx_t *mlxp)
{
	int i;

	mlxcx_cmd_eq_disable(mlxp);

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

		mutex_enter(&mleq->mleq_mtx);

		if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
			mutex_exit(&mleq->mleq_mtx);
			continue;
		}

		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);

		mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
		while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
			cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);

		mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;

		mutex_exit(&mleq->mleq_mtx);
	}
}

void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
	int i;
	int ret;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

		mutex_enter(&mleq->mleq_mtx);
		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
		if (mleq->mleq_state & MLXCX_EQ_CREATED)
			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
		if (i >= mlxp->mlx_intr_cq0) {
			VERIFY(avl_is_empty(&mleq->mleq_cqs));
			avl_destroy(&mleq->mleq_cqs);
		}
		mutex_exit(&mleq->mleq_mtx);
		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
			    i, ret);
		}
		mutex_destroy(&mleq->mleq_mtx);
		cv_destroy(&mleq->mleq_cv);
	}
	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
	mlxp->mlx_intr_handles = NULL;
	mlxp->mlx_eqs = NULL;
}

/*
 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
	mlxcx_eventq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

	/*
	 * This should only be called from interrupt context to ensure
	 * correctness of mleq_cc.
	 */
	ASSERT(servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	/* mleq_nents is always a power of 2 */
	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);

	ent = &mleq->mleq_ent[ci];
	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mleq->mleq_cc;
		return (ent);
	}
	/*
	 * In the case of a DMA error, we should re-arm this EQ and then come
	 * back and try again when the device wakes us back up.
	 *
	 * Hopefully the fault will be gone by then.
	 */
	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	uint_t try = 0;
	ddi_fm_error_t err;
	bits32_t v = new_bits32();

	/*
	 * This is only called during initialization when the EQ is
	 * armed for the first time, and when re-armed at the end of
	 * interrupt processing.
	 */
	ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

	mleq->mleq_state |= MLXCX_EQ_ARMED;
	mleq->mleq_cc_armed = mleq->mleq_cc;

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

retry:
	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	bits32_t v = new_bits32();
	ddi_fm_error_t err;

	/*
	 * This should only be called from interrupt context to ensure
	 * correctness of mleq_cc.
	 */
	ASSERT(servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
	/*
	 * Ignore the error; if it's still happening when we try to re-arm the
	 * EQ, we will note the impact then.
	 */
}

static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
	mlxcx_completionq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	/* mlcq_nents is always a power of 2 */
	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);

	ent = &mlcq->mlcq_ent[ci];
	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mlcq->mlcq_cc;
		return (ent);
	}
	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	ddi_fm_error_t err;
	uint_t try = 0;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			return;
		}
	}
}

void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	bits32_t dbval = new_bits32();
	uint64_t udbval;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
	}

	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
		return;

	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

	udbval = (uint64_t)from_bits32(dbval) << 32;
	udbval |= mlcq->mlcq_num & 0xffffff;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
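	/* Doorbell retries are exhausted; report the lost service to FMA. */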
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

const char *
mlxcx_event_name(mlxcx_event_t evt)
{
	switch (evt) {
	case MLXCX_EVENT_COMPLETION:
		return ("COMPLETION");
	case MLXCX_EVENT_PATH_MIGRATED:
		return ("PATH_MIGRATED");
	case MLXCX_EVENT_COMM_ESTABLISH:
		return ("COMM_ESTABLISH");
	case MLXCX_EVENT_SENDQ_DRAIN:
		return ("SENDQ_DRAIN");
	case MLXCX_EVENT_LAST_WQE:
		return ("LAST_WQE");
	case MLXCX_EVENT_SRQ_LIMIT:
		return ("SRQ_LIMIT");
	case MLXCX_EVENT_DCT_ALL_CLOSED:
		return ("DCT_ALL_CLOSED");
	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
		return ("DCT_ACCKEY_VIOL");
	case MLXCX_EVENT_CQ_ERROR:
		return ("CQ_ERROR");
	case MLXCX_EVENT_WQ_CATASTROPHE:
		return ("WQ_CATASTROPHE");
	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
		return ("PATH_MIGRATE_FAIL");
	case MLXCX_EVENT_PAGE_FAULT:
		return ("PAGE_FAULT");
	case MLXCX_EVENT_WQ_INVALID_REQ:
		return ("WQ_INVALID_REQ");
	case MLXCX_EVENT_WQ_ACCESS_VIOL:
		return ("WQ_ACCESS_VIOL");
	case MLXCX_EVENT_SRQ_CATASTROPHE:
		return ("SRQ_CATASTROPHE");
	case MLXCX_EVENT_INTERNAL_ERROR:
		return ("INTERNAL_ERROR");
	case MLXCX_EVENT_PORT_STATE:
		return ("PORT_STATE");
	case MLXCX_EVENT_GPIO:
		return ("GPIO");
	case MLXCX_EVENT_PORT_MODULE:
		return ("PORT_MODULE");
	case MLXCX_EVENT_TEMP_WARNING:
		return ("TEMP_WARNING");
	case MLXCX_EVENT_REMOTE_CONFIG:
		return ("REMOTE_CONFIG");
	case MLXCX_EVENT_DCBX_CHANGE:
		return ("DCBX_CHANGE");
	case MLXCX_EVENT_DOORBELL_CONGEST:
		return ("DOORBELL_CONGEST");
	case MLXCX_EVENT_STALL_VL:
		return ("STALL_VL");
	case MLXCX_EVENT_CMD_COMPLETION:
		return ("CMD_COMPLETION");
	case MLXCX_EVENT_PAGE_REQUEST:
		return ("PAGE_REQUEST");
	case MLXCX_EVENT_NIC_VPORT:
		return ("NIC_VPORT");
	case MLXCX_EVENT_EC_PARAMS_CHANGE:
		return ("EC_PARAMS_CHANGE");
	case MLXCX_EVENT_XRQ_ERROR:
		return ("XRQ_ERROR");
	}
	return ("UNKNOWN");
}

/*
 * Should be called only when link state has changed.
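 * It re-queries the port status, speed and FEC mode from the hardware
 * and passes the resulting link state to MAC.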
 */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
	link_state_t ls;

	mutex_enter(&port->mlp_mtx);
	(void) mlxcx_cmd_query_port_status(mlxp, port);
	(void) mlxcx_cmd_query_port_speed(mlxp, port);
	(void) mlxcx_cmd_query_port_fec(mlxp, port);

	switch (port->mlp_oper_status) {
	case MLXCX_PORT_STATUS_UP:
	case MLXCX_PORT_STATUS_UP_ONCE:
		ls = LINK_STATE_UP;
		break;
	case MLXCX_PORT_STATUS_DOWN:
		ls = LINK_STATE_DOWN;
		break;
	default:
		ls = LINK_STATE_UNKNOWN;
	}
	mac_link_update(mlxp->mlx_mac_hdl, ls);

	mutex_exit(&port->mlp_mtx);
}

CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);

static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	mlxcx_dev_page_t *mdp;
	mlxcx_dev_page_t **pages;
	size_t i;
	const ddi_dma_cookie_t *ck;

	/*
	 * If this isn't enough, the HCA will ask for more
	 */
	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;
		pages[i] = mdp;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
		    "pages!", npages);
		mutex_exit(&mlxp->mlx_pagemtx);
		goto cleanup_npages;
	}

	for (i = 0; i < npages; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += npages;
	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pages, sizeof (*pages) * npages);

	return;

cleanup_npages:
	for (i = 0; i < npages; i++) {
		if ((mdp = pages[i]) == NULL)
			break;

		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	/*
	 * Tell the hardware we had an allocation failure.
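	 * This is the MANAGE_PAGES command with the ALLOC_FAIL opmod and a
	 * page count of zero, so no page addresses are passed along.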
	 */
	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
	    0, NULL);
	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pages, sizeof (*pages) * npages);
}

static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
	uint_t i;
	int32_t ret;
	uint64_t *pas;
	mlxcx_dev_page_t *mdp, probe;

	pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);

	if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
		kmem_free(pas, sizeof (*pas) * npages);
		return;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	ASSERT0(avl_is_empty(&mlxp->mlx_pages));

	for (i = 0; i < ret; i++) {
		bzero(&probe, sizeof (probe));
		probe.mxdp_pa = pas[i];

		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

		if (mdp != NULL) {
			avl_remove(&mlxp->mlx_pages, mdp);
			mlxp->mlx_npages--;
			mlxcx_dma_free(&mdp->mxdp_dma);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
		} else {
			mlxcx_warn(mlxp, "hardware returned a page "
			    "with PA 0x%" PRIx64 " but we have no "
			    "record of giving out such a page", pas[i]);
		}
	}

	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pas, sizeof (*pas) * npages);
}

static void
mlxcx_pages_task(void *arg)
{
	mlxcx_async_param_t *param = arg;
	mlxcx_t *mlxp = param->mla_mlx;
	int32_t npages;

	/*
	 * We can drop the pending status now, as we've extracted what
	 * is needed to process the pages request.
	 *
	 * Even though we should never get another pages request until
	 * we have responded to this, along with the guard in mlxcx_sync_intr,
	 * this safely allows the reuse of mlxcx_async_param_t.
	 */
	mutex_enter(&param->mla_mtx);
	npages = param->mla_pages.mlp_npages;
	param->mla_pending = B_FALSE;
	bzero(&param->mla_pages, sizeof (param->mla_pages));
	mutex_exit(&param->mla_mtx);

	/*
	 * The PRM describes npages as: "Number of missing / unneeded pages
	 * (signed number, msb indicate sign)". The implication is that
	 * it will not be zero. We are expected to use this to give or
	 * take back pages (based on the sign) using the MANAGE_PAGES
	 * command, but we can't determine whether to give or take
	 * when npages is zero. So we do nothing.
	 */
	if (npages > 0) {
		mlxcx_give_pages_once(mlxp, npages);
	} else if (npages < 0) {
		mlxcx_take_pages_once(mlxp, -1 * npages);
	}
}

static void
mlxcx_link_state_task(void *arg)
{
	mlxcx_async_param_t *param = arg;
	mlxcx_port_t *port;
	mlxcx_t *mlxp;

	/*
	 * Gather the arguments from the parameters and clear the
	 * pending status.
	 *
	 * The pending status must be cleared *before* we update the
	 * link state. This is both safe and required to ensure we always
	 * have the correct link state. It is safe because taskq_ents are
	 * reusable (by the caller of taskq_dispatch_ent()) once the
	 * task function has started executing. It must happen before
	 * updating the link state to guarantee further link state change
	 * events are not missed and we always have the current link state.
	 */
	mutex_enter(&param->mla_mtx);
	mlxp = param->mla_mlx;
	port = param->mla_port;
	param->mla_pending = B_FALSE;
	mutex_exit(&param->mla_mtx);

	mlxcx_update_link_state(mlxp, port);
}

static const char *
mlxcx_module_error_string(mlxcx_module_error_type_t err)
{
	switch (err) {
	case MLXCX_MODULE_ERR_POWER_BUDGET:
		return ("POWER_BUDGET");
	case MLXCX_MODULE_ERR_LONG_RANGE:
		return ("LONG_RANGE");
	case MLXCX_MODULE_ERR_BUS_STUCK:
		return ("BUS_STUCK");
	case MLXCX_MODULE_ERR_NO_EEPROM:
		return ("NO_EEPROM");
	case MLXCX_MODULE_ERR_ENFORCEMENT:
		return ("ENFORCEMENT");
	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		return ("UNKNOWN_IDENT");
	case MLXCX_MODULE_ERR_HIGH_TEMP:
		return ("HIGH_TEMP");
	case MLXCX_MODULE_ERR_CABLE_SHORTED:
		return ("CABLE_SHORTED");
	default:
		return ("UNKNOWN");
	}
}

static void
mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *lename;
	const char *ename;
	const char *stname;
	uint_t eno = 0;
	mlxcx_module_status_t state = evd->mled_port_mod_module_status;

	switch (state) {
	case MLXCX_MODULE_ERROR:
		stname = "error";
		eno = evd->mled_port_mod_error_type;
		lename = mlxcx_module_error_string(eno);
		switch (eno) {
		case MLXCX_MODULE_ERR_ENFORCEMENT:
			ename = DDI_FM_TXR_ERROR_WHITELIST;
			break;
		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		case MLXCX_MODULE_ERR_NO_EEPROM:
			ename = DDI_FM_TXR_ERROR_NOTSUPP;
			break;
		case MLXCX_MODULE_ERR_HIGH_TEMP:
			ename = DDI_FM_TXR_ERROR_OVERTEMP;
			break;
		case MLXCX_MODULE_ERR_POWER_BUDGET:
		case MLXCX_MODULE_ERR_LONG_RANGE:
		case MLXCX_MODULE_ERR_CABLE_SHORTED:
			ename = DDI_FM_TXR_ERROR_HWFAIL;
			break;
		case MLXCX_MODULE_ERR_BUS_STUCK:
		default:
			ename = DDI_FM_TXR_ERROR_UNKNOWN;
		}
		break;
	default:
		return;
	}

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    /* compulsory FM props */
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    /* generic NIC txr error event props */
	    "error", DATA_TYPE_STRING, ename,
	    "port_index", DATA_TYPE_UINT8, 0,
	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
	    /* local props */
	    "mlxcx_state", DATA_TYPE_STRING, stname,
	    "mlxcx_error", DATA_TYPE_STRING, lename,
	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

/*
 * Common beginning of interrupt processing.
 * Confirm the interrupt hasn't been disabled, verify its state and
 * mark the vector as active.
 */
static boolean_t
mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	mutex_enter(&mleq->mleq_mtx);

	if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mlxcx_warn(mlxp, "intr %d in bad eq state",
		    mleq->mleq_intr_index);
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
	mutex_exit(&mleq->mleq_mtx);

	return (B_TRUE);
}

/*
 * End of interrupt processing.
 * Mark the vector as no longer active and, if shutdown is blocked on this
 * vector, wake it up.
 */
static void
mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
{
	mutex_enter(&mleq->mleq_mtx);
	if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
		cv_signal(&mleq->mleq_cv);

	mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;
	mutex_exit(&mleq->mleq_mtx);
}

static uint_t
mlxcx_intr_async(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_async_param_t *param;
	uint_t portn;
	uint16_t func;

	if (!mlxcx_intr_ini(mlxp, mleq))
		return (DDI_INTR_CLAIMED);

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		goto done;
	}

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
		    ent);

		switch (ent->mleqe_event_type) {
		case MLXCX_EVENT_CMD_COMPLETION:
			mlxcx_cmd_completion(mlxp, ent);
			break;
		case MLXCX_EVENT_PAGE_REQUEST:
			func = from_be16(ent->mleqe_page_request.
			    mled_page_request_function_id);
			VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);

			param = &mlxp->mlx_npages_req[func];
			mutex_enter(&param->mla_mtx);
			if (param->mla_pending) {
				/*
				 * The PRM states we will not get another
				 * page request event until any pending have
				 * been posted as complete to the HCA.
				 * This check guards against it anyway.
				 */
				mutex_exit(&param->mla_mtx);
				mlxcx_warn(mlxp, "Unexpected page request "
				    "whilst another is pending");
				break;
			}
			param->mla_pages.mlp_npages =
			    (int32_t)from_be32(ent->mleqe_page_request.
			    mled_page_request_num_pages);
			param->mla_pages.mlp_func = func;
			param->mla_pending = B_TRUE;
			ASSERT3P(param->mla_mlx, ==, mlxp);
			mutex_exit(&param->mla_mtx);

			taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
			    param, 0, &param->mla_tqe);
			break;
		case MLXCX_EVENT_PORT_STATE:
			portn = get_bits8(
			    ent->mleqe_port_state.mled_port_state_port_num,
			    MLXCX_EVENT_PORT_NUM) - 1;
			if (portn >= mlxp->mlx_nports)
				break;

			param = &mlxp->mlx_ports[portn].mlx_port_event;
			mutex_enter(&param->mla_mtx);
			if (param->mla_pending) {
				/*
				 * There is a link state event pending
				 * processing. When that event is handled
				 * it will get the current link state.
				 */
				mutex_exit(&param->mla_mtx);
				break;
			}

			ASSERT3P(param->mla_mlx, ==, mlxp);
			ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);

			param->mla_pending = B_TRUE;
			mutex_exit(&param->mla_mtx);

			taskq_dispatch_ent(mlxp->mlx_async_tq,
			    mlxcx_link_state_task, param, 0, &param->mla_tqe);
			break;
		case MLXCX_EVENT_PORT_MODULE:
			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
			break;
		default:
			mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
			    ent->mleqe_event_type, mleq->mleq_intr_index);
		}
	}

	mlxcx_arm_eq(mlxp, mleq);

done:
	mlxcx_intr_fini(mleq);
	return (DDI_INTR_CLAIMED);
}

static boolean_t
mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
    size_t bytelim)
{
	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
	mlxcx_completionq_ent_t *cent;
	mblk_t *mp, *cmp, *nmp;
	mlxcx_buffer_t *buf;
	boolean_t found, added;
	size_t bytes = 0;
	uint_t rx_frames = 0;
	uint_t comp_cnt = 0;
	int64_t wqebbs, bufcnt;

	*mpp = NULL;

	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
		return (B_FALSE);
	}

	nmp = cmp = mp = NULL;

	wqebbs = 0;
	bufcnt = 0;
	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/*
		 * Teardown and ring stop can atomic_or this flag
		 * into our state if they want us to stop early.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
			return (B_FALSE);

		comp_cnt++;
		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
			/* NOP */
			atomic_dec_64(&wq->mlwq_wqebb_used);
			goto nextcq;
		}

lookagain:
		/*
		 * Generally the buffer we're looking for will be
		 * at the front of the list, so this loop won't
		 * need to look far.
		 */
		buf = list_head(&mlcq->mlcq_buffers);
		found = B_FALSE;
		while (buf != NULL) {
			if ((buf->mlb_wqe_index & UINT16_MAX) ==
			    from_be16(cent->mlcqe_wqe_counter)) {
				found = B_TRUE;
				break;
			}
			buf = list_next(&mlcq->mlcq_buffers, buf);
		}

		if (!found) {
			/*
			 * If there are any buffers waiting on the
			 * buffers_b list, then merge those into
			 * the main list and have another look.
			 *
			 * The wq enqueue routines push new buffers
			 * into buffers_b so that they can avoid
			 * taking the mlcq_mtx and blocking us for
			 * every single packet.
			 */
			added = B_FALSE;
			mutex_enter(&mlcq->mlcq_bufbmtx);
			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
				list_move_tail(&mlcq->mlcq_buffers,
				    &mlcq->mlcq_buffers_b);
				added = B_TRUE;
			}
			mutex_exit(&mlcq->mlcq_bufbmtx);
			if (added)
				goto lookagain;

			buf = list_head(&mlcq->mlcq_buffers);
			mlxcx_warn(mlxp, "got completion on CQ %x but "
			    "no buffer matching wqe found: %x (first "
			    "buffer counter = %x)", mlcq->mlcq_num,
			    from_be16(cent->mlcqe_wqe_counter),
			    buf == NULL ? UINT32_MAX :
			    buf->mlb_wqe_index);
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}

		/*
		 * The buf is likely to be freed below, count this now.
938 */ 939 wqebbs += buf->mlb_wqebbs; 940 941 list_remove(&mlcq->mlcq_buffers, buf); 942 bufcnt++; 943 944 switch (mlcq->mlcq_wq->mlwq_type) { 945 case MLXCX_WQ_TYPE_SENDQ: 946 mlxcx_tx_completion(mlxp, mlcq, cent, buf); 947 break; 948 case MLXCX_WQ_TYPE_RECVQ: 949 nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); 950 bytes += from_be32(cent->mlcqe_byte_cnt); 951 if (nmp != NULL) { 952 if (cmp != NULL) { 953 cmp->b_next = nmp; 954 cmp = nmp; 955 } else { 956 mp = cmp = nmp; 957 } 958 959 rx_frames++; 960 } 961 break; 962 } 963 964 /* 965 * Update the consumer index with what has been processed, 966 * followed by driver counters. It is important to tell the 967 * hardware first, otherwise when we throw more packets at 968 * it, it may get an overflow error. 969 * We do this whenever we've processed enough to bridge the 970 * high->low water mark. 971 */ 972 if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) { 973 mlxcx_update_cqci(mlxp, mlcq); 974 /* 975 * Both these variables are incremented using 976 * atomics as they are modified in other code paths 977 * (Eg during tx) which hold different locks. 978 */ 979 atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); 980 atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs); 981 wqebbs = 0; 982 bufcnt = 0; 983 comp_cnt = 0; 984 } 985 nextcq: 986 if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq || 987 (bytelim != 0 && bytes > bytelim)) 988 break; 989 } 990 991 if (comp_cnt > 0) { 992 mlxcx_update_cqci(mlxp, mlcq); 993 atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); 994 atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs); 995 } 996 997 *mpp = mp; 998 return (B_TRUE); 999 } 1000 1001 1002 mblk_t * 1003 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) 1004 { 1005 mblk_t *mp = NULL; 1006 1007 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 1008 1009 ASSERT(mlcq->mlcq_wq != NULL); 1010 ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ); 1011 1012 (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim); 1013 1014 return (mp); 1015 } 1016 1017 static uint_t 1018 mlxcx_intr_n(caddr_t arg, caddr_t arg2) 1019 { 1020 mlxcx_t *mlxp = (mlxcx_t *)arg; 1021 mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2; 1022 mlxcx_eventq_ent_t *ent; 1023 mlxcx_completion_queue_t *mlcq, probe; 1024 mlxcx_work_queue_t *mlwq; 1025 mblk_t *mp = NULL; 1026 boolean_t tellmac = B_FALSE; 1027 1028 if (!mlxcx_intr_ini(mlxp, mleq)) 1029 return (DDI_INTR_CLAIMED); 1030 1031 ent = mlxcx_eq_next(mleq); 1032 if (ent == NULL) { 1033 if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) { 1034 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT); 1035 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 1036 (void) ddi_intr_disable(mlxp->mlx_intr_handles[ 1037 mleq->mleq_intr_index]); 1038 } 1039 goto done; 1040 } 1041 mleq->mleq_badintrs = 0; 1042 1043 ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED); 1044 mleq->mleq_state &= ~MLXCX_EQ_ARMED; 1045 1046 for (; ent != NULL; ent = mlxcx_eq_next(mleq)) { 1047 if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) { 1048 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE); 1049 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 1050 (void) ddi_intr_disable(mlxp->mlx_intr_handles[ 1051 mleq->mleq_intr_index]); 1052 goto done; 1053 } 1054 ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION); 1055 1056 probe.mlcq_num = 1057 from_be24(ent->mleqe_completion.mled_completion_cqn); 1058 mutex_enter(&mleq->mleq_mtx); 1059 mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL); 1060 mutex_exit(&mleq->mleq_mtx); 1061 1062 if (mlcq == NULL) 1063 continue; 1064 1065 
		mlwq = mlcq->mlcq_wq;

		/*
		 * mlcq_arm_mtx is used to avoid race conditions between
		 * this interrupt routine and the transition from polling
		 * back to interrupt mode. When exiting poll mode the
		 * CQ is likely to be un-armed, which means there will
		 * be no events for the CQ coming through here,
		 * consequently very low contention on mlcq_arm_mtx.
		 *
		 * mlcq_arm_mtx must be released before calls into the mac
		 * layer in order to avoid deadlocks.
		 */
		mutex_enter(&mlcq->mlcq_arm_mtx);
		mlcq->mlcq_ec++;
		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
			/*
			 * If we failed to take the mutex because the
			 * polling function has it, just move on.
			 * We don't want to block other CQs behind
			 * this one.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
				mutex_exit(&mlcq->mlcq_arm_mtx);
				goto update_eq;
			}

			/* Otherwise we will wait. */
			mutex_enter(&mlcq->mlcq_mtx);
		}

		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
			/*
			 * The ring is not in polling mode and we processed
			 * some completion queue entries.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
				atomic_and_uint(&mlcq->mlcq_state,
				    ~MLXCX_CQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
				atomic_and_uint(&mlwq->mlwq_state,
				    ~MLXCX_WQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			mlxcx_arm_cq(mlxp, mlcq);

			mutex_exit(&mlcq->mlcq_mtx);
			mutex_exit(&mlcq->mlcq_arm_mtx);

			if (tellmac) {
				mac_tx_ring_update(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl);
				tellmac = B_FALSE;
			}

			if (mp != NULL) {
				mac_rx_ring(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
			}
		} else {
			mutex_exit(&mlcq->mlcq_mtx);
			mutex_exit(&mlcq->mlcq_arm_mtx);
		}

update_eq:
		/*
		 * Updating the consumer counter for an EQ requires a write
		 * to the UAR, which is possibly expensive.
		 *
		 * Try to do it only often enough to stop us wrapping around.
		 */
		if ((mleq->mleq_cc & 0x7) == 0)
			mlxcx_update_eq(mlxp, mleq);
	}

	mlxcx_arm_eq(mlxp, mleq);

done:
	mlxcx_intr_fini(mleq);
	return (DDI_INTR_CLAIMED);
}

boolean_t
mlxcx_intr_setup(mlxcx_t *mlxp)
{
	dev_info_t *dip = mlxp->mlx_dip;
	int ret;
	int nintrs = 0;
	int navail = 0;
	int types, i;
	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;

	ret = ddi_intr_get_supported_types(dip, &types);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}

	if (!(types & DDI_INTR_TYPE_MSIX)) {
		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
		    "requires MSI-X");
		return (B_FALSE);
	}

	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}
	if (nintrs < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", nintrs);
		return (B_FALSE);
	}

	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
	if (navail < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", navail);
		return (B_FALSE);
	}

	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
	/*
	 * Interrupts for Completion Queue events start from vector 1
	 * up to the number of available vectors. Vector 0 is used for
	 * asynchronous events.
	 */
	mlxp->mlx_intr_cq0 = 1;

	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;

	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
	    sizeof (mlxcx_event_queue_t);
	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);

	/*
	 * In the failure path, mlxcx_intr_teardown() expects this
	 * mutex and avl tree to be init'ed - so do it now.
	 */
	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);

		if (i < mlxp->mlx_intr_cq0)
			continue;

		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
		    sizeof (mlxcx_completion_queue_t),
		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
	}

	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	/*
	 * If we have enough interrupts, set their "type" fields so that we
	 * avoid mixing RX and TX queues on the same EQs.
	 */
	if (mlxp->mlx_intr_count >= 8) {
		eqt = MLXCX_EQ_TYPE_RX;
	}

	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
		mlxp->mlx_eqs[i].mleq_intr_index = i;

		mlxp->mlx_eqs[i].mleq_type = eqt;
		/*
		 * If eqt is still ANY, just leave it set to that
		 * (no else here).
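		 * Otherwise, alternate between RX and TX below so that
		 * receive and transmit completions end up on separate EQs.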
		 */
		if (eqt == MLXCX_EQ_TYPE_RX) {
			eqt = MLXCX_EQ_TYPE_TX;
		} else if (eqt == MLXCX_EQ_TYPE_TX) {
			eqt = MLXCX_EQ_TYPE_RX;
		}

		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_intr_teardown(mlxp);
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}