/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2020, the University of Queensland
 * Copyright 2020 RackTop Systems, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/disp.h>
#include <sys/sdt.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);

/*
 * Disable interrupts.
 * The act of calling ddi_intr_disable() does not guarantee an interrupt
 * routine is not running, so flag the vector as quiescing and wait
 * for anything active to finish.
 */
void
mlxcx_intr_disable(mlxcx_t *mlxp)
{
	int i;

	mlxcx_cmd_eq_disable(mlxp);

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

		mutex_enter(&mleq->mleq_mtx);

		if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
			mutex_exit(&mleq->mleq_mtx);
			continue;
		}

		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);

		mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
		while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
			cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);

		mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;

		mutex_exit(&mleq->mleq_mtx);
	}
}

void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
	int i;
	int ret;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

		mutex_enter(&mleq->mleq_mtx);
		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
		if (mleq->mleq_state & MLXCX_EQ_CREATED)
			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
		if (i >= mlxp->mlx_intr_cq0) {
			VERIFY(avl_is_empty(&mleq->mleq_cqs));
			avl_destroy(&mleq->mleq_cqs);
		}
		mutex_exit(&mleq->mleq_mtx);
		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
			    i, ret);
		}
		mutex_destroy(&mleq->mleq_mtx);
		cv_destroy(&mleq->mleq_cv);
	}
	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
	mlxp->mlx_intr_handles = NULL;
	mlxp->mlx_eqs = NULL;
}

/*
 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
	mlxcx_eventq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	/*
	 * swowner is the ownership bit value software expects for the
	 * current pass through the queue; it alternates each time our
	 * consumer counter wraps (mleq_entshift is log2 of mleq_nents).
	 */
	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

	/*
	 * This should only be called from interrupt context to ensure
	 * correctness of mleq_cc.
	 */
	ASSERT(servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	/* mleq_nents is always a power of 2 */
	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);

	ent = &mleq->mleq_ent[ci];
	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mleq->mleq_cc;
		return (ent);
	}
	/*
	 * In the case of a DMA error, we should re-arm this EQ and then come
	 * back and try again when the device wakes us back up.
	 *
	 * Hopefully the fault will be gone by then.
	 */
	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	uint_t try = 0;
	ddi_fm_error_t err;
	bits32_t v = new_bits32();

	/*
	 * This is only called during initialization when the EQ is
	 * armed for the first time, and when re-armed at the end of
	 * interrupt processing.
	 */
	ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

	mleq->mleq_state |= MLXCX_EQ_ARMED;
	mleq->mleq_cc_armed = mleq->mleq_cc;

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

retry:
	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	bits32_t v = new_bits32();
	ddi_fm_error_t err;

	/*
	 * This should only be called from interrupt context to ensure
	 * correctness of mleq_cc.
	 */
	ASSERT(servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
	/*
	 * Ignore the error; if it's still happening when we try to re-arm the
	 * EQ, we will note the impact then.
	 */
}

static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
	mlxcx_completionq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	/* mlcq_nents is always a power of 2 */
	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);

	ent = &mlcq->mlcq_ent[ci];
	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mlcq->mlcq_cc;
		return (ent);
	}
	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	ddi_fm_error_t err;
	uint_t try = 0;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			return;
		}
	}
}

void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	bits32_t dbval = new_bits32();
	uint64_t udbval;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
	}

	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
		return;

	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

	/*
	 * Build the 64-bit UAR doorbell value: the arm sequence/CI word
	 * in the upper 32 bits, the 24-bit CQ number in the low bits.
	 */
	udbval = (uint64_t)from_bits32(dbval) << 32;
	udbval |= mlcq->mlcq_num & 0xffffff;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

const char *
mlxcx_event_name(mlxcx_event_t evt)
{
	switch (evt) {
	case MLXCX_EVENT_COMPLETION:
		return ("COMPLETION");
	case MLXCX_EVENT_PATH_MIGRATED:
		return ("PATH_MIGRATED");
	case MLXCX_EVENT_COMM_ESTABLISH:
		return ("COMM_ESTABLISH");
	case MLXCX_EVENT_SENDQ_DRAIN:
		return ("SENDQ_DRAIN");
	case MLXCX_EVENT_LAST_WQE:
		return ("LAST_WQE");
	case MLXCX_EVENT_SRQ_LIMIT:
		return ("SRQ_LIMIT");
	case MLXCX_EVENT_DCT_ALL_CLOSED:
		return ("DCT_ALL_CLOSED");
	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
		return ("DCT_ACCKEY_VIOL");
	case MLXCX_EVENT_CQ_ERROR:
		return ("CQ_ERROR");
	case MLXCX_EVENT_WQ_CATASTROPHE:
		return ("WQ_CATASTROPHE");
	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
		return ("PATH_MIGRATE_FAIL");
	case MLXCX_EVENT_PAGE_FAULT:
		return ("PAGE_FAULT");
	case MLXCX_EVENT_WQ_INVALID_REQ:
		return ("WQ_INVALID_REQ");
	case MLXCX_EVENT_WQ_ACCESS_VIOL:
		return ("WQ_ACCESS_VIOL");
	case MLXCX_EVENT_SRQ_CATASTROPHE:
		return ("SRQ_CATASTROPHE");
	case MLXCX_EVENT_INTERNAL_ERROR:
		return ("INTERNAL_ERROR");
	case MLXCX_EVENT_PORT_STATE:
		return ("PORT_STATE");
	case MLXCX_EVENT_GPIO:
		return ("GPIO");
	case MLXCX_EVENT_PORT_MODULE:
		return ("PORT_MODULE");
	case MLXCX_EVENT_TEMP_WARNING:
		return ("TEMP_WARNING");
	case MLXCX_EVENT_REMOTE_CONFIG:
		return ("REMOTE_CONFIG");
	case MLXCX_EVENT_DCBX_CHANGE:
		return ("DCBX_CHANGE");
	case MLXCX_EVENT_DOORBELL_CONGEST:
		return ("DOORBELL_CONGEST");
	case MLXCX_EVENT_STALL_VL:
		return ("STALL_VL");
	case MLXCX_EVENT_CMD_COMPLETION:
		return ("CMD_COMPLETION");
	case MLXCX_EVENT_PAGE_REQUEST:
		return ("PAGE_REQUEST");
	case MLXCX_EVENT_NIC_VPORT:
		return ("NIC_VPORT");
	case MLXCX_EVENT_EC_PARAMS_CHANGE:
		return ("EC_PARAMS_CHANGE");
	case MLXCX_EVENT_XRQ_ERROR:
		return ("XRQ_ERROR");
	}
	return ("UNKNOWN");
}

/* Should be called only when link state has changed. */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
	link_state_t ls;

	mutex_enter(&port->mlp_mtx);
	(void) mlxcx_cmd_query_port_status(mlxp, port);
	(void) mlxcx_cmd_query_port_speed(mlxp, port);
	(void) mlxcx_cmd_query_port_fec(mlxp, port);

	switch (port->mlp_oper_status) {
	case MLXCX_PORT_STATUS_UP:
	case MLXCX_PORT_STATUS_UP_ONCE:
		ls = LINK_STATE_UP;
		break;
	case MLXCX_PORT_STATUS_DOWN:
		ls = LINK_STATE_DOWN;
		break;
	default:
		ls = LINK_STATE_UNKNOWN;
	}
	mac_link_update(mlxp->mlx_mac_hdl, ls);

	mutex_exit(&port->mlp_mtx);
}

CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);

static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	mlxcx_dev_page_t *mdp;
	mlxcx_dev_page_t **pages;
	size_t i;
	const ddi_dma_cookie_t *ck;

	/*
	 * If this isn't enough, the HCA will ask for more
	 */
	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;
		pages[i] = mdp;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
		    "pages!", npages);
		mutex_exit(&mlxp->mlx_pagemtx);
		goto cleanup_npages;
	}

	for (i = 0; i < npages; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += npages;
	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pages, sizeof (*pages) * npages);

	return;

cleanup_npages:
	for (i = 0; i < npages; i++) {
		if ((mdp = pages[i]) == NULL)
			break;

		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	/* Tell the hardware we had an allocation failure. */
	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
	    0, NULL);
	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pages, sizeof (*pages) * npages);
}

static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
	uint_t i;
	int32_t ret;
	uint64_t *pas;
	mlxcx_dev_page_t *mdp, probe;

	pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);

	if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
		kmem_free(pas, sizeof (*pas) * npages);
		return;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	ASSERT0(avl_is_empty(&mlxp->mlx_pages));

	for (i = 0; i < ret; i++) {
		bzero(&probe, sizeof (probe));
		probe.mxdp_pa = pas[i];

		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

		if (mdp != NULL) {
			avl_remove(&mlxp->mlx_pages, mdp);
			mlxp->mlx_npages--;
			mlxcx_dma_free(&mdp->mxdp_dma);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
		} else {
			mlxcx_warn(mlxp, "hardware returned a page "
			    "with PA 0x%" PRIx64 " but we have no "
			    "record of giving out such a page", pas[i]);
		}
	}

	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pas, sizeof (*pas) * npages);
}

static void
mlxcx_pages_task(void *arg)
{
	mlxcx_async_param_t *param = arg;
	mlxcx_t *mlxp = param->mla_mlx;
	int32_t npages;

	/*
	 * We can drop the pending status now, as we've extracted what
	 * is needed to process the pages request.
	 *
	 * Even though we should never get another pages request until
	 * we have responded to this, along with the guard in
	 * mlxcx_intr_async(), this safely allows the reuse of
	 * mlxcx_async_param_t.
	 */
	mutex_enter(&param->mla_mtx);
	npages = param->mla_pages.mlp_npages;
	param->mla_pending = B_FALSE;
	bzero(&param->mla_pages, sizeof (param->mla_pages));
	mutex_exit(&param->mla_mtx);

	/*
	 * The PRM describes npages as: "Number of missing / unneeded pages
	 * (signed number, msb indicate sign)". The implication is that
	 * it will not be zero. We are expected to use this to give or
	 * take back pages (based on the sign) using the MANAGE_PAGES
	 * command, but we can't determine whether to give or take when
	 * npages is zero, so in that case we do nothing.
	 */
	if (npages > 0) {
		mlxcx_give_pages_once(mlxp, npages);
	} else if (npages < 0) {
		mlxcx_take_pages_once(mlxp, -1 * npages);
	}
}

static void
mlxcx_link_state_task(void *arg)
{
	mlxcx_async_param_t *param = arg;
	mlxcx_port_t *port;
	mlxcx_t *mlxp;

	/*
	 * Gather the arguments from the parameters and clear the
	 * pending status.
	 *
	 * The pending status must be cleared *before* we update the
	 * link state. This is both safe and required to ensure we always
	 * have the correct link state. It is safe because taskq_ents are
	 * reusable (by the caller of taskq_dispatch_ent()) once the
	 * task function has started executing. It must be done before
	 * updating the link state to guarantee further link state change
	 * events are not missed and we always have the current link state.
597 */ 598 mutex_enter(¶m->mla_mtx); 599 mlxp = param->mla_mlx; 600 port = param->mla_port; 601 param->mla_pending = B_FALSE; 602 mutex_exit(¶m->mla_mtx); 603 604 mlxcx_update_link_state(mlxp, port); 605 } 606 607 static const char * 608 mlxcx_module_error_string(mlxcx_module_error_type_t err) 609 { 610 switch (err) { 611 case MLXCX_MODULE_ERR_POWER_BUDGET: 612 return ("POWER_BUDGET"); 613 case MLXCX_MODULE_ERR_LONG_RANGE: 614 return ("LONG_RANGE"); 615 case MLXCX_MODULE_ERR_BUS_STUCK: 616 return ("BUS_STUCK"); 617 case MLXCX_MODULE_ERR_NO_EEPROM: 618 return ("NO_EEPROM"); 619 case MLXCX_MODULE_ERR_ENFORCEMENT: 620 return ("ENFORCEMENT"); 621 case MLXCX_MODULE_ERR_UNKNOWN_IDENT: 622 return ("UNKNOWN_IDENT"); 623 case MLXCX_MODULE_ERR_HIGH_TEMP: 624 return ("HIGH_TEMP"); 625 case MLXCX_MODULE_ERR_CABLE_SHORTED: 626 return ("CABLE_SHORTED"); 627 default: 628 return ("UNKNOWN"); 629 } 630 } 631 632 static void 633 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd) 634 { 635 uint64_t ena; 636 char buf[FM_MAX_CLASS]; 637 const char *lename; 638 const char *ename; 639 const char *stname; 640 uint_t eno = 0; 641 mlxcx_module_status_t state = evd->mled_port_mod_module_status; 642 643 switch (state) { 644 case MLXCX_MODULE_ERROR: 645 stname = "error"; 646 eno = evd->mled_port_mod_error_type; 647 lename = mlxcx_module_error_string(eno); 648 switch (eno) { 649 case MLXCX_MODULE_ERR_ENFORCEMENT: 650 ename = DDI_FM_TXR_ERROR_WHITELIST; 651 break; 652 case MLXCX_MODULE_ERR_UNKNOWN_IDENT: 653 case MLXCX_MODULE_ERR_NO_EEPROM: 654 ename = DDI_FM_TXR_ERROR_NOTSUPP; 655 break; 656 case MLXCX_MODULE_ERR_HIGH_TEMP: 657 ename = DDI_FM_TXR_ERROR_OVERTEMP; 658 break; 659 case MLXCX_MODULE_ERR_POWER_BUDGET: 660 case MLXCX_MODULE_ERR_LONG_RANGE: 661 case MLXCX_MODULE_ERR_CABLE_SHORTED: 662 ename = DDI_FM_TXR_ERROR_HWFAIL; 663 break; 664 case MLXCX_MODULE_ERR_BUS_STUCK: 665 default: 666 ename = DDI_FM_TXR_ERROR_UNKNOWN; 667 } 668 break; 669 default: 670 return; 671 } 672 673 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 674 DDI_FM_NIC, DDI_FM_TXR_ERROR); 675 ena = fm_ena_generate(0, FM_ENA_FMT1); 676 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 677 return; 678 679 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 680 /* compulsory FM props */ 681 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 682 /* generic NIC txr error event props */ 683 "error", DATA_TYPE_STRING, ename, 684 "port_index", DATA_TYPE_UINT8, 0, 685 "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module, 686 /* local props */ 687 "mlxcx_state", DATA_TYPE_STRING, stname, 688 "mlxcx_error", DATA_TYPE_STRING, lename, 689 "mlxcx_error_num", DATA_TYPE_UINT8, eno, 690 NULL); 691 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 692 } 693 694 /* 695 * Common beginning of interrupt processing. 696 * Confirm interrupt hasn't been disabled, verify its state and 697 * mark the vector as active. 
698 */ 699 static boolean_t 700 mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 701 { 702 mutex_enter(&mleq->mleq_mtx); 703 704 if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) { 705 mutex_exit(&mleq->mleq_mtx); 706 return (B_FALSE); 707 } 708 709 if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) || 710 !(mleq->mleq_state & MLXCX_EQ_CREATED) || 711 (mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 712 mlxcx_warn(mlxp, "intr %d in bad eq state", 713 mleq->mleq_intr_index); 714 mutex_exit(&mleq->mleq_mtx); 715 return (B_FALSE); 716 } 717 718 mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE; 719 mutex_exit(&mleq->mleq_mtx); 720 721 return (B_TRUE); 722 } 723 724 /* 725 * End of interrupt processing. 726 * Mark vector as no longer active and if shutdown is blocked on this vector, 727 * wake it up. 728 */ 729 static void 730 mlxcx_intr_fini(mlxcx_event_queue_t *mleq) 731 { 732 mutex_enter(&mleq->mleq_mtx); 733 if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0) 734 cv_signal(&mleq->mleq_cv); 735 736 mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE; 737 mutex_exit(&mleq->mleq_mtx); 738 } 739 740 static uint_t 741 mlxcx_intr_async(caddr_t arg, caddr_t arg2) 742 { 743 mlxcx_t *mlxp = (mlxcx_t *)arg; 744 mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2; 745 mlxcx_eventq_ent_t *ent; 746 mlxcx_async_param_t *param; 747 uint_t portn; 748 uint16_t func; 749 750 if (!mlxcx_intr_ini(mlxp, mleq)) 751 return (DDI_INTR_CLAIMED); 752 753 ent = mlxcx_eq_next(mleq); 754 if (ent == NULL) { 755 goto done; 756 } 757 758 ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED); 759 mleq->mleq_state &= ~MLXCX_EQ_ARMED; 760 761 for (; ent != NULL; ent = mlxcx_eq_next(mleq)) { 762 DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *, 763 ent); 764 765 switch (ent->mleqe_event_type) { 766 case MLXCX_EVENT_CMD_COMPLETION: 767 mlxcx_cmd_completion(mlxp, ent); 768 break; 769 case MLXCX_EVENT_PAGE_REQUEST: 770 func = from_be16(ent->mleqe_page_request. 771 mled_page_request_function_id); 772 VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX); 773 774 param = &mlxp->mlx_npages_req[func]; 775 mutex_enter(¶m->mla_mtx); 776 if (param->mla_pending) { 777 /* 778 * The PRM states we will not get another 779 * page request event until any pending have 780 * been posted as complete to the HCA. 781 * This will guard against this anyway. 782 */ 783 mutex_exit(¶m->mla_mtx); 784 mlxcx_warn(mlxp, "Unexpected page request " 785 "whilst another is pending"); 786 break; 787 } 788 param->mla_pages.mlp_npages = 789 (int32_t)from_be32(ent->mleqe_page_request. 790 mled_page_request_num_pages); 791 param->mla_pages.mlp_func = func; 792 param->mla_pending = B_TRUE; 793 ASSERT3P(param->mla_mlx, ==, mlxp); 794 mutex_exit(¶m->mla_mtx); 795 796 taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task, 797 param, 0, ¶m->mla_tqe); 798 break; 799 case MLXCX_EVENT_PORT_STATE: 800 portn = get_bits8( 801 ent->mleqe_port_state.mled_port_state_port_num, 802 MLXCX_EVENT_PORT_NUM) - 1; 803 if (portn >= mlxp->mlx_nports) 804 break; 805 806 param = &mlxp->mlx_ports[portn].mlx_port_event; 807 mutex_enter(¶m->mla_mtx); 808 if (param->mla_pending) { 809 /* 810 * There is a link state event pending 811 * processing. When that event is handled 812 * it will get the current link state. 
813 */ 814 mutex_exit(¶m->mla_mtx); 815 break; 816 } 817 818 ASSERT3P(param->mla_mlx, ==, mlxp); 819 ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]); 820 821 param->mla_pending = B_TRUE; 822 mutex_exit(¶m->mla_mtx); 823 824 taskq_dispatch_ent(mlxp->mlx_async_tq, 825 mlxcx_link_state_task, param, 0, ¶m->mla_tqe); 826 break; 827 case MLXCX_EVENT_PORT_MODULE: 828 mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod); 829 break; 830 default: 831 mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d", 832 ent->mleqe_event_type, mleq->mleq_intr_index); 833 } 834 } 835 836 mlxcx_arm_eq(mlxp, mleq); 837 838 done: 839 mlxcx_intr_fini(mleq); 840 return (DDI_INTR_CLAIMED); 841 } 842 843 static boolean_t 844 mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, 845 size_t bytelim) 846 { 847 mlxcx_work_queue_t *wq = mlcq->mlcq_wq; 848 mlxcx_completionq_ent_t *cent; 849 mblk_t *mp, *cmp, *nmp; 850 mlxcx_buffer_t *buf; 851 boolean_t found, added; 852 size_t bytes = 0; 853 uint_t rx_frames = 0; 854 uint_t comp_cnt = 0; 855 int64_t wqebbs, bufcnt; 856 857 *mpp = NULL; 858 859 if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || 860 !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || 861 (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || 862 (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) { 863 return (B_FALSE); 864 } 865 866 nmp = cmp = mp = NULL; 867 868 wqebbs = 0; 869 bufcnt = 0; 870 for (cent = mlxcx_cq_next(mlcq); cent != NULL; 871 cent = mlxcx_cq_next(mlcq)) { 872 /* 873 * Teardown and ring stop can atomic_or this flag 874 * into our state if they want us to stop early. 875 */ 876 if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) 877 return (B_FALSE); 878 879 comp_cnt++; 880 if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && 881 cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { 882 /* NOP */ 883 atomic_dec_64(&wq->mlwq_wqebb_used); 884 goto nextcq; 885 } 886 887 lookagain: 888 /* 889 * Generally the buffer we're looking for will be 890 * at the front of the list, so this loop won't 891 * need to look far. 892 */ 893 buf = list_head(&mlcq->mlcq_buffers); 894 found = B_FALSE; 895 while (buf != NULL) { 896 if ((buf->mlb_wqe_index & UINT16_MAX) == 897 from_be16(cent->mlcqe_wqe_counter)) { 898 found = B_TRUE; 899 break; 900 } 901 buf = list_next(&mlcq->mlcq_buffers, buf); 902 } 903 904 if (!found) { 905 /* 906 * If there's any buffers waiting on the 907 * buffers_b list, then merge those into 908 * the main list and have another look. 909 * 910 * The wq enqueue routines push new buffers 911 * into buffers_b so that they can avoid 912 * taking the mlcq_mtx and blocking us for 913 * every single packet. 914 */ 915 added = B_FALSE; 916 mutex_enter(&mlcq->mlcq_bufbmtx); 917 if (!list_is_empty(&mlcq->mlcq_buffers_b)) { 918 list_move_tail(&mlcq->mlcq_buffers, 919 &mlcq->mlcq_buffers_b); 920 added = B_TRUE; 921 } 922 mutex_exit(&mlcq->mlcq_bufbmtx); 923 if (added) 924 goto lookagain; 925 926 /* 927 * This check could go just after the lookagain 928 * label, but it is a hot code path so we don't 929 * want to unnecessarily grab a lock and check 930 * a flag for a relatively rare event (the ring 931 * being stopped). 932 */ 933 mutex_enter(&wq->mlwq_mtx); 934 if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) { 935 mutex_exit(&wq->mlwq_mtx); 936 goto nextcq; 937 } 938 mutex_exit(&wq->mlwq_mtx); 939 940 buf = list_head(&mlcq->mlcq_buffers); 941 mlxcx_warn(mlxp, "got completion on CQ %x but " 942 "no buffer matching wqe found: %x (first " 943 "buffer counter = %x)", mlcq->mlcq_num, 944 from_be16(cent->mlcqe_wqe_counter), 945 buf == NULL ? 
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}

		/*
		 * The buf is likely to be freed below, count this now.
		 */
		wqebbs += buf->mlb_wqebbs;

		list_remove(&mlcq->mlcq_buffers, buf);
		bufcnt++;

		switch (mlcq->mlcq_wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
			bytes += from_be32(cent->mlcqe_byte_cnt);
			if (nmp != NULL) {
				if (cmp != NULL) {
					cmp->b_next = nmp;
					cmp = nmp;
				} else {
					mp = cmp = nmp;
				}

				rx_frames++;
			}
			break;
		}

		/*
		 * Update the consumer index with what has been processed,
		 * followed by driver counters. It is important to tell the
		 * hardware first, otherwise when we throw more packets at
		 * it, it may get an overflow error.
		 * We do this whenever we've processed enough to bridge the
		 * high->low water mark.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			/*
			 * Both these variables are incremented using
			 * atomics as they are modified in other code paths
			 * (e.g. during tx) which hold different locks.
			 */
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
			wqebbs = 0;
			bufcnt = 0;
			comp_cnt = 0;
		}
nextcq:
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}

	if (comp_cnt > 0) {
		mlxcx_update_cqci(mlxp, mlcq);
		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
	}

	*mpp = mp;
	return (B_TRUE);
}

mblk_t *
mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
{
	mblk_t *mp = NULL;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	ASSERT(mlcq->mlcq_wq != NULL);
	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);

	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);

	return (mp);
}

static uint_t
mlxcx_intr_n(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_completion_queue_t *mlcq, probe;
	mlxcx_work_queue_t *mlwq;
	mblk_t *mp = NULL;
	boolean_t tellmac = B_FALSE;

	if (!mlxcx_intr_ini(mlxp, mleq))
		return (DDI_INTR_CLAIMED);

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
		}
		goto done;
	}
	mleq->mleq_badintrs = 0;

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
			goto done;
		}
		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);

		probe.mlcq_num =
		    from_be24(ent->mleqe_completion.mled_completion_cqn);
		mutex_enter(&mleq->mleq_mtx);
		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
		mutex_exit(&mleq->mleq_mtx);

		if (mlcq == NULL)
			continue;

		mlwq = mlcq->mlcq_wq;

		/*
		 * mlcq_arm_mtx is used to avoid race conditions between
		 * this interrupt routine and the transition from polling
		 * back to interrupt mode. When exiting poll mode the
		 * CQ is likely to be un-armed, which means there will
		 * be no events for the CQ coming through here,
		 * consequently very low contention on mlcq_arm_mtx.
		 *
		 * mlcq_arm_mtx must be released before calls into the mac
		 * layer in order to avoid deadlocks.
		 */
		mutex_enter(&mlcq->mlcq_arm_mtx);
		mlcq->mlcq_ec++;
		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
			/*
			 * If we failed to take the mutex because the
			 * polling function has it, just move on.
			 * We don't want to block other CQs behind
			 * this one.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
				mutex_exit(&mlcq->mlcq_arm_mtx);
				goto update_eq;
			}

			/* Otherwise we will wait. */
			mutex_enter(&mlcq->mlcq_mtx);
		}

		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
			/*
			 * The ring is not in polling mode and we processed
			 * some completion queue entries.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
				atomic_and_uint(&mlcq->mlcq_state,
				    ~MLXCX_CQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
				atomic_and_uint(&mlwq->mlwq_state,
				    ~MLXCX_WQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			mlxcx_arm_cq(mlxp, mlcq);

			mutex_exit(&mlcq->mlcq_mtx);
			mutex_exit(&mlcq->mlcq_arm_mtx);

			if (tellmac) {
				mac_tx_ring_update(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl);
				tellmac = B_FALSE;
			}

			if (mp != NULL) {
				mac_rx_ring(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
			}
		} else {
			mutex_exit(&mlcq->mlcq_mtx);
			mutex_exit(&mlcq->mlcq_arm_mtx);
		}

update_eq:
		/*
		 * Updating the consumer counter for an EQ requires a write
		 * to the UAR, which is possibly expensive.
		 *
		 * Try to do it only often enough to stop us wrapping around.
1159 */ 1160 if ((mleq->mleq_cc & 0x7) == 0) 1161 mlxcx_update_eq(mlxp, mleq); 1162 } 1163 1164 mlxcx_arm_eq(mlxp, mleq); 1165 1166 done: 1167 mlxcx_intr_fini(mleq); 1168 return (DDI_INTR_CLAIMED); 1169 } 1170 1171 boolean_t 1172 mlxcx_intr_setup(mlxcx_t *mlxp) 1173 { 1174 dev_info_t *dip = mlxp->mlx_dip; 1175 int ret; 1176 int nintrs = 0; 1177 int navail = 0; 1178 int types, i; 1179 mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY; 1180 1181 ret = ddi_intr_get_supported_types(dip, &types); 1182 if (ret != DDI_SUCCESS) { 1183 mlxcx_warn(mlxp, "Failed to get supported interrupt types"); 1184 return (B_FALSE); 1185 } 1186 1187 if (!(types & DDI_INTR_TYPE_MSIX)) { 1188 mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx " 1189 "requires MSI-X"); 1190 return (B_FALSE); 1191 } 1192 1193 ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs); 1194 if (ret != DDI_SUCCESS) { 1195 mlxcx_warn(mlxp, "Failed to get number of interrupts"); 1196 return (B_FALSE); 1197 } 1198 if (nintrs < 2) { 1199 mlxcx_warn(mlxp, "%d MSI-X interrupts supported, but mlxcx " 1200 "requires 2", nintrs); 1201 return (B_FALSE); 1202 } 1203 1204 ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail); 1205 if (ret != DDI_SUCCESS) { 1206 mlxcx_warn(mlxp, 1207 "Failed to get number of available interrupts"); 1208 return (B_FALSE); 1209 } 1210 if (navail < 2) { 1211 mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx " 1212 "requires 2", navail); 1213 return (B_FALSE); 1214 } 1215 1216 mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t); 1217 mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP); 1218 /* 1219 * Interrupts for Completion Queues events start from vector 1 1220 * up to available vectors. Vector 0 is used for asynchronous 1221 * events. 1222 */ 1223 mlxp->mlx_intr_cq0 = 1; 1224 1225 ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX, 1226 0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL); 1227 if (ret != DDI_SUCCESS) { 1228 mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail); 1229 mlxcx_intr_teardown(mlxp); 1230 return (B_FALSE); 1231 } 1232 if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) { 1233 mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx " 1234 "requires %d", mlxp->mlx_intr_count, 1235 mlxp->mlx_intr_cq0 + 1); 1236 mlxcx_intr_teardown(mlxp); 1237 return (B_FALSE); 1238 } 1239 mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX; 1240 1241 ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri); 1242 if (ret != DDI_SUCCESS) { 1243 mlxcx_warn(mlxp, "Failed to get interrupt priority"); 1244 mlxcx_intr_teardown(mlxp); 1245 return (B_FALSE); 1246 } 1247 1248 /* 1249 * Set the interrupt priority for the asynchronous handler higher 1250 * than the ring handlers. Some operations which issue commands, 1251 * and thus rely on the async interrupt handler for posting 1252 * completion, do so with a CQ mutex held. The CQ mutex is also 1253 * acquired during ring processing, so if the ring processing vector 1254 * happens to be assigned to the same CPU as the async vector 1255 * it can hold off the async interrupt thread and lead to a deadlock. 1256 * By assigning a higher priority to the async vector, it will 1257 * always be dispatched. 
1258 */ 1259 mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri; 1260 if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) { 1261 mlxp->mlx_async_intr_pri++; 1262 } else { 1263 mlxp->mlx_intr_pri--; 1264 } 1265 1266 mlxp->mlx_eqs_size = mlxp->mlx_intr_count * 1267 sizeof (mlxcx_event_queue_t); 1268 mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP); 1269 1270 /* 1271 * In the failure path, mlxcx_intr_teardown() expects this 1272 * mutex and avl tree to be init'ed - so do it now. 1273 */ 1274 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1275 uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri : 1276 mlxp->mlx_intr_pri; 1277 1278 mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER, 1279 DDI_INTR_PRI(pri)); 1280 cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL); 1281 1282 if (i < mlxp->mlx_intr_cq0) 1283 continue; 1284 1285 avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare, 1286 sizeof (mlxcx_completion_queue_t), 1287 offsetof(mlxcx_completion_queue_t, mlcq_eq_entry)); 1288 } 1289 1290 while (mlxp->mlx_async_intr_pri > DDI_INTR_PRI_MIN) { 1291 ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0], 1292 mlxp->mlx_async_intr_pri); 1293 if (ret == DDI_SUCCESS) 1294 break; 1295 mlxcx_note(mlxp, 1296 "!Failed to set interrupt priority to %u for " 1297 "async interrupt vector", mlxp->mlx_async_intr_pri); 1298 /* 1299 * If it was not possible to set the IPL for the async 1300 * interrupt to the desired value, then try a lower priority. 1301 * Some PSMs can only accommodate a limited number of vectors 1302 * at eatch priority level (or group of priority levels). Since 1303 * the async priority must be set higher than the ring 1304 * handlers, lower both. The ring handler priority is set 1305 * below. 1306 */ 1307 mlxp->mlx_async_intr_pri--; 1308 mlxp->mlx_intr_pri--; 1309 } 1310 1311 if (mlxp->mlx_async_intr_pri == DDI_INTR_PRI_MIN) { 1312 mlxcx_warn(mlxp, "Failed to find an interrupt priority for " 1313 "async interrupt vector"); 1314 mlxcx_intr_teardown(mlxp); 1315 return (B_FALSE); 1316 } 1317 1318 ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async, 1319 (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]); 1320 if (ret != DDI_SUCCESS) { 1321 mlxcx_warn(mlxp, "Failed to add async interrupt handler"); 1322 mlxcx_intr_teardown(mlxp); 1323 return (B_FALSE); 1324 } 1325 1326 /* 1327 * If we have enough interrupts, set their "type" fields so that we 1328 * avoid mixing RX and TX queues on the same EQs. 1329 */ 1330 if (mlxp->mlx_intr_count >= 8) { 1331 eqt = MLXCX_EQ_TYPE_RX; 1332 } 1333 1334 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) { 1335 mlxp->mlx_eqs[i].mleq_intr_index = i; 1336 1337 mlxp->mlx_eqs[i].mleq_type = eqt; 1338 /* 1339 * If eqt is still ANY, just leave it set to that 1340 * (no else here). 
1341 */ 1342 if (eqt == MLXCX_EQ_TYPE_RX) { 1343 eqt = MLXCX_EQ_TYPE_TX; 1344 } else if (eqt == MLXCX_EQ_TYPE_TX) { 1345 eqt = MLXCX_EQ_TYPE_RX; 1346 } 1347 1348 while (mlxp->mlx_intr_pri >= DDI_INTR_PRI_MIN) { 1349 ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i], 1350 mlxp->mlx_intr_pri); 1351 if (ret == DDI_SUCCESS) 1352 break; 1353 mlxcx_note(mlxp, "!Failed to set interrupt priority to " 1354 "%u for interrupt vector %d", 1355 mlxp->mlx_intr_pri, i); 1356 mlxp->mlx_intr_pri--; 1357 } 1358 if (mlxp->mlx_intr_pri < DDI_INTR_PRI_MIN) { 1359 mlxcx_warn(mlxp, 1360 "Failed to find an interrupt priority for " 1361 "interrupt vector %d", i); 1362 mlxcx_intr_teardown(mlxp); 1363 return (B_FALSE); 1364 } 1365 1366 ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i], 1367 mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]); 1368 if (ret != DDI_SUCCESS) { 1369 mlxcx_warn(mlxp, "Failed to add interrupt handler %d", 1370 i); 1371 mlxcx_intr_teardown(mlxp); 1372 return (B_FALSE); 1373 } 1374 } 1375 1376 return (B_TRUE); 1377 } 1378