/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2021, the University of Queensland
 * Copyright 2020 RackTop Systems, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/disp.h>
#include <sys/sdt.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);

/*
 * Disable interrupts.
 * The act of calling ddi_intr_disable() does not guarantee an interrupt
 * routine is not running, so flag the vector as quiescing and wait
 * for anything active to finish.
 */
void
mlxcx_intr_disable(mlxcx_t *mlxp)
{
	int i;

	mlxcx_cmd_eq_disable(mlxp);

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

		mutex_enter(&mleq->mleq_mtx);

		if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
			mutex_exit(&mleq->mleq_mtx);
			continue;
		}

		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);

		mleq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
		while ((mleq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
			cv_wait(&mleq->mleq_cv, &mleq->mleq_mtx);

		mleq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;

		mutex_exit(&mleq->mleq_mtx);
	}
}

void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
	int i;
	int ret;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

		mutex_enter(&mleq->mleq_mtx);
		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
		if (mleq->mleq_state & MLXCX_EQ_CREATED)
			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
		if (i >= mlxp->mlx_intr_cq0) {
			VERIFY(avl_is_empty(&mleq->mleq_cqs));
			avl_destroy(&mleq->mleq_cqs);
		}
		mutex_exit(&mleq->mleq_mtx);
		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
			    i, ret);
		}
		mutex_destroy(&mleq->mleq_mtx);
		cv_destroy(&mleq->mleq_cv);
	}
	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
	mlxp->mlx_intr_handles = NULL;
	mlxp->mlx_eqs = NULL;
}

/*
 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
	mlxcx_eventq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

	/*
	 * This should only be called from interrupt context to ensure
	 * correctness of mleq_cc.
	 */
	ASSERT(servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	/* mleq_nents is always a power of 2 */
	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);

	ent = &mleq->mleq_ent[ci];
	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mleq->mleq_cc;
		return (ent);
	}
	/*
	 * In the case of a DMA error, we should re-arm this EQ and then come
	 * back and try again when the device wakes us back up.
	 *
	 * Hopefully the fault will be gone by then.
	 */
	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

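/*
 * Write the arm doorbell for an EQ, asking the hardware to raise an
 * interrupt when the next event arrives. Retries on FM access errors up
 * to mlxcx_doorbell_tries before noting a lost service impact.
 */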
void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	uint_t try = 0;
	ddi_fm_error_t err;
	bits32_t v = new_bits32();

	/*
	 * This is only called during initialization when the EQ is
	 * armed for the first time, and when re-armed at the end of
	 * interrupt processing.
	 */
	ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

	mleq->mleq_state |= MLXCX_EQ_ARMED;
	mleq->mleq_cc_armed = mleq->mleq_cc;

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

retry:
	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	bits32_t v = new_bits32();
	ddi_fm_error_t err;

	/*
	 * This should only be called from interrupt context to ensure
	 * correctness of mleq_cc.
	 */
	ASSERT(servicing_interrupt());
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
	/*
	 * Ignore the error; if it's still happening when we try to re-arm
	 * the EQ, we will note the impact then.
	 */
}

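/*
 * Get the next SW-owned entry on the completion queue, or NULL if we reach
 * the end or encounter a DMA error.
 */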
static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
	mlxcx_completionq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	/* mlcq_nents is always a power of 2 */
	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);

	ent = &mlcq->mlcq_ent[ci];
	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mlcq->mlcq_cc;
		return (ent);
	}
	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	ddi_fm_error_t err;
	uint_t try = 0;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			return;
		}
	}
}

void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	bits32_t dbval = new_bits32();
	uint64_t udbval;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
	}

	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
		return;

	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

	udbval = (uint64_t)from_bits32(dbval) << 32;
	udbval |= mlcq->mlcq_num & 0xffffff;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

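/*
 * Translate an event queue entry type into a name for logging.
 */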
const char *
mlxcx_event_name(mlxcx_event_t evt)
{
	switch (evt) {
	case MLXCX_EVENT_COMPLETION:
		return ("COMPLETION");
	case MLXCX_EVENT_PATH_MIGRATED:
		return ("PATH_MIGRATED");
	case MLXCX_EVENT_COMM_ESTABLISH:
		return ("COMM_ESTABLISH");
	case MLXCX_EVENT_SENDQ_DRAIN:
		return ("SENDQ_DRAIN");
	case MLXCX_EVENT_LAST_WQE:
		return ("LAST_WQE");
	case MLXCX_EVENT_SRQ_LIMIT:
		return ("SRQ_LIMIT");
	case MLXCX_EVENT_DCT_ALL_CLOSED:
		return ("DCT_ALL_CLOSED");
	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
		return ("DCT_ACCKEY_VIOL");
	case MLXCX_EVENT_CQ_ERROR:
		return ("CQ_ERROR");
	case MLXCX_EVENT_WQ_CATASTROPHE:
		return ("WQ_CATASTROPHE");
	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
		return ("PATH_MIGRATE_FAIL");
	case MLXCX_EVENT_PAGE_FAULT:
		return ("PAGE_FAULT");
	case MLXCX_EVENT_WQ_INVALID_REQ:
		return ("WQ_INVALID_REQ");
	case MLXCX_EVENT_WQ_ACCESS_VIOL:
		return ("WQ_ACCESS_VIOL");
	case MLXCX_EVENT_SRQ_CATASTROPHE:
		return ("SRQ_CATASTROPHE");
	case MLXCX_EVENT_INTERNAL_ERROR:
		return ("INTERNAL_ERROR");
	case MLXCX_EVENT_PORT_STATE:
		return ("PORT_STATE");
	case MLXCX_EVENT_GPIO:
		return ("GPIO");
	case MLXCX_EVENT_PORT_MODULE:
		return ("PORT_MODULE");
	case MLXCX_EVENT_TEMP_WARNING:
		return ("TEMP_WARNING");
	case MLXCX_EVENT_REMOTE_CONFIG:
		return ("REMOTE_CONFIG");
	case MLXCX_EVENT_DCBX_CHANGE:
		return ("DCBX_CHANGE");
	case MLXCX_EVENT_DOORBELL_CONGEST:
		return ("DOORBELL_CONGEST");
	case MLXCX_EVENT_STALL_VL:
		return ("STALL_VL");
	case MLXCX_EVENT_CMD_COMPLETION:
		return ("CMD_COMPLETION");
	case MLXCX_EVENT_PAGE_REQUEST:
		return ("PAGE_REQUEST");
	case MLXCX_EVENT_NIC_VPORT:
		return ("NIC_VPORT");
	case MLXCX_EVENT_EC_PARAMS_CHANGE:
		return ("EC_PARAMS_CHANGE");
	case MLXCX_EVENT_XRQ_ERROR:
		return ("XRQ_ERROR");
	}
	return ("UNKNOWN");
}

/* Should be called only when link state has changed. */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
	link_state_t ls;

	mutex_enter(&port->mlp_mtx);
	(void) mlxcx_cmd_query_port_status(mlxp, port);
	(void) mlxcx_cmd_query_port_speed(mlxp, port);
	(void) mlxcx_cmd_query_port_fec(mlxp, port);

	switch (port->mlp_oper_status) {
	case MLXCX_PORT_STATUS_UP:
	case MLXCX_PORT_STATUS_UP_ONCE:
		ls = LINK_STATE_UP;
		break;
	case MLXCX_PORT_STATUS_DOWN:
		ls = LINK_STATE_DOWN;
		break;
	default:
		ls = LINK_STATE_UNKNOWN;
	}

	if (mlxp->mlx_mac_hdl != NULL)
		mac_link_update(mlxp->mlx_mac_hdl, ls);

	mutex_exit(&port->mlp_mtx);
}

CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);

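/*
 * Allocate DMA pages and give them to the hardware in response to a page
 * request asking for more pages.
 */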
static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	mlxcx_dev_page_t *mdp;
	mlxcx_dev_page_t **pages;
	size_t i;
	const ddi_dma_cookie_t *ck;

	/*
	 * If this isn't enough, the HCA will ask for more
	 */
	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%lu", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;
		pages[i] = mdp;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
		    "pages!", npages);
		mutex_exit(&mlxp->mlx_pagemtx);
		goto cleanup_npages;
	}

	for (i = 0; i < npages; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += npages;
	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pages, sizeof (*pages) * npages);

	return;

cleanup_npages:
	for (i = 0; i < npages; i++) {
		if ((mdp = pages[i]) == NULL)
			break;

		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	/* Tell the hardware we had an allocation failure. */
	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
	    0, NULL);
	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pages, sizeof (*pages) * npages);
}

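/*
 * Return pages to the hardware in response to a page request asking us to
 * give some back, freeing the associated DMA memory.
 */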
static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
	uint_t i;
	int32_t ret;
	uint64_t *pas;
	mlxcx_dev_page_t *mdp, probe;

	pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);

	if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
		kmem_free(pas, sizeof (*pas) * npages);
		return;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	ASSERT0(avl_is_empty(&mlxp->mlx_pages));

	for (i = 0; i < ret; i++) {
		bzero(&probe, sizeof (probe));
		probe.mxdp_pa = pas[i];

		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

		if (mdp != NULL) {
			avl_remove(&mlxp->mlx_pages, mdp);
			mlxp->mlx_npages--;
			mlxcx_dma_free(&mdp->mxdp_dma);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
		} else {
			mlxcx_warn(mlxp, "hardware returned a page "
			    "with PA 0x%" PRIx64 " but we have no "
			    "record of giving out such a page", pas[i]);
		}
	}

	mutex_exit(&mlxp->mlx_pagemtx);

	kmem_free(pas, sizeof (*pas) * npages);
}

static void
mlxcx_pages_task(void *arg)
{
	mlxcx_async_param_t *param = arg;
	mlxcx_t *mlxp = param->mla_mlx;
	int32_t npages;

	/*
	 * We can drop the pending status now, as we've extracted what
	 * is needed to process the pages request.
	 *
	 * Even though we should never get another pages request until
	 * we have responded to this one, the guard in mlxcx_sync_intr
	 * also makes it safe to reuse the mlxcx_async_param_t.
	 */
	mutex_enter(&param->mla_mtx);
	npages = param->mla_pages.mlp_npages;
	param->mla_pending = B_FALSE;
	bzero(&param->mla_pages, sizeof (param->mla_pages));
	mutex_exit(&param->mla_mtx);

	/*
	 * The PRM describes npages as: "Number of missing / unneeded pages
	 * (signed number, msb indicate sign)". The implication is that
	 * it will not be zero. We are expected to use this to give or
	 * take back pages (based on the sign) using the MANAGE_PAGES
	 * command but we can't determine whether to give or take
	 * when npages is zero. So we do nothing.
	 */
	if (npages > 0) {
		mlxcx_give_pages_once(mlxp, npages);
	} else if (npages < 0) {
		mlxcx_take_pages_once(mlxp, -1 * npages);
	}
}

static void
mlxcx_link_state_task(void *arg)
{
	mlxcx_async_param_t *param = arg;
	mlxcx_port_t *port;
	mlxcx_t *mlxp;

	/*
	 * Gather the arguments from the parameters and clear the
	 * pending status.
	 *
	 * The pending status must be cleared *before* we update the
	 * link state. This is both safe and required to ensure we always
	 * have the correct link state. It is safe because taskq_ents are
	 * reusable (by the caller of taskq_dispatch_ent()) once the
	 * task function has started executing. It must be done before
	 * updating the link state to guarantee further link state change
	 * events are not missed and we always have the current link state.
	 */
	mutex_enter(&param->mla_mtx);
	mlxp = param->mla_mlx;
	port = param->mla_port;
	param->mla_pending = B_FALSE;
	mutex_exit(&param->mla_mtx);

	mlxcx_update_link_state(mlxp, port);
}

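/*
 * Translate a port module error type into a name for logging and ereports.
 */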
static const char *
mlxcx_module_error_string(mlxcx_module_error_type_t err)
{
	switch (err) {
	case MLXCX_MODULE_ERR_POWER_BUDGET:
		return ("POWER_BUDGET");
	case MLXCX_MODULE_ERR_LONG_RANGE:
		return ("LONG_RANGE");
	case MLXCX_MODULE_ERR_BUS_STUCK:
		return ("BUS_STUCK");
	case MLXCX_MODULE_ERR_NO_EEPROM:
		return ("NO_EEPROM");
	case MLXCX_MODULE_ERR_ENFORCEMENT:
		return ("ENFORCEMENT");
	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		return ("UNKNOWN_IDENT");
	case MLXCX_MODULE_ERR_HIGH_TEMP:
		return ("HIGH_TEMP");
	case MLXCX_MODULE_ERR_CABLE_SHORTED:
		return ("CABLE_SHORTED");
	default:
		return ("UNKNOWN");
	}
}

static void
mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *lename;
	const char *ename;
	const char *stname;
	uint_t eno = 0;
	mlxcx_module_status_t state = evd->mled_port_mod_module_status;

	switch (state) {
	case MLXCX_MODULE_ERROR:
		stname = "error";
		eno = evd->mled_port_mod_error_type;
		lename = mlxcx_module_error_string(eno);
		switch (eno) {
		case MLXCX_MODULE_ERR_ENFORCEMENT:
			ename = DDI_FM_TXR_ERROR_WHITELIST;
			break;
		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		case MLXCX_MODULE_ERR_NO_EEPROM:
			ename = DDI_FM_TXR_ERROR_NOTSUPP;
			break;
		case MLXCX_MODULE_ERR_HIGH_TEMP:
			ename = DDI_FM_TXR_ERROR_OVERTEMP;
			break;
		case MLXCX_MODULE_ERR_POWER_BUDGET:
		case MLXCX_MODULE_ERR_LONG_RANGE:
		case MLXCX_MODULE_ERR_CABLE_SHORTED:
			ename = DDI_FM_TXR_ERROR_HWFAIL;
			break;
		case MLXCX_MODULE_ERR_BUS_STUCK:
		default:
			ename = DDI_FM_TXR_ERROR_UNKNOWN;
		}
		break;
	default:
		return;
	}

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    /* compulsory FM props */
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    /* generic NIC txr error event props */
	    "error", DATA_TYPE_STRING, ename,
	    "port_index", DATA_TYPE_UINT8, 0,
	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
	    /* local props */
	    "mlxcx_state", DATA_TYPE_STRING, stname,
	    "mlxcx_error", DATA_TYPE_STRING, lename,
	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

/*
 * Common beginning of interrupt processing.
 * Confirm interrupt hasn't been disabled, verify its state and
 * mark the vector as active.
 */
static boolean_t
mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	mutex_enter(&mleq->mleq_mtx);

	if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mlxcx_warn(mlxp, "intr %d in bad eq state",
		    mleq->mleq_intr_index);
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
	mutex_exit(&mleq->mleq_mtx);

	return (B_TRUE);
}

/*
 * End of interrupt processing.
 * Mark vector as no longer active and if shutdown is blocked on this vector,
 * wake it up.
 */
static void
mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
{
	mutex_enter(&mleq->mleq_mtx);
	if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
		cv_signal(&mleq->mleq_cv);

	mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;
	mutex_exit(&mleq->mleq_mtx);
}

static uint_t
mlxcx_intr_async(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_async_param_t *param;
	uint_t portn;
	uint16_t func;

	if (!mlxcx_intr_ini(mlxp, mleq))
		return (DDI_INTR_CLAIMED);

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		goto done;
	}

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
		    ent);

		/*
		 * Handle events which can be processed while we're still in
		 * mlxcx_attach(). Everything on the mlxcx_t which these events
		 * use must be allocated and set up prior to the call to
		 * mlxcx_setup_async_eqs().
		 */
		switch (ent->mleqe_event_type) {
		case MLXCX_EVENT_CMD_COMPLETION:
			mlxcx_cmd_completion(mlxp, ent);
			continue;
		case MLXCX_EVENT_PAGE_REQUEST:
			func = from_be16(ent->mleqe_page_request.
			    mled_page_request_function_id);
			VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);

			param = &mlxp->mlx_npages_req[func];
			mutex_enter(&param->mla_mtx);
			if (param->mla_pending) {
				/*
				 * The PRM states we will not get another
				 * page request event until any pending have
				 * been posted as complete to the HCA.
				 * This check guards against it anyway.
				 */
				mutex_exit(&param->mla_mtx);
				mlxcx_warn(mlxp, "Unexpected page request "
				    "whilst another is pending");
				continue;
			}
			param->mla_pages.mlp_npages =
			    (int32_t)from_be32(ent->mleqe_page_request.
			    mled_page_request_num_pages);
			param->mla_pages.mlp_func = func;
			param->mla_pending = B_TRUE;
			ASSERT3P(param->mla_mlx, ==, mlxp);
			mutex_exit(&param->mla_mtx);

			taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
			    param, 0, &param->mla_tqe);
			continue;
		}

		/*
		 * All other events should be ignored while in attach.
		 */
		mutex_enter(&mleq->mleq_mtx);
		if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
			mutex_exit(&mleq->mleq_mtx);
			continue;
		}
		mutex_exit(&mleq->mleq_mtx);

		switch (ent->mleqe_event_type) {
		case MLXCX_EVENT_PORT_STATE:
			portn = get_bits8(
			    ent->mleqe_port_state.mled_port_state_port_num,
			    MLXCX_EVENT_PORT_NUM) - 1;
			if (portn >= mlxp->mlx_nports)
				break;

			param = &mlxp->mlx_ports[portn].mlx_port_event;
			mutex_enter(&param->mla_mtx);
			if (param->mla_pending) {
				/*
				 * There is a link state event pending
				 * processing. When that event is handled
				 * it will get the current link state.
				 */
				mutex_exit(&param->mla_mtx);
				break;
			}

			ASSERT3P(param->mla_mlx, ==, mlxp);
			ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);

			param->mla_pending = B_TRUE;
			mutex_exit(&param->mla_mtx);

			taskq_dispatch_ent(mlxp->mlx_async_tq,
			    mlxcx_link_state_task, param, 0, &param->mla_tqe);
			break;
		case MLXCX_EVENT_PORT_MODULE:
			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
			break;
		default:
			mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
			    ent->mleqe_event_type, mleq->mleq_intr_index);
		}
	}

	mlxcx_arm_eq(mlxp, mleq);

done:
	mlxcx_intr_fini(mleq);
	return (DDI_INTR_CLAIMED);
}

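/*
 * Process completions on a CQ: match each completion entry to the buffer
 * it refers to, hand completed buffers to the TX/RX completion paths, and
 * periodically update the consumer index. RX mblks are chained and returned
 * via mpp. Returns B_FALSE if the CQ is being (or has been) torn down.
 */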
static boolean_t
mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
    size_t bytelim)
{
	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
	mlxcx_completionq_ent_t *cent;
	mblk_t *mp, *cmp, *nmp;
	mlxcx_buffer_t *buf;
	boolean_t found, added;
	size_t bytes = 0;
	uint_t rx_frames = 0;
	uint_t comp_cnt = 0;
	int64_t wqebbs, bufcnt;

	*mpp = NULL;

	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
		return (B_FALSE);
	}

	nmp = cmp = mp = NULL;

	wqebbs = 0;
	bufcnt = 0;
	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/*
		 * Teardown and ring stop can atomic_or this flag
		 * into our state if they want us to stop early.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
			return (B_FALSE);

		comp_cnt++;
		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
			/* NOP */
			atomic_dec_64(&wq->mlwq_wqebb_used);
			goto nextcq;
		}

lookagain:
		/*
		 * Generally the buffer we're looking for will be
		 * at the front of the list, so this loop won't
		 * need to look far.
		 */
		buf = list_head(&mlcq->mlcq_buffers);
		found = B_FALSE;
		while (buf != NULL) {
			if ((buf->mlb_wqe_index & UINT16_MAX) ==
			    from_be16(cent->mlcqe_wqe_counter)) {
				found = B_TRUE;
				break;
			}
			buf = list_next(&mlcq->mlcq_buffers, buf);
		}

		if (!found) {
			/*
			 * If there are any buffers waiting on the
			 * buffers_b list, then merge those into
			 * the main list and have another look.
			 *
			 * The wq enqueue routines push new buffers
			 * into buffers_b so that they can avoid
			 * taking the mlcq_mtx and blocking us for
			 * every single packet.
			 */
			added = B_FALSE;
			mutex_enter(&mlcq->mlcq_bufbmtx);
			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
				list_move_tail(&mlcq->mlcq_buffers,
				    &mlcq->mlcq_buffers_b);
				added = B_TRUE;
			}
			mutex_exit(&mlcq->mlcq_bufbmtx);
			if (added)
				goto lookagain;

			/*
			 * This check could go just after the lookagain
			 * label, but it is a hot code path so we don't
			 * want to unnecessarily grab a lock and check
			 * a flag for a relatively rare event (the ring
			 * being stopped).
			 */
			mutex_enter(&wq->mlwq_mtx);
			if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
				mutex_exit(&wq->mlwq_mtx);
				goto nextcq;
			}
			mutex_exit(&wq->mlwq_mtx);

			buf = list_head(&mlcq->mlcq_buffers);
			mlxcx_warn(mlxp, "got completion on CQ %x but "
			    "no buffer matching wqe found: %x (first "
			    "buffer counter = %x)", mlcq->mlcq_num,
			    from_be16(cent->mlcqe_wqe_counter),
			    buf == NULL ? UINT32_MAX :
			    buf->mlb_wqe_index);
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}

		/*
		 * The buf is likely to be freed below, so count this now.
		 */
		wqebbs += buf->mlb_wqebbs;

		list_remove(&mlcq->mlcq_buffers, buf);
		bufcnt++;

		switch (mlcq->mlcq_wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
			bytes += from_be32(cent->mlcqe_byte_cnt);
			if (nmp != NULL) {
				if (cmp != NULL) {
					cmp->b_next = nmp;
					cmp = nmp;
				} else {
					mp = cmp = nmp;
				}

				rx_frames++;
			}
			break;
		}

		/*
		 * Update the consumer index with what has been processed,
		 * followed by driver counters. It is important to tell the
		 * hardware first, otherwise when we throw more packets at
		 * it, it may get an overflow error.
		 * We do this whenever we've processed enough to bridge the
		 * high->low water mark.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			/*
			 * Both these variables are updated using atomics,
			 * as they are modified in other code paths
			 * (e.g. during TX) which hold different locks.
			 */
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
			wqebbs = 0;
			bufcnt = 0;
			comp_cnt = 0;
		}
nextcq:
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}

	if (comp_cnt > 0) {
		mlxcx_update_cqci(mlxp, mlcq);
		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
	}

	*mpp = mp;
	return (B_TRUE);
}

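/*
 * Poll a receive CQ for completed packets, limited to roughly bytelim
 * bytes. Called with mlcq_mtx held.
 */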
mblk_t *
mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
{
	mblk_t *mp = NULL;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	ASSERT(mlcq->mlcq_wq != NULL);
	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);

	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);

	return (mp);
}

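/*
 * Interrupt handler for the completion queue ("ring") vectors: drains the
 * EQ, processes each signalled CQ unless it is in polling mode, then
 * re-arms the CQs and the EQ.
 */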
static uint_t
mlxcx_intr_n(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_completion_queue_t *mlcq, probe;
	mlxcx_work_queue_t *mlwq;
	mblk_t *mp = NULL;
	boolean_t tellmac = B_FALSE;

	if (!mlxcx_intr_ini(mlxp, mleq))
		return (DDI_INTR_CLAIMED);

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
		}
		goto done;
	}
	mleq->mleq_badintrs = 0;

	mutex_enter(&mleq->mleq_mtx);
	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;
#if defined(DEBUG)
	/*
	 * If we're still in mlxcx_attach and an intr_n fired, something really
	 * weird is going on. This shouldn't happen in the absence of a driver
	 * or firmware bug, so in the interests of minimizing branches in this
	 * function this check is under DEBUG.
	 */
	if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
		mutex_exit(&mleq->mleq_mtx);
		mlxcx_warn(mlxp, "intr_n (%u) fired during attach, disabling "
		    "vector", mleq->mleq_intr_index);
		mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
		ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
		(void) ddi_intr_disable(mlxp->mlx_intr_handles[
		    mleq->mleq_intr_index]);
		goto done;
	}
#endif
	mutex_exit(&mleq->mleq_mtx);

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);

		probe.mlcq_num =
		    from_be24(ent->mleqe_completion.mled_completion_cqn);
		mutex_enter(&mleq->mleq_mtx);
		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
		mutex_exit(&mleq->mleq_mtx);

		if (mlcq == NULL)
			goto update_eq;

		mlwq = mlcq->mlcq_wq;

		/*
		 * mlcq_arm_mtx is used to avoid race conditions between
		 * this interrupt routine and the transition from polling
		 * back to interrupt mode. When exiting poll mode the
		 * CQ is likely to be un-armed, which means there will
		 * be no events for the CQ coming through here, and
		 * consequently very low contention on mlcq_arm_mtx.
		 *
		 * mlcq_arm_mtx must be released before calls into the mac
		 * layer in order to avoid deadlocks.
		 */
		mutex_enter(&mlcq->mlcq_arm_mtx);
		mlcq->mlcq_ec++;
		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
			/*
			 * If we failed to take the mutex because the
			 * polling function has it, just move on.
			 * We don't want to block other CQs behind
			 * this one.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
				mutex_exit(&mlcq->mlcq_arm_mtx);
				goto update_eq;
			}

			/* Otherwise we will wait. */
			mutex_enter(&mlcq->mlcq_mtx);
		}

		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
			/*
			 * The ring is not in polling mode and we processed
			 * some completion queue entries.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
				atomic_and_uint(&mlcq->mlcq_state,
				    ~MLXCX_CQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
				atomic_and_uint(&mlwq->mlwq_state,
				    ~MLXCX_WQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			mlxcx_arm_cq(mlxp, mlcq);

			mutex_exit(&mlcq->mlcq_mtx);
			mutex_exit(&mlcq->mlcq_arm_mtx);

			if (tellmac) {
				mac_tx_ring_update(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl);
				tellmac = B_FALSE;
			}

			if (mp != NULL) {
				mac_rx_ring(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
			}
		} else {
			mutex_exit(&mlcq->mlcq_mtx);
			mutex_exit(&mlcq->mlcq_arm_mtx);
		}

update_eq:
		/*
		 * Updating the consumer counter for an EQ requires a write
		 * to the UAR, which is possibly expensive.
		 *
		 * Try to do it only often enough to stop us wrapping around.
		 */
		if ((mleq->mleq_cc & 0x7) == 0)
			mlxcx_update_eq(mlxp, mleq);
	}

	mlxcx_arm_eq(mlxp, mleq);

done:
	mlxcx_intr_fini(mleq);
	return (DDI_INTR_CLAIMED);
}

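/*
 * Allocate and set up our MSI-X interrupt vectors: vector 0 handles
 * asynchronous events, the rest handle completion queues. The async
 * vector is assigned a higher priority than the ring vectors.
 */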
boolean_t
mlxcx_intr_setup(mlxcx_t *mlxp)
{
	dev_info_t *dip = mlxp->mlx_dip;
	int ret;
	int nintrs = 0;
	int navail = 0;
	int types, i;
	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;

	ret = ddi_intr_get_supported_types(dip, &types);
	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "Failed to get supported interrupt types");
		return (B_FALSE);
	}

	if (!(types & DDI_INTR_TYPE_MSIX)) {
		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
		    "requires MSI-X");
		return (B_FALSE);
	}

	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "Failed to get number of interrupts");
		return (B_FALSE);
	}
	if (nintrs < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts supported, but mlxcx "
		    "requires 2", nintrs);
		return (B_FALSE);
	}

	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp,
		    "Failed to get number of available interrupts");
		return (B_FALSE);
	}
	if (navail < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", navail);
		return (B_FALSE);
	}

	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);
	/*
	 * Interrupts for Completion Queue events start from vector 1,
	 * up to the last available vector. Vector 0 is used for
	 * asynchronous events.
	 */
	mlxp->mlx_intr_cq0 = 1;

	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
		    "requires %d", mlxp->mlx_intr_count,
		    mlxp->mlx_intr_cq0 + 1);
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;

	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "Failed to get interrupt priority");
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	/*
	 * Set the interrupt priority for the asynchronous handler higher
	 * than the ring handlers. Some operations which issue commands,
	 * and thus rely on the async interrupt handler for posting
	 * completion, do so with a CQ mutex held. The CQ mutex is also
	 * acquired during ring processing, so if the ring processing vector
	 * happens to be assigned to the same CPU as the async vector
	 * it can hold off the async interrupt thread and lead to a deadlock.
	 * By assigning a higher priority to the async vector, it will
	 * always be dispatched.
	 */
	mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
	if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
		mlxp->mlx_async_intr_pri++;
	} else {
		mlxp->mlx_intr_pri--;
	}

	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
	    sizeof (mlxcx_event_queue_t);
	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);

	/*
	 * In the failure path, mlxcx_intr_teardown() expects this
	 * mutex and avl tree to be init'ed - so do it now.
	 */
	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri :
		    mlxp->mlx_intr_pri;

		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(pri));
		cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);

		if (i < mlxp->mlx_intr_cq0)
			continue;

		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
		    sizeof (mlxcx_completion_queue_t),
		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
	}

	while (mlxp->mlx_async_intr_pri > DDI_INTR_PRI_MIN) {
		ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
		    mlxp->mlx_async_intr_pri);
		if (ret == DDI_SUCCESS)
			break;
		mlxcx_note(mlxp,
		    "!Failed to set interrupt priority to %u for "
		    "async interrupt vector", mlxp->mlx_async_intr_pri);
		/*
		 * If it was not possible to set the IPL for the async
		 * interrupt to the desired value, then try a lower priority.
		 * Some PSMs can only accommodate a limited number of vectors
		 * at each priority level (or group of priority levels). Since
		 * the async priority must be set higher than the ring
		 * handlers, lower both. The ring handler priority is set
		 * below.
		 */
		mlxp->mlx_async_intr_pri--;
		mlxp->mlx_intr_pri--;
	}

	if (mlxp->mlx_async_intr_pri == DDI_INTR_PRI_MIN) {
		mlxcx_warn(mlxp, "Failed to find an interrupt priority for "
		    "async interrupt vector");
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "Failed to add async interrupt handler");
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	/*
	 * If we have enough interrupts, set their "type" fields so that we
	 * avoid mixing RX and TX queues on the same EQs.
	 */
	if (mlxp->mlx_intr_count >= 8) {
		eqt = MLXCX_EQ_TYPE_RX;
	}

	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
		mlxp->mlx_eqs[i].mleq_intr_index = i;

		mlxp->mlx_eqs[i].mleq_type = eqt;
		/*
		 * If eqt is still ANY, just leave it set to that
		 * (no else here).
		 */
		if (eqt == MLXCX_EQ_TYPE_RX) {
			eqt = MLXCX_EQ_TYPE_TX;
		} else if (eqt == MLXCX_EQ_TYPE_TX) {
			eqt = MLXCX_EQ_TYPE_RX;
		}

		while (mlxp->mlx_intr_pri >= DDI_INTR_PRI_MIN) {
			ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
			    mlxp->mlx_intr_pri);
			if (ret == DDI_SUCCESS)
				break;
			mlxcx_note(mlxp, "!Failed to set interrupt priority to "
			    "%u for interrupt vector %d",
			    mlxp->mlx_intr_pri, i);
			mlxp->mlx_intr_pri--;
		}
		if (mlxp->mlx_intr_pri < DDI_INTR_PRI_MIN) {
			mlxcx_warn(mlxp,
			    "Failed to find an interrupt priority for "
			    "interrupt vector %d", i);
			mlxcx_intr_teardown(mlxp);
			return (B_FALSE);
		}

		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
			    i);
			mlxcx_intr_teardown(mlxp);
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}