1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright (c) 2020, the University of Queensland 14 */ 15 16 /* 17 * Mellanox Connect-X 4/5/6 driver. 18 */ 19 20 #include <sys/modctl.h> 21 #include <sys/conf.h> 22 #include <sys/devops.h> 23 #include <sys/sysmacros.h> 24 25 #include <sys/mac_provider.h> 26 27 #include <mlxcx.h> 28 29 void 30 mlxcx_intr_teardown(mlxcx_t *mlxp) 31 { 32 int i; 33 int ret; 34 35 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 36 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i]; 37 mutex_enter(&mleq->mleq_mtx); 38 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 39 if (mleq->mleq_state & MLXCX_EQ_CREATED) 40 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 41 if (i != 0) { 42 VERIFY(avl_is_empty(&mleq->mleq_cqs)); 43 avl_destroy(&mleq->mleq_cqs); 44 } 45 mutex_exit(&mleq->mleq_mtx); 46 (void) ddi_intr_disable(mlxp->mlx_intr_handles[i]); 47 (void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]); 48 ret = ddi_intr_free(mlxp->mlx_intr_handles[i]); 49 if (ret != DDI_SUCCESS) { 50 mlxcx_warn(mlxp, "failed to free interrupt %d: %d", 51 i, ret); 52 } 53 mutex_destroy(&mleq->mleq_mtx); 54 } 55 kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size); 56 kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size); 57 mlxp->mlx_intr_handles = NULL; 58 mlxp->mlx_eqs = NULL; 59 } 60 61 /* 62 * Get the next SW-owned entry on the event queue, or NULL if we reach the end. 63 */ 64 static mlxcx_eventq_ent_t * 65 mlxcx_eq_next(mlxcx_event_queue_t *mleq) 66 { 67 mlxcx_eventq_ent_t *ent; 68 ddi_fm_error_t err; 69 uint_t ci; 70 const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1); 71 72 ASSERT(mutex_owned(&mleq->mleq_mtx)); 73 ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED); 74 ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED); 75 76 /* mleq_nents is always a power of 2 */ 77 ci = mleq->mleq_cc & (mleq->mleq_nents - 1); 78 79 ent = &mleq->mleq_ent[ci]; 80 VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle, 81 (uintptr_t)ent - (uintptr_t)mleq->mleq_ent, 82 sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU)); 83 ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err, 84 DDI_FME_VERSION); 85 if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) { 86 /* The PRM says we have to membar here, so we're doing it */ 87 membar_consumer(); 88 ++mleq->mleq_cc; 89 return (ent); 90 } 91 /* 92 * In the case of a DMA error, we should re-arm this EQ and then come 93 * back and try again when the device wakes us back up. 94 * 95 * Hopefully the fault will be gone by then. 96 */ 97 ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION); 98 99 return (NULL); 100 } 101 102 void 103 mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 104 { 105 uint_t try = 0; 106 ddi_fm_error_t err; 107 bits32_t v = new_bits32(); 108 109 ASSERT(mutex_owned(&mleq->mleq_mtx)); 110 ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED); 111 ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED); 112 ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED); 113 ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING); 114 115 mleq->mleq_state |= MLXCX_EQ_ARMED; 116 mleq->mleq_cc_armed = mleq->mleq_cc; 117 118 set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num); 119 set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc); 120 121 retry: 122 mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM, 123 from_bits32(v)); 124 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err, 125 DDI_FME_VERSION); 126 if (err.fme_status == DDI_FM_OK) 127 return; 128 if (try++ < mlxcx_doorbell_tries) { 129 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION); 130 goto retry; 131 } 132 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 133 } 134 135 static void 136 mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 137 { 138 bits32_t v = new_bits32(); 139 ddi_fm_error_t err; 140 141 ASSERT(mutex_owned(&mleq->mleq_mtx)); 142 ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED); 143 ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED); 144 ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED); 145 146 set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num); 147 set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc); 148 149 mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM, 150 from_bits32(v)); 151 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err, 152 DDI_FME_VERSION); 153 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION); 154 /* 155 * Ignore the error, if it's still happening when we try to re-arm the 156 * EQ, we will note the impact then. 157 */ 158 } 159 160 static mlxcx_completionq_ent_t * 161 mlxcx_cq_next(mlxcx_completion_queue_t *mlcq) 162 { 163 mlxcx_completionq_ent_t *ent; 164 ddi_fm_error_t err; 165 uint_t ci; 166 const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1); 167 168 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 169 ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED); 170 ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED); 171 172 /* mlcq_nents is always a power of 2 */ 173 ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1); 174 175 ent = &mlcq->mlcq_ent[ci]; 176 VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle, 177 (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent, 178 sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU)); 179 ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err, 180 DDI_FME_VERSION); 181 if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) { 182 /* The PRM says we have to membar here, so we're doing it */ 183 membar_consumer(); 184 ++mlcq->mlcq_cc; 185 return (ent); 186 } 187 ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION); 188 189 return (NULL); 190 } 191 192 void 193 mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) 194 { 195 bits32_t dbval = new_bits32(); 196 uint64_t udbval; 197 ddi_fm_error_t err; 198 uint_t try = 0; 199 200 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 201 ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED); 202 ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED); 203 204 if (mlcq->mlcq_state & MLXCX_CQ_ARMED) 205 ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed); 206 207 if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) 208 return; 209 210 mlcq->mlcq_state |= MLXCX_CQ_ARMED; 211 mlcq->mlcq_cc_armed = mlcq->mlcq_cc; 212 mlcq->mlcq_ec_armed = mlcq->mlcq_ec; 213 214 set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec); 215 set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc); 216 217 udbval = (uint64_t)from_bits32(dbval) << 32; 218 udbval |= mlcq->mlcq_num & 0xffffff; 219 220 mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc); 221 mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval; 222 223 retry: 224 MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV); 225 ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err, 226 DDI_FME_VERSION); 227 if (err.fme_status != DDI_FM_OK) { 228 if (try++ < mlxcx_doorbell_tries) { 229 ddi_fm_dma_err_clear( 230 mlcq->mlcq_doorbell_dma.mxdb_dma_handle, 231 DDI_FME_VERSION); 232 goto retry; 233 } else { 234 goto err; 235 } 236 } 237 238 mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval); 239 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err, 240 DDI_FME_VERSION); 241 if (err.fme_status == DDI_FM_OK) 242 return; 243 if (try++ < mlxcx_doorbell_tries) { 244 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION); 245 goto retry; 246 } 247 248 err: 249 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 250 } 251 252 const char * 253 mlxcx_event_name(mlxcx_event_t evt) 254 { 255 switch (evt) { 256 case MLXCX_EVENT_COMPLETION: 257 return ("COMPLETION"); 258 case MLXCX_EVENT_PATH_MIGRATED: 259 return ("PATH_MIGRATED"); 260 case MLXCX_EVENT_COMM_ESTABLISH: 261 return ("COMM_ESTABLISH"); 262 case MLXCX_EVENT_SENDQ_DRAIN: 263 return ("SENDQ_DRAIN"); 264 case MLXCX_EVENT_LAST_WQE: 265 return ("LAST_WQE"); 266 case MLXCX_EVENT_SRQ_LIMIT: 267 return ("SRQ_LIMIT"); 268 case MLXCX_EVENT_DCT_ALL_CLOSED: 269 return ("DCT_ALL_CLOSED"); 270 case MLXCX_EVENT_DCT_ACCKEY_VIOL: 271 return ("DCT_ACCKEY_VIOL"); 272 case MLXCX_EVENT_CQ_ERROR: 273 return ("CQ_ERROR"); 274 case MLXCX_EVENT_WQ_CATASTROPHE: 275 return ("WQ_CATASTROPHE"); 276 case MLXCX_EVENT_PATH_MIGRATE_FAIL: 277 return ("PATH_MIGRATE_FAIL"); 278 case MLXCX_EVENT_PAGE_FAULT: 279 return ("PAGE_FAULT"); 280 case MLXCX_EVENT_WQ_INVALID_REQ: 281 return ("WQ_INVALID_REQ"); 282 case MLXCX_EVENT_WQ_ACCESS_VIOL: 283 return ("WQ_ACCESS_VIOL"); 284 case MLXCX_EVENT_SRQ_CATASTROPHE: 285 return ("SRQ_CATASTROPHE"); 286 case MLXCX_EVENT_INTERNAL_ERROR: 287 return ("INTERNAL_ERROR"); 288 case MLXCX_EVENT_PORT_STATE: 289 return ("PORT_STATE"); 290 case MLXCX_EVENT_GPIO: 291 return ("GPIO"); 292 case MLXCX_EVENT_PORT_MODULE: 293 return ("PORT_MODULE"); 294 case MLXCX_EVENT_TEMP_WARNING: 295 return ("TEMP_WARNING"); 296 case MLXCX_EVENT_REMOTE_CONFIG: 297 return ("REMOTE_CONFIG"); 298 case MLXCX_EVENT_DCBX_CHANGE: 299 return ("DCBX_CHANGE"); 300 case MLXCX_EVENT_DOORBELL_CONGEST: 301 return ("DOORBELL_CONGEST"); 302 case MLXCX_EVENT_STALL_VL: 303 return ("STALL_VL"); 304 case MLXCX_EVENT_CMD_COMPLETION: 305 return ("CMD_COMPLETION"); 306 case MLXCX_EVENT_PAGE_REQUEST: 307 return ("PAGE_REQUEST"); 308 case MLXCX_EVENT_NIC_VPORT: 309 return ("NIC_VPORT"); 310 case MLXCX_EVENT_EC_PARAMS_CHANGE: 311 return ("EC_PARAMS_CHANGE"); 312 case MLXCX_EVENT_XRQ_ERROR: 313 return ("XRQ_ERROR"); 314 } 315 return ("UNKNOWN"); 316 } 317 318 /* Should be called only when link state has changed. */ 319 void 320 mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port) 321 { 322 link_state_t ls; 323 324 mutex_enter(&port->mlp_mtx); 325 (void) mlxcx_cmd_query_port_status(mlxp, port); 326 (void) mlxcx_cmd_query_port_speed(mlxp, port); 327 328 switch (port->mlp_oper_status) { 329 case MLXCX_PORT_STATUS_UP: 330 case MLXCX_PORT_STATUS_UP_ONCE: 331 ls = LINK_STATE_UP; 332 break; 333 case MLXCX_PORT_STATUS_DOWN: 334 ls = LINK_STATE_DOWN; 335 break; 336 default: 337 ls = LINK_STATE_UNKNOWN; 338 } 339 mac_link_update(mlxp->mlx_mac_hdl, ls); 340 341 mutex_exit(&port->mlp_mtx); 342 } 343 344 static void 345 mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages) 346 { 347 ddi_device_acc_attr_t acc; 348 ddi_dma_attr_t attr; 349 mlxcx_dev_page_t *mdp; 350 int32_t togive; 351 mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES]; 352 uint_t i; 353 const ddi_dma_cookie_t *ck; 354 355 togive = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 356 357 for (i = 0; i < togive; i++) { 358 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP); 359 mlxcx_dma_acc_attr(mlxp, &acc); 360 mlxcx_dma_page_attr(mlxp, &attr); 361 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc, 362 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) { 363 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i, 364 togive); 365 goto cleanup_npages; 366 } 367 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma); 368 mdp->mxdp_pa = ck->dmac_laddress; 369 pages[i] = mdp; 370 } 371 372 mutex_enter(&mlxp->mlx_pagemtx); 373 374 if (!mlxcx_cmd_give_pages(mlxp, 375 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) { 376 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 377 "pages!", togive); 378 goto cleanup_npages; 379 } 380 381 for (i = 0; i < togive; i++) { 382 avl_add(&mlxp->mlx_pages, pages[i]); 383 } 384 mlxp->mlx_npages += togive; 385 mutex_exit(&mlxp->mlx_pagemtx); 386 387 return; 388 389 cleanup_npages: 390 for (i = 0; i < togive; i++) { 391 mdp = pages[i]; 392 mlxcx_dma_free(&mdp->mxdp_dma); 393 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 394 } 395 /* Tell the hardware we had an allocation failure. */ 396 (void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL, 397 0, NULL); 398 mutex_exit(&mlxp->mlx_pagemtx); 399 } 400 401 static void 402 mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages) 403 { 404 uint_t i; 405 int32_t req, ret; 406 uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES]; 407 mlxcx_dev_page_t *mdp, probe; 408 409 mutex_enter(&mlxp->mlx_pagemtx); 410 411 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 412 req = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 413 414 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 415 return; 416 } 417 418 for (i = 0; i < ret; i++) { 419 bzero(&probe, sizeof (probe)); 420 probe.mxdp_pa = pas[i]; 421 422 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 423 424 if (mdp != NULL) { 425 avl_remove(&mlxp->mlx_pages, mdp); 426 mlxp->mlx_npages--; 427 mlxcx_dma_free(&mdp->mxdp_dma); 428 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 429 } else { 430 mlxcx_warn(mlxp, "hardware returned a page " 431 "with PA 0x%" PRIx64 " but we have no " 432 "record of giving out such a page", pas[i]); 433 } 434 } 435 436 mutex_exit(&mlxp->mlx_pagemtx); 437 } 438 439 static const char * 440 mlxcx_module_error_string(mlxcx_module_error_type_t err) 441 { 442 switch (err) { 443 case MLXCX_MODULE_ERR_POWER_BUDGET: 444 return ("POWER_BUDGET"); 445 case MLXCX_MODULE_ERR_LONG_RANGE: 446 return ("LONG_RANGE"); 447 case MLXCX_MODULE_ERR_BUS_STUCK: 448 return ("BUS_STUCK"); 449 case MLXCX_MODULE_ERR_NO_EEPROM: 450 return ("NO_EEPROM"); 451 case MLXCX_MODULE_ERR_ENFORCEMENT: 452 return ("ENFORCEMENT"); 453 case MLXCX_MODULE_ERR_UNKNOWN_IDENT: 454 return ("UNKNOWN_IDENT"); 455 case MLXCX_MODULE_ERR_HIGH_TEMP: 456 return ("HIGH_TEMP"); 457 case MLXCX_MODULE_ERR_CABLE_SHORTED: 458 return ("CABLE_SHORTED"); 459 default: 460 return ("UNKNOWN"); 461 } 462 } 463 464 static void 465 mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd) 466 { 467 uint64_t ena; 468 char buf[FM_MAX_CLASS]; 469 const char *lename; 470 const char *ename; 471 const char *stname; 472 uint_t eno = 0; 473 mlxcx_module_status_t state = evd->mled_port_mod_module_status; 474 475 switch (state) { 476 case MLXCX_MODULE_ERROR: 477 stname = "error"; 478 eno = evd->mled_port_mod_error_type; 479 lename = mlxcx_module_error_string(eno); 480 switch (eno) { 481 case MLXCX_MODULE_ERR_ENFORCEMENT: 482 ename = DDI_FM_TXR_ERROR_WHITELIST; 483 break; 484 case MLXCX_MODULE_ERR_UNKNOWN_IDENT: 485 case MLXCX_MODULE_ERR_NO_EEPROM: 486 ename = DDI_FM_TXR_ERROR_NOTSUPP; 487 break; 488 case MLXCX_MODULE_ERR_HIGH_TEMP: 489 ename = DDI_FM_TXR_ERROR_OVERTEMP; 490 break; 491 case MLXCX_MODULE_ERR_POWER_BUDGET: 492 case MLXCX_MODULE_ERR_LONG_RANGE: 493 case MLXCX_MODULE_ERR_CABLE_SHORTED: 494 ename = DDI_FM_TXR_ERROR_HWFAIL; 495 break; 496 case MLXCX_MODULE_ERR_BUS_STUCK: 497 default: 498 ename = DDI_FM_TXR_ERROR_UNKNOWN; 499 } 500 break; 501 default: 502 return; 503 } 504 505 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 506 DDI_FM_NIC, DDI_FM_TXR_ERROR); 507 ena = fm_ena_generate(0, FM_ENA_FMT1); 508 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 509 return; 510 511 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 512 /* compulsory FM props */ 513 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 514 /* generic NIC txr error event props */ 515 "error", DATA_TYPE_STRING, ename, 516 "port_index", DATA_TYPE_UINT8, 0, 517 "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module, 518 /* local props */ 519 "mlxcx_state", DATA_TYPE_STRING, stname, 520 "mlxcx_error", DATA_TYPE_STRING, lename, 521 "mlxcx_error_num", DATA_TYPE_UINT8, eno, 522 NULL); 523 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 524 } 525 526 static uint_t 527 mlxcx_intr_0(caddr_t arg, caddr_t arg2) 528 { 529 mlxcx_t *mlxp = (mlxcx_t *)arg; 530 mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2; 531 mlxcx_eventq_ent_t *ent; 532 mlxcx_port_t *port; 533 uint_t portn; 534 int32_t npages = 0; 535 536 mutex_enter(&mleq->mleq_mtx); 537 538 if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) || 539 !(mleq->mleq_state & MLXCX_EQ_CREATED) || 540 (mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 541 mlxcx_warn(mlxp, "int0 on bad eq state"); 542 mutex_exit(&mleq->mleq_mtx); 543 return (DDI_INTR_UNCLAIMED); 544 } 545 546 ent = mlxcx_eq_next(mleq); 547 if (ent == NULL) { 548 mlxcx_warn(mlxp, "spurious int 0?"); 549 mutex_exit(&mleq->mleq_mtx); 550 return (DDI_INTR_UNCLAIMED); 551 } 552 553 ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED); 554 mleq->mleq_state &= ~MLXCX_EQ_ARMED; 555 556 for (; ent != NULL; ent = mlxcx_eq_next(mleq)) { 557 switch (ent->mleqe_event_type) { 558 case MLXCX_EVENT_PAGE_REQUEST: 559 VERIFY3U(from_be16(ent->mleqe_page_request. 560 mled_page_request_function_id), ==, 0); 561 npages += (int32_t)from_be32(ent->mleqe_page_request. 562 mled_page_request_num_pages); 563 break; 564 case MLXCX_EVENT_PORT_STATE: 565 portn = get_bits8( 566 ent->mleqe_port_state.mled_port_state_port_num, 567 MLXCX_EVENT_PORT_NUM) - 1; 568 if (portn >= mlxp->mlx_nports) 569 break; 570 port = &mlxp->mlx_ports[portn]; 571 mlxcx_update_link_state(mlxp, port); 572 break; 573 case MLXCX_EVENT_PORT_MODULE: 574 mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod); 575 break; 576 default: 577 mlxcx_warn(mlxp, "unhandled event 0x%x on int0", 578 ent->mleqe_event_type); 579 } 580 } 581 582 if (npages > 0) { 583 mlxcx_give_pages_once(mlxp, npages); 584 } else if (npages < 0) { 585 mlxcx_take_pages_once(mlxp, -1 * npages); 586 } 587 588 mlxcx_arm_eq(mlxp, mleq); 589 mutex_exit(&mleq->mleq_mtx); 590 591 return (DDI_INTR_CLAIMED); 592 } 593 594 mblk_t * 595 mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) 596 { 597 mlxcx_buffer_t *buf; 598 mblk_t *mp, *cmp, *nmp; 599 mlxcx_completionq_ent_t *cent; 600 size_t bytes = 0; 601 boolean_t found; 602 603 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 604 605 ASSERT(mlcq->mlcq_wq != NULL); 606 ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ); 607 608 if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || 609 !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || 610 (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || 611 (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) { 612 return (NULL); 613 } 614 615 ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING); 616 617 nmp = cmp = mp = NULL; 618 619 cent = mlxcx_cq_next(mlcq); 620 for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) { 621 /* 622 * Teardown and ring stop can atomic_or this flag 623 * into our state if they want us to stop early. 624 */ 625 if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) 626 break; 627 628 if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && 629 cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { 630 /* NOP */ 631 goto nextcq; 632 } 633 634 buf = list_head(&mlcq->mlcq_buffers); 635 found = B_FALSE; 636 while (buf != NULL) { 637 if ((buf->mlb_wqe_index & UINT16_MAX) == 638 from_be16(cent->mlcqe_wqe_counter)) { 639 found = B_TRUE; 640 break; 641 } 642 buf = list_next(&mlcq->mlcq_buffers, buf); 643 } 644 if (!found) { 645 buf = list_head(&mlcq->mlcq_buffers); 646 mlxcx_warn(mlxp, "got completion on CQ %x but " 647 "no buffer matching wqe found: %x (first " 648 "buffer counter = %x)", mlcq->mlcq_num, 649 from_be16(cent->mlcqe_wqe_counter), 650 buf == NULL ? UINT32_MAX : buf->mlb_wqe_index); 651 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE); 652 goto nextcq; 653 } 654 list_remove(&mlcq->mlcq_buffers, buf); 655 atomic_dec_64(&mlcq->mlcq_bufcnt); 656 657 nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); 658 if (nmp != NULL) { 659 bytes += from_be32(cent->mlcqe_byte_cnt); 660 if (cmp != NULL) { 661 cmp->b_next = nmp; 662 cmp = nmp; 663 } else { 664 mp = cmp = nmp; 665 } 666 } 667 nextcq: 668 mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc); 669 670 if (bytelim != 0 && bytes > bytelim) 671 break; 672 } 673 674 return (mp); 675 } 676 677 static uint_t 678 mlxcx_intr_n(caddr_t arg, caddr_t arg2) 679 { 680 mlxcx_t *mlxp = (mlxcx_t *)arg; 681 mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2; 682 mlxcx_eventq_ent_t *ent; 683 mlxcx_completionq_ent_t *cent; 684 mlxcx_completion_queue_t *mlcq, probe; 685 mlxcx_buffer_t *buf; 686 mblk_t *mp, *cmp, *nmp; 687 boolean_t found, tellmac = B_FALSE, added; 688 689 mutex_enter(&mleq->mleq_mtx); 690 691 if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) || 692 !(mleq->mleq_state & MLXCX_EQ_CREATED) || 693 (mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 694 mutex_exit(&mleq->mleq_mtx); 695 return (DDI_INTR_CLAIMED); 696 } 697 698 ent = mlxcx_eq_next(mleq); 699 if (ent == NULL) { 700 if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) { 701 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT); 702 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 703 (void) ddi_intr_disable(mlxp->mlx_intr_handles[ 704 mleq->mleq_intr_index]); 705 } 706 mutex_exit(&mleq->mleq_mtx); 707 return (DDI_INTR_CLAIMED); 708 } 709 mleq->mleq_badintrs = 0; 710 711 ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED); 712 mleq->mleq_state &= ~MLXCX_EQ_ARMED; 713 714 for (; ent != NULL; ent = mlxcx_eq_next(mleq)) { 715 if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) { 716 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE); 717 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 718 (void) ddi_intr_disable(mlxp->mlx_intr_handles[ 719 mleq->mleq_intr_index]); 720 mutex_exit(&mleq->mleq_mtx); 721 return (DDI_INTR_CLAIMED); 722 } 723 ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION); 724 725 probe.mlcq_num = 726 from_be24(ent->mleqe_completion.mled_completion_cqn); 727 mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL); 728 729 if (mlcq == NULL) 730 continue; 731 732 /* 733 * The polling function might have the mutex and stop us from 734 * getting the lock here, so we increment the event counter 735 * atomically from outside. 736 * 737 * This way at the end of polling when we go back to interrupts 738 * from this CQ, the event counter is still correct. 739 * 740 * Note that mlxcx_mac_ring_intr_enable() takes the EQ lock so 741 * as to avoid any possibility of racing against us here, so we 742 * only have to consider mlxcx_rx_poll(). 743 */ 744 atomic_inc_32(&mlcq->mlcq_ec); 745 atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED); 746 747 if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) { 748 /* 749 * If we failed to take the mutex because the polling 750 * function has it, just move on. We don't want to 751 * block other CQs behind this one. 752 */ 753 if (mlcq->mlcq_state & MLXCX_CQ_POLLING) 754 continue; 755 /* Otherwise we will wait. */ 756 mutex_enter(&mlcq->mlcq_mtx); 757 } 758 759 if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || 760 !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || 761 (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || 762 (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) || 763 (mlcq->mlcq_state & MLXCX_CQ_POLLING)) { 764 mutex_exit(&mlcq->mlcq_mtx); 765 continue; 766 } 767 768 nmp = cmp = mp = NULL; 769 tellmac = B_FALSE; 770 771 cent = mlxcx_cq_next(mlcq); 772 for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) { 773 /* 774 * Teardown and ring stop can atomic_or this flag 775 * into our state if they want us to stop early. 776 */ 777 if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) 778 break; 779 if (mlcq->mlcq_state & MLXCX_CQ_POLLING) 780 break; 781 782 if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && 783 cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { 784 /* NOP */ 785 goto nextcq; 786 } 787 788 lookagain: 789 /* 790 * Generally the buffer we're looking for will be 791 * at the front of the list, so this loop won't 792 * need to look far. 793 */ 794 buf = list_head(&mlcq->mlcq_buffers); 795 found = B_FALSE; 796 while (buf != NULL) { 797 if ((buf->mlb_wqe_index & UINT16_MAX) == 798 from_be16(cent->mlcqe_wqe_counter)) { 799 found = B_TRUE; 800 break; 801 } 802 buf = list_next(&mlcq->mlcq_buffers, buf); 803 } 804 if (!found) { 805 /* 806 * If there's any buffers waiting on the 807 * buffers_b list, then merge those into 808 * the main list and have another look. 809 * 810 * The wq enqueue routines push new buffers 811 * into buffers_b so that they can avoid 812 * taking the mlcq_mtx and blocking us for 813 * every single packet. 814 */ 815 added = B_FALSE; 816 mutex_enter(&mlcq->mlcq_bufbmtx); 817 if (!list_is_empty(&mlcq->mlcq_buffers_b)) { 818 list_move_tail(&mlcq->mlcq_buffers, 819 &mlcq->mlcq_buffers_b); 820 added = B_TRUE; 821 } 822 mutex_exit(&mlcq->mlcq_bufbmtx); 823 if (added) 824 goto lookagain; 825 } 826 if (!found) { 827 buf = list_head(&mlcq->mlcq_buffers); 828 mlxcx_warn(mlxp, "got completion on CQ %x but " 829 "no buffer matching wqe found: %x (first " 830 "buffer counter = %x)", mlcq->mlcq_num, 831 from_be16(cent->mlcqe_wqe_counter), 832 buf == NULL ? UINT32_MAX : 833 buf->mlb_wqe_index); 834 mlxcx_fm_ereport(mlxp, 835 DDI_FM_DEVICE_INVAL_STATE); 836 goto nextcq; 837 } 838 list_remove(&mlcq->mlcq_buffers, buf); 839 atomic_dec_64(&mlcq->mlcq_bufcnt); 840 841 switch (mlcq->mlcq_wq->mlwq_type) { 842 case MLXCX_WQ_TYPE_SENDQ: 843 mlxcx_tx_completion(mlxp, mlcq, cent, buf); 844 break; 845 case MLXCX_WQ_TYPE_RECVQ: 846 nmp = mlxcx_rx_completion(mlxp, mlcq, cent, 847 buf); 848 if (nmp != NULL) { 849 if (cmp != NULL) { 850 cmp->b_next = nmp; 851 cmp = nmp; 852 } else { 853 mp = cmp = nmp; 854 } 855 } 856 break; 857 } 858 859 nextcq: 860 /* 861 * Update the "doorbell" consumer counter for the queue 862 * every time. Unlike a UAR write, this is relatively 863 * cheap and doesn't require us to go out on the bus 864 * straight away (since it's our memory). 865 */ 866 mlcq->mlcq_doorbell->mlcqd_update_ci = 867 to_be24(mlcq->mlcq_cc); 868 869 if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) && 870 mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) { 871 mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC; 872 tellmac = B_TRUE; 873 } 874 } 875 876 mlxcx_arm_cq(mlxp, mlcq); 877 mutex_exit(&mlcq->mlcq_mtx); 878 879 if (tellmac) { 880 mac_tx_ring_update(mlxp->mlx_mac_hdl, 881 mlcq->mlcq_mac_hdl); 882 } 883 if (mp != NULL) { 884 mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl, 885 mp, mlcq->mlcq_mac_gen); 886 } 887 888 /* 889 * Updating the consumer counter for an EQ requires a write 890 * to the UAR, which is possibly expensive. 891 * 892 * Try to do it only often enough to stop us wrapping around. 893 */ 894 if ((mleq->mleq_cc & 0x7) == 0) 895 mlxcx_update_eq(mlxp, mleq); 896 } 897 898 mlxcx_arm_eq(mlxp, mleq); 899 mutex_exit(&mleq->mleq_mtx); 900 901 return (DDI_INTR_CLAIMED); 902 } 903 904 boolean_t 905 mlxcx_intr_setup(mlxcx_t *mlxp) 906 { 907 dev_info_t *dip = mlxp->mlx_dip; 908 int ret; 909 int nintrs = 0; 910 int navail = 0; 911 int types, i; 912 mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY; 913 914 ret = ddi_intr_get_supported_types(dip, &types); 915 if (ret != DDI_SUCCESS) { 916 return (B_FALSE); 917 } 918 919 if (!(types & DDI_INTR_TYPE_MSIX)) { 920 mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx " 921 "requires MSI-X"); 922 return (B_FALSE); 923 } 924 925 ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs); 926 if (ret != DDI_SUCCESS) { 927 return (B_FALSE); 928 } 929 if (nintrs < 2) { 930 mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx " 931 "requires 2", nintrs); 932 return (B_FALSE); 933 } 934 935 ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail); 936 if (navail < 2) { 937 mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx " 938 "requires 2", navail); 939 return (B_FALSE); 940 } 941 942 mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t); 943 mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP); 944 945 ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX, 946 0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL); 947 if (ret != DDI_SUCCESS) { 948 mlxcx_intr_teardown(mlxp); 949 return (B_FALSE); 950 } 951 if (mlxp->mlx_intr_count < 2) { 952 mlxcx_intr_teardown(mlxp); 953 return (B_FALSE); 954 } 955 mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX; 956 957 ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri); 958 if (ret != DDI_SUCCESS) { 959 mlxcx_intr_teardown(mlxp); 960 return (B_FALSE); 961 } 962 963 mlxp->mlx_eqs_size = mlxp->mlx_intr_count * 964 sizeof (mlxcx_event_queue_t); 965 mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP); 966 967 ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_0, 968 (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]); 969 if (ret != DDI_SUCCESS) { 970 mlxcx_intr_teardown(mlxp); 971 return (B_FALSE); 972 } 973 974 /* 975 * If we have enough interrupts, set their "type" fields so that we 976 * avoid mixing RX and TX queues on the same EQs. 977 */ 978 if (mlxp->mlx_intr_count >= 8) { 979 eqt = MLXCX_EQ_TYPE_RX; 980 } 981 982 for (i = 1; i < mlxp->mlx_intr_count; ++i) { 983 mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER, 984 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 985 avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare, 986 sizeof (mlxcx_completion_queue_t), 987 offsetof(mlxcx_completion_queue_t, mlcq_eq_entry)); 988 mlxp->mlx_eqs[i].mleq_intr_index = i; 989 990 mlxp->mlx_eqs[i].mleq_type = eqt; 991 /* 992 * If eqt is still ANY, just leave it set to that 993 * (no else here). 994 */ 995 if (eqt == MLXCX_EQ_TYPE_RX) { 996 eqt = MLXCX_EQ_TYPE_TX; 997 } else if (eqt == MLXCX_EQ_TYPE_TX) { 998 eqt = MLXCX_EQ_TYPE_RX; 999 } 1000 1001 ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i], 1002 mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]); 1003 if (ret != DDI_SUCCESS) { 1004 mlxcx_intr_teardown(mlxp); 1005 return (B_FALSE); 1006 } 1007 } 1008 1009 return (B_TRUE); 1010 } 1011