/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2020, the University of Queensland
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);

void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
	int i;
	int ret;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
		if (mleq->mleq_state & MLXCX_EQ_CREATED)
			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
		if (i != 0) {
			VERIFY(avl_is_empty(&mleq->mleq_cqs));
			avl_destroy(&mleq->mleq_cqs);
		}
		mutex_exit(&mleq->mleq_mtx);
		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
			    i, ret);
		}
		mutex_destroy(&mleq->mleq_mtx);
	}
	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
	mlxp->mlx_intr_handles = NULL;
	mlxp->mlx_eqs = NULL;
}

/*
 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
	mlxcx_eventq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	/* mleq_nents is always a power of 2 */
	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);

	ent = &mleq->mleq_ent[ci];
	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
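	/*
	 * The hardware toggles the ownership bit in each entry every time it
	 * wraps the ring. swowner, taken from bit mleq_entshift of the
	 * consumer counter, flips in step with it (mleq_entshift being the
	 * log2 of the ring size), so an owner bit matching swowner marks an
	 * entry the device has written since we last visited this slot.
	 */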
	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mleq->mleq_cc;
		return (ent);
	}
	/*
	 * In the case of a DMA error, we should re-arm this EQ and then come
	 * back and try again when the device wakes us back up.
	 *
	 * Hopefully the fault will be gone by then.
	 */
	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	uint_t try = 0;
	ddi_fm_error_t err;
	bits32_t v = new_bits32();

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

	mleq->mleq_state |= MLXCX_EQ_ARMED;
	mleq->mleq_cc_armed = mleq->mleq_cc;

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

retry:
	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	bits32_t v = new_bits32();
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
	/*
	 * Ignore the error; if it's still happening when we try to re-arm
	 * the EQ, we will note the impact then.
	 */
}

static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
	mlxcx_completionq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	/* mlcq_nents is always a power of 2 */
	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);

	ent = &mlcq->mlcq_ent[ci];
	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
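	/* Same software-ownership test as in mlxcx_eq_next() above. */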
	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mlcq->mlcq_cc;
		return (ent);
	}
	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

void
mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	ddi_fm_error_t err;
	uint_t try = 0;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			return;
		}
	}
}

void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	bits32_t dbval = new_bits32();
	uint64_t udbval;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	if (mlcq->mlcq_state & MLXCX_CQ_ARMED)
		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);

	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
		return;

	mlcq->mlcq_state |= MLXCX_CQ_ARMED;
	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

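	/*
	 * The UAR doorbell that arms a CQ is a single 64-bit write: the arm
	 * word (sequence number and consumer index) goes in the upper 32
	 * bits and the CQ number in the low 24 bits. The same arm word is
	 * also written into the CQ's doorbell record before ringing it.
	 */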
	udbval = (uint64_t)from_bits32(dbval) << 32;
	udbval |= mlcq->mlcq_num & 0xffffff;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

const char *
mlxcx_event_name(mlxcx_event_t evt)
{
	switch (evt) {
	case MLXCX_EVENT_COMPLETION:
		return ("COMPLETION");
	case MLXCX_EVENT_PATH_MIGRATED:
		return ("PATH_MIGRATED");
	case MLXCX_EVENT_COMM_ESTABLISH:
		return ("COMM_ESTABLISH");
	case MLXCX_EVENT_SENDQ_DRAIN:
		return ("SENDQ_DRAIN");
	case MLXCX_EVENT_LAST_WQE:
		return ("LAST_WQE");
	case MLXCX_EVENT_SRQ_LIMIT:
		return ("SRQ_LIMIT");
	case MLXCX_EVENT_DCT_ALL_CLOSED:
		return ("DCT_ALL_CLOSED");
	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
		return ("DCT_ACCKEY_VIOL");
	case MLXCX_EVENT_CQ_ERROR:
		return ("CQ_ERROR");
	case MLXCX_EVENT_WQ_CATASTROPHE:
		return ("WQ_CATASTROPHE");
	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
		return ("PATH_MIGRATE_FAIL");
	case MLXCX_EVENT_PAGE_FAULT:
		return ("PAGE_FAULT");
	case MLXCX_EVENT_WQ_INVALID_REQ:
		return ("WQ_INVALID_REQ");
	case MLXCX_EVENT_WQ_ACCESS_VIOL:
		return ("WQ_ACCESS_VIOL");
	case MLXCX_EVENT_SRQ_CATASTROPHE:
		return ("SRQ_CATASTROPHE");
	case MLXCX_EVENT_INTERNAL_ERROR:
		return ("INTERNAL_ERROR");
	case MLXCX_EVENT_PORT_STATE:
		return ("PORT_STATE");
	case MLXCX_EVENT_GPIO:
		return ("GPIO");
	case MLXCX_EVENT_PORT_MODULE:
		return ("PORT_MODULE");
	case MLXCX_EVENT_TEMP_WARNING:
		return ("TEMP_WARNING");
	case MLXCX_EVENT_REMOTE_CONFIG:
		return ("REMOTE_CONFIG");
	case MLXCX_EVENT_DCBX_CHANGE:
		return ("DCBX_CHANGE");
	case MLXCX_EVENT_DOORBELL_CONGEST:
		return ("DOORBELL_CONGEST");
	case MLXCX_EVENT_STALL_VL:
		return ("STALL_VL");
	case MLXCX_EVENT_CMD_COMPLETION:
		return ("CMD_COMPLETION");
	case MLXCX_EVENT_PAGE_REQUEST:
		return ("PAGE_REQUEST");
	case MLXCX_EVENT_NIC_VPORT:
		return ("NIC_VPORT");
	case MLXCX_EVENT_EC_PARAMS_CHANGE:
		return ("EC_PARAMS_CHANGE");
	case MLXCX_EVENT_XRQ_ERROR:
		return ("XRQ_ERROR");
	}
	return ("UNKNOWN");
}

/* Should be called only when the link state has changed. */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
	link_state_t ls;

	mutex_enter(&port->mlp_mtx);
	(void) mlxcx_cmd_query_port_status(mlxp, port);
	(void) mlxcx_cmd_query_port_speed(mlxp, port);
	(void) mlxcx_cmd_query_port_fec(mlxp, port);

	switch (port->mlp_oper_status) {
	case MLXCX_PORT_STATUS_UP:
	case MLXCX_PORT_STATUS_UP_ONCE:
		ls = LINK_STATE_UP;
		break;
	case MLXCX_PORT_STATUS_DOWN:
		ls = LINK_STATE_DOWN;
		break;
	default:
		ls = LINK_STATE_UNKNOWN;
	}
	mac_link_update(mlxp->mlx_mac_hdl, ls);

	mutex_exit(&port->mlp_mtx);
}

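/*
 * Give the hardware a batch of device pages in response to a page request
 * event. We hand over at most MLXCX_MANAGE_PAGES_MAX_PAGES pages per call,
 * tracking each one by its physical address in the mlx_pages AVL tree so it
 * can be matched up again when the hardware returns it.
 */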
static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	mlxcx_dev_page_t *mdp;
	int32_t togive;
	mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
	uint_t i;
	const ddi_dma_cookie_t *ck;

	togive = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	for (i = 0; i < togive; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    togive);
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;
		pages[i] = mdp;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
		    "pages!", togive);
		goto cleanup_npages;
	}

	for (i = 0; i < togive; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += togive;
	mutex_exit(&mlxp->mlx_pagemtx);

	return;

cleanup_npages:
	for (i = 0; i < togive; i++) {
		mdp = pages[i];
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	/* Tell the hardware we had an allocation failure. */
	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
	    0, NULL);
	mutex_exit(&mlxp->mlx_pagemtx);
}

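/*
 * Reclaim pages from the hardware, e.g. when it asks for a negative number
 * of pages. Each physical address the device returns is looked up in the
 * mlx_pages tree and the matching page is freed; addresses we have no
 * record of handing out are only warned about.
 */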
static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
	uint_t i;
	int32_t req, ret;
	uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];
	mlxcx_dev_page_t *mdp, probe;

	mutex_enter(&mlxp->mlx_pagemtx);

	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
	req = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
		return;
	}

	for (i = 0; i < ret; i++) {
		bzero(&probe, sizeof (probe));
		probe.mxdp_pa = pas[i];

		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

		if (mdp != NULL) {
			avl_remove(&mlxp->mlx_pages, mdp);
			mlxp->mlx_npages--;
			mlxcx_dma_free(&mdp->mxdp_dma);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
		} else {
			mlxcx_warn(mlxp, "hardware returned a page "
			    "with PA 0x%" PRIx64 " but we have no "
			    "record of giving out such a page", pas[i]);
		}
	}

	mutex_exit(&mlxp->mlx_pagemtx);
}

static const char *
mlxcx_module_error_string(mlxcx_module_error_type_t err)
{
	switch (err) {
	case MLXCX_MODULE_ERR_POWER_BUDGET:
		return ("POWER_BUDGET");
	case MLXCX_MODULE_ERR_LONG_RANGE:
		return ("LONG_RANGE");
	case MLXCX_MODULE_ERR_BUS_STUCK:
		return ("BUS_STUCK");
	case MLXCX_MODULE_ERR_NO_EEPROM:
		return ("NO_EEPROM");
	case MLXCX_MODULE_ERR_ENFORCEMENT:
		return ("ENFORCEMENT");
	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		return ("UNKNOWN_IDENT");
	case MLXCX_MODULE_ERR_HIGH_TEMP:
		return ("HIGH_TEMP");
	case MLXCX_MODULE_ERR_CABLE_SHORTED:
		return ("CABLE_SHORTED");
	default:
		return ("UNKNOWN");
	}
}

static void
mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *lename;
	const char *ename;
	const char *stname;
	uint_t eno = 0;
	mlxcx_module_status_t state = evd->mled_port_mod_module_status;

	switch (state) {
	case MLXCX_MODULE_ERROR:
		stname = "error";
		eno = evd->mled_port_mod_error_type;
		lename = mlxcx_module_error_string(eno);
		switch (eno) {
		case MLXCX_MODULE_ERR_ENFORCEMENT:
			ename = DDI_FM_TXR_ERROR_WHITELIST;
			break;
		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		case MLXCX_MODULE_ERR_NO_EEPROM:
			ename = DDI_FM_TXR_ERROR_NOTSUPP;
			break;
		case MLXCX_MODULE_ERR_HIGH_TEMP:
			ename = DDI_FM_TXR_ERROR_OVERTEMP;
			break;
		case MLXCX_MODULE_ERR_POWER_BUDGET:
		case MLXCX_MODULE_ERR_LONG_RANGE:
		case MLXCX_MODULE_ERR_CABLE_SHORTED:
			ename = DDI_FM_TXR_ERROR_HWFAIL;
			break;
		case MLXCX_MODULE_ERR_BUS_STUCK:
		default:
			ename = DDI_FM_TXR_ERROR_UNKNOWN;
		}
		break;
	default:
		return;
	}

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    /* compulsory FM props */
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    /* generic NIC txr error event props */
	    "error", DATA_TYPE_STRING, ename,
	    "port_index", DATA_TYPE_UINT8, 0,
	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
	    /* local props */
	    "mlxcx_state", DATA_TYPE_STRING, stname,
	    "mlxcx_error", DATA_TYPE_STRING, lename,
	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

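/*
 * Interrupt handler for EQ 0. This queue carries the driver's async events
 * (page requests, port state changes and module errors) rather than
 * completions, which are spread over the remaining vectors and handled by
 * mlxcx_intr_n().
 */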
static uint_t
mlxcx_intr_0(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_port_t *port;
	uint_t portn;
	int32_t npages = 0;

	mutex_enter(&mleq->mleq_mtx);

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mlxcx_warn(mlxp, "int %d on bad eq state",
		    mleq->mleq_intr_index);
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_UNCLAIMED);
	}

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_UNCLAIMED);
	}

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		switch (ent->mleqe_event_type) {
		case MLXCX_EVENT_PAGE_REQUEST:
			VERIFY3U(from_be16(ent->mleqe_page_request.
			    mled_page_request_function_id), ==, 0);
			npages += (int32_t)from_be32(ent->mleqe_page_request.
			    mled_page_request_num_pages);
			break;
		case MLXCX_EVENT_PORT_STATE:
			portn = get_bits8(
			    ent->mleqe_port_state.mled_port_state_port_num,
			    MLXCX_EVENT_PORT_NUM) - 1;
			if (portn >= mlxp->mlx_nports)
				break;
			port = &mlxp->mlx_ports[portn];
			mlxcx_update_link_state(mlxp, port);
			break;
		case MLXCX_EVENT_PORT_MODULE:
			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
			break;
		default:
			mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
			    ent->mleqe_event_type, mleq->mleq_intr_index);
		}
	}

	if (npages > 0) {
		mlxcx_give_pages_once(mlxp, npages);
	} else if (npages < 0) {
		mlxcx_take_pages_once(mlxp, -1 * npages);
	}

	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (DDI_INTR_CLAIMED);
}

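/*
 * Drain completions from a CQ. Each CQE is matched (by its wqe_counter)
 * against the buffers queued on this CQ and handed to the TX or RX
 * completion path as appropriate; RX mblks are chained together and
 * returned through mpp. Returns B_FALSE if the CQ was not in a usable
 * state, or if teardown began while we were working.
 */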
static boolean_t
mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
    size_t bytelim)
{
	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
	mlxcx_completionq_ent_t *cent;
	mblk_t *mp, *cmp, *nmp;
	mlxcx_buffer_t *buf;
	boolean_t found, added;
	size_t bytes = 0;
	uint_t rx_frames = 0;
	uint_t comp_cnt = 0;
	int64_t wqebbs, bufcnt;

	*mpp = NULL;

	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
		return (B_FALSE);
	}

	nmp = cmp = mp = NULL;

	wqebbs = 0;
	bufcnt = 0;
	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/*
		 * Teardown and ring stop can atomic_or this flag
		 * into our state if they want us to stop early.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
			return (B_FALSE);

		comp_cnt++;
		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
			/* NOP */
			atomic_dec_64(&wq->mlwq_wqebb_used);
			goto nextcq;
		}

lookagain:
		/*
		 * Generally the buffer we're looking for will be
		 * at the front of the list, so this loop won't
		 * need to look far.
		 */
		buf = list_head(&mlcq->mlcq_buffers);
		found = B_FALSE;
		while (buf != NULL) {
			if ((buf->mlb_wqe_index & UINT16_MAX) ==
			    from_be16(cent->mlcqe_wqe_counter)) {
				found = B_TRUE;
				break;
			}
			buf = list_next(&mlcq->mlcq_buffers, buf);
		}

		if (!found) {
			/*
			 * If there are any buffers waiting on the
			 * buffers_b list, then merge those into
			 * the main list and have another look.
			 *
			 * The wq enqueue routines push new buffers
			 * into buffers_b so that they can avoid
			 * taking the mlcq_mtx and blocking us for
			 * every single packet.
			 */
			added = B_FALSE;
			mutex_enter(&mlcq->mlcq_bufbmtx);
			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
				list_move_tail(&mlcq->mlcq_buffers,
				    &mlcq->mlcq_buffers_b);
				added = B_TRUE;
			}
			mutex_exit(&mlcq->mlcq_bufbmtx);
			if (added)
				goto lookagain;

			buf = list_head(&mlcq->mlcq_buffers);
			mlxcx_warn(mlxp, "got completion on CQ %x but "
			    "no buffer matching wqe found: %x (first "
			    "buffer counter = %x)", mlcq->mlcq_num,
			    from_be16(cent->mlcqe_wqe_counter),
			    buf == NULL ? UINT32_MAX :
			    buf->mlb_wqe_index);
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}

		/*
		 * The buf is likely to be freed below, count this now.
		 */
		wqebbs += buf->mlb_wqebbs;

		list_remove(&mlcq->mlcq_buffers, buf);
		bufcnt++;

		switch (mlcq->mlcq_wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
			bytes += from_be32(cent->mlcqe_byte_cnt);
			if (nmp != NULL) {
				if (cmp != NULL) {
					cmp->b_next = nmp;
					cmp = nmp;
				} else {
					mp = cmp = nmp;
				}

				rx_frames++;
			}
			break;
		}

		/*
		 * Update the consumer index with what has been processed,
		 * followed by driver counters. It is important to tell the
		 * hardware first, otherwise when we throw more packets at
		 * it, it may get an overflow error.
		 * We do this whenever we've processed enough to bridge the
		 * high->low water mark.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			/*
			 * Both these variables are incremented using
			 * atomics as they are modified in other code paths
			 * (e.g. during tx) which hold different locks.
			 */
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
			wqebbs = 0;
			bufcnt = 0;
			comp_cnt = 0;
		}
nextcq:
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}

	if (comp_cnt > 0) {
		mlxcx_update_cqci(mlxp, mlcq);
		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
	}

	*mpp = mp;
	return (B_TRUE);
}

mblk_t *
mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
{
	mblk_t *mp = NULL;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	ASSERT(mlcq->mlcq_wq != NULL);
	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);

	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);

	return (mp);
}

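/*
 * Interrupt handler for the completion EQs (vectors 1 and up). Each event
 * names a CQ by number; we look it up in this EQ's AVL tree, process its
 * completions (unless a poller already holds it), hand received packets up
 * to MAC, and re-arm the CQ where we processed it and the EQ on the way out.
 */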
static uint_t
mlxcx_intr_n(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_completion_queue_t *mlcq, probe;
	mlxcx_work_queue_t *mlwq;
	mblk_t *mp = NULL;
	boolean_t tellmac = B_FALSE;

	mutex_enter(&mleq->mleq_mtx);

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_CLAIMED);
	}

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
		}
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_CLAIMED);
	}
	mleq->mleq_badintrs = 0;

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
			mutex_exit(&mleq->mleq_mtx);
			return (DDI_INTR_CLAIMED);
		}
		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);

		probe.mlcq_num =
		    from_be24(ent->mleqe_completion.mled_completion_cqn);
		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);

		if (mlcq == NULL)
			continue;

		mlwq = mlcq->mlcq_wq;

		/*
		 * The polling function might have the mutex and stop us from
		 * getting the lock in mlxcx_process_cq(), so we increment
		 * the event counter atomically from outside.
		 *
		 * This way at the end of polling when we go back to interrupts
		 * from this CQ, the event counter is still correct.
		 *
		 * Note that mlxcx_mac_ring_intr_enable() takes the EQ lock so
		 * as to avoid any possibility of racing against us here, so we
		 * only have to consider mlxcx_rx_poll().
		 */
		atomic_inc_32(&mlcq->mlcq_ec);
		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
			/*
			 * If we failed to take the mutex because the
			 * polling function has it, just move on.
			 * We don't want to block other CQs behind
			 * this one.
			 */
			if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
				goto update_eq;

			/* Otherwise we will wait. */
			mutex_enter(&mlcq->mlcq_mtx);
		}

		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
			/*
			 * The ring is not in polling mode and we processed
			 * some completion queue entries.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
				atomic_and_uint(&mlcq->mlcq_state,
				    ~MLXCX_CQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
				atomic_and_uint(&mlwq->mlwq_state,
				    ~MLXCX_WQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			mlxcx_arm_cq(mlxp, mlcq);

			mutex_exit(&mlcq->mlcq_mtx);

			if (tellmac) {
				mac_tx_ring_update(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl);
				tellmac = B_FALSE;
			}

			if (mp != NULL) {
				mac_rx_ring(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
			}
		} else {
			mutex_exit(&mlcq->mlcq_mtx);
		}

update_eq:
		/*
		 * Updating the consumer counter for an EQ requires a write
		 * to the UAR, which is possibly expensive.
		 *
		 * Try to do it only often enough to stop us wrapping around.
		 */
		if ((mleq->mleq_cc & 0x7) == 0)
			mlxcx_update_eq(mlxp, mleq);
	}

	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (DDI_INTR_CLAIMED);
}

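/*
 * Allocate and set up our MSI-X interrupts. Vector 0 is dedicated to the
 * async event queue and mlxcx_intr_0(); every remaining vector gets an
 * event queue serviced by mlxcx_intr_n(). When there are enough vectors
 * (8 or more), the per-vector EQs are typed alternately RX and TX so the
 * two directions don't share an EQ.
 */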
boolean_t
mlxcx_intr_setup(mlxcx_t *mlxp)
{
	dev_info_t *dip = mlxp->mlx_dip;
	int ret;
	int nintrs = 0;
	int navail = 0;
	int types, i;
	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;

	ret = ddi_intr_get_supported_types(dip, &types);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}

	if (!(types & DDI_INTR_TYPE_MSIX)) {
		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
		    "requires MSI-X");
		return (B_FALSE);
	}

	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}
	if (nintrs < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", nintrs);
		return (B_FALSE);
	}

	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
	if (navail < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", navail);
		return (B_FALSE);
	}

	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);

	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	if (mlxp->mlx_intr_count < 2) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;

	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
	    sizeof (mlxcx_event_queue_t);
	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);

	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_0,
	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	/*
	 * If we have enough interrupts, set their "type" fields so that we
	 * avoid mixing RX and TX queues on the same EQs.
	 */
	if (mlxp->mlx_intr_count >= 8) {
		eqt = MLXCX_EQ_TYPE_RX;
	}

	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
		    sizeof (mlxcx_completion_queue_t),
		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
		mlxp->mlx_eqs[i].mleq_intr_index = i;

		mlxp->mlx_eqs[i].mleq_type = eqt;
		/*
		 * If eqt is still ANY, just leave it set to that
		 * (no else here).
		 */
		if (eqt == MLXCX_EQ_TYPE_RX) {
			eqt = MLXCX_EQ_TYPE_TX;
		} else if (eqt == MLXCX_EQ_TYPE_TX) {
			eqt = MLXCX_EQ_TYPE_RX;
		}

		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_intr_teardown(mlxp);
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}