/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2020, the University of Queensland
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);

void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
	int i;
	int ret;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
		if (mleq->mleq_state & MLXCX_EQ_CREATED)
			VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
		if (i != 0) {
			VERIFY(avl_is_empty(&mleq->mleq_cqs));
			avl_destroy(&mleq->mleq_cqs);
		}
		mutex_exit(&mleq->mleq_mtx);
		(void) ddi_intr_disable(mlxp->mlx_intr_handles[i]);
		(void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[i]);
		ret = ddi_intr_free(mlxp->mlx_intr_handles[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
			    i, ret);
		}
		mutex_destroy(&mleq->mleq_mtx);
	}
	kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
	kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
	mlxp->mlx_intr_handles = NULL;
	mlxp->mlx_eqs = NULL;
}

/*
 * Get the next SW-owned entry on the event queue, or NULL if we reach the end.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
	mlxcx_eventq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	/* mleq_nents is always a power of 2 */
	ci = mleq->mleq_cc & (mleq->mleq_nents - 1);

	ent = &mleq->mleq_ent[ci];
	VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
	    sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mleqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mleq->mleq_cc;
		return (ent);
	}
	/*
	 * In the case of a DMA error, we should re-arm this EQ and then come
	 * back and try again when the device wakes us back up.
	 *
	 * Hopefully the fault will be gone by then.
	 */
	ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

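/*
 * Arm this EQ so the hardware will raise an interrupt for the next event,
 * writing the EQ number and consumer counter to the UAR doorbell. The write
 * is retried a few times on an FM access error before we report the
 * service as lost.
 */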
void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	uint_t try = 0;
	ddi_fm_error_t err;
	bits32_t v = new_bits32();

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

	mleq->mleq_state |= MLXCX_EQ_ARMED;
	mleq->mleq_cc_armed = mleq->mleq_cc;

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

retry:
	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

/*
 * Update the EQ consumer counter on the device without re-arming it (so no
 * new interrupt is requested).
 */
static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	bits32_t v = new_bits32();
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mleq->mleq_mtx));
	ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
	ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

	set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
	set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

	mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
	    from_bits32(v));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
	/*
	 * Ignore the error; if it's still happening when we try to re-arm the
	 * EQ, we will note the impact then.
	 */
}

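/*
 * Get the next SW-owned entry on the completion queue, or NULL if we reach
 * the end (or hit a DMA error on the ring).
 */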
static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
	mlxcx_completionq_ent_t *ent;
	ddi_fm_error_t err;
	uint_t ci;
	const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	/* mlcq_nents is always a power of 2 */
	ci = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);

	ent = &mlcq->mlcq_ent[ci];
	VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
	    (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
	    sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
	ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK && (ent->mlcqe_owner & 1) == swowner) {
		/* The PRM says we have to membar here, so we're doing it */
		membar_consumer();
		++mlcq->mlcq_cc;
		return (ent);
	}
	ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle, DDI_FME_VERSION);

	return (NULL);
}

/*
 * Write our current consumer counter into the CQ's doorbell record so the
 * hardware knows how far we have processed.
 */
void
mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	ddi_fm_error_t err;
	uint_t try = 0;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			return;
		}
	}
}

/*
 * Arm this CQ so the hardware will generate a completion event on its EQ for
 * the next completion: update the doorbell record, then ring the UAR
 * doorbell.
 */
void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	bits32_t dbval = new_bits32();
	uint64_t udbval;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
	ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	if (mlcq->mlcq_state & MLXCX_CQ_ARMED)
		ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);

	if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
		return;

	mlcq->mlcq_state |= MLXCX_CQ_ARMED;
	mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
	mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

	set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
	set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

	udbval = (uint64_t)from_bits32(dbval) << 32;
	udbval |= mlcq->mlcq_num & 0xffffff;

	mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
	mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

retry:
	MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM, udbval);
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return;
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

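/*
 * Translate an event queue event type into a name for logging.
 */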
const char *
mlxcx_event_name(mlxcx_event_t evt)
{
	switch (evt) {
	case MLXCX_EVENT_COMPLETION:
		return ("COMPLETION");
	case MLXCX_EVENT_PATH_MIGRATED:
		return ("PATH_MIGRATED");
	case MLXCX_EVENT_COMM_ESTABLISH:
		return ("COMM_ESTABLISH");
	case MLXCX_EVENT_SENDQ_DRAIN:
		return ("SENDQ_DRAIN");
	case MLXCX_EVENT_LAST_WQE:
		return ("LAST_WQE");
	case MLXCX_EVENT_SRQ_LIMIT:
		return ("SRQ_LIMIT");
	case MLXCX_EVENT_DCT_ALL_CLOSED:
		return ("DCT_ALL_CLOSED");
	case MLXCX_EVENT_DCT_ACCKEY_VIOL:
		return ("DCT_ACCKEY_VIOL");
	case MLXCX_EVENT_CQ_ERROR:
		return ("CQ_ERROR");
	case MLXCX_EVENT_WQ_CATASTROPHE:
		return ("WQ_CATASTROPHE");
	case MLXCX_EVENT_PATH_MIGRATE_FAIL:
		return ("PATH_MIGRATE_FAIL");
	case MLXCX_EVENT_PAGE_FAULT:
		return ("PAGE_FAULT");
	case MLXCX_EVENT_WQ_INVALID_REQ:
		return ("WQ_INVALID_REQ");
	case MLXCX_EVENT_WQ_ACCESS_VIOL:
		return ("WQ_ACCESS_VIOL");
	case MLXCX_EVENT_SRQ_CATASTROPHE:
		return ("SRQ_CATASTROPHE");
	case MLXCX_EVENT_INTERNAL_ERROR:
		return ("INTERNAL_ERROR");
	case MLXCX_EVENT_PORT_STATE:
		return ("PORT_STATE");
	case MLXCX_EVENT_GPIO:
		return ("GPIO");
	case MLXCX_EVENT_PORT_MODULE:
		return ("PORT_MODULE");
	case MLXCX_EVENT_TEMP_WARNING:
		return ("TEMP_WARNING");
	case MLXCX_EVENT_REMOTE_CONFIG:
		return ("REMOTE_CONFIG");
	case MLXCX_EVENT_DCBX_CHANGE:
		return ("DCBX_CHANGE");
	case MLXCX_EVENT_DOORBELL_CONGEST:
		return ("DOORBELL_CONGEST");
	case MLXCX_EVENT_STALL_VL:
		return ("STALL_VL");
	case MLXCX_EVENT_CMD_COMPLETION:
		return ("CMD_COMPLETION");
	case MLXCX_EVENT_PAGE_REQUEST:
		return ("PAGE_REQUEST");
	case MLXCX_EVENT_NIC_VPORT:
		return ("NIC_VPORT");
	case MLXCX_EVENT_EC_PARAMS_CHANGE:
		return ("EC_PARAMS_CHANGE");
	case MLXCX_EVENT_XRQ_ERROR:
		return ("XRQ_ERROR");
	}
	return ("UNKNOWN");
}

/* Should be called only when link state has changed. */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
	link_state_t ls;

	mutex_enter(&port->mlp_mtx);
	(void) mlxcx_cmd_query_port_status(mlxp, port);
	(void) mlxcx_cmd_query_port_speed(mlxp, port);

	switch (port->mlp_oper_status) {
	case MLXCX_PORT_STATUS_UP:
	case MLXCX_PORT_STATUS_UP_ONCE:
		ls = LINK_STATE_UP;
		break;
	case MLXCX_PORT_STATUS_DOWN:
		ls = LINK_STATE_DOWN;
		break;
	default:
		ls = LINK_STATE_UNKNOWN;
	}
	mac_link_update(mlxp->mlx_mac_hdl, ls);

	mutex_exit(&port->mlp_mtx);
}

/*
 * Allocate up to MLXCX_MANAGE_PAGES_MAX_PAGES device pages and give them to
 * the hardware.
 */
static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	mlxcx_dev_page_t *mdp;
	int32_t togive;
	mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
	uint_t i;
	const ddi_dma_cookie_t *ck;

	togive = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	for (i = 0; i < togive; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    togive);
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;
		pages[i] = mdp;
	}

	mutex_enter(&mlxp->mlx_pagemtx);

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
		    "pages!", togive);
		goto cleanup_npages;
	}

	for (i = 0; i < togive; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += togive;
	mutex_exit(&mlxp->mlx_pagemtx);

	return;

cleanup_npages:
	for (i = 0; i < togive; i++) {
		mdp = pages[i];
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	/* Tell the hardware we had an allocation failure. */
	(void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
	    0, NULL);
	mutex_exit(&mlxp->mlx_pagemtx);
}

/*
 * Ask the hardware to return up to MLXCX_MANAGE_PAGES_MAX_PAGES pages and
 * free our tracking for each page it hands back.
 */
static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
	uint_t i;
	int32_t req, ret;
	uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];
	mlxcx_dev_page_t *mdp, probe;

	mutex_enter(&mlxp->mlx_pagemtx);

	ASSERT0(avl_is_empty(&mlxp->mlx_pages));
	req = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
		mutex_exit(&mlxp->mlx_pagemtx);
		return;
	}

	for (i = 0; i < ret; i++) {
		bzero(&probe, sizeof (probe));
		probe.mxdp_pa = pas[i];

		mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

		if (mdp != NULL) {
			avl_remove(&mlxp->mlx_pages, mdp);
			mlxp->mlx_npages--;
			mlxcx_dma_free(&mdp->mxdp_dma);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
		} else {
			mlxcx_warn(mlxp, "hardware returned a page "
			    "with PA 0x%" PRIx64 " but we have no "
			    "record of giving out such a page", pas[i]);
		}
	}

	mutex_exit(&mlxp->mlx_pagemtx);
}

static const char *
mlxcx_module_error_string(mlxcx_module_error_type_t err)
{
	switch (err) {
	case MLXCX_MODULE_ERR_POWER_BUDGET:
		return ("POWER_BUDGET");
	case MLXCX_MODULE_ERR_LONG_RANGE:
		return ("LONG_RANGE");
	case MLXCX_MODULE_ERR_BUS_STUCK:
		return ("BUS_STUCK");
	case MLXCX_MODULE_ERR_NO_EEPROM:
		return ("NO_EEPROM");
	case MLXCX_MODULE_ERR_ENFORCEMENT:
		return ("ENFORCEMENT");
	case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		return ("UNKNOWN_IDENT");
	case MLXCX_MODULE_ERR_HIGH_TEMP:
		return ("HIGH_TEMP");
	case MLXCX_MODULE_ERR_CABLE_SHORTED:
		return ("CABLE_SHORTED");
	default:
		return ("UNKNOWN");
	}
}

/*
 * Post an FM ereport for a transceiver module error reported by the hardware
 * in a PORT_MODULE event.
 */
static void
mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *lename;
	const char *ename;
	const char *stname;
	uint_t eno = 0;
	mlxcx_module_status_t state = evd->mled_port_mod_module_status;

	switch (state) {
	case MLXCX_MODULE_ERROR:
		stname = "error";
		eno = evd->mled_port_mod_error_type;
		lename = mlxcx_module_error_string(eno);
		switch (eno) {
		case MLXCX_MODULE_ERR_ENFORCEMENT:
			ename = DDI_FM_TXR_ERROR_WHITELIST;
			break;
		case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
		case MLXCX_MODULE_ERR_NO_EEPROM:
			ename = DDI_FM_TXR_ERROR_NOTSUPP;
			break;
		case MLXCX_MODULE_ERR_HIGH_TEMP:
			ename = DDI_FM_TXR_ERROR_OVERTEMP;
			break;
		case MLXCX_MODULE_ERR_POWER_BUDGET:
		case MLXCX_MODULE_ERR_LONG_RANGE:
		case MLXCX_MODULE_ERR_CABLE_SHORTED:
			ename = DDI_FM_TXR_ERROR_HWFAIL;
			break;
		case MLXCX_MODULE_ERR_BUS_STUCK:
		default:
			ename = DDI_FM_TXR_ERROR_UNKNOWN;
		}
		break;
	default:
		return;
	}

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    DDI_FM_NIC, DDI_FM_TXR_ERROR);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    /* compulsory FM props */
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    /* generic NIC txr error event props */
	    "error", DATA_TYPE_STRING, ename,
	    "port_index", DATA_TYPE_UINT8, 0,
	    "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
	    /* local props */
	    "mlxcx_state", DATA_TYPE_STRING, stname,
	    "mlxcx_error", DATA_TYPE_STRING, lename,
	    "mlxcx_error_num", DATA_TYPE_UINT8, eno,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

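/*
 * Interrupt handler for EQ 0, which carries the asynchronous events from the
 * hardware: page requests, port state changes and port module errors.
 */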
static uint_t
mlxcx_intr_0(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_port_t *port;
	uint_t portn;
	int32_t npages = 0;

	mutex_enter(&mleq->mleq_mtx);

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mlxcx_warn(mlxp, "int %d on bad eq state",
		    mleq->mleq_intr_index);
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_UNCLAIMED);
	}

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_UNCLAIMED);
	}

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		switch (ent->mleqe_event_type) {
		case MLXCX_EVENT_PAGE_REQUEST:
			VERIFY3U(from_be16(ent->mleqe_page_request.
			    mled_page_request_function_id), ==, 0);
			npages += (int32_t)from_be32(ent->mleqe_page_request.
			    mled_page_request_num_pages);
			break;
		case MLXCX_EVENT_PORT_STATE:
			portn = get_bits8(
			    ent->mleqe_port_state.mled_port_state_port_num,
			    MLXCX_EVENT_PORT_NUM) - 1;
			if (portn >= mlxp->mlx_nports)
				break;
			port = &mlxp->mlx_ports[portn];
			mlxcx_update_link_state(mlxp, port);
			break;
		case MLXCX_EVENT_PORT_MODULE:
			mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
			break;
		default:
			mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
			    ent->mleqe_event_type, mleq->mleq_intr_index);
		}
	}

	if (npages > 0) {
		mlxcx_give_pages_once(mlxp, npages);
	} else if (npages < 0) {
		mlxcx_take_pages_once(mlxp, -1 * npages);
	}

	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (DDI_INTR_CLAIMED);
}

/*
 * Process completions on the given CQ, chaining any received packets onto
 * *mpp, until the CQ is empty, the per-CQ packet limit is hit, or bytelim
 * (if non-zero) is exceeded. Returns B_FALSE if the CQ is being torn down.
 */
static boolean_t
mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
    size_t bytelim)
{
	mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
	mlxcx_completionq_ent_t *cent;
	mblk_t *mp, *cmp, *nmp;
	mlxcx_buffer_t *buf;
	boolean_t found, added;
	size_t bytes = 0;
	uint_t rx_frames = 0;
	uint_t comp_cnt = 0;
	int64_t wqebbs, bufcnt;

	*mpp = NULL;

	if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
	    !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
	    (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
		return (B_FALSE);
	}

	nmp = cmp = mp = NULL;

	wqebbs = 0;
	bufcnt = 0;
	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/*
		 * Teardown and ring stop can atomic_or this flag
		 * into our state if they want us to stop early.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
			return (B_FALSE);

		comp_cnt++;
		if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
		    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
			/* NOP */
			atomic_dec_64(&wq->mlwq_wqebb_used);
			goto nextcq;
		}

lookagain:
		/*
		 * Generally the buffer we're looking for will be
		 * at the front of the list, so this loop won't
		 * need to look far.
		 */
		buf = list_head(&mlcq->mlcq_buffers);
		found = B_FALSE;
		while (buf != NULL) {
			if ((buf->mlb_wqe_index & UINT16_MAX) ==
			    from_be16(cent->mlcqe_wqe_counter)) {
				found = B_TRUE;
				break;
			}
			buf = list_next(&mlcq->mlcq_buffers, buf);
		}

		if (!found) {
			/*
			 * If there are any buffers waiting on the
			 * buffers_b list, then merge those into
			 * the main list and have another look.
			 *
			 * The wq enqueue routines push new buffers
			 * into buffers_b so that they can avoid
			 * taking the mlcq_mtx and blocking us for
			 * every single packet.
			 */
			added = B_FALSE;
			mutex_enter(&mlcq->mlcq_bufbmtx);
			if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
				list_move_tail(&mlcq->mlcq_buffers,
				    &mlcq->mlcq_buffers_b);
				added = B_TRUE;
			}
			mutex_exit(&mlcq->mlcq_bufbmtx);
			if (added)
				goto lookagain;

			buf = list_head(&mlcq->mlcq_buffers);
			mlxcx_warn(mlxp, "got completion on CQ %x but "
			    "no buffer matching wqe found: %x (first "
			    "buffer counter = %x)", mlcq->mlcq_num,
			    from_be16(cent->mlcqe_wqe_counter),
			    buf == NULL ? UINT32_MAX :
			    buf->mlb_wqe_index);
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			goto nextcq;
		}

		/*
		 * The buf is likely to be freed below, so count this now.
		 */
		wqebbs += buf->mlb_wqebbs;

		list_remove(&mlcq->mlcq_buffers, buf);
		bufcnt++;

		switch (mlcq->mlcq_wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_tx_completion(mlxp, mlcq, cent, buf);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
			bytes += from_be32(cent->mlcqe_byte_cnt);
			if (nmp != NULL) {
				if (cmp != NULL) {
					cmp->b_next = nmp;
					cmp = nmp;
				} else {
					mp = cmp = nmp;
				}

				rx_frames++;
			}
			break;
		}

		/*
		 * Update the consumer index with what has been processed,
		 * followed by driver counters. It is important to tell the
		 * hardware first, otherwise when we throw more packets at
		 * it, it may get an overflow error.
		 * We do this whenever we've processed enough to bridge the
		 * high->low water mark.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			/*
			 * Both these variables are incremented using
			 * atomics as they are modified in other code paths
			 * (e.g. during tx) which hold different locks.
			 */
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
			wqebbs = 0;
			bufcnt = 0;
			comp_cnt = 0;
		}
nextcq:
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}

	if (comp_cnt > 0) {
		mlxcx_update_cqci(mlxp, mlcq);
		atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
		atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
	}

	*mpp = mp;
	return (B_TRUE);
}

mblk_t *
mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
{
	mblk_t *mp = NULL;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	ASSERT(mlcq->mlcq_wq != NULL);
	ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);

	(void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);

	return (mp);
}

/*
 * Interrupt handler for the remaining EQs (EQ 1 and up), which carry
 * completion events for the RX and TX completion queues.
 */
static uint_t
mlxcx_intr_n(caddr_t arg, caddr_t arg2)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
	mlxcx_eventq_ent_t *ent;
	mlxcx_completion_queue_t *mlcq, probe;
	mlxcx_work_queue_t *mlwq;
	mblk_t *mp = NULL;
	boolean_t tellmac = B_FALSE;

	mutex_enter(&mleq->mleq_mtx);

	if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
	    !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
	    (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_CLAIMED);
	}

	ent = mlxcx_eq_next(mleq);
	if (ent == NULL) {
		if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
		}
		mutex_exit(&mleq->mleq_mtx);
		return (DDI_INTR_CLAIMED);
	}
	mleq->mleq_badintrs = 0;

	ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
	mleq->mleq_state &= ~MLXCX_EQ_ARMED;

	for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
		if (ent->mleqe_event_type != MLXCX_EVENT_COMPLETION) {
			mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
			ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
			(void) ddi_intr_disable(mlxp->mlx_intr_handles[
			    mleq->mleq_intr_index]);
			mutex_exit(&mleq->mleq_mtx);
			return (DDI_INTR_CLAIMED);
		}
		ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);

		probe.mlcq_num =
		    from_be24(ent->mleqe_completion.mled_completion_cqn);
		mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);

		if (mlcq == NULL)
			continue;

		mlwq = mlcq->mlcq_wq;

		/*
		 * The polling function might have the mutex and stop us from
		 * getting the lock in mlxcx_process_cq(), so we increment
		 * the event counter atomically from outside.
		 *
		 * This way at the end of polling when we go back to interrupts
		 * from this CQ, the event counter is still correct.
		 *
		 * Note that mlxcx_mac_ring_intr_enable() takes the EQ lock so
		 * as to avoid any possibility of racing against us here, so we
		 * only have to consider mlxcx_rx_poll().
		 */
		atomic_inc_32(&mlcq->mlcq_ec);
		atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

		if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
			/*
			 * If we failed to take the mutex because the
			 * polling function has it, just move on.
			 * We don't want to block other CQs behind
			 * this one.
			 */
			if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
				goto update_eq;

			/* Otherwise we will wait. */
			mutex_enter(&mlcq->mlcq_mtx);
		}

		if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
		    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
			/*
			 * The ring is not in polling mode and we processed
			 * some completion queue entries.
			 */
			if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
			    mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
				atomic_and_uint(&mlcq->mlcq_state,
				    ~MLXCX_CQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
			    mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
				atomic_and_uint(&mlwq->mlwq_state,
				    ~MLXCX_WQ_BLOCKED_MAC);
				tellmac = B_TRUE;
			}

			mlxcx_arm_cq(mlxp, mlcq);

			mutex_exit(&mlcq->mlcq_mtx);

			if (tellmac) {
				mac_tx_ring_update(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl);
				tellmac = B_FALSE;
			}

			if (mp != NULL) {
				mac_rx_ring(mlxp->mlx_mac_hdl,
				    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
			}
		} else {
			mutex_exit(&mlcq->mlcq_mtx);
		}

update_eq:
		/*
		 * Updating the consumer counter for an EQ requires a write
		 * to the UAR, which is possibly expensive.
		 *
		 * Try to do it only often enough to stop us wrapping around.
		 */
		if ((mleq->mleq_cc & 0x7) == 0)
			mlxcx_update_eq(mlxp, mleq);
	}

	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (DDI_INTR_CLAIMED);
}

/*
 * Set up MSI-X interrupts: allocate the handles and the EQ array, attach
 * mlxcx_intr_0 (async events) to vector 0 and mlxcx_intr_n (completions) to
 * the rest.
 */
boolean_t
mlxcx_intr_setup(mlxcx_t *mlxp)
{
	dev_info_t *dip = mlxp->mlx_dip;
	int ret;
	int nintrs = 0;
	int navail = 0;
	int types, i;
	mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;

	ret = ddi_intr_get_supported_types(dip, &types);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}

	if (!(types & DDI_INTR_TYPE_MSIX)) {
		mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
		    "requires MSI-X");
		return (B_FALSE);
	}

	ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
	if (ret != DDI_SUCCESS) {
		return (B_FALSE);
	}
	if (nintrs < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", nintrs);
		return (B_FALSE);
	}

	ret = ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail);
	if (navail < 2) {
		mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
		    "requires 2", navail);
		return (B_FALSE);
	}

	mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
	mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);

	ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
	    0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	if (mlxp->mlx_intr_count < 2) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}
	mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;

	ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
	    sizeof (mlxcx_event_queue_t);
	mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);

	ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_0,
	    (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
	if (ret != DDI_SUCCESS) {
		mlxcx_intr_teardown(mlxp);
		return (B_FALSE);
	}

	/*
	 * If we have enough interrupts, set their "type" fields so that we
	 * avoid mixing RX and TX queues on the same EQs.
	 */
	if (mlxp->mlx_intr_count >= 8) {
		eqt = MLXCX_EQ_TYPE_RX;
	}

	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
		mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		avl_create(&mlxp->mlx_eqs[i].mleq_cqs, mlxcx_cq_compare,
		    sizeof (mlxcx_completion_queue_t),
		    offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
		mlxp->mlx_eqs[i].mleq_intr_index = i;

		mlxp->mlx_eqs[i].mleq_type = eqt;
		/*
		 * If eqt is still ANY, just leave it set to that
		 * (no else here).
		 */
		if (eqt == MLXCX_EQ_TYPE_RX) {
			eqt = MLXCX_EQ_TYPE_TX;
		} else if (eqt == MLXCX_EQ_TYPE_TX) {
			eqt = MLXCX_EQ_TYPE_RX;
		}

		ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
		    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
		if (ret != DDI_SUCCESS) {
			mlxcx_intr_teardown(mlxp);
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}