/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpuvar.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>

#include <sys/mac_provider.h>

#include <sys/random.h>

#include <mlxcx.h>

boolean_t
mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
    ddi_device_acc_attr_t acc;
    ddi_dma_attr_t attr;
    boolean_t ret;
    size_t sz;

    VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);

    /* Receive and send queue entries might be different sizes. */
    switch (mlwq->mlwq_type) {
    case MLXCX_WQ_TYPE_SENDQ:
        mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
        mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
        sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
        break;
    case MLXCX_WQ_TYPE_RECVQ:
        mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
        mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
        sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
        break;
    default:
        VERIFY(0);
        return (B_FALSE);
    }
    ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

    mlxcx_dma_acc_attr(mlxp, &acc);
    mlxcx_dma_queue_attr(mlxp, &attr);

    ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
        B_TRUE, sz, B_TRUE);
    if (!ret) {
        mlxcx_warn(mlxp, "failed to allocate WQ memory");
        return (B_FALSE);
    }

    /*
     * Just set the first pointer in the union. Yes, this is a strict
     * aliasing violation. No, I don't care.
     */
    mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;

    mlxcx_dma_acc_attr(mlxp, &acc);
    mlxcx_dma_qdbell_attr(mlxp, &attr);
    sz = sizeof (mlxcx_workq_doorbell_t);
    ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
        B_TRUE, sz, B_TRUE);
    if (!ret) {
        mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
        mlxcx_dma_free(&mlwq->mlwq_dma);
        mlwq->mlwq_send_ent = NULL;
        return (B_FALSE);
    }

    mlwq->mlwq_doorbell =
        (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;

    mlwq->mlwq_state |= MLXCX_WQ_ALLOC;

    return (B_TRUE);
}

void
mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
    VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
    if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
        VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);

    mlxcx_dma_free(&mlwq->mlwq_dma);
    mlwq->mlwq_send_ent = NULL;
    mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
    mlwq->mlwq_doorbell = NULL;

    mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
}

static boolean_t
mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    uint_t ent_shift)
{
    ddi_device_acc_attr_t acc;
    ddi_dma_attr_t attr;
    boolean_t ret;
    size_t sz, i;

    VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);

    mlcq->mlcq_entshift = ent_shift;
    mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
    sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
    ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

    mlxcx_dma_acc_attr(mlxp, &acc);
    mlxcx_dma_queue_attr(mlxp, &attr);

    ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
        B_TRUE, sz, B_TRUE);
    if (!ret) {
        mlxcx_warn(mlxp, "failed to allocate CQ memory");
        return (B_FALSE);
    }

    mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;

    for (i = 0; i < mlcq->mlcq_nents; ++i) {
        mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
        mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
    }

    mlxcx_dma_acc_attr(mlxp, &acc);
    mlxcx_dma_qdbell_attr(mlxp, &attr);
    sz = sizeof (mlxcx_completionq_doorbell_t);
    ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
        B_TRUE, sz, B_TRUE);
    if (!ret) {
        mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
        mlxcx_dma_free(&mlcq->mlcq_dma);
        mlcq->mlcq_ent = NULL;
        return (B_FALSE);
    }

    mlcq->mlcq_doorbell =
        (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;

    mlcq->mlcq_state |= MLXCX_CQ_ALLOC;

    return (B_TRUE);
}

static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
    VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
    if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
        VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

    mlxcx_dma_free(&mlcq->mlcq_dma);
    mlcq->mlcq_ent = NULL;
    mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
    mlcq->mlcq_doorbell = NULL;

    mlcq->mlcq_state &= ~MLXCX_CQ_ALLOC;
}

void
mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
    mlxcx_completion_queue_t *mlcq;

    /*
     * If something is holding the lock on a long operation like a
     * refill, setting this flag asks them to exit early if possible.
     */
    atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);

    mutex_enter(&mlwq->mlwq_mtx);

    list_remove(&mlxp->mlx_wqs, mlwq);

    if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
        !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
        if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
            mlwq->mlwq_state & MLXCX_WQ_STARTED &&
            !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
            mlxcx_warn(mlxp, "failed to stop "
                "recv queue num %x", mlwq->mlwq_num);
        }
        if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
            mlwq->mlwq_state & MLXCX_WQ_STARTED &&
            !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
            mlxcx_warn(mlxp, "failed to stop "
                "send queue num %x", mlwq->mlwq_num);
        }
        if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
            !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
            mlxcx_warn(mlxp, "failed to destroy "
                "recv queue num %x", mlwq->mlwq_num);
        }
        if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
            !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
            mlxcx_warn(mlxp, "failed to destroy "
                "send queue num %x", mlwq->mlwq_num);
        }
    }
    if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
        mlxcx_wq_rele_dma(mlxp, mlwq);
    }
    mlcq = mlwq->mlwq_cq;

    /* These will be released by mlxcx_teardown_bufs() */
    mlwq->mlwq_bufs = NULL;
    mlwq->mlwq_foreign_bufs = NULL;

    mutex_exit(&mlwq->mlwq_mtx);

    mutex_enter(&mlcq->mlcq_mtx);
    mutex_enter(&mlwq->mlwq_mtx);
    ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
    mlcq->mlcq_wq = NULL;
    mutex_exit(&mlwq->mlwq_mtx);
    mutex_exit(&mlcq->mlcq_mtx);

    mutex_destroy(&mlwq->mlwq_mtx);
}

void
mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
    mlxcx_event_queue_t *mleq;
    mlxcx_buffer_t *b;

    /*
     * If something is holding the lock on a long operation like polling
     * which we're going to abort anyway, this flag asks them to exit
     * early if possible.
     */
    atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);

    mutex_enter(&mlcq->mlcq_mtx);

    list_remove(&mlxp->mlx_cqs, mlcq);

    if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
        !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
        if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
            mlxcx_warn(mlxp, "failed to destroy "
                "completion queue num %u",
                mlcq->mlcq_num);
        }
    }
    if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
        mlxcx_cq_rele_dma(mlxp, mlcq);
    }
    /*
     * If we're on an EQ AVL tree, then we need to grab
     * the EQ's mutex to take it off. The ISR always takes
     * EQ mutex before CQ mutex, so we have to let go of
     * the CQ mutex then come back again.
     *
     * The ISR will bail out if it tries to touch this CQ now since
     * we added the CQ_DESTROYED flag above.
     */
    if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
        mleq = mlcq->mlcq_eq;
    } else {
        mleq = NULL;
    }

    /* Return any outstanding buffers to the free pool. */
    while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
        mlxcx_buf_return_chain(mlxp, b, B_FALSE);
    }
    mutex_enter(&mlcq->mlcq_bufbmtx);
    while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
        mlxcx_buf_return_chain(mlxp, b, B_FALSE);
    }
    mutex_exit(&mlcq->mlcq_bufbmtx);

    /*
     * Since the interrupt handlers take the EQ lock before the CQ one,
     * we must do the same here. That means letting go of the lock
     * for a brief window here (we'll double-check the state when we
     * get back in).
     */
    mutex_exit(&mlcq->mlcq_mtx);

    if (mleq != NULL) {
        mutex_enter(&mleq->mleq_mtx);
        mutex_enter(&mlcq->mlcq_mtx);
        /*
         * Double-check the state, we let go of the
         * mutex briefly.
         */
        if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
            avl_remove(&mleq->mleq_cqs, mlcq);
            mlcq->mlcq_state &= ~MLXCX_CQ_EQAVL;
        }
        mutex_exit(&mlcq->mlcq_mtx);
        mutex_exit(&mleq->mleq_mtx);
    }

    mutex_enter(&mlcq->mlcq_mtx);
    ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
        MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
    mutex_exit(&mlcq->mlcq_mtx);

    mutex_destroy(&mlcq->mlcq_mtx);
    mutex_destroy(&mlcq->mlcq_bufbmtx);
    list_destroy(&mlcq->mlcq_buffers);
    list_destroy(&mlcq->mlcq_buffers_b);
    kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
}

static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
    mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
    mlxcx_completion_queue_t *cq;

    cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
    mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));
    mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));
    list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
        offsetof(mlxcx_buffer_t, mlb_cq_entry));
    list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
        offsetof(mlxcx_buffer_t, mlb_cq_entry));

    cq->mlcq_mlx = mlxp;
    list_insert_tail(&mlxp->mlx_cqs, cq);

    mutex_enter(&cq->mlcq_mtx);

    if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
        mutex_exit(&cq->mlcq_mtx);
        return (B_FALSE);
    }

    cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
    cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;

    cq->mlcq_uar = &mlxp->mlx_uar;
    cq->mlcq_eq = eq;

    cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
    cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;

    if (!mlxcx_cmd_create_cq(mlxp, cq)) {
        mutex_exit(&cq->mlcq_mtx);
        return (B_FALSE);
    }

    mutex_exit(&cq->mlcq_mtx);

    mutex_enter(&eq->mleq_mtx);
    mutex_enter(&cq->mlcq_mtx);
    ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
    avl_add(&eq->mleq_cqs, cq);
    cq->mlcq_state |= MLXCX_CQ_EQAVL;
    mlxcx_arm_cq(mlxp, cq);
    mutex_exit(&cq->mlcq_mtx);
    mutex_exit(&eq->mleq_mtx);

    *cqp = cq;
    return (B_TRUE);
}

static boolean_t
mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
    mlxcx_work_queue_t *wq)
{
    mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));

    list_insert_tail(&mlxp->mlx_wqs, wq);

    mutex_enter(&wq->mlwq_mtx);

    wq->mlwq_mlx = mlxp;
    wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
    wq->mlwq_cq = cq;
    wq->mlwq_pd = &mlxp->mlx_pd;
    wq->mlwq_uar = &mlxp->mlx_uar;

    wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);

    if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
        mutex_exit(&wq->mlwq_mtx);
        return (B_FALSE);
    }

    if (!mlxcx_cmd_create_rq(mlxp, wq)) {
        mutex_exit(&wq->mlwq_mtx);
        return (B_FALSE);
    }

    wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
    wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

    mutex_exit(&wq->mlwq_mtx);

    mutex_enter(&cq->mlcq_mtx);
    mutex_enter(&wq->mlwq_mtx);
    ASSERT3P(cq->mlcq_wq, ==, NULL);
    cq->mlcq_wq = wq;
    mutex_exit(&wq->mlwq_mtx);
    mutex_exit(&cq->mlcq_mtx);

    return (B_TRUE);
}
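
/*
 * mlxcx_sq_setup() below mirrors mlxcx_rq_setup() above: the work queue is
 * created under its own mutex and only afterwards linked to its completion
 * queue, taking the CQ mutex before the WQ mutex. That is the same ordering
 * mlxcx_wq_teardown() uses when it unlinks the two again.
 */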
static boolean_t
mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
    mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
{
    mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));

    list_insert_tail(&mlxp->mlx_wqs, wq);

    mutex_enter(&wq->mlwq_mtx);

    wq->mlwq_mlx = mlxp;
    wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
    wq->mlwq_cq = cq;
    wq->mlwq_pd = &mlxp->mlx_pd;
    wq->mlwq_uar = &mlxp->mlx_uar;
    wq->mlwq_tis = tis;

    wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
    wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);

    VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
    wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;

    if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
        mutex_exit(&wq->mlwq_mtx);
        return (B_FALSE);
    }

    if (!mlxcx_cmd_create_sq(mlxp, wq)) {
        mutex_exit(&wq->mlwq_mtx);
        return (B_FALSE);
    }

    wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
    wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

    mutex_exit(&wq->mlwq_mtx);

    mutex_enter(&cq->mlcq_mtx);
    mutex_enter(&wq->mlwq_mtx);
    ASSERT3P(cq->mlcq_wq, ==, NULL);
    cq->mlcq_wq = wq;
    mutex_exit(&wq->mlwq_mtx);
    mutex_exit(&cq->mlcq_mtx);

    return (B_TRUE);
}

/*
 * Before we tear down the queues associated with the rx group,
 * flag each cq as being torn down and wake up any tasks.
 */
static void
mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
    mlxcx_work_queue_t *wq;
    mlxcx_completion_queue_t *cq;
    mlxcx_buf_shard_t *s;
    uint_t i;

    mutex_enter(&g->mlg_mtx);

    for (i = 0; i < g->mlg_nwqs; ++i) {
        wq = &g->mlg_wqs[i];
        cq = wq->mlwq_cq;
        if (cq != NULL) {
            s = wq->mlwq_bufs;
            mutex_enter(&s->mlbs_mtx);
            atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
            cv_broadcast(&s->mlbs_free_nonempty);
            mutex_exit(&s->mlbs_mtx);
        }
    }

    mutex_exit(&g->mlg_mtx);
}

void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
    mlxcx_work_queue_t *wq;
    mlxcx_completion_queue_t *cq;
    mlxcx_flow_entry_t *fe;
    mlxcx_flow_group_t *fg;
    mlxcx_flow_table_t *ft;
    uint_t i;

    mutex_enter(&g->mlg_port->mlp_mtx);
    mutex_enter(&g->mlg_mtx);

    if (g->mlg_state & MLXCX_GROUP_FLOWS) {
        mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);

        if (g->mlg_rx_vlan_ft != NULL)
            mlxcx_remove_all_vlan_entries(mlxp, g);

        if (g == &mlxp->mlx_rx_groups[0]) {
            ft = g->mlg_port->mlp_rx_flow;
            mutex_enter(&ft->mlft_mtx);

            fg = g->mlg_port->mlp_bcast;
            fe = list_head(&fg->mlfg_entries);
            if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
                (void) mlxcx_cmd_delete_flow_table_entry(
                    mlxp, fe);
            }

            fg = g->mlg_port->mlp_promisc;
            fe = list_head(&fg->mlfg_entries);
            if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
                (void) mlxcx_cmd_delete_flow_table_entry(
                    mlxp, fe);
            }

            mutex_exit(&ft->mlft_mtx);
        }

        if (g->mlg_rx_vlan_ft != NULL) {
            mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
            ASSERT(list_is_empty(&g->mlg_rx_vlans));
            fg = g->mlg_rx_vlan_def_fg;
            fe = list_head(&fg->mlfg_entries);
            if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
                (void) mlxcx_cmd_delete_flow_table_entry(
                    mlxp, fe);
            }
            fg = g->mlg_rx_vlan_promisc_fg;
            fe = list_head(&fg->mlfg_entries);
            if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
                (void) mlxcx_cmd_delete_flow_table_entry(
                    mlxp, fe);
            }
            mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
            list_destroy(&g->mlg_rx_vlans);

            g->mlg_rx_vlan_ft = NULL;
        }

        mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
        mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
        g->mlg_rx_hash_ft = NULL;

        avl_destroy(&g->mlg_rx_macs);
        g->mlg_state &= ~MLXCX_GROUP_FLOWS;
    }

    if (g->mlg_state & MLXCX_GROUP_RUNNING) {
        for (i = 0; i < g->mlg_nwqs; ++i) {
            wq = &g->mlg_wqs[i];
            mutex_enter(&wq->mlwq_mtx);
            if (wq->mlwq_state & MLXCX_WQ_STARTED &&
                !mlxcx_cmd_stop_rq(mlxp, wq)) {
                mlxcx_warn(mlxp, "failed to stop rq %x",
                    wq->mlwq_num);
            }
            mutex_exit(&wq->mlwq_mtx);
        }
        taskq_destroy(g->mlg_refill_tq);
        g->mlg_state &= ~MLXCX_GROUP_RUNNING;
    }

    if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
        for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
            mlxcx_tir_t *tir = &g->mlg_tir[i];
            if (tir->mltir_state & MLXCX_TIR_CREATED &&
                !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
                if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
                    mlxcx_warn(mlxp,
                        "failed to destroy tir %u "
                        "for rx ring", tir->mltir_num);
                }
            }
        }
        g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
    }

    if (g->mlg_state & MLXCX_GROUP_RQT) {
        if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
            !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
            if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
                mlxcx_warn(mlxp, "failed to destroy rqt %u "
                    "for rx ring", g->mlg_rqt->mlrqt_num);
            }
            kmem_free(g->mlg_rqt->mlrqt_rq,
                g->mlg_rqt->mlrqt_rq_size);
            g->mlg_rqt->mlrqt_rq = NULL;
            kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
            g->mlg_rqt = NULL;
        }
        g->mlg_state &= ~MLXCX_GROUP_RQT;
    }

    for (i = 0; i < g->mlg_nwqs; ++i) {
        wq = &g->mlg_wqs[i];
        cq = wq->mlwq_cq;
        mlxcx_wq_teardown(mlxp, wq);
        if (cq != NULL)
            mlxcx_cq_teardown(mlxp, cq);
    }
    kmem_free(g->mlg_wqs, g->mlg_wqs_size);
    g->mlg_wqs = NULL;
    g->mlg_state &= ~MLXCX_GROUP_WQS;

    mutex_exit(&g->mlg_mtx);
    mutex_exit(&g->mlg_port->mlp_mtx);

    mutex_destroy(&g->mlg_mtx);

    g->mlg_state &= ~MLXCX_GROUP_INIT;
    ASSERT3S(g->mlg_state, ==, 0);
}

void
mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
    mlxcx_work_queue_t *wq;
    mlxcx_completion_queue_t *cq;
    uint_t i;

    mutex_enter(&g->mlg_mtx);

    if (g->mlg_state & MLXCX_GROUP_WQS) {
        for (i = 0; i < g->mlg_nwqs; ++i) {
            wq = &g->mlg_wqs[i];
            mutex_enter(&wq->mlwq_mtx);
            cq = wq->mlwq_cq;
            if (wq->mlwq_state & MLXCX_WQ_STARTED &&
                !mlxcx_cmd_stop_sq(mlxp, wq)) {
                mlxcx_warn(mlxp, "failed to stop sq %x",
                    wq->mlwq_num);
            }
            mutex_exit(&wq->mlwq_mtx);
            mlxcx_wq_teardown(mlxp, wq);
            if (cq != NULL)
                mlxcx_cq_teardown(mlxp, cq);
        }
        g->mlg_state &= ~MLXCX_GROUP_RUNNING;
        kmem_free(g->mlg_wqs, g->mlg_wqs_size);
        g->mlg_wqs = NULL;
        g->mlg_state &= ~MLXCX_GROUP_WQS;
    }

    if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
        g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
        !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
        if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
            mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
                g->mlg_tis.mltis_num);
        }
    }
    g->mlg_state &= ~MLXCX_GROUP_TIRTIS;

    mutex_exit(&g->mlg_mtx);
    mutex_destroy(&g->mlg_mtx);
    g->mlg_state &= ~MLXCX_GROUP_INIT;
    ASSERT3S(g->mlg_state, ==, 0);
}
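
/*
 * Tear down all of the rx and tx ring groups. The rx groups get two passes:
 * first mlxcx_quiesce_rx_cqs() flags every CQ for teardown and wakes any
 * refill task sleeping on a buffer shard, and only then is each group torn
 * down for real. The tx groups have no refill task, so they need no quiesce
 * pass.
 */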
void
mlxcx_teardown_groups(mlxcx_t *mlxp)
{
    mlxcx_ring_group_t *g;
    uint_t i;

    for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
        g = &mlxp->mlx_rx_groups[i];
        if (!(g->mlg_state & MLXCX_GROUP_INIT))
            continue;
        ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
        mlxcx_quiesce_rx_cqs(mlxp, g);
    }

    for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
        g = &mlxp->mlx_rx_groups[i];
        if (!(g->mlg_state & MLXCX_GROUP_INIT))
            continue;
        mlxcx_teardown_rx_group(mlxp, g);
    }

    kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
    mlxp->mlx_rx_groups = NULL;

    for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
        g = &mlxp->mlx_tx_groups[i];
        if (!(g->mlg_state & MLXCX_GROUP_INIT))
            continue;
        ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
        mlxcx_teardown_tx_group(mlxp, g);
    }

    kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
    mlxp->mlx_tx_groups = NULL;
}

boolean_t
mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
    mlxcx_event_queue_t *eq;
    mlxcx_completion_queue_t *cq;
    mlxcx_work_queue_t *rq;
    mlxcx_flow_table_t *ft;
    mlxcx_flow_group_t *fg;
    mlxcx_flow_entry_t *fe;
    uint_t ent_shift;
    uint_t i, j;

    ASSERT3S(g->mlg_state, ==, 0);

    mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));
    mutex_enter(&g->mlg_mtx);
    g->mlg_mlx = mlxp;
    g->mlg_type = MLXCX_GROUP_RX;
    g->mlg_port = &mlxp->mlx_ports[0];
    g->mlg_state |= MLXCX_GROUP_INIT;

    g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
    i = g - &mlxp->mlx_rx_groups[0];
    if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
        g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;

    g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
    g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
    g->mlg_state |= MLXCX_GROUP_WQS;

    g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
    g->mlg_rqt->mlrqt_max = 2;
    while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
        g->mlg_rqt->mlrqt_max <<= 1;
    g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
        sizeof (mlxcx_work_queue_t *);
    g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
    g->mlg_state |= MLXCX_GROUP_RQT;

    for (i = 0; i < g->mlg_nwqs; ++i) {
        eq = NULL;
        while (eq == NULL) {
            eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
            if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
                mlxp->mlx_next_eq = 1;
            if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
                eq->mleq_type != MLXCX_EQ_TYPE_RX) {
                /* Try the next one */
                eq = NULL;
            }
        }

        /*
         * A single completion is indicated for each rq entry as
         * it is used. So, the number of cq entries never needs
         * to be larger than the rq.
         */
        ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
            mlxp->mlx_props.mldp_rq_size_shift);
        if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
            g->mlg_nwqs = i;
            break;
        }

        cq->mlcq_stats = &g->mlg_port->mlp_stats;

        rq = &g->mlg_wqs[i];
        if (!mlxcx_rq_setup(mlxp, cq, rq)) {
            g->mlg_nwqs = i;
            break;
        }
        g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
        g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
        rq->mlwq_group = g;
    }
    if (g->mlg_nwqs == 0) {
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
        mlxcx_tir_t *tir = &g->mlg_tir[i];
        tir->mltir_tdom = &mlxp->mlx_tdom;
        switch (i) {
        case MLXCX_TIR_ROLE_OTHER:
            tir->mltir_type = MLXCX_TIR_DIRECT;
            tir->mltir_rq = &g->mlg_wqs[0];
            break;
        case MLXCX_TIR_ROLE_IPv4:
        case MLXCX_TIR_ROLE_IPv6:
        case MLXCX_TIR_ROLE_TCPv4:
        case MLXCX_TIR_ROLE_TCPv6:
        case MLXCX_TIR_ROLE_UDPv4:
        case MLXCX_TIR_ROLE_UDPv6:
            tir->mltir_type = MLXCX_TIR_INDIRECT;
            tir->mltir_rqtable = g->mlg_rqt;
            tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
            (void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
                sizeof (tir->mltir_toeplitz_key));
            break;
        }
        switch (i) {
        case MLXCX_TIR_ROLE_OTHER:
            break;
        case MLXCX_TIR_ROLE_IPv4:
        case MLXCX_TIR_ROLE_TCPv4:
        case MLXCX_TIR_ROLE_UDPv4:
            tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
            tir->mltir_hash_fields =
                MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
            break;
        case MLXCX_TIR_ROLE_IPv6:
        case MLXCX_TIR_ROLE_TCPv6:
        case MLXCX_TIR_ROLE_UDPv6:
            tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
            tir->mltir_hash_fields =
                MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
            break;
        }
        switch (i) {
        case MLXCX_TIR_ROLE_OTHER:
        case MLXCX_TIR_ROLE_IPv4:
        case MLXCX_TIR_ROLE_IPv6:
            break;
        case MLXCX_TIR_ROLE_TCPv4:
        case MLXCX_TIR_ROLE_TCPv6:
            tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
            tir->mltir_hash_fields |=
                MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
            break;
        case MLXCX_TIR_ROLE_UDPv4:
        case MLXCX_TIR_ROLE_UDPv6:
            tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
            tir->mltir_hash_fields |=
                MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
            break;
        }

        if (!mlxcx_cmd_create_tir(mlxp, tir)) {
            mutex_exit(&g->mlg_mtx);
            return (B_FALSE);
        }

        g->mlg_state |= MLXCX_GROUP_TIRTIS;
    }

    /*
     * Flow table: our RX hashing breakout table for RSS
     */

    g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
        KM_SLEEP));
    mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));
    avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
        sizeof (mlxcx_group_mac_t),
        offsetof(mlxcx_group_mac_t, mlgm_group_entry));
    g->mlg_state |= MLXCX_GROUP_FLOWS;

    mutex_enter(&ft->mlft_mtx);

    ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
    ft->mlft_level = 2;
    ft->mlft_port = g->mlg_port;
    ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
    ft->mlft_nents = (1 << ft->mlft_entshift);
    ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
    ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
    ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
    list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
        offsetof(mlxcx_flow_group_t, mlfg_entry));

    for (j = 0; j < ft->mlft_nents; ++j) {
        ft->mlft_ent[j].mlfe_table = ft;
        ft->mlft_ent[j].mlfe_index = j;
    }

    if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
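
    /*
     * The flow groups below carve this hash table up into single-entry
     * rules, created from most to least specific: one per (IP version,
     * L4 protocol) pair forwarding to the matching TCP/UDP TIR, then one
     * per IP version alone forwarding to the IPv4/IPv6 TIRs, and finally
     * a catch-all entry with no match criteria which forwards to the
     * "other" (direct) TIR.
     */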
    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ip_version = 6;
    fe->mlfe_ip_proto = IPPROTO_UDP;
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ip_version = 4;
    fe->mlfe_ip_proto = IPPROTO_UDP;
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ip_version = 6;
    fe->mlfe_ip_proto = IPPROTO_TCP;
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ip_version = 4;
    fe->mlfe_ip_proto = IPPROTO_TCP;
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ip_version = 6;
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ip_version = 4;
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
    fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
        &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    mutex_exit(&ft->mlft_mtx);

    /*
     * Flow table: the VLAN breakout table for doing VLAN filtering after
     * we've matched a MAC address.
     */

    g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
        KM_SLEEP));
    mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));
    list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
        offsetof(mlxcx_group_vlan_t, mlgv_entry));

    mutex_enter(&ft->mlft_mtx);

    ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
    ft->mlft_level = 1;
    ft->mlft_port = g->mlg_port;
    ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
    ft->mlft_nents = (1 << ft->mlft_entshift);
    ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
    ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
    list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
        offsetof(mlxcx_flow_group_t, mlfg_entry));

    for (j = 0; j < ft->mlft_nents; ++j) {
        fe = &ft->mlft_ent[j];
        fe->mlfe_table = ft;
        fe->mlfe_index = j;
        fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
        fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
    }

    if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    /* First group is all actual matched VLANs */
    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    g->mlg_rx_vlan_fg = fg;
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = ft->mlft_nents - 2;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
    fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    /*
     * Then the "default" entry which we enable when we have no VLAN IDs
     * added to the group (we start with this enabled).
     */
    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    g->mlg_rx_vlan_def_fg = fg;
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    /*
     * Finally, the promisc entry which points at the *hash ft* from the
     * default group. We only enable this when we have promisc on.
     */
    fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
    g->mlg_rx_vlan_promisc_fg = fg;
    list_insert_tail(&ft->mlft_groups, fg);
    fg->mlfg_table = ft;
    fg->mlfg_size = 1;
    if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
        mutex_exit(&ft->mlft_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    fe = list_head(&fg->mlfg_entries);
    fe->mlfe_ndest = 1;
    fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;

    mutex_exit(&ft->mlft_mtx);

    mutex_exit(&g->mlg_mtx);

    return (B_TRUE);
}

boolean_t
mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    mlxcx_work_queue_t *rq)
{
    uint_t j;
    mlxcx_buffer_t *b;
    mlxcx_completion_queue_t *cq;

    mutex_enter(&g->mlg_mtx);
    /*
     * Sadly, even though MAC has the mgi_start callback, it is not always
     * called -- in particular when we are being managed under an aggr, the
     * mgi_start callback will only ever be called on the default group.
     *
     * So instead of asserting about the group state here, we have to
     * check it and call group start if needed.
     */
    if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
        mutex_exit(&g->mlg_mtx);
        if (!mlxcx_rx_group_start(mlxp, g))
            return (B_FALSE);
        mutex_enter(&g->mlg_mtx);
    }
    ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);

    cq = rq->mlwq_cq;
    ASSERT(cq != NULL);

    mutex_enter(&cq->mlcq_mtx);
    mutex_enter(&rq->mlwq_mtx);

    if (rq->mlwq_state & MLXCX_WQ_STARTED) {
        mutex_exit(&rq->mlwq_mtx);
        mutex_exit(&cq->mlcq_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_TRUE);
    }

    if (!mlxcx_cmd_start_rq(mlxp, rq)) {
        mutex_exit(&rq->mlwq_mtx);
        mutex_exit(&cq->mlcq_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);

    ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
    rq->mlwq_state |= MLXCX_WQ_BUFFERS;

    for (j = 0; j < rq->mlwq_nents; ++j) {
        if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
            break;
        mlxcx_buf_return(mlxp, b);
    }
    for (j = 0; j < rq->mlwq_nents / 2; ++j) {
        if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
            break;
        mlxcx_buf_return(mlxp, b);
    }

    mlxcx_rq_refill(mlxp, rq);

    mutex_exit(&rq->mlwq_mtx);
    mutex_exit(&cq->mlcq_mtx);
    mutex_exit(&g->mlg_mtx);

    return (B_TRUE);
}

boolean_t
mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
    mlxcx_flow_table_t *ft;
    mlxcx_flow_group_t *fg;
    mlxcx_flow_entry_t *fe;
    char tq_name[TASKQ_NAMELEN];

    mutex_enter(&g->mlg_mtx);

    if (g->mlg_state & MLXCX_GROUP_RUNNING) {
        mutex_exit(&g->mlg_mtx);
        return (B_TRUE);
    }

    ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);

    g->mlg_state |= MLXCX_GROUP_RUNNING;

    (void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
        ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
        g - &mlxp->mlx_rx_groups[0]);

    /*
     * Create one refill taskq per group with one thread per work queue.
     * The refill task may block waiting for resources, so by effectively
     * having one thread per work queue we avoid work queues blocking each
     * other.
     */
    if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
        g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
        mlxcx_warn(mlxp, "failed to create rq refill task queue");
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    if (g == &mlxp->mlx_rx_groups[0]) {
        ft = g->mlg_port->mlp_rx_flow;
        mutex_enter(&ft->mlft_mtx);

        /*
         * Broadcast and promisc entries go directly to group 0's
         * RSS hash fanout flow table. They bypass VLAN filtering.
         */
        fg = g->mlg_port->mlp_bcast;
        fe = list_head(&fg->mlfg_entries);
        fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
        if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
            mutex_exit(&ft->mlft_mtx);
            g->mlg_state &= ~MLXCX_GROUP_RUNNING;
            taskq_destroy(g->mlg_refill_tq);
            mutex_exit(&g->mlg_mtx);
            return (B_FALSE);
        }

        fg = g->mlg_port->mlp_promisc;
        fe = list_head(&fg->mlfg_entries);
        fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
        /*
         * Don't actually set the promisc entry until promisc is
         * enabled.
         */

        mutex_exit(&ft->mlft_mtx);
    }

    mutex_exit(&g->mlg_mtx);

    return (B_TRUE);
}

boolean_t
mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
    mlxcx_event_queue_t *eq;
    mlxcx_completion_queue_t *cq;
    mlxcx_work_queue_t *sq;
    uint_t i;

    ASSERT3S(g->mlg_state, ==, 0);

    mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
        DDI_INTR_PRI(mlxp->mlx_intr_pri));
    g->mlg_state |= MLXCX_GROUP_INIT;
    mutex_enter(&g->mlg_mtx);

    g->mlg_mlx = mlxp;
    g->mlg_type = MLXCX_GROUP_TX;
    g->mlg_port = &mlxp->mlx_ports[0];

    g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
    g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
    g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
    g->mlg_state |= MLXCX_GROUP_WQS;

    g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;

    if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }

    g->mlg_state |= MLXCX_GROUP_TIRTIS;

    for (i = 0; i < g->mlg_nwqs; ++i) {
        eq = NULL;
        while (eq == NULL) {
            eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
            if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
                mlxp->mlx_next_eq = 1;
            if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
                eq->mleq_type != MLXCX_EQ_TYPE_TX) {
                /* Try the next one */
                eq = NULL;
            }
        }

        if (!mlxcx_cq_setup(mlxp, eq, &cq,
            mlxp->mlx_props.mldp_cq_size_shift))
            return (B_FALSE);

        cq->mlcq_stats = &g->mlg_port->mlp_stats;

        sq = &g->mlg_wqs[i];
        if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
            mutex_exit(&g->mlg_mtx);
            return (B_FALSE);
        }
        sq->mlwq_group = g;
    }

    mutex_exit(&g->mlg_mtx);

    return (B_TRUE);
}

boolean_t
mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    mlxcx_work_queue_t *sq)
{
    uint_t i;
    mlxcx_buffer_t *b;
    mlxcx_completion_queue_t *cq;

    mutex_enter(&g->mlg_mtx);

    cq = sq->mlwq_cq;
    ASSERT(cq != NULL);

    mutex_enter(&cq->mlcq_mtx);
    mutex_enter(&sq->mlwq_mtx);
    if (sq->mlwq_state & MLXCX_WQ_STARTED) {
        mutex_exit(&sq->mlwq_mtx);
        mutex_exit(&cq->mlcq_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_TRUE);
    }

    ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
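
    /*
     * Pre-create the buffers this ring will use before starting it:
     * around 1.5x the ring size in foreign buffers (created by
     * mlxcx_buf_create_foreign() below, with no data memory of their
     * own) plus a full ring's worth of ordinary buffers that do own
     * DMA memory. Each one is handed straight to mlxcx_buf_return(),
     * making it available on the shard's free list.
     */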
    for (i = 0; i < sq->mlwq_nents; ++i) {
        if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
            break;
        mlxcx_buf_return(mlxp, b);
    }
    for (i = 0; i < sq->mlwq_nents / 2; ++i) {
        if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
            break;
        mlxcx_buf_return(mlxp, b);
    }
    for (i = 0; i < sq->mlwq_nents; ++i) {
        if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
            break;
        mlxcx_buf_return(mlxp, b);
    }
    sq->mlwq_state |= MLXCX_WQ_BUFFERS;

    if (!mlxcx_cmd_start_sq(mlxp, sq)) {
        mutex_exit(&sq->mlwq_mtx);
        mutex_exit(&cq->mlcq_mtx);
        mutex_exit(&g->mlg_mtx);
        return (B_FALSE);
    }
    g->mlg_state |= MLXCX_GROUP_RUNNING;

    (void) mlxcx_sq_add_nop(mlxp, sq);

    mutex_exit(&sq->mlwq_mtx);
    mutex_exit(&cq->mlcq_mtx);
    mutex_exit(&g->mlg_mtx);

    return (B_TRUE);
}

static boolean_t
mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
{
    uint_t idx;
    mlxcx_bf_t *bf;
    ddi_fm_error_t err;
    uint_t try = 0;

    ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
    ASSERT(mutex_owned(&mlwq->mlwq_mtx));

    mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);

    ASSERT(mlwq->mlwq_cq != NULL);
    ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
    idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
    bf = &mlwq->mlwq_uar->mlu_bf[idx];

retry:
    MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
    ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
        DDI_FME_VERSION);
    if (err.fme_status != DDI_FM_OK) {
        if (try++ < mlxcx_doorbell_tries) {
            ddi_fm_dma_err_clear(
                mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
                DDI_FME_VERSION);
            goto retry;
        } else {
            goto err;
        }
    }

    mlxcx_put64(mlxp, bf->mbf_even, from_be64(
        mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
    ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
        DDI_FME_VERSION);
    if (err.fme_status == DDI_FM_OK)
        return (B_TRUE);
    if (try++ < mlxcx_doorbell_tries) {
        ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
        goto retry;
    }

err:
    ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
    return (B_FALSE);
}

boolean_t
mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
    uint_t index, start_pc;
    mlxcx_sendq_ent_t *ent0;
    ddi_fm_error_t err;

    ASSERT(mutex_owned(&mlwq->mlwq_mtx));

    index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
    ent0 = &mlwq->mlwq_send_ent[index];
    start_pc = mlwq->mlwq_pc;
    ++mlwq->mlwq_pc;
    /*
     * This counter is manipulated in the interrupt handler, which
     * does not hold the mlwq_mtx, hence the atomic.
     */
    atomic_inc_64(&mlwq->mlwq_wqebb_used);

    bzero(ent0, sizeof (mlxcx_sendq_ent_t));
    ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
    ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
    ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);

    set_bits8(&ent0->mlsqe_control.mlcs_flags,
        MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
    set_bits8(&ent0->mlsqe_control.mlcs_flags,
        MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

    ent0->mlsqe_control.mlcs_ds = 1;

    VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
        (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
        sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
    ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
        DDI_FME_VERSION);
    if (err.fme_status != DDI_FM_OK) {
        return (B_FALSE);
    }
    if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
        return (B_FALSE);
    }
    return (B_TRUE);
}

boolean_t
mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
    mlxcx_buffer_t *b0)
{
    uint_t index, first, ents;
    mlxcx_completion_queue_t *cq;
    mlxcx_sendq_ent_t *ent0;
    mlxcx_sendq_extra_ent_t *ent;
    mlxcx_wqe_data_seg_t *seg;
    uint_t ptri, nptr;
    const ddi_dma_cookie_t *c;
    size_t rem;
    uint64_t wqebb_used;
    mlxcx_buffer_t *b;
    ddi_fm_error_t err;
    boolean_t rv;

    ASSERT(mutex_owned(&mlwq->mlwq_mtx));
    ASSERT3P(b0->mlb_tx_head, ==, b0);
    ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
    cq = mlwq->mlwq_cq;

    index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
    ent0 = &mlwq->mlwq_send_ent[index];
    b0->mlb_wqe_index = mlwq->mlwq_pc;
    ents = 1;

    first = index;

    bzero(ent0, sizeof (mlxcx_sendq_ent_t));
    ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
    ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
    ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);

    set_bits8(&ent0->mlsqe_control.mlcs_flags,
        MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
    set_bits8(&ent0->mlsqe_control.mlcs_flags,
        MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

    VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
    set_bits16(&ent0->mlsqe_eth.mles_szflags,
        MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
    if (inlinelen > 0) {
        bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
            inlinelen);
    }

    ent0->mlsqe_control.mlcs_ds =
        offsetof(mlxcx_sendq_ent_t, mlsqe_data) / 16;

    if (chkflags & HCK_IPV4_HDRCKSUM) {
        ASSERT(mlxp->mlx_caps->mlc_checksum);
        set_bit8(&ent0->mlsqe_eth.mles_csflags,
            MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
    }
    if (chkflags & HCK_FULLCKSUM) {
        ASSERT(mlxp->mlx_caps->mlc_checksum);
        set_bit8(&ent0->mlsqe_eth.mles_csflags,
            MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
    }

    /*
     * mlwq_wqebb_used is only incremented whilst holding
     * the mlwq_mtx mutex, but it is decremented (atomically) in
     * the interrupt context *not* under mlwq_mtx mutex.
     * So, now take a snapshot of the number of used wqes which will
     * be a consistent maximum we can use whilst iterating through
     * the buffers and DMA cookies.
     */
    wqebb_used = mlwq->mlwq_wqebb_used;
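
    /*
     * Walk the chain of buffers, turning each DMA cookie into a data
     * segment. The first WQE only has room for the few segments left
     * after its control and eth segments; once those are used we spill
     * over into extra WQE blocks, provided the wqebb_used snapshot above
     * says the ring still has room for them.
     */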
    b = b0;
    ptri = 0;
    nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
    seg = ent0->mlsqe_data;
    while (b != NULL) {
        rem = b->mlb_used;

        c = NULL;
        while (rem > 0 &&
            (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
            if (ptri >= nptr) {
                if ((ents + wqebb_used) >= mlwq->mlwq_nents)
                    return (B_FALSE);

                index = (mlwq->mlwq_pc + ents) &
                    (mlwq->mlwq_nents - 1);
                ent = &mlwq->mlwq_send_extra_ent[index];
                ++ents;

                seg = ent->mlsqe_data;
                ptri = 0;
                nptr = sizeof (ent->mlsqe_data) /
                    sizeof (mlxcx_wqe_data_seg_t);
            }

            seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
            if (c->dmac_size > rem) {
                seg->mlds_byte_count = to_be32(rem);
                rem = 0;
            } else {
                seg->mlds_byte_count = to_be32(c->dmac_size);
                rem -= c->dmac_size;
            }
            seg->mlds_address = to_be64(c->dmac_laddress);
            ++seg;
            ++ptri;
            ++ent0->mlsqe_control.mlcs_ds;

            ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
                MLXCX_SQE_MAX_DS);
        }

        if (b == b0) {
            b = list_head(&b0->mlb_tx_chain);
        } else {
            b = list_next(&b0->mlb_tx_chain, b);
        }
    }

    b0->mlb_wqebbs = ents;
    mlwq->mlwq_pc += ents;
    atomic_add_64(&mlwq->mlwq_wqebb_used, ents);

    for (; ptri < nptr; ++ptri, ++seg) {
        seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
        seg->mlds_byte_count = to_be32(0);
        seg->mlds_address = to_be64(0);
    }

    /*
     * Make sure the workqueue entry is flushed out before updating
     * the doorbell.
     */
    VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
        (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
        ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
    ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
        DDI_FME_VERSION);
    if (err.fme_status != DDI_FM_OK) {
        return (B_FALSE);
    }

    /*
     * Hold the bufmtx whilst ringing the doorbell, to prevent
     * the buffer from being moved to another list, so we can
     * safely remove it should the ring fail.
     */
    mutex_enter(&cq->mlcq_bufbmtx);

    list_insert_tail(&cq->mlcq_buffers_b, b0);
    if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
        atomic_inc_64(&cq->mlcq_bufcnt);
    } else {
        list_remove(&cq->mlcq_buffers_b, b0);
    }

    mutex_exit(&cq->mlcq_bufbmtx);

    return (rv);
}

boolean_t
mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    mlxcx_buffer_t *buf)
{
    return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
}

boolean_t
mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    mlxcx_buffer_t **bufs, size_t nbufs)
{
    uint_t index;
    mlxcx_recvq_ent_t *ent;
    mlxcx_completion_queue_t *cq;
    mlxcx_wqe_data_seg_t *seg;
    uint_t bi, ptri;
    const ddi_dma_cookie_t *c;
    mlxcx_buffer_t *buf;
    ddi_fm_error_t err;

    ASSERT(mutex_owned(&mlwq->mlwq_mtx));
    cq = mlwq->mlwq_cq;
    ASSERT(mutex_owned(&cq->mlcq_mtx));

    for (bi = 0; bi < nbufs; ++bi) {
        buf = bufs[bi];
        bufs[bi] = NULL;
        ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);

        index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
        ent = &mlwq->mlwq_recv_ent[index];
        buf->mlb_wqe_index = mlwq->mlwq_pc;
        buf->mlb_wqebbs = 1;

        ++mlwq->mlwq_pc;
        atomic_inc_64(&mlwq->mlwq_wqebb_used);

        mutex_enter(&cq->mlcq_bufbmtx);
        list_insert_tail(&cq->mlcq_buffers, buf);
        atomic_inc_64(&cq->mlcq_bufcnt);
        mutex_exit(&cq->mlcq_bufbmtx);

        ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
        ptri = 0;
        c = NULL;
        while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
            seg = &ent->mlrqe_data[ptri++];
            seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
            seg->mlds_byte_count = to_be32(c->dmac_size);
            seg->mlds_address = to_be64(c->dmac_laddress);
        }
        /*
         * Fill any unused scatter pointers with the special null
         * value.
         */
        for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
            seg = &ent->mlrqe_data[ptri];
            seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
            seg->mlds_byte_count = to_be32(0);
            seg->mlds_address = to_be64(0);
        }

        /*
         * Make sure the workqueue entry is flushed out before updating
         * the doorbell.
         */
        VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
            (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
            sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
        ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
            DDI_FME_VERSION);
        if (err.fme_status != DDI_FM_OK) {
            return (B_FALSE);
        }
    }

    mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
    /*
     * Flush the CQ doorbell as well so that HW knows how many
     * completions we've consumed.
     */
    MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
    ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
        DDI_FME_VERSION);
    if (err.fme_status != DDI_FM_OK) {
        return (B_FALSE);
    }
    MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
    ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
        DDI_FME_VERSION);
    if (err.fme_status != DDI_FM_OK) {
        return (B_FALSE);
    }
    return (B_TRUE);
}
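
/*
 * Taskq worker used to top a receive queue back up when mlxcx_rq_refill()
 * could not get buffers inline. It sleeps on the shard's free-list CV until
 * buffers are returned, refills, and repeats until either the CQ has a
 * reasonable number of buffers outstanding again or the CQ is being torn
 * down.
 */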
static void
mlxcx_rq_refill_task(void *arg)
{
    mlxcx_work_queue_t *wq = arg;
    mlxcx_completion_queue_t *cq = wq->mlwq_cq;
    mlxcx_t *mlxp = wq->mlwq_mlx;
    mlxcx_buf_shard_t *s = wq->mlwq_bufs;
    boolean_t refill;

    do {
        /*
         * Wait until there are some free buffers.
         */
        mutex_enter(&s->mlbs_mtx);
        while (list_is_empty(&s->mlbs_free) &&
            (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0)
            cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
        mutex_exit(&s->mlbs_mtx);

        mutex_enter(&cq->mlcq_mtx);
        mutex_enter(&wq->mlwq_mtx);

        if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
            refill = B_FALSE;
            wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
        } else {
            mlxcx_rq_refill(mlxp, wq);

            if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
                refill = B_TRUE;
            } else {
                refill = B_FALSE;
                wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
            }
        }

        mutex_exit(&wq->mlwq_mtx);
        mutex_exit(&cq->mlcq_mtx);
    } while (refill);
}

void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
    size_t target, current, want, done, n;
    mlxcx_completion_queue_t *cq;
    mlxcx_ring_group_t *g;
    mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
    uint_t i;

    ASSERT(mutex_owned(&mlwq->mlwq_mtx));
    cq = mlwq->mlwq_cq;
    ASSERT(mutex_owned(&cq->mlcq_mtx));

    ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);

    target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
    cq = mlwq->mlwq_cq;

    if (cq->mlcq_state & MLXCX_CQ_TEARDOWN)
        return;

    current = cq->mlcq_bufcnt;

    if (current >= target - MLXCX_RQ_REFILL_STEP)
        return;

    want = target - current;
    done = 0;

    while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
        n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
        if (n == 0) {
            /*
             * We didn't get any buffers from the free queue.
             * It might not be an issue; schedule a taskq
             * to wait for free buffers if the completion
             * queue is low.
             */
            if (current < MLXCX_RQ_REFILL_STEP &&
                (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
                mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
                g = mlwq->mlwq_group;
                taskq_dispatch_ent(g->mlg_refill_tq,
                    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
                    &mlwq->mlwq_tqe);
            }

            return;
        }

        if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) {
            for (i = 0; i < n; ++i)
                mlxcx_buf_return(mlxp, b[i]);
            return;
        }
        if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
            /*
             * mlxcx_rq_add_buffers NULLs out the buffers as it
             * enqueues them, so any that are non-NULL we have to
             * free now. The others now belong to the WQ, even if
             * we failed.
             */
            for (i = 0; i < n; ++i) {
                if (b[i] != NULL) {
                    mlxcx_buf_return(mlxp, b[i]);
                }
            }
            return;
        }
        done += n;
    }
}

static const char *
mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
{
    switch (sy) {
    case MLXCX_CQ_ERR_LOCAL_LENGTH:
        return ("LOCAL_LENGTH");
    case MLXCX_CQ_ERR_LOCAL_QP_OP:
        return ("LOCAL_QP_OP");
    case MLXCX_CQ_ERR_LOCAL_PROTECTION:
        return ("LOCAL_PROTECTION");
    case MLXCX_CQ_ERR_WR_FLUSHED:
        return ("WR_FLUSHED");
    case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
        return ("MEM_WINDOW_BIND");
    case MLXCX_CQ_ERR_BAD_RESPONSE:
        return ("BAD_RESPONSE");
    case MLXCX_CQ_ERR_LOCAL_ACCESS:
        return ("LOCAL_ACCESS");
    case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
        return ("XPORT_RETRY_CTR");
    case MLXCX_CQ_ERR_RNR_RETRY_CTR:
        return ("RNR_RETRY_CTR");
    case MLXCX_CQ_ERR_ABORTED:
        return ("ABORTED");
    default:
        return ("UNKNOWN");
    }
}

static void
mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_error_ent_t *ent)
{
    uint64_t ena;
    char buf[FM_MAX_CLASS];
    const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);

    if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
        return;

    (void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
        MLXCX_FM_SERVICE_MLXCX, "cqe.err");
    ena = fm_ena_generate(0, FM_ENA_FMT1);

    ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
        FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
        "syndrome", DATA_TYPE_STRING, name,
        "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
        "vendor_syndrome", DATA_TYPE_UINT8,
        ent->mlcqee_vendor_error_syndrome,
        "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
        "wq_type", DATA_TYPE_STRING,
        (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ?
"send": "recv", 1948 "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num, 1949 "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num, 1950 NULL); 1951 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1952 } 1953 1954 void 1955 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 1956 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) 1957 { 1958 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 1959 if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) { 1960 mlxcx_completionq_error_ent_t *eent = 1961 (mlxcx_completionq_error_ent_t *)ent; 1962 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); 1963 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 1964 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); 1965 mlxcx_check_sq(mlxp, mlcq->mlcq_wq); 1966 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); 1967 return; 1968 } 1969 1970 if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) { 1971 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); 1972 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 1973 return; 1974 } 1975 1976 if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) { 1977 mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x", 1978 ent->mlcqe_send_wqe_opcode); 1979 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 1980 return; 1981 } 1982 1983 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { 1984 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); 1985 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 1986 return; 1987 } 1988 1989 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 1990 } 1991 1992 mblk_t * 1993 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 1994 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) 1995 { 1996 uint32_t chkflags = 0; 1997 uint_t wqe_index; 1998 ddi_fm_error_t err; 1999 2000 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 2001 2002 if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) { 2003 mlxcx_completionq_error_ent_t *eent = 2004 (mlxcx_completionq_error_ent_t *)ent; 2005 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); 2006 mlxcx_buf_return(mlxp, buf); 2007 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); 2008 mlxcx_check_rq(mlxp, mlcq->mlcq_wq); 2009 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); 2010 return (NULL); 2011 } 2012 2013 if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) { 2014 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); 2015 mlxcx_buf_return(mlxp, buf); 2016 return (NULL); 2017 } 2018 2019 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { 2020 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); 2021 mlxcx_buf_return(mlxp, buf); 2022 return (NULL); 2023 } 2024 2025 if (ent->mlcqe_rx_drop_counter > 0) { 2026 atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops, 2027 ent->mlcqe_rx_drop_counter); 2028 } 2029 2030 MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU); 2031 ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err, 2032 DDI_FME_VERSION); 2033 if (err.fme_status != DDI_FM_OK) { 2034 ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle, 2035 DDI_FME_VERSION); 2036 mlxcx_buf_return(mlxp, buf); 2037 return (NULL); 2038 } 2039 2040 /* 2041 * mlxcx_buf_loan() will set mlb_wqe_index to zero. 2042 * Remember it for later. 
2043 */ 2044 wqe_index = buf->mlb_wqe_index; 2045 2046 if (!mlxcx_buf_loan(mlxp, buf)) { 2047 mlxcx_warn(mlxp, "!loan failed, dropping packet"); 2048 mlxcx_buf_return(mlxp, buf); 2049 return (NULL); 2050 } 2051 2052 buf->mlb_mp->b_next = NULL; 2053 buf->mlb_mp->b_cont = NULL; 2054 buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr + 2055 from_be32(ent->mlcqe_byte_cnt); 2056 2057 if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) { 2058 chkflags |= HCK_FULLCKSUM_OK; 2059 } 2060 if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) { 2061 chkflags |= HCK_IPV4_HDRCKSUM_OK; 2062 } 2063 if (chkflags != 0) { 2064 mac_hcksum_set(buf->mlb_mp, 0, 0, 0, 2065 from_be16(ent->mlcqe_checksum), chkflags); 2066 } 2067 2068 /* 2069 * Don't check if a refill is needed on every single completion, 2070 * since checking involves taking the RQ lock. 2071 */ 2072 if ((wqe_index & 0x7) == 0) { 2073 mlxcx_work_queue_t *wq = mlcq->mlcq_wq; 2074 ASSERT(wq != NULL); 2075 mutex_enter(&wq->mlwq_mtx); 2076 if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN)) 2077 mlxcx_rq_refill(mlxp, wq); 2078 mutex_exit(&wq->mlwq_mtx); 2079 } 2080 2081 return (buf->mlb_mp); 2082 } 2083 2084 static void 2085 mlxcx_buf_mp_return(caddr_t arg) 2086 { 2087 mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg; 2088 mlxcx_t *mlxp = b->mlb_mlx; 2089 2090 if (b->mlb_state != MLXCX_BUFFER_ON_LOAN) { 2091 b->mlb_mp = NULL; 2092 return; 2093 } 2094 /* 2095 * The mblk for this buffer_t (in its mlb_mp field) has been used now, 2096 * so NULL it out. 2097 */ 2098 b->mlb_mp = NULL; 2099 mlxcx_buf_return(mlxp, b); 2100 } 2101 2102 boolean_t 2103 mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp) 2104 { 2105 mlxcx_buffer_t *b; 2106 ddi_device_acc_attr_t acc; 2107 ddi_dma_attr_t attr; 2108 boolean_t ret; 2109 2110 b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP); 2111 b->mlb_shard = shard; 2112 b->mlb_foreign = B_FALSE; 2113 2114 mlxcx_dma_acc_attr(mlxp, &acc); 2115 mlxcx_dma_buf_attr(mlxp, &attr); 2116 2117 ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc, 2118 B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE); 2119 if (!ret) { 2120 kmem_cache_free(mlxp->mlx_bufs_cache, b); 2121 return (B_FALSE); 2122 } 2123 2124 b->mlb_frtn.free_func = mlxcx_buf_mp_return; 2125 b->mlb_frtn.free_arg = (caddr_t)b; 2126 b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va, 2127 b->mlb_dma.mxdb_len, 0, &b->mlb_frtn); 2128 2129 *bp = b; 2130 2131 return (B_TRUE); 2132 } 2133 2134 boolean_t 2135 mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, 2136 mlxcx_buffer_t **bp) 2137 { 2138 mlxcx_buffer_t *b; 2139 ddi_dma_attr_t attr; 2140 boolean_t ret; 2141 2142 b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP); 2143 b->mlb_shard = shard; 2144 b->mlb_foreign = B_TRUE; 2145 2146 mlxcx_dma_buf_attr(mlxp, &attr); 2147 2148 ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE); 2149 if (!ret) { 2150 kmem_cache_free(mlxp->mlx_bufs_cache, b); 2151 return (B_FALSE); 2152 } 2153 2154 *bp = b; 2155 2156 return (B_TRUE); 2157 } 2158 2159 static mlxcx_buffer_t * 2160 mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) 2161 { 2162 mlxcx_buffer_t *b; 2163 mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs; 2164 2165 mutex_enter(&s->mlbs_mtx); 2166 if ((b = list_remove_head(&s->mlbs_free)) != NULL) { 2167 ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); 2168 ASSERT(b->mlb_foreign); 2169 b->mlb_state = MLXCX_BUFFER_ON_WQ; 2170 list_insert_tail(&s->mlbs_busy, b); 2171 } 2172 mutex_exit(&s->mlbs_mtx); 2173 2174 return (b); 2175 } 2176 2177 static 
mlxcx_buffer_t * 2178 mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz) 2179 { 2180 ddi_fm_error_t err; 2181 mlxcx_buffer_t *b; 2182 uint_t attempts = 0; 2183 2184 copyb: 2185 if ((b = mlxcx_buf_take(mlxp, wq)) == NULL) 2186 return (NULL); 2187 2188 ASSERT3U(b->mlb_dma.mxdb_len, >=, sz); 2189 bcopy(rptr, b->mlb_dma.mxdb_va, sz); 2190 2191 MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV); 2192 2193 ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err, 2194 DDI_FME_VERSION); 2195 if (err.fme_status != DDI_FM_OK) { 2196 ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle, 2197 DDI_FME_VERSION); 2198 mlxcx_buf_return(mlxp, b); 2199 if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) { 2200 return (NULL); 2201 } 2202 goto copyb; 2203 } 2204 2205 return (b); 2206 } 2207 2208 mlxcx_buffer_t * 2209 mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, 2210 mblk_t *mpb, size_t off) 2211 { 2212 mlxcx_buffer_t *b, *b0 = NULL; 2213 boolean_t first = B_TRUE; 2214 mblk_t *mp; 2215 uint8_t *rptr; 2216 size_t sz; 2217 size_t ncookies = 0; 2218 boolean_t ret; 2219 2220 for (mp = mpb; mp != NULL; mp = mp->b_cont) { 2221 rptr = mp->b_rptr; 2222 sz = MBLKL(mp); 2223 2224 if (off > 0) 2225 ASSERT3U(off, <, sz); 2226 rptr += off; 2227 sz -= off; 2228 2229 if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) { 2230 b = mlxcx_copy_data(mlxp, wq, rptr, sz); 2231 if (b == NULL) 2232 goto failed; 2233 } else { 2234 b = mlxcx_buf_take_foreign(mlxp, wq); 2235 if (b == NULL) 2236 goto failed; 2237 2238 ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, 2239 B_FALSE); 2240 2241 if (!ret) { 2242 mlxcx_buf_return(mlxp, b); 2243 2244 b = mlxcx_copy_data(mlxp, wq, rptr, sz); 2245 if (b == NULL) 2246 goto failed; 2247 } 2248 } 2249 2250 /* 2251 * We might overestimate here when we've copied data, since 2252 * the buffer might be longer than what we copied into it. This 2253 * is safe since it's always wrong in the conservative 2254 * direction (and we will blow up later when we actually 2255 * generate the WQE anyway). 2256 * 2257 * If the assert below ever blows, we'll have to come and fix 2258 * this up so we can transmit these packets. 
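 *
 * A made-up example of the accounting: if a three-mblk chain bound
 * segments needing 4, 2 and 1 cookies, ncookies finishes the loop at
 * 7, and the ASSERT3U() against MLXCX_SQE_MAX_PTRS below holds as long
 * as the limit is at least that large. For a copied segment we add the
 * whole buffer's cookie count even though only sz bytes of it were
 * filled, which is the overestimate described above.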
2259 */ 2260 ncookies += b->mlb_dma.mxdb_ncookies; 2261 2262 if (first) 2263 b0 = b; 2264 2265 if (!first) 2266 b->mlb_state = MLXCX_BUFFER_ON_CHAIN; 2267 2268 b->mlb_tx_mp = mp; 2269 b->mlb_tx_head = b0; 2270 b->mlb_used = sz; 2271 2272 if (!first) 2273 list_insert_tail(&b0->mlb_tx_chain, b); 2274 first = B_FALSE; 2275 off = 0; 2276 } 2277 2278 ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS); 2279 2280 return (b0); 2281 2282 failed: 2283 if (b0 != NULL) 2284 mlxcx_buf_return_chain(mlxp, b0, B_TRUE); 2285 2286 return (NULL); 2287 } 2288 2289 mlxcx_buffer_t * 2290 mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) 2291 { 2292 mlxcx_buffer_t *b; 2293 mlxcx_buf_shard_t *s = wq->mlwq_bufs; 2294 2295 mutex_enter(&s->mlbs_mtx); 2296 if ((b = list_remove_head(&s->mlbs_free)) != NULL) { 2297 ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); 2298 b->mlb_state = MLXCX_BUFFER_ON_WQ; 2299 list_insert_tail(&s->mlbs_busy, b); 2300 } 2301 mutex_exit(&s->mlbs_mtx); 2302 2303 return (b); 2304 } 2305 2306 size_t 2307 mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, 2308 mlxcx_buffer_t **bp, size_t nbufs) 2309 { 2310 mlxcx_buffer_t *b; 2311 size_t done = 0; 2312 mlxcx_buf_shard_t *s; 2313 2314 s = wq->mlwq_bufs; 2315 2316 mutex_enter(&s->mlbs_mtx); 2317 while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) { 2318 ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); 2319 b->mlb_state = MLXCX_BUFFER_ON_WQ; 2320 list_insert_tail(&s->mlbs_busy, b); 2321 bp[done++] = b; 2322 } 2323 mutex_exit(&s->mlbs_mtx); 2324 return (done); 2325 } 2326 2327 boolean_t 2328 mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b) 2329 { 2330 VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ); 2331 ASSERT3P(b->mlb_mlx, ==, mlxp); 2332 2333 if (b->mlb_mp == NULL) { 2334 b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va, 2335 b->mlb_dma.mxdb_len, 0, &b->mlb_frtn); 2336 if (b->mlb_mp == NULL) 2337 return (B_FALSE); 2338 } 2339 2340 b->mlb_state = MLXCX_BUFFER_ON_LOAN; 2341 b->mlb_wqe_index = 0; 2342 return (B_TRUE); 2343 } 2344 2345 void 2346 mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp) 2347 { 2348 mlxcx_buffer_t *b; 2349 2350 if (b0->mlb_tx_head != b0) { 2351 mlxcx_buf_return(mlxp, b0); 2352 return; 2353 } 2354 2355 while ((b = list_head(&b0->mlb_tx_chain)) != NULL) { 2356 mlxcx_buf_return(mlxp, b); 2357 } 2358 if (keepmp) { 2359 b0->mlb_tx_mp = NULL; 2360 b0->mlb_tx_head = NULL; 2361 } 2362 mlxcx_buf_return(mlxp, b0); 2363 } 2364 2365 void 2366 mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) 2367 { 2368 mlxcx_buffer_state_t oldstate = b->mlb_state; 2369 mlxcx_buffer_t *txhead = b->mlb_tx_head; 2370 mlxcx_buf_shard_t *s = b->mlb_shard; 2371 mblk_t *mp = b->mlb_tx_mp; 2372 2373 VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE); 2374 ASSERT3P(b->mlb_mlx, ==, mlxp); 2375 2376 /* 2377 * The mlbs_mtx held below is a heavily contended lock, so it is 2378 * imperative we do as much of the buffer clean up outside the lock 2379 * as is possible. 
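 *
 * Concretely: the field resets and any DMA unbind below happen before
 * mutex_enter(); only the shard list updates and the cv_signal() run
 * under mlbs_mtx; and the freemsg() of a TX chain head is deferred
 * until after mutex_exit(), since it can re-enter this function via
 * mlxcx_buf_mp_return().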
2380 */ 2381 b->mlb_state = MLXCX_BUFFER_FREE; 2382 b->mlb_wqe_index = 0; 2383 b->mlb_tx_head = NULL; 2384 b->mlb_tx_mp = NULL; 2385 b->mlb_used = 0; 2386 b->mlb_wqebbs = 0; 2387 ASSERT(list_is_empty(&b->mlb_tx_chain)); 2388 2389 if (b->mlb_foreign) { 2390 if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) { 2391 mlxcx_dma_unbind(mlxp, &b->mlb_dma); 2392 } 2393 } 2394 2395 mutex_enter(&s->mlbs_mtx); 2396 switch (oldstate) { 2397 case MLXCX_BUFFER_INIT: 2398 break; 2399 case MLXCX_BUFFER_ON_WQ: 2400 list_remove(&s->mlbs_busy, b); 2401 break; 2402 case MLXCX_BUFFER_ON_LOAN: 2403 ASSERT(!b->mlb_foreign); 2404 list_remove(&s->mlbs_busy, b); 2405 break; 2406 case MLXCX_BUFFER_FREE: 2407 VERIFY(0); 2408 break; 2409 case MLXCX_BUFFER_ON_CHAIN: 2410 ASSERT(txhead != NULL); 2411 list_remove(&txhead->mlb_tx_chain, b); 2412 list_remove(&s->mlbs_busy, b); 2413 break; 2414 } 2415 2416 list_insert_tail(&s->mlbs_free, b); 2417 cv_signal(&s->mlbs_free_nonempty); 2418 2419 mutex_exit(&s->mlbs_mtx); 2420 2421 /* 2422 * For TX chain heads, free the mblk_t after we let go of the lock. 2423 * This might be a borrowed buf that we in turn loaned to MAC, in which 2424 * case calling freemsg() on it will re-enter this very function -- so 2425 * we better not be holding the lock! 2426 */ 2427 if (txhead == b) 2428 freemsg(mp); 2429 } 2430 2431 void 2432 mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b) 2433 { 2434 mlxcx_buf_shard_t *s = b->mlb_shard; 2435 VERIFY(b->mlb_state == MLXCX_BUFFER_FREE || 2436 b->mlb_state == MLXCX_BUFFER_INIT); 2437 ASSERT(mutex_owned(&s->mlbs_mtx)); 2438 if (b->mlb_state == MLXCX_BUFFER_FREE) 2439 list_remove(&s->mlbs_free, b); 2440 2441 /* 2442 * This is going back to the kmem cache, so it needs to be set up in 2443 * the same way we expect a new buffer to come out (state INIT, other 2444 * fields NULL'd) 2445 */ 2446 b->mlb_state = MLXCX_BUFFER_INIT; 2447 b->mlb_shard = NULL; 2448 if (b->mlb_mp != NULL) { 2449 freeb(b->mlb_mp); 2450 ASSERT(b->mlb_mp == NULL); 2451 } 2452 mlxcx_dma_free(&b->mlb_dma); 2453 ASSERT(list_is_empty(&b->mlb_tx_chain)); 2454 2455 kmem_cache_free(mlxp->mlx_bufs_cache, b); 2456 } 2457
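
/*
 * The buffer routines above implement a small per-buffer state machine,
 * tracked in mlb_state and in the shard lists mlbs_free/mlbs_busy. The
 * main states are:
 *
 *	FREE	 on mlbs_free; taken via mlxcx_buf_take()/_take_n()
 *	ON_WQ	 on mlbs_busy; posted to a work queue
 *	ON_LOAN	 on mlbs_busy; rx mblk handed up to MAC
 *	ON_CHAIN on mlbs_busy; linked behind a TX chain head
 *
 * Buffers go back to mlbs_free through mlxcx_buf_return(), which also
 * signals mlbs_free_nonempty for any refill task sleeping in
 * mlxcx_rq_refill_task().
 *
 * A minimal, hypothetical sketch of the rx loan path (error handling
 * and the locking shown in mlxcx_rx_completion() are omitted; "wq" is
 * assumed to be a receive work queue owned by the caller):
 *
 *	mlxcx_buffer_t *b;
 *	mblk_t *mp;
 *
 *	b = mlxcx_buf_take(mlxp, wq);		FREE -> ON_WQ
 *	... post b to the RQ and wait for its completion ...
 *	(void) mlxcx_buf_loan(mlxp, b);		ON_WQ -> ON_LOAN
 *	mp = b->mlb_mp;
 *	... pass mp up to MAC; when the stack eventually calls
 *	    freemsg(mp), mlxcx_buf_mp_return() fires and hands the
 *	    buffer back to FREE via mlxcx_buf_return() ...
 *
 * mlxcx_buf_loan() also resets mlb_wqe_index, which is why
 * mlxcx_rx_completion() captures the index before loaning the buffer.
 */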