1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2020, The University of Queensland 14 * Copyright (c) 2018, Joyent, Inc. 15 * Copyright 2020 RackTop Systems, Inc. 16 */ 17 18 /* 19 * Mellanox Connect-X 4/5/6 driver. 20 */ 21 22 #include <sys/modctl.h> 23 #include <sys/conf.h> 24 #include <sys/devops.h> 25 #include <sys/sysmacros.h> 26 #include <sys/atomic.h> 27 #include <sys/cpuvar.h> 28 #include <sys/sdt.h> 29 30 #include <sys/pattr.h> 31 #include <sys/dlpi.h> 32 33 #include <sys/mac_provider.h> 34 35 #include <sys/random.h> 36 37 #include <mlxcx.h> 38 39 boolean_t 40 mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) 41 { 42 ddi_device_acc_attr_t acc; 43 ddi_dma_attr_t attr; 44 boolean_t ret; 45 size_t sz; 46 47 VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC); 48 49 /* Receive and send queue entries might be different sizes. */ 50 switch (mlwq->mlwq_type) { 51 case MLXCX_WQ_TYPE_SENDQ: 52 mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift; 53 mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift); 54 sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t); 55 break; 56 case MLXCX_WQ_TYPE_RECVQ: 57 mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift; 58 mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift); 59 sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t); 60 break; 61 default: 62 VERIFY(0); 63 return (B_FALSE); 64 } 65 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 66 67 mlxcx_dma_acc_attr(mlxp, &acc); 68 mlxcx_dma_queue_attr(mlxp, &attr); 69 70 ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc, 71 B_TRUE, sz, B_TRUE); 72 if (!ret) { 73 mlxcx_warn(mlxp, "failed to allocate WQ memory"); 74 return (B_FALSE); 75 } 76 77 /* 78 * Just set the first pointer in the union. Yes, this is a strict 79 * aliasing violation. No, I don't care. 
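 * (The queue entry pointers share a union over the same DMA mapping;
 * which member is meaningful follows mlwq_type, so a receive queue
 * simply reads the same buffer through mlwq_recv_ent instead.)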
 */
	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_workq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
		mlxcx_dma_free(&mlwq->mlwq_dma);
		mlwq->mlwq_send_ent = NULL;
		return (B_FALSE);
	}

	mlwq->mlwq_doorbell =
	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;

	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;

	return (B_TRUE);
}

void
mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);

	mlxcx_dma_free(&mlwq->mlwq_dma);
	mlwq->mlwq_send_ent = NULL;
	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
	mlwq->mlwq_doorbell = NULL;

	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
}

static boolean_t
mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    uint_t ent_shift)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);

	mlcq->mlcq_entshift = ent_shift;
	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ memory");
		return (B_FALSE);
	}

	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;

	for (i = 0; i < mlcq->mlcq_nents; ++i) {
		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
	}

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_completionq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
		mlxcx_dma_free(&mlcq->mlcq_dma);
		mlcq->mlcq_ent = NULL;
		return (B_FALSE);
	}

	mlcq->mlcq_doorbell =
	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;

	mlcq->mlcq_state |= MLXCX_CQ_ALLOC;

	return (B_TRUE);
}

static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	mlxcx_dma_free(&mlcq->mlcq_dma);
	mlcq->mlcq_ent = NULL;
	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
	mlcq->mlcq_doorbell = NULL;

	mlcq->mlcq_state &= ~MLXCX_CQ_ALLOC;
}

void
mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	mlxcx_completion_queue_t *mlcq;

	/*
	 * If something is holding the lock on a long operation like a
	 * refill, setting this flag asks them to exit early if possible.
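	 * Holders are expected to re-check the flag between steps, roughly:
	 *
	 *	if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN)
	 *		return;
	 *
	 * as mlxcx_rq_refill() does on each pass of its refill loop.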
	 */
	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);

	mutex_enter(&mlwq->mlwq_mtx);

	list_remove(&mlxp->mlx_wqs, mlwq);

	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "send queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "send queue num %x", mlwq->mlwq_num);
		}
	}
	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
		mlxcx_wq_rele_dma(mlxp, mlwq);
	}
	mlcq = mlwq->mlwq_cq;

	/* These will be released by mlxcx_teardown_bufs() */
	mlwq->mlwq_bufs = NULL;
	mlwq->mlwq_foreign_bufs = NULL;

	mutex_exit(&mlwq->mlwq_mtx);

	mutex_enter(&mlcq->mlcq_mtx);
	mutex_enter(&mlwq->mlwq_mtx);
	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
	mlcq->mlcq_wq = NULL;
	mutex_exit(&mlwq->mlwq_mtx);
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlwq->mlwq_mtx);
}

void
mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	mlxcx_event_queue_t *mleq;
	mlxcx_buffer_t *b;

	/*
	 * If something is holding the lock on a long operation like polling
	 * which we're going to abort anyway, this flag asks them to exit
	 * early if possible.
	 */
	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);

	mutex_enter(&mlcq->mlcq_mtx);

	list_remove(&mlxp->mlx_cqs, mlcq);

	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "completion queue num %u",
			    mlcq->mlcq_num);
		}
	}
	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
		mlxcx_cq_rele_dma(mlxp, mlcq);
	}
	/*
	 * If we're on an EQ AVL tree, then we need to grab
	 * the EQ's mutex to take it off. The ISR always takes
	 * EQ mutex before CQ mutex, so we have to let go of
	 * the CQ mutex then come back again.
	 *
	 * The ISR will bail out if it tries to touch this CQ now since
	 * we added the CQ_DESTROYED flag above.
	 */
	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
		mleq = mlcq->mlcq_eq;
	} else {
		mleq = NULL;
	}

	/* Return any outstanding buffers to the free pool. */
	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_enter(&mlcq->mlcq_bufbmtx);
	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_exit(&mlcq->mlcq_bufbmtx);

	/*
	 * Since the interrupt handlers take the EQ lock before the CQ one,
	 * we must do the same here. That means letting go of the lock
	 * for a brief window here (we'll double-check the state when we
	 * get back in).
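	 * In other words, the sequence below is, in effect:
	 *
	 *	mutex_exit(&mlcq->mlcq_mtx);
	 *	mutex_enter(&mleq->mleq_mtx);
	 *	mutex_enter(&mlcq->mlcq_mtx);
	 *	... remove ourselves from the EQ's AVL tree ...
	 *
	 * which preserves the EQ-before-CQ ordering the ISR relies on.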
305 */ 306 mutex_exit(&mlcq->mlcq_mtx); 307 308 if (mleq != NULL) { 309 mutex_enter(&mleq->mleq_mtx); 310 mutex_enter(&mlcq->mlcq_mtx); 311 /* 312 * Double-check the state, we let go of the 313 * mutex briefly. 314 */ 315 if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) { 316 avl_remove(&mleq->mleq_cqs, mlcq); 317 mlcq->mlcq_state &= ~MLXCX_CQ_EQAVL; 318 } 319 mutex_exit(&mlcq->mlcq_mtx); 320 mutex_exit(&mleq->mleq_mtx); 321 } 322 323 mutex_enter(&mlcq->mlcq_mtx); 324 ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED | 325 MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED)); 326 mutex_exit(&mlcq->mlcq_mtx); 327 328 mutex_destroy(&mlcq->mlcq_mtx); 329 mutex_destroy(&mlcq->mlcq_bufbmtx); 330 list_destroy(&mlcq->mlcq_buffers); 331 list_destroy(&mlcq->mlcq_buffers_b); 332 kmem_free(mlcq, sizeof (mlxcx_completion_queue_t)); 333 } 334 335 static boolean_t 336 mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq, 337 mlxcx_completion_queue_t **cqp, uint_t ent_shift) 338 { 339 mlxcx_completion_queue_t *cq; 340 341 cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP); 342 mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER, 343 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 344 mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER, 345 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 346 list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t), 347 offsetof(mlxcx_buffer_t, mlb_cq_entry)); 348 list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t), 349 offsetof(mlxcx_buffer_t, mlb_cq_entry)); 350 351 cq->mlcq_mlx = mlxp; 352 list_insert_tail(&mlxp->mlx_cqs, cq); 353 354 mutex_enter(&cq->mlcq_mtx); 355 356 if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) { 357 mutex_exit(&cq->mlcq_mtx); 358 return (B_FALSE); 359 } 360 361 cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP; 362 cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP; 363 364 cq->mlcq_uar = &mlxp->mlx_uar; 365 cq->mlcq_eq = eq; 366 367 cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec; 368 cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count; 369 370 if (!mlxcx_cmd_create_cq(mlxp, cq)) { 371 mutex_exit(&cq->mlcq_mtx); 372 return (B_FALSE); 373 } 374 375 mutex_exit(&cq->mlcq_mtx); 376 377 mutex_enter(&eq->mleq_mtx); 378 mutex_enter(&cq->mlcq_mtx); 379 ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL); 380 avl_add(&eq->mleq_cqs, cq); 381 cq->mlcq_state |= MLXCX_CQ_EQAVL; 382 mlxcx_arm_cq(mlxp, cq); 383 mutex_exit(&cq->mlcq_mtx); 384 mutex_exit(&eq->mleq_mtx); 385 386 *cqp = cq; 387 return (B_TRUE); 388 } 389 390 static boolean_t 391 mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq, 392 mlxcx_work_queue_t *wq) 393 { 394 mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER, 395 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 396 397 list_insert_tail(&mlxp->mlx_wqs, wq); 398 399 mutex_enter(&wq->mlwq_mtx); 400 401 wq->mlwq_mlx = mlxp; 402 wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ; 403 wq->mlwq_cq = cq; 404 wq->mlwq_pd = &mlxp->mlx_pd; 405 wq->mlwq_uar = &mlxp->mlx_uar; 406 407 wq->mlwq_bufs = mlxcx_mlbs_create(mlxp); 408 409 if (!mlxcx_wq_alloc_dma(mlxp, wq)) { 410 mutex_exit(&wq->mlwq_mtx); 411 return (B_FALSE); 412 } 413 414 if (!mlxcx_cmd_create_rq(mlxp, wq)) { 415 mutex_exit(&wq->mlwq_mtx); 416 return (B_FALSE); 417 } 418 419 wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP; 420 wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP; 421 422 mutex_exit(&wq->mlwq_mtx); 423 424 mutex_enter(&cq->mlcq_mtx); 425 mutex_enter(&wq->mlwq_mtx); 426 ASSERT3P(cq->mlcq_wq, ==, NULL); 427 cq->mlcq_wq = wq; 428 mutex_exit(&wq->mlwq_mtx); 429 mutex_exit(&cq->mlcq_mtx); 430 431 return (B_TRUE); 
432 } 433 434 static boolean_t 435 mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, 436 mlxcx_tis_t *tis, mlxcx_work_queue_t *wq) 437 { 438 mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER, 439 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 440 441 list_insert_tail(&mlxp->mlx_wqs, wq); 442 443 mutex_enter(&wq->mlwq_mtx); 444 445 wq->mlwq_mlx = mlxp; 446 wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ; 447 wq->mlwq_cq = cq; 448 wq->mlwq_pd = &mlxp->mlx_pd; 449 wq->mlwq_uar = &mlxp->mlx_uar; 450 wq->mlwq_tis = tis; 451 452 wq->mlwq_bufs = mlxcx_mlbs_create(mlxp); 453 wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp); 454 455 VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2); 456 wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2; 457 458 if (!mlxcx_wq_alloc_dma(mlxp, wq)) { 459 mutex_exit(&wq->mlwq_mtx); 460 return (B_FALSE); 461 } 462 463 if (!mlxcx_cmd_create_sq(mlxp, wq)) { 464 mutex_exit(&wq->mlwq_mtx); 465 return (B_FALSE); 466 } 467 468 wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP; 469 wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP; 470 471 mutex_exit(&wq->mlwq_mtx); 472 473 mutex_enter(&cq->mlcq_mtx); 474 mutex_enter(&wq->mlwq_mtx); 475 ASSERT3P(cq->mlcq_wq, ==, NULL); 476 cq->mlcq_wq = wq; 477 mutex_exit(&wq->mlwq_mtx); 478 mutex_exit(&cq->mlcq_mtx); 479 480 return (B_TRUE); 481 } 482 483 /* 484 * Before we tear down the queues associated with the rx group, 485 * flag each cq as being torn down and wake up any tasks. 486 */ 487 static void 488 mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 489 { 490 mlxcx_work_queue_t *wq; 491 mlxcx_completion_queue_t *cq; 492 mlxcx_buf_shard_t *s; 493 uint_t i; 494 495 mutex_enter(&g->mlg_mtx); 496 497 for (i = 0; i < g->mlg_nwqs; ++i) { 498 wq = &g->mlg_wqs[i]; 499 cq = wq->mlwq_cq; 500 if (cq != NULL) { 501 s = wq->mlwq_bufs; 502 mutex_enter(&s->mlbs_mtx); 503 atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN); 504 cv_broadcast(&s->mlbs_free_nonempty); 505 mutex_exit(&s->mlbs_mtx); 506 } 507 } 508 509 mutex_exit(&g->mlg_mtx); 510 } 511 512 void 513 mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 514 { 515 mlxcx_work_queue_t *wq; 516 mlxcx_completion_queue_t *cq; 517 mlxcx_flow_entry_t *fe; 518 mlxcx_flow_group_t *fg; 519 mlxcx_flow_table_t *ft; 520 uint_t i; 521 522 mutex_enter(&g->mlg_port->mlp_mtx); 523 mutex_enter(&g->mlg_mtx); 524 525 if (g->mlg_state & MLXCX_GROUP_FLOWS) { 526 mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g); 527 528 if (g->mlg_rx_vlan_ft != NULL) 529 mlxcx_remove_all_vlan_entries(mlxp, g); 530 531 if (g == &mlxp->mlx_rx_groups[0]) { 532 ft = g->mlg_port->mlp_rx_flow; 533 mutex_enter(&ft->mlft_mtx); 534 535 fg = g->mlg_port->mlp_bcast; 536 fe = list_head(&fg->mlfg_entries); 537 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 538 (void) mlxcx_cmd_delete_flow_table_entry( 539 mlxp, fe); 540 } 541 542 fg = g->mlg_port->mlp_promisc; 543 fe = list_head(&fg->mlfg_entries); 544 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 545 (void) mlxcx_cmd_delete_flow_table_entry( 546 mlxp, fe); 547 } 548 549 mutex_exit(&ft->mlft_mtx); 550 } 551 552 if (g->mlg_rx_vlan_ft != NULL) { 553 mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx); 554 ASSERT(list_is_empty(&g->mlg_rx_vlans)); 555 fg = g->mlg_rx_vlan_def_fg; 556 fe = list_head(&fg->mlfg_entries); 557 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 558 (void) mlxcx_cmd_delete_flow_table_entry( 559 mlxp, fe); 560 } 561 fg = g->mlg_rx_vlan_promisc_fg; 562 fe = list_head(&fg->mlfg_entries); 563 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 564 (void) 
mlxcx_cmd_delete_flow_table_entry( 565 mlxp, fe); 566 } 567 mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft); 568 list_destroy(&g->mlg_rx_vlans); 569 570 g->mlg_rx_vlan_ft = NULL; 571 } 572 573 mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx); 574 mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft); 575 g->mlg_rx_hash_ft = NULL; 576 577 avl_destroy(&g->mlg_rx_macs); 578 g->mlg_state &= ~MLXCX_GROUP_FLOWS; 579 } 580 581 if (g->mlg_state & MLXCX_GROUP_RUNNING) { 582 for (i = 0; i < g->mlg_nwqs; ++i) { 583 wq = &g->mlg_wqs[i]; 584 mutex_enter(&wq->mlwq_mtx); 585 if (wq->mlwq_state & MLXCX_WQ_STARTED && 586 !mlxcx_cmd_stop_rq(mlxp, wq)) { 587 mlxcx_warn(mlxp, "failed to stop rq %x", 588 wq->mlwq_num); 589 } 590 mutex_exit(&wq->mlwq_mtx); 591 } 592 taskq_destroy(g->mlg_refill_tq); 593 g->mlg_state &= ~MLXCX_GROUP_RUNNING; 594 } 595 596 if (g->mlg_state & MLXCX_GROUP_TIRTIS) { 597 for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) { 598 mlxcx_tir_t *tir = &g->mlg_tir[i]; 599 if (tir->mltir_state & MLXCX_TIR_CREATED && 600 !(tir->mltir_state & MLXCX_TIR_DESTROYED)) { 601 if (!mlxcx_cmd_destroy_tir(mlxp, tir)) { 602 mlxcx_warn(mlxp, 603 "failed to destroy tir %u " 604 "for rx ring", tir->mltir_num); 605 } 606 } 607 } 608 g->mlg_state &= ~MLXCX_GROUP_TIRTIS; 609 } 610 611 if (g->mlg_state & MLXCX_GROUP_RQT) { 612 if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED && 613 !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) { 614 if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) { 615 mlxcx_warn(mlxp, "failed to destroy rqt %u " 616 "for rx ring", g->mlg_rqt->mlrqt_num); 617 } 618 kmem_free(g->mlg_rqt->mlrqt_rq, 619 g->mlg_rqt->mlrqt_rq_size); 620 g->mlg_rqt->mlrqt_rq = NULL; 621 kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t)); 622 g->mlg_rqt = NULL; 623 } 624 g->mlg_state &= ~MLXCX_GROUP_RQT; 625 } 626 627 for (i = 0; i < g->mlg_nwqs; ++i) { 628 wq = &g->mlg_wqs[i]; 629 cq = wq->mlwq_cq; 630 mlxcx_wq_teardown(mlxp, wq); 631 if (cq != NULL) 632 mlxcx_cq_teardown(mlxp, cq); 633 } 634 kmem_free(g->mlg_wqs, g->mlg_wqs_size); 635 g->mlg_wqs = NULL; 636 g->mlg_state &= ~MLXCX_GROUP_WQS; 637 638 mutex_exit(&g->mlg_mtx); 639 mutex_exit(&g->mlg_port->mlp_mtx); 640 641 mutex_destroy(&g->mlg_mtx); 642 643 g->mlg_state &= ~MLXCX_GROUP_INIT; 644 ASSERT3S(g->mlg_state, ==, 0); 645 } 646 647 void 648 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 649 { 650 mlxcx_work_queue_t *wq; 651 mlxcx_completion_queue_t *cq; 652 uint_t i; 653 654 mutex_enter(&g->mlg_mtx); 655 656 if (g->mlg_state & MLXCX_GROUP_WQS) { 657 for (i = 0; i < g->mlg_nwqs; ++i) { 658 wq = &g->mlg_wqs[i]; 659 mutex_enter(&wq->mlwq_mtx); 660 cq = wq->mlwq_cq; 661 if (wq->mlwq_state & MLXCX_WQ_STARTED && 662 !mlxcx_cmd_stop_sq(mlxp, wq)) { 663 mlxcx_warn(mlxp, "failed to stop sq %x", 664 wq->mlwq_num); 665 } 666 mutex_exit(&wq->mlwq_mtx); 667 mlxcx_wq_teardown(mlxp, wq); 668 if (cq != NULL) 669 mlxcx_cq_teardown(mlxp, cq); 670 } 671 g->mlg_state &= ~MLXCX_GROUP_RUNNING; 672 kmem_free(g->mlg_wqs, g->mlg_wqs_size); 673 g->mlg_wqs = NULL; 674 g->mlg_state &= ~MLXCX_GROUP_WQS; 675 } 676 677 if ((g->mlg_state & MLXCX_GROUP_TIRTIS) && 678 g->mlg_tis.mltis_state & MLXCX_TIS_CREATED && 679 !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) { 680 if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) { 681 mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring", 682 g->mlg_tis.mltis_num); 683 } 684 } 685 g->mlg_state &= ~MLXCX_GROUP_TIRTIS; 686 687 mutex_exit(&g->mlg_mtx); 688 mutex_destroy(&g->mlg_mtx); 689 g->mlg_state &= ~MLXCX_GROUP_INIT; 690 ASSERT3S(g->mlg_state, ==, 0); 
691 } 692 693 void 694 mlxcx_teardown_groups(mlxcx_t *mlxp) 695 { 696 mlxcx_ring_group_t *g; 697 uint_t i; 698 699 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 700 g = &mlxp->mlx_rx_groups[i]; 701 if (!(g->mlg_state & MLXCX_GROUP_INIT)) 702 continue; 703 ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX); 704 mlxcx_quiesce_rx_cqs(mlxp, g); 705 } 706 707 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 708 g = &mlxp->mlx_rx_groups[i]; 709 if (!(g->mlg_state & MLXCX_GROUP_INIT)) 710 continue; 711 mlxcx_teardown_rx_group(mlxp, g); 712 } 713 714 kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size); 715 mlxp->mlx_rx_groups = NULL; 716 717 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) { 718 g = &mlxp->mlx_tx_groups[i]; 719 if (!(g->mlg_state & MLXCX_GROUP_INIT)) 720 continue; 721 ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX); 722 mlxcx_teardown_tx_group(mlxp, g); 723 } 724 725 kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size); 726 mlxp->mlx_tx_groups = NULL; 727 } 728 729 boolean_t 730 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 731 { 732 mlxcx_event_queue_t *eq; 733 mlxcx_completion_queue_t *cq; 734 mlxcx_work_queue_t *rq; 735 mlxcx_flow_table_t *ft; 736 mlxcx_flow_group_t *fg; 737 mlxcx_flow_entry_t *fe; 738 uint_t ent_shift; 739 uint_t i, j; 740 741 ASSERT3S(g->mlg_state, ==, 0); 742 743 mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER, 744 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 745 mutex_enter(&g->mlg_mtx); 746 g->mlg_mlx = mlxp; 747 g->mlg_type = MLXCX_GROUP_RX; 748 g->mlg_port = &mlxp->mlx_ports[0]; 749 g->mlg_state |= MLXCX_GROUP_INIT; 750 751 g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group; 752 i = g - &mlxp->mlx_rx_groups[0]; 753 if (i < mlxp->mlx_props.mldp_rx_ngroups_large) 754 g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group; 755 756 g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t); 757 g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP); 758 g->mlg_state |= MLXCX_GROUP_WQS; 759 760 g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP); 761 g->mlg_rqt->mlrqt_max = 2; 762 while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs) 763 g->mlg_rqt->mlrqt_max <<= 1; 764 g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max * 765 sizeof (mlxcx_work_queue_t *); 766 g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP); 767 g->mlg_state |= MLXCX_GROUP_RQT; 768 769 for (i = 0; i < g->mlg_nwqs; ++i) { 770 eq = NULL; 771 while (eq == NULL) { 772 eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++]; 773 if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count) 774 mlxp->mlx_next_eq = 1; 775 if (eq->mleq_type != MLXCX_EQ_TYPE_ANY && 776 eq->mleq_type != MLXCX_EQ_TYPE_RX) { 777 /* Try the next one */ 778 eq = NULL; 779 } 780 } 781 782 /* 783 * A single completion is indicated for each rq entry as 784 * it is used. So, the number of cq entries never needs 785 * to be larger than the rq. 
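	 * For example (illustrative values only): with a CQ size shift of
	 * 10 and an RQ size shift of 9, we create the CQ with 2^9 == 512
	 * entries, matching the smaller ring.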
786 */ 787 ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift, 788 mlxp->mlx_props.mldp_rq_size_shift); 789 if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) { 790 g->mlg_nwqs = i; 791 break; 792 } 793 794 cq->mlcq_stats = &g->mlg_port->mlp_stats; 795 796 rq = &g->mlg_wqs[i]; 797 if (!mlxcx_rq_setup(mlxp, cq, rq)) { 798 g->mlg_nwqs = i; 799 break; 800 } 801 g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq; 802 g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY; 803 rq->mlwq_group = g; 804 } 805 if (g->mlg_nwqs == 0) { 806 mutex_exit(&g->mlg_mtx); 807 return (B_FALSE); 808 } 809 810 if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) { 811 mutex_exit(&g->mlg_mtx); 812 return (B_FALSE); 813 } 814 815 for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) { 816 mlxcx_tir_t *tir = &g->mlg_tir[i]; 817 tir->mltir_tdom = &mlxp->mlx_tdom; 818 switch (i) { 819 case MLXCX_TIR_ROLE_OTHER: 820 tir->mltir_type = MLXCX_TIR_DIRECT; 821 tir->mltir_rq = &g->mlg_wqs[0]; 822 break; 823 case MLXCX_TIR_ROLE_IPv4: 824 case MLXCX_TIR_ROLE_IPv6: 825 case MLXCX_TIR_ROLE_TCPv4: 826 case MLXCX_TIR_ROLE_TCPv6: 827 case MLXCX_TIR_ROLE_UDPv4: 828 case MLXCX_TIR_ROLE_UDPv6: 829 tir->mltir_type = MLXCX_TIR_INDIRECT; 830 tir->mltir_rqtable = g->mlg_rqt; 831 tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ; 832 (void) random_get_pseudo_bytes(tir->mltir_toeplitz_key, 833 sizeof (tir->mltir_toeplitz_key)); 834 break; 835 } 836 switch (i) { 837 case MLXCX_TIR_ROLE_OTHER: 838 break; 839 case MLXCX_TIR_ROLE_IPv4: 840 case MLXCX_TIR_ROLE_TCPv4: 841 case MLXCX_TIR_ROLE_UDPv4: 842 tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4; 843 tir->mltir_hash_fields = 844 MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP; 845 break; 846 case MLXCX_TIR_ROLE_IPv6: 847 case MLXCX_TIR_ROLE_TCPv6: 848 case MLXCX_TIR_ROLE_UDPv6: 849 tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6; 850 tir->mltir_hash_fields = 851 MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP; 852 break; 853 } 854 switch (i) { 855 case MLXCX_TIR_ROLE_OTHER: 856 case MLXCX_TIR_ROLE_IPv4: 857 case MLXCX_TIR_ROLE_IPv6: 858 break; 859 case MLXCX_TIR_ROLE_TCPv4: 860 case MLXCX_TIR_ROLE_TCPv6: 861 tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP; 862 tir->mltir_hash_fields |= 863 MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT; 864 break; 865 case MLXCX_TIR_ROLE_UDPv4: 866 case MLXCX_TIR_ROLE_UDPv6: 867 tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP; 868 tir->mltir_hash_fields |= 869 MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT; 870 break; 871 } 872 873 if (!mlxcx_cmd_create_tir(mlxp, tir)) { 874 mutex_exit(&g->mlg_mtx); 875 return (B_FALSE); 876 } 877 878 g->mlg_state |= MLXCX_GROUP_TIRTIS; 879 } 880 881 /* 882 * Flow table: our RX hashing breakout table for RSS 883 */ 884 885 g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 886 KM_SLEEP)); 887 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 888 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 889 avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare, 890 sizeof (mlxcx_group_mac_t), 891 offsetof(mlxcx_group_mac_t, mlgm_group_entry)); 892 g->mlg_state |= MLXCX_GROUP_FLOWS; 893 894 mutex_enter(&ft->mlft_mtx); 895 896 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 897 ft->mlft_level = 2; 898 ft->mlft_port = g->mlg_port; 899 ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT; 900 ft->mlft_nents = (1 << ft->mlft_entshift); 901 ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP); 902 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 903 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 904 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 905 offsetof(mlxcx_flow_group_t, 
mlfg_entry)); 906 907 for (j = 0; j < ft->mlft_nents; ++j) { 908 ft->mlft_ent[j].mlfe_table = ft; 909 ft->mlft_ent[j].mlfe_index = j; 910 } 911 912 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 913 mutex_exit(&ft->mlft_mtx); 914 mutex_exit(&g->mlg_mtx); 915 return (B_FALSE); 916 } 917 918 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 919 list_insert_tail(&ft->mlft_groups, fg); 920 fg->mlfg_table = ft; 921 fg->mlfg_size = 1; 922 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 923 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 924 mutex_exit(&ft->mlft_mtx); 925 mutex_exit(&g->mlg_mtx); 926 return (B_FALSE); 927 } 928 fe = list_head(&fg->mlfg_entries); 929 fe->mlfe_ip_version = 6; 930 fe->mlfe_ip_proto = IPPROTO_UDP; 931 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 932 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 933 &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6]; 934 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 935 mutex_exit(&ft->mlft_mtx); 936 mutex_exit(&g->mlg_mtx); 937 return (B_FALSE); 938 } 939 940 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 941 list_insert_tail(&ft->mlft_groups, fg); 942 fg->mlfg_table = ft; 943 fg->mlfg_size = 1; 944 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 945 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 946 mutex_exit(&ft->mlft_mtx); 947 mutex_exit(&g->mlg_mtx); 948 return (B_FALSE); 949 } 950 fe = list_head(&fg->mlfg_entries); 951 fe->mlfe_ip_version = 4; 952 fe->mlfe_ip_proto = IPPROTO_UDP; 953 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 954 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 955 &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4]; 956 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 957 mutex_exit(&ft->mlft_mtx); 958 mutex_exit(&g->mlg_mtx); 959 return (B_FALSE); 960 } 961 962 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 963 list_insert_tail(&ft->mlft_groups, fg); 964 fg->mlfg_table = ft; 965 fg->mlfg_size = 1; 966 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 967 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 968 mutex_exit(&ft->mlft_mtx); 969 mutex_exit(&g->mlg_mtx); 970 return (B_FALSE); 971 } 972 fe = list_head(&fg->mlfg_entries); 973 fe->mlfe_ip_version = 6; 974 fe->mlfe_ip_proto = IPPROTO_TCP; 975 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 976 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 977 &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6]; 978 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 979 mutex_exit(&ft->mlft_mtx); 980 mutex_exit(&g->mlg_mtx); 981 return (B_FALSE); 982 } 983 984 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 985 list_insert_tail(&ft->mlft_groups, fg); 986 fg->mlfg_table = ft; 987 fg->mlfg_size = 1; 988 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 989 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 990 mutex_exit(&ft->mlft_mtx); 991 mutex_exit(&g->mlg_mtx); 992 return (B_FALSE); 993 } 994 fe = list_head(&fg->mlfg_entries); 995 fe->mlfe_ip_version = 4; 996 fe->mlfe_ip_proto = IPPROTO_TCP; 997 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 998 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 999 &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4]; 1000 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1001 mutex_exit(&ft->mlft_mtx); 1002 mutex_exit(&g->mlg_mtx); 1003 return (B_FALSE); 1004 } 1005 1006 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1007 list_insert_tail(&ft->mlft_groups, fg); 1008 fg->mlfg_table = ft; 1009 fg->mlfg_size = 1; 1010 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER; 1011 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1012 
mutex_exit(&ft->mlft_mtx); 1013 mutex_exit(&g->mlg_mtx); 1014 return (B_FALSE); 1015 } 1016 fe = list_head(&fg->mlfg_entries); 1017 fe->mlfe_ip_version = 6; 1018 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1019 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1020 &g->mlg_tir[MLXCX_TIR_ROLE_IPv6]; 1021 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1022 mutex_exit(&ft->mlft_mtx); 1023 mutex_exit(&g->mlg_mtx); 1024 return (B_FALSE); 1025 } 1026 1027 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1028 list_insert_tail(&ft->mlft_groups, fg); 1029 fg->mlfg_table = ft; 1030 fg->mlfg_size = 1; 1031 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER; 1032 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1033 mutex_exit(&ft->mlft_mtx); 1034 mutex_exit(&g->mlg_mtx); 1035 return (B_FALSE); 1036 } 1037 fe = list_head(&fg->mlfg_entries); 1038 fe->mlfe_ip_version = 4; 1039 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1040 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1041 &g->mlg_tir[MLXCX_TIR_ROLE_IPv4]; 1042 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1043 mutex_exit(&ft->mlft_mtx); 1044 mutex_exit(&g->mlg_mtx); 1045 return (B_FALSE); 1046 } 1047 1048 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1049 list_insert_tail(&ft->mlft_groups, fg); 1050 fg->mlfg_table = ft; 1051 fg->mlfg_size = 1; 1052 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1053 mutex_exit(&ft->mlft_mtx); 1054 mutex_exit(&g->mlg_mtx); 1055 return (B_FALSE); 1056 } 1057 fe = list_head(&fg->mlfg_entries); 1058 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1059 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1060 &g->mlg_tir[MLXCX_TIR_ROLE_OTHER]; 1061 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1062 mutex_exit(&ft->mlft_mtx); 1063 mutex_exit(&g->mlg_mtx); 1064 return (B_FALSE); 1065 } 1066 1067 mutex_exit(&ft->mlft_mtx); 1068 1069 /* 1070 * Flow table: the VLAN breakout table for doing VLAN filtering after 1071 * we've matched a MAC address. 
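	 * Every entry in this table forwards to the group's RSS hash table
	 * created above, so the receive steering chain for this group is,
	 * roughly: port MAC/unicast entries -> this VLAN table -> hash
	 * table -> TIR -> RQ.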
1072 */ 1073 1074 g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1075 KM_SLEEP)); 1076 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1077 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1078 list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t), 1079 offsetof(mlxcx_group_vlan_t, mlgv_entry)); 1080 1081 mutex_enter(&ft->mlft_mtx); 1082 1083 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1084 ft->mlft_level = 1; 1085 ft->mlft_port = g->mlg_port; 1086 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift; 1087 ft->mlft_nents = (1 << ft->mlft_entshift); 1088 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1089 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1090 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1091 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1092 1093 for (j = 0; j < ft->mlft_nents; ++j) { 1094 fe = &ft->mlft_ent[j]; 1095 fe->mlfe_table = ft; 1096 fe->mlfe_index = j; 1097 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1098 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; 1099 } 1100 1101 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1102 mutex_exit(&ft->mlft_mtx); 1103 mutex_exit(&g->mlg_mtx); 1104 return (B_FALSE); 1105 } 1106 1107 /* First group is all actual matched VLANs */ 1108 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1109 g->mlg_rx_vlan_fg = fg; 1110 list_insert_tail(&ft->mlft_groups, fg); 1111 fg->mlfg_table = ft; 1112 fg->mlfg_size = ft->mlft_nents - 2; 1113 fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN; 1114 fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID; 1115 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1116 mutex_exit(&ft->mlft_mtx); 1117 mutex_exit(&g->mlg_mtx); 1118 return (B_FALSE); 1119 } 1120 1121 /* 1122 * Then the "default" entry which we enable when we have no VLAN IDs 1123 * added to the group (we start with this enabled). 1124 */ 1125 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1126 g->mlg_rx_vlan_def_fg = fg; 1127 list_insert_tail(&ft->mlft_groups, fg); 1128 fg->mlfg_table = ft; 1129 fg->mlfg_size = 1; 1130 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1131 mutex_exit(&ft->mlft_mtx); 1132 mutex_exit(&g->mlg_mtx); 1133 return (B_FALSE); 1134 } 1135 fe = list_head(&fg->mlfg_entries); 1136 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1137 mutex_exit(&ft->mlft_mtx); 1138 mutex_exit(&g->mlg_mtx); 1139 return (B_FALSE); 1140 } 1141 1142 /* 1143 * Finally, the promisc entry which points at the *hash ft* from the 1144 * default group. We only enable this when we have promisc on. 
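	 * Note that we only fill in the destination here; unlike the
	 * default entry above, mlxcx_cmd_set_flow_table_entry() is
	 * deliberately not called until promisc is actually turned on.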
1145 */ 1146 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1147 g->mlg_rx_vlan_promisc_fg = fg; 1148 list_insert_tail(&ft->mlft_groups, fg); 1149 fg->mlfg_table = ft; 1150 fg->mlfg_size = 1; 1151 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1152 mutex_exit(&ft->mlft_mtx); 1153 mutex_exit(&g->mlg_mtx); 1154 return (B_FALSE); 1155 } 1156 fe = list_head(&fg->mlfg_entries); 1157 fe->mlfe_ndest = 1; 1158 fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft; 1159 1160 mutex_exit(&ft->mlft_mtx); 1161 1162 mutex_exit(&g->mlg_mtx); 1163 1164 return (B_TRUE); 1165 } 1166 1167 boolean_t 1168 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1169 mlxcx_work_queue_t *rq) 1170 { 1171 uint_t j; 1172 mlxcx_buffer_t *b; 1173 mlxcx_completion_queue_t *cq; 1174 1175 mutex_enter(&g->mlg_mtx); 1176 /* 1177 * Sadly, even though MAC has the mgi_start callback, it is not always 1178 * called -- in particular when we are being managed under an aggr, the 1179 * mgi_start callback will only ever be called on the default group. 1180 * 1181 * So instead of asserting about the group state here, we have to 1182 * check it and call group start if needed. 1183 */ 1184 if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) { 1185 mutex_exit(&g->mlg_mtx); 1186 if (!mlxcx_rx_group_start(mlxp, g)) 1187 return (B_FALSE); 1188 mutex_enter(&g->mlg_mtx); 1189 } 1190 ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING); 1191 1192 cq = rq->mlwq_cq; 1193 ASSERT(cq != NULL); 1194 1195 mutex_enter(&cq->mlcq_mtx); 1196 mutex_enter(&rq->mlwq_mtx); 1197 1198 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1199 mutex_exit(&rq->mlwq_mtx); 1200 mutex_exit(&cq->mlcq_mtx); 1201 mutex_exit(&g->mlg_mtx); 1202 return (B_TRUE); 1203 } 1204 1205 if (!mlxcx_cmd_start_rq(mlxp, rq)) { 1206 mutex_exit(&rq->mlwq_mtx); 1207 mutex_exit(&cq->mlcq_mtx); 1208 mutex_exit(&g->mlg_mtx); 1209 return (B_FALSE); 1210 } 1211 ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED); 1212 1213 ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS); 1214 rq->mlwq_state |= MLXCX_WQ_BUFFERS; 1215 1216 mlxcx_shard_ready(rq->mlwq_bufs); 1217 1218 for (j = 0; j < rq->mlwq_nents; ++j) { 1219 if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b)) 1220 break; 1221 mlxcx_buf_return(mlxp, b); 1222 } 1223 for (j = 0; j < rq->mlwq_nents / 2; ++j) { 1224 if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b)) 1225 break; 1226 mlxcx_buf_return(mlxp, b); 1227 } 1228 1229 mlxcx_rq_refill(mlxp, rq); 1230 1231 mutex_exit(&rq->mlwq_mtx); 1232 mutex_exit(&cq->mlcq_mtx); 1233 mutex_exit(&g->mlg_mtx); 1234 1235 return (B_TRUE); 1236 } 1237 1238 boolean_t 1239 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1240 { 1241 mlxcx_flow_table_t *ft; 1242 mlxcx_flow_group_t *fg; 1243 mlxcx_flow_entry_t *fe; 1244 char tq_name[TASKQ_NAMELEN]; 1245 1246 mutex_enter(&g->mlg_mtx); 1247 1248 if (g->mlg_state & MLXCX_GROUP_RUNNING) { 1249 mutex_exit(&g->mlg_mtx); 1250 return (B_TRUE); 1251 } 1252 1253 ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING); 1254 1255 g->mlg_state |= MLXCX_GROUP_RUNNING; 1256 1257 (void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld", 1258 ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst, 1259 g - &mlxp->mlx_rx_groups[0]); 1260 1261 /* 1262 * Create one refill taskq per group with one thread per work queue. 1263 * The refill task may block waiting for resources, so by effectively 1264 * having one thread per work queue we avoid work queues blocking each 1265 * other. 
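	 * Each RQ that runs short of buffers dispatches
	 * mlxcx_rq_refill_task() onto this taskq (see mlxcx_rq_refill()),
	 * so at most one refill task is in flight per work queue.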
1266 */ 1267 if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri, 1268 g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { 1269 mlxcx_warn(mlxp, "failed to create rq refill task queue"); 1270 mutex_exit(&g->mlg_mtx); 1271 return (B_FALSE); 1272 } 1273 1274 if (g == &mlxp->mlx_rx_groups[0]) { 1275 ft = g->mlg_port->mlp_rx_flow; 1276 mutex_enter(&ft->mlft_mtx); 1277 1278 /* 1279 * Broadcast and promisc entries go directly to group 0's 1280 * RSS hash fanout flow table. They bypass VLAN filtering. 1281 */ 1282 fg = g->mlg_port->mlp_bcast; 1283 fe = list_head(&fg->mlfg_entries); 1284 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; 1285 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1286 mutex_exit(&ft->mlft_mtx); 1287 g->mlg_state &= ~MLXCX_GROUP_RUNNING; 1288 taskq_destroy(g->mlg_refill_tq); 1289 mutex_exit(&g->mlg_mtx); 1290 return (B_FALSE); 1291 } 1292 1293 fg = g->mlg_port->mlp_promisc; 1294 fe = list_head(&fg->mlfg_entries); 1295 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; 1296 /* 1297 * Don't actually set the promisc entry until promisc is 1298 * enabled. 1299 */ 1300 1301 mutex_exit(&ft->mlft_mtx); 1302 } 1303 1304 mutex_exit(&g->mlg_mtx); 1305 1306 return (B_TRUE); 1307 } 1308 1309 boolean_t 1310 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1311 { 1312 mlxcx_event_queue_t *eq; 1313 mlxcx_completion_queue_t *cq; 1314 mlxcx_work_queue_t *sq; 1315 uint_t i; 1316 1317 ASSERT3S(g->mlg_state, ==, 0); 1318 1319 mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER, 1320 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1321 g->mlg_state |= MLXCX_GROUP_INIT; 1322 mutex_enter(&g->mlg_mtx); 1323 1324 g->mlg_mlx = mlxp; 1325 g->mlg_type = MLXCX_GROUP_TX; 1326 g->mlg_port = &mlxp->mlx_ports[0]; 1327 1328 g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group; 1329 g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t); 1330 g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP); 1331 g->mlg_state |= MLXCX_GROUP_WQS; 1332 1333 g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom; 1334 1335 if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) { 1336 mutex_exit(&g->mlg_mtx); 1337 return (B_FALSE); 1338 } 1339 1340 g->mlg_state |= MLXCX_GROUP_TIRTIS; 1341 1342 for (i = 0; i < g->mlg_nwqs; ++i) { 1343 eq = NULL; 1344 while (eq == NULL) { 1345 eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++]; 1346 if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count) 1347 mlxp->mlx_next_eq = 1; 1348 if (eq->mleq_type != MLXCX_EQ_TYPE_ANY && 1349 eq->mleq_type != MLXCX_EQ_TYPE_TX) { 1350 /* Try the next one */ 1351 eq = NULL; 1352 } 1353 } 1354 1355 if (!mlxcx_cq_setup(mlxp, eq, &cq, 1356 mlxp->mlx_props.mldp_cq_size_shift)) 1357 return (B_FALSE); 1358 1359 cq->mlcq_stats = &g->mlg_port->mlp_stats; 1360 1361 sq = &g->mlg_wqs[i]; 1362 if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) { 1363 mutex_exit(&g->mlg_mtx); 1364 return (B_FALSE); 1365 } 1366 sq->mlwq_group = g; 1367 } 1368 1369 mutex_exit(&g->mlg_mtx); 1370 1371 return (B_TRUE); 1372 } 1373 1374 boolean_t 1375 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1376 mlxcx_work_queue_t *sq) 1377 { 1378 uint_t i; 1379 mlxcx_buffer_t *b; 1380 mlxcx_completion_queue_t *cq; 1381 1382 mutex_enter(&g->mlg_mtx); 1383 1384 cq = sq->mlwq_cq; 1385 ASSERT(cq != NULL); 1386 1387 mutex_enter(&cq->mlcq_mtx); 1388 mutex_enter(&sq->mlwq_mtx); 1389 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1390 mutex_exit(&sq->mlwq_mtx); 1391 mutex_exit(&cq->mlcq_mtx); 1392 mutex_exit(&g->mlg_mtx); 1393 return (B_TRUE); 1394 } 1395 1396 ASSERT0(sq->mlwq_state & 
MLXCX_WQ_BUFFERS); 1397 for (i = 0; i < sq->mlwq_nents; ++i) { 1398 if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b)) 1399 break; 1400 mlxcx_buf_return(mlxp, b); 1401 } 1402 for (i = 0; i < sq->mlwq_nents / 2; ++i) { 1403 if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b)) 1404 break; 1405 mlxcx_buf_return(mlxp, b); 1406 } 1407 for (i = 0; i < sq->mlwq_nents; ++i) { 1408 if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b)) 1409 break; 1410 mlxcx_buf_return(mlxp, b); 1411 } 1412 sq->mlwq_state |= MLXCX_WQ_BUFFERS; 1413 1414 mlxcx_shard_ready(sq->mlwq_bufs); 1415 mlxcx_shard_ready(sq->mlwq_foreign_bufs); 1416 1417 if (!mlxcx_cmd_start_sq(mlxp, sq)) { 1418 mutex_exit(&sq->mlwq_mtx); 1419 mutex_exit(&cq->mlcq_mtx); 1420 mutex_exit(&g->mlg_mtx); 1421 return (B_FALSE); 1422 } 1423 g->mlg_state |= MLXCX_GROUP_RUNNING; 1424 1425 (void) mlxcx_sq_add_nop(mlxp, sq); 1426 1427 mutex_exit(&sq->mlwq_mtx); 1428 mutex_exit(&cq->mlcq_mtx); 1429 mutex_exit(&g->mlg_mtx); 1430 1431 return (B_TRUE); 1432 } 1433 1434 static boolean_t 1435 mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first) 1436 { 1437 uint_t idx; 1438 mlxcx_bf_t *bf; 1439 ddi_fm_error_t err; 1440 uint_t try = 0; 1441 1442 ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ); 1443 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1444 1445 mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc); 1446 1447 ASSERT(mlwq->mlwq_cq != NULL); 1448 ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL); 1449 idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK; 1450 bf = &mlwq->mlwq_uar->mlu_bf[idx]; 1451 1452 retry: 1453 MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV); 1454 ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err, 1455 DDI_FME_VERSION); 1456 if (err.fme_status != DDI_FM_OK) { 1457 if (try++ < mlxcx_doorbell_tries) { 1458 ddi_fm_dma_err_clear( 1459 mlwq->mlwq_doorbell_dma.mxdb_dma_handle, 1460 DDI_FME_VERSION); 1461 goto retry; 1462 } else { 1463 goto err; 1464 } 1465 } 1466 1467 mlxcx_put64(mlxp, bf->mbf_even, from_be64( 1468 mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0])); 1469 ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err, 1470 DDI_FME_VERSION); 1471 if (err.fme_status == DDI_FM_OK) 1472 return (B_TRUE); 1473 if (try++ < mlxcx_doorbell_tries) { 1474 ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION); 1475 goto retry; 1476 } 1477 1478 err: 1479 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); 1480 return (B_FALSE); 1481 } 1482 1483 boolean_t 1484 mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) 1485 { 1486 uint_t index, start_pc; 1487 mlxcx_sendq_ent_t *ent0; 1488 ddi_fm_error_t err; 1489 1490 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1491 1492 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); 1493 ent0 = &mlwq->mlwq_send_ent[index]; 1494 start_pc = mlwq->mlwq_pc; 1495 ++mlwq->mlwq_pc; 1496 /* 1497 * This counter is manipulated in the interrupt handler, which 1498 * does not hold the mlwq_mtx, hence the atomic. 
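	 * (mlxcx_sq_add_buffer() below takes a snapshot of the same counter
	 * for the same reason -- see the comment there.)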
	 */
	atomic_inc_64(&mlwq->mlwq_wqebb_used);

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	ent0->mlsqe_control.mlcs_ds = 1;

	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
		return (B_FALSE);
	}
	return (B_TRUE);
}

boolean_t
mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
    mlxcx_buffer_t *b0)
{
	uint_t index, first, ents;
	mlxcx_completion_queue_t *cq;
	mlxcx_sendq_ent_t *ent0;
	mlxcx_sendq_extra_ent_t *ent;
	mlxcx_wqe_data_seg_t *seg;
	uint_t ptri, nptr;
	const ddi_dma_cookie_t *c;
	size_t rem;
	uint64_t wqebb_used;
	mlxcx_buffer_t *b;
	ddi_fm_error_t err;
	boolean_t rv;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	ASSERT3P(b0->mlb_tx_head, ==, b0);
	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	cq = mlwq->mlwq_cq;

	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	b0->mlb_wqe_index = mlwq->mlwq_pc;
	ents = 1;

	first = index;

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
	set_bits16(&ent0->mlsqe_eth.mles_szflags,
	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
	if (inlinelen > 0) {
		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
		    inlinelen);
	}

	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
	    MLXCX_WQE_OCTOWORD;

	if (chkflags & HCK_IPV4_HDRCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
	}
	if (chkflags & HCK_FULLCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
	}

	/*
	 * mlwq_wqebb_used is only incremented whilst holding
	 * the mlwq_mtx mutex, but it is decremented (atomically) in
	 * the interrupt context *not* under mlwq_mtx mutex.
	 * So, now take a snapshot of the number of used wqes which will
	 * be a consistent maximum we can use whilst iterating through
	 * the buffers and DMA cookies.
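	 * For example (hypothetical numbers): with a 1024-entry SQ and a
	 * snapshot of 1000 WQEBBs used, the loop below will refuse to grow
	 * a single WQE beyond the 24 WQEBBs still free (the
	 * "(ents + wqebb_used) >= mlwq_nents" check).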
1597 */ 1598 wqebb_used = mlwq->mlwq_wqebb_used; 1599 1600 b = b0; 1601 ptri = 0; 1602 nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); 1603 seg = ent0->mlsqe_data; 1604 while (b != NULL) { 1605 rem = b->mlb_used; 1606 1607 c = NULL; 1608 while (rem > 0 && 1609 (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { 1610 if (ptri >= nptr) { 1611 if ((ents + wqebb_used) >= mlwq->mlwq_nents) 1612 return (B_FALSE); 1613 1614 index = (mlwq->mlwq_pc + ents) & 1615 (mlwq->mlwq_nents - 1); 1616 ent = &mlwq->mlwq_send_extra_ent[index]; 1617 ++ents; 1618 1619 seg = ent->mlsqe_data; 1620 ptri = 0; 1621 nptr = sizeof (ent->mlsqe_data) / 1622 sizeof (mlxcx_wqe_data_seg_t); 1623 } 1624 1625 seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); 1626 if (c->dmac_size > rem) { 1627 seg->mlds_byte_count = to_be32(rem); 1628 rem = 0; 1629 } else { 1630 seg->mlds_byte_count = to_be32(c->dmac_size); 1631 rem -= c->dmac_size; 1632 } 1633 seg->mlds_address = to_be64(c->dmac_laddress); 1634 ++seg; 1635 ++ptri; 1636 ++ent0->mlsqe_control.mlcs_ds; 1637 1638 ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, 1639 MLXCX_SQE_MAX_DS); 1640 } 1641 1642 if (b == b0) { 1643 b = list_head(&b0->mlb_tx_chain); 1644 } else { 1645 b = list_next(&b0->mlb_tx_chain, b); 1646 } 1647 } 1648 1649 b0->mlb_wqebbs = ents; 1650 mlwq->mlwq_pc += ents; 1651 atomic_add_64(&mlwq->mlwq_wqebb_used, ents); 1652 1653 for (; ptri < nptr; ++ptri, ++seg) { 1654 seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); 1655 seg->mlds_byte_count = to_be32(0); 1656 seg->mlds_address = to_be64(0); 1657 } 1658 1659 /* 1660 * Make sure the workqueue entry is flushed out before updating 1661 * the doorbell. 1662 * If the ring has wrapped, we need to flush the front and back. 1663 */ 1664 if ((first + ents) > mlwq->mlwq_nents) { 1665 uint_t sync_cnt = mlwq->mlwq_nents - first; 1666 1667 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1668 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent, 1669 sync_cnt * sizeof (mlxcx_sendq_ent_t), 1670 DDI_DMA_SYNC_FORDEV)); 1671 1672 ent0 = &mlwq->mlwq_send_ent[0]; 1673 ents -= sync_cnt; 1674 } 1675 1676 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1677 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent, 1678 ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV)); 1679 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err, 1680 DDI_FME_VERSION); 1681 if (err.fme_status != DDI_FM_OK) { 1682 return (B_FALSE); 1683 } 1684 1685 /* 1686 * Hold the bufmtx whilst ringing the doorbell, to prevent 1687 * the buffer from being moved to another list, so we can 1688 * safely remove it should the ring fail. 
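	 * i.e. insert onto mlcq_buffers_b first, ring the doorbell, and
	 * only count the buffer (or undo the insert) once we know whether
	 * the ring succeeded.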
1689 */ 1690 mutex_enter(&cq->mlcq_bufbmtx); 1691 1692 list_insert_tail(&cq->mlcq_buffers_b, b0); 1693 if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) { 1694 atomic_inc_64(&cq->mlcq_bufcnt); 1695 } else { 1696 list_remove(&cq->mlcq_buffers_b, b0); 1697 } 1698 1699 mutex_exit(&cq->mlcq_bufbmtx); 1700 1701 return (rv); 1702 } 1703 1704 boolean_t 1705 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, 1706 mlxcx_buffer_t *buf) 1707 { 1708 return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1)); 1709 } 1710 1711 boolean_t 1712 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, 1713 mlxcx_buffer_t **bufs, size_t nbufs) 1714 { 1715 uint_t index; 1716 mlxcx_recvq_ent_t *ent; 1717 mlxcx_completion_queue_t *cq; 1718 mlxcx_wqe_data_seg_t *seg; 1719 uint_t bi, ptri; 1720 const ddi_dma_cookie_t *c; 1721 mlxcx_buffer_t *buf; 1722 ddi_fm_error_t err; 1723 1724 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1725 cq = mlwq->mlwq_cq; 1726 ASSERT(mutex_owned(&cq->mlcq_mtx)); 1727 1728 for (bi = 0; bi < nbufs; ++bi) { 1729 buf = bufs[bi]; 1730 bufs[bi] = NULL; 1731 ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ); 1732 1733 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); 1734 ent = &mlwq->mlwq_recv_ent[index]; 1735 buf->mlb_wqe_index = mlwq->mlwq_pc; 1736 buf->mlb_wqebbs = 1; 1737 1738 ++mlwq->mlwq_pc; 1739 atomic_inc_64(&mlwq->mlwq_wqebb_used); 1740 1741 mutex_enter(&cq->mlcq_bufbmtx); 1742 list_insert_tail(&cq->mlcq_buffers, buf); 1743 atomic_inc_64(&cq->mlcq_bufcnt); 1744 mutex_exit(&cq->mlcq_bufbmtx); 1745 1746 ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS); 1747 ptri = 0; 1748 c = NULL; 1749 while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) { 1750 seg = &ent->mlrqe_data[ptri++]; 1751 seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); 1752 seg->mlds_byte_count = to_be32(c->dmac_size); 1753 seg->mlds_address = to_be64(c->dmac_laddress); 1754 } 1755 /* 1756 * Fill any unused scatter pointers with the special null 1757 * value. 1758 */ 1759 for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) { 1760 seg = &ent->mlrqe_data[ptri]; 1761 seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); 1762 seg->mlds_byte_count = to_be32(0); 1763 seg->mlds_address = to_be64(0); 1764 } 1765 1766 /* 1767 * Make sure the workqueue entry is flushed out before updating 1768 * the doorbell. 1769 */ 1770 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1771 (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent, 1772 sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV)); 1773 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err, 1774 DDI_FME_VERSION); 1775 if (err.fme_status != DDI_FM_OK) { 1776 return (B_FALSE); 1777 } 1778 } 1779 1780 mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc); 1781 /* 1782 * Flush the CQ doorbell as well so that HW knows how many 1783 * completions we've consumed. 
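	 * Both doorbell syncs below are followed by an FM error check;
	 * a failed sync causes us to return B_FALSE to the caller.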
1784 */ 1785 MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV); 1786 ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err, 1787 DDI_FME_VERSION); 1788 if (err.fme_status != DDI_FM_OK) { 1789 return (B_FALSE); 1790 } 1791 MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV); 1792 ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err, 1793 DDI_FME_VERSION); 1794 if (err.fme_status != DDI_FM_OK) { 1795 return (B_FALSE); 1796 } 1797 return (B_TRUE); 1798 } 1799 1800 static void 1801 mlxcx_rq_refill_task(void *arg) 1802 { 1803 mlxcx_work_queue_t *wq = arg; 1804 mlxcx_completion_queue_t *cq = wq->mlwq_cq; 1805 mlxcx_t *mlxp = wq->mlwq_mlx; 1806 mlxcx_buf_shard_t *s = wq->mlwq_bufs; 1807 boolean_t refill, draining; 1808 1809 do { 1810 /* 1811 * Wait here until one of 3 conditions: 1812 * 1. The shard is draining, or 1813 * 2. There are buffers on the free list, or 1814 * 3. The WQ is being shut down. 1815 */ 1816 mutex_enter(&s->mlbs_mtx); 1817 while (s->mlbs_state != MLXCX_SHARD_DRAINING && 1818 list_is_empty(&s->mlbs_free) && 1819 (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) { 1820 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 1821 } 1822 1823 draining = (s->mlbs_state == MLXCX_SHARD_DRAINING); 1824 mutex_exit(&s->mlbs_mtx); 1825 1826 mutex_enter(&cq->mlcq_mtx); 1827 mutex_enter(&wq->mlwq_mtx); 1828 1829 if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) { 1830 refill = B_FALSE; 1831 wq->mlwq_state &= ~MLXCX_WQ_REFILLING; 1832 } else { 1833 mlxcx_rq_refill(mlxp, wq); 1834 1835 if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) { 1836 refill = B_TRUE; 1837 } else { 1838 refill = B_FALSE; 1839 wq->mlwq_state &= ~MLXCX_WQ_REFILLING; 1840 } 1841 } 1842 1843 mutex_exit(&wq->mlwq_mtx); 1844 mutex_exit(&cq->mlcq_mtx); 1845 } while (refill); 1846 } 1847 1848 void 1849 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) 1850 { 1851 size_t target, current, want, done, n; 1852 mlxcx_completion_queue_t *cq; 1853 mlxcx_ring_group_t *g; 1854 mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP]; 1855 uint_t i; 1856 1857 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1858 cq = mlwq->mlwq_cq; 1859 ASSERT(mutex_owned(&cq->mlcq_mtx)); 1860 1861 ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS); 1862 1863 target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP; 1864 cq = mlwq->mlwq_cq; 1865 1866 if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0) 1867 return; 1868 1869 if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) 1870 return; 1871 1872 current = cq->mlcq_bufcnt; 1873 1874 if (current >= target - MLXCX_RQ_REFILL_STEP) 1875 return; 1876 1877 want = target - current; 1878 done = 0; 1879 1880 while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) { 1881 n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP); 1882 if (n == 0) { 1883 /* 1884 * We didn't get any buffers from the free queue. 1885 * It might not be an issue, schedule a taskq 1886 * to wait for free buffers if the completion 1887 * queue is low. 
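	 * The MLXCX_WQ_REFILLING flag checked below makes sure we only
	 * ever have one such task queued per work queue; the task clears
	 * the flag when it decides no further refill passes are needed.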
1888 */ 1889 if (current < MLXCX_RQ_REFILL_STEP && 1890 (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) { 1891 mlwq->mlwq_state |= MLXCX_WQ_REFILLING; 1892 g = mlwq->mlwq_group; 1893 taskq_dispatch_ent(g->mlg_refill_tq, 1894 mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP, 1895 &mlwq->mlwq_tqe); 1896 } 1897 1898 return; 1899 } 1900 1901 if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) { 1902 for (i = 0; i < n; ++i) 1903 mlxcx_buf_return(mlxp, b[i]); 1904 return; 1905 } 1906 if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) { 1907 /* 1908 * mlxcx_rq_add_buffers NULLs out the buffers as it 1909 * enqueues them, so any that are non-NULL we have to 1910 * free now. The others now belong to the WQ, even if 1911 * we failed. 1912 */ 1913 for (i = 0; i < n; ++i) { 1914 if (b[i] != NULL) { 1915 mlxcx_buf_return(mlxp, b[i]); 1916 } 1917 } 1918 return; 1919 } 1920 done += n; 1921 } 1922 } 1923 1924 static const char * 1925 mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy) 1926 { 1927 switch (sy) { 1928 case MLXCX_CQ_ERR_LOCAL_LENGTH: 1929 return ("LOCAL_LENGTH"); 1930 case MLXCX_CQ_ERR_LOCAL_QP_OP: 1931 return ("LOCAL_QP_OP"); 1932 case MLXCX_CQ_ERR_LOCAL_PROTECTION: 1933 return ("LOCAL_PROTECTION"); 1934 case MLXCX_CQ_ERR_WR_FLUSHED: 1935 return ("WR_FLUSHED"); 1936 case MLXCX_CQ_ERR_MEM_WINDOW_BIND: 1937 return ("MEM_WINDOW_BIND"); 1938 case MLXCX_CQ_ERR_BAD_RESPONSE: 1939 return ("BAD_RESPONSE"); 1940 case MLXCX_CQ_ERR_LOCAL_ACCESS: 1941 return ("LOCAL_ACCESS"); 1942 case MLXCX_CQ_ERR_XPORT_RETRY_CTR: 1943 return ("XPORT_RETRY_CTR"); 1944 case MLXCX_CQ_ERR_RNR_RETRY_CTR: 1945 return ("RNR_RETRY_CTR"); 1946 case MLXCX_CQ_ERR_ABORTED: 1947 return ("ABORTED"); 1948 default: 1949 return ("UNKNOWN"); 1950 } 1951 } 1952 1953 static void 1954 mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 1955 mlxcx_completionq_error_ent_t *ent) 1956 { 1957 uint64_t ena; 1958 char buf[FM_MAX_CLASS]; 1959 const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome); 1960 1961 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1962 return; 1963 1964 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1965 MLXCX_FM_SERVICE_MLXCX, "cqe.err"); 1966 ena = fm_ena_generate(0, FM_ENA_FMT1); 1967 1968 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1969 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1970 "syndrome", DATA_TYPE_STRING, name, 1971 "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome, 1972 "vendor_syndrome", DATA_TYPE_UINT8, 1973 ent->mlcqee_vendor_error_syndrome, 1974 "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter), 1975 "wq_type", DATA_TYPE_STRING, 1976 (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? 
"send": "recv", 1977 "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num, 1978 "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num, 1979 NULL); 1980 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1981 } 1982 1983 void 1984 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 1985 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) 1986 { 1987 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 1988 if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) { 1989 mlxcx_completionq_error_ent_t *eent = 1990 (mlxcx_completionq_error_ent_t *)ent; 1991 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); 1992 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 1993 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); 1994 mlxcx_check_sq(mlxp, mlcq->mlcq_wq); 1995 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); 1996 return; 1997 } 1998 1999 if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) { 2000 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); 2001 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2002 return; 2003 } 2004 2005 if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) { 2006 mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x", 2007 ent->mlcqe_send_wqe_opcode); 2008 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2009 return; 2010 } 2011 2012 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { 2013 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); 2014 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2015 return; 2016 } 2017 2018 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2019 } 2020 2021 mblk_t * 2022 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 2023 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) 2024 { 2025 uint32_t chkflags = 0; 2026 uint_t wqe_index; 2027 ddi_fm_error_t err; 2028 2029 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 2030 2031 if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) { 2032 mlxcx_completionq_error_ent_t *eent = 2033 (mlxcx_completionq_error_ent_t *)ent; 2034 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); 2035 mlxcx_buf_return(mlxp, buf); 2036 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); 2037 mlxcx_check_rq(mlxp, mlcq->mlcq_wq); 2038 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); 2039 return (NULL); 2040 } 2041 2042 if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) { 2043 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); 2044 mlxcx_buf_return(mlxp, buf); 2045 return (NULL); 2046 } 2047 2048 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { 2049 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); 2050 mlxcx_buf_return(mlxp, buf); 2051 return (NULL); 2052 } 2053 2054 if (ent->mlcqe_rx_drop_counter > 0) { 2055 atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops, 2056 ent->mlcqe_rx_drop_counter); 2057 } 2058 2059 MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU); 2060 ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err, 2061 DDI_FME_VERSION); 2062 if (err.fme_status != DDI_FM_OK) { 2063 ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle, 2064 DDI_FME_VERSION); 2065 mlxcx_buf_return(mlxp, buf); 2066 return (NULL); 2067 } 2068 2069 /* 2070 * mlxcx_buf_loan() will set mlb_wqe_index to zero. 2071 * Remember it for later. 
mblk_t *
mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
	uint32_t chkflags = 0;
	uint_t wqe_index;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
		mlxcx_completionq_error_ent_t *eent =
		    (mlxcx_completionq_error_ent_t *)ent;
		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
		mlxcx_buf_return(mlxp, buf);
		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
		return (NULL);
	}

	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	if (ent->mlcqe_rx_drop_counter > 0) {
		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
		    ent->mlcqe_rx_drop_counter);
	}

	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	/*
	 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
	 * Remember it for later.
	 */
	wqe_index = buf->mlb_wqe_index;

	if (!mlxcx_buf_loan(mlxp, buf)) {
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	buf->mlb_mp->b_next = NULL;
	buf->mlb_mp->b_cont = NULL;
	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
	    from_be32(ent->mlcqe_byte_cnt);

	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
		chkflags |= HCK_FULLCKSUM_OK;
	}
	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
		chkflags |= HCK_IPV4_HDRCKSUM_OK;
	}
	if (chkflags != 0) {
		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
		    from_be16(ent->mlcqe_checksum), chkflags);
	}

	/*
	 * Don't check if a refill is needed on every single completion,
	 * since checking involves taking the RQ lock.
	 */
	if ((wqe_index & 0x7) == 0) {
		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
		ASSERT(wq != NULL);
		mutex_enter(&wq->mlwq_mtx);
		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
			mlxcx_rq_refill(mlxp, wq);
		mutex_exit(&wq->mlwq_mtx);
	}

	return (buf->mlb_mp);
}

static void
mlxcx_buf_mp_return(caddr_t arg)
{
	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
	mlxcx_t *mlxp = b->mlb_mlx;

	/* The mblk has been used now, so NULL it out. */
	b->mlb_mp = NULL;

	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
		mlxcx_buf_return(mlxp, b);
}

boolean_t
mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_FALSE;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
	b->mlb_frtn.free_arg = (caddr_t)b;
	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);

	*bp = b;

	return (B_TRUE);
}

boolean_t
mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
    mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_TRUE;

	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	*bp = b;

	return (B_TRUE);
}

static mlxcx_buffer_t *
mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		ASSERT(b->mlb_foreign);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}
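
/*
 * Copy sz bytes starting at rptr into a buffer freshly taken from the queue's
 * local (non-foreign) shard and sync it for the device. If the DMA handle
 * reports an FM error after the sync, the buffer is returned and the copy is
 * retried with another buffer, up to MLXCX_BUF_BIND_MAX_ATTEMTPS times,
 * before giving up and returning NULL.
 */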
static mlxcx_buffer_t *
mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr,
    size_t sz)
{
	ddi_fm_error_t err;
	mlxcx_buffer_t *b;
	uint_t attempts = 0;

copyb:
	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
		return (NULL);

	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
	bcopy(rptr, b->mlb_dma.mxdb_va, sz);

	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);

	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, b);
		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
			return (NULL);
		}
		goto copyb;
	}

	return (b);
}

static mlxcx_buffer_t *
mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mp, size_t off)
{
	mlxcx_buffer_t *b;
	uint8_t *rptr;
	size_t sz;
	boolean_t ret;

	rptr = mp->b_rptr;
	sz = MBLKL(mp);

#ifdef DEBUG
	if (off > 0) {
		ASSERT3U(off, <, sz);
	}
#endif

	rptr += off;
	sz -= off;

	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
	} else {
		b = mlxcx_buf_take_foreign(mlxp, wq);
		if (b == NULL)
			return (NULL);

		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
		    B_FALSE);

		if (!ret) {
			mlxcx_buf_return(mlxp, b);

			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
		}
	}

	return (b);
}
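
/*
 * Prepare a TX buffer chain for an outbound message. Each mblk in the chain
 * gets its own mlxcx_buffer_t (copied or bound by mlxcx_bind_or_copy_mblk);
 * the first becomes the chain head (b0) and subsequent buffers are placed on
 * b0->mlb_tx_chain in message order. As a purely illustrative sketch of the
 * result for a three-mblk message (names here are for illustration only):
 *
 *	b0 (MLXCX_BUFFER_ON_WQ, mlb_tx_head == b0, mlb_tx_mp == mp1)
 *	    mlb_tx_chain: b1 (ON_CHAIN, mp2) -> b2 (ON_CHAIN, mp3)
 *
 * If the chain would need more than MLXCX_SQE_MAX_PTRS DMA cookies, the
 * whole message is instead pulled up into a single mblk and bound or copied
 * as one buffer.
 */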
uint_t
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b, *b0 = NULL;
	boolean_t first = B_TRUE;
	mblk_t *mp;
	size_t offset = off;
	size_t ncookies = 0;
	uint_t count = 0;

	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
	    mp = mp->b_cont) {
		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
		if (b == NULL)
			goto failed;

		ncookies += b->mlb_dma.mxdb_ncookies;

		if (first)
			b0 = b;

		if (!first)
			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;

		b->mlb_tx_mp = mp;
		b->mlb_tx_head = b0;
		b->mlb_used = MBLKL(mp) - offset;

		if (!first)
			list_insert_tail(&b0->mlb_tx_chain, b);
		first = B_FALSE;
		offset = 0;

		count++;
	}

	/*
	 * The chain of mblks has resulted in too many cookies for a single
	 * message. This is unusual, so take the hit: tidy up, pull the
	 * message up into a single mblk, and allocate the requisite buffer.
	 */
	if (ncookies > MLXCX_SQE_MAX_PTRS) {
		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
		    mblk_t *, mpb, size_t, ncookies);

		if (b0 != NULL)
			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

		if ((mp = msgpullup(mpb, -1)) == NULL)
			return (0);

		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
		if (b0 == NULL) {
			freemsg(mp);
			return (0);
		}
		freemsg(mpb);

		b0->mlb_tx_mp = mp;
		b0->mlb_tx_head = b0;
		b0->mlb_used = MBLKL(mp) - off;

		count = 1;
	}

	*bp = b0;

	return (count);

failed:
	if (b0 != NULL)
		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

	return (0);
}

mlxcx_buffer_t *
mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mlxcx_buffer_t **bp, size_t nbufs)
{
	mlxcx_buffer_t *b;
	size_t done = 0;
	mlxcx_buf_shard_t *s;

	s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (0);
	}

	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
		bp[done++] = b;
	}
	mutex_exit(&s->mlbs_mtx);
	return (done);
}

boolean_t
mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	if (b->mlb_mp == NULL) {
		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
		if (b->mlb_mp == NULL)
			return (B_FALSE);
	}

	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
	b->mlb_wqe_index = 0;

	mutex_enter(&s->mlbs_mtx);
	list_remove(&s->mlbs_busy, b);
	list_insert_tail(&s->mlbs_loaned, b);
	mutex_exit(&s->mlbs_mtx);

	return (B_TRUE);
}

void
mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
{
	mlxcx_buffer_t *b;

	if (b0->mlb_tx_head != b0) {
		mlxcx_buf_return(mlxp, b0);
		return;
	}

	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
		mlxcx_buf_return(mlxp, b);
	}
	if (keepmp) {
		b0->mlb_tx_mp = NULL;
		b0->mlb_tx_head = NULL;
	}
	mlxcx_buf_return(mlxp, b0);
}
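
/*
 * Return a buffer to its shard once neither the hardware nor MAC holds a
 * reference to it. Which list it comes off depends on the state it was in:
 * ON_WQ and ON_CHAIN buffers come off mlbs_busy, ON_LOAN buffers come off
 * mlbs_loaned (and are destroyed outright if the shard is draining). In all
 * other cases the buffer ends up back on mlbs_free and any waiters on
 * mlbs_free_nonempty are woken.
 */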
void
mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buffer_state_t oldstate = b->mlb_state;
	mlxcx_buffer_t *txhead = b->mlb_tx_head;
	mlxcx_buf_shard_t *s = b->mlb_shard;
	mblk_t *mp = b->mlb_tx_mp;

	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	/*
	 * The mlbs_mtx held below is a heavily contended lock, so it is
	 * imperative we do as much of the buffer cleanup outside the lock
	 * as possible.
	 */
	b->mlb_state = MLXCX_BUFFER_FREE;
	b->mlb_wqe_index = 0;
	b->mlb_tx_head = NULL;
	b->mlb_tx_mp = NULL;
	b->mlb_used = 0;
	b->mlb_wqebbs = 0;
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	if (b->mlb_foreign) {
		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
		}
	}

	mutex_enter(&s->mlbs_mtx);
	switch (oldstate) {
	case MLXCX_BUFFER_INIT:
		break;
	case MLXCX_BUFFER_ON_WQ:
		list_remove(&s->mlbs_busy, b);
		break;
	case MLXCX_BUFFER_ON_LOAN:
		ASSERT(!b->mlb_foreign);
		list_remove(&s->mlbs_loaned, b);
		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
			/*
			 * When we're draining, e.g. during mac_stop(),
			 * we destroy the buffer immediately rather than
			 * recycling it. Otherwise we risk leaving it
			 * on the free list and leaking it.
			 */
			list_insert_tail(&s->mlbs_free, b);
			mlxcx_buf_destroy(mlxp, b);
			/*
			 * Teardown might be waiting for the loaned list
			 * to empty.
			 */
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
			return;
		}
		break;
	case MLXCX_BUFFER_FREE:
		VERIFY(0);
		break;
	case MLXCX_BUFFER_ON_CHAIN:
		ASSERT(txhead != NULL);
		list_remove(&txhead->mlb_tx_chain, b);
		list_remove(&s->mlbs_busy, b);
		break;
	}

	list_insert_tail(&s->mlbs_free, b);
	cv_broadcast(&s->mlbs_free_nonempty);

	mutex_exit(&s->mlbs_mtx);

	/*
	 * For TX chain heads, free the mblk_t after we let go of the lock.
	 * This might be a borrowed buf that we in turn loaned to MAC, in which
	 * case calling freemsg() on it will re-enter this very function -- so
	 * we better not be holding the lock!
	 */
	if (txhead == b)
		freemsg(mp);
}

void
mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
	    b->mlb_state == MLXCX_BUFFER_INIT);
	ASSERT(mutex_owned(&s->mlbs_mtx));

	if (b->mlb_state == MLXCX_BUFFER_FREE)
		list_remove(&s->mlbs_free, b);

	/*
	 * This is going back to the kmem cache, so it needs to be set up in
	 * the same way we expect a new buffer to come out (state INIT, other
	 * fields NULL'd).
	 */
	b->mlb_state = MLXCX_BUFFER_INIT;
	b->mlb_shard = NULL;
	if (b->mlb_mp != NULL) {
		/*
		 * freeb() runs the desballoc free routine,
		 * mlxcx_buf_mp_return(), which clears mlb_mp for us.
		 */
		freeb(b->mlb_mp);
		ASSERT(b->mlb_mp == NULL);
	}
	mlxcx_dma_free(&b->mlb_dma);
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	kmem_cache_free(mlxp->mlx_bufs_cache, b);
}

void
mlxcx_shard_ready(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_READY;
	mutex_exit(&s->mlbs_mtx);
}

void
mlxcx_shard_draining(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_DRAINING;
	cv_broadcast(&s->mlbs_free_nonempty);
	mutex_exit(&s->mlbs_mtx);
}
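
/*
 * A shard moves between just two states: MLXCX_SHARD_READY, in which buffers
 * may be taken and loaned out, and MLXCX_SHARD_DRAINING, in which takes fail
 * and returned loaned buffers are destroyed rather than recycled. A purely
 * illustrative sketch of how a teardown path might use this (not a copy of
 * the actual teardown code elsewhere in the driver):
 *
 *	mlxcx_shard_draining(s);
 *	mutex_enter(&s->mlbs_mtx);
 *	while (!list_is_empty(&s->mlbs_loaned))
 *		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
 *	mutex_exit(&s->mlbs_mtx);
 *
 * mlxcx_shard_draining() broadcasts on mlbs_free_nonempty itself so that
 * anything already sleeping in mlxcx_rq_refill_task() re-checks the shard
 * state and backs off.
 */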