/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2023 The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>

#include <sys/mac_provider.h>

#include <sys/random.h>

#include <mlxcx.h>

boolean_t
mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz;

	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);

	/* Receive and send queue entries might be different sizes. */
	switch (mlwq->mlwq_type) {
	case MLXCX_WQ_TYPE_SENDQ:
		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
		break;
	case MLXCX_WQ_TYPE_RECVQ:
		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
		break;
	default:
		VERIFY(0);
		return (B_FALSE);
	}
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ memory");
		return (B_FALSE);
	}

	/*
	 * Just set the first pointer in the union. Yes, this is a strict
	 * aliasing violation. No, I don't care.
	 */
	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_workq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
		mlxcx_dma_free(&mlwq->mlwq_dma);
		mlwq->mlwq_send_ent = NULL;
		return (B_FALSE);
	}

	mlwq->mlwq_doorbell =
	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;

	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;

	return (B_TRUE);
}

void
mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);

	mlxcx_dma_free(&mlwq->mlwq_dma);
	mlwq->mlwq_send_ent = NULL;
	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
	mlwq->mlwq_doorbell = NULL;

	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
}

static boolean_t
mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    uint_t ent_shift)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);

	mlcq->mlcq_entshift = ent_shift;
	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ memory");
		return (B_FALSE);
	}

	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;

	for (i = 0; i < mlcq->mlcq_nents; ++i) {
		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
	}

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_completionq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
		mlxcx_dma_free(&mlcq->mlcq_dma);
		mlcq->mlcq_ent = NULL;
		return (B_FALSE);
	}

	mlcq->mlcq_doorbell =
	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;

	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ALLOC);

	return (B_TRUE);
}

static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	mlxcx_dma_free(&mlcq->mlcq_dma);
	mlcq->mlcq_ent = NULL;
	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
	mlcq->mlcq_doorbell = NULL;

	atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ALLOC);
}

void
mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	mlxcx_completion_queue_t *mlcq;

	/*
	 * If something is holding the lock on a long operation like a
	 * refill, setting this flag asks them to exit early if possible.
	 */
	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);

	mutex_enter(&mlwq->mlwq_mtx);

	list_remove(&mlxp->mlx_wqs, mlwq);

	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "send queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "send queue num %x", mlwq->mlwq_num);
		}
	}
	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
		mlxcx_wq_rele_dma(mlxp, mlwq);
	}
	mlcq = mlwq->mlwq_cq;

	/* These will be released by mlxcx_teardown_bufs() */
	mlwq->mlwq_bufs = NULL;
	mlwq->mlwq_foreign_bufs = NULL;

	mutex_exit(&mlwq->mlwq_mtx);

	mutex_enter(&mlcq->mlcq_mtx);
	mutex_enter(&mlwq->mlwq_mtx);
	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
	mlcq->mlcq_wq = NULL;
	mutex_exit(&mlwq->mlwq_mtx);
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlwq->mlwq_mtx);
}

void
mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	mlxcx_event_queue_t *mleq;
	mlxcx_buffer_t *b;

	/*
	 * If something is holding the lock on a long operation like polling
	 * which we're going to abort anyway, this flag asks them to exit
	 * early if possible.
	 */
	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);

	mutex_enter(&mlcq->mlcq_mtx);

	list_remove(&mlxp->mlx_cqs, mlcq);

	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "completion queue num %u",
			    mlcq->mlcq_num);
		}
	}
	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
		mlxcx_cq_rele_dma(mlxp, mlcq);
	}
	/*
	 * If we're on an EQ AVL tree, then we need to grab
	 * the EQ's mutex to take it off. The ISR always takes
	 * EQ mutex before CQ mutex, so we have to let go of
	 * the CQ mutex then come back again.
	 *
	 * The ISR will bail out if it tries to touch this CQ now since
	 * we added the CQ_DESTROYED flag above.
	 */
	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
		mleq = mlcq->mlcq_eq;
	} else {
		mleq = NULL;
	}

	/* Return any outstanding buffers to the free pool. */
	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_enter(&mlcq->mlcq_bufbmtx);
	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_exit(&mlcq->mlcq_bufbmtx);

	/*
	 * Since the interrupt handlers take the EQ lock before the CQ one,
	 * we must do the same here. That means letting go of the lock
	 * for a brief window here (we'll double-check the state when we
	 * get back in).
	 */
	mutex_exit(&mlcq->mlcq_mtx);

	if (mleq != NULL) {
		mutex_enter(&mleq->mleq_mtx);
		mutex_enter(&mlcq->mlcq_mtx);
		/*
		 * Double-check the state, we let go of the
		 * mutex briefly.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
			avl_remove(&mleq->mleq_cqs, mlcq);
			atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_EQAVL);
		}
		mutex_exit(&mlcq->mlcq_mtx);
		mutex_exit(&mleq->mleq_mtx);
	}

	mutex_enter(&mlcq->mlcq_mtx);
	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlcq->mlcq_mtx);
	mutex_destroy(&mlcq->mlcq_arm_mtx);
	mutex_destroy(&mlcq->mlcq_bufbmtx);
	list_destroy(&mlcq->mlcq_buffers);
	list_destroy(&mlcq->mlcq_buffers_b);
	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
}

static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
    mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
	mlxcx_completion_queue_t *cq;

	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_init(&cq->mlcq_arm_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_cq_entry));

	cq->mlcq_mlx = mlxp;
	list_insert_tail(&mlxp->mlx_cqs, cq);

	mutex_enter(&cq->mlcq_mtx);

	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
		mutex_exit(&cq->mlcq_mtx);
		return (B_FALSE);
	}

	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;

	cq->mlcq_uar = &mlxp->mlx_uar;
	cq->mlcq_eq = eq;

	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;

	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
		mutex_exit(&cq->mlcq_mtx);
		return (B_FALSE);
	}

	mutex_exit(&cq->mlcq_mtx);

	mutex_enter(&eq->mleq_mtx);
	mutex_enter(&cq->mlcq_arm_mtx);
	mutex_enter(&cq->mlcq_mtx);
	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
	avl_add(&eq->mleq_cqs, cq);
	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_EQAVL);
	mlxcx_arm_cq(mlxp, cq);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&cq->mlcq_arm_mtx);
	mutex_exit(&eq->mleq_mtx);

	*cqp = cq;
	return (B_TRUE);
}

static boolean_t
mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
    mlxcx_work_queue_t *wq)
{
	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));

	list_insert_tail(&mlxp->mlx_wqs, wq);

	mutex_enter(&wq->mlwq_mtx);

	wq->mlwq_mlx = mlxp;
	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
	wq->mlwq_cq = cq;
	wq->mlwq_pd = &mlxp->mlx_pd;
	wq->mlwq_uar = &mlxp->mlx_uar;

	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);

	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

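	/*
	 * The WQ and CQ are cross-linked below. Drop the WQ mutex first so
	 * that the CQ mutex can be taken ahead of it, matching the
	 * CQ-before-WQ lock ordering used when the link is undone in
	 * mlxcx_wq_teardown().
	 */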
	mutex_exit(&wq->mlwq_mtx);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);
	ASSERT3P(cq->mlcq_wq, ==, NULL);
	cq->mlcq_wq = wq;
	mutex_exit(&wq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);

	return (B_TRUE);
}

static boolean_t
mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
    mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
{
	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));

	list_insert_tail(&mlxp->mlx_wqs, wq);

	mutex_enter(&wq->mlwq_mtx);

	wq->mlwq_mlx = mlxp;
	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
	wq->mlwq_cq = cq;
	wq->mlwq_pd = &mlxp->mlx_pd;
	wq->mlwq_uar = &mlxp->mlx_uar;
	wq->mlwq_tis = tis;

	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);

	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;

	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

	mutex_exit(&wq->mlwq_mtx);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);
	ASSERT3P(cq->mlcq_wq, ==, NULL);
	cq->mlcq_wq = wq;
	mutex_exit(&wq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);

	return (B_TRUE);
}

/*
 * Before we tear down the queues associated with the rx group,
 * flag each cq as being torn down and wake up any tasks.
 */
static void
mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	mlxcx_buf_shard_t *s;
	uint_t i;

	mutex_enter(&g->mlg_mtx);

	for (i = 0; i < g->mlg_nwqs; ++i) {
		wq = &g->mlg_wqs[i];
		cq = wq->mlwq_cq;
		if (cq != NULL) {
			s = wq->mlwq_bufs;
			mutex_enter(&s->mlbs_mtx);
			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
		}
	}

	mutex_exit(&g->mlg_mtx);
}

void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_table_t *ft;
	uint_t i;

	mutex_enter(&g->mlg_port->mlp_mtx);
	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);

		if (g->mlg_rx_vlan_ft != NULL)
			mlxcx_remove_all_vlan_entries(mlxp, g);

		if (g == &mlxp->mlx_rx_groups[0]) {
			ft = g->mlg_port->mlp_rx_flow;
			mutex_enter(&ft->mlft_mtx);

			fg = g->mlg_port->mlp_bcast;
			fe = list_head(&fg->mlfg_entries);
			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
				(void) mlxcx_cmd_delete_flow_table_entry(
				    mlxp, fe);
			}

			fg = g->mlg_port->mlp_promisc;
			fe = list_head(&fg->mlfg_entries);
			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
				(void) mlxcx_cmd_delete_flow_table_entry(
				    mlxp, fe);
			}

			mutex_exit(&ft->mlft_mtx);
		}

		if (g->mlg_rx_vlan_ft != NULL) {
			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
			ASSERT(list_is_empty(&g->mlg_rx_vlans));
			fg = g->mlg_rx_vlan_def_fg;
			if (fg != NULL) {
				fe =
list_head(&fg->mlfg_entries); 563 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 564 (void) 565 mlxcx_cmd_delete_flow_table_entry( 566 mlxp, fe); 567 } 568 } 569 fg = g->mlg_rx_vlan_promisc_fg; 570 if (fg != NULL) { 571 fe = list_head(&fg->mlfg_entries); 572 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 573 (void) 574 mlxcx_cmd_delete_flow_table_entry( 575 mlxp, fe); 576 } 577 } 578 mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft); 579 list_destroy(&g->mlg_rx_vlans); 580 581 g->mlg_rx_vlan_ft = NULL; 582 } 583 584 mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx); 585 mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft); 586 g->mlg_rx_hash_ft = NULL; 587 588 avl_destroy(&g->mlg_rx_macs); 589 g->mlg_state &= ~MLXCX_GROUP_FLOWS; 590 } 591 592 if (g->mlg_state & MLXCX_GROUP_RUNNING) { 593 for (i = 0; i < g->mlg_nwqs; ++i) { 594 wq = &g->mlg_wqs[i]; 595 mutex_enter(&wq->mlwq_mtx); 596 if (wq->mlwq_state & MLXCX_WQ_STARTED && 597 !mlxcx_cmd_stop_rq(mlxp, wq)) { 598 mlxcx_warn(mlxp, "failed to stop rq %x", 599 wq->mlwq_num); 600 } 601 mutex_exit(&wq->mlwq_mtx); 602 } 603 taskq_destroy(g->mlg_refill_tq); 604 g->mlg_state &= ~MLXCX_GROUP_RUNNING; 605 } 606 607 if (g->mlg_state & MLXCX_GROUP_TIRTIS) { 608 for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) { 609 mlxcx_tir_t *tir = &g->mlg_tir[i]; 610 if (tir->mltir_state & MLXCX_TIR_CREATED && 611 !(tir->mltir_state & MLXCX_TIR_DESTROYED)) { 612 if (!mlxcx_cmd_destroy_tir(mlxp, tir)) { 613 mlxcx_warn(mlxp, 614 "failed to destroy tir %u " 615 "for rx ring", tir->mltir_num); 616 } 617 } 618 } 619 g->mlg_state &= ~MLXCX_GROUP_TIRTIS; 620 } 621 622 if (g->mlg_state & MLXCX_GROUP_RQT) { 623 if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED && 624 !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) { 625 if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) { 626 mlxcx_warn(mlxp, "failed to destroy rqt %u " 627 "for rx ring", g->mlg_rqt->mlrqt_num); 628 } 629 kmem_free(g->mlg_rqt->mlrqt_rq, 630 g->mlg_rqt->mlrqt_rq_size); 631 g->mlg_rqt->mlrqt_rq = NULL; 632 kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t)); 633 g->mlg_rqt = NULL; 634 } 635 g->mlg_state &= ~MLXCX_GROUP_RQT; 636 } 637 638 for (i = 0; i < g->mlg_nwqs; ++i) { 639 wq = &g->mlg_wqs[i]; 640 cq = wq->mlwq_cq; 641 mlxcx_wq_teardown(mlxp, wq); 642 if (cq != NULL) 643 mlxcx_cq_teardown(mlxp, cq); 644 } 645 kmem_free(g->mlg_wqs, g->mlg_wqs_size); 646 g->mlg_wqs = NULL; 647 g->mlg_state &= ~MLXCX_GROUP_WQS; 648 649 mutex_exit(&g->mlg_mtx); 650 mutex_exit(&g->mlg_port->mlp_mtx); 651 652 mutex_destroy(&g->mlg_mtx); 653 654 g->mlg_state &= ~MLXCX_GROUP_INIT; 655 ASSERT3S(g->mlg_state, ==, 0); 656 } 657 658 void 659 mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 660 { 661 mlxcx_work_queue_t *wq; 662 mlxcx_completion_queue_t *cq; 663 uint_t i; 664 665 mutex_enter(&g->mlg_mtx); 666 667 if (g->mlg_state & MLXCX_GROUP_WQS) { 668 for (i = 0; i < g->mlg_nwqs; ++i) { 669 wq = &g->mlg_wqs[i]; 670 mutex_enter(&wq->mlwq_mtx); 671 cq = wq->mlwq_cq; 672 if (wq->mlwq_state & MLXCX_WQ_STARTED && 673 !mlxcx_cmd_stop_sq(mlxp, wq)) { 674 mlxcx_warn(mlxp, "failed to stop sq %x", 675 wq->mlwq_num); 676 } 677 mutex_exit(&wq->mlwq_mtx); 678 mlxcx_wq_teardown(mlxp, wq); 679 if (cq != NULL) 680 mlxcx_cq_teardown(mlxp, cq); 681 } 682 g->mlg_state &= ~MLXCX_GROUP_RUNNING; 683 kmem_free(g->mlg_wqs, g->mlg_wqs_size); 684 g->mlg_wqs = NULL; 685 g->mlg_state &= ~MLXCX_GROUP_WQS; 686 } 687 688 if ((g->mlg_state & MLXCX_GROUP_TIRTIS) && 689 g->mlg_tis.mltis_state & MLXCX_TIS_CREATED && 690 !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) { 691 
if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) { 692 mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring", 693 g->mlg_tis.mltis_num); 694 } 695 } 696 g->mlg_state &= ~MLXCX_GROUP_TIRTIS; 697 698 mutex_exit(&g->mlg_mtx); 699 mutex_destroy(&g->mlg_mtx); 700 g->mlg_state &= ~MLXCX_GROUP_INIT; 701 ASSERT3S(g->mlg_state, ==, 0); 702 } 703 704 void 705 mlxcx_teardown_groups(mlxcx_t *mlxp) 706 { 707 mlxcx_ring_group_t *g; 708 uint_t i; 709 710 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 711 g = &mlxp->mlx_rx_groups[i]; 712 if (!(g->mlg_state & MLXCX_GROUP_INIT)) 713 continue; 714 ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX); 715 mlxcx_quiesce_rx_cqs(mlxp, g); 716 } 717 718 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 719 g = &mlxp->mlx_rx_groups[i]; 720 if (!(g->mlg_state & MLXCX_GROUP_INIT)) 721 continue; 722 mlxcx_teardown_rx_group(mlxp, g); 723 } 724 725 kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size); 726 mlxp->mlx_rx_groups = NULL; 727 728 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) { 729 g = &mlxp->mlx_tx_groups[i]; 730 if (!(g->mlg_state & MLXCX_GROUP_INIT)) 731 continue; 732 ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX); 733 mlxcx_teardown_tx_group(mlxp, g); 734 } 735 736 kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size); 737 mlxp->mlx_tx_groups = NULL; 738 } 739 740 boolean_t 741 mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 742 { 743 mlxcx_event_queue_t *eq; 744 mlxcx_completion_queue_t *cq; 745 mlxcx_work_queue_t *rq; 746 mlxcx_flow_table_t *ft; 747 mlxcx_flow_group_t *fg; 748 mlxcx_flow_entry_t *fe; 749 uint_t ent_shift; 750 uint_t i, j; 751 752 ASSERT3S(g->mlg_state, ==, 0); 753 754 mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER, 755 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 756 mutex_enter(&g->mlg_mtx); 757 g->mlg_mlx = mlxp; 758 g->mlg_type = MLXCX_GROUP_RX; 759 g->mlg_port = &mlxp->mlx_ports[0]; 760 g->mlg_state |= MLXCX_GROUP_INIT; 761 762 g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group; 763 i = g - &mlxp->mlx_rx_groups[0]; 764 if (i < mlxp->mlx_props.mldp_rx_ngroups_large) 765 g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group; 766 767 g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t); 768 g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP); 769 g->mlg_state |= MLXCX_GROUP_WQS; 770 771 g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP); 772 g->mlg_rqt->mlrqt_max = 2; 773 while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs) 774 g->mlg_rqt->mlrqt_max <<= 1; 775 g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max * 776 sizeof (mlxcx_work_queue_t *); 777 g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP); 778 g->mlg_state |= MLXCX_GROUP_RQT; 779 780 for (i = 0; i < g->mlg_nwqs; ++i) { 781 eq = NULL; 782 while (eq == NULL) { 783 eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++]; 784 if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count) 785 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0; 786 if (eq->mleq_type != MLXCX_EQ_TYPE_ANY && 787 eq->mleq_type != MLXCX_EQ_TYPE_RX) { 788 /* Try the next one */ 789 eq = NULL; 790 } 791 } 792 793 /* 794 * A single completion is indicated for each rq entry as 795 * it is used. So, the number of cq entries never needs 796 * to be larger than the rq. 
797 */ 798 ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift, 799 mlxp->mlx_props.mldp_rq_size_shift); 800 if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) { 801 g->mlg_nwqs = i; 802 break; 803 } 804 805 cq->mlcq_stats = &g->mlg_port->mlp_stats; 806 807 rq = &g->mlg_wqs[i]; 808 if (!mlxcx_rq_setup(mlxp, cq, rq)) { 809 g->mlg_nwqs = i; 810 break; 811 } 812 g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq; 813 g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY; 814 rq->mlwq_group = g; 815 } 816 if (g->mlg_nwqs == 0) { 817 mutex_exit(&g->mlg_mtx); 818 return (B_FALSE); 819 } 820 821 if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) { 822 mutex_exit(&g->mlg_mtx); 823 return (B_FALSE); 824 } 825 826 for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) { 827 mlxcx_tir_t *tir = &g->mlg_tir[i]; 828 tir->mltir_tdom = &mlxp->mlx_tdom; 829 switch (i) { 830 case MLXCX_TIR_ROLE_OTHER: 831 tir->mltir_type = MLXCX_TIR_DIRECT; 832 tir->mltir_rq = &g->mlg_wqs[0]; 833 break; 834 case MLXCX_TIR_ROLE_IPv4: 835 case MLXCX_TIR_ROLE_IPv6: 836 case MLXCX_TIR_ROLE_TCPv4: 837 case MLXCX_TIR_ROLE_TCPv6: 838 case MLXCX_TIR_ROLE_UDPv4: 839 case MLXCX_TIR_ROLE_UDPv6: 840 tir->mltir_type = MLXCX_TIR_INDIRECT; 841 tir->mltir_rqtable = g->mlg_rqt; 842 tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ; 843 (void) random_get_pseudo_bytes(tir->mltir_toeplitz_key, 844 sizeof (tir->mltir_toeplitz_key)); 845 break; 846 } 847 switch (i) { 848 case MLXCX_TIR_ROLE_OTHER: 849 break; 850 case MLXCX_TIR_ROLE_IPv4: 851 case MLXCX_TIR_ROLE_TCPv4: 852 case MLXCX_TIR_ROLE_UDPv4: 853 tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4; 854 tir->mltir_hash_fields = 855 MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP; 856 break; 857 case MLXCX_TIR_ROLE_IPv6: 858 case MLXCX_TIR_ROLE_TCPv6: 859 case MLXCX_TIR_ROLE_UDPv6: 860 tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6; 861 tir->mltir_hash_fields = 862 MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP; 863 break; 864 } 865 switch (i) { 866 case MLXCX_TIR_ROLE_OTHER: 867 case MLXCX_TIR_ROLE_IPv4: 868 case MLXCX_TIR_ROLE_IPv6: 869 break; 870 case MLXCX_TIR_ROLE_TCPv4: 871 case MLXCX_TIR_ROLE_TCPv6: 872 tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP; 873 tir->mltir_hash_fields |= 874 MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT; 875 break; 876 case MLXCX_TIR_ROLE_UDPv4: 877 case MLXCX_TIR_ROLE_UDPv6: 878 tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP; 879 tir->mltir_hash_fields |= 880 MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT; 881 break; 882 } 883 884 if (!mlxcx_cmd_create_tir(mlxp, tir)) { 885 mutex_exit(&g->mlg_mtx); 886 return (B_FALSE); 887 } 888 889 g->mlg_state |= MLXCX_GROUP_TIRTIS; 890 } 891 892 /* 893 * Flow table: our RX hashing breakout table for RSS 894 */ 895 896 g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 897 KM_SLEEP)); 898 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 899 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 900 avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare, 901 sizeof (mlxcx_group_mac_t), 902 offsetof(mlxcx_group_mac_t, mlgm_group_entry)); 903 g->mlg_state |= MLXCX_GROUP_FLOWS; 904 905 mutex_enter(&ft->mlft_mtx); 906 907 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 908 ft->mlft_level = 2; 909 ft->mlft_port = g->mlg_port; 910 ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT; 911 ft->mlft_nents = (1 << ft->mlft_entshift); 912 ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP); 913 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 914 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 915 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 916 offsetof(mlxcx_flow_group_t, 
mlfg_entry)); 917 918 for (j = 0; j < ft->mlft_nents; ++j) { 919 ft->mlft_ent[j].mlfe_table = ft; 920 ft->mlft_ent[j].mlfe_index = j; 921 } 922 923 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 924 mutex_exit(&ft->mlft_mtx); 925 mutex_exit(&g->mlg_mtx); 926 return (B_FALSE); 927 } 928 929 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 930 list_insert_tail(&ft->mlft_groups, fg); 931 fg->mlfg_table = ft; 932 fg->mlfg_size = 1; 933 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 934 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 935 mutex_exit(&ft->mlft_mtx); 936 mutex_exit(&g->mlg_mtx); 937 return (B_FALSE); 938 } 939 fe = list_head(&fg->mlfg_entries); 940 fe->mlfe_ip_version = 6; 941 fe->mlfe_ip_proto = IPPROTO_UDP; 942 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 943 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 944 &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6]; 945 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 946 mutex_exit(&ft->mlft_mtx); 947 mutex_exit(&g->mlg_mtx); 948 return (B_FALSE); 949 } 950 951 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 952 list_insert_tail(&ft->mlft_groups, fg); 953 fg->mlfg_table = ft; 954 fg->mlfg_size = 1; 955 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 956 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 957 mutex_exit(&ft->mlft_mtx); 958 mutex_exit(&g->mlg_mtx); 959 return (B_FALSE); 960 } 961 fe = list_head(&fg->mlfg_entries); 962 fe->mlfe_ip_version = 4; 963 fe->mlfe_ip_proto = IPPROTO_UDP; 964 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 965 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 966 &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4]; 967 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 968 mutex_exit(&ft->mlft_mtx); 969 mutex_exit(&g->mlg_mtx); 970 return (B_FALSE); 971 } 972 973 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 974 list_insert_tail(&ft->mlft_groups, fg); 975 fg->mlfg_table = ft; 976 fg->mlfg_size = 1; 977 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 978 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 979 mutex_exit(&ft->mlft_mtx); 980 mutex_exit(&g->mlg_mtx); 981 return (B_FALSE); 982 } 983 fe = list_head(&fg->mlfg_entries); 984 fe->mlfe_ip_version = 6; 985 fe->mlfe_ip_proto = IPPROTO_TCP; 986 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 987 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 988 &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6]; 989 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 990 mutex_exit(&ft->mlft_mtx); 991 mutex_exit(&g->mlg_mtx); 992 return (B_FALSE); 993 } 994 995 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 996 list_insert_tail(&ft->mlft_groups, fg); 997 fg->mlfg_table = ft; 998 fg->mlfg_size = 1; 999 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO; 1000 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1001 mutex_exit(&ft->mlft_mtx); 1002 mutex_exit(&g->mlg_mtx); 1003 return (B_FALSE); 1004 } 1005 fe = list_head(&fg->mlfg_entries); 1006 fe->mlfe_ip_version = 4; 1007 fe->mlfe_ip_proto = IPPROTO_TCP; 1008 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1009 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1010 &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4]; 1011 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1012 mutex_exit(&ft->mlft_mtx); 1013 mutex_exit(&g->mlg_mtx); 1014 return (B_FALSE); 1015 } 1016 1017 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1018 list_insert_tail(&ft->mlft_groups, fg); 1019 fg->mlfg_table = ft; 1020 fg->mlfg_size = 1; 1021 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER; 1022 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) 
{ 1023 mutex_exit(&ft->mlft_mtx); 1024 mutex_exit(&g->mlg_mtx); 1025 return (B_FALSE); 1026 } 1027 fe = list_head(&fg->mlfg_entries); 1028 fe->mlfe_ip_version = 6; 1029 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1030 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1031 &g->mlg_tir[MLXCX_TIR_ROLE_IPv6]; 1032 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1033 mutex_exit(&ft->mlft_mtx); 1034 mutex_exit(&g->mlg_mtx); 1035 return (B_FALSE); 1036 } 1037 1038 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1039 list_insert_tail(&ft->mlft_groups, fg); 1040 fg->mlfg_table = ft; 1041 fg->mlfg_size = 1; 1042 fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER; 1043 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1044 mutex_exit(&ft->mlft_mtx); 1045 mutex_exit(&g->mlg_mtx); 1046 return (B_FALSE); 1047 } 1048 fe = list_head(&fg->mlfg_entries); 1049 fe->mlfe_ip_version = 4; 1050 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1051 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1052 &g->mlg_tir[MLXCX_TIR_ROLE_IPv4]; 1053 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1054 mutex_exit(&ft->mlft_mtx); 1055 mutex_exit(&g->mlg_mtx); 1056 return (B_FALSE); 1057 } 1058 1059 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1060 list_insert_tail(&ft->mlft_groups, fg); 1061 fg->mlfg_table = ft; 1062 fg->mlfg_size = 1; 1063 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1064 mutex_exit(&ft->mlft_mtx); 1065 mutex_exit(&g->mlg_mtx); 1066 return (B_FALSE); 1067 } 1068 fe = list_head(&fg->mlfg_entries); 1069 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1070 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir = 1071 &g->mlg_tir[MLXCX_TIR_ROLE_OTHER]; 1072 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1073 mutex_exit(&ft->mlft_mtx); 1074 mutex_exit(&g->mlg_mtx); 1075 return (B_FALSE); 1076 } 1077 1078 mutex_exit(&ft->mlft_mtx); 1079 1080 /* 1081 * Flow table: the VLAN breakout table for doing VLAN filtering after 1082 * we've matched a MAC address. 
1083 */ 1084 1085 g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1086 KM_SLEEP)); 1087 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1088 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1089 list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t), 1090 offsetof(mlxcx_group_vlan_t, mlgv_entry)); 1091 1092 mutex_enter(&ft->mlft_mtx); 1093 1094 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1095 ft->mlft_level = 1; 1096 ft->mlft_port = g->mlg_port; 1097 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift; 1098 ft->mlft_nents = (1 << ft->mlft_entshift); 1099 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1100 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1101 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1102 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1103 1104 for (j = 0; j < ft->mlft_nents; ++j) { 1105 fe = &ft->mlft_ent[j]; 1106 fe->mlfe_table = ft; 1107 fe->mlfe_index = j; 1108 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1109 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; 1110 } 1111 1112 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1113 mutex_exit(&ft->mlft_mtx); 1114 mutex_exit(&g->mlg_mtx); 1115 return (B_FALSE); 1116 } 1117 1118 /* First group is all actual matched VLANs */ 1119 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1120 g->mlg_rx_vlan_fg = fg; 1121 list_insert_tail(&ft->mlft_groups, fg); 1122 fg->mlfg_table = ft; 1123 fg->mlfg_size = ft->mlft_nents - 2; 1124 fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN; 1125 fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID; 1126 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1127 mutex_exit(&ft->mlft_mtx); 1128 mutex_exit(&g->mlg_mtx); 1129 return (B_FALSE); 1130 } 1131 1132 /* 1133 * Then the "default" entry which we enable when we have no VLAN IDs 1134 * added to the group (we start with this enabled). 1135 */ 1136 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1137 g->mlg_rx_vlan_def_fg = fg; 1138 list_insert_tail(&ft->mlft_groups, fg); 1139 fg->mlfg_table = ft; 1140 fg->mlfg_size = 1; 1141 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1142 mutex_exit(&ft->mlft_mtx); 1143 mutex_exit(&g->mlg_mtx); 1144 return (B_FALSE); 1145 } 1146 fe = list_head(&fg->mlfg_entries); 1147 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1148 mutex_exit(&ft->mlft_mtx); 1149 mutex_exit(&g->mlg_mtx); 1150 return (B_FALSE); 1151 } 1152 1153 /* 1154 * Finally, the promisc entry which points at the *hash ft* from the 1155 * default group. We only enable this when we have promisc on. 
1156 */ 1157 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1158 g->mlg_rx_vlan_promisc_fg = fg; 1159 list_insert_tail(&ft->mlft_groups, fg); 1160 fg->mlfg_table = ft; 1161 fg->mlfg_size = 1; 1162 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1163 mutex_exit(&ft->mlft_mtx); 1164 mutex_exit(&g->mlg_mtx); 1165 return (B_FALSE); 1166 } 1167 fe = list_head(&fg->mlfg_entries); 1168 fe->mlfe_ndest = 1; 1169 fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft; 1170 1171 mutex_exit(&ft->mlft_mtx); 1172 1173 mutex_exit(&g->mlg_mtx); 1174 1175 return (B_TRUE); 1176 } 1177 1178 boolean_t 1179 mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1180 mlxcx_work_queue_t *rq) 1181 { 1182 uint_t j; 1183 mlxcx_buffer_t *b; 1184 mlxcx_completion_queue_t *cq; 1185 1186 mutex_enter(&g->mlg_mtx); 1187 /* 1188 * Sadly, even though MAC has the mgi_start callback, it is not always 1189 * called -- in particular when we are being managed under an aggr, the 1190 * mgi_start callback will only ever be called on the default group. 1191 * 1192 * So instead of asserting about the group state here, we have to 1193 * check it and call group start if needed. 1194 */ 1195 if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) { 1196 mutex_exit(&g->mlg_mtx); 1197 if (!mlxcx_rx_group_start(mlxp, g)) 1198 return (B_FALSE); 1199 mutex_enter(&g->mlg_mtx); 1200 } 1201 ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING); 1202 1203 cq = rq->mlwq_cq; 1204 ASSERT(cq != NULL); 1205 1206 mutex_enter(&cq->mlcq_mtx); 1207 mutex_enter(&rq->mlwq_mtx); 1208 1209 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1210 mutex_exit(&rq->mlwq_mtx); 1211 mutex_exit(&cq->mlcq_mtx); 1212 mutex_exit(&g->mlg_mtx); 1213 return (B_TRUE); 1214 } 1215 1216 if (!mlxcx_cmd_start_rq(mlxp, rq)) { 1217 mutex_exit(&rq->mlwq_mtx); 1218 mutex_exit(&cq->mlcq_mtx); 1219 mutex_exit(&g->mlg_mtx); 1220 return (B_FALSE); 1221 } 1222 ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED); 1223 1224 ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS); 1225 rq->mlwq_state |= MLXCX_WQ_BUFFERS; 1226 1227 mlxcx_shard_ready(rq->mlwq_bufs); 1228 1229 for (j = 0; j < rq->mlwq_nents; ++j) { 1230 if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b)) 1231 break; 1232 mlxcx_buf_return(mlxp, b); 1233 } 1234 for (j = 0; j < rq->mlwq_nents / 2; ++j) { 1235 if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b)) 1236 break; 1237 mlxcx_buf_return(mlxp, b); 1238 } 1239 1240 mlxcx_rq_refill(mlxp, rq); 1241 1242 mutex_exit(&rq->mlwq_mtx); 1243 mutex_exit(&cq->mlcq_mtx); 1244 mutex_exit(&g->mlg_mtx); 1245 1246 return (B_TRUE); 1247 } 1248 1249 boolean_t 1250 mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1251 { 1252 mlxcx_flow_table_t *ft; 1253 mlxcx_flow_group_t *fg; 1254 mlxcx_flow_entry_t *fe; 1255 char tq_name[TASKQ_NAMELEN]; 1256 1257 mutex_enter(&g->mlg_mtx); 1258 1259 if (g->mlg_state & MLXCX_GROUP_RUNNING) { 1260 mutex_exit(&g->mlg_mtx); 1261 return (B_TRUE); 1262 } 1263 1264 ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING); 1265 1266 g->mlg_state |= MLXCX_GROUP_RUNNING; 1267 1268 (void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld", 1269 ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst, 1270 g - &mlxp->mlx_rx_groups[0]); 1271 1272 /* 1273 * Create one refill taskq per group with one thread per work queue. 1274 * The refill task may block waiting for resources, so by effectively 1275 * having one thread per work queue we avoid work queues blocking each 1276 * other. 
1277 */ 1278 if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri, 1279 g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { 1280 mlxcx_warn(mlxp, "failed to create rq refill task queue"); 1281 mutex_exit(&g->mlg_mtx); 1282 return (B_FALSE); 1283 } 1284 1285 if (g == &mlxp->mlx_rx_groups[0]) { 1286 ft = g->mlg_port->mlp_rx_flow; 1287 mutex_enter(&ft->mlft_mtx); 1288 1289 /* 1290 * Broadcast and promisc entries go directly to group 0's 1291 * RSS hash fanout flow table. They bypass VLAN filtering. 1292 */ 1293 fg = g->mlg_port->mlp_bcast; 1294 fe = list_head(&fg->mlfg_entries); 1295 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; 1296 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1297 mutex_exit(&ft->mlft_mtx); 1298 g->mlg_state &= ~MLXCX_GROUP_RUNNING; 1299 taskq_destroy(g->mlg_refill_tq); 1300 mutex_exit(&g->mlg_mtx); 1301 return (B_FALSE); 1302 } 1303 1304 fg = g->mlg_port->mlp_promisc; 1305 fe = list_head(&fg->mlfg_entries); 1306 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; 1307 /* 1308 * Don't actually set the promisc entry until promisc is 1309 * enabled. 1310 */ 1311 1312 mutex_exit(&ft->mlft_mtx); 1313 } 1314 1315 mutex_exit(&g->mlg_mtx); 1316 1317 return (B_TRUE); 1318 } 1319 1320 boolean_t 1321 mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1322 { 1323 mlxcx_event_queue_t *eq; 1324 mlxcx_completion_queue_t *cq; 1325 mlxcx_work_queue_t *sq; 1326 uint_t i; 1327 1328 ASSERT3S(g->mlg_state, ==, 0); 1329 1330 mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER, 1331 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1332 g->mlg_state |= MLXCX_GROUP_INIT; 1333 mutex_enter(&g->mlg_mtx); 1334 1335 g->mlg_mlx = mlxp; 1336 g->mlg_type = MLXCX_GROUP_TX; 1337 g->mlg_port = &mlxp->mlx_ports[0]; 1338 1339 g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group; 1340 g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t); 1341 g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP); 1342 g->mlg_state |= MLXCX_GROUP_WQS; 1343 1344 g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom; 1345 1346 if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) { 1347 mutex_exit(&g->mlg_mtx); 1348 return (B_FALSE); 1349 } 1350 1351 g->mlg_state |= MLXCX_GROUP_TIRTIS; 1352 1353 for (i = 0; i < g->mlg_nwqs; ++i) { 1354 eq = NULL; 1355 while (eq == NULL) { 1356 eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++]; 1357 if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count) 1358 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0; 1359 if (eq->mleq_type != MLXCX_EQ_TYPE_ANY && 1360 eq->mleq_type != MLXCX_EQ_TYPE_TX) { 1361 /* Try the next one */ 1362 eq = NULL; 1363 } 1364 } 1365 1366 if (!mlxcx_cq_setup(mlxp, eq, &cq, 1367 mlxp->mlx_props.mldp_cq_size_shift)) 1368 return (B_FALSE); 1369 1370 cq->mlcq_stats = &g->mlg_port->mlp_stats; 1371 1372 sq = &g->mlg_wqs[i]; 1373 if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) { 1374 mutex_exit(&g->mlg_mtx); 1375 return (B_FALSE); 1376 } 1377 sq->mlwq_group = g; 1378 } 1379 1380 mutex_exit(&g->mlg_mtx); 1381 1382 return (B_TRUE); 1383 } 1384 1385 boolean_t 1386 mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1387 mlxcx_work_queue_t *sq) 1388 { 1389 uint_t i; 1390 mlxcx_buffer_t *b; 1391 mlxcx_completion_queue_t *cq; 1392 1393 mutex_enter(&g->mlg_mtx); 1394 1395 cq = sq->mlwq_cq; 1396 ASSERT(cq != NULL); 1397 1398 mutex_enter(&cq->mlcq_mtx); 1399 mutex_enter(&sq->mlwq_mtx); 1400 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1401 mutex_exit(&sq->mlwq_mtx); 1402 mutex_exit(&cq->mlcq_mtx); 1403 mutex_exit(&g->mlg_mtx); 1404 return (B_TRUE); 1405 } 1406 1407 
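	/*
	 * Pre-populate this send queue's buffer shards before starting it:
	 * the loops below create roughly 1.5x the ring size of "foreign"
	 * buffers (used when outbound mblks can be DMA-bound directly rather
	 * than copied) and one ring's worth of ordinary copy buffers.
	 */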
	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
	for (i = 0; i < sq->mlwq_nents; ++i) {
		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (i = 0; i < sq->mlwq_nents; ++i) {
		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	sq->mlwq_state |= MLXCX_WQ_BUFFERS;

	mlxcx_shard_ready(sq->mlwq_bufs);
	mlxcx_shard_ready(sq->mlwq_foreign_bufs);

	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
		mutex_exit(&sq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	g->mlg_state |= MLXCX_GROUP_RUNNING;

	(void) mlxcx_sq_add_nop(mlxp, sq);

	mutex_exit(&sq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

static boolean_t
mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
{
	uint_t idx;
	mlxcx_bf_t *bf;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
	ASSERT(mutex_owned(&mlwq->mlwq_mtx));

	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);

	ASSERT(mlwq->mlwq_cq != NULL);
	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
	bf = &mlwq->mlwq_uar->mlu_bf[idx];

retry:
	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return (B_TRUE);
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
	return (B_FALSE);
}

boolean_t
mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	uint_t index, start_pc;
	mlxcx_sendq_ent_t *ent0;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));

	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	start_pc = mlwq->mlwq_pc;
	++mlwq->mlwq_pc;
	/*
	 * This counter is manipulated in the interrupt handler, which
	 * does not hold the mlwq_mtx, hence the atomic.
1510 */ 1511 atomic_inc_64(&mlwq->mlwq_wqebb_used); 1512 1513 bzero(ent0, sizeof (mlxcx_sendq_ent_t)); 1514 ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP; 1515 ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); 1516 ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc); 1517 1518 set_bits8(&ent0->mlsqe_control.mlcs_flags, 1519 MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE); 1520 set_bits8(&ent0->mlsqe_control.mlcs_flags, 1521 MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); 1522 1523 ent0->mlsqe_control.mlcs_ds = 1; 1524 1525 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1526 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent, 1527 sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV)); 1528 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err, 1529 DDI_FME_VERSION); 1530 if (err.fme_status != DDI_FM_OK) { 1531 return (B_FALSE); 1532 } 1533 if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) { 1534 return (B_FALSE); 1535 } 1536 return (B_TRUE); 1537 } 1538 1539 boolean_t 1540 mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, 1541 uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags, 1542 mlxcx_buffer_t *b0) 1543 { 1544 uint_t index, first, ents; 1545 mlxcx_completion_queue_t *cq; 1546 mlxcx_sendq_ent_t *ent0; 1547 mlxcx_sendq_extra_ent_t *ent; 1548 mlxcx_wqe_data_seg_t *seg; 1549 uint_t ptri, nptr; 1550 const ddi_dma_cookie_t *c; 1551 size_t rem; 1552 uint64_t wqebb_used; 1553 mlxcx_buffer_t *b; 1554 ddi_fm_error_t err; 1555 boolean_t rv; 1556 1557 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1558 ASSERT3P(b0->mlb_tx_head, ==, b0); 1559 ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ); 1560 cq = mlwq->mlwq_cq; 1561 1562 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); 1563 ent0 = &mlwq->mlwq_send_ent[index]; 1564 b0->mlb_wqe_index = mlwq->mlwq_pc; 1565 ents = 1; 1566 1567 first = index; 1568 1569 bzero(ent0, sizeof (mlxcx_sendq_ent_t)); 1570 ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND; 1571 ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); 1572 ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index); 1573 1574 set_bits8(&ent0->mlsqe_control.mlcs_flags, 1575 MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS); 1576 set_bits8(&ent0->mlsqe_control.mlcs_flags, 1577 MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS); 1578 1579 VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers)); 1580 set_bits16(&ent0->mlsqe_eth.mles_szflags, 1581 MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen); 1582 if (inlinelen > 0) { 1583 bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers, 1584 inlinelen); 1585 } 1586 1587 ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) / 1588 MLXCX_WQE_OCTOWORD; 1589 1590 if (chkflags & HCK_IPV4_HDRCKSUM) { 1591 ASSERT(mlxp->mlx_caps->mlc_checksum); 1592 set_bit8(&ent0->mlsqe_eth.mles_csflags, 1593 MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM); 1594 } 1595 if (chkflags & HCK_FULLCKSUM) { 1596 ASSERT(mlxp->mlx_caps->mlc_checksum); 1597 set_bit8(&ent0->mlsqe_eth.mles_csflags, 1598 MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); 1599 } 1600 1601 /* 1602 * mlwq_wqebb_used is only incremented whilst holding 1603 * the mlwq_mtx mutex, but it is decremented (atomically) in 1604 * the interrupt context *not* under mlwq_mtx mutex. 1605 * So, now take a snapshot of the number of used wqes which will 1606 * be a conistent maximum we can use whilst iterating through 1607 * the buffers and DMA cookies. 
1608 */ 1609 wqebb_used = mlwq->mlwq_wqebb_used; 1610 1611 b = b0; 1612 ptri = 0; 1613 nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); 1614 seg = ent0->mlsqe_data; 1615 while (b != NULL) { 1616 rem = b->mlb_used; 1617 1618 c = NULL; 1619 while (rem > 0 && 1620 (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { 1621 if (ptri >= nptr) { 1622 if ((ents + wqebb_used) >= mlwq->mlwq_nents) 1623 return (B_FALSE); 1624 1625 index = (mlwq->mlwq_pc + ents) & 1626 (mlwq->mlwq_nents - 1); 1627 ent = &mlwq->mlwq_send_extra_ent[index]; 1628 ++ents; 1629 1630 seg = ent->mlsqe_data; 1631 ptri = 0; 1632 nptr = sizeof (ent->mlsqe_data) / 1633 sizeof (mlxcx_wqe_data_seg_t); 1634 } 1635 1636 seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); 1637 if (c->dmac_size > rem) { 1638 seg->mlds_byte_count = to_be32(rem); 1639 rem = 0; 1640 } else { 1641 seg->mlds_byte_count = to_be32(c->dmac_size); 1642 rem -= c->dmac_size; 1643 } 1644 seg->mlds_address = to_be64(c->dmac_laddress); 1645 ++seg; 1646 ++ptri; 1647 ++ent0->mlsqe_control.mlcs_ds; 1648 1649 ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=, 1650 MLXCX_SQE_MAX_DS); 1651 } 1652 1653 if (b == b0) { 1654 b = list_head(&b0->mlb_tx_chain); 1655 } else { 1656 b = list_next(&b0->mlb_tx_chain, b); 1657 } 1658 } 1659 1660 b0->mlb_wqebbs = ents; 1661 mlwq->mlwq_pc += ents; 1662 atomic_add_64(&mlwq->mlwq_wqebb_used, ents); 1663 1664 for (; ptri < nptr; ++ptri, ++seg) { 1665 seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); 1666 seg->mlds_byte_count = to_be32(0); 1667 seg->mlds_address = to_be64(0); 1668 } 1669 1670 /* 1671 * Make sure the workqueue entry is flushed out before updating 1672 * the doorbell. 1673 * If the ring has wrapped, we need to flush the front and back. 1674 */ 1675 if ((first + ents) > mlwq->mlwq_nents) { 1676 uint_t sync_cnt = mlwq->mlwq_nents - first; 1677 1678 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1679 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent, 1680 sync_cnt * sizeof (mlxcx_sendq_ent_t), 1681 DDI_DMA_SYNC_FORDEV)); 1682 1683 ent0 = &mlwq->mlwq_send_ent[0]; 1684 ents -= sync_cnt; 1685 } 1686 1687 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1688 (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent, 1689 ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV)); 1690 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err, 1691 DDI_FME_VERSION); 1692 if (err.fme_status != DDI_FM_OK) { 1693 return (B_FALSE); 1694 } 1695 1696 /* 1697 * Hold the bufmtx whilst ringing the doorbell, to prevent 1698 * the buffer from being moved to another list, so we can 1699 * safely remove it should the ring fail. 
1700 */ 1701 mutex_enter(&cq->mlcq_bufbmtx); 1702 1703 list_insert_tail(&cq->mlcq_buffers_b, b0); 1704 if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) { 1705 atomic_inc_64(&cq->mlcq_bufcnt); 1706 } else { 1707 list_remove(&cq->mlcq_buffers_b, b0); 1708 } 1709 1710 mutex_exit(&cq->mlcq_bufbmtx); 1711 1712 return (rv); 1713 } 1714 1715 boolean_t 1716 mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, 1717 mlxcx_buffer_t *buf) 1718 { 1719 return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1)); 1720 } 1721 1722 boolean_t 1723 mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, 1724 mlxcx_buffer_t **bufs, size_t nbufs) 1725 { 1726 uint_t index; 1727 mlxcx_recvq_ent_t *ent; 1728 mlxcx_completion_queue_t *cq; 1729 mlxcx_wqe_data_seg_t *seg; 1730 uint_t bi, ptri; 1731 const ddi_dma_cookie_t *c; 1732 mlxcx_buffer_t *buf; 1733 ddi_fm_error_t err; 1734 1735 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1736 cq = mlwq->mlwq_cq; 1737 ASSERT(mutex_owned(&cq->mlcq_mtx)); 1738 1739 for (bi = 0; bi < nbufs; ++bi) { 1740 buf = bufs[bi]; 1741 bufs[bi] = NULL; 1742 ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ); 1743 1744 index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); 1745 ent = &mlwq->mlwq_recv_ent[index]; 1746 buf->mlb_wqe_index = mlwq->mlwq_pc; 1747 buf->mlb_wqebbs = 1; 1748 1749 ++mlwq->mlwq_pc; 1750 atomic_inc_64(&mlwq->mlwq_wqebb_used); 1751 1752 mutex_enter(&cq->mlcq_bufbmtx); 1753 list_insert_tail(&cq->mlcq_buffers, buf); 1754 atomic_inc_64(&cq->mlcq_bufcnt); 1755 mutex_exit(&cq->mlcq_bufbmtx); 1756 1757 ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS); 1758 ptri = 0; 1759 c = NULL; 1760 while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) { 1761 seg = &ent->mlrqe_data[ptri++]; 1762 seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey); 1763 seg->mlds_byte_count = to_be32(c->dmac_size); 1764 seg->mlds_address = to_be64(c->dmac_laddress); 1765 } 1766 /* 1767 * Fill any unused scatter pointers with the special null 1768 * value. 1769 */ 1770 for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) { 1771 seg = &ent->mlrqe_data[ptri]; 1772 seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); 1773 seg->mlds_byte_count = to_be32(0); 1774 seg->mlds_address = to_be64(0); 1775 } 1776 1777 /* 1778 * Make sure the workqueue entry is flushed out before updating 1779 * the doorbell. 1780 */ 1781 VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle, 1782 (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent, 1783 sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV)); 1784 ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err, 1785 DDI_FME_VERSION); 1786 if (err.fme_status != DDI_FM_OK) { 1787 return (B_FALSE); 1788 } 1789 } 1790 1791 mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc); 1792 /* 1793 * Flush the CQ doorbell as well so that HW knows how many 1794 * completions we've consumed. 
1795 */ 1796 MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV); 1797 ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err, 1798 DDI_FME_VERSION); 1799 if (err.fme_status != DDI_FM_OK) { 1800 return (B_FALSE); 1801 } 1802 MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV); 1803 ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err, 1804 DDI_FME_VERSION); 1805 if (err.fme_status != DDI_FM_OK) { 1806 return (B_FALSE); 1807 } 1808 return (B_TRUE); 1809 } 1810 1811 static void 1812 mlxcx_rq_refill_task(void *arg) 1813 { 1814 mlxcx_work_queue_t *wq = arg; 1815 mlxcx_completion_queue_t *cq = wq->mlwq_cq; 1816 mlxcx_t *mlxp = wq->mlwq_mlx; 1817 mlxcx_buf_shard_t *s = wq->mlwq_bufs; 1818 boolean_t refill, draining; 1819 1820 do { 1821 /* 1822 * Wait here until one of 3 conditions: 1823 * 1. The shard is draining, or 1824 * 2. There are buffers on the free list, or 1825 * 3. The WQ is being shut down. 1826 */ 1827 mutex_enter(&s->mlbs_mtx); 1828 while (s->mlbs_state != MLXCX_SHARD_DRAINING && 1829 list_is_empty(&s->mlbs_free) && 1830 (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) { 1831 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 1832 } 1833 1834 draining = (s->mlbs_state == MLXCX_SHARD_DRAINING); 1835 mutex_exit(&s->mlbs_mtx); 1836 1837 mutex_enter(&cq->mlcq_mtx); 1838 mutex_enter(&wq->mlwq_mtx); 1839 1840 if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) { 1841 refill = B_FALSE; 1842 wq->mlwq_state &= ~MLXCX_WQ_REFILLING; 1843 } else { 1844 mlxcx_rq_refill(mlxp, wq); 1845 1846 if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) { 1847 refill = B_TRUE; 1848 } else { 1849 refill = B_FALSE; 1850 wq->mlwq_state &= ~MLXCX_WQ_REFILLING; 1851 } 1852 } 1853 1854 mutex_exit(&wq->mlwq_mtx); 1855 mutex_exit(&cq->mlcq_mtx); 1856 } while (refill); 1857 } 1858 1859 void 1860 mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) 1861 { 1862 size_t target, current, want, done, n; 1863 mlxcx_completion_queue_t *cq; 1864 mlxcx_ring_group_t *g; 1865 mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP]; 1866 uint_t i; 1867 1868 ASSERT(mutex_owned(&mlwq->mlwq_mtx)); 1869 cq = mlwq->mlwq_cq; 1870 ASSERT(mutex_owned(&cq->mlcq_mtx)); 1871 1872 ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS); 1873 1874 target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP; 1875 cq = mlwq->mlwq_cq; 1876 1877 if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0) 1878 return; 1879 1880 if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) 1881 return; 1882 1883 current = cq->mlcq_bufcnt; 1884 1885 if (current >= target - MLXCX_RQ_REFILL_STEP) 1886 return; 1887 1888 want = target - current; 1889 done = 0; 1890 1891 while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) { 1892 n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP); 1893 if (n == 0) { 1894 /* 1895 * We didn't get any buffers from the free queue. 1896 * It might not be an issue, schedule a taskq 1897 * to wait for free buffers if the completion 1898 * queue is low. 
void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	size_t target, current, want, done, n;
	mlxcx_completion_queue_t *cq;
	mlxcx_ring_group_t *g;
	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
	uint_t i;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	cq = mlwq->mlwq_cq;
	ASSERT(mutex_owned(&cq->mlcq_mtx));

	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);

	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
	cq = mlwq->mlwq_cq;

	if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
		return;

	if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
		return;

	current = cq->mlcq_bufcnt;

	if (current >= target - MLXCX_RQ_REFILL_STEP)
		return;

	want = target - current;
	done = 0;

	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
		if (n == 0) {
			/*
			 * We didn't get any buffers from the free queue.
			 * This might not be a problem; if the completion
			 * queue is running low, schedule a taskq to wait
			 * for free buffers and finish the refill later.
			 */
			if (current < MLXCX_RQ_REFILL_STEP &&
			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
				g = mlwq->mlwq_group;
				taskq_dispatch_ent(g->mlg_refill_tq,
				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
				    &mlwq->mlwq_tqe);
			}

			return;
		}

		if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
			for (i = 0; i < n; ++i)
				mlxcx_buf_return(mlxp, b[i]);
			return;
		}
		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
			/*
			 * mlxcx_rq_add_buffers NULLs out the buffers as it
			 * enqueues them, so any that are non-NULL we have to
			 * free now. The others now belong to the WQ, even if
			 * we failed.
			 */
			for (i = 0; i < n; ++i) {
				if (b[i] != NULL) {
					mlxcx_buf_return(mlxp, b[i]);
				}
			}
			return;
		}
		done += n;
	}
}

static const char *
mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
{
	switch (sy) {
	case MLXCX_CQ_ERR_LOCAL_LENGTH:
		return ("LOCAL_LENGTH");
	case MLXCX_CQ_ERR_LOCAL_QP_OP:
		return ("LOCAL_QP_OP");
	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
		return ("LOCAL_PROTECTION");
	case MLXCX_CQ_ERR_WR_FLUSHED:
		return ("WR_FLUSHED");
	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
		return ("MEM_WINDOW_BIND");
	case MLXCX_CQ_ERR_BAD_RESPONSE:
		return ("BAD_RESPONSE");
	case MLXCX_CQ_ERR_LOCAL_ACCESS:
		return ("LOCAL_ACCESS");
	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
		return ("XPORT_RETRY_CTR");
	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
		return ("RNR_RETRY_CTR");
	case MLXCX_CQ_ERR_ABORTED:
		return ("ABORTED");
	default:
		return ("UNKNOWN");
	}
}

static void
mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_error_ent_t *ent)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
	ena = fm_ena_generate(0, FM_ENA_FMT1);

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    "syndrome", DATA_TYPE_STRING, name,
	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
	    "vendor_syndrome", DATA_TYPE_UINT8,
	    ent->mlcqee_vendor_error_syndrome,
	    "wqe_counter", DATA_TYPE_UINT16, from_be16(ent->mlcqee_wqe_counter),
	    "wq_type", DATA_TYPE_STRING,
	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
}

void
mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
		mlxcx_completionq_error_ent_t *eent =
		    (mlxcx_completionq_error_ent_t *)ent;
		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
		return;
	}

	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		return;
	}

	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
		    ent->mlcqe_send_wqe_opcode);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		return;
	}

	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		return;
	}

	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
}

mblk_t *
mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
	uint32_t chkflags = 0;
	uint_t wqe_index, used;
	ddi_fm_error_t err;
	mblk_t *mp;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
		mlxcx_completionq_error_ent_t *eent =
		    (mlxcx_completionq_error_ent_t *)ent;
		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
		mlxcx_buf_return(mlxp, buf);
		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
		return (NULL);
	}

	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	if (ent->mlcqe_rx_drop_counter > 0) {
		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
		    ent->mlcqe_rx_drop_counter);
	}

	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	/*
	 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
	 * Remember it for later.
	 */
	wqe_index = buf->mlb_wqe_index;

	/* Set the used field with the actual length of the packet. */
	buf->mlb_used = (used = from_be32(ent->mlcqe_byte_cnt));

	/* Try to loan this buffer to MAC directly. */
	if (mlxcx_buf_loan(mlxp, buf)) {
		mp = buf->mlb_mp;

	} else {
		/*
		 * Loan rejected: we will try to allocate a new mblk and copy
		 * this packet for MAC instead.
		 */
		mp = allocb(buf->mlb_used, 0);
		if (mp == NULL) {
			/* No memory :( */
			atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops, 1);
			mlxcx_buf_return(mlxp, buf);
			return (NULL);
		}
		bcopy((unsigned char *)buf->mlb_dma.mxdb_va, mp->b_rptr,
		    buf->mlb_used);

		/* We're done with this buf now, return it to the free list. */
		mlxcx_buf_return(mlxp, buf);
		buf = NULL;
	}

	mp->b_next = NULL;
	mp->b_cont = NULL;
	mp->b_wptr = mp->b_rptr + used;

	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
		chkflags |= HCK_FULLCKSUM_OK;
	}
	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
		chkflags |= HCK_IPV4_HDRCKSUM_OK;
	}
	if (chkflags != 0) {
		mac_hcksum_set(mp, 0, 0, 0, from_be16(ent->mlcqe_checksum),
		    chkflags);
	}

	/*
	 * Don't check if a refill is needed on every single completion,
	 * since checking involves taking the RQ lock.
	 */
	if ((wqe_index & 0x7) == 0) {
		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
		ASSERT(wq != NULL);
		mutex_enter(&wq->mlwq_mtx);
		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
			mlxcx_rq_refill(mlxp, wq);
		mutex_exit(&wq->mlwq_mtx);
	}

	return (mp);
}

static void
mlxcx_buf_mp_return(caddr_t arg)
{
	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
	mlxcx_t *mlxp = b->mlb_mlx;

	/* The mblk has been used now, so NULL it out. */
	b->mlb_mp = NULL;

	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
		mlxcx_buf_return(mlxp, b);
}

boolean_t
mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_FALSE;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
	b->mlb_frtn.free_arg = (caddr_t)b;
	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);

	*bp = b;

	return (B_TRUE);
}

boolean_t
mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
    mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_TRUE;

	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	*bp = b;

	return (B_TRUE);
}

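/*
 * Take a "foreign" buffer from the work queue's foreign buffer shard.
 * Foreign buffers carry only a DMA handle (see mlxcx_buf_create_foreign())
 * and are bound to the caller's mblk memory at TX time in
 * mlxcx_bind_or_copy_mblk(). Returns NULL if the shard is not ready or
 * has no free buffers.
 */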
static mlxcx_buffer_t *
mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		ASSERT(b->mlb_foreign);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

static mlxcx_buffer_t *
mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
{
	ddi_fm_error_t err;
	mlxcx_buffer_t *b;
	uint_t attempts = 0;

copyb:
	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
		return (NULL);

	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
	bcopy(rptr, b->mlb_dma.mxdb_va, sz);

	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);

	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, b);
		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
			return (NULL);
		}
		goto copyb;
	}

	return (b);
}

static mlxcx_buffer_t *
mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mp, size_t off)
{
	mlxcx_buffer_t *b;
	uint8_t *rptr;
	size_t sz;
	boolean_t ret;

	rptr = mp->b_rptr;
	sz = MBLKL(mp);

#ifdef DEBUG
	if (off > 0) {
		ASSERT3U(off, <, sz);
	}
#endif

	rptr += off;
	sz -= off;

	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
	} else {
		b = mlxcx_buf_take_foreign(mlxp, wq);
		if (b == NULL)
			return (NULL);

		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
		    B_FALSE);

		if (!ret) {
			mlxcx_buf_return(mlxp, b);

			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
		}
	}

	return (b);
}

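/*
 * Prepare an outbound mblk chain for the send queue. Each mblk is either
 * DMA-bound or copied into a buffer (see mlxcx_bind_or_copy_mblk()), and
 * the buffers are linked onto the head buffer's mlb_tx_chain. If the
 * chain would need more cookies than a single SQE can carry
 * (MLXCX_SQE_MAX_PTRS), we fall back to msgpullup() and redo the packet
 * as a single buffer. Returns the number of buffers used (0 on failure)
 * and the head buffer via *bp.
 */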
uint_t
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b, *b0 = NULL;
	boolean_t first = B_TRUE;
	mblk_t *mp;
	size_t offset = off;
	size_t ncookies = 0;
	uint_t count = 0;

	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
	    mp = mp->b_cont) {
		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
		if (b == NULL)
			goto failed;

		ncookies += b->mlb_dma.mxdb_ncookies;

		if (first)
			b0 = b;

		if (!first)
			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;

		b->mlb_tx_mp = mp;
		b->mlb_tx_head = b0;
		b->mlb_used = MBLKL(mp) - offset;

		if (!first)
			list_insert_tail(&b0->mlb_tx_chain, b);
		first = B_FALSE;
		offset = 0;

		count++;
	}

	/*
	 * The chain of mblks has resulted in too many cookies for
	 * a single message. This is unusual, so take the hit to tidy
	 * up, do a pullup to a single mblk and allocate the requisite
	 * buf.
	 */
	if (ncookies > MLXCX_SQE_MAX_PTRS) {
		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
		    mblk_t *, mpb, size_t, ncookies);

		if (b0 != NULL)
			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

		if ((mp = msgpullup(mpb, -1)) == NULL)
			return (0);

		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
		if (b0 == NULL) {
			freemsg(mp);
			return (0);
		}
		freemsg(mpb);

		b0->mlb_tx_mp = mp;
		b0->mlb_tx_head = b0;
		b0->mlb_used = MBLKL(mp) - off;

		count = 1;
	}

	*bp = b0;

	return (count);

failed:
	if (b0 != NULL)
		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

	return (0);
}

mlxcx_buffer_t *
mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp,
    size_t nbufs)
{
	mlxcx_buffer_t *b;
	size_t done = 0;
	mlxcx_buf_shard_t *s;

	s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (0);
	}

	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
		bp[done++] = b;
	}
	mutex_exit(&s->mlbs_mtx);
	return (done);
}

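/*
 * Loan an RX buffer up to MAC instead of copying it. The loan is refused
 * when too many of the shard's buffers are already out on loan: above
 * mlbs_hiwat2 for any packet, or above mlbs_hiwat1 for packets smaller
 * than mldp_rx_p50_loan_min_size. The high-water marks are kept at 1/2
 * and 3/4 of the shard total by mlxcx_bufshard_adjust_total(). On success
 * the buffer moves to the loaned list and comes back to us later via
 * mlxcx_buf_mp_return().
 */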
boolean_t
mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	if (b->mlb_mp == NULL) {
		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
		if (b->mlb_mp == NULL)
			return (B_FALSE);
	}

	mutex_enter(&s->mlbs_mtx);

	/* Check if we have too many buffers on loan. */
	if (s->mlbs_nloaned >= s->mlbs_hiwat1 &&
	    b->mlb_used < mlxp->mlx_props.mldp_rx_p50_loan_min_size) {
		mutex_exit(&s->mlbs_mtx);
		return (B_FALSE);
	} else if (s->mlbs_nloaned >= s->mlbs_hiwat2) {
		mutex_exit(&s->mlbs_mtx);
		return (B_FALSE);
	}

	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
	b->mlb_wqe_index = 0;
	list_remove(&s->mlbs_busy, b);
	list_insert_tail(&s->mlbs_loaned, b);
	s->mlbs_nloaned++;
	mutex_exit(&s->mlbs_mtx);

	return (B_TRUE);
}

void
mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
{
	mlxcx_buffer_t *b;

	if (b0->mlb_tx_head != b0) {
		mlxcx_buf_return(mlxp, b0);
		return;
	}

	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
		mlxcx_buf_return(mlxp, b);
	}
	if (keepmp) {
		b0->mlb_tx_mp = NULL;
		b0->mlb_tx_head = NULL;
	}
	mlxcx_buf_return(mlxp, b0);
}

inline void
mlxcx_bufshard_adjust_total(mlxcx_buf_shard_t *s, int64_t incr)
{
	s->mlbs_ntotal += incr;
	s->mlbs_hiwat1 = s->mlbs_ntotal / 2;
	s->mlbs_hiwat2 = 3 * (s->mlbs_ntotal / 4);
}

void
mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buffer_state_t oldstate = b->mlb_state;
	mlxcx_buffer_t *txhead = b->mlb_tx_head;
	mlxcx_buf_shard_t *s = b->mlb_shard;
	mblk_t *mp = b->mlb_tx_mp;

	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	/*
	 * The mlbs_mtx held below is a heavily contended lock, so it is
	 * imperative we do as much of the buffer clean up outside the lock
	 * as is possible.
	 */
	b->mlb_state = MLXCX_BUFFER_FREE;
	b->mlb_wqe_index = 0;
	b->mlb_tx_head = NULL;
	b->mlb_tx_mp = NULL;
	b->mlb_used = 0;
	b->mlb_wqebbs = 0;
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	if (b->mlb_foreign) {
		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
		}
	}

	mutex_enter(&s->mlbs_mtx);
	switch (oldstate) {
	case MLXCX_BUFFER_INIT:
		mlxcx_bufshard_adjust_total(s, 1);
		break;
	case MLXCX_BUFFER_ON_WQ:
		list_remove(&s->mlbs_busy, b);
		break;
	case MLXCX_BUFFER_ON_LOAN:
		ASSERT(!b->mlb_foreign);
		--s->mlbs_nloaned;
		list_remove(&s->mlbs_loaned, b);
		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
			/*
			 * When we're draining, e.g. during mac_stop(),
			 * we destroy the buffer immediately rather than
			 * recycling it. Otherwise we risk leaving it
			 * on the free list and leaking it.
			 */
			list_insert_tail(&s->mlbs_free, b);
			mlxcx_buf_destroy(mlxp, b);
			/*
			 * Teardown might be waiting for loaned list to empty.
			 */
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
			return;
		}
		break;
	case MLXCX_BUFFER_FREE:
		VERIFY(0);
		break;
	case MLXCX_BUFFER_ON_CHAIN:
		ASSERT(txhead != NULL);
		list_remove(&txhead->mlb_tx_chain, b);
		list_remove(&s->mlbs_busy, b);
		break;
	}

	list_insert_tail(&s->mlbs_free, b);
	cv_broadcast(&s->mlbs_free_nonempty);

	mutex_exit(&s->mlbs_mtx);

	/*
	 * For TX chain heads, free the mblk_t after we let go of the lock.
	 * This might be a borrowed buf that we in turn loaned to MAC, in which
	 * case calling freemsg() on it will re-enter this very function -- so
	 * we better not be holding the lock!
	 */
	if (txhead == b)
		freemsg(mp);
}

void
mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
	    b->mlb_state == MLXCX_BUFFER_INIT);
	ASSERT(mutex_owned(&s->mlbs_mtx));

	if (b->mlb_state == MLXCX_BUFFER_FREE) {
		list_remove(&s->mlbs_free, b);
		mlxcx_bufshard_adjust_total(s, -1);
	}

	/*
	 * This is going back to the kmem cache, so it needs to be set up in
	 * the same way we expect a new buffer to come out (state INIT, other
	 * fields NULL'd).
	 */
	b->mlb_state = MLXCX_BUFFER_INIT;
	b->mlb_shard = NULL;
	if (b->mlb_mp != NULL) {
		freeb(b->mlb_mp);
		ASSERT(b->mlb_mp == NULL);
	}
	mlxcx_dma_free(&b->mlb_dma);
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	kmem_cache_free(mlxp->mlx_bufs_cache, b);
}

void
mlxcx_shard_ready(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_READY;
	mutex_exit(&s->mlbs_mtx);
}

void
mlxcx_shard_draining(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_DRAINING;
	cv_broadcast(&s->mlbs_free_nonempty);
	mutex_exit(&s->mlbs_mtx);
}