/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>

#include <sys/mac_provider.h>

#include <sys/random.h>

#include <mlxcx.h>

boolean_t
mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz;

	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);

	/* Receive and send queue entries might be different sizes. */
	switch (mlwq->mlwq_type) {
	case MLXCX_WQ_TYPE_SENDQ:
		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
		break;
	case MLXCX_WQ_TYPE_RECVQ:
		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
		break;
	default:
		VERIFY(0);
		return (B_FALSE);
	}
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ memory");
		return (B_FALSE);
	}

	/*
	 * Just set the first pointer in the union. Yes, this is a strict
	 * aliasing violation. No, I don't care.
	 */
	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;
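
	/*
	 * The queue's doorbell record is allocated separately from the
	 * ring itself, using the doorbell-specific DMA attributes.
	 */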
	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_workq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
		mlxcx_dma_free(&mlwq->mlwq_dma);
		mlwq->mlwq_send_ent = NULL;
		return (B_FALSE);
	}

	mlwq->mlwq_doorbell =
	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;

	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;

	return (B_TRUE);
}

void
mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);

	mlxcx_dma_free(&mlwq->mlwq_dma);
	mlwq->mlwq_send_ent = NULL;
	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
	mlwq->mlwq_doorbell = NULL;

	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
}

static boolean_t
mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    uint_t ent_shift)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);

	mlcq->mlcq_entshift = ent_shift;
	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ memory");
		return (B_FALSE);
	}

	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;

	for (i = 0; i < mlcq->mlcq_nents; ++i) {
		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
	}

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_completionq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
		mlxcx_dma_free(&mlcq->mlcq_dma);
		mlcq->mlcq_ent = NULL;
		return (B_FALSE);
	}

	mlcq->mlcq_doorbell =
	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;

	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ALLOC);

	return (B_TRUE);
}

static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	mlxcx_dma_free(&mlcq->mlcq_dma);
	mlcq->mlcq_ent = NULL;
	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
	mlcq->mlcq_doorbell = NULL;

	atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ALLOC);
}
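
/*
 * Tear down a work queue: stop and destroy it in hardware (if it was
 * created), release its DMA resources and detach it from its
 * completion queue.
 */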
void
mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	mlxcx_completion_queue_t *mlcq;

	/*
	 * If something is holding the lock on a long operation like a
	 * refill, setting this flag asks them to exit early if possible.
	 */
	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);

	mutex_enter(&mlwq->mlwq_mtx);

	list_remove(&mlxp->mlx_wqs, mlwq);

	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "send queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "send queue num %x", mlwq->mlwq_num);
		}
	}
	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
		mlxcx_wq_rele_dma(mlxp, mlwq);
	}
	mlcq = mlwq->mlwq_cq;

	/* These will be released by mlxcx_teardown_bufs() */
	mlwq->mlwq_bufs = NULL;
	mlwq->mlwq_foreign_bufs = NULL;

	mutex_exit(&mlwq->mlwq_mtx);

	mutex_enter(&mlcq->mlcq_mtx);
	mutex_enter(&mlwq->mlwq_mtx);
	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
	mlcq->mlcq_wq = NULL;
	mutex_exit(&mlwq->mlwq_mtx);
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlwq->mlwq_mtx);
}

void
mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	mlxcx_event_queue_t *mleq;
	mlxcx_buffer_t *b;

	/*
	 * If something is holding the lock on a long operation like polling
	 * which we're going to abort anyway, this flag asks them to exit
	 * early if possible.
	 */
	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);

	mutex_enter(&mlcq->mlcq_mtx);

	list_remove(&mlxp->mlx_cqs, mlcq);

	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "completion queue num %u",
			    mlcq->mlcq_num);
		}
	}
	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
		mlxcx_cq_rele_dma(mlxp, mlcq);
	}
	/*
	 * If we're on an EQ AVL tree, then we need to grab
	 * the EQ's mutex to take it off. The ISR always takes
	 * EQ mutex before CQ mutex, so we have to let go of
	 * the CQ mutex then come back again.
	 *
	 * The ISR will bail out if it tries to touch this CQ now since
	 * we added the CQ_DESTROYED flag above.
	 */
	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
		mleq = mlcq->mlcq_eq;
	} else {
		mleq = NULL;
	}

	/* Return any outstanding buffers to the free pool. */
	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_enter(&mlcq->mlcq_bufbmtx);
	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_exit(&mlcq->mlcq_bufbmtx);

	/*
	 * Since the interrupt handlers take the EQ lock before the CQ one,
	 * we must do the same here. That means letting go of the lock
	 * for a brief window here (we'll double-check the state when we
	 * get back in).
	 */
	mutex_exit(&mlcq->mlcq_mtx);

	if (mleq != NULL) {
		mutex_enter(&mleq->mleq_mtx);
		mutex_enter(&mlcq->mlcq_mtx);
		/*
		 * Double-check the state, we let go of the
		 * mutex briefly.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
			avl_remove(&mleq->mleq_cqs, mlcq);
			atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_EQAVL);
		}
		mutex_exit(&mlcq->mlcq_mtx);
		mutex_exit(&mleq->mleq_mtx);
	}

	mutex_enter(&mlcq->mlcq_mtx);
	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlcq->mlcq_mtx);
	mutex_destroy(&mlcq->mlcq_arm_mtx);
	mutex_destroy(&mlcq->mlcq_bufbmtx);
	list_destroy(&mlcq->mlcq_buffers);
	list_destroy(&mlcq->mlcq_buffers_b);
	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
}

static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
    mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
	mlxcx_completion_queue_t *cq;

	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_init(&cq->mlcq_arm_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_cq_entry));

	cq->mlcq_mlx = mlxp;
	list_insert_tail(&mlxp->mlx_cqs, cq);

	mutex_enter(&cq->mlcq_mtx);

	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
		mutex_exit(&cq->mlcq_mtx);
		return (B_FALSE);
	}

	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;

	cq->mlcq_uar = &mlxp->mlx_uar;
	cq->mlcq_eq = eq;

	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;

	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
		mutex_exit(&cq->mlcq_mtx);
		return (B_FALSE);
	}

	mutex_exit(&cq->mlcq_mtx);

	mutex_enter(&eq->mleq_mtx);
	mutex_enter(&cq->mlcq_mtx);
	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
	avl_add(&eq->mleq_cqs, cq);
	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_EQAVL);
	mlxcx_arm_cq(mlxp, cq);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&eq->mleq_mtx);

	*cqp = cq;
	return (B_TRUE);
}

static boolean_t
mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
    mlxcx_work_queue_t *wq)
{
	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));

	list_insert_tail(&mlxp->mlx_wqs, wq);

	mutex_enter(&wq->mlwq_mtx);

	wq->mlwq_mlx = mlxp;
	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
	wq->mlwq_cq = cq;
	wq->mlwq_pd = &mlxp->mlx_pd;
	wq->mlwq_uar = &mlxp->mlx_uar;

	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);

	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

	mutex_exit(&wq->mlwq_mtx);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);
	ASSERT3P(cq->mlcq_wq, ==, NULL);
	cq->mlcq_wq = wq;
	mutex_exit(&wq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);

	return (B_TRUE);
}
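
/*
 * Set up a send queue: allocate its DMA resources, create it in
 * hardware, and attach it to its completion queue and TIS.
 */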
static boolean_t
mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
    mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
{
	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));

	list_insert_tail(&mlxp->mlx_wqs, wq);

	mutex_enter(&wq->mlwq_mtx);

	wq->mlwq_mlx = mlxp;
	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
	wq->mlwq_cq = cq;
	wq->mlwq_pd = &mlxp->mlx_pd;
	wq->mlwq_uar = &mlxp->mlx_uar;
	wq->mlwq_tis = tis;

	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);

	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;

	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

	mutex_exit(&wq->mlwq_mtx);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);
	ASSERT3P(cq->mlcq_wq, ==, NULL);
	cq->mlcq_wq = wq;
	mutex_exit(&wq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);

	return (B_TRUE);
}

/*
 * Before we tear down the queues associated with the rx group,
 * flag each cq as being torn down and wake up any tasks.
 */
static void
mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	mlxcx_buf_shard_t *s;
	uint_t i;

	mutex_enter(&g->mlg_mtx);

	for (i = 0; i < g->mlg_nwqs; ++i) {
		wq = &g->mlg_wqs[i];
		cq = wq->mlwq_cq;
		if (cq != NULL) {
			s = wq->mlwq_bufs;
			mutex_enter(&s->mlbs_mtx);
			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
		}
	}

	mutex_exit(&g->mlg_mtx);
}

void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_table_t *ft;
	uint_t i;

	mutex_enter(&g->mlg_port->mlp_mtx);
	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);

		if (g->mlg_rx_vlan_ft != NULL)
			mlxcx_remove_all_vlan_entries(mlxp, g);

		if (g == &mlxp->mlx_rx_groups[0]) {
			ft = g->mlg_port->mlp_rx_flow;
			mutex_enter(&ft->mlft_mtx);

			fg = g->mlg_port->mlp_bcast;
			fe = list_head(&fg->mlfg_entries);
			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
				(void) mlxcx_cmd_delete_flow_table_entry(
				    mlxp, fe);
			}

			fg = g->mlg_port->mlp_promisc;
			fe = list_head(&fg->mlfg_entries);
			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
				(void) mlxcx_cmd_delete_flow_table_entry(
				    mlxp, fe);
			}

			mutex_exit(&ft->mlft_mtx);
		}
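
		/*
		 * Tear down this group's VLAN filtering table, including
		 * its default and promiscuous entries, if it exists.
		 */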
		if (g->mlg_rx_vlan_ft != NULL) {
			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
			ASSERT(list_is_empty(&g->mlg_rx_vlans));
			fg = g->mlg_rx_vlan_def_fg;
			if (fg != NULL) {
				fe = list_head(&fg->mlfg_entries);
				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
					(void)
					    mlxcx_cmd_delete_flow_table_entry(
					    mlxp, fe);
				}
			}
			fg = g->mlg_rx_vlan_promisc_fg;
			if (fg != NULL) {
				fe = list_head(&fg->mlfg_entries);
				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
					(void)
					    mlxcx_cmd_delete_flow_table_entry(
					    mlxp, fe);
				}
			}
			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
			list_destroy(&g->mlg_rx_vlans);

			g->mlg_rx_vlan_ft = NULL;
		}

		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
		g->mlg_rx_hash_ft = NULL;

		avl_destroy(&g->mlg_rx_macs);
		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
	}

	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
		for (i = 0; i < g->mlg_nwqs; ++i) {
			wq = &g->mlg_wqs[i];
			mutex_enter(&wq->mlwq_mtx);
			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
				mlxcx_warn(mlxp, "failed to stop rq %x",
				    wq->mlwq_num);
			}
			mutex_exit(&wq->mlwq_mtx);
		}
		taskq_destroy(g->mlg_refill_tq);
		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
	}

	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
			mlxcx_tir_t *tir = &g->mlg_tir[i];
			if (tir->mltir_state & MLXCX_TIR_CREATED &&
			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
					mlxcx_warn(mlxp,
					    "failed to destroy tir %u "
					    "for rx ring", tir->mltir_num);
				}
			}
		}
		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
	}

	if (g->mlg_state & MLXCX_GROUP_RQT) {
		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
				mlxcx_warn(mlxp, "failed to destroy rqt %u "
				    "for rx ring", g->mlg_rqt->mlrqt_num);
			}
			kmem_free(g->mlg_rqt->mlrqt_rq,
			    g->mlg_rqt->mlrqt_rq_size);
			g->mlg_rqt->mlrqt_rq = NULL;
			kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
			g->mlg_rqt = NULL;
		}
		g->mlg_state &= ~MLXCX_GROUP_RQT;
	}

	for (i = 0; i < g->mlg_nwqs; ++i) {
		wq = &g->mlg_wqs[i];
		cq = wq->mlwq_cq;
		mlxcx_wq_teardown(mlxp, wq);
		if (cq != NULL)
			mlxcx_cq_teardown(mlxp, cq);
	}
	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
	g->mlg_wqs = NULL;
	g->mlg_state &= ~MLXCX_GROUP_WQS;

	mutex_exit(&g->mlg_mtx);
	mutex_exit(&g->mlg_port->mlp_mtx);

	mutex_destroy(&g->mlg_mtx);

	g->mlg_state &= ~MLXCX_GROUP_INIT;
	ASSERT3S(g->mlg_state, ==, 0);
}

void
mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	uint_t i;

	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_WQS) {
		for (i = 0; i < g->mlg_nwqs; ++i) {
			wq = &g->mlg_wqs[i];
			mutex_enter(&wq->mlwq_mtx);
			cq = wq->mlwq_cq;
			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
				mlxcx_warn(mlxp, "failed to stop sq %x",
				    wq->mlwq_num);
			}
			mutex_exit(&wq->mlwq_mtx);
			mlxcx_wq_teardown(mlxp, wq);
			if (cq != NULL)
				mlxcx_cq_teardown(mlxp, cq);
		}
		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
		g->mlg_wqs = NULL;
		g->mlg_state &= ~MLXCX_GROUP_WQS;
	}
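
	/* Finally, destroy the group's TIS unless it is already gone. */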
	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
			    g->mlg_tis.mltis_num);
		}
	}
	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;

	mutex_exit(&g->mlg_mtx);
	mutex_destroy(&g->mlg_mtx);
	g->mlg_state &= ~MLXCX_GROUP_INIT;
	ASSERT3S(g->mlg_state, ==, 0);
}

void
mlxcx_teardown_groups(mlxcx_t *mlxp)
{
	mlxcx_ring_group_t *g;
	uint_t i;

	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		g = &mlxp->mlx_rx_groups[i];
		if (!(g->mlg_state & MLXCX_GROUP_INIT))
			continue;
		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
		mlxcx_quiesce_rx_cqs(mlxp, g);
	}

	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		g = &mlxp->mlx_rx_groups[i];
		if (!(g->mlg_state & MLXCX_GROUP_INIT))
			continue;
		mlxcx_teardown_rx_group(mlxp, g);
	}

	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
	mlxp->mlx_rx_groups = NULL;

	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
		g = &mlxp->mlx_tx_groups[i];
		if (!(g->mlg_state & MLXCX_GROUP_INIT))
			continue;
		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
		mlxcx_teardown_tx_group(mlxp, g);
	}

	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
	mlxp->mlx_tx_groups = NULL;
}

boolean_t
mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_event_queue_t *eq;
	mlxcx_completion_queue_t *cq;
	mlxcx_work_queue_t *rq;
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	uint_t ent_shift;
	uint_t i, j;

	ASSERT3S(g->mlg_state, ==, 0);

	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_enter(&g->mlg_mtx);
	g->mlg_mlx = mlxp;
	g->mlg_type = MLXCX_GROUP_RX;
	g->mlg_port = &mlxp->mlx_ports[0];
	g->mlg_state |= MLXCX_GROUP_INIT;

	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
	i = g - &mlxp->mlx_rx_groups[0];
	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;

	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
	g->mlg_state |= MLXCX_GROUP_WQS;

	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
	g->mlg_rqt->mlrqt_max = 2;
	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
		g->mlg_rqt->mlrqt_max <<= 1;
	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
	    sizeof (mlxcx_work_queue_t *);
	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
	g->mlg_state |= MLXCX_GROUP_RQT;

	for (i = 0; i < g->mlg_nwqs; ++i) {
		eq = NULL;
		while (eq == NULL) {
			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
				/* Try the next one */
				eq = NULL;
			}
		}

		/*
		 * A single completion is indicated for each rq entry as
		 * it is used. So, the number of cq entries never needs
		 * to be larger than the rq.
		 */
		ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
		    mlxp->mlx_props.mldp_rq_size_shift);
		if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
			g->mlg_nwqs = i;
			break;
		}

		cq->mlcq_stats = &g->mlg_port->mlp_stats;

		rq = &g->mlg_wqs[i];
		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
			g->mlg_nwqs = i;
			break;
		}
		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
		rq->mlwq_group = g;
	}
	if (g->mlg_nwqs == 0) {
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
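
	/*
	 * Set up one TIR per traffic role: a direct TIR for "other"
	 * traffic, and RSS (indirect) TIRs hashing over the RQ table
	 * for the TCP/UDP/IPv4/IPv6 roles.
	 */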
	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
		mlxcx_tir_t *tir = &g->mlg_tir[i];
		tir->mltir_tdom = &mlxp->mlx_tdom;
		switch (i) {
		case MLXCX_TIR_ROLE_OTHER:
			tir->mltir_type = MLXCX_TIR_DIRECT;
			tir->mltir_rq = &g->mlg_wqs[0];
			break;
		case MLXCX_TIR_ROLE_IPv4:
		case MLXCX_TIR_ROLE_IPv6:
		case MLXCX_TIR_ROLE_TCPv4:
		case MLXCX_TIR_ROLE_TCPv6:
		case MLXCX_TIR_ROLE_UDPv4:
		case MLXCX_TIR_ROLE_UDPv6:
			tir->mltir_type = MLXCX_TIR_INDIRECT;
			tir->mltir_rqtable = g->mlg_rqt;
			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
			    sizeof (tir->mltir_toeplitz_key));
			break;
		}
		switch (i) {
		case MLXCX_TIR_ROLE_OTHER:
			break;
		case MLXCX_TIR_ROLE_IPv4:
		case MLXCX_TIR_ROLE_TCPv4:
		case MLXCX_TIR_ROLE_UDPv4:
			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
			tir->mltir_hash_fields =
			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
			break;
		case MLXCX_TIR_ROLE_IPv6:
		case MLXCX_TIR_ROLE_TCPv6:
		case MLXCX_TIR_ROLE_UDPv6:
			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
			tir->mltir_hash_fields =
			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
			break;
		}
		switch (i) {
		case MLXCX_TIR_ROLE_OTHER:
		case MLXCX_TIR_ROLE_IPv4:
		case MLXCX_TIR_ROLE_IPv6:
			break;
		case MLXCX_TIR_ROLE_TCPv4:
		case MLXCX_TIR_ROLE_TCPv6:
			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
			tir->mltir_hash_fields |=
			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
			break;
		case MLXCX_TIR_ROLE_UDPv4:
		case MLXCX_TIR_ROLE_UDPv6:
			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
			tir->mltir_hash_fields |=
			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
			break;
		}

		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}

		g->mlg_state |= MLXCX_GROUP_TIRTIS;
	}

	/*
	 * Flow table: our RX hashing breakout table for RSS
	 */

	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
	    KM_SLEEP));
	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
	    sizeof (mlxcx_group_mac_t),
	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
	g->mlg_state |= MLXCX_GROUP_FLOWS;

	mutex_enter(&ft->mlft_mtx);

	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
	ft->mlft_level = 2;
	ft->mlft_port = g->mlg_port;
	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
	ft->mlft_nents = (1 << ft->mlft_entshift);
	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
	    offsetof(mlxcx_flow_group_t, mlfg_entry));

	for (j = 0; j < ft->mlft_nents; ++j) {
		ft->mlft_ent[j].mlfe_table = ft;
		ft->mlft_ent[j].mlfe_index = j;
	}

	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
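
	/*
	 * Populate the hash table with one entry per role: the most
	 * specific matches (IP version plus TCP/UDP) come first, then
	 * the IP-version-only entries, and finally a catch-all entry
	 * forwarding to the "other" TIR.
	 */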
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 6;
	fe->mlfe_ip_proto = IPPROTO_UDP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 4;
	fe->mlfe_ip_proto = IPPROTO_UDP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 6;
	fe->mlfe_ip_proto = IPPROTO_TCP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 4;
	fe->mlfe_ip_proto = IPPROTO_TCP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 6;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 4;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
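
	/* The catch-all entry: everything else goes to the "other" TIR. */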
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	mutex_exit(&ft->mlft_mtx);

	/*
	 * Flow table: the VLAN breakout table for doing VLAN filtering after
	 * we've matched a MAC address.
	 */

	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
	    KM_SLEEP));
	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
	    offsetof(mlxcx_group_vlan_t, mlgv_entry));

	mutex_enter(&ft->mlft_mtx);

	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
	ft->mlft_level = 1;
	ft->mlft_port = g->mlg_port;
	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
	ft->mlft_nents = (1 << ft->mlft_entshift);
	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
	    offsetof(mlxcx_flow_group_t, mlfg_entry));

	for (j = 0; j < ft->mlft_nents; ++j) {
		fe = &ft->mlft_ent[j];
		fe->mlfe_table = ft;
		fe->mlfe_index = j;
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
	}

	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* First group is all actual matched VLANs */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	g->mlg_rx_vlan_fg = fg;
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = ft->mlft_nents - 2;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/*
	 * Then the "default" entry which we enable when we have no VLAN IDs
	 * added to the group (we start with this enabled).
	 */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	g->mlg_rx_vlan_def_fg = fg;
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/*
	 * Finally, the promisc entry which points at the *hash ft* from the
	 * default group. We only enable this when we have promisc on.
	 */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	g->mlg_rx_vlan_promisc_fg = fg;
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ndest = 1;
	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;

	mutex_exit(&ft->mlft_mtx);

	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

boolean_t
mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    mlxcx_work_queue_t *rq)
{
	uint_t j;
	mlxcx_buffer_t *b;
	mlxcx_completion_queue_t *cq;

	mutex_enter(&g->mlg_mtx);
	/*
	 * Sadly, even though MAC has the mgi_start callback, it is not always
	 * called -- in particular when we are being managed under an aggr, the
	 * mgi_start callback will only ever be called on the default group.
	 *
	 * So instead of asserting about the group state here, we have to
	 * check it and call group start if needed.
	 */
	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
		mutex_exit(&g->mlg_mtx);
		if (!mlxcx_rx_group_start(mlxp, g))
			return (B_FALSE);
		mutex_enter(&g->mlg_mtx);
	}
	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);

	cq = rq->mlwq_cq;
	ASSERT(cq != NULL);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&rq->mlwq_mtx);

	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
		mutex_exit(&rq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_TRUE);
	}

	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
		mutex_exit(&rq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);

	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
	rq->mlwq_state |= MLXCX_WQ_BUFFERS;

	mlxcx_shard_ready(rq->mlwq_bufs);

	for (j = 0; j < rq->mlwq_nents; ++j) {
		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}

	mlxcx_rq_refill(mlxp, rq);

	mutex_exit(&rq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}
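
/*
 * Start an RX group: create its refill taskq and, for the default group,
 * point the port's broadcast and promiscuous flow entries at the group's
 * RSS hash table.
 */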
boolean_t
mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	char tq_name[TASKQ_NAMELEN];

	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
		mutex_exit(&g->mlg_mtx);
		return (B_TRUE);
	}

	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);

	g->mlg_state |= MLXCX_GROUP_RUNNING;

	(void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
	    g - &mlxp->mlx_rx_groups[0]);

	/*
	 * Create one refill taskq per group with one thread per work queue.
	 * The refill task may block waiting for resources, so by effectively
	 * having one thread per work queue we avoid work queues blocking each
	 * other.
	 */
	if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
	    g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
		mlxcx_warn(mlxp, "failed to create rq refill task queue");
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	if (g == &mlxp->mlx_rx_groups[0]) {
		ft = g->mlg_port->mlp_rx_flow;
		mutex_enter(&ft->mlft_mtx);

		/*
		 * Broadcast and promisc entries go directly to group 0's
		 * RSS hash fanout flow table. They bypass VLAN filtering.
		 */
		fg = g->mlg_port->mlp_bcast;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			g->mlg_state &= ~MLXCX_GROUP_RUNNING;
			taskq_destroy(g->mlg_refill_tq);
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}

		fg = g->mlg_port->mlp_promisc;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
		/*
		 * Don't actually set the promisc entry until promisc is
		 * enabled.
		 */

		mutex_exit(&ft->mlft_mtx);
	}

	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

boolean_t
mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_event_queue_t *eq;
	mlxcx_completion_queue_t *cq;
	mlxcx_work_queue_t *sq;
	uint_t i;

	ASSERT3S(g->mlg_state, ==, 0);

	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	g->mlg_state |= MLXCX_GROUP_INIT;
	mutex_enter(&g->mlg_mtx);

	g->mlg_mlx = mlxp;
	g->mlg_type = MLXCX_GROUP_TX;
	g->mlg_port = &mlxp->mlx_ports[0];

	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
	g->mlg_state |= MLXCX_GROUP_WQS;

	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;

	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	g->mlg_state |= MLXCX_GROUP_TIRTIS;

	for (i = 0; i < g->mlg_nwqs; ++i) {
		eq = NULL;
		while (eq == NULL) {
			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
				/* Try the next one */
				eq = NULL;
			}
		}

		if (!mlxcx_cq_setup(mlxp, eq, &cq,
		    mlxp->mlx_props.mldp_cq_size_shift)) {
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}

		cq->mlcq_stats = &g->mlg_port->mlp_stats;

		sq = &g->mlg_wqs[i];
		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}
		sq->mlwq_group = g;
	}

	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

boolean_t
mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    mlxcx_work_queue_t *sq)
{
	uint_t i;
	mlxcx_buffer_t *b;
	mlxcx_completion_queue_t *cq;

	mutex_enter(&g->mlg_mtx);

	cq = sq->mlwq_cq;
	ASSERT(cq != NULL);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&sq->mlwq_mtx);
	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
		mutex_exit(&sq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_TRUE);
	}
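
	/*
	 * Pre-allocate the send queue's buffers: both "foreign" buffers
	 * (which will be bound to callers' mblks) and ordinary copy
	 * buffers.
	 */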
	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
	for (i = 0; i < sq->mlwq_nents; ++i) {
		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (i = 0; i < sq->mlwq_nents; ++i) {
		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	sq->mlwq_state |= MLXCX_WQ_BUFFERS;

	mlxcx_shard_ready(sq->mlwq_bufs);
	mlxcx_shard_ready(sq->mlwq_foreign_bufs);

	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
		mutex_exit(&sq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	g->mlg_state |= MLXCX_GROUP_RUNNING;

	(void) mlxcx_sq_add_nop(mlxp, sq);

	mutex_exit(&sq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

static boolean_t
mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
{
	uint_t idx;
	mlxcx_bf_t *bf;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
	ASSERT(mutex_owned(&mlwq->mlwq_mtx));

	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);

	ASSERT(mlwq->mlwq_cq != NULL);
	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
	bf = &mlwq->mlwq_uar->mlu_bf[idx];

retry:
	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return (B_TRUE);
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
	return (B_FALSE);
}

boolean_t
mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	uint_t index, start_pc;
	mlxcx_sendq_ent_t *ent0;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));

	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	start_pc = mlwq->mlwq_pc;
	++mlwq->mlwq_pc;
	/*
	 * This counter is manipulated in the interrupt handler, which
	 * does not hold the mlwq_mtx, hence the atomic.
	 */
	atomic_inc_64(&mlwq->mlwq_wqebb_used);

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	ent0->mlsqe_control.mlcs_ds = 1;

	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
		return (B_FALSE);
	}
	return (B_TRUE);
}
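
/*
 * Post a SEND work queue entry for a buffer chain: fill in the control,
 * eth (inline headers, checksum flags) and data segments, then ring the
 * queue's doorbell.
 */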
boolean_t
mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
    mlxcx_buffer_t *b0)
{
	uint_t index, first, ents;
	mlxcx_completion_queue_t *cq;
	mlxcx_sendq_ent_t *ent0;
	mlxcx_sendq_extra_ent_t *ent;
	mlxcx_wqe_data_seg_t *seg;
	uint_t ptri, nptr;
	const ddi_dma_cookie_t *c;
	size_t rem;
	uint64_t wqebb_used;
	mlxcx_buffer_t *b;
	ddi_fm_error_t err;
	boolean_t rv;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	ASSERT3P(b0->mlb_tx_head, ==, b0);
	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	cq = mlwq->mlwq_cq;

	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	b0->mlb_wqe_index = mlwq->mlwq_pc;
	ents = 1;

	first = index;

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
	set_bits16(&ent0->mlsqe_eth.mles_szflags,
	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
	if (inlinelen > 0) {
		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
		    inlinelen);
	}

	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
	    MLXCX_WQE_OCTOWORD;

	if (chkflags & HCK_IPV4_HDRCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
	}
	if (chkflags & HCK_FULLCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
	}

	/*
	 * mlwq_wqebb_used is only incremented whilst holding
	 * the mlwq_mtx mutex, but it is decremented (atomically) in
	 * the interrupt context *not* under mlwq_mtx mutex.
	 * So, now take a snapshot of the number of used wqes which will
	 * be a consistent maximum we can use whilst iterating through
	 * the buffers and DMA cookies.
	 */
	wqebb_used = mlwq->mlwq_wqebb_used;

	b = b0;
	ptri = 0;
	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
	seg = ent0->mlsqe_data;
	while (b != NULL) {
		rem = b->mlb_used;

		c = NULL;
		while (rem > 0 &&
		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
			if (ptri >= nptr) {
				if ((ents + wqebb_used) >= mlwq->mlwq_nents)
					return (B_FALSE);

				index = (mlwq->mlwq_pc + ents) &
				    (mlwq->mlwq_nents - 1);
				ent = &mlwq->mlwq_send_extra_ent[index];
				++ents;

				seg = ent->mlsqe_data;
				ptri = 0;
				nptr = sizeof (ent->mlsqe_data) /
				    sizeof (mlxcx_wqe_data_seg_t);
			}

			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
			if (c->dmac_size > rem) {
				seg->mlds_byte_count = to_be32(rem);
				rem = 0;
			} else {
				seg->mlds_byte_count = to_be32(c->dmac_size);
				rem -= c->dmac_size;
			}
			seg->mlds_address = to_be64(c->dmac_laddress);
			++seg;
			++ptri;
			++ent0->mlsqe_control.mlcs_ds;

			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
			    MLXCX_SQE_MAX_DS);
		}

		if (b == b0) {
			b = list_head(&b0->mlb_tx_chain);
		} else {
			b = list_next(&b0->mlb_tx_chain, b);
		}
	}

	b0->mlb_wqebbs = ents;
	mlwq->mlwq_pc += ents;
	atomic_add_64(&mlwq->mlwq_wqebb_used, ents);

	for (; ptri < nptr; ++ptri, ++seg) {
		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
		seg->mlds_byte_count = to_be32(0);
		seg->mlds_address = to_be64(0);
	}

	/*
	 * Make sure the workqueue entry is flushed out before updating
	 * the doorbell.
	 * If the ring has wrapped, we need to flush the front and back.
	 */
	if ((first + ents) > mlwq->mlwq_nents) {
		uint_t sync_cnt = mlwq->mlwq_nents - first;

		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
		    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
		    sync_cnt * sizeof (mlxcx_sendq_ent_t),
		    DDI_DMA_SYNC_FORDEV));

		ent0 = &mlwq->mlwq_send_ent[0];
		ents -= sync_cnt;
	}

	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}

	/*
	 * Hold the bufmtx whilst ringing the doorbell, to prevent
	 * the buffer from being moved to another list, so we can
	 * safely remove it should the ring fail.
	 */
	mutex_enter(&cq->mlcq_bufbmtx);

	list_insert_tail(&cq->mlcq_buffers_b, b0);
	if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
		atomic_inc_64(&cq->mlcq_bufcnt);
	} else {
		list_remove(&cq->mlcq_buffers_b, b0);
	}

	mutex_exit(&cq->mlcq_bufbmtx);

	return (rv);
}

boolean_t
mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    mlxcx_buffer_t *buf)
{
	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
}

boolean_t
mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    mlxcx_buffer_t **bufs, size_t nbufs)
{
	uint_t index;
	mlxcx_recvq_ent_t *ent;
	mlxcx_completion_queue_t *cq;
	mlxcx_wqe_data_seg_t *seg;
	uint_t bi, ptri;
	const ddi_dma_cookie_t *c;
	mlxcx_buffer_t *buf;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	cq = mlwq->mlwq_cq;
	ASSERT(mutex_owned(&cq->mlcq_mtx));

	for (bi = 0; bi < nbufs; ++bi) {
		buf = bufs[bi];
		bufs[bi] = NULL;
		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);

		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
		ent = &mlwq->mlwq_recv_ent[index];
		buf->mlb_wqe_index = mlwq->mlwq_pc;
		buf->mlb_wqebbs = 1;

		++mlwq->mlwq_pc;
		atomic_inc_64(&mlwq->mlwq_wqebb_used);

		mutex_enter(&cq->mlcq_bufbmtx);
		list_insert_tail(&cq->mlcq_buffers, buf);
		atomic_inc_64(&cq->mlcq_bufcnt);
		mutex_exit(&cq->mlcq_bufbmtx);

		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
		ptri = 0;
		c = NULL;
		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
			seg = &ent->mlrqe_data[ptri++];
			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
			seg->mlds_byte_count = to_be32(c->dmac_size);
			seg->mlds_address = to_be64(c->dmac_laddress);
		}
		/*
		 * Fill any unused scatter pointers with the special null
		 * value.
		 */
		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
			seg = &ent->mlrqe_data[ptri];
			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
			seg->mlds_byte_count = to_be32(0);
			seg->mlds_address = to_be64(0);
		}

		/*
		 * Make sure the workqueue entry is flushed out before updating
		 * the doorbell.
		 */
		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
		    DDI_FME_VERSION);
		if (err.fme_status != DDI_FM_OK) {
			return (B_FALSE);
		}
	}

	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
	/*
	 * Flush the CQ doorbell as well so that HW knows how many
	 * completions we've consumed.
	 */
	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	return (B_TRUE);
}
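
/*
 * Taskq worker which waits for free buffers and keeps refilling a receive
 * queue until it is full enough, the shard is draining, or the queue is
 * being torn down.
 */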
static void
mlxcx_rq_refill_task(void *arg)
{
	mlxcx_work_queue_t *wq = arg;
	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
	mlxcx_t *mlxp = wq->mlwq_mlx;
	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
	boolean_t refill, draining;

	do {
		/*
		 * Wait here until one of 3 conditions:
		 * 1. The shard is draining, or
		 * 2. There are buffers on the free list, or
		 * 3. The WQ is being shut down.
		 */
		mutex_enter(&s->mlbs_mtx);
		while (s->mlbs_state != MLXCX_SHARD_DRAINING &&
		    list_is_empty(&s->mlbs_free) &&
		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) {
			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
		}

		draining = (s->mlbs_state == MLXCX_SHARD_DRAINING);
		mutex_exit(&s->mlbs_mtx);

		mutex_enter(&cq->mlcq_mtx);
		mutex_enter(&wq->mlwq_mtx);

		if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
			refill = B_FALSE;
			wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
		} else {
			mlxcx_rq_refill(mlxp, wq);

			if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
				refill = B_TRUE;
			} else {
				refill = B_FALSE;
				wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
			}
		}

		mutex_exit(&wq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
	} while (refill);
}

void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	size_t target, current, want, done, n;
	mlxcx_completion_queue_t *cq;
	mlxcx_ring_group_t *g;
	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
	uint_t i;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	cq = mlwq->mlwq_cq;
	ASSERT(mutex_owned(&cq->mlcq_mtx));

	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);

	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
	cq = mlwq->mlwq_cq;

	if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
		return;

	if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
		return;

	current = cq->mlcq_bufcnt;

	if (current >= target - MLXCX_RQ_REFILL_STEP)
		return;

	want = target - current;
	done = 0;

	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
		if (n == 0) {
			/*
			 * We didn't get any buffers from the free queue.
			 * This might not be an issue; schedule a taskq
			 * to wait for free buffers if the completion
			 * queue is low.
			 */
			if (current < MLXCX_RQ_REFILL_STEP &&
			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
				g = mlwq->mlwq_group;
				taskq_dispatch_ent(g->mlg_refill_tq,
				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
				    &mlwq->mlwq_tqe);
			}

			return;
		}

		if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
			for (i = 0; i < n; ++i)
				mlxcx_buf_return(mlxp, b[i]);
			return;
		}
		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
			/*
			 * mlxcx_rq_add_buffers NULLs out the buffers as it
			 * enqueues them, so any that are non-NULL we have to
			 * free now. The others now belong to the WQ, even if
			 * we failed.
			 */
			for (i = 0; i < n; ++i) {
				if (b[i] != NULL) {
					mlxcx_buf_return(mlxp, b[i]);
				}
			}
			return;
		}
		done += n;
	}
}

static const char *
mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
{
	switch (sy) {
	case MLXCX_CQ_ERR_LOCAL_LENGTH:
		return ("LOCAL_LENGTH");
	case MLXCX_CQ_ERR_LOCAL_QP_OP:
		return ("LOCAL_QP_OP");
	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
		return ("LOCAL_PROTECTION");
	case MLXCX_CQ_ERR_WR_FLUSHED:
		return ("WR_FLUSHED");
	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
		return ("MEM_WINDOW_BIND");
	case MLXCX_CQ_ERR_BAD_RESPONSE:
		return ("BAD_RESPONSE");
	case MLXCX_CQ_ERR_LOCAL_ACCESS:
		return ("LOCAL_ACCESS");
	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
		return ("XPORT_RETRY_CTR");
	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
		return ("RNR_RETRY_CTR");
	case MLXCX_CQ_ERR_ABORTED:
		return ("ABORTED");
	default:
		return ("UNKNOWN");
	}
}
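
/*
 * Post an FMA ereport describing a completion queue error entry and mark
 * the device as degraded.
 */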
"send": "recv", 1986 "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num, 1987 "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num, 1988 NULL); 1989 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1990 } 1991 1992 void 1993 mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 1994 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) 1995 { 1996 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 1997 if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) { 1998 mlxcx_completionq_error_ent_t *eent = 1999 (mlxcx_completionq_error_ent_t *)ent; 2000 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); 2001 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2002 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); 2003 mlxcx_check_sq(mlxp, mlcq->mlcq_wq); 2004 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); 2005 return; 2006 } 2007 2008 if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) { 2009 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); 2010 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2011 return; 2012 } 2013 2014 if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) { 2015 mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x", 2016 ent->mlcqe_send_wqe_opcode); 2017 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2018 return; 2019 } 2020 2021 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { 2022 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); 2023 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2024 return; 2025 } 2026 2027 mlxcx_buf_return_chain(mlxp, buf, B_FALSE); 2028 } 2029 2030 mblk_t * 2031 mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, 2032 mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) 2033 { 2034 uint32_t chkflags = 0; 2035 uint_t wqe_index; 2036 ddi_fm_error_t err; 2037 2038 ASSERT(mutex_owned(&mlcq->mlcq_mtx)); 2039 2040 if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) { 2041 mlxcx_completionq_error_ent_t *eent = 2042 (mlxcx_completionq_error_ent_t *)ent; 2043 mlxcx_fm_cqe_ereport(mlxp, mlcq, eent); 2044 mlxcx_buf_return(mlxp, buf); 2045 mutex_enter(&mlcq->mlcq_wq->mlwq_mtx); 2046 mlxcx_check_rq(mlxp, mlcq->mlcq_wq); 2047 mutex_exit(&mlcq->mlcq_wq->mlwq_mtx); 2048 return (NULL); 2049 } 2050 2051 if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) { 2052 mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode); 2053 mlxcx_buf_return(mlxp, buf); 2054 return (NULL); 2055 } 2056 2057 if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) { 2058 mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format); 2059 mlxcx_buf_return(mlxp, buf); 2060 return (NULL); 2061 } 2062 2063 if (ent->mlcqe_rx_drop_counter > 0) { 2064 atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops, 2065 ent->mlcqe_rx_drop_counter); 2066 } 2067 2068 MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU); 2069 ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err, 2070 DDI_FME_VERSION); 2071 if (err.fme_status != DDI_FM_OK) { 2072 ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle, 2073 DDI_FME_VERSION); 2074 mlxcx_buf_return(mlxp, buf); 2075 return (NULL); 2076 } 2077 2078 /* 2079 * mlxcx_buf_loan() will set mlb_wqe_index to zero. 2080 * Remember it for later. 
	 */
	wqe_index = buf->mlb_wqe_index;

	if (!mlxcx_buf_loan(mlxp, buf)) {
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	buf->mlb_mp->b_next = NULL;
	buf->mlb_mp->b_cont = NULL;
	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
	    from_be32(ent->mlcqe_byte_cnt);

	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
		chkflags |= HCK_FULLCKSUM_OK;
	}
	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
		chkflags |= HCK_IPV4_HDRCKSUM_OK;
	}
	if (chkflags != 0) {
		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
		    from_be16(ent->mlcqe_checksum), chkflags);
	}

	/*
	 * Don't check if a refill is needed on every single completion,
	 * since checking involves taking the RQ lock.
	 */
	if ((wqe_index & 0x7) == 0) {
		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
		ASSERT(wq != NULL);
		mutex_enter(&wq->mlwq_mtx);
		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
			mlxcx_rq_refill(mlxp, wq);
		mutex_exit(&wq->mlwq_mtx);
	}

	return (buf->mlb_mp);
}

static void
mlxcx_buf_mp_return(caddr_t arg)
{
	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
	mlxcx_t *mlxp = b->mlb_mlx;

	/* The mblk has been used now, so NULL it out. */
	b->mlb_mp = NULL;

	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
		mlxcx_buf_return(mlxp, b);
}

boolean_t
mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_FALSE;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
	b->mlb_frtn.free_arg = (caddr_t)b;
	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);

	*bp = b;

	return (B_TRUE);
}

boolean_t
mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
    mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_TRUE;

	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	*bp = b;

	return (B_TRUE);
}

static mlxcx_buffer_t *
mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		ASSERT(b->mlb_foreign);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

static mlxcx_buffer_t *
mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr,
    size_t sz)
{
	ddi_fm_error_t err;
	mlxcx_buffer_t *b;
	uint_t attempts = 0;

copyb:
	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
		return (NULL);

	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
	bcopy(rptr, b->mlb_dma.mxdb_va, sz);

	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);

	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, b);
		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
			return (NULL);
		}
		goto copyb;
	}

	return (b);
}

static mlxcx_buffer_t *
mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mp, size_t off)
{
	mlxcx_buffer_t *b;
	uint8_t *rptr;
	size_t sz;
	boolean_t ret;

	rptr = mp->b_rptr;
	sz = MBLKL(mp);

#ifdef DEBUG
	if (off > 0) {
		ASSERT3U(off, <, sz);
	}
#endif

	rptr += off;
	sz -= off;

	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
	} else {
		b = mlxcx_buf_take_foreign(mlxp, wq);
		if (b == NULL)
			return (NULL);

		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
		    B_FALSE);

		if (!ret) {
			mlxcx_buf_return(mlxp, b);

			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
		}
	}

	return (b);
}

uint_t
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b, *b0 = NULL;
	boolean_t first = B_TRUE;
	mblk_t *mp;
	size_t offset = off;
	size_t ncookies = 0;
	uint_t count = 0;

	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
	    mp = mp->b_cont) {
		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
		if (b == NULL)
			goto failed;

		ncookies += b->mlb_dma.mxdb_ncookies;

		if (first)
			b0 = b;

		if (!first)
			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;

		b->mlb_tx_mp = mp;
		b->mlb_tx_head = b0;
		b->mlb_used = MBLKL(mp) - offset;

		if (!first)
			list_insert_tail(&b0->mlb_tx_chain, b);
		first = B_FALSE;
		offset = 0;

		count++;
	}

	/*
	 * The chain of mblks has resulted in too many cookies for
	 * a single message. This is unusual, so take the hit to tidy
	 * up, do a pullup to a single mblk and allocate the requisite
	 * buf.
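	 * The buffers already taken for the original chain are returned
	 * first, then the pulled-up mblk is bound (or copied) as a
	 * single fresh buffer.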
	 */
	if (ncookies > MLXCX_SQE_MAX_PTRS) {
		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
		    mblk_t *, mpb, size_t, ncookies);

		if (b0 != NULL)
			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

		if ((mp = msgpullup(mpb, -1)) == NULL)
			return (0);

		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
		if (b0 == NULL) {
			freemsg(mp);
			return (0);
		}
		freemsg(mpb);

		b0->mlb_tx_mp = mp;
		b0->mlb_tx_head = b0;
		b0->mlb_used = MBLKL(mp) - off;

		count = 1;
	}

	*bp = b0;

	return (count);

failed:
	if (b0 != NULL)
		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

	return (0);
}

mlxcx_buffer_t *
mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp,
    size_t nbufs)
{
	mlxcx_buffer_t *b;
	size_t done = 0;
	mlxcx_buf_shard_t *s;

	s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (0);
	}

	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
		bp[done++] = b;
	}
	mutex_exit(&s->mlbs_mtx);
	return (done);
}

boolean_t
mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	if (b->mlb_mp == NULL) {
		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
		if (b->mlb_mp == NULL)
			return (B_FALSE);
	}

	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
	b->mlb_wqe_index = 0;

	mutex_enter(&s->mlbs_mtx);
	list_remove(&s->mlbs_busy, b);
	list_insert_tail(&s->mlbs_loaned, b);
	mutex_exit(&s->mlbs_mtx);

	return (B_TRUE);
}

void
mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
{
	mlxcx_buffer_t *b;

	if (b0->mlb_tx_head != b0) {
		mlxcx_buf_return(mlxp, b0);
		return;
	}

	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
		mlxcx_buf_return(mlxp, b);
	}
	if (keepmp) {
		b0->mlb_tx_mp = NULL;
		b0->mlb_tx_head = NULL;
	}
	mlxcx_buf_return(mlxp, b0);
}

void
mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buffer_state_t oldstate = b->mlb_state;
	mlxcx_buffer_t *txhead = b->mlb_tx_head;
	mlxcx_buf_shard_t *s = b->mlb_shard;
	mblk_t *mp = b->mlb_tx_mp;

	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	/*
	 * The mlbs_mtx held below is a heavily contended lock, so it is
	 * imperative we do as much of the buffer clean up
	 * outside the lock as possible.
	 */
	b->mlb_state = MLXCX_BUFFER_FREE;
	b->mlb_wqe_index = 0;
	b->mlb_tx_head = NULL;
	b->mlb_tx_mp = NULL;
	b->mlb_used = 0;
	b->mlb_wqebbs = 0;
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	if (b->mlb_foreign) {
		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
		}
	}

	mutex_enter(&s->mlbs_mtx);
	switch (oldstate) {
	case MLXCX_BUFFER_INIT:
		break;
	case MLXCX_BUFFER_ON_WQ:
		list_remove(&s->mlbs_busy, b);
		break;
	case MLXCX_BUFFER_ON_LOAN:
		ASSERT(!b->mlb_foreign);
		list_remove(&s->mlbs_loaned, b);
		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
			/*
			 * When we're draining, e.g. during mac_stop(),
			 * we destroy the buffer immediately rather than
			 * recycling it. Otherwise we risk leaving it
			 * on the free list and leaking it.
			 */
			list_insert_tail(&s->mlbs_free, b);
			mlxcx_buf_destroy(mlxp, b);
			/*
			 * Teardown might be waiting for the loaned list
			 * to empty.
			 */
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
			return;
		}
		break;
	case MLXCX_BUFFER_FREE:
		VERIFY(0);
		break;
	case MLXCX_BUFFER_ON_CHAIN:
		ASSERT(txhead != NULL);
		list_remove(&txhead->mlb_tx_chain, b);
		list_remove(&s->mlbs_busy, b);
		break;
	}

	list_insert_tail(&s->mlbs_free, b);
	cv_broadcast(&s->mlbs_free_nonempty);

	mutex_exit(&s->mlbs_mtx);

	/*
	 * For TX chain heads, free the mblk_t after we let go of the lock.
	 * This might be a borrowed buf that we in turn loaned to MAC, in which
	 * case calling freemsg() on it will re-enter this very function -- so
	 * we better not be holding the lock!
	 */
	if (txhead == b)
		freemsg(mp);
}

void
mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
	    b->mlb_state == MLXCX_BUFFER_INIT);
	ASSERT(mutex_owned(&s->mlbs_mtx));

	if (b->mlb_state == MLXCX_BUFFER_FREE)
		list_remove(&s->mlbs_free, b);

	/*
	 * This is going back to the kmem cache, so it needs to be set up in
	 * the same way we expect a new buffer to come out (state INIT, other
	 * fields NULL'd).
	 */
	b->mlb_state = MLXCX_BUFFER_INIT;
	b->mlb_shard = NULL;
	if (b->mlb_mp != NULL) {
		freeb(b->mlb_mp);
		ASSERT(b->mlb_mp == NULL);
	}
	mlxcx_dma_free(&b->mlb_dma);
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	kmem_cache_free(mlxp->mlx_bufs_cache, b);
}

void
mlxcx_shard_ready(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_READY;
	mutex_exit(&s->mlbs_mtx);
}

void
mlxcx_shard_draining(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_DRAINING;
	cv_broadcast(&s->mlbs_free_nonempty);
	mutex_exit(&s->mlbs_mtx);
}