1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/kmem.h> 28 #include <sys/conf.h> 29 #include <sys/ddi.h> 30 #include <sys/sunddi.h> 31 #include <sys/ksynch.h> 32 33 #include <sys/ib/clients/eoib/eib_impl.h> 34 35 /* 36 * Declarations private to this file 37 */ 38 static int eib_rsrc_setup_txbufs(eib_t *, int *); 39 static int eib_rsrc_setup_rxbufs(eib_t *, int *); 40 static int eib_rsrc_setup_lsobufs(eib_t *, int *); 41 static void eib_rsrc_init_wqe_pool(eib_t *, eib_wqe_pool_t **, 42 ib_memlen_t, int); 43 static void eib_rsrc_fini_wqe_pool(eib_t *, eib_wqe_pool_t **); 44 static boolean_t eib_rsrc_ok_to_free_pool(eib_t *, eib_wqe_pool_t *, boolean_t); 45 static int eib_rsrc_grab_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, uint_t, 46 uint_t *, int); 47 static void eib_rsrc_return_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, 48 uint_t); 49 50 static void eib_rb_rsrc_setup_txbufs(eib_t *, boolean_t); 51 static void eib_rb_rsrc_setup_rxbufs(eib_t *, boolean_t); 52 static void eib_rb_rsrc_setup_lsobufs(eib_t *, boolean_t); 53 54 /* 55 * Definitions private to this file 56 */ 57 static uint_t eib_lso_num_bufs = EIB_LSO_NUM_BUFS; /* tunable? */ 58 59 int 60 eib_rsrc_setup_bufs(eib_t *ss, int *err) 61 { 62 if (eib_rsrc_setup_txbufs(ss, err) != EIB_E_SUCCESS) 63 return (EIB_E_FAILURE); 64 65 if (ss->ei_caps->cp_lso_maxlen && ss->ei_caps->cp_cksum_flags && 66 ss->ei_caps->cp_resv_lkey_capab) { 67 if (eib_rsrc_setup_lsobufs(ss, err) != EIB_E_SUCCESS) { 68 eib_rb_rsrc_setup_txbufs(ss, B_FALSE); 69 return (EIB_E_FAILURE); 70 } 71 } 72 73 if (eib_rsrc_setup_rxbufs(ss, err) != EIB_E_SUCCESS) { 74 eib_rb_rsrc_setup_lsobufs(ss, B_FALSE); 75 eib_rb_rsrc_setup_txbufs(ss, B_FALSE); 76 return (EIB_E_FAILURE); 77 } 78 79 return (EIB_E_SUCCESS); 80 } 81 82 int 83 eib_rsrc_grab_swqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual, 84 int pri) 85 { 86 eib_wqe_t *wqe; 87 uint32_t *encap_hdr; 88 int ret; 89 int i; 90 91 ASSERT(ss->ei_tx != NULL); 92 93 ret = eib_rsrc_grab_wqes(ss, ss->ei_tx, wqes, n_req, actual, pri); 94 if (ret != EIB_E_SUCCESS) 95 return (EIB_E_FAILURE); 96 97 /* 98 * See note for eib_rsrc_grab_swqe() 99 */ 100 for (i = 0; i < (*actual); i++) { 101 wqe = wqes[i]; 102 wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; 103 wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; 104 wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; 105 wqe->qe_wr.send.wr_nds = 1; 106 wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; 107 wqe->qe_nxt_post = NULL; 108 wqe->qe_iov_hdl = NULL; 109 110 encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; 111 *encap_hdr = htonl(EIB_TX_ENCAP_HDR); 112 } 113 114 return (EIB_E_SUCCESS); 115 } 116 117 int 118 eib_rsrc_grab_rwqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual, 119 int pri) 120 { 121 ASSERT(ss->ei_rx != NULL); 122 123 return (eib_rsrc_grab_wqes(ss, ss->ei_rx, wqes, n_req, actual, pri)); 124 } 125 126 int 127 eib_rsrc_grab_lsobufs(eib_t *ss, uint_t req_sz, ibt_wr_ds_t *sgl, uint32_t *nds) 128 { 129 eib_lsobkt_t *bkt = ss->ei_lso; 130 eib_lsobuf_t *elem; 131 eib_lsobuf_t *nxt; 132 uint_t frag_sz; 133 uint_t num_needed; 134 int i; 135 136 ASSERT(req_sz != 0); 137 ASSERT(sgl != NULL); 138 ASSERT(nds != NULL); 139 140 /* 141 * Determine how many bufs we'd need for the size requested 142 */ 143 num_needed = req_sz / EIB_LSO_BUFSZ; 144 if ((frag_sz = req_sz % EIB_LSO_BUFSZ) != 0) 145 num_needed++; 146 147 if (bkt == NULL) 148 return (EIB_E_FAILURE); 149 150 /* 151 * If we don't have enough lso bufs, return failure 152 */ 153 mutex_enter(&bkt->bk_lock); 154 if (bkt->bk_nfree < num_needed) { 155 mutex_exit(&bkt->bk_lock); 156 return (EIB_E_FAILURE); 157 } 158 159 /* 160 * Pick the first "num_needed" bufs from the free list 161 */ 162 elem = bkt->bk_free_head; 163 for (i = 0; i < num_needed; i++) { 164 ASSERT(elem->lb_isfree != 0); 165 ASSERT(elem->lb_buf != NULL); 166 167 nxt = elem->lb_next; 168 169 sgl[i].ds_va = (ib_vaddr_t)(uintptr_t)elem->lb_buf; 170 sgl[i].ds_key = bkt->bk_lkey; 171 sgl[i].ds_len = EIB_LSO_BUFSZ; 172 173 elem->lb_isfree = 0; 174 elem->lb_next = NULL; 175 176 elem = nxt; 177 } 178 bkt->bk_free_head = elem; 179 180 /* 181 * If the requested size is not a multiple of EIB_LSO_BUFSZ, we need 182 * to adjust the last sgl entry's length. Since we know we need atleast 183 * one, the i-1 use below is ok. 184 */ 185 if (frag_sz) { 186 sgl[i-1].ds_len = frag_sz; 187 } 188 189 /* 190 * Update nfree count and return 191 */ 192 bkt->bk_nfree -= num_needed; 193 194 mutex_exit(&bkt->bk_lock); 195 196 *nds = num_needed; 197 198 return (EIB_E_SUCCESS); 199 } 200 201 eib_wqe_t * 202 eib_rsrc_grab_swqe(eib_t *ss, int pri) 203 { 204 eib_wqe_t *wqe = NULL; 205 uint32_t *encap_hdr; 206 207 ASSERT(ss->ei_tx != NULL); 208 (void) eib_rsrc_grab_wqes(ss, ss->ei_tx, &wqe, 1, NULL, pri); 209 210 /* 211 * Let's reset the swqe basic wr parameters to default. We need 212 * to do this because this swqe could've previously been used 213 * for a checksum offload (when the flags would've been set) 214 * or for an LSO send (in which case the opcode would've been set 215 * to a different value), or been iov mapped (in which case the 216 * sgl/nds could've been set to different values). We'll make 217 * it easy and initialize it here, so simple transactions can 218 * go through without any special effort by the caller. 219 * 220 * Note that even though the wqe structure is common for both 221 * send and recv, they're in two independent pools and the wqe 222 * type remains the same throughout its lifetime. So we don't 223 * have to worry about resetting any other field. 224 */ 225 if (wqe) { 226 wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; 227 wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; 228 wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; 229 wqe->qe_wr.send.wr_nds = 1; 230 wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; 231 wqe->qe_nxt_post = NULL; 232 wqe->qe_iov_hdl = NULL; 233 234 encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; 235 *encap_hdr = htonl(EIB_TX_ENCAP_HDR); 236 } 237 238 return (wqe); 239 } 240 241 eib_wqe_t * 242 eib_rsrc_grab_rwqe(eib_t *ss, int pri) 243 { 244 eib_wqe_t *wqe = NULL; 245 246 ASSERT(ss->ei_rx != NULL); 247 (void) eib_rsrc_grab_wqes(ss, ss->ei_rx, &wqe, 1, NULL, pri); 248 249 return (wqe); 250 } 251 252 void 253 eib_rsrc_return_swqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan) 254 { 255 ASSERT(ss->ei_tx != NULL); 256 257 eib_rsrc_return_wqes(ss, ss->ei_tx, &wqe, 1); 258 if (chan) { 259 eib_rsrc_decr_posted_swqe(ss, chan); 260 } 261 } 262 263 264 void 265 eib_rsrc_return_rwqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan) 266 { 267 ASSERT(ss->ei_rx != NULL); 268 269 eib_rsrc_return_wqes(ss, ss->ei_rx, &wqe, 1); 270 if (chan) { 271 eib_rsrc_decr_posted_rwqe(ss, chan); 272 } 273 } 274 275 void 276 eib_rsrc_return_lsobufs(eib_t *ss, ibt_wr_ds_t *sgl_p, uint32_t nds) 277 { 278 eib_lsobkt_t *bkt = ss->ei_lso; 279 eib_lsobuf_t *elem; 280 uint8_t *va; 281 ptrdiff_t ndx; 282 int i; 283 284 /* 285 * Nowhere to return the buffers to ?? 286 */ 287 if (bkt == NULL) 288 return; 289 290 mutex_enter(&bkt->bk_lock); 291 292 for (i = 0; i < nds; i++) { 293 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 294 295 ASSERT(va >= bkt->bk_mem); 296 ASSERT(va < (bkt->bk_mem + bkt->bk_nelem * EIB_LSO_BUFSZ)); 297 298 /* 299 * Figure out the buflist element this sgl buffer corresponds 300 * to and put it back at the head 301 */ 302 ndx = ((uintptr_t)va - (uintptr_t)bkt->bk_mem) / EIB_LSO_BUFSZ; 303 elem = bkt->bk_bufl + ndx; 304 305 ASSERT(elem->lb_isfree == 0); 306 ASSERT(elem->lb_buf == va); 307 308 elem->lb_isfree = 1; 309 elem->lb_next = bkt->bk_free_head; 310 bkt->bk_free_head = elem; 311 } 312 bkt->bk_nfree += nds; 313 314 /* 315 * If the number of available lso buffers just crossed the 316 * threshold, wakeup anyone who may be sleeping on the event. 317 */ 318 if (((bkt->bk_nfree - nds) < EIB_LSO_FREE_BUFS_THRESH) && 319 (bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH)) { 320 cv_broadcast(&bkt->bk_cv); 321 } 322 323 mutex_exit(&bkt->bk_lock); 324 } 325 326 /*ARGSUSED*/ 327 void 328 eib_rsrc_decr_posted_swqe(eib_t *ss, eib_chan_t *chan) 329 { 330 ASSERT(chan != NULL); 331 332 mutex_enter(&chan->ch_tx_lock); 333 334 chan->ch_tx_posted--; 335 if ((chan->ch_tear_down) && (chan->ch_tx_posted == 0)) { 336 cv_signal(&chan->ch_tx_cv); 337 } 338 339 mutex_exit(&chan->ch_tx_lock); 340 } 341 342 void 343 eib_rsrc_decr_posted_rwqe(eib_t *ss, eib_chan_t *chan) 344 { 345 eib_chan_t *tail; 346 boolean_t queue_for_refill = B_FALSE; 347 348 ASSERT(chan != NULL); 349 350 /* 351 * Decrement the ch_rx_posted count. If we are tearing this channel 352 * down, signal the waiter when the count reaches 0. If we aren't 353 * tearing the channel down, see if the count has gone below the low 354 * water mark. If it has, and if this channel isn't already being 355 * refilled, queue the channel up with the service thread for a 356 * rwqe refill. 357 */ 358 mutex_enter(&chan->ch_rx_lock); 359 chan->ch_rx_posted--; 360 if (chan->ch_tear_down) { 361 if (chan->ch_rx_posted == 0) 362 cv_signal(&chan->ch_rx_cv); 363 } else if (chan->ch_rx_posted < chan->ch_lwm_rwqes) { 364 if (chan->ch_rx_refilling == B_FALSE) { 365 chan->ch_rx_refilling = B_TRUE; 366 queue_for_refill = B_TRUE; 367 } 368 } 369 mutex_exit(&chan->ch_rx_lock); 370 371 if (queue_for_refill) { 372 mutex_enter(&ss->ei_rxpost_lock); 373 374 chan->ch_rxpost_next = NULL; 375 for (tail = ss->ei_rxpost; tail; tail = tail->ch_rxpost_next) { 376 if (tail->ch_rxpost_next == NULL) 377 break; 378 } 379 if (tail) { 380 tail->ch_rxpost_next = chan; 381 } else { 382 ss->ei_rxpost = chan; 383 } 384 385 cv_signal(&ss->ei_rxpost_cv); 386 mutex_exit(&ss->ei_rxpost_lock); 387 } 388 } 389 390 void 391 eib_rsrc_txwqes_needed(eib_t *ss) 392 { 393 eib_wqe_pool_t *wp = ss->ei_tx; 394 395 EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf); 396 397 mutex_enter(&wp->wp_lock); 398 if ((wp->wp_status & EIB_TXWQE_SHORT) == 0) { 399 wp->wp_status |= EIB_TXWQE_SHORT; 400 cv_broadcast(&wp->wp_cv); 401 } 402 mutex_exit(&wp->wp_lock); 403 } 404 405 void 406 eib_rsrc_lsobufs_needed(eib_t *ss) 407 { 408 eib_lsobkt_t *bkt = ss->ei_lso; 409 410 EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf); 411 412 if (bkt == NULL) { 413 EIB_DPRINTF_WARN(ss->ei_instance, 414 "eib_rsrc_lsobufs_needed: " 415 "lso bufs seem to be needed even though " 416 "LSO support was not advertised"); 417 return; 418 } 419 420 mutex_enter(&bkt->bk_lock); 421 if ((bkt->bk_status & EIB_LBUF_SHORT) == 0) { 422 bkt->bk_status |= EIB_LBUF_SHORT; 423 cv_broadcast(&bkt->bk_cv); 424 } 425 mutex_exit(&bkt->bk_lock); 426 } 427 428 boolean_t 429 eib_rsrc_rxpool_low(eib_wqe_t *wqe) 430 { 431 eib_wqe_pool_t *wp = wqe->qe_pool; 432 boolean_t ret = B_FALSE; 433 434 /* 435 * Set the EIB_RXWQE_SHORT flag when the number of free wqes 436 * in the rx pool falls below the low threshold for rwqes and 437 * clear it only when the number of free wqes gets back above 438 * the high water mark. 439 */ 440 mutex_enter(&wp->wp_lock); 441 442 if (wp->wp_nfree <= EIB_NFREE_RWQES_LOW) { 443 wp->wp_status |= (EIB_RXWQE_SHORT); 444 } else if (wp->wp_nfree >= EIB_NFREE_RWQES_HWM) { 445 wp->wp_status &= (~EIB_RXWQE_SHORT); 446 } 447 448 if ((wp->wp_status & EIB_RXWQE_SHORT) == EIB_RXWQE_SHORT) 449 ret = B_TRUE; 450 451 mutex_exit(&wp->wp_lock); 452 453 return (ret); 454 } 455 456 void 457 eib_rb_rsrc_setup_bufs(eib_t *ss, boolean_t force) 458 { 459 eib_rb_rsrc_setup_rxbufs(ss, force); 460 eib_rb_rsrc_setup_lsobufs(ss, force); 461 eib_rb_rsrc_setup_txbufs(ss, force); 462 } 463 464 static int 465 eib_rsrc_setup_txbufs(eib_t *ss, int *err) 466 { 467 eib_wqe_pool_t *tx; 468 eib_wqe_t *wqe; 469 ibt_ud_dest_hdl_t dest; 470 ibt_mr_attr_t attr; 471 ibt_mr_desc_t desc; 472 ibt_status_t ret; 473 kthread_t *kt; 474 uint32_t *encap_hdr; 475 uint8_t *buf; 476 uint_t mtu = ss->ei_props->ep_mtu; 477 uint_t tx_bufsz; 478 uint_t blk; 479 uint_t ndx; 480 uint_t i; 481 int lso_enabled; 482 483 /* 484 * Try to allocate and initialize the tx wqe pool 485 */ 486 if (ss->ei_tx != NULL) 487 return (EIB_E_SUCCESS); 488 489 /* 490 * If we keep the tx buffers as mtu-sized, then potentially every 491 * LSO request that cannot be satisfactorily mapped, will use up 492 * the 8K large (default size) lso buffers. This may be inadvisable 493 * given that lso buffers are a scarce resource. Instead, we'll 494 * slightly raise the size of the copy buffers in the send wqes 495 * (say to EIB_TX_COPY_THRESH) so that requests that cannot be 496 * mapped could still avoid using the 8K LSO buffers if they're 497 * less than the copy threshold size. 498 */ 499 lso_enabled = ss->ei_caps->cp_lso_maxlen && 500 ss->ei_caps->cp_cksum_flags && ss->ei_caps->cp_resv_lkey_capab; 501 tx_bufsz = ((lso_enabled) && (EIB_TX_COPY_THRESH > mtu)) ? 502 EIB_TX_COPY_THRESH : mtu; 503 504 eib_rsrc_init_wqe_pool(ss, &ss->ei_tx, tx_bufsz, EIB_WP_TYPE_TX); 505 tx = ss->ei_tx; 506 507 /* 508 * Register the TX memory region with IBTF for use 509 */ 510 attr.mr_vaddr = tx->wp_vaddr; 511 attr.mr_len = tx->wp_memsz; 512 attr.mr_as = NULL; 513 attr.mr_flags = IBT_MR_SLEEP; 514 515 ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, 516 &tx->wp_mr, &desc); 517 if (ret != IBT_SUCCESS) { 518 EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_txbufs: " 519 "ibt_register_mr() failed for tx " 520 "region (0x%llx, 0x%llx) with ret=%d", 521 attr.mr_vaddr, attr.mr_len, ret); 522 523 *err = EINVAL; 524 goto rsrc_setup_txbufs_fail; 525 } 526 tx->wp_lkey = desc.md_lkey; 527 528 /* 529 * Now setup the tx wqes 530 */ 531 buf = (uint8_t *)(uintptr_t)(tx->wp_vaddr); 532 for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) { 533 for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) { 534 wqe = &tx->wp_wqe[i]; 535 /* 536 * Allocate a UD destination handle 537 */ 538 ret = ibt_alloc_ud_dest(ss->ei_hca_hdl, 539 IBT_UD_DEST_NO_FLAGS, ss->ei_pd_hdl, &dest); 540 if (ret != IBT_SUCCESS) { 541 EIB_DPRINTF_ERR(ss->ei_instance, 542 "eib_rsrc_setup_txbufs: " 543 "ibt_alloc_ud_dest(hca_hdl=0x%llx) " 544 "failed, ret=%d", ss->ei_hca_hdl, ret); 545 546 *err = ENOMEM; 547 goto rsrc_setup_txbufs_fail; 548 } 549 550 /* 551 * These parameters should remain fixed throughout the 552 * lifetime of this wqe. 553 */ 554 wqe->qe_pool = tx; 555 wqe->qe_cpbuf = buf; 556 wqe->qe_bufsz = tx_bufsz; 557 558 /* 559 * The qe_dest and qe_payload_hdr are specific to tx 560 * only, but remain unchanged throughout the lifetime 561 * of the wqe. 562 * 563 * The payload header is normally used when we have an 564 * LSO packet to send. Since the EoIB encapsulation 565 * header won't be part of the message we get from the 566 * network layer, we'll need to copy the lso header into 567 * a new buffer every time before we hand over the LSO 568 * send request to the hca driver. 569 */ 570 wqe->qe_dest = dest; 571 wqe->qe_payload_hdr = 572 kmem_zalloc(EIB_MAX_PAYLOAD_HDR_SZ, KM_SLEEP); 573 574 /* 575 * The encapsulation header is at the start of the 576 * payload header and is initialized to the default 577 * encapsulation header we use (no multiple segments, 578 * no FCS). This part of the header is not expected 579 * to change. 580 */ 581 encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; 582 *encap_hdr = htonl(EIB_TX_ENCAP_HDR); 583 584 /* 585 * The parameter set below are used in tx and rx paths. 586 * These parameters (except ds_key) are reset to these 587 * default values in eib_rsrc_return_wqes(). 588 */ 589 wqe->qe_sgl.ds_key = tx->wp_lkey; 590 wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; 591 wqe->qe_sgl.ds_len = wqe->qe_bufsz; 592 wqe->qe_mp = NULL; 593 wqe->qe_info = 594 ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) | 595 ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) | 596 ((uint_t)EIB_WQE_TX << EIB_WQETYP_SHIFT); 597 598 /* 599 * These tx-specific parameters (except wr_id and 600 * wr_trans) are reset in eib_rsrc_grab_swqes() to make 601 * sure any freshly acquired swqe from the pool has 602 * these default settings for the caller. 603 */ 604 wqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)wqe; 605 wqe->qe_wr.send.wr_trans = IBT_UD_SRV; 606 wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; 607 wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; 608 wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; 609 wqe->qe_wr.send.wr_nds = 1; 610 wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; 611 wqe->qe_nxt_post = NULL; 612 wqe->qe_iov_hdl = NULL; 613 614 buf += wqe->qe_bufsz; 615 } 616 } 617 618 /* 619 * Before returning, create a kernel thread to monitor the status 620 * of wqes in the tx wqe pool. Note that this thread cannot be 621 * created from eib_state_init() during attach(), since the thread 622 * expects the wqe pool to be allocated and ready when it starts, 623 * and the tx bufs initialization only happens during eib_m_start(). 624 */ 625 kt = thread_create(NULL, 0, eib_monitor_tx_wqes, ss, 0, 626 &p0, TS_RUN, minclsyspri); 627 ss->ei_txwqe_monitor = kt->t_did; 628 629 return (EIB_E_SUCCESS); 630 631 rsrc_setup_txbufs_fail: 632 eib_rb_rsrc_setup_txbufs(ss, B_FALSE); 633 return (EIB_E_FAILURE); 634 } 635 636 static int 637 eib_rsrc_setup_rxbufs(eib_t *ss, int *err) 638 { 639 eib_wqe_pool_t *rx; 640 eib_wqe_t *wqe; 641 ibt_mr_attr_t attr; 642 ibt_mr_desc_t desc; 643 ibt_status_t ret; 644 uint8_t *buf; 645 uint_t mtu = ss->ei_props->ep_mtu; 646 uint_t blk; 647 uint_t ndx; 648 uint_t i; 649 650 /* 651 * Try to allocate and initialize the wqe pool. When this is called 652 * during a plumb via the mac m_start callback, we need to make 653 * sure there is a need to allocate a wqe pool afresh. If during a 654 * previous unplumb we didn't free the wqe pool because the nw layer 655 * was holding on to some rx buffers, we don't need to allocate new 656 * pool and set up the buffers again; we'll just start re-using the 657 * previous one. 658 */ 659 if (ss->ei_rx != NULL) 660 return (EIB_E_SUCCESS); 661 662 /* 663 * The receive buffer has to work for all channels, specifically the 664 * data qp of the vnics. This means that the buffer must be large 665 * enough to hold MTU sized IB payload (including the EoIB and ethernet 666 * headers) plus the GRH. In addition, because the ethernet header is 667 * either 14 or 18 bytes (tagless or vlan tagged), we should have the 668 * buffer filled in such a way that the IP header starts at atleast a 669 * 4-byte aligned address. In order to do this, we need to have some 670 * additional room. 671 */ 672 eib_rsrc_init_wqe_pool(ss, &ss->ei_rx, 673 mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM, EIB_WP_TYPE_RX); 674 rx = ss->ei_rx; 675 676 /* 677 * Register the RX memory region with IBTF for use 678 */ 679 attr.mr_vaddr = rx->wp_vaddr; 680 attr.mr_len = rx->wp_memsz; 681 attr.mr_as = NULL; 682 attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 683 684 ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, 685 &rx->wp_mr, &desc); 686 if (ret != IBT_SUCCESS) { 687 EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_rxbufs: " 688 "ibt_register_mr() failed for rx " 689 "region (0x%llx, 0x%llx) with ret=%d", 690 attr.mr_vaddr, attr.mr_len, ret); 691 692 *err = EINVAL; 693 goto rsrc_setup_rxbufs_fail; 694 } 695 rx->wp_lkey = desc.md_lkey; 696 697 /* 698 * Now setup the rx wqes 699 */ 700 buf = (uint8_t *)(uintptr_t)(rx->wp_vaddr); 701 for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) { 702 for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) { 703 wqe = &rx->wp_wqe[i]; 704 705 /* 706 * These parameters should remain fixed throughout the 707 * lifetime of this recv wqe. The qe_frp will only be 708 * used by the data channel of vnics and will remain 709 * unused by other channels. 710 */ 711 wqe->qe_pool = rx; 712 wqe->qe_cpbuf = buf; 713 wqe->qe_bufsz = mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM; 714 wqe->qe_wr.recv.wr_id = (ibt_wrid_t)(uintptr_t)wqe; 715 wqe->qe_wr.recv.wr_nds = 1; 716 wqe->qe_wr.recv.wr_sgl = &wqe->qe_sgl; 717 wqe->qe_frp.free_func = eib_data_rx_recycle; 718 wqe->qe_frp.free_arg = (caddr_t)wqe; 719 720 /* 721 * The parameter set below are used in tx and rx paths. 722 * These parameters (except ds_key) are reset to these 723 * default values in eib_rsrc_return_wqes(). 724 */ 725 wqe->qe_sgl.ds_key = rx->wp_lkey; 726 wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; 727 wqe->qe_sgl.ds_len = wqe->qe_bufsz; 728 wqe->qe_mp = NULL; 729 wqe->qe_info = 730 ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) | 731 ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) | 732 ((uint_t)EIB_WQE_RX << EIB_WQETYP_SHIFT); 733 734 /* 735 * These rx-specific parameters are also reset to 736 * these default values in eib_rsrc_return_wqes(). 737 */ 738 wqe->qe_chan = NULL; 739 wqe->qe_vnic_inst = -1; 740 741 buf += (mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM); 742 } 743 } 744 745 return (EIB_E_SUCCESS); 746 747 rsrc_setup_rxbufs_fail: 748 eib_rb_rsrc_setup_rxbufs(ss, B_FALSE); 749 return (EIB_E_FAILURE); 750 } 751 752 static int 753 eib_rsrc_setup_lsobufs(eib_t *ss, int *err) 754 { 755 eib_lsobkt_t *bkt; 756 eib_lsobuf_t *elem; 757 eib_lsobuf_t *tail; 758 ibt_mr_attr_t attr; 759 ibt_mr_desc_t desc; 760 kthread_t *kt; 761 762 uint8_t *lsomem; 763 uint8_t *memp; 764 ibt_status_t ret; 765 int i; 766 767 /* 768 * Allocate the lso bucket and space for buffers 769 */ 770 bkt = kmem_zalloc(sizeof (eib_lsobkt_t), KM_SLEEP); 771 lsomem = kmem_zalloc(eib_lso_num_bufs * EIB_LSO_BUFSZ, KM_SLEEP); 772 773 /* 774 * Register lso memory and save the lkey 775 */ 776 attr.mr_vaddr = (uint64_t)(uintptr_t)lsomem; 777 attr.mr_len = eib_lso_num_bufs * EIB_LSO_BUFSZ; 778 attr.mr_as = NULL; 779 attr.mr_flags = IBT_MR_SLEEP; 780 781 ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, 782 &bkt->bk_mr_hdl, &desc); 783 if (ret != IBT_SUCCESS) { 784 *err = EINVAL; 785 EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_lsobufs: " 786 "ibt_register_mr() failed for LSO " 787 "region (0x%llx, 0x%llx) with ret=%d", 788 attr.mr_vaddr, attr.mr_len, ret); 789 790 kmem_free(lsomem, eib_lso_num_bufs * EIB_LSO_BUFSZ); 791 kmem_free(bkt, sizeof (eib_lsobkt_t)); 792 793 return (EIB_E_FAILURE); 794 } 795 bkt->bk_lkey = desc.md_lkey; 796 797 /* 798 * Now allocate the buflist. Note that the elements in the buflist and 799 * the buffers in the lso memory have a permanent 1-1 relation, so we 800 * can always derive the address of a buflist entry from the address of 801 * an lso buffer. 802 */ 803 bkt->bk_bufl = kmem_zalloc(eib_lso_num_bufs * sizeof (eib_lsobuf_t), 804 KM_SLEEP); 805 806 /* 807 * Set up the lso buf chain 808 */ 809 memp = lsomem; 810 elem = bkt->bk_bufl; 811 for (i = 0; i < eib_lso_num_bufs; i++) { 812 elem->lb_isfree = 1; 813 elem->lb_buf = memp; 814 elem->lb_next = elem + 1; 815 816 tail = elem; 817 818 memp += EIB_LSO_BUFSZ; 819 elem++; 820 } 821 tail->lb_next = NULL; 822 823 /* 824 * Set up the LSO buffer information in eib state 825 */ 826 bkt->bk_free_head = bkt->bk_bufl; 827 bkt->bk_mem = lsomem; 828 bkt->bk_nelem = eib_lso_num_bufs; 829 bkt->bk_nfree = bkt->bk_nelem; 830 831 mutex_init(&bkt->bk_lock, NULL, MUTEX_DRIVER, NULL); 832 cv_init(&bkt->bk_cv, NULL, CV_DEFAULT, NULL); 833 834 ss->ei_lso = bkt; 835 836 /* 837 * Before returning, create a kernel thread to monitor the status 838 * of lso bufs 839 */ 840 kt = thread_create(NULL, 0, eib_monitor_lso_bufs, ss, 0, 841 &p0, TS_RUN, minclsyspri); 842 ss->ei_lsobufs_monitor = kt->t_did; 843 844 return (EIB_E_SUCCESS); 845 } 846 847 static void 848 eib_rsrc_init_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp, ib_memlen_t bufsz, 849 int wp_type) 850 { 851 eib_wqe_pool_t *wp; 852 uint_t wp_wqesz; 853 int i; 854 855 ASSERT(wpp != NULL); 856 ASSERT(*wpp == NULL); 857 858 /* 859 * Allocate the wqe pool, wqes and bufs 860 */ 861 wp = kmem_zalloc(sizeof (eib_wqe_pool_t), KM_SLEEP); 862 wp_wqesz = EIB_WQES_PER_POOL * sizeof (eib_wqe_t); 863 wp->wp_wqe = (eib_wqe_t *)kmem_zalloc(wp_wqesz, KM_SLEEP); 864 wp->wp_memsz = EIB_WQES_PER_POOL * bufsz; 865 wp->wp_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(wp->wp_memsz, 866 KM_SLEEP); 867 wp->wp_ss = ss; 868 wp->wp_type = wp_type; 869 wp->wp_nfree_lwm = (wp_type == EIB_WP_TYPE_TX) ? 870 EIB_NFREE_SWQES_LWM : EIB_NFREE_RWQES_LWM; 871 872 /* 873 * Initialize the lock and bitmaps: everything is available at first, 874 * but note that if the number of blocks per pool is less than 64, we 875 * need to initialize those extra bits as "unavailable" - these will 876 * remain unavailable throughout. 877 */ 878 mutex_init(&wp->wp_lock, NULL, MUTEX_DRIVER, NULL); 879 cv_init(&wp->wp_cv, NULL, CV_DEFAULT, NULL); 880 881 wp->wp_nfree = EIB_WQES_PER_POOL; 882 wp->wp_free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) : 883 (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1); 884 for (i = 0; i < EIB_BLKS_PER_POOL; i++) 885 wp->wp_free_wqes[i] = ~0; 886 887 *wpp = wp; 888 } 889 890 /*ARGSUSED*/ 891 static void 892 eib_rsrc_fini_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp) 893 { 894 eib_wqe_pool_t *wp; 895 896 ASSERT(wpp != NULL); 897 898 wp = *wpp; 899 ASSERT(*wpp != NULL); 900 901 cv_destroy(&wp->wp_cv); 902 mutex_destroy(&wp->wp_lock); 903 904 kmem_free((void *)(uintptr_t)(wp->wp_vaddr), wp->wp_memsz); 905 kmem_free(wp->wp_wqe, EIB_WQES_PER_POOL * sizeof (eib_wqe_t)); 906 kmem_free(wp, sizeof (eib_wqe_pool_t)); 907 908 *wpp = NULL; 909 } 910 911 /*ARGSUSED*/ 912 static boolean_t 913 eib_rsrc_ok_to_free_pool(eib_t *ss, eib_wqe_pool_t *wp, boolean_t force) 914 { 915 uint64_t free_blks; 916 int i; 917 918 /* 919 * See if we can release all memory allocated for buffers, wqes and 920 * the pool. Note that in the case of data channel rx buffers, some 921 * of the buffers may not be free if the nw layer is holding on to 922 * them still. If this is the case, we cannot free the wqe pool now 923 * or a subsequent access by the nw layer to the buffers will cause 924 * a panic. 925 */ 926 ASSERT(wp != NULL); 927 928 /* 929 * If force-free flag is set, we can always release the memory. 930 * Note that this flag is unused currently, and should be removed. 931 */ 932 if (force == B_TRUE) 933 return (B_TRUE); 934 935 mutex_enter(&wp->wp_lock); 936 937 /* 938 * If a whole block remains allocated, obviously we cannot free 939 * the pool 940 */ 941 free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) : 942 (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1); 943 if (wp->wp_free_blks != free_blks) { 944 mutex_exit(&wp->wp_lock); 945 return (B_FALSE); 946 } 947 948 /* 949 * If even a single wqe within any one block remains in-use, we 950 * cannot free the pool 951 */ 952 for (i = 0; i < EIB_BLKS_PER_POOL; i++) { 953 if (wp->wp_free_wqes[i] != (~0)) { 954 mutex_exit(&wp->wp_lock); 955 return (B_FALSE); 956 } 957 } 958 959 mutex_exit(&wp->wp_lock); 960 961 return (B_TRUE); 962 } 963 964 /*ARGSUSED*/ 965 static int 966 eib_rsrc_grab_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes, 967 uint_t n_req, uint_t *actual, int pri) 968 { 969 uint_t n_allocd = 0; 970 int blk; 971 int ndx; 972 int wqe_ndx; 973 974 ASSERT(wp != NULL); 975 ASSERT(wqes != NULL); 976 977 mutex_enter(&wp->wp_lock); 978 979 /* 980 * If this is a low priority request, adjust the number requested 981 * so we don't allocate beyond the low-water-mark 982 */ 983 if (pri == EIB_WPRI_LO) { 984 if (wp->wp_nfree <= wp->wp_nfree_lwm) 985 n_req = 0; 986 else if ((wp->wp_nfree - n_req) < wp->wp_nfree_lwm) 987 n_req = wp->wp_nfree - wp->wp_nfree_lwm; 988 } 989 990 for (n_allocd = 0; n_allocd < n_req; n_allocd++) { 991 /* 992 * If the entire pool is unavailable, quit 993 */ 994 if (wp->wp_free_blks == 0) 995 break; 996 997 /* 998 * Find the first wqe that's available 999 */ 1000 blk = EIB_FIND_LSB_SET(wp->wp_free_blks); 1001 ASSERT(blk != -1); 1002 ndx = EIB_FIND_LSB_SET(wp->wp_free_wqes[blk]); 1003 ASSERT(ndx != -1); 1004 1005 /* 1006 * Mark the wqe as allocated 1007 */ 1008 wp->wp_free_wqes[blk] &= (~((uint64_t)1 << ndx)); 1009 1010 /* 1011 * If this was the last free wqe in this block, mark 1012 * the block itself as unavailable 1013 */ 1014 if (wp->wp_free_wqes[blk] == 0) 1015 wp->wp_free_blks &= (~((uint64_t)1 << blk)); 1016 1017 /* 1018 * Return this wqe to the caller 1019 */ 1020 wqe_ndx = blk * EIB_WQES_PER_BLK + ndx; 1021 wqes[n_allocd] = &(wp->wp_wqe[wqe_ndx]); 1022 } 1023 1024 wp->wp_nfree -= n_allocd; 1025 1026 mutex_exit(&wp->wp_lock); 1027 1028 if (n_allocd == 0) 1029 return (EIB_E_FAILURE); 1030 1031 if (actual) { 1032 *actual = n_allocd; 1033 } 1034 1035 return (EIB_E_SUCCESS); 1036 } 1037 1038 /*ARGSUSED*/ 1039 static void 1040 eib_rsrc_return_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes, 1041 uint_t n_wqes) 1042 { 1043 eib_wqe_t *wqe; 1044 uint_t n_freed = 0; 1045 uint_t blk; 1046 uint_t ndx; 1047 1048 ASSERT(wp != NULL); 1049 ASSERT(wqes != NULL); 1050 1051 mutex_enter(&wp->wp_lock); 1052 for (n_freed = 0; n_freed < n_wqes; n_freed++) { 1053 wqe = wqes[n_freed]; 1054 1055 /* 1056 * This wqe is being returned back to the pool, so clear 1057 * any wqe flags and reset buffer address and size in the 1058 * single segment sgl back to what they were initially. 1059 * Also erase any mblk pointer and callback function ptrs. 1060 */ 1061 wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)wqe->qe_cpbuf; 1062 wqe->qe_sgl.ds_len = wqe->qe_bufsz; 1063 wqe->qe_mp = NULL; 1064 wqe->qe_chan = NULL; 1065 wqe->qe_vnic_inst = -1; 1066 wqe->qe_info &= (~EIB_WQEFLGS_MASK); 1067 1068 /* 1069 * Mark the wqe free in its block 1070 */ 1071 blk = EIB_WQE_BLK(wqe->qe_info); 1072 ndx = EIB_WQE_NDX(wqe->qe_info); 1073 1074 wp->wp_free_wqes[blk] |= ((uint64_t)1 << ndx); 1075 1076 /* 1077 * This block now has atleast one wqe free, so mark 1078 * the block itself as available and move on to the 1079 * next wqe to free 1080 */ 1081 wp->wp_free_blks |= ((uint64_t)1 << blk); 1082 } 1083 1084 wp->wp_nfree += n_freed; 1085 1086 /* 1087 * If the number of available wqes in the pool has just crossed 1088 * the high-water-mark, wakeup anyone who may be sleeping on it. 1089 */ 1090 if ((wp->wp_type == EIB_WP_TYPE_TX) && 1091 ((wp->wp_nfree - n_freed) < EIB_NFREE_SWQES_HWM) && 1092 (wp->wp_nfree >= EIB_NFREE_SWQES_HWM)) { 1093 cv_broadcast(&wp->wp_cv); 1094 } 1095 1096 mutex_exit(&wp->wp_lock); 1097 } 1098 1099 static void 1100 eib_rb_rsrc_setup_txbufs(eib_t *ss, boolean_t force) 1101 { 1102 eib_wqe_pool_t *wp = ss->ei_tx; 1103 eib_wqe_t *wqe; 1104 ibt_ud_dest_hdl_t dest; 1105 ibt_status_t ret; 1106 uint8_t *plhdr; 1107 int i; 1108 1109 if (wp == NULL) 1110 return; 1111 1112 /* 1113 * Check if it's ok to free the tx wqe pool (i.e. all buffers have 1114 * been reclaimed) and if so, stop the txwqe monitor thread (and wait 1115 * for it to die), release the UD destination handles, deregister 1116 * memory and fini the wqe pool. 1117 */ 1118 if (eib_rsrc_ok_to_free_pool(ss, wp, force)) { 1119 eib_stop_monitor_tx_wqes(ss); 1120 1121 for (i = 0; i < EIB_WQES_PER_POOL; i++) { 1122 wqe = &wp->wp_wqe[i]; 1123 if ((plhdr = wqe->qe_payload_hdr) != NULL) { 1124 kmem_free(plhdr, EIB_MAX_PAYLOAD_HDR_SZ); 1125 } 1126 if ((dest = wqe->qe_dest) != NULL) { 1127 ret = ibt_free_ud_dest(dest); 1128 if (ret != IBT_SUCCESS) { 1129 EIB_DPRINTF_WARN(ss->ei_instance, 1130 "eib_rb_rsrc_setup_txbufs: " 1131 "ibt_free_ud_dest() failed, ret=%d", 1132 ret); 1133 } 1134 } 1135 } 1136 if (wp->wp_mr) { 1137 if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, 1138 wp->wp_mr)) != IBT_SUCCESS) { 1139 EIB_DPRINTF_WARN(ss->ei_instance, 1140 "eib_rb_rsrc_setup_txbufs: " 1141 "ibt_deregister_mr() failed, ret=%d", ret); 1142 } 1143 wp->wp_mr = NULL; 1144 } 1145 eib_rsrc_fini_wqe_pool(ss, &ss->ei_tx); 1146 } 1147 } 1148 1149 void 1150 eib_rb_rsrc_setup_rxbufs(eib_t *ss, boolean_t force) 1151 { 1152 eib_wqe_pool_t *rx = ss->ei_rx; 1153 ibt_status_t ret; 1154 1155 if (rx == NULL) 1156 return; 1157 1158 /* 1159 * Check if it's ok to free the rx wqe pool (i.e. all buffers have 1160 * been reclaimed) and if so, deregister memory and fini the wqe pool. 1161 */ 1162 if (eib_rsrc_ok_to_free_pool(ss, rx, force)) { 1163 if (rx->wp_mr) { 1164 if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, 1165 rx->wp_mr)) != IBT_SUCCESS) { 1166 EIB_DPRINTF_WARN(ss->ei_instance, 1167 "eib_rb_rsrc_setup_rxbufs: " 1168 "ibt_deregister_mr() failed, ret=%d", ret); 1169 } 1170 rx->wp_mr = NULL; 1171 } 1172 1173 eib_rsrc_fini_wqe_pool(ss, &ss->ei_rx); 1174 } 1175 } 1176 1177 static void 1178 eib_rb_rsrc_setup_lsobufs(eib_t *ss, boolean_t force) 1179 { 1180 eib_lsobkt_t *bkt; 1181 ibt_status_t ret; 1182 1183 /* 1184 * Remove the lso bucket from the state 1185 */ 1186 if ((bkt = ss->ei_lso) == NULL) 1187 return; 1188 1189 /* 1190 * Try to stop the lso bufs monitor thread. If we fail, we simply 1191 * return. We'll have another shot at it later from detach() with 1192 * the force flag set. 1193 */ 1194 if (eib_stop_monitor_lso_bufs(ss, force) != EIB_E_SUCCESS) 1195 return; 1196 1197 /* 1198 * Free the buflist 1199 */ 1200 if (bkt->bk_bufl) { 1201 kmem_free(bkt->bk_bufl, bkt->bk_nelem * sizeof (eib_lsobuf_t)); 1202 bkt->bk_bufl = NULL; 1203 } 1204 1205 /* 1206 * Deregister LSO memory and free it 1207 */ 1208 if (bkt->bk_mr_hdl) { 1209 if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, 1210 bkt->bk_mr_hdl)) != IBT_SUCCESS) { 1211 EIB_DPRINTF_WARN(ss->ei_instance, 1212 "eib_rb_rsrc_setup_lsobufs: " 1213 "ibt_deregister_mr() failed, ret=%d", ret); 1214 } 1215 bkt->bk_mr_hdl = NULL; 1216 } 1217 if (bkt->bk_mem) { 1218 kmem_free(bkt->bk_mem, bkt->bk_nelem * EIB_LSO_BUFSZ); 1219 bkt->bk_mem = NULL; 1220 } 1221 1222 /* 1223 * Destroy the mutex and condvar 1224 */ 1225 cv_destroy(&bkt->bk_cv); 1226 mutex_destroy(&bkt->bk_lock); 1227 1228 /* 1229 * Finally, free the lso bucket itself 1230 */ 1231 kmem_free(bkt, sizeof (eib_lsobkt_t)); 1232 ss->ei_lso = NULL; 1233 } 1234