/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/pattr.h>		/* HCK_* */
#include <inet/ip.h>		/* ipha_t */
#include <inet/tcp.h>		/* tcph_t */
#include <sys/mac_provider.h>	/* mac_* */
#include <sys/strsun.h>		/* MBLKL */

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_data_setup_cqs(eib_t *, eib_vnic_t *);
static int eib_data_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_data_setup_lso(eib_wqe_t *, mblk_t *, uint32_t,
    eib_ether_hdr_t *);
static int eib_data_prepare_sgl(eib_vnic_t *, eib_wqe_t *, mblk_t *);
static int eib_data_is_mcast_pkt_ok(eib_vnic_t *, uint8_t *, uint64_t *,
    uint64_t *);
static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
static mblk_t *eib_data_rx_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_data_tx_comp(eib_vnic_t *, eib_wqe_t *, eib_chan_t *);
static void eib_data_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_data_setup_cqs(eib_t *, eib_vnic_t *);
static void eib_rb_data_setup_ud_channel(eib_t *, eib_vnic_t *);


int
eib_data_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate an eib_chan_t to store information about this vnic's data
	 * qp and initialize it with the default admin qp pkey parameters.
	 * We'll re-associate this with the pkey we receive from the gw once
	 * we receive the login ack.
	 */
	vnic->vn_data_chan = eib_chan_init();

	chan = vnic->vn_data_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup tx/rx CQs and completion handlers
	 */
	if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_cqs(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_ud_channel(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

data_create_qp_fail:
	eib_rb_data_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}

/*ARGSUSED*/
uint_t
eib_data_rx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	mblk_t *mp;
	mblk_t *head = NULL;
	mblk_t *tail = NULL;
	ibt_status_t ret;
	uint_t pkts_per_call = 0;
	uint_t polled;
	uint_t rbytes;
	uint_t ipkts;
	uint_t num_wc;
	int i;

	/*
	 * Re-arm the rx notification callback before we start polling
	 * the completion queue.  There's not much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * We don't want to be stuck in receive processing for too long
	 * without giving others a chance.
	 */
	num_wc = (chan->ch_rcv_cq_sz < EIB_MAX_RX_PKTS_ONINTR) ?
	    chan->ch_rcv_cq_sz : EIB_MAX_RX_PKTS_ONINTR;

	/*
	 * Handle rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_rcv_cq_hdl, chan->ch_rcv_wc,
	    num_wc, &polled)) == IBT_SUCCESS) {

		rbytes = ipkts = 0;
		head = tail = NULL;

		for (wc = chan->ch_rcv_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX);

			/*
			 * Clear the posted-to-hca flag and decrement the
			 * posted-rwqes count
			 */
			wqe->qe_info &= (~EIB_WQE_FLG_POSTED_TO_HCA);
			eib_rsrc_decr_posted_rwqe(ss, chan);

			rbytes += wc->wc_bytes_xfer;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_ierrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				ipkts++;
				mp = eib_data_rx_comp(vnic, wqe, wc);
				if (mp == NULL) {
					continue;
				} else {
					/*
					 * Add this mp to the list to
					 * send it to the nw layer.  Note
					 * that the wqe could've been
					 * returned to the pool if we're
					 * running low, so don't touch
					 * the wqe after this point.
					 */
					if (head)
						tail->b_next = mp;
					else
						head = mp;
					tail = mp;
				}
			}
		}

		/*
		 * We reduce the number of atomic updates to key statistics
		 * by pooling them here, once per ibt_poll_cq().
		 * The accuracy and consistency of the published statistics
		 * within a cq polling cycle will be compromised a little bit,
		 * but that should be ok, given that we probably gain a little
		 * bit by not having to do these atomic operations per packet.
		 */
		EIB_UPDATE_COUNTER(&stats->st_rbytes, rbytes);
		EIB_UPDATE_COUNTER(&stats->st_ipkts, ipkts);

		pkts_per_call += ipkts;

		if (head) {
			mac_rx(ss->ei_mac_hdl, NULL, head);
		}

		/*
		 * If we have processed too many packets in one attempt, we'll
		 * have to come back here later.
		 */
		if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
			(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl,
			    NULL);
			break;
		}

		num_wc -= polled;
	}

	return (DDI_INTR_CLAIMED);
}

/*ARGSUSED*/
uint_t
eib_data_tx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the tx notification callback before we start polling
	 * the completion queue.  There's not much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_tx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_TX);

			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_oerrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}

void
eib_data_rx_recycle(caddr_t arg)
{
	eib_wqe_t *rwqe = (eib_wqe_t *)(void *)arg;
	eib_t *ss = rwqe->qe_pool->wp_ss;
	eib_chan_t *vn_chan;
	uint_t nic_state;
	int ret;

	/*
	 * We come here from three places - (a) from the nw layer when it is
	 * done with the rx mblk we handed to it and calls freemsg(), (b)
	 * from eib_data_rx_comp() if the rx completion processing discovers
	 * that the received EoIB packet has a problem and (c) from
	 * eib_data_err_comp() if we're tearing down this channel.  We only
	 * need to repost the rwqe if we're being called back from the nw
	 * layer.  For the other two cases, we'll simply return the rwqe to
	 * the pool.  Also, since we would've already updated the
	 * ch_rx_posted counters in the rx completion handler, we don't pass
	 * the chan pointer to eib_rsrc_return_rwqe from within this routine.
	 */
	rwqe->qe_mp = NULL;
	if ((rwqe->qe_info & EIB_WQE_FLG_WITH_NW) == 0) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	rwqe->qe_info &= (~EIB_WQE_FLG_WITH_NW);

	/*
	 * If the buffers are being returned by the nw layer after a long
	 * time, this eoib instance could've even been stopped by now.
	 * If so, simply return the rwqe to the pool.
	 */
	nic_state = eib_mac_get_nic_state(ss);
	if ((nic_state & EIB_NIC_STARTED) != EIB_NIC_STARTED) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Or it could've taken even longer, and the nic could even have been
	 * restarted.  The only thing we can do is to make sure that the
	 * original channel pointer we passed corresponds to what's in the
	 * instance of the vnic currently.
	 */
	vn_chan = eib_vnic_get_data_chan(ss, rwqe->qe_vnic_inst);
	if (vn_chan == NULL || vn_chan != rwqe->qe_chan) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Try to repost the rwqe if we're not tearing down this channel
	 */
	if (vn_chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	} else {
		ret = eib_chan_post_recv(ss, vn_chan, rwqe);
		if (ret != EIB_E_SUCCESS) {
			if (rwqe->qe_mp)
				freemsg(rwqe->qe_mp);
			else
				eib_rsrc_return_rwqe(ss, rwqe, NULL);
		}
	}
}

void
eib_data_post_tx(eib_vnic_t *vnic, eib_wqe_t *swqe)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_send_wr_t wrs[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *wqes[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *elem;
	ibt_status_t ret;
	uint_t n_wrs;
	uint_t n_posted;
	uint_t total_failed = 0;
	uint_t n_failed = 0;
	uint_t i;

	/*
	 * See if we have room for this wqe and then add it to the
	 * list of tx wrs to post in this channel.
	 */
	mutex_enter(&chan->ch_tx_lock);

	if ((chan->ch_tx_posted + 1) >= (chan->ch_max_swqes - 1)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
		    "too many swqes posted already, posted=0x%lx, "
		    "max=0x%lx", chan->ch_tx_posted, chan->ch_max_swqes);
		mutex_exit(&chan->ch_tx_lock);
		return;
	}

	swqe->qe_nxt_post = NULL;
	if (chan->ch_tx) {
		chan->ch_tx_tail->qe_nxt_post = swqe;
	} else {
		chan->ch_tx = swqe;
	}
	chan->ch_tx_tail = swqe;
	chan->ch_tx_posted++;		/* pre-increment */

	/*
	 * If someone's already posting tx wqes in this channel, let
	 * them post ours as well.
	 */
	if (chan->ch_tx_busy == B_TRUE) {
		mutex_exit(&chan->ch_tx_lock);
		return;
	}
	chan->ch_tx_busy = B_TRUE;

	while (chan->ch_tx) {
		/*
		 * Post EIB_MAX_POST_MULTIPLE wrs at a time
		 */
		for (n_wrs = 0, elem = chan->ch_tx;
		    (elem) && (n_wrs < EIB_MAX_POST_MULTIPLE);
		    elem = elem->qe_nxt_post, n_wrs++) {
			wqes[n_wrs] = elem;
			wrs[n_wrs] = (elem->qe_wr).send;
		}
		chan->ch_tx = elem;
		if (elem == NULL) {
			chan->ch_tx_tail = NULL;
		}
		mutex_exit(&chan->ch_tx_lock);

		ASSERT(n_wrs != 0);

		/*
		 * If posting multiple wrs fails for some reason, we'll try
		 * posting the unposted ones one by one.  If even that fails,
		 * we'll release any mappings/buffers/mblks associated with
		 * this wqe and return it to the pool.
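		 * (eib_data_tx_comp() takes care of releasing those
		 * resources before returning the swqe to the pool.)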
		 */
		n_posted = n_failed = 0;
		ret = ibt_post_send(chan->ch_chan, wrs, n_wrs, &n_posted);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
			    "ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
			    "failed, ret=%d", n_wrs, n_posted, ret);

			for (i = n_posted; i < n_wrs; i++) {
				ret = ibt_post_send(chan->ch_chan, &wrs[i],
				    1, NULL);
				if (ret != IBT_SUCCESS) {
					n_failed++;
					eib_data_tx_comp(vnic, wqes[i], chan);

					EIB_DPRINTF_WARN(ss->ei_instance,
					    "eib_data_post_tx: "
					    "ibt_post_send(n_wrs=1) failed, "
					    "ret=%d", ret);
				}
			}
		}
		total_failed += n_failed;

		mutex_enter(&chan->ch_tx_lock);
	}

	chan->ch_tx_busy = B_FALSE;
	mutex_exit(&chan->ch_tx_lock);

	/*
	 * If we failed to post something, update error stats
	 */
	if (total_failed) {
		EIB_UPDATE_COUNTER(&stats->st_oerrors, total_failed);
	}
}

void
eib_data_parse_ether_hdr(mblk_t *mp, eib_ether_hdr_t *evh)
{
	struct ether_vlan_header *vl_hdr;
	struct ether_header *hdr;

	/*
	 * Assume that the ether header (with or without vlan tag) is
	 * contained in one fragment
	 */
	hdr = (struct ether_header *)(void *)mp->b_rptr;
	vl_hdr = (struct ether_vlan_header *)(void *)mp->b_rptr;

	evh->eh_ether_type = ntohs(hdr->ether_type);
	if (evh->eh_ether_type != ETHERTYPE_VLAN) {
		evh->eh_tagless = 1;
		evh->eh_vlan = 0;
		ether_copy((void *)hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	} else {
		evh->eh_ether_type = ntohs(vl_hdr->ether_type);
		evh->eh_tagless = 0;
		evh->eh_vlan = VLAN_ID(ntohs(vl_hdr->ether_tci));
		ether_copy((void *)vl_hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)vl_hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	}
}

int
eib_data_lookup_vnic(eib_t *ss, uint8_t *mac, uint16_t vlan, eib_vnic_t **vnicp,
    boolean_t *failed)
{
	eib_vnic_t *vnic;
	eib_vnic_req_t *vrq;
	uint8_t *vn_mac;
	uint16_t vn_vlan;
	uint64_t av;
	int inst = 0;

	if (mac == NULL)
		return (EIB_E_FAILURE);

	/*
	 * For now, a simple search (but only what we've allocated).  Note
	 * that if we're in the process of creating a vnic, the instance
	 * might've been allocated, but the vnic entry would be NULL.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			vn_mac = vnic->vn_login_data.ld_assigned_mac;
			vn_vlan = vnic->vn_login_data.ld_assigned_vlan;

			if ((vn_vlan == vlan) &&
			    (bcmp(vn_mac, mac, ETHERADDRL) == 0)) {
				if (vnicp) {
					*vnicp = vnic;
				}
				mutex_exit(&ss->ei_vnic_lock);
				return (EIB_E_SUCCESS);
			}
		}

		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * If we haven't been able to locate a vnic for this {mac,vlan} tuple,
	 * see if we've already failed a creation request for this vnic, and
	 * return that information.
	 */
	if (failed) {
		mutex_enter(&ss->ei_vnic_req_lock);
		*failed = B_FALSE;
		for (vrq = ss->ei_failed_vnic_req; vrq; vrq = vrq->vr_next) {
			if ((vrq->vr_vlan == vlan) &&
			    (bcmp(vrq->vr_mac, mac, ETHERADDRL) == 0)) {
				*failed = B_TRUE;
			}
		}
		mutex_exit(&ss->ei_vnic_req_lock);
	}

	return (EIB_E_FAILURE);
}

int
eib_data_prepare_frame(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp,
    eib_ether_hdr_t *evh)
{
	uint32_t mss;
	uint32_t lsoflags;
	uint32_t hckflags;

	/*
	 * The swqe defaults are set to use the regular ud work request
	 * member and the IBT_WRC_SEND opcode, so we don't need to do
	 * anything here if this isn't an LSO packet.
	 */
	mac_lso_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) == HW_LSO)
		eib_data_setup_lso(swqe, mp, mss, evh);

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) {
		swqe->qe_wr.send.wr_flags |= IBT_WR_SEND_CKSUM;
	} else {
		swqe->qe_wr.send.wr_flags &= (~IBT_WR_SEND_CKSUM);
	}

	if (eib_data_prepare_sgl(vnic, swqe, mp) != 0)
		return (EIB_E_FAILURE);

	swqe->qe_mp = mp;

	return (EIB_E_SUCCESS);
}

void
eib_rb_data_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_data_setup_ud_channel(ss, vnic);

	eib_rb_data_setup_cqs(ss, vnic);

	eib_chan_fini(vnic->vn_data_chan);
	vnic->vn_data_chan = NULL;
}

static int
eib_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t snd_sz;
	uint_t rcv_sz;
	int rv;

	/*
	 * Allocate send completion queue.  Note that we've already verified
	 * that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
	 * of the hca.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_swqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &snd_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_cq_hdl, EIB_TX_COMP_COUNT,
	    EIB_TX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_TX_COMP_COUNT, EIB_TX_COMP_USEC, ret);
	}

	/*
	 * Allocate receive completion queue
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_rwqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_rcv_cq_hdl,
	    &rcv_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_rcv_cq_hdl, EIB_RX_COMP_COUNT,
	    EIB_RX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_RX_COMP_COUNT, EIB_RX_COMP_USEC, ret);
	}

	/*
	 * Set up parameters for collecting tx and rx completion information
	 */
	chan->ch_cq_sz = snd_sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * snd_sz, KM_SLEEP);
	chan->ch_rcv_cq_sz = rcv_sz;
	chan->ch_rcv_wc = kmem_zalloc(sizeof (ibt_wc_t) * rcv_sz, KM_SLEEP);

	/*
	 * Set up the vnic's data tx completion queue handler and allocate
	 * a softint for it as well.
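	 * The cq handler simply triggers this softint, which in turn runs
	 * eib_data_tx_comp_handler() to do the actual completion processing.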
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_tx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_tx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_data_tx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	/*
	 * And then the data rx completion queue handler
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_rx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_rx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_rcv_cq_hdl, eib_data_rx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	return (EIB_E_SUCCESS);

setup_data_cqs_fail:
	eib_rb_data_setup_cqs(ss, vnic);
	return (EIB_E_FAILURE);
}

static int
eib_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	if (ss->ei_caps->cp_resv_lkey_capab)
		alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
	if (ss->ei_caps->cp_lso_maxlen)
		alloc_attr.ud_flags |= IBT_USES_LSO;

	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
	alloc_attr.ud_pkey_ix = chan->ch_pkey_ix;
	alloc_attr.ud_sizes.cs_sq = ss->ei_caps->cp_max_swqe;
	alloc_attr.ud_sizes.cs_rq = ss->ei_caps->cp_max_rwqe;
	alloc_attr.ud_sizes.cs_sq_sgl = ss->ei_caps->cp_max_sgl;
	alloc_attr.ud_sizes.cs_rq_sgl = 1;
	alloc_attr.ud_sizes.cs_inline = 0;

	alloc_attr.ud_qkey = EIB_DATA_QKEY;
	alloc_attr.ud_scq = chan->ch_cq_hdl;
	alloc_attr.ud_rcq = chan->ch_rcv_cq_hdl;
	alloc_attr.ud_pd = ss->ei_pd_hdl;

	ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS,
	    &alloc_attr, &chan->ch_chan, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
		    "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, "
		    "cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d",
		    alloc_attr.ud_hca_port_num, chan->ch_pkey_ix,
		    alloc_attr.ud_sizes.cs_sq, alloc_attr.ud_sizes.cs_rq,
		    alloc_attr.ud_sizes.cs_sq_sgl, ret);

		goto setup_data_ud_channel_fail;
	}

	ret = ibt_query_ud_channel(chan->ch_chan, &query_attr);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
		    "ibt_query_ud_channel() failed, ret=%d", ret);
		goto setup_data_ud_channel_fail;
	}

	chan->ch_qpn = query_attr.ud_qpn;
	chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq;
	chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq;
	chan->ch_lwm_rwqes =
	    chan->ch_max_rwqes >> 2;
	chan->ch_rwqe_bktsz = (chan->ch_max_rwqes < EIB_DATA_RWQE_BKT) ?
	    chan->ch_max_rwqes : EIB_DATA_RWQE_BKT;
	chan->ch_ip_hdr_align = EIB_IP_HDR_ALIGN;
	chan->ch_alloc_mp = B_TRUE;
	chan->ch_tear_down = B_FALSE;

	return (EIB_E_SUCCESS);

setup_data_ud_channel_fail:
	eib_rb_data_setup_ud_channel(ss, vnic);
	return (EIB_E_FAILURE);
}

static void
eib_data_setup_lso(eib_wqe_t *swqe, mblk_t *mp, uint32_t mss,
    eib_ether_hdr_t *evh)
{
	ibt_wr_lso_t *lso;
	mblk_t *nmp;
	uint8_t *dst;
	uintptr_t ip_start;
	uintptr_t tcp_start;
	uint_t pending;
	uint_t mblen;
	uint_t eth_hdr_len;
	uint_t ip_hdr_len;
	uint_t tcp_hdr_len;

	/*
	 * When the swqe was grabbed, it would've had its wr_opcode and
	 * wr.ud.udwr_dest set to default values.  Since we're now going
	 * to use LSO, we need to change these.
	 */
	swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO;
	lso = &(swqe->qe_wr.send.wr.ud_lso);
	lso->lso_ud_dest = swqe->qe_dest;
	lso->lso_mss = mss;

	/*
	 * The details of the ethernet header in the mp are already known
	 * to us
	 */
	eth_hdr_len = (evh->eh_tagless) ? (sizeof (struct ether_header)) :
	    (sizeof (struct ether_vlan_header));

	/*
	 * Calculate the LSO header size and set it in the UD LSO structure.
	 * Note that the only assumption we make is that each of the Ethernet,
	 * IP and TCP headers will be contained in a single mblk fragment;
	 * together, the headers may span multiple mblk fragments.  Note also
	 * that since the EoIB encapsulation header is not part of the message
	 * block we receive, we'll need to account for the space to insert
	 * it later.
	 */
	nmp = mp;
	ip_start = (uintptr_t)(nmp->b_rptr) + eth_hdr_len;
	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
		ip_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (ip_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)ip_start);

	tcp_start = ip_start + ip_hdr_len;
	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	tcp_hdr_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);

	/*
	 * Since the passed mp fragment never contains the EoIB encapsulation
	 * header, we always have to copy the lso header.  Sigh.
	 */
	lso->lso_hdr = swqe->qe_payload_hdr;
	lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_len +
	    ip_hdr_len + tcp_hdr_len;

	/*
	 * We already have the EoIB encapsulation header written at the
	 * start of swqe->qe_payload_hdr during swqe acquisition.  Only
	 * copy the remaining headers.
	 */
	dst = lso->lso_hdr + EIB_ENCAP_HDR_SZ;
	pending = lso->lso_hdr_sz - EIB_ENCAP_HDR_SZ;

	for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
		mblen = MBLKL(nmp);
		if (pending > mblen) {
			bcopy(nmp->b_rptr, dst, mblen);
			dst += mblen;
			pending -= mblen;
		} else {
			bcopy(nmp->b_rptr, dst, pending);
			break;
		}
	}
}

static int
eib_data_prepare_sgl(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp)
{
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_iov_t iov_arr[EIB_MAX_SGL];
	ibt_iov_attr_t iov_attr;
	ibt_wr_ds_t *sgl;
	ibt_status_t ret;
	mblk_t *nmp;
	mblk_t *data_mp;
	uchar_t *bufp;
	size_t blksize;
	size_t skip;
	size_t avail;
	uint_t lsohdr_sz;
	uint_t pktsz;
	ptrdiff_t frag_len;
	uint_t pending_hdr;
	uint_t nblks;
	uint_t i;

	/*
	 * Let's skip ahead to the TCP data if this is LSO.  Note that while
	 * the lso header size in the swqe includes the EoIB encapsulation
	 * header size, that encapsulation header itself won't be found in
	 * the mblk.
	 */
	lsohdr_sz = (swqe->qe_wr.send.wr_opcode == IBT_WRC_SEND) ? 0 :
	    swqe->qe_wr.send.wr.ud_lso.lso_hdr_sz;

	data_mp = mp;
	pending_hdr = 0;
	if (lsohdr_sz) {
		pending_hdr = lsohdr_sz - EIB_ENCAP_HDR_SZ;
		for (nmp = mp; nmp; nmp = nmp->b_cont) {
			frag_len =
			    (uintptr_t)nmp->b_wptr - (uintptr_t)nmp->b_rptr;
			if (frag_len > pending_hdr)
				break;
			pending_hdr -= frag_len;
		}
		data_mp = nmp;	/* start of data past lso header */
		ASSERT(data_mp != NULL);
	}

	/*
	 * If this is an LSO packet, we want pktsz to hold the size of the
	 * data following the eoib/ethernet/tcp/ip headers.  If this is a
	 * non-LSO packet, we want pktsz to refer to the size of the entire
	 * packet with all the headers, and nblks to hold the number of
	 * mappings we'll need to iov map this (for reserved lkey request).
	 */
	if (lsohdr_sz == 0) {
		nblks = 1;
		pktsz = EIB_ENCAP_HDR_SZ;
	} else {
		nblks = 0;
		pktsz = 0;
	}
	for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
		pktsz += MBLKL(nmp);
		nblks++;
	}
	pktsz -= pending_hdr;

	EIB_UPDATE_COUNTER(&stats->st_obytes, pktsz);
	EIB_INCR_COUNTER(&stats->st_opkts);

	/*
	 * We only do ibt_map_mem_iov() if the pktsz is above the tx copy
	 * threshold and if the number of mp fragments is less than the
	 * maximum acceptable.
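	 * Otherwise, we fall through to one of the copy paths below.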
	 */
	if ((ss->ei_caps->cp_resv_lkey_capab) && (pktsz > EIB_TX_COPY_THRESH) &&
	    (nblks < ss->ei_caps->cp_hiwm_sgl)) {

		iov_attr.iov_as = NULL;
		iov_attr.iov = iov_arr;
		iov_attr.iov_buf = NULL;
		iov_attr.iov_list_len = nblks;
		iov_attr.iov_wr_nds = ss->ei_caps->cp_max_sgl;
		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
		iov_attr.iov_flags = IBT_IOV_SLEEP;

		i = 0;
		if (lsohdr_sz == 0) {
			iov_arr[i].iov_addr = (caddr_t)swqe->qe_payload_hdr;
			iov_arr[i].iov_len = EIB_ENCAP_HDR_SZ;
			i++;
		}
		for (nmp = data_mp; i < nblks; i++, nmp = nmp->b_cont) {
			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
			iov_arr[i].iov_len = MBLKL(nmp);
			if (nmp == data_mp) {
				iov_arr[i].iov_addr += pending_hdr;
				iov_arr[i].iov_len -= pending_hdr;
			}
		}
		swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_MAPPED;
		swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;

		ret = ibt_map_mem_iov(ss->ei_hca_hdl, &iov_attr,
		    &swqe->qe_wr, &swqe->qe_iov_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_data_prepare_sgl: "
			    "ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d, "
			    "attempting to use copy path", nblks, ret);
			goto prepare_sgl_copy_path;
		}

		return (EIB_E_SUCCESS);
	}

prepare_sgl_copy_path:
	if (pktsz <= swqe->qe_bufsz) {
		swqe->qe_wr.send.wr_nds = 1;
		swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl;
		swqe->qe_sgl.ds_len = pktsz;

		/*
		 * Even though this is the copy path for transfers less than
		 * qe_bufsz, it could still be an LSO packet.  If so, we only
		 * have to write the data following all the headers into the
		 * work request buffer, since we'll be sending the lso header
		 * itself separately.  If this is not an LSO send (but pkt size
		 * greater than mtu, say for a jumbo frame), then we need
		 * to write all the headers, including EoIB encapsulation,
		 * into the work request buffer.
		 */
		bufp = (uchar_t *)(uintptr_t)swqe->qe_sgl.ds_va;
		if (lsohdr_sz == 0) {
			*(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
			bufp += EIB_ENCAP_HDR_SZ;
		}
		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
			blksize = MBLKL(nmp) - pending_hdr;
			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
			bufp += blksize;
			pending_hdr = 0;
		}

		/*
		 * If the ethernet frame we're going to send is less than
		 * ETHERMIN, pad up the buffer to ETHERMIN (with zeros)
		 */
		if ((pktsz + lsohdr_sz) < (ETHERMIN + EIB_ENCAP_HDR_SZ)) {
			bzero(bufp, (ETHERMIN + EIB_ENCAP_HDR_SZ) -
			    (pktsz + lsohdr_sz));
			swqe->qe_sgl.ds_len = ETHERMIN + EIB_ENCAP_HDR_SZ;
		}
		return (EIB_E_SUCCESS);
	}

	/*
	 * Copy path for transfers greater than swqe->qe_bufsz
	 */
	swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;
	if (eib_rsrc_grab_lsobufs(ss, pktsz, swqe->qe_wr.send.wr_sgl,
	    &(swqe->qe_wr.send.wr_nds)) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_prepare_sgl: "
		    "eib_rsrc_grab_lsobufs() failed");
		return (EIB_E_FAILURE);
	}
	swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_LSO;

	/*
	 * Copy the larger-than-qe_bufsz packet into a set of fixed-sized,
	 * pre-mapped LSO buffers.  Note that we might need to skip part of
	 * the LSO header in the first fragment as before.
	 */
	nmp = data_mp;
	skip = pending_hdr;
	for (i = 0; i < swqe->qe_wr.send.wr_nds; i++) {
		sgl = swqe->qe_wr.send.wr_sgl + i;
		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
		avail = EIB_LSO_BUFSZ;

		/*
		 * If this is a non-LSO packet (perhaps a jumbo frame?)
		 * we may still need to prefix the EoIB header in the
		 * wr buffer.
		 */
		if ((i == 0) && (lsohdr_sz == 0)) {
			*(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
			bufp += EIB_ENCAP_HDR_SZ;
			avail -= EIB_ENCAP_HDR_SZ;
		}

		while (nmp && avail) {
			blksize = MBLKL(nmp) - skip;
			if (blksize > avail) {
				bcopy(nmp->b_rptr + skip, bufp, avail);
				skip += avail;
				avail = 0;
			} else {
				bcopy(nmp->b_rptr + skip, bufp, blksize);
				skip = 0;
				bufp += blksize;
				avail -= blksize;
				nmp = nmp->b_cont;
			}
		}
	}

	return (EIB_E_SUCCESS);
}

/*ARGSUSED*/
static int
eib_data_is_mcast_pkt_ok(eib_vnic_t *vnic, uint8_t *macaddr, uint64_t *brdcst,
    uint64_t *multicst)
{
	/*
	 * If the dmac is the broadcast address, let it through.  Otherwise,
	 * either we should be in promiscuous mode or the dmac should be in
	 * our list of joined multicast addresses.  Currently we only update
	 * the stat counters and always let things through.
	 */
	if (bcmp(macaddr, eib_broadcast_mac, ETHERADDRL) == 0)
		EIB_INCR_COUNTER(brdcst);
	else
		EIB_INCR_COUNTER(multicst);

	return (1);
}

static void
eib_data_rx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;

	if (cq_hdl != chan->ch_rcv_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_rx_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_rcv_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_rcv_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_data_rx_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl, NULL);
}

static void
eib_data_tx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;

	if (cq_hdl != chan->ch_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_tx_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_data_tx_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_data_tx_si_hdl, NULL);
}

static mblk_t *
eib_data_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_stats_t *stats = ss->ei_stats;
	eib_ether_hdr_t evh;
	mblk_t *mp;
	boolean_t allocd_mp = B_FALSE;
	uint_t ec_hdr;
	uint_t ec_sign;
	uint_t ec_ver;
	uint_t ec_tu_cs;
	uint_t ec_ip_cs;

	/*
	 * Before we process this mblk and send it up to the network layer,
	 * see if we're running low on rwqes in the wqe pool.  If so, allocate
	 * a new mblk, copy the received data into it and send it up (and
	 * return the current rwqe back to the pool immediately by calling
	 * freemsg() on the original mblk).
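	 * That way the rwqe becomes available for reposting right away,
	 * instead of having to wait for the nw layer to free the mblk.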
	 */
	if (!eib_rsrc_rxpool_low(wqe)) {
		mp = wqe->qe_mp;
	} else {
		if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) {
			bcopy(wqe->qe_mp->b_rptr, mp->b_rptr,
			    wc->wc_bytes_xfer);
			freemsg(wqe->qe_mp);
			allocd_mp = B_TRUE;
		} else {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "wqe level below watermark, dropping rx pkt");
			EIB_INCR_COUNTER(&stats->st_norcvbuf);
			freemsg(wqe->qe_mp);
			return (NULL);
		}
	}

	/*
	 * Adjust the write pointer depending on how much data came in.  Note
	 * that since the nw layer will expect us to hand over the mp with
	 * the ethernet header starting at mp->b_rptr, update the b_rptr as
	 * well.
	 */
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

	/*
	 * We have a problem if this really happens!
	 */
	if (mp->b_next != NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "received packet's b_next not NULL, possible dup from cq");
		mp->b_next = NULL;
	}

	/*
	 * Drop loopback packets?
	 */
	if ((wc->wc_slid == ss->ei_props->ep_blid) &&
	    (wc->wc_qpn == chan->ch_qpn)) {
		goto data_rx_comp_fail;
	}

	mp->b_rptr += EIB_GRH_SZ;

	/*
	 * Since the recv buffer has been aligned for the IP header to start
	 * on a word boundary, it is safe to say that the EoIB and ethernet
	 * headers won't start on a word boundary.
	 */
	bcopy(mp->b_rptr, &ec_hdr, EIB_ENCAP_HDR_SZ);

	/*
	 * Check EoIB signature and version
	 */
	ec_hdr = ntohl(ec_hdr);

	ec_sign = (ec_hdr >> EIB_ENCAP_SIGN_SHIFT) & EIB_ENCAP_SIGN_MASK;
	if (ec_sign != EIB_EH_SIGNATURE) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header signature (0x%lx) unknown",
		    ec_sign);
		goto data_rx_comp_fail;
	}

	ec_ver = (ec_hdr >> EIB_ENCAP_VER_SHIFT) & EIB_ENCAP_VER_MASK;
	if (ec_ver != EIB_EH_VERSION) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header version (0x%lx) unknown",
		    ec_ver);
		goto data_rx_comp_fail;
	}

	/*
	 * Check TCP/UDP and IP checksum
	 */
	ec_tu_cs = (ec_hdr >> EIB_ENCAP_TCPCHK_SHIFT) & EIB_ENCAP_TCPCHK_MASK;
	ec_ip_cs = (ec_hdr >> EIB_ENCAP_IPCHK_SHIFT) & EIB_ENCAP_IPCHK_MASK;

	if ((ec_tu_cs == EIB_EH_UDPCSUM_OK || ec_tu_cs == EIB_EH_TCPCSUM_OK) &&
	    (ec_ip_cs == EIB_EH_IPCSUM_OK)) {
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
	} else if (ec_tu_cs == EIB_EH_CSUM_BAD || ec_ip_cs == EIB_EH_CSUM_BAD) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header tcp/udp checksum (0x%lx) or "
		    "ip checksum (0x%lx) is bad", ec_tu_cs, ec_ip_cs);
	}

	/*
	 * Update the message block's b_rptr to the start of the ethernet
	 * header and parse the header information
	 */
	mp->b_rptr += EIB_ENCAP_HDR_SZ;
	eib_data_parse_ether_hdr(mp, &evh);

	/*
	 * If the incoming packet is vlan-tagged, but the tag doesn't match
	 * this vnic's vlan, drop it.
	 */
	if ((evh.eh_tagless == 0) && (evh.eh_vlan != ld->ld_assigned_vlan)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "received packet's vlan unknown, expected=0x%x, got=0x%x",
		    ld->ld_assigned_vlan, evh.eh_vlan);
		goto data_rx_comp_fail;
	}

	/*
	 * Final checks to see if the unicast destination is indeed correct
	 * and to see if the multicast address is ok for us.
	 */
	if (EIB_UNICAST_MAC(evh.eh_dmac)) {
		if (bcmp(evh.eh_dmac, ld->ld_assigned_mac, ETHERADDRL) != 0) {
			uint8_t *exp;
			uint8_t *got;

			exp = ld->ld_assigned_mac;
			got = evh.eh_dmac;

			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "received packet's macaddr mismatch, "
			    "expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x",
			    exp[0], exp[1], exp[2], exp[3], exp[4], exp[5],
			    got[0], got[1], got[2], got[3], got[4], got[5]);

			goto data_rx_comp_fail;
		}
	} else {
		if (!eib_data_is_mcast_pkt_ok(vnic, evh.eh_dmac,
		    &stats->st_brdcstrcv, &stats->st_multircv)) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "multicast packet not ok");
			goto data_rx_comp_fail;
		}
	}

	/*
	 * Strip the ethernet FCS if present in the packet.  ConnectX-2
	 * doesn't support ethernet FCS, so this shouldn't happen anyway.
	 */
	if ((ec_hdr >> EIB_ENCAP_FCS_B_SHIFT) & 0x1) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "ethernet FCS present (ec_hdr=0x%lx), stripping it",
		    ec_hdr);

		mp->b_wptr -= ETHERFCSL;
	}

	/*
	 * If this is the same mp as was in the original rwqe (i.e. we didn't
	 * do any allocb()), then mark the rwqe flag so we know that its mblk
	 * is with the network layer.
	 */
	if (!allocd_mp) {
		wqe->qe_info |= EIB_WQE_FLG_WITH_NW;
	}

	return (mp);

data_rx_comp_fail:
	freemsg(mp);
	return (NULL);
}

static void
eib_data_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, eib_chan_t *chan)
{
	eib_t *ss = vnic->vn_ss;
	ibt_status_t ret;

	if (wqe->qe_mp) {
		if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_MAPPED) {
			ret = ibt_unmap_mem_iov(ss->ei_hca_hdl,
			    wqe->qe_iov_hdl);
			if (ret != IBT_SUCCESS) {
				EIB_DPRINTF_WARN(ss->ei_instance,
				    "eib_data_tx_comp: "
				    "ibt_unmap_mem_iov() failed, ret=%d", ret);
			}
			wqe->qe_iov_hdl = NULL;
		} else if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_LSO) {
			eib_rsrc_return_lsobufs(ss, wqe->qe_big_sgl,
			    wqe->qe_wr.send.wr_nds);
		}
		freemsg(wqe->qe_mp);
		wqe->qe_mp = NULL;
	}

	eib_rsrc_return_swqe(ss, wqe, chan);
}

static void
eib_data_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;

	/*
	 * Currently, all we do is report
	 */
	switch (wc->wc_status) {
	case IBT_WC_WR_FLUSHED_ERR:
		break;

	case IBT_WC_LOCAL_CHAN_OP_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
		    "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;

	case IBT_WC_LOCAL_PROTECT_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
		    "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;
	}

	/*
	 * When a wc indicates error, we do not attempt to repost the
	 * rwqe but simply return it to the wqe pool.
	 * Also for rwqes, attempting to free the mblk in the wqe invokes the
	 * eib_data_rx_recycle() callback.  For tx wqes, error handling
	 * is the same as successful completion handling.  We still
	 * have to unmap iov/free lsobufs/free mblk and then return the
	 * swqe to the pool.
	 */
	if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) {
		ASSERT(wqe->qe_mp != NULL);
		freemsg(wqe->qe_mp);
	} else {
		eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
	}
}

/*ARGSUSED*/
static void
eib_rb_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/*
	 * Reset any completion handlers we may have set up
	 */
	if (chan->ch_rcv_cq_hdl) {
		ibt_set_cq_handler(chan->ch_rcv_cq_hdl, NULL, NULL);
	}
	if (chan->ch_cq_hdl) {
		ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL);
	}

	/*
	 * Remove any softints that were added
	 */
	if (vnic->vn_data_rx_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_data_rx_si_hdl);
		vnic->vn_data_rx_si_hdl = NULL;
	}
	if (vnic->vn_data_tx_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_data_tx_si_hdl);
		vnic->vn_data_tx_si_hdl = NULL;
	}

	/*
	 * Release any work completion buffers we may have allocated
	 */
	if (chan->ch_rcv_wc && chan->ch_rcv_cq_sz) {
		kmem_free(chan->ch_rcv_wc,
		    sizeof (ibt_wc_t) * chan->ch_rcv_cq_sz);
	}
	chan->ch_rcv_cq_sz = 0;
	chan->ch_rcv_wc = NULL;

	if (chan->ch_wc && chan->ch_cq_sz) {
		kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz);
	}
	chan->ch_cq_sz = 0;
	chan->ch_wc = NULL;

	/*
	 * Free any completion queues we may have allocated
	 */
	if (chan->ch_rcv_cq_hdl) {
		ret = ibt_free_cq(chan->ch_rcv_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_cqs: "
			    "ibt_free_cq(rcv_cq) failed, ret=%d", ret);
		}
		chan->ch_rcv_cq_hdl = NULL;
	}
	if (chan->ch_cq_hdl) {
		ret = ibt_free_cq(chan->ch_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_cqs: "
			    "ibt_free_cq(snd_cq) failed, ret=%d", ret);
		}
		chan->ch_cq_hdl = NULL;
	}
}

/*ARGSUSED*/
static void
eib_rb_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	if (chan->ch_chan) {
		/*
		 * We're trying to tear down this UD channel.  Make sure that
		 * we don't attempt to refill (repost) at any point from now
		 * on.
		 */
		chan->ch_tear_down = B_TRUE;
		if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_ud_channel: "
			    "ibt_flush_channel() failed, ret=%d", ret);
		}

		/*
		 * Wait until all posted tx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_tx_lock);
		while (chan->ch_tx_posted > 0)
			cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
		mutex_exit(&chan->ch_tx_lock);

		/*
		 * Wait until all posted rx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_rx_lock);
		while (chan->ch_rx_posted > 0)
			cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
		mutex_exit(&chan->ch_rx_lock);

		/*
		 * Now we're ready to free this channel
		 */
		if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_ud_channel: "
			    "ibt_free_channel() failed, ret=%d", ret);
		}

		chan->ch_alloc_mp = B_FALSE;
		chan->ch_ip_hdr_align = 0;
		chan->ch_rwqe_bktsz = 0;
		chan->ch_lwm_rwqes = 0;
		chan->ch_max_rwqes = 0;
		chan->ch_max_swqes = 0;
		chan->ch_qpn = 0;
		chan->ch_chan = NULL;
	}
}