/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 */

#include "ixgbe_sw.h"

static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
    link_list_t *, const void *, size_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
    link_list_t *, uint8_t *, size_t);
static uint_t ixgbe_tcb_done(tx_control_block_t *);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
    ixgbe_tx_context_t *, size_t);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
    link_list_t *);

static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
    ixgbe_tx_context_t *);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
    ixgbe_tx_context_t *);

#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_context)
#pragma inline(ixgbe_check_context)
#pragma inline(ixgbe_fill_context)
#endif

/*
 * ixgbe_ring_tx
 *
 * Transmit one mblk through the specified tx ring.
 *
 * One mblk can consist of several fragments; each fragment is
 * processed differently based on its size. Fragments smaller than
 * the bcopy threshold are processed with bcopy; otherwise they are
 * processed with DMA binding.
 *
 * To process the mblk, for each fragment, we pass a pointer to the location
 * of the current transmit control block (tcb) (initialized to NULL) to either
 * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
 * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
 * tcb if possible, or close out the current tcb, allocate a new tcb, and
 * update the passed location (tx_control_block_t **) to reflect the new
 * current tcb.
 *
 * Since bound mblk fragments require their own tcb, the close, allocate new,
 * and update steps occur on every call to ixgbe_tx_bind(), but since
 * consecutive small mblk fragments can be combined into a single tcb, the
 * close, allocate new, and update steps may not occur on every call to
 * ixgbe_tx_copy().
 * If ixgbe_tx_copy() is called while the current tcb is already being used
 * for copying and there is enough room in it for the current mblk fragment,
 * we append the data from the mblk fragment to it. If ixgbe_tx_copy() is
 * called and the current tcb isn't being used for copying (i.e. the previous
 * iteration of the loop called ixgbe_tx_bind()), or it doesn't have enough
 * space for the mblk fragment, we close out the current tcb, grab a new tcb
 * from the free list, and update the current tcb to the newly obtained tcb.
 *
 * When LSO (large segment offload) is enabled, we first copy the packet
 * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
 * processing the remainder of the packet. The remaining bytes of the packet
 * are then copied or mapped based on the fragment size as described above.
 *
 * Through the entire processing of a packet, we keep track of the number of
 * DMA descriptors being used (either bound or pre-bound buffers used for
 * copying) by this packet. Each tcb requires at least one DMA descriptor, but
 * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
 * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
 * number of DMA descriptors that are closed (ready for the HW). Since the
 * hardware limits the number of descriptors that can be used to transmit a
 * single packet, if the total number of DMA descriptors required to transmit
 * this packet exceeds this limit, we perform a msgpullup() and try again.
 * Since our DMA attributes limit the number of DMA cookies allowed to
 * map a single span of memory to a value (MAX_COOKIE) less than the
 * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
 * as long as sufficient tcbs are available, we should always be able to
 * process a packet that's contained in a single mblk_t (no additional
 * fragments).
 *
 * Once all of the tcbs have been set up, ixgbe_tx_fill_ring() is called to
 * set up the tx ring to transmit the tcbs and then tell the HW to start
 * transmitting. When transmission is complete, an interrupt is triggered
 * which calls the appropriate recycle routine to place the tcbs that were
 * used in transmission back in the free list. We may also try to recycle
 * any available tcbs when the size of the tcb free list gets low or if the
 * watchdog timer triggers.
 *
 */
mblk_t *
ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
{
	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
	ixgbe_t *ixgbe = tx_ring->ixgbe;
	mblk_t *mp = orig_mp;
	mblk_t *pull_mp = NULL;
	tx_control_block_t *tcb;
	size_t mbsize, offset, len;
	uint32_t desc_total;
	uint32_t copy_thresh;
	int desc_num;
	ixgbe_tx_context_t tx_context, *ctx = NULL;
	link_list_t pending_list;
	boolean_t limit_retry = B_FALSE;

	ASSERT(mp->b_next == NULL);

	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
	    !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
	    ixgbe->link_state != LINK_STATE_UP) {
		freemsg(mp);
		return (NULL);
	}

	copy_thresh = ixgbe->tx_copy_thresh;

	mbsize = msgsize(mp);

	if (ixgbe->tx_hcksum_enable) {
		/*
		 * Retrieve checksum context information from the mblk
		 * that will be used to decide whether/how to fill the
		 * context descriptor.
		 */
		ctx = &tx_context;
		if (ixgbe_get_context(mp, ctx) < 0) {
			freemsg(mp);
			return (NULL);
		}

		/*
		 * If the mblk size exceeds the max size ixgbe can
		 * process, discard this mblk and return NULL.
		 */
		if ((ctx->lso_flag &&
		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
		    (!ctx->lso_flag &&
		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
			freemsg(mp);
			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
			return (NULL);
		}
	}

	/*
	 * If we use too many descriptors (see comments below), we may do
	 * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
	 * any time we error return past here, we should check and free
	 * pull_mp if != NULL.
	 */
retry:
	/*
	 * Check and recycle tx descriptors.
	 * The recycle threshold here should be selected carefully.
	 */
	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
		tx_ring->tx_recycle(tx_ring);
	}

	/*
	 * After the recycling, if tbd_free is still less than the
	 * overload threshold, record the overload and return the original
	 * mblk; the transmit will need to be re-scheduled.
	 */
	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
		tx_ring->reschedule = B_TRUE;
		tx_ring->stat_overload++;
		if (pull_mp != NULL)
			freemsg(pull_mp);
		return (orig_mp);
	}

	/*
	 * The pending_list is a linked list that is used to save
	 * the tx control blocks whose packet data has been processed
	 * but not yet placed on the tx descriptor ring. It is used
	 * to reduce contention on tx_lock.
	 */
	LINK_LIST_INIT(&pending_list);

	tcb = NULL;
	desc_num = 0;
	desc_total = 0;
	offset = 0;

	/*
	 * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
	 * into a single descriptor separate from the remaining data.
	 */
	if ((ctx != NULL) && ctx->lso_flag) {
		size_t hdr_len;

		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;

		/*
		 * Copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
		 * and TCP/UDP headers) into tcb.
		 */
		for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
			size_t mlen = MBLKL(mp);
			size_t amt = MIN(mlen, len);
			int ret;

			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
			    mp->b_rptr, amt);
			/*
			 * Since we're trying to copy all of the headers into
			 * a single buffer in a single tcb, if ixgbe_tx_copy()
			 * returns anything but 0, it means either no tcbs
			 * are available (< 0), or while copying, we spilled
			 * over and couldn't fit all the headers into a
			 * single tcb.
			 */
			if (ret != 0) {
				if (ret > 0)
					tx_ring->stat_lso_header_fail++;
				goto tx_failure;
			}

			len -= amt;

			/*
			 * If we copy less than the full amount of this
			 * mblk_t, we have some amount to copy below.
			 */
			if (amt < mlen) {
				offset = amt;
				break;
			}
		}

		ASSERT0(len);

		/*
		 * Finish off the header tcb, and start anew for the
		 * rest of the packet.
		 */
		desc_total += ixgbe_tcb_done(tcb);
		tcb = NULL;
	}

	/*
	 * Process each remaining segment in the packet -- either binding
	 * the dblk_t or copying the contents of the dblk_t to an already
	 * bound buffer. When we copy, we will accumulate consecutive small
	 * (less than copy_thresh bytes) segments into a single tcb buffer
	 * until no more can fit (or we encounter a segment larger than
	 * copy_thresh and bind the dblk_t).
	 *
	 * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
	 * transmit control blocks (tcb)s as needed (and append them onto
	 * 'pending_list'). Both functions also replace 'tcb' with the new
	 * tcb when they allocate a new tcb.
	 *
	 * We stop trying to process the packet once the number of descriptors
	 * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
	 * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
	 * context descriptor (since we're already at the limit), so there's
	 * no point in continuing. We'll pull up the mblk_t (see below)
	 * and try again.
	 */
	while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
		uint8_t *rptr = mp->b_rptr + offset;
		int ret;

		len = MBLKL(mp) - offset;
		offset = 0;

		if (len > copy_thresh) {
			ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
			    len);
		} else {
			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
			    len);
		}

		if (ret < 0)
			goto tx_failure;

		desc_total += ret;
		mp = mp->b_cont;
	}

	/* Finish off the last tcb */
	desc_total += ixgbe_tcb_done(tcb);

	/*
	 * The 82598/82599 chipsets have a limitation that no more than 32 tx
	 * descriptors can be transmitted out at one time. As noted above,
	 * we need to include space for a context descriptor in case it's
	 * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
	 * as well as when it exceeds the limit.
	 *
	 * If we exceed this limit, we take the hit, do a msgpullup(), and
	 * then try again. Our DMA attributes guarantee we should never use
	 * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
	 * should only need to retry once.
	 */
	if (desc_total >= IXGBE_TX_DESC_LIMIT) {
		/* We shouldn't hit this path twice */
		VERIFY0(limit_retry);

		tx_ring->stat_break_tbd_limit++;

		/* Release all the tcbs we used previously */
		ixgbe_put_free_list(tx_ring, &pending_list);
		desc_total = 0;
		offset = 0;

		pull_mp = msgpullup(orig_mp, -1);
		if (pull_mp == NULL) {
			tx_ring->reschedule = B_TRUE;
			return (orig_mp);
		}

		mp = pull_mp;
		limit_retry = B_TRUE;
		goto retry;
	}

	/*
	 * Before filling the tx descriptor ring with the data, we need to
	 * ensure there are adequate free descriptors for transmit
	 * (including one context descriptor). Do not use up all the tx
	 * descriptors; otherwise tx recycle will fail and cause a false hang.
	 */
	if (tx_ring->tbd_free <= (desc_total + 1)) {
		tx_ring->tx_recycle(tx_ring);
	}

	mutex_enter(&tx_ring->tx_lock);
	/*
	 * If the number of free tx descriptors is not enough for the
	 * transmit, return the original mblk.
	 *
	 * Note: we must put this check under the mutex protection to
	 * ensure the correctness when multiple threads access it in
	 * parallel.
	 */
	if (tx_ring->tbd_free <= (desc_total + 1)) {
		tx_ring->stat_fail_no_tbd++;
		mutex_exit(&tx_ring->tx_lock);
		goto tx_failure;
	}

	/*
	 * Attach the mblk_t we've set up to the last control block.
	 * This is only done once we know there are enough free descriptors
	 * to transmit so that the cleanup in tx_failure doesn't try to
	 * call freemsg() on mp (since we will want to return it).
	 */
	tcb->mp = (pull_mp != NULL) ?
	    pull_mp : orig_mp;

	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
	    mbsize);

	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

	tx_ring->stat_obytes += mbsize;
	tx_ring->stat_opackets++;

	mutex_exit(&tx_ring->tx_lock);

	/*
	 * Now that tx is done, if we pulled up the original message, we
	 * can free the original message since it is no longer being
	 * used.
	 */
	if (pull_mp != NULL) {
		freemsg(orig_mp);
	}

	return (NULL);

tx_failure:
	/*
	 * If transmission fails, we need to free the pulled-up mblk.
	 */
	if (pull_mp) {
		freemsg(pull_mp);
	}

	/*
	 * Return the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	/* Transmit failed, do not drop the mblk, reschedule the transmit */
	tx_ring->reschedule = B_TRUE;

	return (orig_mp);
}

/*
 * ixgbe_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
 * otherwise return the number of descriptors we've completed in this call.
 */
static int
ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
    link_list_t *pending_list, const void *buf, size_t len)
{
	tx_control_block_t *tcb = *tcbp;
	dma_buffer_t *tx_buf;
	uint32_t desc_num = 0;

	/*
	 * We need a new tcb -- either the current one (tcb) is NULL because
	 * we just started, tcb is being used for DMA, or tcb isn't large
	 * enough to hold the contents we need to copy.
	 */
	if (tcb == NULL || tcb->tx_type == USE_DMA ||
	    tcb->tx_buf.len + len > tcb->tx_buf.size) {
		tx_control_block_t *newtcb;

		newtcb = ixgbe_get_free_list(tx_ring, pending_list);
		if (newtcb == NULL)
			return (-1);

		newtcb->tx_type = USE_COPY;

		if (tcb != NULL)
			desc_num += ixgbe_tcb_done(tcb);
		*tcbp = tcb = newtcb;
	}

	ASSERT3S(tcb->tx_type, ==, USE_COPY);
	tx_buf = &tcb->tx_buf;

	/*
	 * Copy the packet data of the mblk fragment into the
	 * pre-allocated tx buffer, which is maintained by the
	 * tx control block.
	 *
	 * Several mblk fragments can be copied into one tx buffer.
	 * The destination address of the current copied fragment in
	 * the tx buffer is next to the end of the previous copied
	 * fragment.
	 */
	if (len > 0) {
		bcopy(buf, tx_buf->address + tx_buf->len, len);

		tx_buf->len += len;
		tcb->frag_num++;
	}

	return (desc_num);
}

/*
 * ixgbe_tx_bind
 *
 * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
 * returns the number of descriptors completed in this call. This count
 * can include descriptors that weren't filled in by the current call to
 * ixgbe_tx_bind() but were being used (but not yet completed) in previous
 * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
 */
static int
ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
    link_list_t *pending_list, uint8_t *buf, size_t len)
{
	tx_control_block_t *tcb = NULL;
	uint_t desc_num = 0;
	int status;

	tcb = ixgbe_get_free_list(tx_ring, pending_list);
	if (tcb == NULL)
		return (-1);

	/*
	 * Use DMA binding to process the mblk fragment
	 */
	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
	    (caddr_t)buf, len,
	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    0, NULL, NULL);

	if (status != DDI_DMA_MAPPED) {
		tx_ring->stat_fail_dma_bind++;
		return (-1);
	}

	tcb->frag_num++;
	tcb->tx_type = USE_DMA;

	/*
	 * If there was an old tcb, we're about to replace it. Finish
	 * setting up the old tcb so we can replace it with the new one.
	 */
	if (*tcbp != NULL)
		desc_num += ixgbe_tcb_done(*tcbp);

	*tcbp = tcb;
	return (desc_num);
}

/*
 * Once we're done populating a tcb (either by binding or copying into
 * a buffer in the tcb), get it ready for tx and return the number of
 * descriptors used.
 */
static uint_t
ixgbe_tcb_done(tx_control_block_t *tcb)
{
	uint_t desc_num = 0;

	if (tcb->tx_type == USE_DMA) {
		const ddi_dma_cookie_t *c;

		for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
		    c != NULL;
		    c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
			/*
			 * Save the address and length to the private data
			 * structure of the tx control block, which will be
			 * used to fill the tx descriptor ring after all the
			 * fragments are processed.
			 */
			ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
			desc_num++;
		}
	} else if (tcb->tx_type == USE_COPY) {
		dma_buffer_t *tx_buf = &tcb->tx_buf;

		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
		desc_num++;
	} else {
		panic("invalid tcb type");
	}

	return (desc_num);
}

/*
 * ixgbe_get_context
 *
 * Get the context information from the mblk
 */
static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
	uint32_t start;
	uint32_t hckflags;
	uint32_t lsoflags;
	uint32_t lsocksum;
	uint32_t mss;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;
	uint32_t l4_hdr_len;

	ASSERT(mp != NULL);

	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
	bzero(ctx, sizeof (ixgbe_tx_context_t));

	if (hckflags == 0) {
		return (0);
	}

	ctx->hcksum_flags = hckflags;

	mac_lso_get(mp, &mss, &lsoflags);
	ctx->mss = mss;
	ctx->lso_flag = (lsoflags == HW_LSO);

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * First, get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
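	 *
	 * The same pattern is used for each header field we need to locate
	 * below: walk the b_cont chain until the running total 'size' covers
	 * 'offset', then compute 'pos' to point at that byte within the
	 * current fragment.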
	 */
	size = len = MBLKL(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment.
	 */
	lsocksum = HCK_PARTIALCKSUM;
	ctx->l3_proto = etype;
	switch (etype) {
	case ETHERTYPE_IP:
		if (ctx->lso_flag) {
			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;

			offset = offsetof(ipha_t, ipha_hdr_checksum) +
			    mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/*
			 * To perform LSO, the TCP checksum field of the
			 * packet must also be filled with the pseudo-header
			 * checksum over (ip_source_addr,
			 * ip_destination_addr, l4_proto). The TCP/IP stack
			 * has already done this for us.
			 */
			lsocksum |= HCK_IPV4_HDRCKSUM;
		}

		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		/*
		 * We need to zero out the length in the header.
		 */
		if (ctx->lso_flag) {
			offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;
		}

		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
		return (-2);
	}

	if (ctx->lso_flag) {
		/*
		 * LSO relies on tx h/w checksum, so we drop the packet here
		 * if the h/w checksum flags are not set.
		 */
		if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
			IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
			    "are not set for LSO, found 0x%x, needed bits 0x%x",
			    ctx->hcksum_flags, lsocksum);
			return (-1);
		}

		offset = mac_hdr_len + start;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
	} else {
		/*
		 * l4 header length is only required for LSO
		 */
		l4_hdr_len = 0;
	}

	ctx->mac_hdr_len = mac_hdr_len;
	ctx->ip_hdr_len = start;
	ctx->l4_proto = l4_proto;
	ctx->l4_hdr_len = l4_hdr_len;

	return (0);
}

/*
 * ixgbe_check_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
{
	ixgbe_tx_context_t *last;

	if (ctx == NULL)
		return (B_FALSE);

	/*
	 * Compare the context data retrieved from the mblk with the
	 * stored data of the last context descriptor. The data that
	 * need to be checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	l3_proto
	 *	mac_hdr_len
	 *	ip_hdr_len
	 *	lso_flag
	 *	mss (only checked for LSO)
	 *	l4_hdr_len (only checked for LSO)
	 * If any one of the above items has changed, a new context
	 * descriptor will be needed.
	 */
	last = &tx_ring->tx_context;

	if ((ctx->hcksum_flags != last->hcksum_flags) ||
	    (ctx->l4_proto != last->l4_proto) ||
	    (ctx->l3_proto != last->l3_proto) ||
	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
	    (ctx->lso_flag != last->lso_flag) ||
	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * ixgbe_fill_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
    ixgbe_tx_context_t *ctx)
{
	/*
	 * Fill the context descriptor with the checksum
	 * context information we've got.
	 */
	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
	    IXGBE_ADVTXD_MACLEN_SHIFT;

	ctx_tbd->type_tucmd_mlhl =
	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
	/*
	 * When we have a TX context set up, we enforce that the ethertype is
	 * either IPv4 or IPv6 in ixgbe_get_context().
	 */
	if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
		if (ctx->l3_proto == ETHERTYPE_IP) {
			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
		} else {
			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
		}
	}

	if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
		switch (ctx->l4_proto) {
		case IPPROTO_TCP:
			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
			break;
		case IPPROTO_UDP:
			/*
			 * We don't have to explicitly set:
			 *	ctx_tbd->type_tucmd_mlhl |=
			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
			 */
			break;
		default:
			/* Unrecoverable error */
			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
			break;
		}
	}

	ctx_tbd->seqnum_seed = 0;

	if (ctx->lso_flag) {
		ctx_tbd->mss_l4len_idx =
		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
	} else {
		ctx_tbd->mss_l4len_idx = 0;
	}
}

/*
 * ixgbe_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    ixgbe_tx_context_t *ctx, size_t mbsize)
{
	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
	boolean_t load_context;
	uint32_t index, tcb_index, desc_num;
	union ixgbe_adv_tx_desc *tbd, *first_tbd;
	tx_control_block_t *tcb, *first_tcb;
	uint32_t hcksum_flags;
	int i;

	ASSERT(mutex_owned(&tx_ring->tx_lock));

	tbd = NULL;
	first_tbd = NULL;
	first_tcb = NULL;
	desc_num = 0;
	hcksum_flags = 0;
	load_context = B_FALSE;

	/*
	 * Get the index of the first tx descriptor that will be filled,
	 * and the index of the first work list item that will hold the
	 * first used tx control block in the pending list.
	 * Note: the two indexes are the same.
	 */
	index = tx_ring->tbd_tail;
	tcb_index = tx_ring->tbd_tail;

	if (ctx != NULL) {
		hcksum_flags = ctx->hcksum_flags;

		/*
		 * Check if a new context descriptor is needed for this packet
		 */
		load_context = ixgbe_check_context(tx_ring, ctx);

		if (load_context) {
			tbd = &tx_ring->tbd_ring[index];

			/*
			 * Fill the context descriptor with the
			 * hardware checksum offload information.
			 */
			ixgbe_fill_context(
			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;

			/*
			 * Store the checksum context data if
			 * a new context descriptor is added
			 */
			tx_ring->tx_context = *ctx;
		}
	}

	first_tbd = &tx_ring->tbd_ring[index];

	/*
	 * Fill tx data descriptors with the data saved in the pending list.
	 * The tx control blocks in the pending list are added to the work list
	 * at the same time.
	 *
	 * The work list corresponds strictly 1:1 to the descriptor ring.
	 * One item of the work list corresponds to one tx descriptor. Because
	 * one tx control block can span multiple tx descriptors, the tx
	 * control block will be added to the first work list item that
	 * corresponds to the first tx descriptor generated from that tx
	 * control block.
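	 * The work list items that correspond to the remaining descriptors
	 * of that tx control block are left NULL.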
	 */
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	first_tcb = tcb;
	while (tcb != NULL) {

		for (i = 0; i < tcb->desc_num; i++) {
			tbd = &tx_ring->tbd_ring[index];

			tbd->read.buffer_addr = tcb->desc[i].address;
			tbd->read.cmd_type_len = tcb->desc[i].length;

			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
			    | IXGBE_ADVTXD_DTYP_DATA;

			tbd->read.olinfo_status = 0;

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
		}

		/*
		 * Add the tx control block to the work list
		 */
		ASSERT(tx_ring->work_list[tcb_index] == NULL);
		tx_ring->work_list[tcb_index] = tcb;

		tcb_index = index;
		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	if (load_context) {
		/*
		 * Count the context descriptor for
		 * the first tx control block.
		 */
		first_tcb->desc_num++;
	}
	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);

	/*
	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
	 * valid in the first descriptor of the packet. The packet length is
	 * set in the paylen field of every first_tbd: 82599, X540 and X550
	 * require it with or without LSO, and 82598 ignores it in non-LSO
	 * mode.
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;

	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		if (ctx != NULL && ctx->lso_flag) {
			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
			first_tbd->read.olinfo_status |=
			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
		}
		break;

	case ixgbe_mac_82599EB:
	case ixgbe_mac_X540:
	case ixgbe_mac_X550:
	case ixgbe_mac_X550EM_x:
	case ixgbe_mac_X550EM_a:
		if (ctx != NULL && ctx->lso_flag) {
			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
			first_tbd->read.olinfo_status |=
			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
		} else {
			first_tbd->read.olinfo_status |=
			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
		}
		break;

	default:
		break;
	}

	/* Set hardware checksum bits */
	if (hcksum_flags != 0) {
		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_ADVTXD_POPTS_IXSM;
		if (hcksum_flags & HCK_PARTIALCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_ADVTXD_POPTS_TXSM;
	}

	/*
	 * The last descriptor of the packet needs the End Of Packet (EOP)
	 * and Report Status (RS) bits set
	 */
	ASSERT(tbd != NULL);
	tbd->read.cmd_type_len |=
	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

	/*
	 * Update the number of the free tx descriptors.
	 * The mutual exclusion between the transmission and the recycling
	 * (for the tx descriptor ring and the work list) is implemented
	 * with the atomic operation on the number of the free tx descriptors.
	 *
	 * Note: we must always decrement the counter tbd_free before
	 * advancing the hardware TDT pointer, to avoid the race where the
	 * hardware finishes transmitting the descriptors and the tx
	 * recycling increases tbd_free before we have decremented it.
	 */
	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
	ASSERT(i >= 0);

	tx_ring->tbd_tail = index;

	/*
	 * Advance the hardware TDT pointer of the tx descriptor ring
	 */
	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);

	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
	    DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
	}

	return (desc_num);
}

/*
 * ixgbe_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
	sw_desc_t *desc;

	desc = &tcb->desc[tcb->desc_num];
	desc->address = address;
	desc->length = length;

	tcb->desc_num++;
}

/*
 * ixgbe_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index, last_index, prev_index;
	int desc_num;
	boolean_t desc_done;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	ixgbe_t *ixgbe = tx_ring->ixgbe;

	mutex_enter(&tx_ring->recycle_lock);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		if (tx_ring->reschedule) {
			tx_ring->reschedule = B_FALSE;
			mac_tx_ring_update(ixgbe->mac_hdl,
			    tx_ring->ring_handle);
		}
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		mutex_exit(&tx_ring->recycle_lock);
		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
		return (0);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */

	tcb = tx_ring->work_list[index];
	ASSERT(tcb != NULL);

	while (tcb != NULL) {
		/*
		 * Get the last tx descriptor of this packet.
		 * If the last tx descriptor is done, then
		 * we can recycle all descriptors of a packet
		 * which usually includes several tx control blocks.
		 * For 82599, LSO descriptors cannot be recycled
		 * unless the whole packet's transmission is done.
		 * That's why packet-level recycling is used here.
		 * For 82598, there is no such limit.
		 */
		last_index = tcb->last_index;
		/*
		 * MAX_TX_RING_SIZE is used to judge whether
		 * the index is a valid value or not.
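		 * (ixgbe_free_tcb() resets last_index to MAX_TX_RING_SIZE;
		 * ixgbe_tx_fill_ring() sets it to the index of the packet's
		 * last descriptor on the first tcb of each packet.)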
		 */
		if (last_index == MAX_TX_RING_SIZE)
			break;

		/*
		 * Check if the Descriptor Done bit is set
		 */
		desc_done = tx_ring->tbd_ring[last_index].wb.status &
		    IXGBE_TXD_STAT_DD;
		if (desc_done) {
			/*
			 * Recycle all descriptors of the packet
			 */
			while (tcb != NULL) {
				/*
				 * Strip off the tx control block from
				 * the work list, and add it to the
				 * pending list.
				 */
				tx_ring->work_list[index] = NULL;
				LIST_PUSH_TAIL(&pending_list, &tcb->link);

				/*
				 * Count the total number of the tx
				 * descriptors recycled
				 */
				desc_num += tcb->desc_num;

				index = NEXT_INDEX(index, tcb->desc_num,
				    tx_ring->ring_size);

				tcb = tx_ring->work_list[index];

				prev_index = PREV_INDEX(index, 1,
				    tx_ring->ring_size);
				if (prev_index == last_index)
					break;
			}
		} else {
			break;
		}
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
	    (tx_ring->reschedule)) {
		tx_ring->reschedule = B_FALSE;
		mac_tx_ring_update(ixgbe->mac_hdl,
		    tx_ring->ring_handle);
	}
	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * ixgbe_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index;
	uint32_t head_wb;
	int desc_num;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	ixgbe_t *ixgbe = tx_ring->ixgbe;

	mutex_enter(&tx_ring->recycle_lock);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		if (tx_ring->reschedule) {
			tx_ring->reschedule = B_FALSE;
			mac_tx_ring_update(ixgbe->mac_hdl,
			    tx_ring->ring_handle);
		}
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 *
	 * Note: For head write-back mode, the tx descriptors will not
	 * be written back, but the head write-back value is stored at
	 * the last extra tbd at the end of the DMA area, so we still
	 * need to sync this head write-back value for the kernel.
	 *
	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
	 */
	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
	    sizeof (uint32_t),
	    DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		mutex_exit(&tx_ring->recycle_lock);
		ddi_fm_service_impact(ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
		return (0);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Next index to clean */

	/*
	 * Get the value of head write-back
	 */
	head_wb = *tx_ring->tbd_head_wb;
	while (index != head_wb) {
		tcb = tx_ring->work_list[index];
		ASSERT(tcb != NULL);

		if (OFFSET(index, head_wb, tx_ring->ring_size) <
		    tcb->desc_num) {
			/*
			 * The current tx control block is not
			 * completely transmitted, stop recycling
			 */
			break;
		}

		/*
		 * Strip off the tx control block from the work list,
		 * and add it to the pending list.
		 */
		tx_ring->work_list[index] = NULL;
		LIST_PUSH_TAIL(&pending_list, &tcb->link);

		/*
		 * Advance the index of the tx descriptor ring
		 */
		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

		/*
		 * Count the total number of the tx descriptors recycled
		 */
		desc_num += tcb->desc_num;
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
	    (tx_ring->reschedule)) {
		tx_ring->reschedule = B_FALSE;
		mac_tx_ring_update(ixgbe->mac_hdl,
		    tx_ring->ring_handle);
	}
	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * ixgbe_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and reset the
 * other control fields.
 */
void
ixgbe_free_tcb(tx_control_block_t *tcb)
{
	if (tcb == NULL)
		return;

	switch (tcb->tx_type) {
	case USE_COPY:
		/*
		 * Reset the buffer length that is used for copy
		 */
		tcb->tx_buf.len = 0;
		break;
	case USE_DMA:
		/*
		 * Release the DMA resource that is used for
		 * DMA binding.
		 */
		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
		break;
	default:
		break;
	}

	/*
	 * Free the mblk
	 */
	if (tcb->mp != NULL) {
		freemsg(tcb->mp);
		tcb->mp = NULL;
	}

	tcb->tx_type = USE_NONE;
	tcb->last_index = MAX_TX_RING_SIZE;
	tcb->frag_num = 0;
	tcb->desc_num = 0;
}

/*
 * ixgbe_get_free_list - Get a free tx control block from the free list.
 * Returns the tx control block and appends it to 'list'.
 *
 * The atomic operation on the number of the available tx control blocks
 * in the free list is used to keep this routine mutually exclusive with
 * the routine ixgbe_put_free_list.
 */
static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
{
	tx_control_block_t *tcb;

	/*
	 * Check and update the number of the free tx control blocks
	 * in the free list.
	 */
	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
		tx_ring->stat_fail_no_tcb++;
		return (NULL);
	}

	mutex_enter(&tx_ring->tcb_head_lock);

	tcb = tx_ring->free_list[tx_ring->tcb_head];
	ASSERT(tcb != NULL);
	tx_ring->free_list[tx_ring->tcb_head] = NULL;
	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
	    tx_ring->free_list_size);

	mutex_exit(&tx_ring->tcb_head_lock);

	LIST_PUSH_TAIL(list, &tcb->link);
	return (tcb);
}

/*
 * ixgbe_put_free_list
 *
 * Put a list of used tx control blocks back to the free list
 *
 * A mutex is used here to ensure the serialization. The mutual exclusion
 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
{
	uint32_t index;
	int tcb_num;
	tx_control_block_t *tcb;

	for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
	    tcb != NULL;
	    tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
		/*
		 * Despite the name, ixgbe_free_tcb() just releases the
		 * resources in tcb, but does not free tcb itself.
		 */
		ixgbe_free_tcb(tcb);
	}

	mutex_enter(&tx_ring->tcb_tail_lock);

	index = tx_ring->tcb_tail;

	tcb_num = 0;
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {
		ASSERT(tx_ring->free_list[index] == NULL);
		tx_ring->free_list[index] = tcb;

		tcb_num++;

		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	tx_ring->tcb_tail = index;

	/*
	 * Update the number of the free tx control blocks
	 * in the free list. This operation must be placed
	 * under the protection of the lock.
	 */
	atomic_add_32(&tx_ring->tcb_free, tcb_num);

	mutex_exit(&tx_ring->tcb_tail_lock);
}