/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */

#include "ixgbe_sw.h"

static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
    ixgbe_tx_context_t *, size_t);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);

static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
    ixgbe_tx_context_t *);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
    ixgbe_tx_context_t *);

#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_context)
#pragma inline(ixgbe_check_context)
#pragma inline(ixgbe_fill_context)
#endif

/*
 * ixgbe_ring_tx
 *
 * Transmit one mblk through a specified tx ring.
 *
 * One mblk can consist of several fragments; each fragment is
 * processed differently based on its size. Fragments smaller than
 * the bcopy threshold are processed with bcopy; otherwise they are
 * processed with DMA binding.
 *
 * To process the mblk, a tx control block is taken from the
 * free list. One tx control block contains one tx buffer, which
 * is used to copy the data of mblk fragments, and one tx DMA handle,
 * which is used to bind an mblk fragment to DMA resources.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer will be transmitted with
 * one tx descriptor.
 *
 * A large fragment binds to one tx control block's DMA handle
 * and can span several tx descriptors.
 *
 * So several tx control blocks may be used to transmit one packet
 * (mblk). After the processing, those tx control blocks are put
 * on the work list.
 */
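/*
 * Illustrative sketch of the per-fragment decision (example values only,
 * assuming a copy threshold of 512 bytes): for a packet made of 60-, 40- and
 * 1800-byte fragments, the first two fragments are coalesced into one tx
 * buffer by bcopy and consume a single tx descriptor, while the 1800-byte
 * fragment is DMA-bound and may consume one descriptor per DMA cookie. The
 * resulting descriptor count therefore depends on both the fragment layout
 * and how the bound buffer is split into cookies.
 */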
mblk_t *
ixgbe_ring_tx(void *arg, mblk_t *mp)
{
    ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
    ixgbe_t *ixgbe = tx_ring->ixgbe;
    tx_type_t current_flag, next_flag;
    uint32_t current_len, next_len;
    uint32_t desc_total;
    size_t mbsize;
    int desc_num;
    boolean_t copy_done, eop;
    mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
    tx_control_block_t *tcb;
    ixgbe_tx_context_t tx_context, *ctx;
    link_list_t pending_list;
    uint32_t len, hdr_frag_len, hdr_len;
    uint32_t copy_thresh;
    mblk_t *hdr_new_mp = NULL;
    mblk_t *hdr_pre_mp = NULL;
    mblk_t *hdr_nmp = NULL;

    ASSERT(mp->b_next == NULL);

    if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
        (ixgbe->ixgbe_state & IXGBE_ERROR) ||
        (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
        !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
        ixgbe->link_state != LINK_STATE_UP) {
        freemsg(mp);
        return (NULL);
    }

    copy_thresh = ixgbe->tx_copy_thresh;

    /* Get the mblk size */
    mbsize = 0;
    for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
        mbsize += MBLKL(nmp);
    }

    if (ixgbe->tx_hcksum_enable) {
        /*
         * Retrieve checksum context information from the mblk
         * that will be used to decide whether/how to fill the
         * context descriptor.
         */
        ctx = &tx_context;
        if (ixgbe_get_context(mp, ctx) < 0) {
            freemsg(mp);
            return (NULL);
        }

        /*
         * If the mblk size exceeds the maximum size ixgbe can
         * process, discard this mblk and return NULL.
         */
        if ((ctx->lso_flag &&
            ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
            (!ctx->lso_flag &&
            (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
            freemsg(mp);
            IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
            return (NULL);
        }
    } else {
        ctx = NULL;
    }

    /*
     * Check and recycle tx descriptors.
     * The recycle threshold here should be selected carefully.
     */
    if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
        tx_ring->tx_recycle(tx_ring);
    }

    /*
     * After the recycling, if tbd_free is still less than the
     * overload threshold, assert overload and return mp;
     * the tx will need to be re-scheduled.
     */
    if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
        tx_ring->reschedule = B_TRUE;
        tx_ring->stat_overload++;
        return (mp);
    }

    /*
     * The pending_list is a linked list that is used to save
     * the tx control blocks that have had their packet data
     * processed but not yet placed on the tx descriptor ring.
     * It is used to reduce contention on tx_lock.
     */
    LINK_LIST_INIT(&pending_list);
    desc_num = 0;
    desc_total = 0;

    /*
     * The software should guarantee that the LSO packet header
     * (MAC + IP + TCP) fits within one descriptor. Here we reallocate
     * and refill the header if its memory is not physically contiguous.
     */
    if ((ctx != NULL) && ctx->lso_flag) {
        /* find the last fragment of the header */
        len = MBLKL(mp);
        ASSERT(len > 0);
        hdr_nmp = mp;
        hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
        while (len < hdr_len) {
            hdr_pre_mp = hdr_nmp;
            hdr_nmp = hdr_nmp->b_cont;
            len += MBLKL(hdr_nmp);
        }
        /*
         * If the header and the payload are in different mblks,
         * we simply force the header to be copied into the
         * pre-allocated page-aligned buffer.
         */
        if (len == hdr_len)
            goto adjust_threshold;
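        /*
         * Example (illustrative only): for a 54-byte header (14 MAC +
         * 20 IP + 20 TCP) whose first 40 bytes are in the first mblk
         * and whose remaining 14 bytes share the next mblk with the
         * payload, the loop above leaves len covering both mblks, and
         * the computation below yields hdr_frag_len = 54 - 40 = 14,
         * i.e. the header bytes that live in hdr_nmp.
         */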
        hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
        /*
         * There are two cases in which we need to reallocate an mblk
         * for the last header fragment:
         * 1. the header is in multiple mblks and the last fragment
         *    shares the same mblk with the payload
         * 2. the header is in a single mblk shared with the payload
         *    and the header is not physically contiguous
         */
        if ((hdr_nmp != mp) ||
            (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
            < hdr_len)) {
            tx_ring->stat_lso_header_fail++;
            /*
             * reallocate the mblk for the last header fragment,
             * expecting to bcopy into the pre-allocated page-aligned
             * buffer
             */
            hdr_new_mp = allocb(hdr_frag_len, 0);
            if (!hdr_new_mp)
                return (mp);
            bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
                hdr_frag_len);
            /* link the new header fragment with the other parts */
            hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
            hdr_new_mp->b_cont = hdr_nmp;
            if (hdr_pre_mp)
                hdr_pre_mp->b_cont = hdr_new_mp;
            else
                mp = hdr_new_mp;
            hdr_nmp->b_rptr += hdr_frag_len;
        }
adjust_threshold:
        /*
         * adjust the bcopy threshold to guarantee that the
         * header is handled with bcopy
         */
        if (copy_thresh < hdr_len)
            copy_thresh = hdr_len;
    }

    current_mp = mp;
    current_len = MBLKL(current_mp);
    /*
     * Decide which method to use for the first fragment
     */
    current_flag = (current_len <= copy_thresh) ?
        USE_COPY : USE_DMA;
    /*
     * If the mblk includes several contiguous small fragments,
     * they may be copied into one buffer. This flag is used to
     * indicate whether there are pending fragments that need to
     * be copied to the current tx buffer.
     *
     * If this flag is B_TRUE, it indicates that a new tx control
     * block is needed to process the next fragment using either
     * copy or DMA binding.
     *
     * Otherwise, it indicates that the next fragment will be
     * copied to the current tx buffer that is maintained by the
     * current tx control block. No new tx control block is needed.
     */
    copy_done = B_TRUE;
    while (current_mp) {
        next_mp = current_mp->b_cont;
        eop = (next_mp == NULL); /* Last fragment of the packet? */
        next_len = eop ? 0 : MBLKL(next_mp);

        /*
         * When the current fragment is an empty fragment, if
         * the next fragment will still be copied to the current
         * tx buffer, we cannot skip this fragment here, because
         * the copy processing is pending for completion. We have
         * to process this empty fragment in the tx_copy routine.
         *
         * If the copy processing is completed or a DMA binding
         * processing has just completed, we can simply skip this
         * empty fragment.
         */
        if ((current_len == 0) && (copy_done)) {
            current_mp = next_mp;
            current_len = next_len;
            current_flag = (current_len <= copy_thresh) ?
                USE_COPY : USE_DMA;
            continue;
        }

        if (copy_done) {
            /*
             * Get a new tx control block from the free list
             */
            tcb = ixgbe_get_free_list(tx_ring);

            if (tcb == NULL) {
                tx_ring->stat_fail_no_tcb++;
                goto tx_failure;
            }

            /*
             * Push the tx control block to the pending list
             * to avoid taking the lock too early
             */
            LIST_PUSH_TAIL(&pending_list, &tcb->link);
        }

        if (current_flag == USE_COPY) {
            /*
             * Check whether to use bcopy or DMA binding to process
             * the next fragment, and if using bcopy, whether we
             * need to continue copying the next fragment into the
             * current tx buffer.
             */
            ASSERT((tcb->tx_buf.len + current_len) <=
                tcb->tx_buf.size);

            if (eop) {
                /*
                 * This is the last fragment of the packet, so
                 * the copy processing will be completed with
                 * this fragment.
                 */
                next_flag = USE_NONE;
                copy_done = B_TRUE;
            } else if ((tcb->tx_buf.len + current_len + next_len) >
                tcb->tx_buf.size) {
                /*
                 * If the next fragment is too large to be
                 * copied to the current tx buffer, we need
                 * to complete the current copy processing.
                 */
                next_flag = (next_len > copy_thresh) ?
                    USE_DMA : USE_COPY;
                copy_done = B_TRUE;
            } else if (next_len > copy_thresh) {
                /*
                 * The next fragment needs to be processed with
                 * DMA binding. So the copy processing will be
                 * completed with the current fragment.
                 */
                next_flag = USE_DMA;
                copy_done = B_TRUE;
            } else {
                /*
                 * Continue to copy the next fragment to the
                 * current tx buffer.
                 */
                next_flag = USE_COPY;
                copy_done = B_FALSE;
            }

            desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
                current_len, copy_done);
        } else {
            /*
             * Check whether to use bcopy or DMA binding to process
             * the next fragment.
             */
            next_flag = (next_len > copy_thresh) ?
                USE_DMA : USE_COPY;
            ASSERT(copy_done == B_TRUE);

            desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
                current_len);
        }

        if (desc_num > 0)
            desc_total += desc_num;
        else if (desc_num < 0)
            goto tx_failure;

        current_mp = next_mp;
        current_len = next_len;
        current_flag = next_flag;
    }

    /*
     * Attach the mblk to the last tx control block
     */
    ASSERT(tcb);
    ASSERT(tcb->mp == NULL);
    tcb->mp = mp;

    /*
     * The 82598/82599 chipsets have a limitation that no more than 32 tx
     * descriptors can be transmitted out at one time.
     *
     * Here is a workaround: pull up the mblk, then send it out
     * using DMA binding. By doing so, no more than MAX_COOKIE (18)
     * descriptors are needed.
     */
    if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
        tx_ring->stat_break_tbd_limit++;

        /*
         * Discard the mblk and free the used resources
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb) {
            tcb->mp = NULL;
            ixgbe_free_tcb(tcb);
            tcb = (tx_control_block_t *)
                LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Return the tx control blocks in the pending list to
         * the free list.
         */
        ixgbe_put_free_list(tx_ring, &pending_list);

        /*
         * pull up the mblk and send it out using DMA binding
         */
        if ((pull_mp = msgpullup(mp, -1)) == NULL) {
            tx_ring->reschedule = B_TRUE;

            /*
             * If a new mblk was allocated for the last header
             * fragment of an LSO packet, we should restore the
             * modified mp.
             */
            if (hdr_new_mp) {
                hdr_new_mp->b_cont = NULL;
                freeb(hdr_new_mp);
                hdr_nmp->b_rptr -= hdr_frag_len;
                if (hdr_pre_mp)
                    hdr_pre_mp->b_cont = hdr_nmp;
                else
                    mp = hdr_nmp;
            }
            return (mp);
        }

        LINK_LIST_INIT(&pending_list);
        desc_total = 0;

        /*
         * If the packet is an LSO packet, we simply transmit the
         * header in one descriptor using bcopy.
         */
        if ((ctx != NULL) && ctx->lso_flag) {
            hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
                ctx->l4_hdr_len;

            tcb = ixgbe_get_free_list(tx_ring);
            if (tcb == NULL) {
                tx_ring->stat_fail_no_tcb++;
                goto tx_failure;
            }
            desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
                hdr_len, B_TRUE);
            LIST_PUSH_TAIL(&pending_list, &tcb->link);
            desc_total += desc_num;

            pull_mp->b_rptr += hdr_len;
        }

        tcb = ixgbe_get_free_list(tx_ring);
        if (tcb == NULL) {
            tx_ring->stat_fail_no_tcb++;
            goto tx_failure;
        }
        if ((ctx != NULL) && ctx->lso_flag) {
            desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
                mbsize - hdr_len);
        } else {
            desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
                mbsize);
        }
        if (desc_num < 0) {
            goto tx_failure;
        }
        LIST_PUSH_TAIL(&pending_list, &tcb->link);

        desc_total += desc_num;
        tcb->mp = pull_mp;
    }

    /*
     * Before filling the tx descriptor ring with the data, we need to
     * ensure there are adequate free descriptors for the transmit
     * (including one context descriptor).
     * Do not use up all the tx descriptors; otherwise tx recycle will
     * fail and cause a false hang.
     */
    if (tx_ring->tbd_free <= (desc_total + 1)) {
        tx_ring->tx_recycle(tx_ring);
    }

    mutex_enter(&tx_ring->tx_lock);
    /*
     * If the number of free tx descriptors is not enough for the
     * transmit, then return mp.
     *
     * Note: we must put this check under the mutex protection to
     * ensure correctness when multiple threads access it in
     * parallel.
     */
    if (tx_ring->tbd_free <= (desc_total + 1)) {
        tx_ring->stat_fail_no_tbd++;
        mutex_exit(&tx_ring->tx_lock);
        goto tx_failure;
    }

    desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
        mbsize);

    ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

    tx_ring->stat_obytes += mbsize;
    tx_ring->stat_opackets++;

    mutex_exit(&tx_ring->tx_lock);

    /*
     * Now that the transmission succeeded, free the original mp
     * if we used the pulled-up mblk for the transmission.
     */
    if (pull_mp) {
        freemsg(mp);
    }

    return (NULL);

tx_failure:
    /*
     * If the transmission fails, free the pulled-up mblk.
     */
    if (pull_mp) {
        freemsg(pull_mp);
    }

    /*
     * If a new mblk was allocated for the last header
     * fragment of an LSO packet, we should restore the
     * modified mp.
     */
    if (hdr_new_mp) {
        hdr_new_mp->b_cont = NULL;
        freeb(hdr_new_mp);
        hdr_nmp->b_rptr -= hdr_frag_len;
        if (hdr_pre_mp)
            hdr_pre_mp->b_cont = hdr_nmp;
        else
            mp = hdr_nmp;
    }
    /*
     * Discard the mblk and free the used resources
     */
    tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
    while (tcb) {
        tcb->mp = NULL;

        ixgbe_free_tcb(tcb);

        tcb = (tx_control_block_t *)
            LIST_GET_NEXT(&pending_list, &tcb->link);
    }

    /*
     * Return the tx control blocks in the pending list to the free list.
     */
    ixgbe_put_free_list(tx_ring, &pending_list);

    /* Transmit failed; do not drop the mblk, reschedule the transmit */
    tx_ring->reschedule = B_TRUE;

    return (mp);
}
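/*
 * Note on the copy path below (illustrative example, not driver logic):
 * because tx_buf.len only advances as fragments are appended, several small
 * fragments, e.g. 14 + 20 + 20 bytes of headers, can be placed back-to-back
 * in one tx buffer and described by a single tx descriptor once copy_done is
 * set. The numbers are only an example; the real bound is tx_buf.size,
 * which the caller checks before each copy.
 */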
/*
 * ixgbe_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer
 */
static int
ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done)
{
    dma_buffer_t *tx_buf;
    uint32_t desc_num;
    _NOTE(ARGUNUSED(tx_ring));

    tx_buf = &tcb->tx_buf;

    /*
     * Copy the packet data of the mblk fragment into the
     * pre-allocated tx buffer, which is maintained by the
     * tx control block.
     *
     * Several mblk fragments can be copied into one tx buffer.
     * The destination address of the current copied fragment in
     * the tx buffer is next to the end of the previous copied
     * fragment.
     */
    if (len > 0) {
        bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

        tx_buf->len += len;
        tcb->frag_num++;
    }

    desc_num = 0;

    /*
     * If it is the last fragment copied to the current tx buffer,
     * in other words, if there's no remaining fragment or the remaining
     * fragment requires a new tx control block to process, we need to
     * complete the current copy processing by syncing up the current
     * DMA buffer and saving the descriptor data.
     */
    if (copy_done) {
        /*
         * Sync the DMA buffer of the packet data
         */
        DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

        tcb->tx_type = USE_COPY;

        /*
         * Save the address and length to the private data structure
         * of the tx control block, which will be used to fill the
         * tx descriptor ring after all the fragments are processed.
         */
        ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
        desc_num++;
    }

    return (desc_num);
}

/*
 * ixgbe_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
    int status, i;
    ddi_dma_cookie_t dma_cookie;
    uint_t ncookies;
    int desc_num;

    /*
     * Use DMA binding to process the mblk fragment
     */
    status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
        (caddr_t)mp->b_rptr, len,
        DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
        0, &dma_cookie, &ncookies);

    if (status != DDI_DMA_MAPPED) {
        tx_ring->stat_fail_dma_bind++;
        return (-1);
    }

    tcb->frag_num++;
    tcb->tx_type = USE_DMA;
    /*
     * Each fragment can span several cookies. One cookie will have
     * one tx descriptor to transmit.
     */
    desc_num = 0;
    for (i = ncookies; i > 0; i--) {
        /*
         * Save the address and length to the private data structure
         * of the tx control block, which will be used to fill the
         * tx descriptor ring after all the fragments are processed.
         */
        ixgbe_save_desc(tcb,
            dma_cookie.dmac_laddress,
            dma_cookie.dmac_size);

        desc_num++;

        if (i > 1)
            ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
    }

    return (desc_num);
}

/*
 * ixgbe_get_context
 *
 * Get the context information from the mblk
 */
static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
    uint32_t start;
    uint32_t hckflags;
    uint32_t lsoflags;
    uint32_t lsocksum;
    uint32_t mss;
    uint32_t len;
    uint32_t size;
    uint32_t offset;
    unsigned char *pos;
    ushort_t etype;
    uint32_t mac_hdr_len;
    uint32_t l4_proto;
    uint32_t l4_hdr_len;

    ASSERT(mp != NULL);

    mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
    bzero(ctx, sizeof (ixgbe_tx_context_t));

    if (hckflags == 0) {
        return (0);
    }

    ctx->hcksum_flags = hckflags;

    mac_lso_get(mp, &mss, &lsoflags);
    ctx->mss = mss;
    ctx->lso_flag = (lsoflags == HW_LSO);

    etype = 0;
    mac_hdr_len = 0;
    l4_proto = 0;

    /*
     * First, get the position of the ether_type/ether_tpid.
     * Here we don't assume the ether (VLAN) header is fully included
     * in one mblk fragment, so we go through the fragments to parse
     * the ether type.
     */
    size = len = MBLKL(mp);
    offset = offsetof(struct ether_header, ether_type);
    while (size <= offset) {
        mp = mp->b_cont;
        ASSERT(mp != NULL);
        len = MBLKL(mp);
        size += len;
    }
    pos = mp->b_rptr + offset + len - size;

    etype = ntohs(*(ushort_t *)(uintptr_t)pos);
    if (etype == ETHERTYPE_VLAN) {
        /*
         * Get the position of the ether_type in the VLAN header
         */
        offset = offsetof(struct ether_vlan_header, ether_type);
        while (size <= offset) {
            mp = mp->b_cont;
            ASSERT(mp != NULL);
            len = MBLKL(mp);
            size += len;
        }
        pos = mp->b_rptr + offset + len - size;

        etype = ntohs(*(ushort_t *)(uintptr_t)pos);
        mac_hdr_len = sizeof (struct ether_vlan_header);
    } else {
        mac_hdr_len = sizeof (struct ether_header);
    }

    /*
     * Here we don't assume the IP(V6) header is fully included in
     * one mblk fragment.
     */
    lsocksum = HCK_PARTIALCKSUM;
    ctx->l3_proto = etype;
    switch (etype) {
    case ETHERTYPE_IP:
        if (ctx->lso_flag) {
            offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
            while (size <= offset) {
                mp = mp->b_cont;
                ASSERT(mp != NULL);
                len = MBLKL(mp);
                size += len;
            }
            pos = mp->b_rptr + offset + len - size;
            *((uint16_t *)(uintptr_t)(pos)) = 0;

            offset = offsetof(ipha_t, ipha_hdr_checksum) +
                mac_hdr_len;
            while (size <= offset) {
                mp = mp->b_cont;
                ASSERT(mp != NULL);
                len = MBLKL(mp);
                size += len;
            }
            pos = mp->b_rptr + offset + len - size;
            *((uint16_t *)(uintptr_t)(pos)) = 0;

            /*
             * To perform ixgbe LSO, the TCP checksum field of
             * the packet also needs to be filled with the
             * pseudo-header checksum over (ip_source_addr,
             * ip_destination_addr, l4_proto). Currently the
             * TCP/IP stack has already done it.
             */
            lsocksum |= HCK_IPV4_HDRCKSUM;
        }

        offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
        while (size <= offset) {
            mp = mp->b_cont;
            ASSERT(mp != NULL);
            len = MBLKL(mp);
            size += len;
        }
        pos = mp->b_rptr + offset + len - size;

        l4_proto = *(uint8_t *)pos;
        break;
    case ETHERTYPE_IPV6:
        /*
         * We need to zero out the length in the header.
         */
        if (ctx->lso_flag) {
            offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
            while (size <= offset) {
                mp = mp->b_cont;
                ASSERT(mp != NULL);
                len = MBLKL(mp);
                size += len;
            }
            pos = mp->b_rptr + offset + len - size;
            *((uint16_t *)(uintptr_t)(pos)) = 0;
        }

        offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
        while (size <= offset) {
            mp = mp->b_cont;
            ASSERT(mp != NULL);
            len = MBLKL(mp);
            size += len;
        }
        pos = mp->b_rptr + offset + len - size;

        l4_proto = *(uint8_t *)pos;
        break;
    default:
        /* Unrecoverable error */
        IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
        return (-2);
    }

    if (ctx->lso_flag) {
        /*
         * LSO relies on tx h/w checksum, so here we drop the packet
         * if the h/w checksum flags are not declared.
         */
        if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
            IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
                "are not set for LSO, found 0x%x, needed bits 0x%x",
                ctx->hcksum_flags, lsocksum);
            return (-1);
        }

        offset = mac_hdr_len + start;
        while (size <= offset) {
            mp = mp->b_cont;
            ASSERT(mp != NULL);
            len = MBLKL(mp);
            size += len;
        }
        pos = mp->b_rptr + offset + len - size;

        l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
    } else {
        /*
         * l4 header length is only required for LSO
         */
        l4_hdr_len = 0;
    }

    ctx->mac_hdr_len = mac_hdr_len;
    ctx->ip_hdr_len = start;
    ctx->l4_proto = l4_proto;
    ctx->l4_hdr_len = l4_hdr_len;

    return (0);
}

/*
 * ixgbe_check_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
{
    ixgbe_tx_context_t *last;

    if (ctx == NULL)
        return (B_FALSE);

    /*
     * Compare the context data retrieved from the mblk with the
     * stored data of the last context descriptor. The fields that
     * need to be checked are:
     *	hcksum_flags
     *	l4_proto
     *	l3_proto
     *	mac_hdr_len
     *	ip_hdr_len
     *	lso_flag
     *	mss (only checked for LSO)
     *	l4_hdr_len (only checked for LSO)
     * If any of these fields changes, a new context descriptor
     * is needed.
     */
    last = &tx_ring->tx_context;

    if ((ctx->hcksum_flags != last->hcksum_flags) ||
        (ctx->l4_proto != last->l4_proto) ||
        (ctx->l3_proto != last->l3_proto) ||
        (ctx->mac_hdr_len != last->mac_hdr_len) ||
        (ctx->ip_hdr_len != last->ip_hdr_len) ||
        (ctx->lso_flag != last->lso_flag) ||
        (ctx->lso_flag && ((ctx->mss != last->mss) ||
        (ctx->l4_hdr_len != last->l4_hdr_len)))) {
        return (B_TRUE);
    }

    return (B_FALSE);
}
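/*
 * Illustrative example of the check above: a stream of TCP segments with
 * identical offload flags, header lengths and MSS reuses the context
 * descriptor loaded for the first segment, so only that first packet pays
 * for the extra (context) descriptor. Any change, e.g. a different MSS or
 * switching between IPv4 and IPv6, causes a new context descriptor to be
 * written by ixgbe_tx_fill_ring().
 */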
/*
 * ixgbe_fill_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
    ixgbe_tx_context_t *ctx)
{
    /*
     * Fill the context descriptor with the checksum
     * context information we've got.
     */
    ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
    ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
        IXGBE_ADVTXD_MACLEN_SHIFT;

    ctx_tbd->type_tucmd_mlhl =
        IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
    /*
     * When we have a TX context set up, we enforce that the ethertype is
     * either IPv4 or IPv6 in ixgbe_get_context().
     */
    if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
        if (ctx->l3_proto == ETHERTYPE_IP) {
            ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
        } else {
            ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
        }
    }

    if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
        switch (ctx->l4_proto) {
        case IPPROTO_TCP:
            ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
            break;
        case IPPROTO_UDP:
            /*
             * We don't have to explicitly set:
             *	ctx_tbd->type_tucmd_mlhl |=
             *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
             * because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
             */
            break;
        default:
            /* Unrecoverable error */
            IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
            break;
        }
    }

    ctx_tbd->seqnum_seed = 0;

    if (ctx->lso_flag) {
        ctx_tbd->mss_l4len_idx =
            (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
            (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
    } else {
        ctx_tbd->mss_l4len_idx = 0;
    }
}

/*
 * ixgbe_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    ixgbe_tx_context_t *ctx, size_t mbsize)
{
    struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
    boolean_t load_context;
    uint32_t index, tcb_index, desc_num;
    union ixgbe_adv_tx_desc *tbd, *first_tbd;
    tx_control_block_t *tcb, *first_tcb;
    uint32_t hcksum_flags;
    int i;

    ASSERT(mutex_owned(&tx_ring->tx_lock));

    tbd = NULL;
    first_tbd = NULL;
    first_tcb = NULL;
    desc_num = 0;
    hcksum_flags = 0;
    load_context = B_FALSE;

    /*
     * Get the index of the first tx descriptor that will be filled,
     * and the index of the first work list item that will be attached
     * to the first used tx control block in the pending list.
     * Note: the two indexes are the same.
     */
    index = tx_ring->tbd_tail;
    tcb_index = tx_ring->tbd_tail;

    if (ctx != NULL) {
        hcksum_flags = ctx->hcksum_flags;

        /*
         * Check if a new context descriptor is needed for this packet
         */
        load_context = ixgbe_check_context(tx_ring, ctx);

        if (load_context) {
            tbd = &tx_ring->tbd_ring[index];

            /*
             * Fill the context descriptor with the
             * hardware checksum offload information.
             */
            ixgbe_fill_context(
                (struct ixgbe_adv_tx_context_desc *)tbd, ctx);

            index = NEXT_INDEX(index, 1, tx_ring->ring_size);
            desc_num++;

            /*
             * Store the checksum context data if
             * a new context descriptor is added
             */
            tx_ring->tx_context = *ctx;
        }
    }

    first_tbd = &tx_ring->tbd_ring[index];

    /*
     * Fill tx data descriptors with the data saved in the pending list.
     * The tx control blocks in the pending list are added to the work list
     * at the same time.
     *
     * The work list is strictly 1:1 corresponding to the descriptor ring.
     * One item of the work list corresponds to one tx descriptor. Because
     * one tx control block can span multiple tx descriptors, the tx
     * control block will be added to the first work list item that
     * corresponds to the first tx descriptor generated from that tx
     * control block.
     */
    tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
    first_tcb = tcb;
    while (tcb != NULL) {

        for (i = 0; i < tcb->desc_num; i++) {
            tbd = &tx_ring->tbd_ring[index];

            tbd->read.buffer_addr = tcb->desc[i].address;
            tbd->read.cmd_type_len = tcb->desc[i].length;

            tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
                | IXGBE_ADVTXD_DTYP_DATA;

            tbd->read.olinfo_status = 0;

            index = NEXT_INDEX(index, 1, tx_ring->ring_size);
            desc_num++;
        }

        /*
         * Add the tx control block to the work list
         */
        ASSERT(tx_ring->work_list[tcb_index] == NULL);
        tx_ring->work_list[tcb_index] = tcb;

        tcb_index = index;
        tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
    }

    if (load_context) {
        /*
         * Count the context descriptor for
         * the first tx control block.
         */
        first_tcb->desc_num++;
    }
    first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);

    /*
     * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
     * valid in the first descriptor of the packet.
     * The paylen field is also set in the first descriptor in every case:
     * 82599, X540 and X550 require the packet length in the paylen field
     * with or without LSO, and 82598 ignores it in non-LSO mode.
     */
    ASSERT(first_tbd != NULL);
    first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;

    switch (hw->mac.type) {
    case ixgbe_mac_82598EB:
        if (ctx != NULL && ctx->lso_flag) {
            first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
            first_tbd->read.olinfo_status |=
                (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
                - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
        }
        break;

    case ixgbe_mac_82599EB:
    case ixgbe_mac_X540:
    case ixgbe_mac_X550:
    case ixgbe_mac_X550EM_x:
    case ixgbe_mac_X550EM_a:
        if (ctx != NULL && ctx->lso_flag) {
            first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
            first_tbd->read.olinfo_status |=
                (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
                - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
        } else {
            first_tbd->read.olinfo_status |=
                (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
        }
        break;

    default:
        break;
    }

    /* Set hardware checksum bits */
    if (hcksum_flags != 0) {
        if (hcksum_flags & HCK_IPV4_HDRCKSUM)
            first_tbd->read.olinfo_status |=
                IXGBE_ADVTXD_POPTS_IXSM;
        if (hcksum_flags & HCK_PARTIALCKSUM)
            first_tbd->read.olinfo_status |=
                IXGBE_ADVTXD_POPTS_TXSM;
    }

    /*
     * The last descriptor of the packet needs the End Of Packet (EOP)
     * and Report Status (RS) bits set
     */
    ASSERT(tbd != NULL);
    tbd->read.cmd_type_len |=
        IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;

    /*
     * Sync the DMA buffer of the tx descriptor ring
     */
    DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

    /*
     * Update the number of the free tx descriptors.
     * The mutual exclusion between the transmission and the recycling
     * (for the tx descriptor ring and the work list) is implemented
     * with the atomic operation on the number of the free tx descriptors.
     *
     * Note: we should always decrement the counter tbd_free before
     * advancing the hardware TDT pointer, to avoid the race where the
     * transmit of the tx descriptors completes and tbd_free is increased
     * by the tx recycling before tbd_free has been decremented here.
     */
    i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
    ASSERT(i >= 0);

    tx_ring->tbd_tail = index;

    /*
     * Advance the hardware TDT pointer of the tx descriptor ring
     */
    IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);

    if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
        DDI_FM_OK) {
        ddi_fm_service_impact(tx_ring->ixgbe->dip,
            DDI_SERVICE_DEGRADED);
        atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
    }

    return (desc_num);
}
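/*
 * Illustrative descriptor layout produced by ixgbe_tx_fill_ring() for an
 * LSO packet that needs a new context (an example of the output, not an
 * additional code path):
 *
 *	[0]	context descriptor - MACLEN/IPLEN, MSS, L4LEN, TUCMD flags
 *	[1]	first data descriptor - IFCS | TSE, paylen = mbsize - headers
 *	[2..n]	further data descriptors - one per saved address/length pair
 *	[n]	the last data descriptor also carries EOP | RS
 */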
/*
 * ixgbe_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
    sw_desc_t *desc;

    desc = &tcb->desc[tcb->desc_num];
    desc->address = address;
    desc->length = length;

    tcb->desc_num++;
}

/*
 * ixgbe_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
{
    uint32_t index, last_index, prev_index;
    int desc_num;
    boolean_t desc_done;
    tx_control_block_t *tcb;
    link_list_t pending_list;
    ixgbe_t *ixgbe = tx_ring->ixgbe;

    mutex_enter(&tx_ring->recycle_lock);

    ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

    if (tx_ring->tbd_free == tx_ring->ring_size) {
        tx_ring->recycle_fail = 0;
        tx_ring->stall_watchdog = 0;
        if (tx_ring->reschedule) {
            tx_ring->reschedule = B_FALSE;
            mac_tx_ring_update(ixgbe->mac_hdl,
                tx_ring->ring_handle);
        }
        mutex_exit(&tx_ring->recycle_lock);
        return (0);
    }

    /*
     * Sync the DMA buffer of the tx descriptor ring
     */
    DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

    if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
        mutex_exit(&tx_ring->recycle_lock);
        ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
        atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
        return (0);
    }

    LINK_LIST_INIT(&pending_list);
    desc_num = 0;
    index = tx_ring->tbd_head; /* Index of next tbd/tcb to recycle */

    tcb = tx_ring->work_list[index];
    ASSERT(tcb != NULL);

    while (tcb != NULL) {
        /*
         * Get the last tx descriptor of this packet.
         * If the last tx descriptor is done, then
         * we can recycle all descriptors of a packet
         * which usually includes several tx control blocks.
         * For 82599, LSO descriptors cannot be recycled
         * unless the whole packet's transmission is done.
         * That's why packet-level recycling is used here.
         * For 82598, there is no such limit.
         */
        last_index = tcb->last_index;
        /*
         * MAX_TX_RING_SIZE is used to judge whether
         * the index is a valid value or not.
         */
        if (last_index == MAX_TX_RING_SIZE)
            break;

        /*
         * Check if the Descriptor Done bit is set
         */
        desc_done = tx_ring->tbd_ring[last_index].wb.status &
            IXGBE_TXD_STAT_DD;
        if (desc_done) {
            /*
             * recycle all descriptors of the packet
             */
            while (tcb != NULL) {
                /*
                 * Strip off the tx control block from
                 * the work list, and add it to the
                 * pending list.
                 */
                tx_ring->work_list[index] = NULL;
                LIST_PUSH_TAIL(&pending_list, &tcb->link);

                /*
                 * Count the total number of the tx
                 * descriptors recycled
                 */
                desc_num += tcb->desc_num;

                index = NEXT_INDEX(index, tcb->desc_num,
                    tx_ring->ring_size);

                tcb = tx_ring->work_list[index];

                prev_index = PREV_INDEX(index, 1,
                    tx_ring->ring_size);
                if (prev_index == last_index)
                    break;
            }
        } else {
            break;
        }
    }

    /*
     * If no tx descriptors are recycled, no need to do more processing
     */
    if (desc_num == 0) {
        tx_ring->recycle_fail++;
        mutex_exit(&tx_ring->recycle_lock);
        return (0);
    }

    tx_ring->recycle_fail = 0;
    tx_ring->stall_watchdog = 0;

    /*
     * Update the head index of the tx descriptor ring
     */
    tx_ring->tbd_head = index;

    /*
     * Update the number of the free tx descriptors with atomic operations
     */
    atomic_add_32(&tx_ring->tbd_free, desc_num);

    if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
        (tx_ring->reschedule)) {
        tx_ring->reschedule = B_FALSE;
        mac_tx_ring_update(ixgbe->mac_hdl,
            tx_ring->ring_handle);
    }
    mutex_exit(&tx_ring->recycle_lock);

    /*
     * Free the resources used by the tx control blocks
     * in the pending list
     */
    tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
    while (tcb != NULL) {
        /*
         * Release the resources occupied by the tx control block
         */
        ixgbe_free_tcb(tcb);

        tcb = (tx_control_block_t *)
            LIST_GET_NEXT(&pending_list, &tcb->link);
    }

    /*
     * Add the tx control blocks in the pending list to the free list.
     */
    ixgbe_put_free_list(tx_ring, &pending_list);

    return (desc_num);
}
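/*
 * The routine below is the alternative recycle path. Instead of polling the
 * Descriptor Done bit of each packet's last descriptor, as
 * ixgbe_tx_recycle_legacy() above does, the hardware writes the index of the
 * last completed descriptor to a host memory location (the extra entry at
 * the end of the descriptor ring's DMA area), and everything before that
 * index can be reclaimed in a single pass.
 */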
/*
 * ixgbe_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
{
    uint32_t index;
    uint32_t head_wb;
    int desc_num;
    tx_control_block_t *tcb;
    link_list_t pending_list;
    ixgbe_t *ixgbe = tx_ring->ixgbe;

    mutex_enter(&tx_ring->recycle_lock);

    ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

    if (tx_ring->tbd_free == tx_ring->ring_size) {
        tx_ring->recycle_fail = 0;
        tx_ring->stall_watchdog = 0;
        if (tx_ring->reschedule) {
            tx_ring->reschedule = B_FALSE;
            mac_tx_ring_update(ixgbe->mac_hdl,
                tx_ring->ring_handle);
        }
        mutex_exit(&tx_ring->recycle_lock);
        return (0);
    }

    /*
     * Sync the DMA buffer of the tx descriptor ring
     *
     * Note: For head write-back mode, the tx descriptors will not
     * be written back, but the head write-back value is stored at
     * the last extra tbd at the end of the DMA area; we still need
     * to sync the head write-back value for the kernel.
     *
     * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
     */
    (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
        sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
        sizeof (uint32_t),
        DDI_DMA_SYNC_FORKERNEL);

    if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
        mutex_exit(&tx_ring->recycle_lock);
        ddi_fm_service_impact(ixgbe->dip,
            DDI_SERVICE_DEGRADED);
        atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
        return (0);
    }

    LINK_LIST_INIT(&pending_list);
    desc_num = 0;
    index = tx_ring->tbd_head; /* Next index to clean */

    /*
     * Get the value of head write-back
     */
    head_wb = *tx_ring->tbd_head_wb;
    while (index != head_wb) {
        tcb = tx_ring->work_list[index];
        ASSERT(tcb != NULL);

        if (OFFSET(index, head_wb, tx_ring->ring_size) <
            tcb->desc_num) {
            /*
             * The current tx control block is not
             * completely transmitted; stop recycling
             */
            break;
        }

        /*
         * Strip off the tx control block from the work list,
         * and add it to the pending list.
         */
        tx_ring->work_list[index] = NULL;
        LIST_PUSH_TAIL(&pending_list, &tcb->link);

        /*
         * Advance the index of the tx descriptor ring
         */
        index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

        /*
         * Count the total number of the tx descriptors recycled
         */
        desc_num += tcb->desc_num;
    }

    /*
     * If no tx descriptors are recycled, no need to do more processing
     */
    if (desc_num == 0) {
        tx_ring->recycle_fail++;
        mutex_exit(&tx_ring->recycle_lock);
        return (0);
    }

    tx_ring->recycle_fail = 0;
    tx_ring->stall_watchdog = 0;

    /*
     * Update the head index of the tx descriptor ring
     */
    tx_ring->tbd_head = index;

    /*
     * Update the number of the free tx descriptors with atomic operations
     */
    atomic_add_32(&tx_ring->tbd_free, desc_num);

    if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
        (tx_ring->reschedule)) {
        tx_ring->reschedule = B_FALSE;
        mac_tx_ring_update(ixgbe->mac_hdl,
            tx_ring->ring_handle);
    }
    mutex_exit(&tx_ring->recycle_lock);

    /*
     * Free the resources used by the tx control blocks
     * in the pending list
     */
    tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
    while (tcb) {
        /*
         * Release the resources occupied by the tx control block
         */
        ixgbe_free_tcb(tcb);

        tcb = (tx_control_block_t *)
            LIST_GET_NEXT(&pending_list, &tcb->link);
    }

    /*
     * Add the tx control blocks in the pending list to the free list.
     */
    ixgbe_put_free_list(tx_ring, &pending_list);

    return (desc_num);
}

/*
 * ixgbe_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and resetting other
 * control fields.
 */
void
ixgbe_free_tcb(tx_control_block_t *tcb)
{
    switch (tcb->tx_type) {
    case USE_COPY:
        /*
         * Reset the buffer length that is used for copy
         */
        tcb->tx_buf.len = 0;
        break;
    case USE_DMA:
        /*
         * Release the DMA resource that is used for
         * DMA binding.
         */
        (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
        break;
    default:
        break;
    }

    /*
     * Free the mblk
     */
    if (tcb->mp != NULL) {
        freemsg(tcb->mp);
        tcb->mp = NULL;
    }

    tcb->tx_type = USE_NONE;
    tcb->last_index = MAX_TX_RING_SIZE;
    tcb->frag_num = 0;
    tcb->desc_num = 0;
}
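/*
 * The two routines below form the producer/consumer pair for the tx control
 * block free list. As a sketch of the synchronization: ixgbe_atomic_reserve()
 * only succeeds if tcb_free can be decremented without going negative, so a
 * getter never walks past entries that a concurrent ixgbe_put_free_list()
 * has not yet published; the putter, in turn, only increments tcb_free after
 * the entries have been linked in under tcb_tail_lock.
 */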
/*
 * ixgbe_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of the available tx control blocks
 * in the free list is used to keep this routine mutually exclusive with
 * the routine ixgbe_put_free_list.
 */
static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
{
    tx_control_block_t *tcb;

    /*
     * Check and update the number of the free tx control blocks
     * in the free list.
     */
    if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
        return (NULL);

    mutex_enter(&tx_ring->tcb_head_lock);

    tcb = tx_ring->free_list[tx_ring->tcb_head];
    ASSERT(tcb != NULL);
    tx_ring->free_list[tx_ring->tcb_head] = NULL;
    tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
        tx_ring->free_list_size);

    mutex_exit(&tx_ring->tcb_head_lock);

    return (tcb);
}

/*
 * ixgbe_put_free_list
 *
 * Put a list of used tx control blocks back on the free list
 *
 * A mutex is used here to ensure the serialization. The mutual exclusion
 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
{
    uint32_t index;
    int tcb_num;
    tx_control_block_t *tcb;

    mutex_enter(&tx_ring->tcb_tail_lock);

    index = tx_ring->tcb_tail;

    tcb_num = 0;
    tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
    while (tcb != NULL) {
        ASSERT(tx_ring->free_list[index] == NULL);
        tx_ring->free_list[index] = tcb;

        tcb_num++;

        index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

        tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
    }

    tx_ring->tcb_tail = index;

    /*
     * Update the number of the free tx control blocks
     * in the free list. This operation must be placed
     * under the protection of the lock.
     */
    atomic_add_32(&tx_ring->tcb_free, tcb_num);

    mutex_exit(&tx_ring->tcb_tail_lock);
}