/*
 * CDDL HEADER START
 *
 * Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at:
 * http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When using or redistributing this file, you may do so under the
 * License only. No other modification of this header is permitted.
 *
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include "igb_sw.h"

static boolean_t igb_tx(igb_tx_ring_t *, mblk_t *);
static int igb_tx_copy(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int igb_tx_bind(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int igb_tx_fill_ring(igb_tx_ring_t *, link_list_t *, tx_context_t *,
    size_t);
static void igb_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *);
static int igb_get_tx_context(mblk_t *, tx_context_t *);
static boolean_t igb_check_tx_context(igb_tx_ring_t *, tx_context_t *);
static void igb_fill_tx_context(struct e1000_adv_tx_context_desc *,
    tx_context_t *, uint32_t);

#ifndef IGB_DEBUG
#pragma inline(igb_save_desc)
#pragma inline(igb_get_tx_context)
#pragma inline(igb_check_tx_context)
#pragma inline(igb_fill_tx_context)
#endif

/*
 * igb_tx_ring_send - Send a packet on the specified tx ring
 *
 * This is the transmit entry point of the tx ring. It returns NULL
 * when the mblk has been consumed (transmitted or dropped), and
 * returns the mblk itself when the transmit must be retried later.
 */
mblk_t *
igb_tx_ring_send(void *arg, mblk_t *mp)
{
	igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg;

	ASSERT(tx_ring != NULL);

	return ((igb_tx(tx_ring, mp)) ? NULL : mp);
}

/*
 * igb_tx - Main transmit processing
 *
 * Called from igb_m_tx with an mblk ready to transmit. This
 * routine sets up the transmit descriptors and sends data to
 * the wire.
 *
 * One mblk can consist of several fragments, each fragment
 * will be processed with different methods based on the size.
 * For the fragments with size less than the bcopy threshold,
 * they will be processed by using bcopy; otherwise, they will
 * be processed by using DMA binding.
 *
 * To process the mblk, a tx control block is taken from the
 * free list. One tx control block contains one tx buffer, which
 * is used to copy mblk fragments' data; and one tx DMA handle,
 * which is used to bind an mblk fragment with DMA resource.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer will be transmitted with
 * one tx descriptor.
 *
 * A large fragment only binds with one tx control block's DMA
 * handle, and it can span several tx descriptors for transmitting.
 *
 * So to transmit a packet (mblk), several tx control blocks can
 * be used. After the processing, those tx control blocks will
 * be put on the work list.
 */
static boolean_t
igb_tx(igb_tx_ring_t *tx_ring, mblk_t *mp)
{
	igb_t *igb = tx_ring->igb;
	tx_type_t current_flag, next_flag;
	uint32_t current_len, next_len;
	uint32_t desc_total;
	size_t mbsize;
	int desc_num;
	boolean_t copy_done, eop;
	mblk_t *current_mp, *next_mp, *nmp;
	tx_control_block_t *tcb;
	tx_context_t tx_context, *ctx;
	link_list_t pending_list;
	mblk_t *new_mp;
	mblk_t *previous_mp;
	uint32_t hdr_frag_len;
	uint32_t hdr_len, len;
	uint32_t copy_thresh;

	copy_thresh = tx_ring->copy_thresh;

	/* Get the mblk size */
	mbsize = 0;
	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		mbsize += MBLKL(nmp);
	}

	if (igb->tx_hcksum_enable) {
		ctx = &tx_context;
		/*
		 * Retrieve offloading context information from the mblk
		 * that will be used to decide whether/how to fill the
		 * context descriptor.
		 */
		if (igb_get_tx_context(mp, ctx) != TX_CXT_SUCCESS) {
			freemsg(mp);
			return (B_TRUE);
		}

		if ((ctx->lso_flag &&
		    (mbsize > (ctx->mac_hdr_len + IGB_LSO_MAXLEN))) ||
		    (!ctx->lso_flag &&
		    (mbsize > (igb->max_frame_size - ETHERFCSL)))) {
			freemsg(mp);
			IGB_DEBUGLOG_0(igb, "igb_tx: packet oversize");
			return (B_TRUE);
		}
	} else {
		ctx = NULL;
		if (mbsize > (igb->max_frame_size - ETHERFCSL)) {
			freemsg(mp);
			IGB_DEBUGLOG_0(igb, "igb_tx: packet oversize");
			return (B_TRUE);
		}
	}

	/*
	 * Check and recycle tx descriptors.
	 * The recycle threshold here should be selected carefully.
	 */
	if (tx_ring->tbd_free < tx_ring->recycle_thresh)
		tx_ring->tx_recycle(tx_ring);

	/*
	 * After the recycling, if tbd_free is still less than the
	 * overload_thresh, assert overload and return B_FALSE so
	 * that the transmit will be rescheduled.
	 */
	if (tx_ring->tbd_free < tx_ring->overload_thresh) {
		tx_ring->reschedule = B_TRUE;
		IGB_DEBUG_STAT(tx_ring->stat_overload);
		return (B_FALSE);
	}

	/*
	 * The software should guarantee that the LSO packet header
	 * (MAC+IP+TCP) is within one descriptor - this is required by
	 * the hardware. Here we will reallocate and refill the header
	 * if the headers (MAC+IP+TCP) are not physically contiguous.
	 */
	if (ctx && ctx->lso_flag) {
		hdr_len = ctx->mac_hdr_len + ctx->ip_hdr_len +
		    ctx->l4_hdr_len;
		len = MBLKL(mp);
		current_mp = mp;
		previous_mp = NULL;
		while (len < hdr_len) {
			previous_mp = current_mp;
			current_mp = current_mp->b_cont;
			len += MBLKL(current_mp);
		}

		/*
		 * The work below is only needed when len is larger than
		 * copy_thresh; otherwise igb's tx copy mechanism will
		 * already ensure that the headers are handled in one
		 * descriptor.
		 */
		if (len > copy_thresh) {
			if (len != hdr_len) {
				/*
				 * If the header and the payload are in
				 * different mblks, we simply force the
				 * header to be copied into a
				 * newly-allocated buffer.
				 */
				hdr_frag_len = hdr_len -
				    (len - MBLKL(current_mp));

				/*
				 * There are two cases where we will
				 * reallocate a mblk for the last header
				 * fragment:
				 * 1. the header is in multiple mblks and
				 *    the last fragment shares the same mblk
				 *    with the payload
				 * 2. the header is in a single mblk shared
				 *    with the payload but the header crosses
				 *    a page.
				 */
				if ((current_mp != mp) ||
				    (P2NPHASE((uintptr_t)current_mp->b_rptr,
				    igb->page_size) < hdr_len)) {
					/*
					 * Reallocate the mblk for the last
					 * header fragment; it is expected
					 * to be copied into the
					 * pre-allocated page-aligned buffer.
					 */
					new_mp = allocb(hdr_frag_len, NULL);
					if (!new_mp) {
						return (B_FALSE);
					}

					/*
					 * Insert the new mblk
					 */
					bcopy(current_mp->b_rptr,
					    new_mp->b_rptr, hdr_frag_len);
					new_mp->b_wptr = new_mp->b_rptr +
					    hdr_frag_len;
					new_mp->b_cont = current_mp;
					if (previous_mp)
						previous_mp->b_cont = new_mp;
					else
						mp = new_mp;
					current_mp->b_rptr += hdr_frag_len;
				}
			}

			if (copy_thresh < hdr_len)
				copy_thresh = hdr_len;
		}
	}

	/*
	 * The pending_list is a linked list that is used to save
	 * the tx control blocks that have packet data processed
	 * but have not yet put the data on the tx descriptor ring.
	 * It is used to reduce the lock contention of the tx_lock.
	 */
	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	desc_total = 0;

	current_mp = mp;
	current_len = MBLKL(current_mp);
	/*
	 * Decide which method to use for the first fragment
	 */
	current_flag = (current_len <= copy_thresh) ?
	    USE_COPY : USE_DMA;
	/*
	 * If the mblk includes several contiguous small fragments,
	 * they may be copied into one buffer. This flag is used to
	 * indicate whether copying to the current tx buffer has been
	 * completed.
	 *
	 * If this flag is B_TRUE, it indicates that a new tx control
	 * block is needed to process the next fragment using either
	 * copy or DMA binding.
	 *
	 * Otherwise, it indicates that the next fragment will be
	 * copied to the current tx buffer that is maintained by the
	 * current tx control block. No new tx control block is needed.
	 */
	copy_done = B_TRUE;
	while (current_mp) {
		next_mp = current_mp->b_cont;
		eop = (next_mp == NULL); /* Last fragment of the packet? */
		next_len = eop ? 0: MBLKL(next_mp);

		/*
		 * When the current fragment is an empty fragment, if
		 * the next fragment will still be copied to the current
		 * tx buffer, we cannot skip this fragment here, because
		 * the copy processing is pending for completion. We have
		 * to process this empty fragment in the tx_copy routine.
		 *
		 * If the copy processing is completed or a DMA binding
		 * processing is just completed, we can just skip this
		 * empty fragment.
		 */
		if ((current_len == 0) && (copy_done)) {
			current_mp = next_mp;
			current_len = next_len;
			current_flag = (current_len <= copy_thresh) ?
			    USE_COPY : USE_DMA;
			continue;
		}

		if (copy_done) {
			/*
			 * Get a new tx control block from the free list
			 */
			tcb = igb_get_free_list(tx_ring);

			if (tcb == NULL) {
				IGB_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
				goto tx_failure;
			}

			/*
			 * Push the tx control block to the pending list
			 * to avoid taking the lock too early
			 */
			LIST_PUSH_TAIL(&pending_list, &tcb->link);
		}

		if (current_flag == USE_COPY) {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment, and if using bcopy, whether we
			 * need to continue copying the next fragment into the
			 * current tx buffer.
			 */
			ASSERT((tcb->tx_buf.len + current_len) <=
			    tcb->tx_buf.size);

			if (eop) {
				/*
				 * This is the last fragment of the packet, so
				 * the copy processing will be completed with
				 * this fragment.
				 */
				next_flag = USE_NONE;
				copy_done = B_TRUE;
			} else if ((tcb->tx_buf.len + current_len + next_len) >
			    tcb->tx_buf.size) {
				/*
				 * If the next fragment is too large to be
				 * copied to the current tx buffer, we need
				 * to complete the current copy processing.
				 */
				next_flag = (next_len > copy_thresh) ?
				    USE_DMA: USE_COPY;
				copy_done = B_TRUE;
			} else if (next_len > copy_thresh) {
				/*
				 * The next fragment needs to be processed with
				 * DMA binding. So the copy processing will be
				 * completed with the current fragment.
				 */
				next_flag = USE_DMA;
				copy_done = B_TRUE;
			} else {
				/*
				 * Continue to copy the next fragment to the
				 * current tx buffer.
				 */
				next_flag = USE_COPY;
				copy_done = B_FALSE;
			}

			desc_num = igb_tx_copy(tx_ring, tcb, current_mp,
			    current_len, copy_done);
		} else {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment.
			 */
			next_flag = (next_len > copy_thresh) ?
			    USE_DMA: USE_COPY;
			ASSERT(copy_done == B_TRUE);

			desc_num = igb_tx_bind(tx_ring, tcb, current_mp,
			    current_len);
		}

		if (desc_num > 0)
			desc_total += desc_num;
		else if (desc_num < 0)
			goto tx_failure;

		current_mp = next_mp;
		current_len = next_len;
		current_flag = next_flag;
	}

	/*
	 * Attach the mblk to the last tx control block
	 */
	ASSERT(tcb);
	ASSERT(tcb->mp == NULL);
	tcb->mp = mp;

	/*
	 * Before filling the tx descriptor ring with the data, we need to
	 * ensure there are adequate free descriptors for transmit
	 * (including one context descriptor).
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		tx_ring->tx_recycle(tx_ring);
	}

	mutex_enter(&tx_ring->tx_lock);

	/*
	 * If the number of free tx descriptors is not enough for transmit,
	 * then return failure.
	 *
	 * Note: we must put this check under the mutex protection to
	 * ensure the correctness when multiple threads access it in
	 * parallel.
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		IGB_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
		mutex_exit(&tx_ring->tx_lock);
		goto tx_failure;
	}

	desc_num = igb_tx_fill_ring(tx_ring, &pending_list, ctx, mbsize);

	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

	mutex_exit(&tx_ring->tx_lock);

	return (B_TRUE);

tx_failure:
	/*
	 * Discard the mblk and free the used resources
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		tcb->mp = NULL;

		igb_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Return the tx control blocks in the pending list to the free list.
	 */
	igb_put_free_list(tx_ring, &pending_list);

	/* Transmit failed, do not drop the mblk, reschedule the transmit */
	tx_ring->reschedule = B_TRUE;

	return (B_FALSE);
}

/*
 * igb_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer
 */
static int
igb_tx_copy(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done)
{
	dma_buffer_t *tx_buf;
	uint32_t desc_num;
	_NOTE(ARGUNUSED(tx_ring));

	tx_buf = &tcb->tx_buf;

	/*
	 * Copy the packet data of the mblk fragment into the
	 * pre-allocated tx buffer, which is maintained by the
	 * tx control block.
	 *
	 * Several mblk fragments can be copied into one tx buffer.
	 * The destination address of the current copied fragment in
	 * the tx buffer is next to the end of the previous copied
	 * fragment.
	 */
	if (len > 0) {
		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

		tx_buf->len += len;
		tcb->frag_num++;
	}

	desc_num = 0;

	/*
	 * If it is the last fragment copied to the current tx buffer,
	 * in other words, if there's no remaining fragment or the remaining
	 * fragment requires a new tx control block to process, we need to
	 * complete the current copy processing by syncing up the current
	 * DMA buffer and saving the descriptor data.
	 */
	if (copy_done) {
		/*
		 * Sync the DMA buffer of the packet data
		 */
		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

		tcb->tx_type = USE_COPY;

		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		igb_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
		desc_num++;
	}

	return (desc_num);
}

/*
 * igb_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
igb_tx_bind(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
	int status, i;
	ddi_dma_cookie_t dma_cookie;
	uint_t ncookies;
	int desc_num;

	/*
	 * Use DMA binding to process the mblk fragment
	 */
	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
	    (caddr_t)mp->b_rptr, len,
	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    0, &dma_cookie, &ncookies);

	if (status != DDI_DMA_MAPPED) {
		IGB_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
		return (-1);
	}

	tcb->frag_num++;
	tcb->tx_type = USE_DMA;
	/*
	 * Each fragment can span several cookies. One cookie will have
	 * one tx descriptor to transmit.
	 */
	desc_num = 0;
	for (i = ncookies; i > 0; i--) {
		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		igb_save_desc(tcb,
		    dma_cookie.dmac_laddress,
		    dma_cookie.dmac_size);

		desc_num++;

		if (i > 1)
			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
	}

	return (desc_num);
}

/*
 * igb_get_tx_context
 *
 * Get the tx context information from the mblk
 */
static int
igb_get_tx_context(mblk_t *mp, tx_context_t *ctx)
{
	uint32_t start;
	uint32_t flags;
	uint32_t lso_flag;
	uint32_t mss;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;
	uint32_t l4_hdr_len;

	ASSERT(mp != NULL);

	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);
	bzero(ctx, sizeof (tx_context_t));

	ctx->hcksum_flags = flags;

	if (flags == 0)
		return (TX_CXT_SUCCESS);

	lso_info_get(mp, &mss, &lso_flag);
	ctx->mss = mss;
	ctx->lso_flag = (lso_flag == HW_LSO);

	/*
	 * LSO relies on tx h/w checksum, so here the packet will be
	 * dropped if the h/w checksum flags are not set.
	 */
	if (ctx->lso_flag) {
		if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
		    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
			IGB_DEBUGLOG_0(NULL, "igb_tx: h/w "
			    "checksum flags are not set for LSO");
			return (TX_CXT_E_LSO_CSUM);
		}
	}

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * First get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 */
	size = len = MBLKL(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we assume the IP(V6) header is fully included in one
	 * mblk fragment.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		offset = mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		if (ctx->lso_flag) {
			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
			    ipha_length))) = 0;

			/*
			 * To utilize igb LSO, the TCP checksum field of
			 * the packet needs to be filled with the
			 * following pseudo-header checksum:
			 * (ip_source_addr, ip_destination_addr, l4_proto)
			 * and the IP header checksum needs to be filled
			 * with zero. Currently the tcp/ip stack has done
			 * these.
			 */
		}

		l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
		break;
	case ETHERTYPE_IPV6:
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IGB_DEBUGLOG_0(NULL, "Ethernet type field error with "
		    "tx hcksum flag set");
		return (TX_CXT_E_ETHER_TYPE);
	}

	if (ctx->lso_flag) {
		offset = mac_hdr_len + start;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
	} else {
		/*
		 * l4 header length is only required for LSO
		 */
		l4_hdr_len = 0;
	}

	ctx->mac_hdr_len = mac_hdr_len;
	ctx->ip_hdr_len = start;
	ctx->l4_proto = l4_proto;
	ctx->l4_hdr_len = l4_hdr_len;

	return (TX_CXT_SUCCESS);
}

/*
 * igb_check_tx_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
igb_check_tx_context(igb_tx_ring_t *tx_ring, tx_context_t *ctx)
{
	tx_context_t *last;

	if (ctx == NULL)
		return (B_FALSE);

	/*
	 * Compare the context data retrieved from the mblk with the
	 * stored context data of the last context descriptor. The fields
	 * that need to be checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	mss (only checked for LSO)
	 *	l4_hdr_len (only checked for LSO)
	 *	ip_hdr_len
	 *	mac_hdr_len
	 * If any one of the above fields has changed, a new context
	 * descriptor will be needed.
	 */
	last = &tx_ring->tx_context;

	if (ctx->hcksum_flags != 0) {
		if ((ctx->hcksum_flags != last->hcksum_flags) ||
		    (ctx->l4_proto != last->l4_proto) ||
		    (ctx->lso_flag && ((ctx->mss != last->mss) ||
		    (ctx->l4_hdr_len != last->l4_hdr_len))) ||
		    (ctx->ip_hdr_len != last->ip_hdr_len) ||
		    (ctx->mac_hdr_len != last->mac_hdr_len)) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * igb_fill_tx_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
igb_fill_tx_context(struct e1000_adv_tx_context_desc *ctx_tbd,
    tx_context_t *ctx, uint32_t ring_index)
{
	/*
	 * Fill the context descriptor with the checksum
	 * context information we've got
	 */
	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
	    E1000_ADVTXD_MACLEN_SHIFT;

	ctx_tbd->type_tucmd_mlhl =
	    E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;

	if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
		ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;

	if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
		switch (ctx->l4_proto) {
		case IPPROTO_TCP:
			ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
			break;
		case IPPROTO_UDP:
			/*
			 * We don't have to explicitly set:
			 *	ctx_tbd->type_tucmd_mlhl |=
			 *	    E1000_ADVTXD_TUCMD_L4T_UDP;
			 * Because E1000_ADVTXD_TUCMD_L4T_UDP == 0b
			 */
			break;
		default:
			/* Unrecoverable error */
			IGB_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
			break;
		}
	}

	ctx_tbd->seqnum_seed = 0;
	ctx_tbd->mss_l4len_idx = ring_index << 4;
	if (ctx->lso_flag) {
		ctx_tbd->mss_l4len_idx |=
		    (ctx->l4_hdr_len << E1000_ADVTXD_L4LEN_SHIFT) |
		    (ctx->mss << E1000_ADVTXD_MSS_SHIFT);
	}
}

/*
 * igb_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
    tx_context_t *ctx, size_t mbsize)
{
	struct e1000_hw *hw = &tx_ring->igb->hw;
	boolean_t load_context;
	uint32_t index, tcb_index, desc_num;
	union e1000_adv_tx_desc *tbd, *first_tbd;
	tx_control_block_t *tcb, *first_tcb;
	uint32_t hcksum_flags;
	int i;
	igb_t *igb = tx_ring->igb;

	ASSERT(mutex_owned(&tx_ring->tx_lock));

	tbd = NULL;
	first_tbd = NULL;
	first_tcb = NULL;
	desc_num = 0;
	hcksum_flags = 0;
	load_context = B_FALSE;

	/*
	 * Get the index of the first tx descriptor that will be filled,
	 * and the index of the first work list item that will be attached
	 * with the first used tx control block in the pending list.
	 * Note: the two indexes are the same.
	 */
	index = tx_ring->tbd_tail;
	tcb_index = tx_ring->tbd_tail;

	if (ctx != NULL) {
		hcksum_flags = ctx->hcksum_flags;

		/*
		 * Check if a new context descriptor is needed for this packet
		 */
		load_context = igb_check_tx_context(tx_ring, ctx);
		if (load_context) {
			first_tcb = (tx_control_block_t *)
			    LIST_GET_HEAD(pending_list);
			tbd = &tx_ring->tbd_ring[index];

			/*
			 * Fill the context descriptor with the
			 * hardware checksum offload information.
			 */
			igb_fill_tx_context(
			    (struct e1000_adv_tx_context_desc *)tbd,
			    ctx, tx_ring->index);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;

			/*
			 * Store the checksum context data if
			 * a new context descriptor is added
			 */
			tx_ring->tx_context = *ctx;
		}
	}

	first_tbd = &tx_ring->tbd_ring[index];

	/*
	 * Fill tx data descriptors with the data saved in the pending list.
	 * The tx control blocks in the pending list are added to the work list
	 * at the same time.
	 *
	 * The work list is strictly 1:1 corresponding to the descriptor ring.
	 * One item of the work list corresponds to one tx descriptor. Because
	 * one tx control block can span multiple tx descriptors, the tx
	 * control block will be added to the first work list item that
	 * corresponds to the first tx descriptor generated from that tx
	 * control block.
	 */
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {

		for (i = 0; i < tcb->desc_num; i++) {
			tbd = &tx_ring->tbd_ring[index];

			tbd->read.buffer_addr = tcb->desc[i].address;
			tbd->read.cmd_type_len = tcb->desc[i].length;

			tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_RS |
			    E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_DATA |
			    E1000_ADVTXD_DCMD_IFCS;

			tbd->read.olinfo_status = 0;

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
		}

		if (first_tcb != NULL) {
			/*
			 * Count the checksum context descriptor for
			 * the first tx control block.
			 */
			first_tcb->desc_num++;
			first_tcb = NULL;
		}

		/*
		 * Add the tx control block to the work list
		 */
		ASSERT(tx_ring->work_list[tcb_index] == NULL);
		tx_ring->work_list[tcb_index] = tcb;

		tcb_index = index;
		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	/*
	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
	 * valid in the first descriptor of the packet.
	 * The 82576 also requires the payload length setting even without LSO.
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_IFCS;
	if (ctx != NULL && ctx->lso_flag) {
		first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
		first_tbd->read.olinfo_status |=
		    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
		    - ctx->l4_hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT;
	} else {
		if (hw->mac.type >= e1000_82576) {
			first_tbd->read.olinfo_status |=
			    (mbsize << E1000_ADVTXD_PAYLEN_SHIFT);
		}
	}

	/* Set hardware checksum bits */
	if (hcksum_flags != 0) {
		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
			first_tbd->read.olinfo_status |=
			    E1000_TXD_POPTS_IXSM << 8;
		if (hcksum_flags & HCK_PARTIALCKSUM)
			first_tbd->read.olinfo_status |=
			    E1000_TXD_POPTS_TXSM << 8;
		first_tbd->read.olinfo_status |= tx_ring->index << 4;
	}

	/*
	 * The last descriptor of the packet needs the End Of Packet (EOP)
	 * and Report Status (RS) bits set.
	 */
	ASSERT(tbd != NULL);
	tbd->read.cmd_type_len |=
	    E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS;

	IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt);

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

	/*
	 * Update the number of the free tx descriptors.
	 * The mutual exclusion between the transmission and the recycling
	 * (for the tx descriptor ring and the work list) is implemented
	 * with the atomic operation on the number of the free tx descriptors.
	 *
	 * Note: we should always decrement the counter tbd_free before
	 * advancing the hardware TDT pointer, to avoid the race condition
	 * where the descriptors are transmitted and recycled (which
	 * increases tbd_free) before the counter has been decremented here.
	 */
	i = igb_atomic_reserve(&tx_ring->tbd_free, desc_num);
	ASSERT(i >= 0);

	tx_ring->tbd_tail = index;

	/*
	 * Advance the hardware TDT pointer of the tx descriptor ring
	 */
	E1000_WRITE_REG(hw, E1000_TDT(tx_ring->index), index);

	if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
	}

	return (desc_num);
}

/*
 * igb_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
igb_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
	sw_desc_t *desc;

	desc = &tcb->desc[tcb->desc_num];
	desc->address = address;
	desc->length = length;

	tcb->desc_num++;
}

/*
 * igb_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
igb_tx_recycle_legacy(igb_tx_ring_t *tx_ring)
{
	uint32_t index, last_index;
	int desc_num;
	boolean_t desc_done;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	igb_t *igb = tx_ring->igb;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */

	tcb = tx_ring->work_list[index];
	ASSERT(tcb != NULL);

	desc_done = B_TRUE;
	while (desc_done && (tcb != NULL)) {

		/*
		 * Get the last tx descriptor of the tx control block.
		 * If the last tx descriptor is done, it is done with
		 * all the tx descriptors of the tx control block.
		 * Then the tx control block and all the corresponding
		 * tx descriptors can be recycled.
		 */
		last_index = NEXT_INDEX(index, tcb->desc_num - 1,
		    tx_ring->ring_size);

		/*
		 * Check if the Descriptor Done bit is set
		 */
		desc_done = tx_ring->tbd_ring[last_index].wb.status &
		    E1000_TXD_STAT_DD;
		if (desc_done) {
			/*
			 * Strip off the tx control block from the work list,
			 * and add it to the pending list.
			 */
			tx_ring->work_list[index] = NULL;
			LIST_PUSH_TAIL(&pending_list, &tcb->link);

			/*
			 * Count the total number of the tx descriptors
			 * recycled
			 */
			desc_num += tcb->desc_num;

			/*
			 * Advance the index of the tx descriptor ring
			 */
			index = NEXT_INDEX(last_index, 1, tx_ring->ring_size);

			tcb = tx_ring->work_list[index];
		}
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb != NULL) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		igb_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	igb_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * igb_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
igb_tx_recycle_head_wb(igb_tx_ring_t *tx_ring)
{
	uint32_t index;
	uint32_t head_wb;
	int desc_num;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	igb_t *igb = tx_ring->igb;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 *
	 * Note: For head write-back mode, the tx descriptors will not
	 * be written back, but the head write-back value is stored at
	 * the last extra tbd at the end of the DMA area, so we still
	 * need to sync that head write-back value for the kernel.
	 *
	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
	 */
	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
	    sizeof (union e1000_adv_tx_desc) * tx_ring->ring_size,
	    sizeof (uint32_t),
	    DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Next index to clean */

	/*
	 * Get the value of head write-back
	 */
	head_wb = *tx_ring->tbd_head_wb;
	while (index != head_wb) {
		tcb = tx_ring->work_list[index];
		ASSERT(tcb != NULL);

		if (OFFSET(index, head_wb, tx_ring->ring_size) <
		    tcb->desc_num) {
			/*
			 * The current tx control block is not
			 * completely transmitted, stop recycling
			 */
			break;
		}

		/*
		 * Strip off the tx control block from the work list,
		 * and add it to the pending list.
		 */
		tx_ring->work_list[index] = NULL;
		LIST_PUSH_TAIL(&pending_list, &tcb->link);

		/*
		 * Advance the index of the tx descriptor ring
		 */
		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

		/*
		 * Count the total number of the tx descriptors recycled
		 */
		desc_num += tcb->desc_num;
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		igb_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	igb_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * igb_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and resetting
 * other control fields.
 */
void
igb_free_tcb(tx_control_block_t *tcb)
{
	switch (tcb->tx_type) {
	case USE_COPY:
		/*
		 * Reset the buffer length that is used for copy
		 */
		tcb->tx_buf.len = 0;
		break;
	case USE_DMA:
		/*
		 * Release the DMA resource that is used for
		 * DMA binding.
		 */
		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
		break;
	default:
		break;
	}

	/*
	 * Free the mblk
	 */
	if (tcb->mp != NULL) {
		freemsg(tcb->mp);
		tcb->mp = NULL;
	}

	tcb->tx_type = USE_NONE;
	tcb->frag_num = 0;
	tcb->desc_num = 0;
}

/*
 * igb_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of the available tx control blocks
 * in the free list is used to keep this routine mutually exclusive with
 * the routine igb_put_free_list.
 */
static tx_control_block_t *
igb_get_free_list(igb_tx_ring_t *tx_ring)
{
	tx_control_block_t *tcb;

	/*
	 * Check and update the number of the free tx control blocks
	 * in the free list.
	 */
	if (igb_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
		return (NULL);

	mutex_enter(&tx_ring->tcb_head_lock);

	tcb = tx_ring->free_list[tx_ring->tcb_head];
	ASSERT(tcb != NULL);
	tx_ring->free_list[tx_ring->tcb_head] = NULL;
	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
	    tx_ring->free_list_size);

	mutex_exit(&tx_ring->tcb_head_lock);

	return (tcb);
}

/*
 * igb_put_free_list
 *
 * Put a list of used tx control blocks back on the free list
 *
 * A mutex is used here to ensure serialization. The mutual exclusion
 * between igb_get_free_list and igb_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
igb_put_free_list(igb_tx_ring_t *tx_ring, link_list_t *pending_list)
{
	uint32_t index;
	int tcb_num;
	tx_control_block_t *tcb;

	mutex_enter(&tx_ring->tcb_tail_lock);

	index = tx_ring->tcb_tail;

	tcb_num = 0;
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {
		ASSERT(tx_ring->free_list[index] == NULL);
		tx_ring->free_list[index] = tcb;

		tcb_num++;

		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	tx_ring->tcb_tail = index;

	/*
	 * Update the number of the free tx control blocks
	 * in the free list. This operation must be placed
	 * under the protection of the lock.
	 */
	atomic_add_32(&tx_ring->tcb_free, tcb_num);

	mutex_exit(&tx_ring->tcb_tail_lock);
}