/*
 * CDDL HEADER START
 *
 * Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at:
 *	http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When using or redistributing this file, you may do so under the
 * License only. No other modification of this header is permitted.
 *
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms of the CDDL.
 */

#include "igb_sw.h"

static boolean_t igb_tx(igb_tx_ring_t *, mblk_t *);
static int igb_tx_copy(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int igb_tx_bind(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int igb_tx_fill_ring(igb_tx_ring_t *, link_list_t *, hcksum_context_t *);
static void igb_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *);

static void igb_get_hcksum_context(mblk_t *, hcksum_context_t *);
static boolean_t igb_check_hcksum_context(igb_tx_ring_t *, hcksum_context_t *);
static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *,
    hcksum_context_t *, uint32_t);

#ifndef IGB_DEBUG
#pragma inline(igb_save_desc)
#pragma inline(igb_get_hcksum_context)
#pragma inline(igb_check_hcksum_context)
#pragma inline(igb_fill_hcksum_context)
#endif

mblk_t *
igb_tx_ring_send(void *arg, mblk_t *mp)
{
	igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg;

	ASSERT(tx_ring != NULL);

	return ((igb_tx(tx_ring, mp)) ? NULL : mp);
}

/*
 * igb_tx - Main transmit processing
 *
 * Called from igb_m_tx with an mblk ready to transmit.  This
 * routine sets up the transmit descriptors and sends the data
 * to the wire.
 *
 * One mblk can consist of several fragments, and each fragment
 * is processed with a method chosen based on its size.  Fragments
 * smaller than the bcopy threshold are processed with bcopy;
 * larger fragments are processed with DMA binding.
 *
 * To process the mblk, a tx control block is obtained from the
 * free list.  One tx control block contains one tx buffer, which
 * is used to copy mblk fragments' data, and one tx DMA handle,
 * which is used to bind a mblk fragment with DMA resources.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer is transmitted with one
 * tx descriptor.
 *
 * A large fragment only binds with one tx control block's DMA
 * handle, and it can span several tx descriptors for transmitting.
 *
 * So to transmit a packet (mblk), several tx control blocks can
 * be used.  After the processing, those tx control blocks are
 * put on the work list.
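 *
 * For example (illustrative only; the actual threshold is the tunable
 * tx_ring->copy_thresh), a 3-fragment mblk with lengths of 14, 20 and
 * 1460 bytes and a copy threshold of 256 would have its first two
 * fragments bcopy'd into one tx buffer (one descriptor) and its last
 * fragment DMA-bound (one or more descriptors).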
 */
static boolean_t
igb_tx(igb_tx_ring_t *tx_ring, mblk_t *mp)
{
	igb_t *igb = tx_ring->igb;
	tx_type_t current_flag, next_flag;
	uint32_t current_len, next_len;
	uint32_t desc_total;
	size_t mbsize;
	int desc_num;
	boolean_t copy_done, eop;
	mblk_t *current_mp, *next_mp, *nmp;
	tx_control_block_t *tcb;
	hcksum_context_t hcksum_context, *hcksum;
	link_list_t pending_list;

	/* Get the mblk size */
	mbsize = 0;
	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		mbsize += MBLK_LEN(nmp);
	}

	/*
	 * If the mblk size exceeds the max frame size,
	 * discard this mblk, and return B_TRUE
	 */
	if (mbsize > (igb->max_frame_size - ETHERFCSL)) {
		freemsg(mp);
		IGB_DEBUGLOG_0(igb, "igb_tx: packet oversize");
		return (B_TRUE);
	}

	/*
	 * Check and recycle tx descriptors.
	 * The recycle threshold here should be selected carefully
	 */
	if (tx_ring->tbd_free < tx_ring->recycle_thresh)
		tx_ring->tx_recycle(tx_ring);

	/*
	 * After the recycling, if the tbd_free is less than the
	 * overload_threshold, assert overload and return B_FALSE;
	 * the transmit then needs to be rescheduled.
	 */
	if (tx_ring->tbd_free < tx_ring->overload_thresh) {
		tx_ring->reschedule = B_TRUE;
		IGB_DEBUG_STAT(tx_ring->stat_overload);
		return (B_FALSE);
	}

	/*
	 * The pending_list is a linked list that is used to save
	 * the tx control blocks that have packet data processed
	 * but not yet placed on the tx descriptor ring.  It is used
	 * to reduce the lock contention on the tx_lock.
	 */
	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	desc_total = 0;

	current_mp = mp;
	current_len = MBLK_LEN(current_mp);
	/*
	 * Decide which method to use for the first fragment
	 */
	current_flag = (current_len <= tx_ring->copy_thresh) ?
	    USE_COPY : USE_DMA;
	/*
	 * If the mblk includes several contiguous small fragments,
	 * they may be copied into one buffer.  This flag is used to
	 * indicate whether there are pending fragments that need to
	 * be copied to the current tx buffer.
	 *
	 * If this flag is B_TRUE, it indicates that a new tx control
	 * block is needed to process the next fragment using either
	 * copy or DMA binding.
	 *
	 * Otherwise, it indicates that the next fragment will be
	 * copied to the current tx buffer that is maintained by the
	 * current tx control block.  No new tx control block is needed.
	 */
	copy_done = B_TRUE;
	while (current_mp) {
		next_mp = current_mp->b_cont;
		eop = (next_mp == NULL);	/* Last fragment of the packet? */
		next_len = eop ? 0 : MBLK_LEN(next_mp);

		/*
		 * When the current fragment is an empty fragment, if
		 * the next fragment will still be copied to the current
		 * tx buffer, we cannot skip this fragment here, because
		 * the copy processing is still pending for completion.
		 * We have to process this empty fragment in the tx_copy
		 * routine.
		 *
		 * If the copy processing is completed or a DMA binding
		 * processing has just completed, we can simply skip this
		 * empty fragment.
		 */
		if ((current_len == 0) && (copy_done)) {
			current_mp = next_mp;
			current_len = next_len;
			current_flag = (current_len <= tx_ring->copy_thresh) ?
			    USE_COPY : USE_DMA;
			continue;
		}

		if (copy_done) {
			/*
			 * Get a new tx control block from the free list
			 */
			tcb = igb_get_free_list(tx_ring);

			if (tcb == NULL) {
				IGB_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
				goto tx_failure;
			}

			/*
			 * Push the tx control block to the pending list
			 * to avoid taking the lock too early
			 */
			LIST_PUSH_TAIL(&pending_list, &tcb->link);
		}

		if (current_flag == USE_COPY) {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment, and if using bcopy, whether we
			 * need to continue copying the next fragment into the
			 * current tx buffer.
			 */
			ASSERT((tcb->tx_buf.len + current_len) <=
			    tcb->tx_buf.size);

			if (eop) {
				/*
				 * This is the last fragment of the packet, so
				 * the copy processing will be completed with
				 * this fragment.
				 */
				next_flag = USE_NONE;
				copy_done = B_TRUE;
			} else if ((tcb->tx_buf.len + current_len + next_len) >
			    tcb->tx_buf.size) {
				/*
				 * If the next fragment is too large to be
				 * copied to the current tx buffer, we need
				 * to complete the current copy processing.
				 */
				next_flag = (next_len > tx_ring->copy_thresh) ?
				    USE_DMA : USE_COPY;
				copy_done = B_TRUE;
			} else if (next_len > tx_ring->copy_thresh) {
				/*
				 * The next fragment needs to be processed with
				 * DMA binding, so the copy processing will be
				 * completed with the current fragment.
				 */
				next_flag = USE_DMA;
				copy_done = B_TRUE;
			} else {
				/*
				 * Continue to copy the next fragment to the
				 * current tx buffer.
				 */
				next_flag = USE_COPY;
				copy_done = B_FALSE;
			}

			desc_num = igb_tx_copy(tx_ring, tcb, current_mp,
			    current_len, copy_done);
		} else {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment.
			 */
			next_flag = (next_len > tx_ring->copy_thresh) ?
			    USE_DMA : USE_COPY;
			ASSERT(copy_done == B_TRUE);

			desc_num = igb_tx_bind(tx_ring, tcb, current_mp,
			    current_len);
		}

		if (desc_num > 0)
			desc_total += desc_num;
		else if (desc_num < 0)
			goto tx_failure;

		current_mp = next_mp;
		current_len = next_len;
		current_flag = next_flag;
	}

	/*
	 * Attach the mblk to the last tx control block
	 */
	ASSERT(tcb);
	ASSERT(tcb->mp == NULL);
	tcb->mp = mp;

	if (igb->tx_hcksum_enable) {
		/*
		 * Retrieve the checksum context information from the mblk;
		 * it will be used to decide whether and how to fill the
		 * context descriptor.
		 */
		hcksum = &hcksum_context;
		igb_get_hcksum_context(mp, hcksum);
	} else {
		hcksum = NULL;
	}

	/*
	 * Before filling the tx descriptor ring with the data, we need to
	 * ensure there are adequate free descriptors for transmit
	 * (including one context descriptor).
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		tx_ring->tx_recycle(tx_ring);
	}

	mutex_enter(&tx_ring->tx_lock);

	/*
	 * If the number of free tx descriptors is not enough for transmit,
	 * then return failure.
	 *
	 * Note: we must put this check under the protection of the mutex
	 * to ensure correctness when multiple threads access it in
	 * parallel.
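	 *
	 * The "+ 1" reserves room for the context descriptor that
	 * igb_tx_fill_ring() may add in front of the data descriptors.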
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		IGB_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
		mutex_exit(&tx_ring->tx_lock);
		goto tx_failure;
	}

	desc_num = igb_tx_fill_ring(tx_ring, &pending_list, hcksum);

	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

	mutex_exit(&tx_ring->tx_lock);

	return (B_TRUE);

tx_failure:
	/*
	 * Discard the mblk and free the used resources
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		tcb->mp = NULL;

		igb_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Return the tx control blocks in the pending list to the free list.
	 */
	igb_put_free_list(tx_ring, &pending_list);

	/* Transmit failed, do not drop the mblk, reschedule the transmit */
	tx_ring->reschedule = B_TRUE;

	return (B_FALSE);
}

/*
 * igb_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer
 */
static int
igb_tx_copy(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done)
{
	dma_buffer_t *tx_buf;
	uint32_t desc_num;
	_NOTE(ARGUNUSED(tx_ring));

	tx_buf = &tcb->tx_buf;

	/*
	 * Copy the packet data of the mblk fragment into the
	 * pre-allocated tx buffer, which is maintained by the
	 * tx control block.
	 *
	 * Several mblk fragments can be copied into one tx buffer.
	 * The destination address of the current copied fragment in
	 * the tx buffer is next to the end of the previous copied
	 * fragment.
	 */
	if (len > 0) {
		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

		tx_buf->len += len;
		tcb->frag_num++;
	}

	desc_num = 0;

	/*
	 * If it is the last fragment copied to the current tx buffer,
	 * in other words, if there's no remaining fragment or the remaining
	 * fragment requires a new tx control block to process, we need to
	 * complete the current copy processing by syncing up the current
	 * DMA buffer and saving the descriptor data.
	 */
	if (copy_done) {
		/*
		 * Sync the DMA buffer of the packet data
		 */
		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

		tcb->tx_type = USE_COPY;

		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		igb_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
		desc_num++;
	}

	return (desc_num);
}

/*
 * igb_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
igb_tx_bind(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
	int status, i;
	ddi_dma_cookie_t dma_cookie;
	uint_t ncookies;
	int desc_num;

	/*
	 * Use DMA binding to process the mblk fragment
	 */
	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
	    (caddr_t)mp->b_rptr, len,
	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    0, &dma_cookie, &ncookies);

	if (status != DDI_DMA_MAPPED) {
		IGB_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
		return (-1);
	}

	tcb->frag_num++;
	tcb->tx_type = USE_DMA;
	/*
	 * Each fragment can span several cookies.  One cookie will have
	 * one tx descriptor to transmit.
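	 *
	 * (Each cookie describes one physically contiguous segment of the
	 * fragment, so a fragment that crosses page boundaries may be
	 * returned as several cookies, each consuming a descriptor.)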
	 */
	desc_num = 0;
	for (i = ncookies; i > 0; i--) {
		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		igb_save_desc(tcb,
		    dma_cookie.dmac_laddress,
		    dma_cookie.dmac_size);

		desc_num++;

		if (i > 1)
			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
	}

	return (desc_num);
}

/*
 * igb_get_hcksum_context
 *
 * Get the hcksum context information from the mblk
 */
static void
igb_get_hcksum_context(mblk_t *mp, hcksum_context_t *hcksum)
{
	uint32_t start;
	uint32_t flags;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;

	ASSERT(mp != NULL);

	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);

	hcksum->hcksum_flags = flags;

	if (flags == 0)
		return;

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * First, get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 */
	size = len = MBLK_LEN(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLK_LEN(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in the VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment, so we go through the fragments to parse
	 * the protocol type.
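	 *
	 * As above, "size" is the total number of bytes seen up to and
	 * including the current fragment and "len" is the length of the
	 * current fragment, so (offset + len - size) is the offset of the
	 * target byte within the current fragment.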
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IGB_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
		return;
	}

	hcksum->mac_hdr_len = mac_hdr_len;
	hcksum->ip_hdr_len = start;
	hcksum->l4_proto = l4_proto;
}

/*
 * igb_check_hcksum_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
igb_check_hcksum_context(igb_tx_ring_t *tx_ring, hcksum_context_t *hcksum)
{
	hcksum_context_t *last;

	if (hcksum == NULL)
		return (B_FALSE);

	/*
	 * Compare the checksum data retrieved from the mblk with the
	 * stored checksum data of the last context descriptor.  The
	 * fields that need to be checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	mac_hdr_len
	 *	ip_hdr_len
	 * If any one of these fields has changed, a new context descriptor
	 * is needed.
	 */
	last = &tx_ring->hcksum_context;

	if (hcksum->hcksum_flags != 0) {
		if ((hcksum->hcksum_flags != last->hcksum_flags) ||
		    (hcksum->l4_proto != last->l4_proto) ||
		    (hcksum->mac_hdr_len != last->mac_hdr_len) ||
		    (hcksum->ip_hdr_len != last->ip_hdr_len)) {

			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * igb_fill_hcksum_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd,
    hcksum_context_t *hcksum, uint32_t ring_index)
{
	/*
	 * Fill the context descriptor with the checksum
	 * context information we've got
	 */
	ctx_tbd->vlan_macip_lens = hcksum->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= hcksum->mac_hdr_len <<
	    E1000_ADVTXD_MACLEN_SHIFT;

	ctx_tbd->type_tucmd_mlhl =
	    E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;

	if (hcksum->hcksum_flags & HCK_IPV4_HDRCKSUM)
		ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;

	if (hcksum->hcksum_flags & HCK_PARTIALCKSUM) {
		switch (hcksum->l4_proto) {
		case IPPROTO_TCP:
			ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
			break;
		case IPPROTO_UDP:
			/*
			 * We don't have to explicitly set:
			 *	ctx_tbd->type_tucmd_mlhl |=
			 *	    E1000_ADVTXD_TUCMD_L4T_UDP;
			 * Because E1000_ADVTXD_TUCMD_L4T_UDP == 0b
			 */
			break;
		default:
			/* Unrecoverable error */
			IGB_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
			break;
		}
	}

	ctx_tbd->seqnum_seed = 0;
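	/*
	 * The ring index goes into the index (IDX) field of mss_l4len_idx;
	 * igb_tx_fill_ring() writes the same index into olinfo_status of
	 * the data descriptors, so the data descriptors and the checksum
	 * context loaded here stay associated with the same ring.
	 */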
	ctx_tbd->mss_l4len_idx = ring_index << 4;
}

/*
 * igb_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
    hcksum_context_t *hcksum)
{
	struct e1000_hw *hw = &tx_ring->igb->hw;
	boolean_t load_context;
	uint32_t index, tcb_index, desc_num;
	union e1000_adv_tx_desc *tbd, *first_tbd;
	tx_control_block_t *tcb, *first_tcb;
	uint32_t hcksum_flags;
	uint32_t pay_len;
	int i;
	igb_t *igb = tx_ring->igb;

	ASSERT(mutex_owned(&tx_ring->tx_lock));

	tbd = NULL;
	first_tbd = NULL;
	first_tcb = NULL;
	desc_num = 0;
	hcksum_flags = 0;
	pay_len = 0;
	load_context = B_FALSE;

	/*
	 * Get the index of the first tx descriptor that will be filled,
	 * and the index of the first work list item that will be attached
	 * to the first used tx control block in the pending list.
	 * Note: the two indexes are the same.
	 */
	index = tx_ring->tbd_tail;
	tcb_index = tx_ring->tbd_tail;

	if (hcksum != NULL) {
		hcksum_flags = hcksum->hcksum_flags;

		/*
		 * Check if a new context descriptor is needed for this packet
		 */
		load_context = igb_check_hcksum_context(tx_ring, hcksum);
		if (load_context) {
			first_tcb = (tx_control_block_t *)
			    LIST_GET_HEAD(pending_list);
			tbd = &tx_ring->tbd_ring[index];

			/*
			 * Fill the context descriptor with the
			 * hardware checksum offload information.
			 */
			igb_fill_hcksum_context(
			    (struct e1000_adv_tx_context_desc *)tbd, hcksum,
			    tx_ring->index);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;

			/*
			 * Store the checksum context data if
			 * a new context descriptor is added
			 */
			tx_ring->hcksum_context = *hcksum;
		}
	}

	first_tbd = &tx_ring->tbd_ring[index];

	/*
	 * Fill tx data descriptors with the data saved in the pending list.
	 * The tx control blocks in the pending list are added to the work list
	 * at the same time.
	 *
	 * The work list corresponds strictly 1:1 to the descriptor ring.
	 * One item of the work list corresponds to one tx descriptor.  Because
	 * one tx control block can span multiple tx descriptors, the tx
	 * control block is added to the first work list item that
	 * corresponds to the first tx descriptor generated from that tx
	 * control block.
	 */
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {

		for (i = 0; i < tcb->desc_num; i++) {
			tbd = &tx_ring->tbd_ring[index];

			tbd->read.buffer_addr = tcb->desc[i].address;
			tbd->read.cmd_type_len = tcb->desc[i].length;

			tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_RS |
			    E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_DATA |
			    E1000_ADVTXD_DCMD_IFCS;

			tbd->read.olinfo_status = 0;

			pay_len += tcb->desc[i].length;

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
		}

		if (first_tcb != NULL) {
			/*
			 * Count the checksum context descriptor for
			 * the first tx control block.
			 */
			first_tcb->desc_num++;
			first_tcb = NULL;
		}

		/*
		 * Add the tx control block to the work list
		 */
		ASSERT(tx_ring->work_list[tcb_index] == NULL);
		tx_ring->work_list[tcb_index] = tcb;

		tcb_index = index;
		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	/*
	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
	 * valid in the first descriptor of the packet.
	 * The 82576 also requires the payload length setting even without TSO
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_IFCS;
	if (hw->mac.type == e1000_82576) {
		first_tbd->read.olinfo_status =
		    (pay_len << E1000_ADVTXD_PAYLEN_SHIFT);
	}

	/* Set hardware checksum bits */
	if (hcksum_flags != 0) {
		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
			first_tbd->read.olinfo_status |=
			    E1000_TXD_POPTS_IXSM << 8;
		if (hcksum_flags & HCK_PARTIALCKSUM)
			first_tbd->read.olinfo_status |=
			    E1000_TXD_POPTS_TXSM << 8;
		first_tbd->read.olinfo_status |= tx_ring->index << 4;
	}

	/*
	 * The last descriptor of the packet needs the End Of Packet (EOP)
	 * and Report Status (RS) bits set
	 */
	ASSERT(tbd != NULL);
	tbd->read.cmd_type_len |=
	    E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS;

	IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt);

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

	/*
	 * Update the number of the free tx descriptors.
	 * The mutual exclusion between the transmission and the recycling
	 * (for the tx descriptor ring and the work list) is implemented
	 * with the atomic operation on the number of the free tx descriptors.
	 *
	 * Note: we must always decrement the counter tbd_free before
	 * advancing the hardware TDT pointer, to avoid the race where the
	 * transmit of the tx descriptors completes and the tx recycling
	 * increases the counter tbd_free before we have decremented it.
	 */
	i = igb_atomic_reserve(&tx_ring->tbd_free, desc_num);
	ASSERT(i >= 0);

	tx_ring->tbd_tail = index;

	/*
	 * Advance the hardware TDT pointer of the tx descriptor ring
	 */
	E1000_WRITE_REG(hw, E1000_TDT(tx_ring->index), index);

	if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
	}

	return (desc_num);
}

/*
 * igb_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block.  The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
igb_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
	sw_desc_t *desc;

	desc = &tcb->desc[tcb->desc_num];
	desc->address = address;
	desc->length = length;

	tcb->desc_num++;
}

/*
 * igb_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted.  If so, the resources
 * bound to the tx control blocks are freed, and those
 * tx control blocks are returned to the free list.
 */
uint32_t
igb_tx_recycle_legacy(igb_tx_ring_t *tx_ring)
{
	uint32_t index, last_index;
	int desc_num;
	boolean_t desc_done;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	igb_t *igb = tx_ring->igb;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
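	 * If the lock is already held, another thread is recycling this
	 * ring, so it is safe to simply return 0 here.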
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */

	tcb = tx_ring->work_list[index];
	ASSERT(tcb != NULL);

	desc_done = B_TRUE;
	while (desc_done && (tcb != NULL)) {

		/*
		 * Get the last tx descriptor of the tx control block.
		 * If that last tx descriptor is done, then all the tx
		 * descriptors of the tx control block are done.
		 * The tx control block and all the corresponding
		 * tx descriptors can then be recycled.
		 */
		last_index = NEXT_INDEX(index, tcb->desc_num - 1,
		    tx_ring->ring_size);

		/*
		 * Check if the Descriptor Done bit is set
		 */
		desc_done = tx_ring->tbd_ring[last_index].wb.status &
		    E1000_TXD_STAT_DD;
		if (desc_done) {
			/*
			 * Strip off the tx control block from the work list,
			 * and add it to the pending list.
			 */
			tx_ring->work_list[index] = NULL;
			LIST_PUSH_TAIL(&pending_list, &tcb->link);

			/*
			 * Count the total number of the tx descriptors recycled
			 */
			desc_num += tcb->desc_num;

			/*
			 * Advance the index of the tx descriptor ring
			 */
			index = NEXT_INDEX(last_index, 1, tx_ring->ring_size);

			tcb = tx_ring->work_list[index];
		}
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb != NULL) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		igb_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	igb_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * igb_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
igb_tx_recycle_head_wb(igb_tx_ring_t *tx_ring)
{
	uint32_t index;
	uint32_t head_wb;
	int desc_num;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	igb_t *igb = tx_ring->igb;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 *
	 * Note: For head write-back mode, the tx descriptors will not
	 * be written back, but the head write-back value is stored at
	 * the last extra tbd at the end of the DMA area; we still need
	 * to sync that head write-back value for the kernel.
	 *
	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
	 */
	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
	    sizeof (union e1000_adv_tx_desc) * tx_ring->ring_size,
	    sizeof (uint32_t),
	    DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Next index to clean */

	/*
	 * Get the value of head write-back
	 */
	head_wb = *tx_ring->tbd_head_wb;
	while (index != head_wb) {
		tcb = tx_ring->work_list[index];
		ASSERT(tcb != NULL);

		if (OFFSET(index, head_wb, tx_ring->ring_size) <
		    tcb->desc_num) {
			/*
			 * The current tx control block is not
			 * completely transmitted; stop recycling
			 */
			break;
		}

		/*
		 * Strip off the tx control block from the work list,
		 * and add it to the pending list.
		 */
		tx_ring->work_list[index] = NULL;
		LIST_PUSH_TAIL(&pending_list, &tcb->link);

		/*
		 * Advance the index of the tx descriptor ring
		 */
		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

		/*
		 * Count the total number of the tx descriptors recycled
		 */
		desc_num += tcb->desc_num;
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		igb_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	igb_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * igb_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and resetting
 * other control fields.
 */
void
igb_free_tcb(tx_control_block_t *tcb)
{
	switch (tcb->tx_type) {
	case USE_COPY:
		/*
		 * Reset the buffer length that is used for copy
		 */
		tcb->tx_buf.len = 0;
		break;
	case USE_DMA:
		/*
		 * Release the DMA resource that is used for
		 * DMA binding.
		 */
		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
		break;
	default:
		break;
	}

	/*
	 * Free the mblk
	 */
	if (tcb->mp != NULL) {
		freemsg(tcb->mp);
		tcb->mp = NULL;
	}

	tcb->tx_type = USE_NONE;
	tcb->frag_num = 0;
	tcb->desc_num = 0;
}

/*
 * igb_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of available tx control blocks
 * in the free list is used to keep this routine mutually exclusive
 * with the routine igb_put_free_list.
 */
static tx_control_block_t *
igb_get_free_list(igb_tx_ring_t *tx_ring)
{
	tx_control_block_t *tcb;

	/*
	 * Check and update the number of free tx control blocks
	 * in the free list.
	 */
	if (igb_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
		return (NULL);

	mutex_enter(&tx_ring->tcb_head_lock);

	tcb = tx_ring->free_list[tx_ring->tcb_head];
	ASSERT(tcb != NULL);
	tx_ring->free_list[tx_ring->tcb_head] = NULL;
	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
	    tx_ring->free_list_size);

	mutex_exit(&tx_ring->tcb_head_lock);

	return (tcb);
}

/*
 * igb_put_free_list
 *
 * Put a list of used tx control blocks back on the free list
 *
 * A mutex is used here to ensure the serialization.  The mutual exclusion
 * between igb_get_free_list and igb_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
igb_put_free_list(igb_tx_ring_t *tx_ring, link_list_t *pending_list)
{
	uint32_t index;
	int tcb_num;
	tx_control_block_t *tcb;

	mutex_enter(&tx_ring->tcb_tail_lock);

	index = tx_ring->tcb_tail;

	tcb_num = 0;
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {
		ASSERT(tx_ring->free_list[index] == NULL);
		tx_ring->free_list[index] = tcb;

		tcb_num++;

		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	tx_ring->tcb_tail = index;

	/*
	 * Update the number of free tx control blocks
	 * in the free list.  This operation must be placed
	 * under the protection of the lock.
	 */
	atomic_add_32(&tx_ring->tcb_free, tcb_num);

	mutex_exit(&tx_ring->tcb_tail_lock);
}