/*
 * CDDL HEADER START
 *
 * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at:
 *	http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When using or redistributing this file, you may do so under the
 * License only. No other modification of this header is permitted.
 *
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms of the CDDL.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include "ixgbe_sw.h"

static boolean_t ixgbe_tx(ixgbe_tx_ring_t *, mblk_t *);
static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t, boolean_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
    hcksum_context_t *);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);

static void ixgbe_get_hcksum_context(mblk_t *, hcksum_context_t *);
static boolean_t ixgbe_check_hcksum_context(ixgbe_tx_ring_t *,
    hcksum_context_t *);
static void ixgbe_fill_hcksum_context(struct ixgbe_adv_tx_context_desc *,
    hcksum_context_t *);

#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_hcksum_context)
#pragma inline(ixgbe_check_hcksum_context)
#pragma inline(ixgbe_fill_hcksum_context)
#endif

/*
 * ixgbe_m_tx
 *
 * The GLDv3 interface to call the driver's tx routine to transmit
 * the mblks.
 */
mblk_t *
ixgbe_m_tx(void *arg, mblk_t *mp)
{
	ixgbe_t *ixgbe = (ixgbe_t *)arg;
	mblk_t *next;
	ixgbe_tx_ring_t *tx_ring;

	/*
	 * If the adapter is suspended, or it is not started, or the link
	 * is not up, the mblks are simply dropped.
	 */
	if (((ixgbe->ixgbe_state & IXGBE_SUSPENDED) != 0) ||
	    ((ixgbe->ixgbe_state & IXGBE_STARTED) == 0) ||
	    (ixgbe->link_state != LINK_STATE_UP)) {
		/* Free the mblk chain */
		while (mp != NULL) {
			next = mp->b_next;
			mp->b_next = NULL;

			freemsg(mp);
			mp = next;
		}

		return (NULL);
	}

	/*
	 * Decide which tx ring is used to transmit the packets.
	 * This needs to be updated later to fit the new interface
	 * of the multiple rings support.
	 */
	tx_ring = &ixgbe->tx_rings[0];

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;

		if (!ixgbe_tx(tx_ring, mp)) {
			mp->b_next = next;
			break;
		}

		mp = next;
	}

	return (mp);
}

/*
 * ixgbe_tx - Main transmit processing
 *
 * Called from ixgbe_m_tx with an mblk ready to transmit. This
 * routine sets up the transmit descriptors and sends data to
 * the wire.
 *
 * One mblk can consist of several fragments; each fragment is
 * processed with a different method based on its size.
 * Fragments smaller than the bcopy threshold are processed
 * with bcopy; otherwise they are processed with DMA binding.
 *
 * To process the mblk, a tx control block is retrieved from the
 * free list. One tx control block contains one tx buffer, which
 * is used to copy mblk fragments' data; and one tx DMA handle,
 * which is used to bind an mblk fragment with DMA resource.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer will be transmitted with
 * one tx descriptor.
 *
 * A large fragment only binds to one tx control block's DMA
 * handle, and it can span several tx descriptors for transmitting.
 *
 * So to transmit a packet (mblk), several tx control blocks can
 * be used. After the processing, those tx control blocks will
 * be put onto the work list.
 */
static boolean_t
ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
{
	ixgbe_t *ixgbe = tx_ring->ixgbe;
	tx_type_t current_flag, next_flag;
	uint32_t current_len, next_len;
	uint32_t desc_total;
	size_t mbsize;
	int desc_num;
	boolean_t copy_done, eop;
	mblk_t *current_mp, *next_mp, *nmp;
	tx_control_block_t *tcb;
	hcksum_context_t hcksum_context, *hcksum;
	link_list_t pending_list;

	/* Get the mblk size */
	mbsize = 0;
	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		mbsize += MBLK_LEN(nmp);
	}

	/*
	 * If the mblk size exceeds the max frame size,
	 * discard this mblk, and return B_TRUE.
	 */
	if (mbsize > (ixgbe->max_frame_size - ETHERFCSL)) {
		freemsg(mp);
		IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
		return (B_TRUE);
	}

	/*
	 * Check and recycle tx descriptors.
	 * The recycle threshold here should be selected carefully.
	 */
	if (tx_ring->tbd_free < tx_ring->recycle_thresh)
		tx_ring->tx_recycle(tx_ring);

	/*
	 * After the recycling, if the tbd_free is less than the
	 * overload_threshold, assert overload and return B_FALSE;
	 * the tx needs to be re-scheduled later.
	 */
	if (tx_ring->tbd_free < tx_ring->overload_thresh) {
		tx_ring->reschedule = B_TRUE;
		IXGBE_DEBUG_STAT(tx_ring->stat_overload);
		return (B_FALSE);
	}

	/*
	 * The pending_list is a linked list that is used to save
	 * the tx control blocks that have packet data processed
	 * but whose data has not yet been put into the tx descriptor
	 * ring. It is used to reduce the lock contention on tx_lock.
	 */
	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	desc_total = 0;

	current_mp = mp;
	current_len = MBLK_LEN(current_mp);
	/*
	 * Decide which method to use for the first fragment
	 */
	current_flag = (current_len <= tx_ring->copy_thresh) ?
	    USE_COPY : USE_DMA;
	/*
	 * If the mblk includes several contiguous small fragments,
	 * they may be copied into one buffer. This flag is used to
	 * indicate whether there are pending fragments that need to
	 * be copied to the current tx buffer.
	 *
	 * If this flag is B_TRUE, it indicates that a new tx control
	 * block is needed to process the next fragment using either
	 * copy or DMA binding.
	 *
	 * Otherwise, it indicates that the next fragment will be
	 * copied to the current tx buffer that is maintained by the
	 * current tx control block. No new tx control block is needed.
	 */
	copy_done = B_TRUE;
	while (current_mp) {
		next_mp = current_mp->b_cont;
		eop = (next_mp == NULL); /* Last fragment of the packet? */
		next_len = eop ? 0 : MBLK_LEN(next_mp);

		/*
		 * When the current fragment is an empty fragment, if
		 * the next fragment will still be copied to the current
		 * tx buffer, we cannot skip this fragment here, because
		 * the copy processing has not yet completed. We have
		 * to process this empty fragment in the tx_copy routine.
		 *
		 * If the copy processing is completed or a DMA binding
		 * processing has just completed, we can simply skip this
		 * empty fragment.
		 */
		if ((current_len == 0) && (copy_done)) {
			current_mp = next_mp;
			current_len = next_len;
			current_flag = (current_len <= tx_ring->copy_thresh) ?
			    USE_COPY : USE_DMA;
			continue;
		}

		if (copy_done) {
			/*
			 * Get a new tx control block from the free list
			 */
			tcb = ixgbe_get_free_list(tx_ring);

			if (tcb == NULL) {
				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
				goto tx_failure;
			}

			/*
			 * Push the tx control block to the pending list
			 * to avoid taking the lock too early
			 */
			LIST_PUSH_TAIL(&pending_list, &tcb->link);
		}

		if (current_flag == USE_COPY) {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment, and if using bcopy, whether we
			 * need to continue copying the next fragment into the
			 * current tx buffer.
			 */
			ASSERT((tcb->tx_buf.len + current_len) <=
			    tcb->tx_buf.size);

			if (eop) {
				/*
				 * This is the last fragment of the packet, so
				 * the copy processing will be completed with
				 * this fragment.
				 */
				next_flag = USE_NONE;
				copy_done = B_TRUE;
			} else if ((tcb->tx_buf.len + current_len + next_len) >
			    tcb->tx_buf.size) {
				/*
				 * If the next fragment is too large to be
				 * copied to the current tx buffer, we need
				 * to complete the current copy processing.
				 */
				next_flag = (next_len > tx_ring->copy_thresh) ?
				    USE_DMA : USE_COPY;
				copy_done = B_TRUE;
			} else if (next_len > tx_ring->copy_thresh) {
				/*
				 * The next fragment needs to be processed with
				 * DMA binding. So the copy processing will be
				 * completed with the current fragment.
				 */
				next_flag = USE_DMA;
				copy_done = B_TRUE;
			} else {
				/*
				 * Continue to copy the next fragment to the
				 * current tx buffer.
				 */
				next_flag = USE_COPY;
				copy_done = B_FALSE;
			}

			desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
			    current_len, copy_done, eop);
		} else {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment.
			 */
			next_flag = (next_len > tx_ring->copy_thresh) ?
			    USE_DMA : USE_COPY;
			ASSERT(copy_done == B_TRUE);

			desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
			    current_len);
		}

		if (desc_num > 0)
			desc_total += desc_num;
		else if (desc_num < 0)
			goto tx_failure;

		current_mp = next_mp;
		current_len = next_len;
		current_flag = next_flag;
	}

	/*
	 * Attach the mblk to the last tx control block
	 */
	ASSERT(tcb);
	ASSERT(tcb->mp == NULL);
	tcb->mp = mp;

	if (ixgbe->tx_hcksum_enable) {
		/*
		 * Retrieve checksum context information from the mblk.
		 * It will be used to decide whether/how to fill the
		 * context descriptor.
		 */
		hcksum = &hcksum_context;
		ixgbe_get_hcksum_context(mp, hcksum);
	} else {
		hcksum = NULL;
	}

	/*
	 * Before filling the tx descriptor ring with the data, we need to
	 * ensure there are adequate free descriptors for transmit
	 * (including one context descriptor).
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		tx_ring->tx_recycle(tx_ring);
	}

	mutex_enter(&tx_ring->tx_lock);

	/*
	 * If the number of free tx descriptors is not enough for transmit,
	 * then return failure.
	 *
	 * Note: we must put this check under the mutex protection to
	 * ensure the correctness when multiple threads access it in
	 * parallel.
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
		mutex_exit(&tx_ring->tx_lock);
		goto tx_failure;
	}

	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, hcksum);

	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

	mutex_exit(&tx_ring->tx_lock);

	return (B_TRUE);

tx_failure:
	/*
	 * Discard the mblk and free the used resources
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		tcb->mp = NULL;

		ixgbe_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Return the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	/* Transmit failed, do not drop the mblk, reschedule the transmit */
	tx_ring->reschedule = B_TRUE;

	return (B_FALSE);
}

/*
 * ixgbe_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer
 */
static int
ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done, boolean_t eop)
{
	dma_buffer_t *tx_buf;
	uint32_t desc_num;
	_NOTE(ARGUNUSED(tx_ring));

	tx_buf = &tcb->tx_buf;

	/*
	 * Copy the packet data of the mblk fragment into the
	 * pre-allocated tx buffer, which is maintained by the
	 * tx control block.
	 *
	 * Several mblk fragments can be copied into one tx buffer.
	 * The destination address of the current copied fragment in
	 * the tx buffer is next to the end of the previous copied
	 * fragment.
	 */
	if (len > 0) {
		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

		tx_buf->len += len;
		tcb->frag_num++;
	}

	desc_num = 0;

	/*
	 * If it is the last fragment copied to the current tx buffer,
	 * in other words, if there's no remaining fragment or the remaining
	 * fragment requires a new tx control block to process, we need to
	 * complete the current copy processing by syncing up the current
	 * DMA buffer and saving the descriptor data.
	 */
	if (copy_done) {
		/*
		 * For a packet smaller than 64 bytes, we need to
		 * pad it to 60 bytes. The NIC hardware will add 4
		 * bytes of CRC.
		 */
		if (eop && (tx_buf->len < ETHERMIN)) {
			bzero(tx_buf->address + tx_buf->len,
			    ETHERMIN - tx_buf->len);
			tx_buf->len = ETHERMIN;
		}

		/*
		 * Sync the DMA buffer of the packet data
		 */
		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

		tcb->tx_type = USE_COPY;

		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
		desc_num++;
	}

	return (desc_num);
}

/*
 * ixgbe_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
	int status, i;
	ddi_dma_cookie_t dma_cookie;
	uint_t ncookies;
	int desc_num;

	/*
	 * Use DMA binding to process the mblk fragment
	 */
	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
	    (caddr_t)mp->b_rptr, len,
	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    0, &dma_cookie, &ncookies);

	if (status != DDI_DMA_MAPPED) {
		IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
		return (-1);
	}

	tcb->frag_num++;
	tcb->tx_type = USE_DMA;
	/*
	 * Each fragment can span several cookies. Each cookie will use
	 * one tx descriptor to transmit.
	 */
	desc_num = 0;
	for (i = ncookies; i > 0; i--) {
		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		ixgbe_save_desc(tcb,
		    dma_cookie.dmac_laddress,
		    dma_cookie.dmac_size);

		desc_num++;

		if (i > 1)
			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
	}

	return (desc_num);
}

/*
 * ixgbe_get_hcksum_context
 *
 * Get the hcksum context information from the mblk
 */
static void
ixgbe_get_hcksum_context(mblk_t *mp, hcksum_context_t *hcksum)
{
	uint32_t start;
	uint32_t flags;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;

	ASSERT(mp != NULL);

	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);

	hcksum->hcksum_flags = flags;

	if (flags == 0)
		return;

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * First get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 */
	size = len = MBLK_LEN(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLK_LEN(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in the VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment, so we go through the fragments to parse
	 * the protocol type.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
		return;
	}

	hcksum->mac_hdr_len = mac_hdr_len;
	hcksum->ip_hdr_len = start;
	hcksum->l4_proto = l4_proto;
}

/*
 * ixgbe_check_hcksum_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
ixgbe_check_hcksum_context(ixgbe_tx_ring_t *tx_ring, hcksum_context_t *hcksum)
{
	hcksum_context_t *last;

	if (hcksum == NULL)
		return (B_FALSE);

	/*
	 * Compare the checksum data retrieved from the mblk with the
	 * stored checksum data of the last context descriptor. The data
	 * that need to be checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	mac_hdr_len
	 *	ip_hdr_len
	 * If any one of the above items has changed, a new context
	 * descriptor is needed.
	 */
	last = &tx_ring->hcksum_context;

	if (hcksum->hcksum_flags != 0) {
		if ((hcksum->hcksum_flags != last->hcksum_flags) ||
		    (hcksum->l4_proto != last->l4_proto) ||
		    (hcksum->mac_hdr_len != last->mac_hdr_len) ||
		    (hcksum->ip_hdr_len != last->ip_hdr_len)) {

			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * ixgbe_fill_hcksum_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
ixgbe_fill_hcksum_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
    hcksum_context_t *hcksum)
{
	/*
	 * Fill the context descriptor with the checksum
	 * context information we've got
	 */
	ctx_tbd->vlan_macip_lens = hcksum->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= hcksum->mac_hdr_len <<
	    IXGBE_ADVTXD_MACLEN_SHIFT;

	ctx_tbd->type_tucmd_mlhl =
	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;

	if (hcksum->hcksum_flags & HCK_IPV4_HDRCKSUM)
		ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;

	if (hcksum->hcksum_flags & HCK_PARTIALCKSUM) {
		switch (hcksum->l4_proto) {
		case IPPROTO_TCP:
			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
			break;
		case IPPROTO_UDP:
			/*
			 * We don't have to explicitly set:
			 *	ctx_tbd->type_tucmd_mlhl |=
			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
			 */
			break;
		default:
			/* Unrecoverable error */
			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
			break;
		}
	}

	ctx_tbd->seqnum_seed = 0;
	ctx_tbd->mss_l4len_idx = 0;
}

/*
 * ixgbe_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    hcksum_context_t *hcksum)
{
	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
	boolean_t load_context;
	uint32_t index, tcb_index, desc_num;
	union ixgbe_adv_tx_desc *tbd, *first_tbd;
	tx_control_block_t *tcb, *first_tcb;
	uint32_t hcksum_flags;
	int i;

	ASSERT(mutex_owned(&tx_ring->tx_lock));

	tbd = NULL;
	first_tbd = NULL;
	first_tcb = NULL;
	desc_num = 0;
	hcksum_flags = 0;
	load_context = B_FALSE;

	/*
	 * Get the index of the first tx descriptor that will be filled,
	 * and the index of the first work list item that will be attached
	 * with the first used tx control block in the pending list.
	 * Note: the two indexes are the same.
	 */
	index = tx_ring->tbd_tail;
	tcb_index = tx_ring->tbd_tail;

	if (hcksum != NULL) {
		hcksum_flags = hcksum->hcksum_flags;

		/*
		 * Check if a new context descriptor is needed for this packet
		 */
		load_context = ixgbe_check_hcksum_context(tx_ring, hcksum);
		if (load_context) {
			first_tcb = (tx_control_block_t *)
			    LIST_GET_HEAD(pending_list);
			tbd = &tx_ring->tbd_ring[index];

			/*
			 * Fill the context descriptor with the
			 * hardware checksum offload information.
			 */
			ixgbe_fill_hcksum_context(
			    (struct ixgbe_adv_tx_context_desc *)tbd, hcksum);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;

			/*
			 * Store the checksum context data if
			 * a new context descriptor is added
			 */
			tx_ring->hcksum_context = *hcksum;
		}
	}

	first_tbd = &tx_ring->tbd_ring[index];

	/*
	 * Fill tx data descriptors with the data saved in the pending list.
	 * The tx control blocks in the pending list are added to the work list
	 * at the same time.
	 *
	 * The work list corresponds strictly 1:1 to the descriptor ring.
	 * One item of the work list corresponds to one tx descriptor. Because
	 * one tx control block can span multiple tx descriptors, the tx
	 * control block is added to the first work list item that
	 * corresponds to the first tx descriptor generated from that tx
	 * control block.
	 */
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {

		for (i = 0; i < tcb->desc_num; i++) {
			tbd = &tx_ring->tbd_ring[index];

			tbd->read.buffer_addr = tcb->desc[i].address;
			tbd->read.cmd_type_len = tcb->desc[i].length;

			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_RS |
			    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_DATA;

			tbd->read.olinfo_status = 0;

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
		}

		if (first_tcb != NULL) {
			/*
			 * Count the checksum context descriptor for
			 * the first tx control block.
			 */
			first_tcb->desc_num++;
			first_tcb = NULL;
		}

		/*
		 * Add the tx control block to the work list
		 */
		ASSERT(tx_ring->work_list[tcb_index] == NULL);
		tx_ring->work_list[tcb_index] = tcb;

		tcb_index = index;
		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	/*
	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
	 * valid in the first descriptor of the packet.
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;

	/* Set hardware checksum bits */
	if (hcksum_flags != 0) {
		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_TXD_POPTS_IXSM << 8;
		if (hcksum_flags & HCK_PARTIALCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_TXD_POPTS_TXSM << 8;
	}

	/*
	 * The last descriptor of the packet needs the End Of Packet (EOP)
	 * and Report Status (RS) bits set.
	 */
	ASSERT(tbd != NULL);
	tbd->read.cmd_type_len |=
	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	/*
	 * Update the number of the free tx descriptors.
	 * The mutual exclusion between the transmission and the recycling
	 * (for the tx descriptor ring and the work list) is implemented
	 * with the atomic operation on the number of the free tx descriptors.
	 *
	 * Note: the counter tbd_free must always be decremented before the
	 * hardware TDT pointer is advanced. Otherwise there is a race:
	 * the descriptors could be transmitted and tbd_free incremented by
	 * the tx recycling before the decrement here takes place.
	 */
	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
	ASSERT(i >= 0);

	tx_ring->tbd_tail = index;

	/*
	 * Advance the hardware TDT pointer of the tx descriptor ring
	 */
	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);

	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
	    DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	return (desc_num);
}

/*
 * ixgbe_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
	sw_desc_t *desc;

	desc = &tcb->desc[tcb->desc_num];
	desc->address = address;
	desc->length = length;

	tcb->desc_num++;
}

/*
 * ixgbe_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index, last_index;
	int desc_num;
	boolean_t desc_done;
	tx_control_block_t *tcb;
	link_list_t pending_list;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head; /* Index of next tbd/tcb to recycle */

	tcb = tx_ring->work_list[index];
	ASSERT(tcb != NULL);

	desc_done = B_TRUE;
	while (desc_done && (tcb != NULL)) {

		/*
		 * Get the last tx descriptor of the tx control block.
		 * If the last tx descriptor is done, it is done with
		 * all the tx descriptors of the tx control block.
		 * Then the tx control block and all the corresponding
		 * tx descriptors can be recycled.
		 */
		last_index = NEXT_INDEX(index, tcb->desc_num - 1,
		    tx_ring->ring_size);

		/*
		 * Check if the Descriptor Done bit is set
		 */
		desc_done = tx_ring->tbd_ring[last_index].wb.status &
		    IXGBE_TXD_STAT_DD;
		if (desc_done) {
			/*
			 * Strip off the tx control block from the work list,
			 * and add it to the pending list.
			 */
			tx_ring->work_list[index] = NULL;
			LIST_PUSH_TAIL(&pending_list, &tcb->link);

			/*
			 * Count the total number of the tx descriptors
			 * recycled
			 */
			desc_num += tcb->desc_num;

			/*
			 * Advance the index of the tx descriptor ring
			 */
			index = NEXT_INDEX(last_index, 1, tx_ring->ring_size);

			tcb = tx_ring->work_list[index];
		}
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb != NULL) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		ixgbe_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * ixgbe_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index;
	uint32_t head_wb;
	int desc_num;
	tx_control_block_t *tcb;
	link_list_t pending_list;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 *
	 * Note: For head write-back mode, the tx descriptors will not
	 * be written back, but the head write-back value is stored at
	 * the last extra tbd at the end of the DMA area, so we still
	 * need to sync the head write-back value for the kernel.
	 *
	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
	 */
	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
	    sizeof (uint32_t),
	    DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head; /* Next index to clean */

	/*
	 * Get the value of head write-back
	 */
	head_wb = *tx_ring->tbd_head_wb;
	while (index != head_wb) {
		tcb = tx_ring->work_list[index];
		ASSERT(tcb != NULL);

		if (OFFSET(index, head_wb, tx_ring->ring_size) <
		    tcb->desc_num) {
			/*
			 * The current tx control block is not
			 * completely transmitted, stop recycling
			 */
			break;
		}

		/*
		 * Strip off the tx control block from the work list,
		 * and add it to the pending list.
		 */
		tx_ring->work_list[index] = NULL;
		LIST_PUSH_TAIL(&pending_list, &tcb->link);

		/*
		 * Advance the index of the tx descriptor ring
		 */
		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

		/*
		 * Count the total number of the tx descriptors recycled
		 */
		desc_num += tcb->desc_num;
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		ixgbe_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * ixgbe_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and resetting
 * other control fields.
 */
void
ixgbe_free_tcb(tx_control_block_t *tcb)
{
	switch (tcb->tx_type) {
	case USE_COPY:
		/*
		 * Reset the buffer length that is used for copy
		 */
		tcb->tx_buf.len = 0;
		break;
	case USE_DMA:
		/*
		 * Release the DMA resource that is used for
		 * DMA binding.
		 */
		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
		break;
	default:
		break;
	}

	/*
	 * Free the mblk
	 */
	if (tcb->mp != NULL) {
		freemsg(tcb->mp);
		tcb->mp = NULL;
	}

	tcb->tx_type = USE_NONE;
	tcb->frag_num = 0;
	tcb->desc_num = 0;
}

/*
 * ixgbe_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of the available tx control blocks
 * in the free list is used to keep this routine mutually exclusive with
 * the routine ixgbe_put_free_list.
 */
static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
{
	tx_control_block_t *tcb;

	/*
	 * Check and update the number of the free tx control blocks
	 * in the free list.
	 */
	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
		return (NULL);

	mutex_enter(&tx_ring->tcb_head_lock);

	tcb = tx_ring->free_list[tx_ring->tcb_head];
	ASSERT(tcb != NULL);
	tx_ring->free_list[tx_ring->tcb_head] = NULL;
	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
	    tx_ring->free_list_size);

	mutex_exit(&tx_ring->tcb_head_lock);

	return (tcb);
}

/*
 * ixgbe_put_free_list
 *
 * Put a list of used tx control blocks back to the free list
 *
 * A mutex is used here to ensure the serialization. The mutual exclusion
 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
{
	uint32_t index;
	int tcb_num;
	tx_control_block_t *tcb;

	mutex_enter(&tx_ring->tcb_tail_lock);

	index = tx_ring->tcb_tail;

	tcb_num = 0;
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {
		ASSERT(tx_ring->free_list[index] == NULL);
		tx_ring->free_list[index] = tcb;

		tcb_num++;

		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	tx_ring->tcb_tail = index;

	/*
	 * Update the number of the free tx control blocks
	 * in the free list. This operation must be placed
	 * under the protection of the lock.
	 */
	atomic_add_32(&tx_ring->tcb_free, tcb_num);

	mutex_exit(&tx_ring->tcb_tail_lock);
}