/*
 * CDDL HEADER START
 *
 * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at:
 *	http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When using or redistributing this file, you may do so under the
 * License only. No other modification of this header is permitted.
 *
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms of the CDDL.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include "ixgbe_sw.h"

static boolean_t ixgbe_tx(ixgbe_tx_ring_t *, mblk_t *);
static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
    ixgbe_tx_context_t *, size_t);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);

static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
    ixgbe_tx_context_t *);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
    ixgbe_tx_context_t *);

#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_context)
#pragma inline(ixgbe_check_context)
#pragma inline(ixgbe_fill_context)
#endif

/*
 * ixgbe_m_tx
 *
 * The GLDv3 interface to call the driver's tx routine to transmit
 * the mblks.
 */
mblk_t *
ixgbe_m_tx(void *arg, mblk_t *mp)
{
	ixgbe_t *ixgbe = (ixgbe_t *)arg;
	mblk_t *next;
	ixgbe_tx_ring_t *tx_ring;

	/*
	 * If the adapter is suspended, or it is not started, or the link
	 * is not up, the mblks are simply dropped.
	 */
	if (((ixgbe->ixgbe_state & IXGBE_SUSPENDED) != 0) ||
	    ((ixgbe->ixgbe_state & IXGBE_STARTED) == 0) ||
	    (ixgbe->link_state != LINK_STATE_UP)) {
		/* Free the mblk chain */
		while (mp != NULL) {
			next = mp->b_next;
			mp->b_next = NULL;

			freemsg(mp);
			mp = next;
		}

		return (NULL);
	}

	/*
	 * Decide which tx ring is used to transmit the packets.
	 * This needs to be updated later to fit the new interface
	 * of the multiple rings support.
	 */
	tx_ring = &ixgbe->tx_rings[0];

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;

		if (!ixgbe_tx(tx_ring, mp)) {
			mp->b_next = next;
			break;
		}

		mp = next;
	}

	return (mp);
}
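
/*
 * Note on the return value above (illustrative, not taken from this file):
 * under GLDv3, returning a non-NULL mblk chain tells the mac layer that the
 * remaining packets were not sent, and it is expected to hold further tx
 * calls until the driver asks for a retry. When ixgbe_tx() fails it sets
 * tx_ring->reschedule, and the recycle/interrupt path is expected to clear
 * that flag and call mac_tx_update() once descriptors are free again,
 * roughly:
 *
 *	if (tx_ring->reschedule &&
 *	    (tx_ring->tbd_free >= tx_ring->resched_thresh)) {
 *		tx_ring->reschedule = B_FALSE;
 *		mac_tx_update(ixgbe->mac_hdl);
 *	}
 *
 * The member names resched_thresh and mac_hdl are assumptions here; the
 * actual fields live in ixgbe_sw.h and the interrupt handling code.
 */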

/*
 * ixgbe_tx - Main transmit processing
 *
 * Called from ixgbe_m_tx with an mblk ready to transmit. This
 * routine sets up the transmit descriptors and sends data to
 * the wire.
 *
 * One mblk can consist of several fragments, each fragment
 * will be processed with different methods based on the size.
 * For the fragments with size less than the bcopy threshold,
 * they will be processed by using bcopy; otherwise, they will
 * be processed by using DMA binding.
 *
 * To process the mblk, a tx control block is obtained from the
 * free list. One tx control block contains one tx buffer, which
 * is used to copy mblk fragments' data; and one tx DMA handle,
 * which is used to bind a mblk fragment with DMA resource.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer will be transmitted with
 * one tx descriptor.
 *
 * A large fragment only binds with one tx control block's DMA
 * handle, and it can span several tx descriptors for transmitting.
 *
 * So to transmit a packet (mblk), several tx control blocks can
 * be used. After the processing, those tx control blocks will
 * be put to the work list.
 */
static boolean_t
ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp)
{
	ixgbe_t *ixgbe = tx_ring->ixgbe;
	tx_type_t current_flag, next_flag;
	uint32_t current_len, next_len;
	uint32_t desc_total;
	size_t mbsize;
	int desc_num;
	boolean_t copy_done, eop;
	mblk_t *current_mp, *next_mp, *nmp;
	tx_control_block_t *tcb;
	ixgbe_tx_context_t tx_context, *ctx;
	link_list_t pending_list;

	/* Get the mblk size */
	mbsize = 0;
	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		mbsize += MBLK_LEN(nmp);
	}

	if (ixgbe->tx_hcksum_enable) {
		/*
		 * Retrieve checksum context information from the mblk
		 * that will be used to decide whether/how to fill the
		 * context descriptor.
		 */
		ctx = &tx_context;
		if (ixgbe_get_context(mp, ctx) < 0) {
			freemsg(mp);
			return (B_TRUE);
		}

		/*
		 * If the mblk size exceeds the max size ixgbe could
		 * process, then discard this mblk and return B_TRUE.
		 */
		if ((ctx->lso_flag && ((mbsize - ctx->mac_hdr_len)
		    > IXGBE_LSO_MAXLEN)) || (!ctx->lso_flag &&
		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
			freemsg(mp);
			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
			return (B_TRUE);
		}
	} else {
		ctx = NULL;
	}

	/*
	 * Check and recycle tx descriptors.
	 * The recycle threshold here should be selected carefully.
	 */
	if (tx_ring->tbd_free < tx_ring->recycle_thresh)
		tx_ring->tx_recycle(tx_ring);

	/*
	 * After the recycling, if the tbd_free is less than the
	 * overload_threshold, assert overload, return B_FALSE;
	 * and we need to re-schedule the tx again.
	 */
	if (tx_ring->tbd_free < tx_ring->overload_thresh) {
		tx_ring->reschedule = B_TRUE;
		IXGBE_DEBUG_STAT(tx_ring->stat_overload);
		return (B_FALSE);
	}

	/*
	 * The pending_list is a linked list that is used to save
	 * the tx control blocks that have packet data processed
	 * but have not put the data to the tx descriptor ring.
	 * It is used to reduce the lock contention of the tx_lock.
	 */
	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	desc_total = 0;

	current_mp = mp;
	current_len = MBLK_LEN(current_mp);
	/*
	 * Decide which method to use for the first fragment
	 */
	current_flag = (current_len <= tx_ring->copy_thresh) ?
	    USE_COPY : USE_DMA;
	/*
	 * If the mblk includes several contiguous small fragments,
	 * they may be copied into one buffer. This flag is used to
	 * indicate whether there are pending fragments that need to
	 * be copied to the current tx buffer.
	 *
	 * If this flag is B_TRUE, it indicates that a new tx control
	 * block is needed to process the next fragment using either
	 * copy or DMA binding.
	 *
	 * Otherwise, it indicates that the next fragment will be
	 * copied to the current tx buffer that is maintained by the
	 * current tx control block. No new tx control block is needed.
	 */
	copy_done = B_TRUE;
	while (current_mp) {
		next_mp = current_mp->b_cont;
		eop = (next_mp == NULL); /* Last fragment of the packet? */
		next_len = eop ? 0 : MBLK_LEN(next_mp);

		/*
		 * When the current fragment is an empty fragment, if
		 * the next fragment will still be copied to the current
		 * tx buffer, we cannot skip this fragment here, because
		 * the copy processing is pending for completion. We have
		 * to process this empty fragment in the tx_copy routine.
		 *
		 * If the copy processing is completed or a DMA binding
		 * processing is just completed, we can just skip this
		 * empty fragment.
		 */
		if ((current_len == 0) && (copy_done)) {
			current_mp = next_mp;
			current_len = next_len;
			current_flag = (current_len <= tx_ring->copy_thresh) ?
			    USE_COPY : USE_DMA;
			continue;
		}

		if (copy_done) {
			/*
			 * Get a new tx control block from the free list
			 */
			tcb = ixgbe_get_free_list(tx_ring);

			if (tcb == NULL) {
				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
				goto tx_failure;
			}

			/*
			 * Push the tx control block to the pending list
			 * to avoid taking the lock too early
			 */
			LIST_PUSH_TAIL(&pending_list, &tcb->link);
		}

		if (current_flag == USE_COPY) {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment, and if using bcopy, whether we
			 * need to continue copying the next fragment into the
			 * current tx buffer.
			 */
			ASSERT((tcb->tx_buf.len + current_len) <=
			    tcb->tx_buf.size);

			if (eop) {
				/*
				 * This is the last fragment of the packet, so
				 * the copy processing will be completed with
				 * this fragment.
				 */
				next_flag = USE_NONE;
				copy_done = B_TRUE;
			} else if ((tcb->tx_buf.len + current_len + next_len) >
			    tcb->tx_buf.size) {
				/*
				 * If the next fragment is too large to be
				 * copied to the current tx buffer, we need
				 * to complete the current copy processing.
				 */
				next_flag = (next_len > tx_ring->copy_thresh) ?
				    USE_DMA : USE_COPY;
				copy_done = B_TRUE;
			} else if (next_len > tx_ring->copy_thresh) {
				/*
				 * The next fragment needs to be processed with
				 * DMA binding. So the copy processing will be
				 * completed with the current fragment.
				 */
				next_flag = USE_DMA;
				copy_done = B_TRUE;
			} else {
				/*
				 * Continue to copy the next fragment to the
				 * current tx buffer.
				 */
				next_flag = USE_COPY;
				copy_done = B_FALSE;
			}

			desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
			    current_len, copy_done);
		} else {
			/*
			 * Check whether to use bcopy or DMA binding to process
			 * the next fragment.
			 */
			next_flag = (next_len > tx_ring->copy_thresh) ?
			    USE_DMA : USE_COPY;
			ASSERT(copy_done == B_TRUE);

			desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
			    current_len);
		}

		if (desc_num > 0)
			desc_total += desc_num;
		else if (desc_num < 0)
			goto tx_failure;

		current_mp = next_mp;
		current_len = next_len;
		current_flag = next_flag;
	}

	/*
	 * Attach the mblk to the last tx control block
	 */
	ASSERT(tcb);
	ASSERT(tcb->mp == NULL);
	tcb->mp = mp;

	/*
	 * Before filling the tx descriptor ring with the data, we need to
	 * ensure there are adequate free descriptors for transmit
	 * (including one context descriptor).
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		tx_ring->tx_recycle(tx_ring);
	}

	mutex_enter(&tx_ring->tx_lock);

	/*
	 * If the number of free tx descriptors is not enough for transmit
	 * then return failure.
	 *
	 * Note: we must put this check under the mutex protection to
	 * ensure the correctness when multiple threads access it in
	 * parallel.
	 */
	if (tx_ring->tbd_free < (desc_total + 1)) {
		IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
		mutex_exit(&tx_ring->tx_lock);
		goto tx_failure;
	}

	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
	    mbsize);

	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

	mutex_exit(&tx_ring->tx_lock);

	return (B_TRUE);

tx_failure:
	/*
	 * Discard the mblk and free the used resources
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		tcb->mp = NULL;

		ixgbe_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Return the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	/* Transmit failed, do not drop the mblk, reschedule the transmit */
	tx_ring->reschedule = B_TRUE;

	return (B_FALSE);
}
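
/*
 * Illustrative example (not from the original source): assuming a copy
 * threshold of 512 bytes and a 2KB tx buffer per tx control block, an
 * mblk chain with fragment sizes
 *
 *	100, 200, 1500, 60, 80
 *
 * would be handled by ixgbe_tx() roughly as follows:
 *
 *	tcb 0: 100 and 200 are bcopy'd into one tx buffer -> 1 descriptor
 *	tcb 1: 1500 is DMA bound -> 1 descriptor per DMA cookie
 *	tcb 2: 60 and 80 are bcopy'd into one tx buffer -> 1 descriptor
 *
 * plus, when needed, one context descriptor added by ixgbe_tx_fill_ring().
 * The actual threshold and buffer sizes are tunables defined elsewhere in
 * the driver; the numbers above are only for illustration.
 */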

/*
 * ixgbe_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer
 */
static int
ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done)
{
	dma_buffer_t *tx_buf;
	uint32_t desc_num;
	_NOTE(ARGUNUSED(tx_ring));

	tx_buf = &tcb->tx_buf;

	/*
	 * Copy the packet data of the mblk fragment into the
	 * pre-allocated tx buffer, which is maintained by the
	 * tx control block.
	 *
	 * Several mblk fragments can be copied into one tx buffer.
	 * The destination address of the current copied fragment in
	 * the tx buffer is next to the end of the previous copied
	 * fragment.
	 */
	if (len > 0) {
		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

		tx_buf->len += len;
		tcb->frag_num++;
	}

	desc_num = 0;

	/*
	 * If it is the last fragment copied to the current tx buffer,
	 * in other words, if there's no remaining fragment or the remaining
	 * fragment requires a new tx control block to process, we need to
	 * complete the current copy processing by syncing up the current
	 * DMA buffer and saving the descriptor data.
	 */
	if (copy_done) {
		/*
		 * Sync the DMA buffer of the packet data
		 */
		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

		tcb->tx_type = USE_COPY;

		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
		desc_num++;
	}

	return (desc_num);
}

/*
 * ixgbe_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
	int status, i;
	ddi_dma_cookie_t dma_cookie;
	uint_t ncookies;
	int desc_num;

	/*
	 * Use DMA binding to process the mblk fragment
	 */
	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
	    (caddr_t)mp->b_rptr, len,
	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    0, &dma_cookie, &ncookies);

	if (status != DDI_DMA_MAPPED) {
		IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
		return (-1);
	}

	tcb->frag_num++;
	tcb->tx_type = USE_DMA;
	/*
	 * Each fragment can span several cookies. One cookie will have
	 * one tx descriptor to transmit.
	 */
	desc_num = 0;
	for (i = ncookies; i > 0; i--) {
		/*
		 * Save the address and length to the private data structure
		 * of the tx control block, which will be used to fill the
		 * tx descriptor ring after all the fragments are processed.
		 */
		ixgbe_save_desc(tcb,
		    dma_cookie.dmac_laddress,
		    dma_cookie.dmac_size);

		desc_num++;

		if (i > 1)
			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
	}

	return (desc_num);
}

/*
 * ixgbe_get_context
 *
 * Get the context information from the mblk
 */
static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
	uint32_t start;
	uint32_t flags;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;
	uint32_t l4_hdr_len;

	ASSERT(mp != NULL);

	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);
	bzero(ctx, sizeof (ixgbe_tx_context_t));
	ctx->hcksum_flags = flags;

	if (flags == 0)
		return (0);

	ctx->mss = DB_LSOMSS(mp);
	ctx->lso_flag = (ctx->hcksum_flags & HW_LSO) &&
	    (ctx->mss != 0);

	/*
	 * LSO relies on tx h/w checksum, so here we will drop the packet
	 * if the h/w checksum flags are not declared.
	 */
	if (ctx->lso_flag) {
		if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
		    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
			IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
			    "checksum flags are not specified when doing LSO");
			return (-1);
		}
	}

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * First, get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 */
	size = len = MBLK_LEN(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLK_LEN(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we assume the IP(V6) header is fully included in
	 * one mblk fragment.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		offset = mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		if (ctx->lso_flag) {
			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
			    ipha_length))) = 0;
			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
			    ipha_hdr_checksum))) = 0;

			/*
			 * To perform ixgbe LSO, the TCP checksum field of
			 * the packet also needs to be filled with the
			 * pseudo-header checksum over:
			 * (ip_source_addr, ip_destination_addr, l4_proto).
			 * Currently the tcp/ip stack has already done it.
			 */
		}

		l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
		break;
	case ETHERTYPE_IPV6:
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
		return (-2);
	}

	if (ctx->lso_flag) {
		offset = mac_hdr_len + start;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLK_LEN(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
	} else {
		/*
		 * l4 header length is only required for LSO
		 */
		l4_hdr_len = 0;
	}

	ctx->mac_hdr_len = mac_hdr_len;
	ctx->ip_hdr_len = start;
	ctx->l4_proto = l4_proto;
	ctx->l4_hdr_len = l4_hdr_len;

	return (0);
}
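
/*
 * For reference, a minimal sketch (not part of this driver) of the
 * pseudo-header checksum that the comment in ixgbe_get_context() refers to.
 * For hardware LSO/checksum offload the TCP checksum field is seeded with
 * the one's complement sum of the IP source address, the IP destination
 * address and the protocol number, with the length left out (the hardware
 * accounts for it per generated segment):
 *
 *	static uint16_t
 *	tcp_pseudo_cksum(ipha_t *ipha)
 *	{
 *		uint32_t sum;
 *		uint16_t *src = (uint16_t *)(uintptr_t)&ipha->ipha_src;
 *		uint16_t *dst = (uint16_t *)(uintptr_t)&ipha->ipha_dst;
 *
 *		sum = src[0] + src[1] + dst[0] + dst[1] +
 *		    htons(IPPROTO_TCP);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *
 *		return ((uint16_t)sum);
 *	}
 *
 * As noted above, the TCP/IP stack already places this value in the packet,
 * so the driver does not compute it.
 */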

/*
 * ixgbe_check_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
{
	ixgbe_tx_context_t *last;

	if (ctx == NULL)
		return (B_FALSE);

	/*
	 * Compare the checksum data retrieved from the mblk and the
	 * stored checksum data of the last context descriptor. The data
	 * that need to be checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	mac_hdr_len
	 *	ip_hdr_len
	 *	mss (only checked for LSO)
	 *	l4_hdr_len (only checked for LSO)
	 * If any one of the above data is changed, a new context descriptor
	 * will be needed.
	 */
	last = &tx_ring->tx_context;

	if (ctx->hcksum_flags != 0) {
		if ((ctx->hcksum_flags != last->hcksum_flags) ||
		    (ctx->l4_proto != last->l4_proto) ||
		    (ctx->mac_hdr_len != last->mac_hdr_len) ||
		    (ctx->ip_hdr_len != last->ip_hdr_len) ||
		    (ctx->lso_flag && ((ctx->mss != last->mss) ||
		    (ctx->l4_hdr_len != last->l4_hdr_len)))) {

			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * ixgbe_fill_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
    ixgbe_tx_context_t *ctx)
{
	/*
	 * Fill the context descriptor with the checksum
	 * context information we've got.
	 */
	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
	    IXGBE_ADVTXD_MACLEN_SHIFT;

	ctx_tbd->type_tucmd_mlhl =
	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;

	if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
		ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;

	if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
		switch (ctx->l4_proto) {
		case IPPROTO_TCP:
			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
			break;
		case IPPROTO_UDP:
			/*
			 * We don't have to explicitly set:
			 *	ctx_tbd->type_tucmd_mlhl |=
			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
			 */
			break;
		default:
			/* Unrecoverable error */
			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
			break;
		}
	}

	ctx_tbd->seqnum_seed = 0;
	if (ctx->lso_flag) {
		ctx_tbd->mss_l4len_idx =
		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
	} else {
		ctx_tbd->mss_l4len_idx = 0;
	}
}
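
/*
 * Worked example (illustration only): for an untagged TCP/IPv4 LSO packet
 * with a 14-byte Ethernet header, a 20-byte IP header, a 20-byte TCP header
 * and an MSS of 1460, ixgbe_fill_context() ends up with roughly:
 *
 *	vlan_macip_lens = 20 | (14 << IXGBE_ADVTXD_MACLEN_SHIFT)
 *	type_tucmd_mlhl = IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT |
 *	    IXGBE_ADVTXD_TUCMD_IPV4 | IXGBE_ADVTXD_TUCMD_L4T_TCP
 *	mss_l4len_idx = (20 << IXGBE_ADVTXD_L4LEN_SHIFT) |
 *	    (1460 << IXGBE_ADVTXD_MSS_SHIFT)
 *
 * The shift constants come from the shared ixgbe register definitions; the
 * numeric values here are only meant to show how the header lengths and MSS
 * are packed into the context descriptor.
 */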

/*
 * ixgbe_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    ixgbe_tx_context_t *ctx, size_t mbsize)
{
	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
	boolean_t load_context;
	uint32_t index, tcb_index, desc_num;
	union ixgbe_adv_tx_desc *tbd, *first_tbd;
	tx_control_block_t *tcb, *first_tcb;
	uint32_t hcksum_flags;
	int i;

	ASSERT(mutex_owned(&tx_ring->tx_lock));

	tbd = NULL;
	first_tbd = NULL;
	first_tcb = NULL;
	desc_num = 0;
	hcksum_flags = 0;
	load_context = B_FALSE;

	/*
	 * Get the index of the first tx descriptor that will be filled,
	 * and the index of the first work list item that will be attached
	 * with the first used tx control block in the pending list.
	 * Note: the two indexes are the same.
	 */
	index = tx_ring->tbd_tail;
	tcb_index = tx_ring->tbd_tail;

	if (ctx != NULL) {
		hcksum_flags = ctx->hcksum_flags;

		/*
		 * Check if a new context descriptor is needed for this packet
		 */
		load_context = ixgbe_check_context(tx_ring, ctx);

		if (load_context) {
			first_tcb = (tx_control_block_t *)
			    LIST_GET_HEAD(pending_list);
			tbd = &tx_ring->tbd_ring[index];

			/*
			 * Fill the context descriptor with the
			 * hardware checksum offload information.
			 */
			ixgbe_fill_context(
			    (struct ixgbe_adv_tx_context_desc *)tbd,
			    ctx);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;

			/*
			 * Store the checksum context data if
			 * a new context descriptor is added
			 */
			tx_ring->tx_context = *ctx;
		}
	}

	first_tbd = &tx_ring->tbd_ring[index];

	/*
	 * Fill tx data descriptors with the data saved in the pending list.
	 * The tx control blocks in the pending list are added to the work list
	 * at the same time.
	 *
	 * The work list is strictly 1:1 corresponding to the descriptor ring.
	 * One item of the work list corresponds to one tx descriptor. Because
	 * one tx control block can span multiple tx descriptors, the tx
	 * control block will be added to the first work list item that
	 * corresponds to the first tx descriptor generated from that tx
	 * control block.
	 */
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {

		for (i = 0; i < tcb->desc_num; i++) {
			tbd = &tx_ring->tbd_ring[index];

			tbd->read.buffer_addr = tcb->desc[i].address;
			tbd->read.cmd_type_len = tcb->desc[i].length;

			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_RS |
			    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_DATA;

			tbd->read.olinfo_status = 0;

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
		}

		if (first_tcb != NULL) {
			/*
			 * Count the checksum context descriptor for
			 * the first tx control block.
			 */
			first_tcb->desc_num++;
			first_tcb = NULL;
		}

		/*
		 * Add the tx control block to the work list
		 */
		ASSERT(tx_ring->work_list[tcb_index] == NULL);
		tx_ring->work_list[tcb_index] = tcb;

		tcb_index = index;
		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	/*
	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
	 * valid in the first descriptor of the packet.
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;

	if (ctx != NULL && ctx->lso_flag) {
		first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
		first_tbd->read.olinfo_status |=
		    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
		    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
	}

	/* Set hardware checksum bits */
	if (hcksum_flags != 0) {
		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_ADVTXD_POPTS_IXSM;
		if (hcksum_flags & HCK_PARTIALCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_ADVTXD_POPTS_TXSM;
	}

	/*
	 * The last descriptor of the packet needs the End Of Packet (EOP)
	 * and Report Status (RS) bits set.
	 */
	ASSERT(tbd != NULL);
	tbd->read.cmd_type_len |=
	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}
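
	/*
	 * The mutual exclusion below relies on ixgbe_atomic_reserve(), which
	 * is defined elsewhere in the driver. Its assumed behaviour is
	 * sketched here for reference only: atomically subtract 'n' from
	 * '*count' if and only if at least 'n' is available, returning the
	 * new value on success and -1 on failure, e.g.
	 *
	 *	int
	 *	ixgbe_atomic_reserve(uint32_t *count, uint32_t n)
	 *	{
	 *		uint32_t oldval, newval;
	 *
	 *		do {
	 *			oldval = *count;
	 *			if (oldval < n)
	 *				return (-1);
	 *			newval = oldval - n;
	 *		} while (atomic_cas_32(count, oldval, newval) != oldval);
	 *
	 *		return (newval);
	 *	}
	 *
	 * This is a sketch of the assumed semantics, not the actual
	 * definition.
	 */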

	/*
	 * Update the number of the free tx descriptors.
	 * The mutual exclusion between the transmission and the recycling
	 * (for the tx descriptor ring and the work list) is implemented
	 * with the atomic operation on the number of the free tx descriptors.
	 *
	 * Note: we should always decrement the counter tbd_free before
	 * advancing the hardware TDT pointer, to avoid the race condition
	 * where, before the counter tbd_free is decremented, the transmit
	 * of the tx descriptors has completed and the counter tbd_free has
	 * already been increased by the tx recycling.
	 */
	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
	ASSERT(i >= 0);

	tx_ring->tbd_tail = index;

	/*
	 * Advance the hardware TDT pointer of the tx descriptor ring
	 */
	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);

	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
	    DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	return (desc_num);
}

/*
 * ixgbe_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
	sw_desc_t *desc;

	desc = &tcb->desc[tcb->desc_num];
	desc->address = address;
	desc->length = length;

	tcb->desc_num++;
}
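
/*
 * The recycle routines below walk the descriptor ring with the NEXT_INDEX()
 * and OFFSET() helpers from ixgbe_sw.h. Their assumed behaviour (shown here
 * only as a reminder, not as the actual definitions) is plain modular ring
 * arithmetic:
 *
 *	NEXT_INDEX(index, n, size)	((index + n) % size)
 *	OFFSET(start, end, size)	((end - start + size) % size)
 *
 * i.e. NEXT_INDEX() advances an index by n slots with wrap-around, and
 * OFFSET() gives the number of slots from start to end going forward around
 * the ring.
 */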

/*
 * ixgbe_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index, last_index;
	int desc_num;
	boolean_t desc_done;
	tx_control_block_t *tcb;
	link_list_t pending_list;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */

	tcb = tx_ring->work_list[index];
	ASSERT(tcb != NULL);

	desc_done = B_TRUE;
	while (desc_done && (tcb != NULL)) {

		/*
		 * Get the last tx descriptor of the tx control block.
		 * If the last tx descriptor is done, it is done with
		 * all the tx descriptors of the tx control block.
		 * Then the tx control block and all the corresponding
		 * tx descriptors can be recycled.
		 */
		last_index = NEXT_INDEX(index, tcb->desc_num - 1,
		    tx_ring->ring_size);

		/*
		 * Check if the Descriptor Done bit is set
		 */
		desc_done = tx_ring->tbd_ring[last_index].wb.status &
		    IXGBE_TXD_STAT_DD;
		if (desc_done) {
			/*
			 * Strip off the tx control block from the work list,
			 * and add it to the pending list.
			 */
			tx_ring->work_list[index] = NULL;
			LIST_PUSH_TAIL(&pending_list, &tcb->link);

			/*
			 * Count the total number of the tx descriptors recycled
			 */
			desc_num += tcb->desc_num;

			/*
			 * Advance the index of the tx descriptor ring
			 */
			index = NEXT_INDEX(last_index, 1, tx_ring->ring_size);

			tcb = tx_ring->work_list[index];
		}
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb != NULL) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		ixgbe_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}
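
/*
 * In head write-back mode the hardware does not set the DD bit in each
 * descriptor; instead it periodically DMAs the ring head index into a
 * 32-bit word that the driver places just past the last descriptor of
 * tbd_area. tx_ring->tbd_head_wb is assumed to be initialized elsewhere
 * in the driver to point at that word, roughly:
 *
 *	tx_ring->tbd_head_wb = (uint32_t *)((uintptr_t)
 *	    tx_ring->tbd_area.address +
 *	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size);
 *
 * which matches the partial ddi_dma_sync() of just that word in
 * ixgbe_tx_recycle_head_wb() below. This is an assumption for reference,
 * not the actual setup code.
 */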

/*
 * ixgbe_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index;
	uint32_t head_wb;
	int desc_num;
	tx_control_block_t *tcb;
	link_list_t pending_list;

	/*
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
	 */
	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
		return (0);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 *
	 * Note: For head write-back mode, the tx descriptors will not
	 * be written back, but the head write-back value is stored at
	 * the last extra tbd at the end of the DMA area; we still need
	 * to sync the head write-back value for the kernel.
	 *
	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
	 */
	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
	    sizeof (uint32_t),
	    DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Next index to clean */

	/*
	 * Get the value of head write-back
	 */
	head_wb = *tx_ring->tbd_head_wb;
	while (index != head_wb) {
		tcb = tx_ring->work_list[index];
		ASSERT(tcb != NULL);

		if (OFFSET(index, head_wb, tx_ring->ring_size) <
		    tcb->desc_num) {
			/*
			 * The current tx control block is not
			 * completely transmitted, stop recycling
			 */
			break;
		}

		/*
		 * Strip off the tx control block from the work list,
		 * and add it to the pending list.
		 */
		tx_ring->work_list[index] = NULL;
		LIST_PUSH_TAIL(&pending_list, &tcb->link);

		/*
		 * Advance the index of the tx descriptor ring
		 */
		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

		/*
		 * Count the total number of the tx descriptors recycled
		 */
		desc_num += tcb->desc_num;
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Free the resources used by the tx control blocks
	 * in the pending list
	 */
	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
	while (tcb) {
		/*
		 * Release the resources occupied by the tx control block
		 */
		ixgbe_free_tcb(tcb);

		tcb = (tx_control_block_t *)
		    LIST_GET_NEXT(&pending_list, &tcb->link);
	}

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}

/*
 * ixgbe_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and resetting
 * other control fields.
 */
void
ixgbe_free_tcb(tx_control_block_t *tcb)
{
	switch (tcb->tx_type) {
	case USE_COPY:
		/*
		 * Reset the buffer length that is used for copy
		 */
		tcb->tx_buf.len = 0;
		break;
	case USE_DMA:
		/*
		 * Release the DMA resource that is used for
		 * DMA binding.
		 */
		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
		break;
	default:
		break;
	}

	/*
	 * Free the mblk
	 */
	if (tcb->mp != NULL) {
		freemsg(tcb->mp);
		tcb->mp = NULL;
	}

	tcb->tx_type = USE_NONE;
	tcb->frag_num = 0;
	tcb->desc_num = 0;
}

/*
 * ixgbe_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of the available tx control blocks
 * in the free list is used to keep this routine mutually exclusive with
 * the routine ixgbe_put_free_list.
 */
static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
{
	tx_control_block_t *tcb;

	/*
	 * Check and update the number of the free tx control blocks
	 * in the free list.
	 */
	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
		return (NULL);

	mutex_enter(&tx_ring->tcb_head_lock);

	tcb = tx_ring->free_list[tx_ring->tcb_head];
	ASSERT(tcb != NULL);
	tx_ring->free_list[tx_ring->tcb_head] = NULL;
	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
	    tx_ring->free_list_size);

	mutex_exit(&tx_ring->tcb_head_lock);

	return (tcb);
}

/*
 * ixgbe_put_free_list
 *
 * Put a list of used tx control blocks back to the free list
 *
 * A mutex is used here to ensure the serialization. The mutual exclusion
 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
{
	uint32_t index;
	int tcb_num;
	tx_control_block_t *tcb;

	mutex_enter(&tx_ring->tcb_tail_lock);

	index = tx_ring->tcb_tail;

	tcb_num = 0;
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	while (tcb != NULL) {
		ASSERT(tx_ring->free_list[index] == NULL);
		tx_ring->free_list[index] = tcb;

		tcb_num++;

		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	tx_ring->tcb_tail = index;

	/*
	 * Update the number of the free tx control blocks
	 * in the free list. This operation must be placed
	 * under the protection of the lock.
	 */
	atomic_add_32(&tx_ring->tcb_free, tcb_num);

	mutex_exit(&tx_ring->tcb_tail_lock);
}