1 /* 2 * This file is provided under a CDDLv1 license. When using or 3 * redistributing this file, you may do so under this license. 4 * In redistributing this file this license must be included 5 * and no other modification of this header file is permitted. 6 * 7 * CDDL LICENSE SUMMARY 8 * 9 * Copyright(c) 1999 - 2009 Intel Corporation. All rights reserved. 10 * 11 * The contents of this file are subject to the terms of Version 12 * 1.0 of the Common Development and Distribution License (the "License"). 13 * 14 * You should have received a copy of the License with this software. 15 * You can obtain a copy of the License at 16 * http://www.opensolaris.org/os/licensing. 17 * See the License for the specific language governing permissions 18 * and limitations under the License. 19 */ 20 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 28 */ 29 30 /* 31 * ********************************************************************** 32 * * 33 * Module Name: * 34 * e1000g_tx.c * 35 * * 36 * Abstract: * 37 * This file contains some routines that take care of Transmit, * 38 * make the hardware to send the data pointed by the packet out * 39 * on to the physical medium. * 40 * * 41 * ********************************************************************** 42 */ 43 44 #include "e1000g_sw.h" 45 #include "e1000g_debug.h" 46 47 static boolean_t e1000g_send(struct e1000g *, mblk_t *); 48 static int e1000g_tx_copy(e1000g_tx_ring_t *, 49 p_tx_sw_packet_t, mblk_t *, boolean_t); 50 static int e1000g_tx_bind(e1000g_tx_ring_t *, 51 p_tx_sw_packet_t, mblk_t *); 52 static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t); 53 static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *); 54 static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *, 55 context_data_t *); 56 static void e1000g_fill_context_descriptor(context_data_t *, 57 struct e1000_context_desc *); 58 static int e1000g_fill_tx_desc(e1000g_tx_ring_t *, 59 p_tx_sw_packet_t, uint64_t, size_t); 60 static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length, 61 p_desc_array_t desc_array); 62 static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t); 63 static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t); 64 static void e1000g_82547_timeout(void *); 65 static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *); 66 static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *); 67 68 #ifndef E1000G_DEBUG 69 #pragma inline(e1000g_tx_copy) 70 #pragma inline(e1000g_tx_bind) 71 #pragma inline(e1000g_retrieve_context) 72 #pragma inline(e1000g_check_context) 73 #pragma inline(e1000g_fill_tx_ring) 74 #pragma inline(e1000g_fill_context_descriptor) 75 #pragma inline(e1000g_fill_tx_desc) 76 #pragma inline(e1000g_fill_82544_desc) 77 #pragma inline(e1000g_tx_workaround_PCIX_82544) 78 #pragma inline(e1000g_tx_workaround_jumbo_82544) 79 #pragma inline(e1000g_free_tx_swpkt) 80 #endif 81 82 /* 83 * e1000g_free_tx_swpkt - free up the tx sw packet 84 * 85 * Unbind the previously bound DMA handle for a given 86 * transmit sw packet. And reset the sw packet data. 87 */ 88 void 89 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet) 90 { 91 switch (packet->data_transfer_type) { 92 case USE_BCOPY: 93 packet->tx_buf->len = 0; 94 break; 95 #ifdef __sparc 96 case USE_DVMA: 97 dvma_unload(packet->tx_dma_handle, 0, -1); 98 break; 99 #endif 100 case USE_DMA: 101 (void) ddi_dma_unbind_handle(packet->tx_dma_handle); 102 break; 103 default: 104 break; 105 } 106 107 /* 108 * The mblk has been stripped off the sw packet 109 * and will be freed in a triggered soft intr. 110 */ 111 ASSERT(packet->mp == NULL); 112 113 packet->data_transfer_type = USE_NONE; 114 packet->num_mblk_frag = 0; 115 packet->num_desc = 0; 116 } 117 118 mblk_t * 119 e1000g_m_tx(void *arg, mblk_t *mp) 120 { 121 struct e1000g *Adapter = (struct e1000g *)arg; 122 mblk_t *next; 123 124 rw_enter(&Adapter->chip_lock, RW_READER); 125 126 if ((Adapter->e1000g_state & E1000G_SUSPENDED) || 127 !(Adapter->e1000g_state & E1000G_STARTED) || 128 (Adapter->link_state != LINK_STATE_UP)) { 129 freemsgchain(mp); 130 mp = NULL; 131 } 132 133 while (mp != NULL) { 134 next = mp->b_next; 135 mp->b_next = NULL; 136 137 if (!e1000g_send(Adapter, mp)) { 138 mp->b_next = next; 139 break; 140 } 141 142 mp = next; 143 } 144 145 rw_exit(&Adapter->chip_lock); 146 return (mp); 147 } 148 149 /* 150 * e1000g_send - send packets onto the wire 151 * 152 * Called from e1000g_m_tx with an mblk ready to send. this 153 * routine sets up the transmit descriptors and sends data to 154 * the wire. It also pushes the just transmitted packet to 155 * the used tx sw packet list. 156 */ 157 static boolean_t 158 e1000g_send(struct e1000g *Adapter, mblk_t *mp) 159 { 160 p_tx_sw_packet_t packet; 161 LIST_DESCRIBER pending_list; 162 size_t len; 163 size_t msg_size; 164 uint32_t frag_count; 165 int desc_count; 166 uint32_t desc_total; 167 uint32_t bcopy_thresh; 168 uint32_t hdr_frag_len; 169 boolean_t tx_undersize_flag; 170 mblk_t *nmp; 171 mblk_t *tmp; 172 mblk_t *new_mp; 173 mblk_t *pre_mp; 174 mblk_t *next_mp; 175 e1000g_tx_ring_t *tx_ring; 176 context_data_t cur_context; 177 178 tx_ring = Adapter->tx_ring; 179 bcopy_thresh = Adapter->tx_bcopy_thresh; 180 181 /* Get the total size and frags number of the message */ 182 tx_undersize_flag = B_FALSE; 183 frag_count = 0; 184 msg_size = 0; 185 for (nmp = mp; nmp; nmp = nmp->b_cont) { 186 frag_count++; 187 msg_size += MBLKL(nmp); 188 } 189 190 /* retrieve and compute information for context descriptor */ 191 if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) { 192 freemsg(mp); 193 return (B_TRUE); 194 } 195 196 /* 197 * Make sure the packet is less than the allowed size 198 */ 199 if (!cur_context.lso_flag && 200 (msg_size > Adapter->max_frame_size - ETHERFCSL)) { 201 /* 202 * For the over size packet, we'll just drop it. 203 * So we return B_TRUE here. 204 */ 205 E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL, 206 "Tx packet out of bound. length = %d \n", msg_size); 207 E1000G_STAT(tx_ring->stat_over_size); 208 freemsg(mp); 209 return (B_TRUE); 210 } 211 212 /* 213 * Check and reclaim tx descriptors. 214 * This low water mark check should be done all the time as 215 * Transmit interrupt delay can produce Transmit interrupts little 216 * late and that may cause few problems related to reaping Tx 217 * Descriptors... As you may run short of them before getting any 218 * transmit interrupt... 219 */ 220 if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) { 221 (void) e1000g_recycle(tx_ring); 222 E1000G_DEBUG_STAT(tx_ring->stat_recycle); 223 224 if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) { 225 E1000G_DEBUG_STAT(tx_ring->stat_lack_desc); 226 goto tx_no_resource; 227 } 228 } 229 230 /* 231 * If the message size is less than the minimum ethernet packet size, 232 * we'll use bcopy to send it, and padd it to 60 bytes later. 233 */ 234 if (msg_size < ETHERMIN) { 235 E1000G_DEBUG_STAT(tx_ring->stat_under_size); 236 tx_undersize_flag = B_TRUE; 237 } 238 239 /* Initialize variables */ 240 desc_count = 1; /* The initial value should be greater than 0 */ 241 desc_total = 0; 242 new_mp = NULL; 243 QUEUE_INIT_LIST(&pending_list); 244 245 /* Process each mblk fragment and fill tx descriptors */ 246 /* 247 * The software should guarantee LSO packet header(MAC+IP+TCP) 248 * to be within one descriptor. Here we reallocate and refill the 249 * the header if it's physical memory non-contiguous. 250 */ 251 if (cur_context.lso_flag) { 252 /* find the last fragment of the header */ 253 len = MBLKL(mp); 254 ASSERT(len > 0); 255 next_mp = mp; 256 pre_mp = NULL; 257 while (len < cur_context.hdr_len) { 258 pre_mp = next_mp; 259 next_mp = next_mp->b_cont; 260 len += MBLKL(next_mp); 261 } 262 /* 263 * If the header and the payload are in different mblks, 264 * we simply force the header to be copied into pre-allocated 265 * page-aligned buffer. 266 */ 267 if (len == cur_context.hdr_len) 268 goto adjust_threshold; 269 270 hdr_frag_len = cur_context.hdr_len - (len - MBLKL(next_mp)); 271 /* 272 * There are three cases we need to reallocate a mblk for the 273 * last header fragment: 274 * 275 * 1. the header is in multiple mblks and the last fragment 276 * share the same mblk with the payload 277 * 278 * 2. the header is in a single mblk shared with the payload 279 * and the header is physical memory non-contiguous 280 * 281 * 3. there is 4 KB boundary within the header and 64 bytes 282 * following the end of the header bytes. The case may cause 283 * TCP data corruption issue. 284 * 285 * The workaround for the case #2 and case #3 is: 286 * Assuming standard Ethernet/IP/TCP headers of 54 bytes, 287 * this means that the buffer(containing the headers) should 288 * not start -118 bytes before a 4 KB boundary. For example, 289 * 128-byte alignment for this buffer could be used to fulfill 290 * this condition. 291 */ 292 if ((next_mp != mp) || 293 (P2NPHASE((uintptr_t)next_mp->b_rptr, 294 E1000_LSO_FIRST_DESC_ALIGNMENT_BOUNDARY_4K) 295 < E1000_LSO_FIRST_DESC_ALIGNMENT)) { 296 E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail); 297 /* 298 * reallocate the mblk for the last header fragment, 299 * expect to bcopy into pre-allocated page-aligned 300 * buffer 301 */ 302 new_mp = allocb(hdr_frag_len, NULL); 303 if (!new_mp) 304 return (B_FALSE); 305 bcopy(next_mp->b_rptr, new_mp->b_rptr, hdr_frag_len); 306 /* link the new header fragment with the other parts */ 307 new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len; 308 new_mp->b_cont = next_mp; 309 if (pre_mp) 310 pre_mp->b_cont = new_mp; 311 else 312 mp = new_mp; 313 next_mp->b_rptr += hdr_frag_len; 314 frag_count++; 315 } 316 adjust_threshold: 317 /* 318 * adjust the bcopy threshhold to guarantee 319 * the header to use bcopy way 320 */ 321 if (bcopy_thresh < cur_context.hdr_len) 322 bcopy_thresh = cur_context.hdr_len; 323 } 324 325 packet = NULL; 326 nmp = mp; 327 while (nmp) { 328 tmp = nmp->b_cont; 329 330 len = MBLKL(nmp); 331 /* Check zero length mblks */ 332 if (len == 0) { 333 E1000G_DEBUG_STAT(tx_ring->stat_empty_frags); 334 /* 335 * If there're no packet buffers have been used, 336 * or we just completed processing a buffer, then 337 * skip the empty mblk fragment. 338 * Otherwise, there's still a pending buffer that 339 * needs to be processed (tx_copy). 340 */ 341 if (desc_count > 0) { 342 nmp = tmp; 343 continue; 344 } 345 } 346 347 /* 348 * Get a new TxSwPacket to process mblk buffers. 349 */ 350 if (desc_count > 0) { 351 mutex_enter(&tx_ring->freelist_lock); 352 packet = (p_tx_sw_packet_t) 353 QUEUE_POP_HEAD(&tx_ring->free_list); 354 mutex_exit(&tx_ring->freelist_lock); 355 356 if (packet == NULL) { 357 E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL, 358 "No Tx SwPacket available\n"); 359 E1000G_STAT(tx_ring->stat_no_swpkt); 360 goto tx_send_failed; 361 } 362 QUEUE_PUSH_TAIL(&pending_list, &packet->Link); 363 } 364 365 ASSERT(packet); 366 /* 367 * If the size of the fragment is less than the tx_bcopy_thresh 368 * we'll use bcopy; Otherwise, we'll use DMA binding. 369 */ 370 if ((len <= bcopy_thresh) || tx_undersize_flag) { 371 desc_count = 372 e1000g_tx_copy(tx_ring, packet, nmp, 373 tx_undersize_flag); 374 E1000G_DEBUG_STAT(tx_ring->stat_copy); 375 } else { 376 desc_count = 377 e1000g_tx_bind(tx_ring, packet, nmp); 378 E1000G_DEBUG_STAT(tx_ring->stat_bind); 379 } 380 381 if (desc_count > 0) 382 desc_total += desc_count; 383 else if (desc_count < 0) 384 goto tx_send_failed; 385 386 nmp = tmp; 387 } 388 389 /* Assign the message to the last sw packet */ 390 ASSERT(packet); 391 ASSERT(packet->mp == NULL); 392 packet->mp = mp; 393 394 /* Try to recycle the tx descriptors again */ 395 if (tx_ring->tbd_avail < (desc_total + 3)) { 396 E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry); 397 (void) e1000g_recycle(tx_ring); 398 } 399 400 mutex_enter(&tx_ring->tx_lock); 401 402 /* 403 * If the number of available tx descriptors is not enough for transmit 404 * (one redundant descriptor and one hw checksum context descriptor are 405 * included), then return failure. 406 */ 407 if (tx_ring->tbd_avail < (desc_total + 3)) { 408 E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL, 409 "No Enough Tx descriptors\n"); 410 E1000G_STAT(tx_ring->stat_no_desc); 411 mutex_exit(&tx_ring->tx_lock); 412 goto tx_send_failed; 413 } 414 415 desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context); 416 417 mutex_exit(&tx_ring->tx_lock); 418 419 ASSERT(desc_count > 0); 420 421 /* Send successful */ 422 return (B_TRUE); 423 424 tx_send_failed: 425 /* Restore mp to original */ 426 if (new_mp) { 427 if (pre_mp) { 428 pre_mp->b_cont = next_mp; 429 } 430 new_mp->b_cont = NULL; 431 freemsg(new_mp); 432 433 next_mp->b_rptr -= hdr_frag_len; 434 } 435 436 /* 437 * Enable Transmit interrupts, so that the interrupt routine can 438 * call mac_tx_update() when transmit descriptors become available. 439 */ 440 tx_ring->resched_timestamp = ddi_get_lbolt(); 441 tx_ring->resched_needed = B_TRUE; 442 if (!Adapter->tx_intr_enable) 443 e1000g_mask_tx_interrupt(Adapter); 444 445 /* Free pending TxSwPackets */ 446 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list); 447 while (packet) { 448 packet->mp = NULL; 449 e1000g_free_tx_swpkt(packet); 450 packet = (p_tx_sw_packet_t) 451 QUEUE_GET_NEXT(&pending_list, &packet->Link); 452 } 453 454 /* Return pending TxSwPackets to the "Free" list */ 455 mutex_enter(&tx_ring->freelist_lock); 456 QUEUE_APPEND(&tx_ring->free_list, &pending_list); 457 mutex_exit(&tx_ring->freelist_lock); 458 459 E1000G_STAT(tx_ring->stat_send_fail); 460 461 /* Message will be scheduled for re-transmit */ 462 return (B_FALSE); 463 464 tx_no_resource: 465 /* 466 * Enable Transmit interrupts, so that the interrupt routine can 467 * call mac_tx_update() when transmit descriptors become available. 468 */ 469 tx_ring->resched_timestamp = ddi_get_lbolt(); 470 tx_ring->resched_needed = B_TRUE; 471 if (!Adapter->tx_intr_enable) 472 e1000g_mask_tx_interrupt(Adapter); 473 474 /* Message will be scheduled for re-transmit */ 475 return (B_FALSE); 476 } 477 478 static boolean_t 479 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context, 480 size_t msg_size) 481 { 482 uintptr_t ip_start; 483 uintptr_t tcp_start; 484 mblk_t *nmp; 485 uint32_t lsoflags; 486 uint32_t mss; 487 488 bzero(cur_context, sizeof (context_data_t)); 489 490 /* first check lso information */ 491 mac_lso_get(mp, &mss, &lsoflags); 492 493 /* retrieve checksum info */ 494 mac_hcksum_get(mp, &cur_context->cksum_start, 495 &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags); 496 /* retrieve ethernet header size */ 497 if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid == 498 htons(ETHERTYPE_VLAN)) 499 cur_context->ether_header_size = 500 sizeof (struct ether_vlan_header); 501 else 502 cur_context->ether_header_size = 503 sizeof (struct ether_header); 504 505 if (lsoflags & HW_LSO) { 506 ASSERT(mss != 0); 507 508 /* free the invalid packet */ 509 if (mss == 0 || 510 !((cur_context->cksum_flags & HCK_PARTIALCKSUM) && 511 (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) { 512 return (B_FALSE); 513 } 514 cur_context->mss = (uint16_t)mss; 515 cur_context->lso_flag = B_TRUE; 516 517 /* 518 * Some fields are cleared for the hardware to fill 519 * in. We don't assume Ethernet header, IP header and 520 * TCP header are always in the same mblk fragment, 521 * while we assume each header is always within one 522 * mblk fragment and Ethernet header is always in the 523 * first mblk fragment. 524 */ 525 nmp = mp; 526 ip_start = (uintptr_t)(nmp->b_rptr) 527 + cur_context->ether_header_size; 528 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 529 ip_start = (uintptr_t)nmp->b_cont->b_rptr 530 + (ip_start - (uintptr_t)(nmp->b_wptr)); 531 nmp = nmp->b_cont; 532 } 533 tcp_start = ip_start + 534 IPH_HDR_LENGTH((ipha_t *)ip_start); 535 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 536 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 537 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 538 nmp = nmp->b_cont; 539 } 540 cur_context->hdr_len = cur_context->ether_header_size 541 + IPH_HDR_LENGTH((ipha_t *)ip_start) 542 + TCP_HDR_LENGTH((tcph_t *)tcp_start); 543 ((ipha_t *)ip_start)->ipha_length = 0; 544 ((ipha_t *)ip_start)->ipha_hdr_checksum = 0; 545 /* calculate the TCP packet payload length */ 546 cur_context->pay_len = msg_size - cur_context->hdr_len; 547 } 548 return (B_TRUE); 549 } 550 551 static boolean_t 552 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context) 553 { 554 boolean_t context_reload; 555 context_data_t *pre_context; 556 struct e1000g *Adapter; 557 558 context_reload = B_FALSE; 559 pre_context = &tx_ring->pre_context; 560 Adapter = tx_ring->adapter; 561 562 /* 563 * The following code determine if the context descriptor is 564 * needed to be reloaded. The sequence of the conditions is 565 * made by their possibilities of changing. 566 */ 567 /* 568 * workaround for 82546EB, context descriptor must be reloaded 569 * per LSO/hw_cksum packet if LSO is enabled. 570 */ 571 if (Adapter->lso_premature_issue && 572 Adapter->lso_enable && 573 (cur_context->cksum_flags != 0)) { 574 575 context_reload = B_TRUE; 576 } else if (cur_context->lso_flag) { 577 if ((cur_context->lso_flag != pre_context->lso_flag) || 578 (cur_context->cksum_flags != pre_context->cksum_flags) || 579 (cur_context->pay_len != pre_context->pay_len) || 580 (cur_context->mss != pre_context->mss) || 581 (cur_context->hdr_len != pre_context->hdr_len) || 582 (cur_context->cksum_stuff != pre_context->cksum_stuff) || 583 (cur_context->cksum_start != pre_context->cksum_start) || 584 (cur_context->ether_header_size != 585 pre_context->ether_header_size)) { 586 587 context_reload = B_TRUE; 588 } 589 } else if (cur_context->cksum_flags != 0) { 590 if ((cur_context->lso_flag != pre_context->lso_flag) || 591 (cur_context->cksum_flags != pre_context->cksum_flags) || 592 (cur_context->cksum_stuff != pre_context->cksum_stuff) || 593 (cur_context->cksum_start != pre_context->cksum_start) || 594 (cur_context->ether_header_size != 595 pre_context->ether_header_size)) { 596 597 context_reload = B_TRUE; 598 } 599 } 600 601 return (context_reload); 602 } 603 604 static int 605 e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list, 606 context_data_t *cur_context) 607 { 608 struct e1000g *Adapter; 609 struct e1000_hw *hw; 610 p_tx_sw_packet_t first_packet; 611 p_tx_sw_packet_t packet; 612 p_tx_sw_packet_t previous_packet; 613 boolean_t context_reload; 614 struct e1000_tx_desc *first_data_desc; 615 struct e1000_tx_desc *next_desc; 616 struct e1000_tx_desc *descriptor; 617 struct e1000_data_desc zeroed; 618 int desc_count; 619 boolean_t buff_overrun_flag; 620 int i; 621 622 Adapter = tx_ring->adapter; 623 hw = &Adapter->shared; 624 625 desc_count = 0; 626 first_packet = NULL; 627 first_data_desc = NULL; 628 descriptor = NULL; 629 first_packet = NULL; 630 packet = NULL; 631 buff_overrun_flag = B_FALSE; 632 zeroed.upper.data = 0; 633 634 next_desc = tx_ring->tbd_next; 635 636 /* Context descriptor reload check */ 637 context_reload = e1000g_check_context(tx_ring, cur_context); 638 639 if (context_reload) { 640 first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list); 641 642 descriptor = next_desc; 643 644 e1000g_fill_context_descriptor(cur_context, 645 (struct e1000_context_desc *)descriptor); 646 647 /* Check the wrap-around case */ 648 if (descriptor == tx_ring->tbd_last) 649 next_desc = tx_ring->tbd_first; 650 else 651 next_desc++; 652 653 desc_count++; 654 } 655 656 first_data_desc = next_desc; 657 658 /* 659 * According to the documentation, the packet options field (POPTS) is 660 * "ignored except on the first data descriptor of a packet." However, 661 * there is a bug in QEMU (638955) whereby the POPTS field within a 662 * given data descriptor is used to interpret that data descriptor -- 663 * regardless of whether or not the descriptor is the first in a packet 664 * or not. For a packet that spans multiple descriptors, the (virtual) 665 * HW checksum (either TCP/UDP or IP or both) will therefore _not_ be 666 * performed on descriptors after the first, resulting in incorrect 667 * checksums and mysteriously dropped/retransmitted packets. Other 668 * drivers do not have this issue because they (harmlessly) set the 669 * POPTS field on every data descriptor to be the intended options for 670 * the entire packet. To circumvent this QEMU bug, we engage in this 671 * same behavior iff the subsystem vendor and device IDs indicate that 672 * this is an emulated QEMU device (1af4,1100). 673 */ 674 if (hw->subsystem_vendor_id == 0x1af4 && 675 hw->subsystem_device_id == 0x1100 && 676 cur_context->cksum_flags) { 677 if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) 678 zeroed.upper.fields.popts |= E1000_TXD_POPTS_IXSM; 679 680 if (cur_context->cksum_flags & HCK_PARTIALCKSUM) 681 zeroed.upper.fields.popts |= E1000_TXD_POPTS_TXSM; 682 } 683 684 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list); 685 while (packet) { 686 ASSERT(packet->num_desc); 687 688 for (i = 0; i < packet->num_desc; i++) { 689 ASSERT(tx_ring->tbd_avail > 0); 690 691 descriptor = next_desc; 692 descriptor->buffer_addr = 693 packet->desc[i].address; 694 descriptor->lower.data = 695 packet->desc[i].length; 696 697 /* Zero out status */ 698 descriptor->upper.data = zeroed.upper.data; 699 700 descriptor->lower.data |= 701 E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; 702 /* must set RS on every outgoing descriptor */ 703 descriptor->lower.data |= 704 E1000_TXD_CMD_RS; 705 706 if (cur_context->lso_flag) 707 descriptor->lower.data |= E1000_TXD_CMD_TSE; 708 709 /* Check the wrap-around case */ 710 if (descriptor == tx_ring->tbd_last) 711 next_desc = tx_ring->tbd_first; 712 else 713 next_desc++; 714 715 desc_count++; 716 717 /* 718 * workaround for 82546EB errata 33, hang in PCI-X 719 * systems due to 2k Buffer Overrun during Transmit 720 * Operation. The workaround applies to all the Intel 721 * PCI-X chips. 722 */ 723 if (hw->bus.type == e1000_bus_type_pcix && 724 descriptor == first_data_desc && 725 ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) 726 > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) { 727 /* modified the first descriptor */ 728 descriptor->lower.data &= 729 ~E1000G_TBD_LENGTH_MASK; 730 descriptor->lower.flags.length = 731 E1000_TX_BUFFER_OEVRRUN_THRESHOLD; 732 733 /* insert a new descriptor */ 734 ASSERT(tx_ring->tbd_avail > 0); 735 next_desc->buffer_addr = 736 packet->desc[0].address + 737 E1000_TX_BUFFER_OEVRRUN_THRESHOLD; 738 next_desc->lower.data = 739 packet->desc[0].length - 740 E1000_TX_BUFFER_OEVRRUN_THRESHOLD; 741 742 /* Zero out status */ 743 next_desc->upper.data = zeroed.upper.data; 744 745 next_desc->lower.data |= 746 E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D; 747 /* must set RS on every outgoing descriptor */ 748 next_desc->lower.data |= 749 E1000_TXD_CMD_RS; 750 751 if (cur_context->lso_flag) 752 next_desc->lower.data |= 753 E1000_TXD_CMD_TSE; 754 755 descriptor = next_desc; 756 757 /* Check the wrap-around case */ 758 if (next_desc == tx_ring->tbd_last) 759 next_desc = tx_ring->tbd_first; 760 else 761 next_desc++; 762 763 desc_count++; 764 buff_overrun_flag = B_TRUE; 765 } 766 } 767 768 if (buff_overrun_flag) { 769 packet->num_desc++; 770 buff_overrun_flag = B_FALSE; 771 } 772 773 if (first_packet != NULL) { 774 /* 775 * Count the checksum context descriptor for 776 * the first SwPacket. 777 */ 778 first_packet->num_desc++; 779 first_packet = NULL; 780 } 781 782 packet->tickstamp = ddi_get_lbolt64(); 783 784 previous_packet = packet; 785 packet = (p_tx_sw_packet_t) 786 QUEUE_GET_NEXT(pending_list, &packet->Link); 787 } 788 789 /* 790 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back 791 */ 792 if (Adapter->lso_premature_issue && cur_context->lso_flag && 793 ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) { 794 /* modified the previous descriptor */ 795 descriptor->lower.data -= 4; 796 797 /* insert a new descriptor */ 798 ASSERT(tx_ring->tbd_avail > 0); 799 /* the lower 20 bits of lower.data is the length field */ 800 next_desc->buffer_addr = 801 descriptor->buffer_addr + 802 (descriptor->lower.data & E1000G_TBD_LENGTH_MASK); 803 next_desc->lower.data = 4; 804 805 /* Zero out status */ 806 next_desc->upper.data = zeroed.upper.data; 807 /* It must be part of a LSO packet */ 808 next_desc->lower.data |= 809 E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D | 810 E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE; 811 812 descriptor = next_desc; 813 814 /* Check the wrap-around case */ 815 if (descriptor == tx_ring->tbd_last) 816 next_desc = tx_ring->tbd_first; 817 else 818 next_desc++; 819 820 desc_count++; 821 /* update the number of descriptors */ 822 previous_packet->num_desc++; 823 } 824 825 ASSERT(descriptor); 826 827 if (cur_context->cksum_flags) { 828 if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) 829 ((struct e1000_data_desc *)first_data_desc)-> 830 upper.fields.popts |= E1000_TXD_POPTS_IXSM; 831 if (cur_context->cksum_flags & HCK_PARTIALCKSUM) 832 ((struct e1000_data_desc *)first_data_desc)-> 833 upper.fields.popts |= E1000_TXD_POPTS_TXSM; 834 } 835 836 /* 837 * Last Descriptor of Packet needs End Of Packet (EOP), Report 838 * Status (RS) set. 839 */ 840 if (Adapter->tx_intr_delay) { 841 descriptor->lower.data |= E1000_TXD_CMD_IDE | 842 E1000_TXD_CMD_EOP; 843 } else { 844 descriptor->lower.data |= E1000_TXD_CMD_EOP; 845 } 846 847 /* Set append Ethernet CRC (IFCS) bits */ 848 if (cur_context->lso_flag) { 849 first_data_desc->lower.data |= E1000_TXD_CMD_IFCS; 850 } else { 851 descriptor->lower.data |= E1000_TXD_CMD_IFCS; 852 } 853 854 /* 855 * Sync the Tx descriptors DMA buffer 856 */ 857 (void) ddi_dma_sync(tx_ring->tbd_dma_handle, 858 0, 0, DDI_DMA_SYNC_FORDEV); 859 860 tx_ring->tbd_next = next_desc; 861 862 /* 863 * Advance the Transmit Descriptor Tail (Tdt), this tells the 864 * FX1000 that this frame is available to transmit. 865 */ 866 if (hw->mac.type == e1000_82547) 867 e1000g_82547_tx_move_tail(tx_ring); 868 else 869 E1000_WRITE_REG(hw, E1000_TDT(0), 870 (uint32_t)(next_desc - tx_ring->tbd_first)); 871 872 if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) { 873 ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED); 874 Adapter->e1000g_state |= E1000G_ERROR; 875 } 876 877 /* Put the pending SwPackets to the "Used" list */ 878 mutex_enter(&tx_ring->usedlist_lock); 879 QUEUE_APPEND(&tx_ring->used_list, pending_list); 880 tx_ring->tbd_avail -= desc_count; 881 mutex_exit(&tx_ring->usedlist_lock); 882 883 /* update LSO related data */ 884 if (context_reload) 885 tx_ring->pre_context = *cur_context; 886 887 return (desc_count); 888 } 889 890 /* 891 * e1000g_tx_setup - setup tx data structures 892 * 893 * This routine initializes all of the transmit related 894 * structures. This includes the Transmit descriptors, 895 * and the tx_sw_packet structures. 896 */ 897 void 898 e1000g_tx_setup(struct e1000g *Adapter) 899 { 900 struct e1000_hw *hw; 901 p_tx_sw_packet_t packet; 902 uint32_t i; 903 uint32_t buf_high; 904 uint32_t buf_low; 905 uint32_t reg_tipg; 906 uint32_t reg_tctl; 907 int size; 908 e1000g_tx_ring_t *tx_ring; 909 910 hw = &Adapter->shared; 911 tx_ring = Adapter->tx_ring; 912 913 /* init the lists */ 914 /* 915 * Here we don't need to protect the lists using the 916 * usedlist_lock and freelist_lock, for they have 917 * been protected by the chip_lock. 918 */ 919 QUEUE_INIT_LIST(&tx_ring->used_list); 920 QUEUE_INIT_LIST(&tx_ring->free_list); 921 922 /* Go through and set up each SW_Packet */ 923 packet = tx_ring->packet_area; 924 for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) { 925 /* Initialize this tx_sw_apcket area */ 926 e1000g_free_tx_swpkt(packet); 927 /* Add this tx_sw_packet to the free list */ 928 QUEUE_PUSH_TAIL(&tx_ring->free_list, 929 &packet->Link); 930 } 931 932 /* Setup TX descriptor pointers */ 933 tx_ring->tbd_next = tx_ring->tbd_first; 934 tx_ring->tbd_oldest = tx_ring->tbd_first; 935 936 /* 937 * Setup Hardware TX Registers 938 */ 939 /* Setup the Transmit Control Register (TCTL). */ 940 reg_tctl = E1000_READ_REG(hw, E1000_TCTL); 941 reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN | 942 (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) | 943 (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) | 944 E1000_TCTL_RTLC; 945 946 /* Enable the MULR bit */ 947 if (hw->bus.type == e1000_bus_type_pci_express) 948 reg_tctl |= E1000_TCTL_MULR; 949 950 E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl); 951 952 /* Setup HW Base and Length of Tx descriptor area */ 953 size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc)); 954 E1000_WRITE_REG(hw, E1000_TDLEN(0), size); 955 size = E1000_READ_REG(hw, E1000_TDLEN(0)); 956 957 buf_low = (uint32_t)tx_ring->tbd_dma_addr; 958 buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32); 959 960 /* 961 * Write the highest location first and work backward to the lowest. 962 * This is necessary for some adapter types to 963 * prevent write combining from occurring. 964 */ 965 E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high); 966 E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low); 967 968 /* Setup our HW Tx Head & Tail descriptor pointers */ 969 E1000_WRITE_REG(hw, E1000_TDH(0), 0); 970 E1000_WRITE_REG(hw, E1000_TDT(0), 0); 971 972 /* Set the default values for the Tx Inter Packet Gap timer */ 973 if ((hw->mac.type == e1000_82542) && 974 ((hw->revision_id == E1000_REVISION_2) || 975 (hw->revision_id == E1000_REVISION_3))) { 976 reg_tipg = DEFAULT_82542_TIPG_IPGT; 977 reg_tipg |= 978 DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT; 979 reg_tipg |= 980 DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT; 981 } else if (hw->mac.type == e1000_80003es2lan) { 982 reg_tipg = DEFAULT_82543_TIPG_IPGR1; 983 reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 << 984 E1000_TIPG_IPGR2_SHIFT; 985 } else { 986 if (hw->phy.media_type == e1000_media_type_fiber) 987 reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER; 988 else 989 reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER; 990 reg_tipg |= 991 DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT; 992 reg_tipg |= 993 DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT; 994 } 995 E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg); 996 997 /* Setup Transmit Interrupt Delay Value */ 998 E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay); 999 E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL, 1000 "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay); 1001 1002 if (hw->mac.type >= e1000_82540) { 1003 E1000_WRITE_REG(&Adapter->shared, E1000_TADV, 1004 Adapter->tx_intr_abs_delay); 1005 E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL, 1006 "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay); 1007 } 1008 1009 tx_ring->tbd_avail = Adapter->tx_desc_num; 1010 1011 /* Initialize stored context information */ 1012 bzero(&(tx_ring->pre_context), sizeof (context_data_t)); 1013 } 1014 1015 /* 1016 * e1000g_recycle - recycle the tx descriptors and tx sw packets 1017 */ 1018 int 1019 e1000g_recycle(e1000g_tx_ring_t *tx_ring) 1020 { 1021 struct e1000g *Adapter; 1022 LIST_DESCRIBER pending_list; 1023 p_tx_sw_packet_t packet; 1024 mblk_t *mp; 1025 mblk_t *nmp; 1026 struct e1000_tx_desc *descriptor; 1027 int desc_count; 1028 int64_t delta; 1029 1030 /* 1031 * This function will examine each TxSwPacket in the 'used' queue 1032 * if the e1000g is done with it then the associated resources (Tx 1033 * Descriptors) will be "freed" and the TxSwPacket will be 1034 * returned to the 'free' queue. 1035 */ 1036 Adapter = tx_ring->adapter; 1037 delta = 0; 1038 1039 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list); 1040 if (packet == NULL) { 1041 Adapter->stall_flag = B_FALSE; 1042 return (0); 1043 } 1044 1045 desc_count = 0; 1046 QUEUE_INIT_LIST(&pending_list); 1047 1048 /* Sync the Tx descriptor DMA buffer */ 1049 (void) ddi_dma_sync(tx_ring->tbd_dma_handle, 1050 0, 0, DDI_DMA_SYNC_FORKERNEL); 1051 if (e1000g_check_dma_handle( 1052 tx_ring->tbd_dma_handle) != DDI_FM_OK) { 1053 ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED); 1054 Adapter->e1000g_state |= E1000G_ERROR; 1055 return (0); 1056 } 1057 1058 /* 1059 * While there are still TxSwPackets in the used queue check them 1060 */ 1061 mutex_enter(&tx_ring->usedlist_lock); 1062 while ((packet = 1063 (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) { 1064 1065 /* 1066 * Get hold of the next descriptor that the e1000g will 1067 * report status back to (this will be the last descriptor 1068 * of a given sw packet). We only want to free the 1069 * sw packet (and it resources) if the e1000g is done 1070 * with ALL of the descriptors. If the e1000g is done 1071 * with the last one then it is done with all of them. 1072 */ 1073 ASSERT(packet->num_desc); 1074 descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1); 1075 1076 /* Check for wrap case */ 1077 if (descriptor > tx_ring->tbd_last) 1078 descriptor -= Adapter->tx_desc_num; 1079 1080 /* 1081 * If the descriptor done bit is set free TxSwPacket and 1082 * associated resources 1083 */ 1084 if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) { 1085 QUEUE_POP_HEAD(&tx_ring->used_list); 1086 QUEUE_PUSH_TAIL(&pending_list, &packet->Link); 1087 1088 if (descriptor == tx_ring->tbd_last) 1089 tx_ring->tbd_oldest = 1090 tx_ring->tbd_first; 1091 else 1092 tx_ring->tbd_oldest = 1093 descriptor + 1; 1094 1095 desc_count += packet->num_desc; 1096 } else { 1097 /* 1098 * Found a sw packet that the e1000g is not done 1099 * with then there is no reason to check the rest 1100 * of the queue. 1101 */ 1102 delta = ddi_get_lbolt64() - packet->tickstamp; 1103 break; 1104 } 1105 } 1106 1107 tx_ring->tbd_avail += desc_count; 1108 Adapter->tx_pkt_cnt += desc_count; 1109 1110 mutex_exit(&tx_ring->usedlist_lock); 1111 1112 if (desc_count == 0) { 1113 E1000G_DEBUG_STAT(tx_ring->stat_recycle_none); 1114 /* 1115 * If the packet hasn't been sent out for seconds and 1116 * the transmitter is not under paused flowctrl condition, 1117 * the transmitter is considered to be stalled. 1118 */ 1119 if ((delta > Adapter->stall_threshold) && 1120 !(E1000_READ_REG(&Adapter->shared, 1121 E1000_STATUS) & E1000_STATUS_TXOFF)) { 1122 Adapter->stall_flag = B_TRUE; 1123 } 1124 return (0); 1125 } 1126 1127 Adapter->stall_flag = B_FALSE; 1128 1129 mp = NULL; 1130 nmp = NULL; 1131 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list); 1132 ASSERT(packet != NULL); 1133 while (packet != NULL) { 1134 if (packet->mp != NULL) { 1135 ASSERT(packet->mp->b_next == NULL); 1136 /* Assemble the message chain */ 1137 if (mp == NULL) { 1138 mp = packet->mp; 1139 nmp = packet->mp; 1140 } else { 1141 nmp->b_next = packet->mp; 1142 nmp = packet->mp; 1143 } 1144 /* Disconnect the message from the sw packet */ 1145 packet->mp = NULL; 1146 } 1147 1148 /* Free the TxSwPackets */ 1149 e1000g_free_tx_swpkt(packet); 1150 1151 packet = (p_tx_sw_packet_t) 1152 QUEUE_GET_NEXT(&pending_list, &packet->Link); 1153 } 1154 1155 /* Return the TxSwPackets back to the FreeList */ 1156 mutex_enter(&tx_ring->freelist_lock); 1157 QUEUE_APPEND(&tx_ring->free_list, &pending_list); 1158 mutex_exit(&tx_ring->freelist_lock); 1159 1160 if (mp != NULL) 1161 freemsgchain(mp); 1162 1163 return (desc_count); 1164 } 1165 /* 1166 * 82544 Coexistence issue workaround: 1167 * There are 2 issues. 1168 * 1. If a 32 bit split completion happens from P64H2 and another 1169 * agent drives a 64 bit request/split completion after ONLY 1170 * 1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then 1171 * 82544 has a problem where in to clock all the data in, it 1172 * looks at REQ64# signal and since it has changed so fast (i.e. 1 1173 * idle clock turn around), it will fail to clock all the data in. 1174 * Data coming from certain ending addresses has exposure to this issue. 1175 * 1176 * To detect this issue, following equation can be used... 1177 * SIZE[3:0] + ADDR[2:0] = SUM[3:0]. 1178 * If SUM[3:0] is in between 1 to 4, we will have this issue. 1179 * 1180 * ROOT CAUSE: 1181 * The erratum involves the 82544 PCIX elasticity FIFO implementations as 1182 * 64-bit FIFO's and flushing of the final partial-bytes corresponding 1183 * to the end of a requested read burst. Under a specific burst condition 1184 * of ending-data alignment and 32-byte split-completions, the final 1185 * byte(s) of split-completion data require an extra clock cycle to flush 1186 * into 64-bit FIFO orientation. An incorrect logic dependency on the 1187 * REQ64# signal occurring during during this clock cycle may cause the 1188 * residual byte(s) to be lost, thereby rendering the internal DMA client 1189 * forever awaiting the final byte(s) for an outbound data-fetch. The 1190 * erratum is confirmed to *only* occur if certain subsequent external 1191 * 64-bit PCIX bus transactions occur immediately (minimum possible bus 1192 * turn- around) following the odd-aligned 32-bit split-completion 1193 * containing the final byte(s). Intel has confirmed that this has been 1194 * seen only with chipset/bridges which have the capability to provide 1195 * 32-bit split-completion data, and in the presence of newer PCIX bus 1196 * agents which fully-optimize the inter-transaction turn-around (zero 1197 * additional initiator latency when pre-granted bus ownership). 1198 * 1199 * This issue does not exist in PCI bus mode, when any agent is operating 1200 * in 32 bit only mode or on chipsets that do not do 32 bit split 1201 * completions for 64 bit read requests (Serverworks chipsets). P64H2 does 1202 * 32 bit split completions for any read request that has bit 2 set to 1 1203 * for the requested address and read request size is more than 8 bytes. 1204 * 1205 * 2. Another issue is related to 82544 driving DACs under the similar 1206 * scenario (32 bit split completion followed by 64 bit transaction with 1207 * only 1 cycle turnaround). This issue is still being root caused. We 1208 * think that both of these issues can be avoided if following workaround 1209 * is implemented. It seems DAC issues is related to ending addresses being 1210 * 0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity 1211 * FIFO which does not get flushed due to REQ64# dependency. We will only 1212 * know the full story after it has been simulated successfully by HW team. 1213 * 1214 * WORKAROUND: 1215 * Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC) 1216 */ 1217 static uint32_t 1218 e1000g_fill_82544_desc(uint64_t address, 1219 size_t length, p_desc_array_t desc_array) 1220 { 1221 /* 1222 * Since issue is sensitive to length and address. 1223 * Let us first check the address... 1224 */ 1225 uint32_t safe_terminator; 1226 1227 if (length <= 4) { 1228 desc_array->descriptor[0].address = address; 1229 desc_array->descriptor[0].length = (uint32_t)length; 1230 desc_array->elements = 1; 1231 return (desc_array->elements); 1232 } 1233 safe_terminator = 1234 (uint32_t)((((uint32_t)address & 0x7) + 1235 (length & 0xF)) & 0xF); 1236 /* 1237 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then 1238 * return 1239 */ 1240 if (safe_terminator == 0 || 1241 (safe_terminator > 4 && safe_terminator < 9) || 1242 (safe_terminator > 0xC && safe_terminator <= 0xF)) { 1243 desc_array->descriptor[0].address = address; 1244 desc_array->descriptor[0].length = (uint32_t)length; 1245 desc_array->elements = 1; 1246 return (desc_array->elements); 1247 } 1248 1249 desc_array->descriptor[0].address = address; 1250 desc_array->descriptor[0].length = length - 4; 1251 desc_array->descriptor[1].address = address + (length - 4); 1252 desc_array->descriptor[1].length = 4; 1253 desc_array->elements = 2; 1254 return (desc_array->elements); 1255 } 1256 1257 static int 1258 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, 1259 mblk_t *mp, boolean_t tx_undersize_flag) 1260 { 1261 size_t len; 1262 size_t len1; 1263 dma_buffer_t *tx_buf; 1264 mblk_t *nmp; 1265 boolean_t finished; 1266 int desc_count; 1267 1268 desc_count = 0; 1269 tx_buf = packet->tx_buf; 1270 len = MBLKL(mp); 1271 1272 ASSERT((tx_buf->len + len) <= tx_buf->size); 1273 1274 if (len > 0) { 1275 bcopy(mp->b_rptr, 1276 tx_buf->address + tx_buf->len, 1277 len); 1278 tx_buf->len += len; 1279 1280 packet->num_mblk_frag++; 1281 } 1282 1283 nmp = mp->b_cont; 1284 if (nmp == NULL) { 1285 finished = B_TRUE; 1286 } else { 1287 len1 = MBLKL(nmp); 1288 if ((tx_buf->len + len1) > tx_buf->size) 1289 finished = B_TRUE; 1290 else if (tx_undersize_flag) 1291 finished = B_FALSE; 1292 else if (len1 > tx_ring->adapter->tx_bcopy_thresh) 1293 finished = B_TRUE; 1294 else 1295 finished = B_FALSE; 1296 } 1297 1298 if (finished) { 1299 E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy, 1300 (tx_buf->len > len)); 1301 1302 /* 1303 * If the packet is smaller than 64 bytes, which is the 1304 * minimum ethernet packet size, pad the packet to make 1305 * it at least 60 bytes. The hardware will add 4 bytes 1306 * for CRC. 1307 */ 1308 if (tx_undersize_flag) { 1309 ASSERT(tx_buf->len < ETHERMIN); 1310 1311 bzero(tx_buf->address + tx_buf->len, 1312 ETHERMIN - tx_buf->len); 1313 tx_buf->len = ETHERMIN; 1314 } 1315 1316 #ifdef __sparc 1317 if (packet->dma_type == USE_DVMA) 1318 dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV); 1319 else 1320 (void) ddi_dma_sync(tx_buf->dma_handle, 0, 1321 tx_buf->len, DDI_DMA_SYNC_FORDEV); 1322 #else 1323 (void) ddi_dma_sync(tx_buf->dma_handle, 0, 1324 tx_buf->len, DDI_DMA_SYNC_FORDEV); 1325 #endif 1326 1327 packet->data_transfer_type = USE_BCOPY; 1328 1329 desc_count = e1000g_fill_tx_desc(tx_ring, 1330 packet, 1331 tx_buf->dma_address, 1332 tx_buf->len); 1333 1334 if (desc_count <= 0) 1335 return (-1); 1336 } 1337 1338 return (desc_count); 1339 } 1340 1341 static int 1342 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp) 1343 { 1344 int j; 1345 int mystat; 1346 size_t len; 1347 ddi_dma_cookie_t dma_cookie; 1348 uint_t ncookies; 1349 int desc_count; 1350 uint32_t desc_total; 1351 1352 desc_total = 0; 1353 len = MBLKL(mp); 1354 1355 /* 1356 * ddi_dma_addr_bind_handle() allocates DMA resources for a 1357 * memory object such that a device can perform DMA to or from 1358 * the object. DMA resources are allocated considering the 1359 * device's DMA attributes as expressed by ddi_dma_attr(9S) 1360 * (see ddi_dma_alloc_handle(9F)). 1361 * 1362 * ddi_dma_addr_bind_handle() fills in the first DMA cookie 1363 * pointed to by cookiep with the appropriate address, length, 1364 * and bus type. *ccountp is set to the number of DMA cookies 1365 * representing this DMA object. Subsequent DMA cookies must be 1366 * retrieved by calling ddi_dma_nextcookie(9F) the number of 1367 * times specified by *countp - 1. 1368 */ 1369 switch (packet->dma_type) { 1370 #ifdef __sparc 1371 case USE_DVMA: 1372 dvma_kaddr_load(packet->tx_dma_handle, 1373 (caddr_t)mp->b_rptr, len, 0, &dma_cookie); 1374 1375 dvma_sync(packet->tx_dma_handle, 0, 1376 DDI_DMA_SYNC_FORDEV); 1377 1378 ncookies = 1; 1379 packet->data_transfer_type = USE_DVMA; 1380 break; 1381 #endif 1382 case USE_DMA: 1383 if ((mystat = ddi_dma_addr_bind_handle( 1384 packet->tx_dma_handle, NULL, 1385 (caddr_t)mp->b_rptr, len, 1386 DDI_DMA_WRITE | DDI_DMA_STREAMING, 1387 DDI_DMA_DONTWAIT, 0, &dma_cookie, 1388 &ncookies)) != DDI_DMA_MAPPED) { 1389 1390 e1000g_log(tx_ring->adapter, CE_WARN, 1391 "Couldn't bind mblk buffer to Tx DMA handle: " 1392 "return: %X, Pkt: %X\n", 1393 mystat, packet); 1394 return (-1); 1395 } 1396 1397 /* 1398 * An implicit ddi_dma_sync() is done when the 1399 * ddi_dma_addr_bind_handle() is called. So we 1400 * don't need to explicitly call ddi_dma_sync() 1401 * here any more. 1402 */ 1403 ASSERT(ncookies); 1404 E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie, 1405 (ncookies > 1)); 1406 1407 /* 1408 * The data_transfer_type value must be set after the handle 1409 * has been bound, for it will be used in e1000g_free_tx_swpkt() 1410 * to decide whether we need to unbind the handle. 1411 */ 1412 packet->data_transfer_type = USE_DMA; 1413 break; 1414 default: 1415 ASSERT(B_FALSE); 1416 break; 1417 } 1418 1419 packet->num_mblk_frag++; 1420 1421 /* 1422 * Each address could span thru multpile cookie.. 1423 * Each cookie will have one descriptor 1424 */ 1425 for (j = ncookies; j != 0; j--) { 1426 1427 desc_count = e1000g_fill_tx_desc(tx_ring, 1428 packet, 1429 dma_cookie.dmac_laddress, 1430 dma_cookie.dmac_size); 1431 1432 if (desc_count <= 0) 1433 return (-1); 1434 1435 desc_total += desc_count; 1436 1437 /* 1438 * ddi_dma_nextcookie() retrieves subsequent DMA 1439 * cookies for a DMA object. 1440 * ddi_dma_nextcookie() fills in the 1441 * ddi_dma_cookie(9S) structure pointed to by 1442 * cookiep. The ddi_dma_cookie(9S) structure 1443 * must be allocated prior to calling 1444 * ddi_dma_nextcookie(). The DMA cookie count 1445 * returned by ddi_dma_buf_bind_handle(9F), 1446 * ddi_dma_addr_bind_handle(9F), or 1447 * ddi_dma_getwin(9F) indicates the number of DMA 1448 * cookies a DMA object consists of. If the 1449 * resulting cookie count, N, is larger than 1, 1450 * ddi_dma_nextcookie() must be called N-1 times 1451 * to retrieve all DMA cookies. 1452 */ 1453 if (j > 1) { 1454 ddi_dma_nextcookie(packet->tx_dma_handle, 1455 &dma_cookie); 1456 } 1457 } 1458 1459 return (desc_total); 1460 } 1461 1462 static void 1463 e1000g_fill_context_descriptor(context_data_t *cur_context, 1464 struct e1000_context_desc *context_desc) 1465 { 1466 if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) { 1467 context_desc->lower_setup.ip_fields.ipcss = 1468 cur_context->ether_header_size; 1469 context_desc->lower_setup.ip_fields.ipcso = 1470 cur_context->ether_header_size + 1471 offsetof(struct ip, ip_sum); 1472 context_desc->lower_setup.ip_fields.ipcse = 1473 cur_context->ether_header_size + 1474 cur_context->cksum_start - 1; 1475 } else 1476 context_desc->lower_setup.ip_config = 0; 1477 1478 if (cur_context->cksum_flags & HCK_PARTIALCKSUM) { 1479 /* 1480 * The packet with same protocol has the following 1481 * stuff and start offset: 1482 * | Protocol | Stuff | Start | Checksum 1483 * | | Offset | Offset | Enable 1484 * | IPv4 + TCP | 0x24 | 0x14 | Yes 1485 * | IPv4 + UDP | 0x1A | 0x14 | Yes 1486 * | IPv6 + TCP | 0x20 | 0x10 | No 1487 * | IPv6 + UDP | 0x14 | 0x10 | No 1488 */ 1489 context_desc->upper_setup.tcp_fields.tucss = 1490 cur_context->cksum_start + cur_context->ether_header_size; 1491 context_desc->upper_setup.tcp_fields.tucso = 1492 cur_context->cksum_stuff + cur_context->ether_header_size; 1493 context_desc->upper_setup.tcp_fields.tucse = 0; 1494 } else 1495 context_desc->upper_setup.tcp_config = 0; 1496 1497 if (cur_context->lso_flag) { 1498 context_desc->tcp_seg_setup.fields.mss = cur_context->mss; 1499 context_desc->tcp_seg_setup.fields.hdr_len = 1500 cur_context->hdr_len; 1501 /* 1502 * workaround for 82546EB errata 23, status-writeback 1503 * reporting (RS) should not be set on context or 1504 * Null descriptors 1505 */ 1506 context_desc->cmd_and_length = E1000_TXD_CMD_DEXT 1507 | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP 1508 | E1000_TXD_DTYP_C | cur_context->pay_len; 1509 } else { 1510 context_desc->cmd_and_length = E1000_TXD_CMD_DEXT 1511 | E1000_TXD_DTYP_C; 1512 /* 1513 * Zero out the options for TCP Segmentation Offload 1514 */ 1515 context_desc->tcp_seg_setup.data = 0; 1516 } 1517 } 1518 1519 static int 1520 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring, 1521 p_tx_sw_packet_t packet, uint64_t address, size_t size) 1522 { 1523 struct e1000_hw *hw = &tx_ring->adapter->shared; 1524 p_sw_desc_t desc; 1525 1526 if (hw->mac.type == e1000_82544) { 1527 if (hw->bus.type == e1000_bus_type_pcix) 1528 return (e1000g_tx_workaround_PCIX_82544(packet, 1529 address, size)); 1530 1531 if (size > JUMBO_FRAG_LENGTH) 1532 return (e1000g_tx_workaround_jumbo_82544(packet, 1533 address, size)); 1534 } 1535 1536 ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET); 1537 1538 desc = &packet->desc[packet->num_desc]; 1539 desc->address = address; 1540 desc->length = (uint32_t)size; 1541 1542 packet->num_desc++; 1543 1544 return (1); 1545 } 1546 1547 static int 1548 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet, 1549 uint64_t address, size_t size) 1550 { 1551 p_sw_desc_t desc; 1552 int desc_count; 1553 long size_left; 1554 size_t len; 1555 uint32_t counter; 1556 uint32_t array_elements; 1557 desc_array_t desc_array; 1558 1559 /* 1560 * Coexist Workaround for cordova: RP: 07/04/03 1561 * 1562 * RP: ERRATA: Workaround ISSUE: 1563 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup 1564 * Eachbuffer in to 8kb pieces until the 1565 * remainder is < 8kb 1566 */ 1567 size_left = size; 1568 desc_count = 0; 1569 1570 while (size_left > 0) { 1571 if (size_left > MAX_TX_BUF_SIZE) 1572 len = MAX_TX_BUF_SIZE; 1573 else 1574 len = size_left; 1575 1576 array_elements = e1000g_fill_82544_desc(address, 1577 len, &desc_array); 1578 1579 for (counter = 0; counter < array_elements; counter++) { 1580 ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET); 1581 /* 1582 * Put in the buffer address 1583 */ 1584 desc = &packet->desc[packet->num_desc]; 1585 1586 desc->address = 1587 desc_array.descriptor[counter].address; 1588 desc->length = 1589 desc_array.descriptor[counter].length; 1590 1591 packet->num_desc++; 1592 desc_count++; 1593 } /* for */ 1594 1595 /* 1596 * Update the buffer address and length 1597 */ 1598 address += MAX_TX_BUF_SIZE; 1599 size_left -= MAX_TX_BUF_SIZE; 1600 } /* while */ 1601 1602 return (desc_count); 1603 } 1604 1605 static int 1606 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet, 1607 uint64_t address, size_t size) 1608 { 1609 p_sw_desc_t desc; 1610 int desc_count; 1611 long size_left; 1612 uint32_t offset; 1613 1614 /* 1615 * Workaround for Jumbo Frames on Cordova 1616 * PSD 06/01/2001 1617 */ 1618 size_left = size; 1619 desc_count = 0; 1620 offset = 0; 1621 while (size_left > 0) { 1622 ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET); 1623 1624 desc = &packet->desc[packet->num_desc]; 1625 1626 desc->address = address + offset; 1627 1628 if (size_left > JUMBO_FRAG_LENGTH) 1629 desc->length = JUMBO_FRAG_LENGTH; 1630 else 1631 desc->length = (uint32_t)size_left; 1632 1633 packet->num_desc++; 1634 desc_count++; 1635 1636 offset += desc->length; 1637 size_left -= JUMBO_FRAG_LENGTH; 1638 } 1639 1640 return (desc_count); 1641 } 1642 1643 #pragma inline(e1000g_82547_tx_move_tail_work) 1644 1645 static void 1646 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring) 1647 { 1648 struct e1000_hw *hw; 1649 uint16_t hw_tdt; 1650 uint16_t sw_tdt; 1651 struct e1000_tx_desc *tx_desc; 1652 uint16_t length = 0; 1653 boolean_t eop = B_FALSE; 1654 struct e1000g *Adapter; 1655 1656 Adapter = tx_ring->adapter; 1657 hw = &Adapter->shared; 1658 1659 hw_tdt = E1000_READ_REG(hw, E1000_TDT(0)); 1660 sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first; 1661 1662 while (hw_tdt != sw_tdt) { 1663 tx_desc = &(tx_ring->tbd_first[hw_tdt]); 1664 length += tx_desc->lower.flags.length; 1665 eop = tx_desc->lower.data & E1000_TXD_CMD_EOP; 1666 if (++hw_tdt == Adapter->tx_desc_num) 1667 hw_tdt = 0; 1668 1669 if (eop) { 1670 if ((Adapter->link_duplex == HALF_DUPLEX) && 1671 (e1000_fifo_workaround_82547(hw, length) 1672 != E1000_SUCCESS)) { 1673 if (tx_ring->timer_enable_82547) { 1674 ASSERT(tx_ring->timer_id_82547 == 0); 1675 tx_ring->timer_id_82547 = 1676 timeout(e1000g_82547_timeout, 1677 (void *)tx_ring, 1678 drv_usectohz(10000)); 1679 } 1680 return; 1681 1682 } else { 1683 E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt); 1684 e1000_update_tx_fifo_head_82547(hw, length); 1685 length = 0; 1686 } 1687 } 1688 } 1689 } 1690 1691 static void 1692 e1000g_82547_timeout(void *arg) 1693 { 1694 e1000g_tx_ring_t *tx_ring; 1695 1696 tx_ring = (e1000g_tx_ring_t *)arg; 1697 1698 mutex_enter(&tx_ring->tx_lock); 1699 1700 tx_ring->timer_id_82547 = 0; 1701 e1000g_82547_tx_move_tail_work(tx_ring); 1702 1703 mutex_exit(&tx_ring->tx_lock); 1704 } 1705 1706 static void 1707 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring) 1708 { 1709 timeout_id_t tid; 1710 1711 ASSERT(MUTEX_HELD(&tx_ring->tx_lock)); 1712 1713 tid = tx_ring->timer_id_82547; 1714 tx_ring->timer_id_82547 = 0; 1715 if (tid != 0) { 1716 tx_ring->timer_enable_82547 = B_FALSE; 1717 mutex_exit(&tx_ring->tx_lock); 1718 1719 (void) untimeout(tid); 1720 1721 mutex_enter(&tx_ring->tx_lock); 1722 } 1723 tx_ring->timer_enable_82547 = B_TRUE; 1724 e1000g_82547_tx_move_tail_work(tx_ring); 1725 } 1726