/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * igc ring related functions. This is where the bulk of our I/O occurs.
 */

#include <sys/stddef.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/sdt.h>

#include "igc.h"

/*
 * Structure used to consolidate TX information about a given packet.
 */
typedef struct igc_tx_state {
	list_t itx_bufs;
	mac_ether_offload_info_t itx_meoi;
	uint32_t itx_cksum;
	uint32_t itx_mss;
	uint32_t itx_lso;
	igc_tx_buffer_t *itx_cur_buf;
	size_t itx_buf_rem;
	mblk_t *itx_free_mp;
	uint32_t itx_ndescs;
} igc_tx_state_t;

/*
 * DMA attributes that are used for descriptor rings.
 */
static const ddi_dma_attr_t igc_desc_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	/*
	 * DMA descriptor rings can show up anywhere in the address space. The
	 * card supports a 64-bit address for this.
	 */
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = UINT64_MAX,
	/*
	 * The I210 datasheet says that the ring descriptor length can support
	 * at most 32K entries that are each 16 bytes long. Hence the following
	 * max.
	 */
	.dma_attr_count_max = 0x80000,
	/*
	 * The I210 datasheet, which is the closest we have for the I225,
	 * requires 128 byte alignment for rings. Note, igb and e1000g default
	 * to a 4KiB alignment here.
	 */
	.dma_attr_align = 0x80,
	/*
	 * Borrowed from igb(4D).
	 */
	.dma_attr_burstsizes = 0xfff,
	/*
	 * We set the minimum and maximum based upon what the RDLEN/TDLEN
	 * register will actually support.
	 */
	.dma_attr_minxfer = 0x80,
	.dma_attr_maxxfer = 0x80000,
	/*
	 * The descriptor ring must be contiguous, which is enforced by the
	 * sgllen value of 1 below, and therefore it has no boundary crossing
	 * constraints.
	 */
	.dma_attr_seg = UINT64_MAX,
	.dma_attr_sgllen = 1,
	/*
	 * For descriptor rings, hardware asks for the size in 128 byte chunks,
	 * so we set that here again.
	 */
	.dma_attr_granular = 0x80,
	.dma_attr_flags = 0
};

/*
 * DMA attributes that cover pre-allocated data buffers. Note, RX buffers are
 * slightly more constrained than TX buffers because RX buffer addr[0] can
 * sometimes be used as a no snoop enable bit. Therefore we purposefully avoid
 * using that bit in our allocations here to allow for its use in the future if
 * desired.
 */
static const ddi_dma_attr_t igc_data_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	/*
	 * Packet data can go anywhere in memory.
	 */
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = UINT64_MAX,
	/*
	 * The maximum size of an RX packet is 127 KiB in the SRRCTL register.
	 * For TX, the maximum value is a 16-bit quantity because that's the
	 * size of the length field in a TX descriptor. So we cap things at
	 * this value.
	 */
	.dma_attr_count_max = UINT16_MAX,
	/*
	 * The hardware strictly requires only 2 byte alignment for RX
	 * buffers, in case no snoop is enabled, and places no such constraint
	 * on TX. We increase this to a 16 byte alignment request so that we
	 * can guarantee the IP header alignment and offsetting that needs to
	 * happen on all RX buffers.
	 */
	.dma_attr_align = 0x10,
	/*
	 * We're not constrained here, at least via PCIe, so we use the wider
	 * setting here. Similarly to the ring descriptors, we just set the
	 * granularity widely.
	 */
	.dma_attr_minxfer = 0x1,
	.dma_attr_maxxfer = UINT32_MAX,
	.dma_attr_seg = UINT64_MAX,
	/*
	 * The hardware allows for arbitrary chaining of descriptors; however,
	 * we want to move to a world where we are allocating at most page
	 * sized buffers and therefore constrain the number of cookies for
	 * these buffers. Transmit caps the buffer allocation size at the page
	 * size, but receive does not today. We set the granularity to 1 to
	 * reflect the device's flexibility.
	 */
	.dma_attr_sgllen = 1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};

/*
 * These are the DMA attributes we use when performing DMA TX binding for an
 * mblk_t.
 */
static const ddi_dma_attr_t igc_tx_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	/*
	 * Packet data can go anywhere in memory.
	 */
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = UINT64_MAX,
	/*
	 * For TX, the maximum value is a 16-bit quantity because that's the
	 * size of the length field in a TX descriptor.
	 */
	.dma_attr_count_max = UINT16_MAX,
	/*
	 * TX data can go anywhere, but we ask for 16 byte alignment just to
	 * keep things somewhat aligned in the system.
	 */
	.dma_attr_align = 0x10,
	/*
	 * We're not constrained here, at least via PCIe, so we use the wider
	 * setting here. Similarly to the ring descriptors, we just set the
	 * granularity widely.
	 */
	.dma_attr_minxfer = 0x1,
	.dma_attr_maxxfer = UINT32_MAX,
	.dma_attr_seg = UINT64_MAX,
	/*
	 * We size our transmit cookies so that the maximum sized LSO packet
	 * can go through here.
	 */
	.dma_attr_sgllen = IGC_MAX_TX_COOKIES,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};

/*
 * All of these wrappers exist so that we only have one place to tack FMA into
 * these accesses in the future.
 */
static void
igc_dma_acc_attr(igc_t *igc, ddi_device_acc_attr_t *accp)
{
	bzero(accp, sizeof (ddi_device_acc_attr_t));

	accp->devacc_attr_version = DDI_DEVICE_ATTR_V1;
	accp->devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
	accp->devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	accp->devacc_attr_access = DDI_DEFAULT_ACC;
}

static void
igc_dma_desc_attr(igc_t *igc, ddi_dma_attr_t *attrp)
{
	bcopy(&igc_desc_dma_attr, attrp, sizeof (ddi_dma_attr_t));
}

static void
igc_dma_data_attr(igc_t *igc, ddi_dma_attr_t *attrp)
{
	bcopy(&igc_data_dma_attr, attrp, sizeof (ddi_dma_attr_t));
}

static void
igc_dma_tx_attr(igc_t *igc, ddi_dma_attr_t *attrp)
{
	bcopy(&igc_tx_dma_attr, attrp, sizeof (ddi_dma_attr_t));
}

static void
igc_dma_free(igc_dma_buffer_t *idb)
{
	/* A non-zero size is our proxy for the DMA handle being bound */
	if (idb->idb_size != 0) {
		(void) ddi_dma_unbind_handle(idb->idb_hdl);
		idb->idb_size = 0;
	}

	if (idb->idb_acc != NULL) {
		ddi_dma_mem_free(&idb->idb_acc);
		idb->idb_acc = NULL;
		idb->idb_va = NULL;
		idb->idb_alloc_len = 0;
	}

	if (idb->idb_hdl != NULL) {
		ddi_dma_free_handle(&idb->idb_hdl);
		idb->idb_hdl = NULL;
	}

	ASSERT0(idb->idb_size);
	ASSERT0(idb->idb_alloc_len);
	ASSERT3P(idb->idb_acc, ==, NULL);
	ASSERT3P(idb->idb_hdl, ==, NULL);
	ASSERT3P(idb->idb_va, ==, NULL);
}

static bool
igc_dma_alloc(igc_t *igc, igc_dma_buffer_t *idb, ddi_dma_attr_t *attrp,
    size_t size)
{
	int ret;
	ddi_device_acc_attr_t acc;
	uint_t flags = DDI_DMA_STREAMING;

	bzero(idb, sizeof (igc_dma_buffer_t));
	ret = ddi_dma_alloc_handle(igc->igc_dip, attrp, DDI_DMA_DONTWAIT, NULL,
	    &idb->idb_hdl);
	if (ret != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "!failed to allocate DMA "
		    "handle: %d", ret);
		return (false);
	}

	igc_dma_acc_attr(igc, &acc);
	ret = ddi_dma_mem_alloc(idb->idb_hdl, size, &acc, flags,
	    DDI_DMA_DONTWAIT, NULL, &idb->idb_va, &idb->idb_alloc_len,
	    &idb->idb_acc);
	if (ret != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "!failed to allocate %lu bytes "
		    "of DMA memory: %d", size, ret);
		igc_dma_free(idb);
		return (false);
	}

	bzero(idb->idb_va, idb->idb_alloc_len);
	ret = ddi_dma_addr_bind_handle(idb->idb_hdl, NULL, idb->idb_va,
	    idb->idb_alloc_len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, NULL,
	    NULL, NULL);
	if (ret != DDI_SUCCESS) {
		dev_err(igc->igc_dip, CE_WARN, "!failed to bind %lu bytes of "
		    "DMA memory: %d", idb->idb_alloc_len, ret);
		igc_dma_free(idb);
		return (false);
	}

	idb->idb_size = size;
	return (true);
}

static void
igc_rx_recycle(caddr_t arg)
{
	igc_rx_buffer_t *buf = (igc_rx_buffer_t *)arg;
	igc_rx_ring_t *ring = buf->irb_ring;
	caddr_t mblk_va;
	size_t mblk_len;

	/*
	 * The mblk is free regardless of what happens next, so make sure we
	 * clean up.
	 */
	buf->irb_mp = NULL;

	/*
	 * The mblk_t is pre-created ahead of binding. If loaned is not set,
	 * then we got here because the buffer is being torn down as part of
	 * tearing down the device, as opposed to getting the mblk back from
	 * the rest of the stack, and therefore there's nothing else to do.
	 */
	if (!buf->irb_loaned) {
		return;
	}

	/*
	 * Ensure we mark this buffer as no longer loaned and then insert it
	 * onto the free list.
	 */
	buf->irb_loaned = false;

	/*
	 * Pre-create a new mblk for this buffer before it goes back on the
	 * free list.
	 */
	mblk_va = buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN;
	mblk_len = buf->irb_dma.idb_size - IGC_RX_BUF_IP_ALIGN;
	buf->irb_mp = desballoc((uchar_t *)mblk_va, mblk_len, 0,
	    &buf->irb_free_rtn);

	mutex_enter(&ring->irr_free_lock);
	ring->irr_free_list[ring->irr_nfree] = buf;
	ring->irr_nfree++;
#ifdef DEBUG
	igc_t *igc = ring->irr_igc;
	ASSERT3U(ring->irr_nfree, <=, igc->igc_rx_nfree);
#endif
	cv_signal(&ring->irr_free_cv);
	mutex_exit(&ring->irr_free_lock);
}
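
/*
 * A rough sketch of the RX buffer lifecycle as implemented below: at
 * allocation time the first igc_rx_ndesc buffers are placed on the ring's
 * work list (one per descriptor) and the remainder on the free list. When a
 * received frame is bound rather than copied, its buffer is marked as loaned,
 * its mblk is passed up the stack, and a buffer from the free list takes its
 * place on the work list. When the stack eventually frees that mblk,
 * igc_rx_recycle() above pre-allocates a replacement mblk and returns the
 * buffer to the free list, waking anyone waiting in igc_rx_drain().
 */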

static void
igc_rx_bufs_free(igc_t *igc, igc_rx_ring_t *ring)
{
	for (uint32_t i = 0; i < igc->igc_rx_nbuf; i++) {
		igc_rx_buffer_t *buf = &ring->irr_arena[i];

		ASSERT3U(buf->irb_loaned, ==, false);
		freemsg(buf->irb_mp);
		buf->irb_mp = NULL;
		igc_dma_free(&buf->irb_dma);
	}
}

static bool
igc_rx_bufs_alloc(igc_t *igc, igc_rx_ring_t *ring)
{
	for (uint32_t i = 0; i < igc->igc_rx_nbuf; i++) {
		igc_rx_buffer_t *buf = &ring->irr_arena[i];
		ddi_dma_attr_t attr;
		caddr_t mblk_va;
		size_t mblk_len;

		buf->irb_ring = ring;
		igc_dma_data_attr(igc, &attr);
		if (!igc_dma_alloc(igc, &buf->irb_dma, &attr,
		    igc->igc_rx_buf_size)) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate RX "
			    "ring %u buffer %u", ring->irr_idx, i);
			return (false);
		}

		buf->irb_free_rtn.free_func = igc_rx_recycle;
		buf->irb_free_rtn.free_arg = (caddr_t)buf;

		/*
		 * We ignore whether or not this allocation was successful
		 * because we have to handle the case of buffers without
		 * mblk_ts anyway, due to loaning and the like.
		 */
		mblk_va = buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN;
		mblk_len = buf->irb_dma.idb_size - IGC_RX_BUF_IP_ALIGN;
		buf->irb_mp = desballoc((uchar_t *)mblk_va, mblk_len, 0,
		    &buf->irb_free_rtn);

		if (i < igc->igc_rx_ndesc) {
			ring->irr_work_list[i] = buf;
		} else {
			ring->irr_free_list[ring->irr_nfree] = buf;
			ring->irr_nfree++;
		}
	}

	return (true);
}

void
igc_rx_data_free(igc_t *igc)
{
	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
		igc_rx_ring_t *ring = &igc->igc_rx_rings[i];

		if (ring->irr_arena != NULL) {
			igc_rx_bufs_free(igc, ring);
			kmem_free(ring->irr_arena, sizeof (igc_rx_buffer_t) *
			    igc->igc_rx_nbuf);
			ring->irr_arena = NULL;
		}

		if (ring->irr_free_list != NULL) {
			kmem_free(ring->irr_free_list, igc->igc_rx_nfree *
			    sizeof (igc_rx_buffer_t *));
			ring->irr_free_list = NULL;
		}

		if (ring->irr_work_list != NULL) {
			kmem_free(ring->irr_work_list, igc->igc_rx_ndesc *
			    sizeof (igc_rx_buffer_t *));
			ring->irr_work_list = NULL;
		}

		if (ring->irr_ring != NULL) {
			igc_dma_free(&ring->irr_desc_dma);
			ring->irr_ring = NULL;
			ring->irr_next = 0;
		}
	}
}

bool
igc_rx_data_alloc(igc_t *igc)
{
	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
		igc_rx_ring_t *ring = &igc->igc_rx_rings[i];
		ddi_dma_attr_t desc_attr;
		size_t desc_len;

		igc_dma_desc_attr(igc, &desc_attr);
		desc_len = sizeof (union igc_adv_rx_desc) *
		    igc->igc_rx_ndesc;
		if (!igc_dma_alloc(igc, &ring->irr_desc_dma, &desc_attr,
		    desc_len)) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "RX descriptor ring %u", i);
			goto cleanup;
		}
		ring->irr_ring = (void *)ring->irr_desc_dma.idb_va;

		ring->irr_work_list = kmem_zalloc(sizeof (igc_rx_buffer_t *) *
		    igc->igc_rx_ndesc, KM_NOSLEEP);
		if (ring->irr_work_list == NULL) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "RX descriptor ring %u rx work list", i);
			goto cleanup;
		}

		ring->irr_free_list = kmem_zalloc(sizeof (igc_rx_buffer_t *) *
		    igc->igc_rx_nfree, KM_NOSLEEP);
		if (ring->irr_free_list == NULL) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "RX descriptor ring %u rx free list", i);
			goto cleanup;
		}

		ring->irr_arena = kmem_zalloc(sizeof (igc_rx_buffer_t) *
		    igc->igc_rx_nbuf, KM_NOSLEEP);
		if (ring->irr_arena == NULL) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "RX descriptor ring %u rx buf arena", i);
			goto cleanup;
		}

		if (!igc_rx_bufs_alloc(igc, ring)) {
			goto cleanup;
		}
	}

	return (true);

cleanup:
	igc_rx_data_free(igc);
	return (false);
}

/*
 * Write / update a descriptor ring entry. This logic had been implemented in
 * a few places, so it is consolidated here.
 */
static inline void
igc_rx_ring_desc_write(igc_rx_ring_t *ring, uint32_t idx)
{
	const ddi_dma_cookie_t *cookie;
	uint64_t addr;
	igc_dma_buffer_t *irb = &ring->irr_work_list[idx]->irb_dma;

	cookie = ddi_dma_cookie_one(irb->idb_hdl);
	addr = cookie->dmac_laddress + IGC_RX_BUF_IP_ALIGN;
	ring->irr_ring[idx].read.pkt_addr = LE_64(addr);
	ring->irr_ring[idx].read.hdr_addr = LE_64(0);
}

/*
 * Fully initialize a receive ring. This involves:
 *
 * - Doing an initial programming and sync of the descriptor ring
 * - Programming the base and length registers
 * - Programming the ring's buffer size and descriptor type
 * - Programming the queue's receive control register
 */
static void
igc_rx_ring_hw_init(igc_t *igc, igc_rx_ring_t *ring)
{
	uint32_t val, high, low;
	const ddi_dma_cookie_t *desc;

	for (uint32_t i = 0; i < igc->igc_rx_ndesc; i++) {
		igc_rx_ring_desc_write(ring, i);
	}
	IGC_DMA_SYNC(&ring->irr_desc_dma, DDI_DMA_SYNC_FORDEV);

	/*
	 * Program the ring's address.
	 */
	desc = ddi_dma_cookie_one(ring->irr_desc_dma.idb_hdl);
	high = (uint32_t)(desc->dmac_laddress >> 32);
	low = (uint32_t)desc->dmac_laddress;
	igc_write32(igc, IGC_RDBAH(ring->irr_idx), high);
	igc_write32(igc, IGC_RDBAL(ring->irr_idx), low);

	/*
	 * Program the ring length.
	 */
	val = igc->igc_rx_ndesc * sizeof (union igc_adv_rx_desc);
	igc_write32(igc, IGC_RDLEN(ring->irr_idx), val);

	/*
	 * Program the descriptor type and buffer length.
	 */
	val = (igc->igc_rx_buf_size >> IGC_SRRCTL_BSIZEPKT_SHIFT) |
	    IGC_SRRCTL_DESCTYPE_ADV_ONEBUF;
	igc_write32(igc, IGC_SRRCTL(ring->irr_idx), val);

	/*
	 * Program the ring control register itself. Note, we crib the
	 * threshold values directly from igb and didn't think much harder
	 * than that.
	 */
	val = igc_read32(igc, IGC_RXDCTL(ring->irr_idx));
	val &= IGC_RXDCTL_PRESERVE;
	val |= IGC_RXDCTL_QUEUE_ENABLE;
	val = IGC_RXDCTL_SET_PTHRESH(val, 16);
	val = IGC_RXDCTL_SET_HTHRESH(val, 8);
	val = IGC_RXDCTL_SET_WTHRESH(val, 1);
	igc_write32(igc, IGC_RXDCTL(ring->irr_idx), val);
}

void
igc_rx_hw_init(igc_t *igc)
{
	uint32_t rctl, rxcsum;

	/*
	 * Start by setting up the receive control register.
	 *
	 * We clear out any bits in the multicast shift portion. This'll leave
	 * it so [47:36] of the address are used as part of the look up. We
	 * also don't want to receive bad packets, so make sure that's cleared
	 * out. In addition, we clear out loopback mode.
	 */
	rctl = igc_read32(igc, IGC_RCTL);
	rctl &= ~(3 << IGC_RCTL_MO_SHIFT);
	rctl &= ~IGC_RCTL_SBP;
	rctl &= ~(IGC_RCTL_LBM_MAC | IGC_RCTL_LBM_TCVR);

	/*
	 * Set things up such that we're enabled, we receive broadcast packets,
	 * and we allow for large packets. We leave the rx descriptor threshold
	 * at 2048 bytes and make sure to always strip the Ethernet CRC as mac
	 * doesn't want it.
	 */
	rctl |= IGC_RCTL_EN | IGC_RCTL_BAM | IGC_RCTL_LPE |
	    IGC_RCTL_RDMTS_HALF | IGC_RCTL_SECRC;

	/*
	 * Set the multicast filter based on hardware.
	 */
	rctl |= igc->igc_hw.mac.mc_filter_type << IGC_RCTL_MO_SHIFT;

	/*
	 * Make sure each ring is set up and its registers are programmed.
	 */
	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
		igc_rx_ring_hw_init(igc, &igc->igc_rx_rings[i]);
	}

	/*
	 * As we always set LPE (large packet enable) in the receive control
	 * register, we must go through and explicitly update the maximum frame
	 * size.
	 */
	igc_write32(igc, IGC_RLPML, igc->igc_max_frame);

	/*
	 * Explicitly enable IPv4 and TCP checksums. We leave PCSD set to zero
	 * for the moment as we're not enabling RSS, which is what would be
	 * required to get that. After this is where we would set up the VMDq
	 * mode and RSS if we supported multiple RX rings.
	 */
	rxcsum = IGC_RXCSUM_IPOFL | IGC_RXCSUM_TUOFL;
	igc_write32(igc, IGC_RXCSUM, rxcsum);

	/*
	 * Finally, enable the receive unit.
	 */
	igc_write32(igc, IGC_RCTL, rctl);

	/*
	 * Only after the receive unit is initialized can we actually set up
	 * the ring head and tail pointers.
	 */
	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
		igc_write32(igc, IGC_RDH(igc->igc_rx_rings[i].irr_idx), 0);
		igc_write32(igc, IGC_RDT(igc->igc_rx_rings[i].irr_idx),
		    igc->igc_rx_ndesc - 1);
	}
}

/*
 * Helpers for stepping around the descriptor rings, wrapping at the ring
 * size.
 */
static inline uint32_t
igc_next_desc(uint32_t cur, uint32_t count, uint32_t size)
{
	uint32_t out;

	if (cur + count < size) {
		out = cur + count;
	} else {
		out = cur + count - size;
	}

	return (out);
}

static inline uint32_t
igc_prev_desc(uint32_t cur, uint32_t count, uint32_t size)
{
	uint32_t out;

	if (cur >= count) {
		out = cur - count;
	} else {
		out = cur - count + size;
	}

	return (out);
}
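
/*
 * For example, with a 512 entry ring, igc_next_desc(510, 3, 512) wraps to 1
 * and igc_prev_desc(0, 1, 512) wraps back to 511.
 */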

/*
 * Copy the received frame at the given descriptor index into a freshly
 * allocated mblk.
 */
static mblk_t *
igc_rx_copy(igc_rx_ring_t *ring, uint32_t idx, uint32_t len)
{
	const igc_rx_buffer_t *buf = ring->irr_work_list[idx];
	mblk_t *mp;

	IGC_DMA_SYNC(&buf->irb_dma, DDI_DMA_SYNC_FORKERNEL);
	mp = allocb(len + IGC_RX_BUF_IP_ALIGN, 0);
	if (mp == NULL) {
		ring->irr_stat.irs_copy_nomem.value.ui64++;
		return (NULL);
	}

	mp->b_rptr += IGC_RX_BUF_IP_ALIGN;
	bcopy(buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN, mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;
	ring->irr_stat.irs_ncopy.value.ui64++;
	return (mp);
}

/*
 * Loan the receive buffer at the given descriptor index up the stack,
 * replacing it on the work list with a buffer from the free list.
 */
static mblk_t *
igc_rx_bind(igc_rx_ring_t *ring, uint32_t idx, uint32_t len)
{
	igc_rx_buffer_t *buf = ring->irr_work_list[idx];
	igc_rx_buffer_t *sub;

	ASSERT(MUTEX_HELD(&ring->irr_lock));

	/*
	 * If there are no free buffers, we can't bind. Try to grab one now so
	 * we can minimize free list contention.
	 */
	mutex_enter(&ring->irr_free_lock);
	if (ring->irr_nfree == 0) {
		ring->irr_stat.irs_bind_nobuf.value.ui64++;
		mutex_exit(&ring->irr_free_lock);
		return (NULL);
	}
	ring->irr_nfree--;
	sub = ring->irr_free_list[ring->irr_nfree];
	mutex_exit(&ring->irr_free_lock);

	/*
	 * Check if we have an mblk_t here. If not, we'll need to allocate one
	 * again. If that fails, we'll fail this and fall back to copy, though
	 * the odds of that working are small.
	 */
	if (buf->irb_mp == NULL) {
		caddr_t mblk_va = buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN;
		size_t mblk_len = buf->irb_dma.idb_size - IGC_RX_BUF_IP_ALIGN;
		buf->irb_mp = desballoc((uchar_t *)mblk_va, mblk_len, 0,
		    &buf->irb_free_rtn);
		if (buf->irb_mp == NULL) {
			ring->irr_stat.irs_bind_nomp.value.ui64++;
			mutex_enter(&ring->irr_free_lock);
			ring->irr_free_list[ring->irr_nfree] = sub;
			ring->irr_nfree++;
			mutex_exit(&ring->irr_free_lock);
			return (NULL);
		}
	}
	buf->irb_mp->b_wptr = buf->irb_mp->b_rptr + len;
	IGC_DMA_SYNC(&buf->irb_dma, DDI_DMA_SYNC_FORKERNEL);

	/*
	 * Swap in an entry from the free list to replace this one on the work
	 * list.
	 */
	ring->irr_work_list[idx] = sub;
	ring->irr_stat.irs_nbind.value.ui64++;

	/*
	 * Update the buffer to make sure that we indicate it's been loaned for
	 * future recycling.
	 */
	buf->irb_loaned = true;

	return (buf->irb_mp);
}

/*
 * Go through the status bits defined in hardware to see if we can set checksum
 * information.
 */
static void
igc_rx_hcksum(igc_rx_ring_t *ring, mblk_t *mp, uint32_t status)
{
	uint32_t cksum = 0;
	const uint32_t l4_valid = IGC_RXD_STAT_TCPCS | IGC_RXD_STAT_UDPCS;
	const uint32_t l4_invalid = IGC_RXDEXT_STATERR_L4E;

	if ((status & IGC_RXD_STAT_IXSM) != 0) {
		ring->irr_stat.irs_ixsm.value.ui64++;
		return;
	}

	if ((status & l4_invalid) != 0) {
		ring->irr_stat.irs_l4cksum_err.value.ui64++;
	} else if ((status & l4_valid) != 0) {
		cksum |= HCK_FULLCKSUM_OK;
	}

	if ((status & IGC_RXDEXT_STATERR_IPE) != 0) {
		ring->irr_stat.irs_l3cksum_err.value.ui64++;
	} else if ((status & IGC_RXD_STAT_IPCS) != 0) {
		cksum |= HCK_IPV4_HDRCKSUM_OK;
	}

	if (cksum != 0) {
		ring->irr_stat.irs_hcksum_hit.value.ui64++;
		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
	} else {
		ring->irr_stat.irs_hcksum_miss.value.ui64++;
	}
}
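
/*
 * Receive frames from a single ring. poll_bytes is either IGC_RX_POLL_INTR,
 * meaning we were called from interrupt context and should bound our work by
 * igc_rx_intr_nframes, or it is a byte budget handed to us by mac when
 * polling.
 */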
mblk_t *
igc_ring_rx(igc_rx_ring_t *ring, int poll_bytes)
{
	union igc_adv_rx_desc *cur_desc;
	uint32_t cur_status, cur_head;
	uint64_t rx_bytes = 0, rx_frames = 0;
	igc_t *igc = ring->irr_igc;
	mblk_t *mp_head = NULL, **mp_tail = NULL;

	ASSERT(MUTEX_HELD(&ring->irr_lock));
	IGC_DMA_SYNC(&ring->irr_desc_dma, DDI_DMA_SYNC_FORKERNEL);

	/*
	 * Set up the invariants that we will maintain for the loop and then
	 * set up our mblk queue.
	 */
	cur_head = ring->irr_next;
	cur_desc = &ring->irr_ring[cur_head];
	cur_status = LE_32(cur_desc->wb.upper.status_error);
	mp_head = NULL;
	mp_tail = &mp_head;

	while ((cur_status & IGC_RXD_STAT_DD) != 0) {
		uint16_t cur_length = 0;
		mblk_t *mp;

		/*
		 * Check that we have no errors on this packet. This packet
		 * should also have EOP set because we only use a single
		 * descriptor today. We primarily just check for the RXE error.
		 * Most other error types were dropped in the extended format.
		 */
		if ((cur_status & IGC_RXDEXT_STATERR_RXE) != 0 ||
		    (cur_status & IGC_RXD_STAT_EOP) == 0) {
			ring->irr_stat.irs_desc_error.value.ui64++;
			goto discard;
		}

		/*
		 * We don't bump rx_frames here because we do that at the
		 * bottom of the loop, even for discarded frames, so that we
		 * know whether to write the tail register.
		 */
		cur_length = LE_16(cur_desc->wb.upper.length);
		rx_bytes += cur_length;

		mp = NULL;
		if (cur_length > igc->igc_rx_bind_thresh) {
			mp = igc_rx_bind(ring, cur_head, cur_length);
		}

		if (mp == NULL) {
			mp = igc_rx_copy(ring, cur_head, cur_length);
		}

		if (mp != NULL) {
			igc_rx_hcksum(ring, mp, cur_status);
			*mp_tail = mp;
			mp_tail = &mp->b_next;
		}

discard:
		/*
		 * Prepare the frame for use again. Note, we can't assume that
		 * the memory in the buffer is valid.
		 */
		igc_rx_ring_desc_write(ring, cur_head);

		/*
		 * Go through and update the values that our loop is using now.
		 */
		cur_head = igc_next_desc(cur_head, 1, igc->igc_rx_ndesc);
		cur_desc = &ring->irr_ring[cur_head];
		cur_status = LE_32(cur_desc->wb.upper.status_error);

		/*
		 * If we're polling, we need to check against the number of
		 * received bytes. If we're in interrupt mode, we have a
		 * maximum number of frames we're allowed to check.
		 */
		rx_frames++;
		if (poll_bytes != IGC_RX_POLL_INTR &&
		    (cur_length + rx_bytes) > poll_bytes) {
			break;
		} else if (poll_bytes == IGC_RX_POLL_INTR &&
		    rx_frames >= igc->igc_rx_intr_nframes) {
			break;
		}
	}

	/*
	 * Go ahead and re-arm the ring and update our stats along the way as
	 * long as we received at least one frame. Because we modified the
	 * descriptor ring as part of resetting descriptors, we must resync.
	 */
	if (rx_frames != 0) {
		uint32_t tail;

		IGC_DMA_SYNC(&ring->irr_desc_dma, DDI_DMA_SYNC_FORDEV);
		ring->irr_next = cur_head;
		tail = igc_prev_desc(cur_head, 1, igc->igc_rx_ndesc);
		igc_write32(igc, IGC_RDT(ring->irr_idx), tail);

		ring->irr_stat.irs_rbytes.value.ui64 += rx_bytes;
		ring->irr_stat.irs_ipackets.value.ui64 += rx_frames;
	}

#ifdef DEBUG
	if (rx_frames == 0) {
		ASSERT0(rx_bytes);
	}
#endif

	return (mp_head);
}

/*
 * This is called from the stop entry point after the hardware has been reset.
 * At that point, the only other possible consumers of rx buffers are those
 * that have been loaned up the stack. As such, we need to wait on each ring's
 * free list until the number of free entries has gotten back to the expected
 * number.
 */
void
igc_rx_drain(igc_t *igc)
{
	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
		igc_rx_ring_t *ring = &igc->igc_rx_rings[i];

		mutex_enter(&ring->irr_free_lock);
		while (ring->irr_nfree < igc->igc_rx_nfree) {
			cv_wait(&ring->irr_free_cv, &ring->irr_free_lock);
		}
		mutex_exit(&ring->irr_free_lock);
	}
}

static void
igc_tx_bufs_free(igc_t *igc, igc_tx_ring_t *ring)
{
	for (uint32_t i = 0; i < igc->igc_tx_nbuf; i++) {
		igc_tx_buffer_t *buf = &ring->itr_arena[i];

		/*
		 * While we try to clean up the ring reasonably well, if for
		 * some reason we insert descriptors that the device doesn't
		 * like, then parts of the ring may not end up cleaned up. In
		 * such cases we'll need to free the mblk here ourselves and
		 * clean up any binding.
		 */
		if (buf->itb_bind) {
			buf->itb_bind = false;
			(void) ddi_dma_unbind_handle(buf->itb_bind_hdl);
		}
		freemsgchain(buf->itb_mp);
		igc_dma_free(&buf->itb_dma);
		if (buf->itb_bind_hdl != NULL) {
			ddi_dma_free_handle(&buf->itb_bind_hdl);
		}
	}
}

static bool
igc_tx_bufs_alloc(igc_t *igc, igc_tx_ring_t *ring)
{
	for (uint32_t i = 0; i < igc->igc_tx_nbuf; i++) {
		igc_tx_buffer_t *buf = &ring->itr_arena[i];
		ddi_dma_attr_t attr;
		int ret;

		igc_dma_data_attr(igc, &attr);
		if (!igc_dma_alloc(igc, &buf->itb_dma, &attr,
		    igc->igc_tx_buf_size)) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate TX "
			    "ring %u buffer %u", ring->itr_idx, i);
			return (false);
		}

		igc_dma_tx_attr(igc, &attr);
		if ((ret = ddi_dma_alloc_handle(igc->igc_dip, &attr,
		    DDI_DMA_DONTWAIT, NULL, &buf->itb_bind_hdl)) !=
		    DDI_SUCCESS) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate TX "
			    "ring %u TX DMA handle %u: %d", ring->itr_idx, i,
			    ret);
			return (false);
		}

		list_insert_tail(&ring->itr_free_list, buf);
	}

	return (true);
}

void
igc_tx_data_free(igc_t *igc)
{
	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
		igc_tx_ring_t *ring = &igc->igc_tx_rings[i];

		/*
		 * Empty the free list before we destroy the list to avoid
		 * blowing an assertion.
		 */
		while (list_remove_head(&ring->itr_free_list) != NULL)
			;

		if (ring->itr_arena != NULL) {
			igc_tx_bufs_free(igc, ring);
			kmem_free(ring->itr_arena, sizeof (igc_tx_buffer_t) *
			    igc->igc_tx_nbuf);
			ring->itr_arena = NULL;
		}

		list_destroy(&ring->itr_free_list);

		if (ring->itr_work_list != NULL) {
			kmem_free(ring->itr_work_list, igc->igc_tx_ndesc *
			    sizeof (igc_tx_buffer_t *));
			ring->itr_work_list = NULL;
		}

		if (ring->itr_ring != NULL) {
			igc_dma_free(&ring->itr_desc_dma);
			ring->itr_ring = NULL;
			ring->itr_ring_head = 0;
			ring->itr_ring_tail = 0;
			ring->itr_ring_free = 0;
		}
	}
}

bool
igc_tx_data_alloc(igc_t *igc)
{
	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
		igc_tx_ring_t *ring = &igc->igc_tx_rings[i];
		ddi_dma_attr_t desc_attr;
		size_t desc_len;

		igc_dma_desc_attr(igc, &desc_attr);
		desc_len = sizeof (union igc_adv_tx_desc) *
		    igc->igc_tx_ndesc;
		if (!igc_dma_alloc(igc, &ring->itr_desc_dma, &desc_attr,
		    desc_len)) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "TX descriptor ring %u", i);
			goto cleanup;
		}
		ring->itr_ring = (void *)ring->itr_desc_dma.idb_va;

		ring->itr_work_list = kmem_zalloc(sizeof (igc_tx_buffer_t *) *
		    igc->igc_tx_ndesc, KM_NOSLEEP);
		if (ring->itr_work_list == NULL) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "TX descriptor ring %u tx work list", i);
			goto cleanup;
		}

		list_create(&ring->itr_free_list, sizeof (igc_tx_buffer_t),
		    offsetof(igc_tx_buffer_t, itb_node));

		ring->itr_arena = kmem_zalloc(sizeof (igc_tx_buffer_t) *
		    igc->igc_tx_nbuf, KM_NOSLEEP);
		if (ring->itr_arena == NULL) {
			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
			    "TX descriptor ring %u tx buf arena", i);
			goto cleanup;
		}

		if (!igc_tx_bufs_alloc(igc, ring)) {
			goto cleanup;
		}
	}

	return (true);

cleanup:
	igc_tx_data_free(igc);
	return (false);
}

static void
igc_tx_ring_hw_init(igc_t *igc, igc_tx_ring_t *ring)
{
	uint32_t val, high, low;
	const ddi_dma_cookie_t *desc;

	/*
	 * Program the ring's address.
	 */
	desc = ddi_dma_cookie_one(ring->itr_desc_dma.idb_hdl);
	high = (uint32_t)(desc->dmac_laddress >> 32);
	low = (uint32_t)desc->dmac_laddress;
	igc_write32(igc, IGC_TDBAH(ring->itr_idx), high);
	igc_write32(igc, IGC_TDBAL(ring->itr_idx), low);

	/*
	 * Program the ring length.
	 */
	val = igc->igc_tx_ndesc * sizeof (union igc_adv_tx_desc);
	igc_write32(igc, IGC_TDLEN(ring->itr_idx), val);

	/*
	 * Initialize the head and tail pointers that are in use. We can do
	 * this for TX, unlike RX, because we don't want the device to
	 * transmit anything.
	 */
	igc_write32(igc, IGC_TDH(ring->itr_idx), 0);
	igc_write32(igc, IGC_TDT(ring->itr_idx), 0);
	ring->itr_ring_head = 0;
	ring->itr_ring_tail = 0;
	ring->itr_ring_free = igc->igc_tx_ndesc;

	/*
	 * Ensure that a tx queue is disabled prior to taking any action. We do
	 * a subsequent read just in case relaxed ordering is enabled. We are
	 * required to set the various thresholds for when prefetch should
	 * occur, how many valid descriptors it waits before prefetch, and then
	 * what the write back granularity is. Picking these numbers is a bit
	 * weird.
	 *
	 * igb historically didn't modify these values. e1000g varied based on
	 * the hardware type and has done any number of different things here.
	 * The generic datasheet recommendation in the I210 is to set WTHRESH
	 * to 1 and leave everything else at zero. Drivers in other systems
	 * vary their settings.
	 *
	 * Right now we end up basically just following the datasheet and also
	 * rely on the ITR that we set. This can probably be improved upon at
	 * some point.
	 */
	igc_write32(igc, IGC_TXDCTL(0), 0);
	(void) igc_read32(igc, IGC_STATUS);
	val = 0;
	val = IGC_TXDCTL_SET_PTHRESH(val, 0);
	val = IGC_TXDCTL_SET_HTHRESH(val, 0);
	val = IGC_TXDCTL_SET_WTHRESH(val, 1);
	val |= IGC_TXDCTL_QUEUE_ENABLE;
	igc_write32(igc, IGC_TXDCTL(0), val);
}

void
igc_tx_hw_init(igc_t *igc)
{
	uint32_t val;

	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
		igc_tx_ring_hw_init(igc, &igc->igc_tx_rings[i]);
	}

	val = igc_read32(igc, IGC_TCTL);
	val &= ~IGC_TCTL_CT;
	val |= IGC_TCTL_PSP | IGC_TCTL_RTLC | IGC_TCTL_EN |
	    (IGC_COLLISION_THRESHOLD << IGC_CT_SHIFT);
	igc_write32(igc, IGC_TCTL, val);
}

/*
 * Reset a tx buffer to its free state, releasing any DMA binding it may have.
 */
static void
igc_tx_buf_reset(igc_tx_buffer_t *buf)
{
	buf->itb_mp = NULL;
	buf->itb_len = 0;
	buf->itb_last_desc = 0;
	buf->itb_first = false;
	if (buf->itb_bind) {
		(void) ddi_dma_unbind_handle(buf->itb_bind_hdl);
	}
	buf->itb_bind = false;
}

/*
 * When we are recycling packets, we need to sync the ring and then walk from
 * what we last processed up to what is in the tail or the first entry that is
 * not done. It is not clear that the I225 hardware has the separate write back
 * feature that igb does, so instead we have to look for the packet being noted
 * as done in the descriptor.
 */
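/*
 * To illustrate the layout that recycling relies upon (ring indexes here are
 * arbitrary): a packet that required a context descriptor and was split
 * across two tx buffers, the first of which was bound with two cookies, would
 * occupy descriptors 10-13 like this:
 *
 *	desc 10: context	work_list[10] = buf A (itb_first set, holds
 *				    the mblk, itb_last_desc = 13)
 *	desc 11: data, buf A	work_list[11] = NULL
 *	desc 12: data, buf A	work_list[12] = NULL
 *	desc 13: data, buf B	work_list[13] = buf B
 *
 * Recycling starts at the ring head (10), finds buf A on the work list, and
 * only proceeds once descriptor 13 (itb_last_desc) has IGC_TXD_STAT_DD set.
 * At that point descriptors 10-13 are zeroed and both buffers are freed.
 */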
void
igc_tx_recycle(igc_t *igc, igc_tx_ring_t *ring)
{
	uint32_t head, tail, ndesc = 0;
	list_t to_free;
	mblk_t *mp = NULL;
	bool notify = false;

	/*
	 * Snapshot the current head and tail before we do more processing. The
	 * driver bumps the tail when transmitting and bumps the head only
	 * here, so we know that anything in the region of [head, tail) is safe
	 * for us to touch (if the hardware is done) while anything in the
	 * region of [tail, head) is not.
	 */
	mutex_enter(&ring->itr_lock);
	if (ring->itr_recycle) {
		mutex_exit(&ring->itr_lock);
		return;
	}
	ring->itr_recycle = true;
	head = ring->itr_ring_head;
	tail = ring->itr_ring_tail;
	mutex_exit(&ring->itr_lock);

	list_create(&to_free, sizeof (igc_tx_buffer_t),
	    offsetof(igc_tx_buffer_t, itb_node));

	IGC_DMA_SYNC(&ring->itr_desc_dma, DDI_DMA_SYNC_FORKERNEL);

	/*
	 * We need to walk the transmit descriptors to see what we can free.
	 * Here is where we need to deal with the wrinkle the theory statement
	 * discusses (see 'TX Data Path Design' in igc.c). We look at the head
	 * of the ring, find the descriptor that the item there expects to be
	 * the last one for its packet, and use that to determine if we are
	 * done with the entire packet. If we're done with the entire packet,
	 * then we walk the rest of its descriptors and proceed.
	 */
	while (head != tail) {
		uint32_t status, last_desc, next_desc;
		igc_tx_buffer_t *check_buf = ring->itr_work_list[head];

		ASSERT3P(check_buf, !=, NULL);
		ASSERT3U(check_buf->itb_first, ==, true);

		last_desc = check_buf->itb_last_desc;
		status = LE_32(ring->itr_ring[last_desc].wb.status);
		if ((status & IGC_TXD_STAT_DD) == 0) {
			break;
		}

		/*
		 * We need to clean up this packet. This involves walking each
		 * descriptor, resetting it, finding each tx buffer and mblk,
		 * and cleaning those up. A descriptor may or may not have a tx
		 * buffer associated with it.
		 */
		next_desc = igc_next_desc(last_desc, 1, igc->igc_tx_ndesc);
		for (uint32_t desc = head; desc != next_desc;
		    desc = igc_next_desc(desc, 1, igc->igc_tx_ndesc)) {
			igc_tx_buffer_t *buf;

			bzero(&ring->itr_ring[desc],
			    sizeof (union igc_adv_tx_desc));
			ndesc++;
			buf = ring->itr_work_list[desc];
			if (buf == NULL)
				continue;
			ring->itr_work_list[desc] = NULL;

			if (buf->itb_mp != NULL) {
				buf->itb_mp->b_next = mp;
				mp = buf->itb_mp;
			}
			igc_tx_buf_reset(buf);
			list_insert_tail(&to_free, buf);
		}

		head = next_desc;
	}

	mutex_enter(&ring->itr_lock);
	ring->itr_ring_head = head;
	ring->itr_ring_free += ndesc;
	list_move_tail(&ring->itr_free_list, &to_free);
	if (ring->itr_mac_blocked && ring->itr_ring_free >
	    igc->igc_tx_notify_thresh) {
		ring->itr_mac_blocked = false;
		notify = true;
	}
	ring->itr_recycle = false;
	mutex_exit(&ring->itr_lock);

	if (notify) {
		mac_tx_ring_update(igc->igc_mac_hdl, ring->itr_rh);
	}

	freemsgchain(mp);
	list_destroy(&to_free);
}

/*
 * Grab a tx buffer from the ring's free list, noting in our stats when we're
 * out of them.
 */
static igc_tx_buffer_t *
igc_tx_buffer_alloc(igc_tx_ring_t *ring)
{
	igc_tx_buffer_t *buf;

	mutex_enter(&ring->itr_lock);
	buf = list_remove_head(&ring->itr_free_list);
	if (buf == NULL) {
		ring->itr_stat.its_no_tx_bufs.value.ui64++;
	}
	mutex_exit(&ring->itr_lock);

	return (buf);
}

/*
 * Utilize a new tx buffer to perform a DMA binding for this mblk.
 */
static bool
igc_tx_ring_bind(igc_tx_ring_t *ring, mblk_t *mp, igc_tx_state_t *tx)
{
	size_t len = MBLKL(mp);
	igc_tx_buffer_t *buf;
	int ret;
	uint_t ncookie;

	buf = igc_tx_buffer_alloc(ring);
	if (buf == NULL) {
		return (false);
	}

	ret = ddi_dma_addr_bind_handle(buf->itb_bind_hdl, NULL,
	    (void *)mp->b_rptr, len, DDI_DMA_WRITE | DDI_DMA_STREAMING,
	    DDI_DMA_DONTWAIT, NULL, NULL, &ncookie);
	if (ret != DDI_DMA_MAPPED) {
		/*
		 * Binding failed. Give this buffer back.
		 */
		ring->itr_stat.its_tx_bind_fail.value.ui64++;
		mutex_enter(&ring->itr_lock);
		list_insert_tail(&ring->itr_free_list, buf);
		mutex_exit(&ring->itr_lock);
		return (false);
	}

	/*
	 * Now that this is successful, we append it to the list and update our
	 * tracking structure. We don't do this earlier so we can keep using
	 * the existing buffer for copying, as that's the fallback path.
	 */
	buf->itb_len = len;
	buf->itb_bind = true;
	tx->itx_ndescs += ncookie;
	tx->itx_buf_rem = 0;
	tx->itx_cur_buf = buf;
	list_insert_tail(&tx->itx_bufs, tx->itx_cur_buf);
	ring->itr_stat.its_tx_bind.value.ui64++;
	return (true);
}

/*
 * Copy the current mblk into a series of one or more tx buffers depending on
 * what's available.
 */
static bool
igc_tx_ring_copy(igc_tx_ring_t *ring, mblk_t *mp, igc_tx_state_t *tx)
{
	size_t len = MBLKL(mp);
	size_t off = 0;

	while (len > 0) {
		const void *src;
		void *dest;
		size_t to_copy;

		/*
		 * If the current buffer is used for binding, then we must get
		 * a new one. If it is used for copying, we can keep going
		 * until it is full.
		 */
		if (tx->itx_cur_buf != NULL && (tx->itx_cur_buf->itb_bind ||
		    tx->itx_buf_rem == 0)) {
			tx->itx_cur_buf = NULL;
			tx->itx_buf_rem = 0;
		}

		if (tx->itx_cur_buf == NULL) {
			tx->itx_cur_buf = igc_tx_buffer_alloc(ring);
			if (tx->itx_cur_buf == NULL) {
				return (false);
			}
			list_insert_tail(&tx->itx_bufs, tx->itx_cur_buf);
			tx->itx_buf_rem = tx->itx_cur_buf->itb_dma.idb_size;
			/*
			 * Each DMA buffer used for TX only requires a single
			 * cookie. So note that descriptor requirement here and
			 * flag this tx buffer as being used for copying.
			 */
			tx->itx_ndescs++;
			tx->itx_cur_buf->itb_bind = false;
		}

		to_copy = MIN(len, tx->itx_buf_rem);
		src = mp->b_rptr + off;
		dest = tx->itx_cur_buf->itb_dma.idb_va +
		    tx->itx_cur_buf->itb_len;
		bcopy(src, dest, to_copy);

		tx->itx_buf_rem -= to_copy;
		tx->itx_cur_buf->itb_len += to_copy;
		len -= to_copy;
		off += to_copy;
	}

	ring->itr_stat.its_tx_copy.value.ui64++;
	return (true);
}

/*
 * We only need to load a context descriptor if what we're loading has changed.
 * This checks if it has and if so, updates the fields that have changed. Note,
 * a packet that doesn't require offloads won't end up taking us through this
 * path.
 */
static bool
igc_tx_ring_context_changed(igc_tx_ring_t *ring, igc_tx_state_t *tx)
{
	bool change = false;
	igc_tx_context_data_t *data = &ring->itr_tx_ctx;

	if (data->itc_l2hlen != tx->itx_meoi.meoi_l2hlen) {
		change = true;
		data->itc_l2hlen = tx->itx_meoi.meoi_l2hlen;
	}

	if (data->itc_l3hlen != tx->itx_meoi.meoi_l3hlen) {
		change = true;
		data->itc_l3hlen = tx->itx_meoi.meoi_l3hlen;
	}

	if (data->itc_l3proto != tx->itx_meoi.meoi_l3proto) {
		change = true;
		data->itc_l3proto = tx->itx_meoi.meoi_l3proto;
	}

	if (data->itc_l4proto != tx->itx_meoi.meoi_l4proto) {
		change = true;
		data->itc_l4proto = tx->itx_meoi.meoi_l4proto;
	}

	if (data->itc_l4hlen != tx->itx_meoi.meoi_l4hlen) {
		change = true;
		data->itc_l4hlen = tx->itx_meoi.meoi_l4hlen;
	}

	if (data->itc_mss != tx->itx_mss) {
		change = true;
		data->itc_mss = tx->itx_mss;
	}

	if (data->itc_cksum != tx->itx_cksum) {
		change = true;
		data->itc_cksum = tx->itx_cksum;
	}

	if (data->itc_lso != tx->itx_lso) {
		change = true;
		data->itc_lso = tx->itx_lso;
	}

	return (change);
}

/*
 * Fill out the common descriptor information for a tx buffer's cookies. The
 * additional information required on the first and last descriptors is
 * handled by the caller afterwards.
 */
static void
igc_tx_ring_write_buf_descs(igc_t *igc, igc_tx_ring_t *ring,
    igc_tx_buffer_t *buf)
{
	ddi_dma_handle_t hdl = buf->itb_bind ? buf->itb_bind_hdl :
	    buf->itb_dma.idb_hdl;
	uint_t nc = ddi_dma_ncookies(hdl);
	size_t rem_len = buf->itb_len;

	ASSERT(MUTEX_HELD(&ring->itr_lock));
	ASSERT3U(rem_len, !=, 0);

	for (uint_t i = 0; i < nc; i++, ring->itr_ring_tail =
	    igc_next_desc(ring->itr_ring_tail, 1, igc->igc_tx_ndesc)) {
		const ddi_dma_cookie_t *c = ddi_dma_cookie_get(hdl, i);
		union igc_adv_tx_desc *desc;
		uint32_t type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
		    IGC_ADVTXD_DCMD_IFCS;
		uint32_t desc_len = MIN(rem_len, c->dmac_size);

		/* Quick sanity check on the maximum data descriptor size */
		ASSERT3U(desc_len, <, 0x10000);
		ASSERT3U(desc_len, >, 0x0);
		type |= desc_len;
		rem_len -= desc_len;
		desc = &ring->itr_ring[ring->itr_ring_tail];
		desc->read.buffer_addr = LE_64(c->dmac_laddress);
		desc->read.cmd_type_len = LE_32(type);
		desc->read.olinfo_status = LE_32(0);

		/*
		 * Save the transmit buffer in the first descriptor entry that
		 * we use for this buffer.
		 */
		if (i == 0) {
			ring->itr_work_list[ring->itr_ring_tail] = buf;
		}
	}
}
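
/*
 * Note that the descriptor cost of a tx buffer depends on how it was filled:
 * a bound buffer consumes one data descriptor per DMA cookie, while a copy
 * buffer always has a single cookie and therefore a single descriptor. This
 * mirrors how igc_tx_ring_bind() adds ncookie to itx_ndescs while
 * igc_tx_ring_copy() adds one for each buffer it allocates.
 */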

/*
 * We have created our chain of tx buffers that have been copied and bound.
 * Now insert them into place and insert a context descriptor if it will be
 * required. Unlike igb, we don't save the old context descriptor to try to
 * reuse it and instead just always set it.
 */
static bool
igc_tx_ring_write_descs(igc_t *igc, igc_tx_ring_t *ring, mblk_t *mp,
    igc_tx_state_t *tx)
{
	bool do_ctx = false;
	igc_tx_buffer_t *buf;
	uint32_t ctx_desc, first_desc, last_desc, flags, status;

	/*
	 * If either checksumming or LSO is set, we may need a context
	 * descriptor. We assume we will and then if not will adjust that.
	 */
	if (tx->itx_cksum != 0 || tx->itx_lso != 0) {
		do_ctx = true;
		tx->itx_ndescs++;
	}

	mutex_enter(&ring->itr_lock);
	if (tx->itx_ndescs + igc->igc_tx_gap > ring->itr_ring_free) {
		/*
		 * Attempt to recycle descriptors before we give up.
		 */
		mutex_exit(&ring->itr_lock);
		igc_tx_recycle(igc, ring);
		mutex_enter(&ring->itr_lock);
		if (tx->itx_ndescs + igc->igc_tx_gap > ring->itr_ring_free) {
			mutex_exit(&ring->itr_lock);
			return (false);
		}
	}

	/*
	 * Now see if the context descriptor has changed, if required. If not,
	 * then we can reduce the number of descriptors required. We want to do
	 * this after we've checked for descriptors because this will mutate
	 * the next tx descriptor we have to load.
	 */
	if (do_ctx && !igc_tx_ring_context_changed(ring, tx)) {
		do_ctx = false;
		tx->itx_ndescs--;
	}

	ring->itr_ring_free -= tx->itx_ndescs;
	ctx_desc = ring->itr_ring_tail;
	if (do_ctx) {
		struct igc_adv_tx_context_desc *ctx;
		uint32_t len = tx->itx_meoi.meoi_l3hlen |
		    (tx->itx_meoi.meoi_l2hlen << IGC_ADVTXD_MACLEN_SHIFT);
		uint32_t tucmd = IGC_ADVTXD_DCMD_DEXT | IGC_ADVTXD_DTYP_CTXT;
		uint32_t l4idx = 0;

		if ((tx->itx_lso & HW_LSO) != 0 ||
		    (tx->itx_cksum & HCK_IPV4_HDRCKSUM) != 0) {
			if (tx->itx_meoi.meoi_l3proto == ETHERTYPE_IP) {
				tucmd |= IGC_ADVTXD_TUCMD_IPV4;
			} else {
				ASSERT3U(tx->itx_meoi.meoi_l3proto, ==,
				    ETHERTYPE_IPV6);
				tucmd |= IGC_ADVTXD_TUCMD_IPV6;
			}
		}

		if ((tx->itx_lso & HW_LSO) != 0 ||
		    (tx->itx_cksum & HCK_PARTIALCKSUM) != 0) {
			if (tx->itx_meoi.meoi_l4proto == IPPROTO_TCP) {
				tucmd |= IGC_ADVTXD_TUCMD_L4T_TCP;
			} else if (tx->itx_meoi.meoi_l4proto == IPPROTO_UDP) {
				tucmd |= IGC_ADVTXD_TUCMD_L4T_UDP;
			}
		}

		/*
		 * The L4LEN and MSS fields are only required if we're
		 * performing TSO. The index is always zero regardless because
		 * the I225 only has one context per queue.
		 */
		if ((tx->itx_lso & HW_LSO) != 0) {
			l4idx |= tx->itx_meoi.meoi_l4hlen <<
			    IGC_ADVTXD_L4LEN_SHIFT;
			l4idx |= tx->itx_mss << IGC_ADVTXD_MSS_SHIFT;
		}

		ctx = (void *)&ring->itr_ring[ctx_desc];
		ctx->vlan_macip_lens = LE_32(len);
		ctx->launch_time = 0;
		ctx->type_tucmd_mlhl = LE_32(tucmd);
		ctx->mss_l4len_idx = LE_32(l4idx);
		ring->itr_ring_tail = igc_next_desc(ring->itr_ring_tail, 1,
		    igc->igc_tx_ndesc);
		DTRACE_PROBE4(igc__context__desc, igc_t *, igc,
		    igc_tx_ring_t *, ring, igc_tx_state_t *, tx,
		    struct igc_adv_tx_context_desc *, ctx);
	}

	first_desc = ring->itr_ring_tail;

	while ((buf = list_remove_head(&tx->itx_bufs)) != NULL) {
		igc_tx_ring_write_buf_descs(igc, ring, buf);
	}

	/*
	 * The last descriptor must have end of packet set and is the entry
	 * that we ask for status on. That is, we don't actually ask for the
	 * status of each transmit buffer, only the final one, so we can more
	 * easily collect everything including the context descriptor if
	 * present.
	 */
	last_desc = igc_prev_desc(ring->itr_ring_tail, 1, igc->igc_tx_ndesc);
	flags = IGC_ADVTXD_DCMD_EOP | IGC_ADVTXD_DCMD_RS;
	ring->itr_ring[last_desc].read.cmd_type_len |= LE_32(flags);

	/*
	 * We must now go back and set settings on the first data descriptor to
	 * indicate what checksumming and offload features we require. Note, we
	 * keep the IDX field as zero because there is only one context field
	 * per queue in the I225.
	 *
	 * We also save the mblk_t on the first tx buffer in the set, which
	 * should always be saved with the first descriptor we use, which may
	 * include the context descriptor. Because this descriptor tracks when
	 * the entire packet is sent and we won't collect it until we're done
	 * with the entire packet, it's okay to leave this on the start.
	 */
	flags = 0;
	status = 0;
	if ((tx->itx_cksum & HCK_IPV4_HDRCKSUM) != 0) {
		status |= IGC_TXD_POPTS_IXSM << 8;
	}

	if ((tx->itx_cksum & HCK_PARTIALCKSUM) != 0) {
		status |= IGC_TXD_POPTS_TXSM << 8;
	}

	if ((tx->itx_lso & HW_LSO) != 0) {
		size_t payload = tx->itx_meoi.meoi_len -
		    tx->itx_meoi.meoi_l2hlen - tx->itx_meoi.meoi_l3hlen -
		    tx->itx_meoi.meoi_l4hlen;
		flags |= IGC_ADVTXD_DCMD_TSE;
		status |= payload << IGC_ADVTXD_PAYLEN_SHIFT;
	} else {
		status |= tx->itx_meoi.meoi_len << IGC_ADVTXD_PAYLEN_SHIFT;
	}

	ring->itr_ring[first_desc].read.cmd_type_len |= LE_32(flags);
	ring->itr_ring[first_desc].read.olinfo_status |= LE_32(status);
	ring->itr_work_list[first_desc]->itb_mp = mp;
	ring->itr_work_list[first_desc]->itb_first = true;
	ring->itr_work_list[first_desc]->itb_last_desc = last_desc;

	/*
	 * If we have a context descriptor, we must adjust the first work list
	 * item to point to the context descriptor. See 'TX Data Path Design'
	 * in the theory statement for more information.
	 */
	if (do_ctx) {
		ring->itr_work_list[ctx_desc] = ring->itr_work_list[first_desc];
		ring->itr_work_list[first_desc] = NULL;
	}

	ring->itr_stat.its_obytes.value.ui64 += tx->itx_meoi.meoi_len;
	ring->itr_stat.its_opackets.value.ui64++;

	IGC_DMA_SYNC(&ring->itr_desc_dma, DDI_DMA_SYNC_FORDEV);
	igc_write32(igc, IGC_TDT(ring->itr_idx), ring->itr_ring_tail);
	mutex_exit(&ring->itr_lock);
	return (true);
}
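
/*
 * mac transmit entry point for a single tx ring. We gather offload
 * information, recycle descriptors if we're running low, and then either DMA
 * bind or copy each mblk in the chain into tx buffers before handing the
 * resulting descriptors to the hardware. Returning the mblk rather than NULL
 * tells mac that we are out of resources and that it should try again after
 * we call mac_tx_ring_update().
 */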
mblk_t *
igc_ring_tx(void *arg, mblk_t *mp)
{
	igc_tx_ring_t *ring = arg;
	igc_t *igc = ring->itr_igc;
	igc_tx_state_t tx = { 0 };

	ASSERT3P(mp->b_next, ==, NULL);

	if (mac_ether_offload_info(mp, &tx.itx_meoi) != 0) {
		freemsg(mp);
		ring->itr_stat.its_bad_meo.value.ui64++;
		return (NULL);
	}

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &tx.itx_cksum);
	mac_lso_get(mp, &tx.itx_mss, &tx.itx_lso);

	/*
	 * Note, we don't really care that the following check of the number of
	 * free descriptors may race with other threads due to a lack of the
	 * lock.
	 */
	if (ring->itr_ring_free < igc->igc_tx_recycle_thresh) {
		igc_tx_recycle(igc, ring);
	}

	mutex_enter(&ring->itr_lock);
	if (ring->itr_ring_free < igc->igc_tx_notify_thresh) {
		ring->itr_stat.its_ring_full.value.ui64++;
		ring->itr_mac_blocked = true;
		mutex_exit(&ring->itr_lock);
		return (mp);
	}
	mutex_exit(&ring->itr_lock);

	/*
	 * If we end up some day supporting LSO and it was requested, then we
	 * need to check that the headers and the payload are all in one
	 * contiguous block. If they're not, then we'll need to force a copy
	 * into the descriptor for the headers.
	 */

	/*
	 * This list tracks the various tx buffers that we've allocated and
	 * will use.
	 */
	list_create(&tx.itx_bufs, sizeof (igc_tx_buffer_t),
	    offsetof(igc_tx_buffer_t, itb_node));

	for (mblk_t *cur_mp = mp; cur_mp != NULL; cur_mp = cur_mp->b_cont) {
		size_t len = MBLKL(cur_mp);

		if (len == 0) {
			continue;
		}

		if (len > igc->igc_tx_bind_thresh &&
		    igc_tx_ring_bind(ring, cur_mp, &tx)) {
			continue;
		}

		if (!igc_tx_ring_copy(ring, cur_mp, &tx))
			goto tx_failure;
	}

	if (!igc_tx_ring_write_descs(igc, ring, mp, &tx)) {
		goto tx_failure;
	}

	list_destroy(&tx.itx_bufs);
	return (NULL);

tx_failure:
	/*
	 * We ran out of resources (tx buffers or descriptors). Clean up and
	 * give the mblk back to MAC.
	 */
	for (igc_tx_buffer_t *buf = list_head(&tx.itx_bufs); buf != NULL;
	    buf = list_next(&tx.itx_bufs, buf)) {
		igc_tx_buf_reset(buf);
	}

	mutex_enter(&ring->itr_lock);
	list_move_tail(&ring->itr_free_list, &tx.itx_bufs);
	ring->itr_mac_blocked = true;
	mutex_exit(&ring->itr_lock);
	list_destroy(&tx.itx_bufs);

	return (mp);
}