1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. 14 * Copyright 2019 Joyent, Inc. 15 * Copyright 2020 RackTop Systems, Inc. 16 */ 17 18 #include "i40e_sw.h" 19 20 /* 21 * --------------------------------------------------------- 22 * Buffer and Memory Management, Receiving, and Transmitting 23 * --------------------------------------------------------- 24 * 25 * Each physical function (PF), which is what we think of as an instance of the 26 * device driver, has a series of associated transmit and receive queue pairs. 27 * Effectively, what we think of in MAC as rings. Each of these has their own 28 * ring of descriptors which is used as part of doing DMA activity. 29 * 30 * The transmit ring of descriptors are 16-byte entries which are used to send 31 * packets, program filters, etc. The receive ring of descriptors are either 32 * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor 33 * format so that we're in a better position if we ever want to leverage that 34 * information later on. 35 * 36 * However, these rings are just for descriptors, they don't talk or deal with 37 * how we actually store the memory that we need for DMA or the associated 38 * information that we need for keeping track of message blocks. To correspond 39 * to the hardware descriptor ring which is how we communicate with hardware, we 40 * introduce a control block which keeps track of our required metadata like DMA 41 * mappings. 42 * 43 * There are two main considerations that dictate how much memory and buffers 44 * we end up allocating. Those are: 45 * 46 * o The size of the ring (controlled through the driver.conf file) 47 * 48 * o The maximum size frame we can receive. 49 * 50 * The size of the rings currently defaults to 1024 descriptors and is stored in 51 * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size. 52 * 53 * While the size of the rings is controlled by the driver.conf, the maximum 54 * size frame is informed primarily through the use of dladm and the setting of 55 * the MTU property on the device. From the MTU, we then go and do some 56 * machinations. The first thing we do is we then have to add in space for the 57 * Ethernet header, potentially a VLAN header, and the FCS check. This value is 58 * what's stored as i40e_t`i40e_frame_max and is derived any time 59 * i40e_t`i40e_sdu changes. 60 * 61 * This size is then rounded up to the nearest 1k chunk, which represents the 62 * actual amount of memory that we'll allocate for a single frame. 63 * 64 * Note, that for RX, we do something that might be unexpected. We always add 65 * an extra two bytes to the frame size that we allocate. We then offset the DMA 66 * address that we receive a packet into by two bytes. This ensures that the IP 67 * header will always be 4 byte aligned because the MAC header is either 14 or 68 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's 69 * and MAC's lives easier. 
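 *
 * To make the sizing concrete, here is a rough sketch (illustrative only; the
 * authoritative computation lives in the driver's MTU/property handling, and
 * I40E_BUF_IPHDR_ALIGNMENT, used later in this file, is the two byte pad):
 *
 *	frame_max = sdu + sizeof (struct ether_vlan_header) + ETHERFCSL;
 *	rx_buf_size = P2ROUNDUP(frame_max + I40E_BUF_IPHDR_ALIGNMENT, 1024);
 *
 * With a 1500 byte MTU this works out to a 1522 byte maximum frame and a
 * 2048 byte RX buffer, with received data DMA'd in at offset 2 so that the
 * IP header behind a 14 or 18 byte MAC header lands on a 4 byte boundary.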
 *
 * Both the RX and TX descriptor rings (which are what we use to communicate
 * with hardware) are allocated as a single region of DMA memory which is the
 * size of the descriptor (32 bytes and 16 bytes respectively) times the total
 * number of descriptors for an RX and TX ring.
 *
 * While the RX and TX descriptors are allocated using DMA-based memory, the
 * control blocks for each of them are allocated using normal kernel memory.
 * They aren't special from a DMA perspective. We'll go over the design of both
 * receiving and transmitting separately, as they have slightly different
 * control blocks and different ways that we manage the relationship between
 * control blocks and descriptors.
 *
 * ---------------------------------
 * RX Descriptors and Control Blocks
 * ---------------------------------
 *
 * For every descriptor in the ring that the driver has, we need some associated
 * memory, which means that we need to have the receive specific control block.
 * We have a couple of different, but related, goals:
 *
 *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
 *     not want to do any additional memory allocations or DMA allocations if
 *     we don't have to.
 *
 *   o We'd like to try and do as much zero-copy as possible, while taking into
 *     account the cost of mapping in DMA resources.
 *
 *   o We'd like to have every receive descriptor available.
 *
 * Now, these rules are a bit in tension with one another. The act of mapping in
 * DMA resources is an exercise in trying to find the break-even point between
 * page table updates and bcopy. We currently start by using the same metrics
 * that ixgbe used; however, it should be known that this value has effectively
 * been cargo-culted across to yet another driver, sorry.
 *
 * If we receive a packet which is larger than our copy threshold, we'll create
 * a message block out of the DMA memory via desballoc(9F) and send that up to
 * MAC that way. This will cause us to be notified when the message block is
 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
 * it's less than the threshold, we'll try to use allocb(9F) and bcopy it into
 * the block, thus allowing us to immediately reuse the DMA resource. Note, on
 * debug builds, we allow someone to whack the variable i40e_debug_rx_mode to
 * override the behavior and always do a bcopy or a DMA bind.
 *
 * To try and ensure that the device always has blocks that it can receive data
 * into, we maintain two lists of control blocks, a working list and a free
 * list. Each list is sized equal to the number of descriptors in the RX ring.
 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
 * equal to twice the number of descriptors in the ring and we assign them
 * equally to the free list and to the working list. Each control block also has
 * DMA memory allocated and associated with it, which will be used to receive
 * the actual packet data. All of a received frame's data will end up in a
 * single DMA buffer.
 *
 * During operation, we always maintain the invariant that each RX descriptor
 * has an associated RX control block which lives in the working list. If we
 * feel that we should loan up DMA memory to MAC in the form of a message block,
 * we can only do so if we can maintain this invariant. To do that, we swap in
 * one of the buffers from the free list.
 * If none are available, then we resort
 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
 * size.
 *
 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
 * called on the block, at which point we restore the RX control block to the
 * free list and are able to reuse the DMA memory again. While the scheme may
 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
 * the normal path of operation, even though we may still have to allocate
 * message blocks and copy.
 *
 * The following state machine describes the lifetime of an RX control block. In
 * the diagram we abbreviate the RX ring descriptor entry as rxd and the RX
 * control block entry as rcb.
 *
 *              |                                  |
 *              * ... 1/2 of all initial rcb's ... *
 *              |                                  |
 *              v                                  v
 *     +------------------+               +------------------+
 *     | rcb on free list |---*---------->| rcb on work list |
 *     +------------------+   .           +------------------+
 *               ^            . moved to           |
 *               |            replace rcb          * . . Frame received,
 *               |            loaned to            |     entry on free list
 *               |            MAC + co.            |     available. rcb's
 *               |                                 |     memory made into mblk_t
 *               * . freemsg(9F)                   |     and sent up to MAC.
 *               |   called on                     |
 *               |   loaned rcb                    |
 *               |   and it is                     v
 *               |   recycled.             +-------------------+
 *               +------------------<-----| rcb loaned to MAC |
 *                                         +-------------------+
 *
 * Finally, note that every RX control block has a reference count on it. One
 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
 * called. If the GLDv3 mc_stop entry point is called, meaning that IP has been
 * unplumbed and no other DLPI consumers remain, then we'll decrement the
 * reference count by one. Whenever we loan up the RX control block and
 * associated buffer to MAC, then we bump the reference count again. Even though
 * the device is stopped, there may still be loaned frames in upper levels that
 * we'll want to account for. Our callback from freemsg(9F)/freeb(9F) will take
 * care of making sure that it is cleaned up.
 *
 * --------------------
 * Managing the RX Ring
 * --------------------
 *
 * The receive ring descriptors are arranged in a circular buffer with a head
 * and tail pointer. There are both the conventional head and tail pointers
 * which are used to partition the ring into two portions, a portion that we,
 * the operating system, manage and a portion that is managed by hardware. When
 * hardware owns a descriptor in the ring, it means that it is waiting for data
 * to be filled in. However, when a portion of the ring is owned by the driver,
 * then that means that the descriptor has been consumed and we need to go take
 * a look at it.
 *
 * The initial head is configured to be zero by writing it as such in the
 * receive queue context in the FPM (function private memory from the host). The
 * initial tail is written to be the last descriptor. This is written to via the
 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
 * the only values we ever consult ourselves are the TAIL register and our own
 * state tracking. Effectively, we cache the HEAD register and then update it
 * ourselves based on our work.
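 *
 * To make the ownership rule concrete, a sketch (not driver code): given our
 * cached head (i40e_rx_data_t`rxd_desc_next) and the last value we wrote to
 * the TAIL register, the device owns
 *
 *	(tail - head + ring_size) % ring_size + 1
 *
 * descriptors, i.e. everything from HEAD through TAIL inclusive; the rest are
 * waiting for us to process. Initially head is 0 and tail is ring_size - 1,
 * so the device starts out owning the entire ring.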
 *
 * When we iterate over the RX descriptors and thus the received frames, we are
 * either in an interrupt context or we've been asked by MAC to poll on the
 * ring. If we've been asked to poll on the ring, we have a maximum number of
 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
 * exceed that count, then we do not process it. When in interrupt context, we
 * don't have a strict byte count. However, to ensure liveness, we limit the
 * amount of data based on a configuration value
 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
 * is based on similar numbers that are used for ixgbe. After some additional
 * time in the field, we'll have a sense as to whether or not it should be
 * changed.
 *
 * When processing, we start at our own HEAD pointer
 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
 * processing. Every RX descriptor has what's described as the DD bit. This bit
 * (the LSB of the second 8-byte word) indicates whether or not the descriptor
 * is done. When we give descriptors to the hardware, this value is always
 * zero. When the hardware has finished a descriptor, it will always be one.
 *
 * The first thing that we check is whether the DD bit indicates that the
 * current HEAD is ready. If it isn't, then we're done. That's the primary
 * invariant of processing a frame. If it's done, then there are a few other
 * things that we want to look at. In the same status word as the DD bit, there
 * are two other important bits:
 *
 *   o End of Packet (EOP)
 *   o Error bits
 *
 * The end of packet bit indicates that we have reached the last descriptor of
 * the frame. Now, you might ask when there would be more than one descriptor.
 * The reason for that might be due to large receive offload (LRO) or header
 * splitting functionality, which presently isn't supported in the driver. The
 * error bits in the frame are only valid when EOP is set.
 *
 * If error bits are set on the frame, then we still consume it; however, we
 * will not generate an mblk_t to send up to MAC. If there are no error bits
 * set, then we'll consume the descriptor either using bcopy or DMA binding. See
 * the earlier section 'RX Descriptors and Control Blocks' for more information
 * on how that selection is made.
 *
 * Regardless of whether we construct an mblk_t or encounter an error, we end up
 * resetting the descriptor. This re-arms the descriptor for hardware and in the
 * process, we may end up assigning it a new receive control block. After we do
 * this, we always update our HEAD pointer, no matter what.
 *
 * Finally, once we've consumed as much as we will in a given window, we go and
 * update the TAIL register to indicate all the frames we've consumed. We only
 * do a single bulk write for the ring.
 *
 * ---------------------------------
 * TX Descriptors and Control Blocks
 * ---------------------------------
 *
 * While the transmit path is similar in spirit to the receive path, it works
 * differently due to the fact that all data is originated by the operating
 * system and not by the device.
 *
 * Like RX, there is both a descriptor ring that we use to communicate with the
 * hardware and which points to the memory used to transmit a frame.
 * Similarly,
 * there is a corresponding transmit control block; however, the correspondence
 * between descriptors and control blocks is more complex and not necessarily
 * 1-to-1.
 *
 * The driver is asked to process a single frame at a time. That message block
 * may be made up of multiple fragments linked together by the mblk_t`b_cont
 * member. The device has a hard limit of up to 8 buffers being allowed for use
 * for a single non-LSO packet or LSO segment. The number of TX ring entries
 * (and thus TX control blocks) used depends on the fragment sizes and DMA
 * layout, as explained below.
 *
 * We alter our DMA strategy based on a threshold tied to the fragment size.
 * This threshold is configurable via the tx_dma_threshold property. If the
 * fragment is above the threshold, we DMA bind it -- consuming one TCB and
 * potentially several data descriptors. The exact number of descriptors (equal
 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
 * into page, b_wptr offset into page, and the physical layout of the dblk's
 * memory (contiguous or not). Essentially, we are at the mercy of the DMA
 * engine and the dblk's memory allocation. Knowing the exact number of
 * descriptors up front is a task best not taken on by the driver itself.
 * Instead, we attempt to DMA bind the fragment and verify the descriptor
 * layout meets hardware constraints. If the proposed DMA bind does not satisfy
 * the hardware constraints, then we discard it and instead copy the entire
 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
 * larger than the TCB buffer).
 *
 * If the fragment is below or at the threshold, we copy it to the pre-allocated
 * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
 * conserve resources. We are guaranteed that the TCB buffer is made up of only
 * one DMA cookie and therefore consumes only one descriptor on the controller.
 *
 * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
 * filtering, then the TX data descriptors must be preceded by a single TX
 * context descriptor. Because there is no DMA transfer associated with the
 * context descriptor, we allocate a control block with a special type which
 * indicates to the TX ring recycle code that there are no associated DMA
 * resources to unbind when the control block is freed.
 *
 * If we don't have enough space in the ring or TX control blocks available,
 * then we'll return the unprocessed message block to MAC. This will induce flow
 * control and once we recycle enough entries, we'll once again enable sending
 * on the ring.
 *
 * We size the working list as equal to the number of descriptors in the ring.
 * We size the free list as equal to 1.5 times the number of descriptors in the
 * ring. We'll allocate a number of TX control block entries equal to the number
 * of entries in the free list. By default, all entries are placed in the free
 * list. As we come along and try to send something, we'll allocate entries from
 * the free list and add them to the working list, where they'll stay until the
 * hardware indicates that all of the data has been written back to us. The
 * reason that we start with 1.5x is to help facilitate having more than one TX
 * buffer associated with the DMA activity.
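 *
 * As a concrete sketch of that sizing (this mirrors what i40e_alloc_tx_dma()
 * below computes) and of the per-fragment copy-versus-bind decision described
 * above (pseudo-code only):
 *
 *	itrq_tx_ring_size      = i40e_tx_ring_size;
 *	itrq_tx_free_list_size = i40e_tx_ring_size + (i40e_tx_ring_size >> 1);
 *
 *	if (fragment length > tx_dma_threshold)
 *		try a DMA bind, falling back to a copy if the resulting
 *		cookie layout would violate the 8 buffer limit;
 *	else
 *		copy the fragment into the current tcb's buffer.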
307 * 308 * -------------------- 309 * Managing the TX Ring 310 * -------------------- 311 * 312 * The transmit descriptor ring is driven by us. We maintain our own notion of a 313 * HEAD and TAIL register and we update the hardware with updates to the TAIL 314 * register. When the hardware is done writing out data, it updates us by 315 * writing back to a specific address, not by updating the individual 316 * descriptors. That address is a 4-byte region after the main transmit 317 * descriptor ring. This is why the descriptor ring has an extra descriptor's 318 * worth allocated to it. 319 * 320 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and 321 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames, 322 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various 323 * points in time, through both interrupts, and our own internal checks, we'll 324 * sync the write-back head portion of the DMA space. Based on the index it 325 * reports back, we'll free everything between our current HEAD and the 326 * indicated index and update HEAD to the new index. 327 * 328 * When a frame comes in, we try to use a number of transmit control blocks and 329 * we'll transition them from the free list to the work list. They'll get moved 330 * to the entry on the work list that corresponds with the transmit descriptor 331 * they correspond to. Once we are indicated that the corresponding descriptor 332 * has been freed, we'll return it to the list. 333 * 334 * The transmit control block free list is managed by keeping track of the 335 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to 336 * index into the free list and add things to it. In effect, we always push and 337 * pop from the tail and protect it with a single lock, 338 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not 339 * stand up to further performance testing; however, it does allow us to get off 340 * the ground with the device driver. 341 * 342 * The following image describes where a given transmit control block lives in 343 * its lifetime: 344 * 345 * | 346 * * ... Initial placement for all tcb's 347 * | 348 * v 349 * +------------------+ +------------------+ 350 * | tcb on free list |---*------------------>| tcb on work list | 351 * +------------------+ . +------------------+ 352 * ^ . N tcbs allocated[1] | 353 * | to send frame v 354 * | or fragment on | 355 * | wire, mblk from | 356 * | MAC associated. | 357 * | | 358 * +------*-------------------------------<----+ 359 * . 360 * . Hardware indicates 361 * entry transmitted. 362 * tcbs recycled, mblk 363 * from MAC freed. 364 * 365 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context 366 * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA 367 * bind case, N can be 1 context descriptor plus 1 data descriptor per 368 * b_cont in the mblk. In this case, the mblk is associated with the first 369 * data descriptor and freed as part of freeing that data descriptor. 370 * 371 * ------------ 372 * Blocking MAC 373 * ------------ 374 * 375 * When performing transmit, we can run out of descriptors and ring entries. 376 * When such a case happens, we return the mblk_t to MAC to indicate that we've 377 * been blocked. At that point in time, MAC becomes blocked and will not 378 * transmit anything out that specific ring until we notify MAC. 
To indicate 379 * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member 380 * to B_TRUE. 381 * 382 * When we recycle TX descriptors then we'll end up signaling MAC by calling 383 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to 384 * start sending frames out to us again. 385 */ 386 387 /* 388 * We set our DMA alignment requests based on the smallest supported page size 389 * of the corresponding platform. 390 */ 391 #if defined(__sparc) 392 #define I40E_DMA_ALIGNMENT 0x2000ull 393 #elif defined(__x86) 394 #define I40E_DMA_ALIGNMENT 0x1000ull 395 #else 396 #error "unknown architecture for i40e" 397 #endif 398 399 /* 400 * This structure is used to maintain information and flags related to 401 * transmitting a frame. These fields are ultimately used to construct the 402 * TX data descriptor(s) and, if necessary, the TX context descriptor. 403 */ 404 typedef struct i40e_tx_context { 405 enum i40e_tx_desc_cmd_bits itc_data_cmdflags; 406 uint32_t itc_data_offsets; 407 enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; 408 uint32_t itc_ctx_tsolen; 409 uint32_t itc_ctx_mss; 410 } i40e_tx_context_t; 411 412 /* 413 * Toggles on debug builds which can be used to override our RX behaviour based 414 * on thresholds. 415 */ 416 #ifdef DEBUG 417 typedef enum { 418 I40E_DEBUG_RX_DEFAULT = 0, 419 I40E_DEBUG_RX_BCOPY = 1, 420 I40E_DEBUG_RX_DMABIND = 2 421 } i40e_debug_rx_t; 422 423 i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; 424 #endif /* DEBUG */ 425 426 /* 427 * Notes on the following pair of DMA attributes. The first attribute, 428 * i40e_static_dma_attr, is designed to be used for both the descriptor rings 429 * and the static buffers that we associate with control blocks. For this 430 * reason, we force an SGL length of one. While technically the driver supports 431 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our 432 * management here. In addition, when the Intel common code wants to allocate 433 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage 434 * the static dma attr. 435 * 436 * The latter two sets of attributes, are what we use when we're binding a 437 * bunch of mblk_t fragments to go out the door. Note that the main difference 438 * here is that we're allowed a larger SGL length. For non-LSO TX, we 439 * restrict the SGL length to match the number of TX buffers available to the 440 * PF (8). For the LSO case we can go much larger, with the caveat that each 441 * MSS-sized chunk (segment) must not span more than 8 data descriptors and 442 * hence must not span more than 8 cookies. 443 * 444 * Note, we default to setting ourselves to be DMA capable here. However, 445 * because we could have multiple instances which have different FMA error 446 * checking capabilities, or end up on different buses, we make these static 447 * and const and copy them into the i40e_t for the given device with the actual 448 * values that reflect the actual capabilities. 
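 * (The copy, along with setting or clearing DDI_DMA_FLAGERR, is done in
 * i40e_init_dma_attrs() later in this file.)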
449 */ 450 static const ddi_dma_attr_t i40e_g_static_dma_attr = { 451 DMA_ATTR_V0, /* version number */ 452 0x0000000000000000ull, /* low address */ 453 0xFFFFFFFFFFFFFFFFull, /* high address */ 454 0x00000000FFFFFFFFull, /* dma counter max */ 455 I40E_DMA_ALIGNMENT, /* alignment */ 456 0x00000FFF, /* burst sizes */ 457 0x00000001, /* minimum transfer size */ 458 0x00000000FFFFFFFFull, /* maximum transfer size */ 459 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 460 1, /* scatter/gather list length */ 461 0x00000001, /* granularity */ 462 DDI_DMA_FLAGERR /* DMA flags */ 463 }; 464 465 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { 466 DMA_ATTR_V0, /* version number */ 467 0x0000000000000000ull, /* low address */ 468 0xFFFFFFFFFFFFFFFFull, /* high address */ 469 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ 470 I40E_DMA_ALIGNMENT, /* alignment */ 471 0x00000FFF, /* burst sizes */ 472 0x00000001, /* minimum transfer size */ 473 0x00000000FFFFFFFFull, /* maximum transfer size */ 474 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 475 I40E_TX_MAX_COOKIE, /* scatter/gather list length */ 476 0x00000001, /* granularity */ 477 DDI_DMA_FLAGERR /* DMA flags */ 478 }; 479 480 static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { 481 DMA_ATTR_V0, /* version number */ 482 0x0000000000000000ull, /* low address */ 483 0xFFFFFFFFFFFFFFFFull, /* high address */ 484 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ 485 I40E_DMA_ALIGNMENT, /* alignment */ 486 0x00000FFF, /* burst sizes */ 487 0x00000001, /* minimum transfer size */ 488 0x00000000FFFFFFFFull, /* maximum transfer size */ 489 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 490 I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ 491 0x00000001, /* granularity */ 492 DDI_DMA_FLAGERR /* DMA flags */ 493 }; 494 495 /* 496 * Next, we have the attributes for these structures. The descriptor rings are 497 * all strictly little endian, while the data buffers are just arrays of bytes 498 * representing frames. Because of this, we purposefully simplify the driver 499 * programming life by programming the descriptor ring as little endian, while 500 * for the buffer data we keep it as unstructured. 501 * 502 * Note, that to keep the Intel common code operating in a reasonable way, when 503 * we allocate DMA memory for it, we do not use byte swapping and thus use the 504 * standard i40e_buf_acc_attr. 505 */ 506 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = { 507 DDI_DEVICE_ATTR_V0, 508 DDI_STRUCTURE_LE_ACC, 509 DDI_STRICTORDER_ACC 510 }; 511 512 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = { 513 DDI_DEVICE_ATTR_V0, 514 DDI_NEVERSWAP_ACC, 515 DDI_STRICTORDER_ACC 516 }; 517 518 /* 519 * The next two functions are designed to be type-safe versions of macros that 520 * are used to increment and decrement a descriptor index in the loop. Note, 521 * these are marked inline to try and keep the data path hot and they were 522 * effectively inlined in their previous life as macros. 
523 */ 524 static inline int 525 i40e_next_desc(int base, int count, int size) 526 { 527 int out; 528 529 ASSERT(base >= 0); 530 ASSERT(count > 0); 531 ASSERT(size > 0); 532 533 if (base + count < size) { 534 out = base + count; 535 } else { 536 out = base + count - size; 537 } 538 539 ASSERT(out >= 0 && out < size); 540 return (out); 541 } 542 543 static inline int 544 i40e_prev_desc(int base, int count, int size) 545 { 546 int out; 547 548 ASSERT(base >= 0); 549 ASSERT(count > 0); 550 ASSERT(size > 0); 551 552 if (base >= count) { 553 out = base - count; 554 } else { 555 out = base - count + size; 556 } 557 558 ASSERT(out >= 0 && out < size); 559 return (out); 560 } 561 562 /* 563 * Free DMA memory that is represented by a i40e_dma_buffer_t. 564 */ 565 static void 566 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap) 567 { 568 if (dmap->dmab_dma_address != 0) { 569 VERIFY(dmap->dmab_dma_handle != NULL); 570 (void) ddi_dma_unbind_handle(dmap->dmab_dma_handle); 571 dmap->dmab_dma_address = 0; 572 dmap->dmab_size = 0; 573 } 574 575 if (dmap->dmab_acc_handle != NULL) { 576 ddi_dma_mem_free(&dmap->dmab_acc_handle); 577 dmap->dmab_acc_handle = NULL; 578 dmap->dmab_address = NULL; 579 } 580 581 if (dmap->dmab_dma_handle != NULL) { 582 ddi_dma_free_handle(&dmap->dmab_dma_handle); 583 dmap->dmab_dma_handle = NULL; 584 } 585 586 /* 587 * These should only be set if we have valid handles allocated and 588 * therefore should always be NULLed out due to the above code. This 589 * is here to catch us acting sloppy. 590 */ 591 ASSERT(dmap->dmab_dma_address == 0); 592 ASSERT(dmap->dmab_address == NULL); 593 ASSERT(dmap->dmab_size == 0); 594 dmap->dmab_len = 0; 595 } 596 597 /* 598 * Allocate size bytes of DMA memory based on the passed in attributes. This 599 * fills in the information in dmap and is designed for all of our single cookie 600 * allocations. 
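 * Within this file it is used for the descriptor rings and for the
 * per-control-block copy buffers; every caller passes attributes that force a
 * single cookie, which is VERIFY'd after the bind.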
601 */ 602 static boolean_t 603 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap, 604 ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream, 605 boolean_t zero, size_t size) 606 { 607 int ret; 608 uint_t flags; 609 size_t len; 610 ddi_dma_cookie_t cookie; 611 uint_t ncookies; 612 613 if (stream == B_TRUE) 614 flags = DDI_DMA_STREAMING; 615 else 616 flags = DDI_DMA_CONSISTENT; 617 618 /* 619 * Step one: Allocate the DMA handle 620 */ 621 ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT, 622 NULL, &dmap->dmab_dma_handle); 623 if (ret != DDI_SUCCESS) { 624 i40e_error(i40e, "failed to allocate dma handle for I/O " 625 "buffers: %d", ret); 626 dmap->dmab_dma_handle = NULL; 627 return (B_FALSE); 628 } 629 630 /* 631 * Step two: Allocate the DMA memory 632 */ 633 ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags, 634 DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len, 635 &dmap->dmab_acc_handle); 636 if (ret != DDI_SUCCESS) { 637 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " 638 "buffers", size); 639 dmap->dmab_address = NULL; 640 dmap->dmab_acc_handle = NULL; 641 i40e_free_dma_buffer(dmap); 642 return (B_FALSE); 643 } 644 645 /* 646 * Step three: Optionally zero 647 */ 648 if (zero == B_TRUE) 649 bzero(dmap->dmab_address, len); 650 651 /* 652 * Step four: Bind the memory 653 */ 654 ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL, 655 dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, 656 NULL, &cookie, &ncookies); 657 if (ret != DDI_DMA_MAPPED) { 658 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " 659 "buffers: %d", size, ret); 660 i40e_free_dma_buffer(dmap); 661 return (B_FALSE); 662 } 663 664 VERIFY(ncookies == 1); 665 dmap->dmab_dma_address = cookie.dmac_laddress; 666 dmap->dmab_size = len; 667 dmap->dmab_len = 0; 668 return (B_TRUE); 669 } 670 671 /* 672 * This function is called once the last pending rcb has been freed by the upper 673 * levels of the system. 
674 */ 675 static void 676 i40e_free_rx_data(i40e_rx_data_t *rxd) 677 { 678 VERIFY(rxd->rxd_rcb_pending == 0); 679 680 if (rxd->rxd_rcb_area != NULL) { 681 kmem_free(rxd->rxd_rcb_area, 682 sizeof (i40e_rx_control_block_t) * 683 (rxd->rxd_free_list_size + rxd->rxd_ring_size)); 684 rxd->rxd_rcb_area = NULL; 685 } 686 687 if (rxd->rxd_free_list != NULL) { 688 kmem_free(rxd->rxd_free_list, 689 sizeof (i40e_rx_control_block_t *) * 690 rxd->rxd_free_list_size); 691 rxd->rxd_free_list = NULL; 692 } 693 694 if (rxd->rxd_work_list != NULL) { 695 kmem_free(rxd->rxd_work_list, 696 sizeof (i40e_rx_control_block_t *) * 697 rxd->rxd_ring_size); 698 rxd->rxd_work_list = NULL; 699 } 700 701 kmem_free(rxd, sizeof (i40e_rx_data_t)); 702 } 703 704 static boolean_t 705 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) 706 { 707 i40e_rx_data_t *rxd; 708 709 rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP); 710 if (rxd == NULL) 711 return (B_FALSE); 712 itrq->itrq_rxdata = rxd; 713 rxd->rxd_i40e = i40e; 714 715 rxd->rxd_ring_size = i40e->i40e_rx_ring_size; 716 rxd->rxd_free_list_size = i40e->i40e_rx_ring_size; 717 718 rxd->rxd_rcb_free = rxd->rxd_free_list_size; 719 720 rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * 721 rxd->rxd_ring_size, KM_NOSLEEP); 722 if (rxd->rxd_work_list == NULL) { 723 i40e_error(i40e, "failed to allocate RX work list for a ring " 724 "of %d entries for ring %d", rxd->rxd_ring_size, 725 itrq->itrq_index); 726 goto cleanup; 727 } 728 729 rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * 730 rxd->rxd_free_list_size, KM_NOSLEEP); 731 if (rxd->rxd_free_list == NULL) { 732 i40e_error(i40e, "failed to allocate a %d entry RX free list " 733 "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); 734 goto cleanup; 735 } 736 737 rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) * 738 (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP); 739 if (rxd->rxd_rcb_area == NULL) { 740 i40e_error(i40e, "failed to allocate a %d entry rcb area for " 741 "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size, 742 itrq->itrq_index); 743 goto cleanup; 744 } 745 746 return (B_TRUE); 747 748 cleanup: 749 i40e_free_rx_data(rxd); 750 itrq->itrq_rxdata = NULL; 751 return (B_FALSE); 752 } 753 754 /* 755 * Free all of the memory that we've allocated for DMA. Note that we may have 756 * buffers that we've loaned up to the OS which are still outstanding. We'll 757 * always free up the descriptor ring, because we no longer need that. For each 758 * rcb, we'll iterate over it and if we send the reference count to zero, then 759 * we'll free the message block and DMA related resources. However, if we don't 760 * take the last one, then we'll go ahead and keep track that we'll have pending 761 * data and clean it up when we get there. 762 */ 763 static void 764 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init) 765 { 766 uint32_t i, count, ref; 767 768 i40e_rx_control_block_t *rcb; 769 i40e_t *i40e = rxd->rxd_i40e; 770 771 i40e_free_dma_buffer(&rxd->rxd_desc_area); 772 rxd->rxd_desc_ring = NULL; 773 rxd->rxd_desc_next = 0; 774 775 mutex_enter(&i40e->i40e_rx_pending_lock); 776 777 rcb = rxd->rxd_rcb_area; 778 count = rxd->rxd_ring_size + rxd->rxd_free_list_size; 779 780 for (i = 0; i < count; i++, rcb++) { 781 VERIFY(rcb != NULL); 782 783 /* 784 * If we're cleaning up from a failed creation attempt, then an 785 * entry may never have been assembled which would mean that 786 * it's reference count is zero. 
If we find that, we leave it 787 * be, because nothing else should be modifying it at this 788 * point. We're not at the point that any more references can be 789 * added, just removed. 790 */ 791 if (failed_init == B_TRUE && rcb->rcb_ref == 0) 792 continue; 793 794 ref = atomic_dec_32_nv(&rcb->rcb_ref); 795 if (ref == 0) { 796 freemsg(rcb->rcb_mp); 797 rcb->rcb_mp = NULL; 798 i40e_free_dma_buffer(&rcb->rcb_dma); 799 } else { 800 atomic_inc_32(&rxd->rxd_rcb_pending); 801 atomic_inc_32(&i40e->i40e_rx_pending); 802 } 803 } 804 mutex_exit(&i40e->i40e_rx_pending_lock); 805 } 806 807 /* 808 * Initialize the DMA memory for the descriptor ring and for each frame in the 809 * control block list. 810 */ 811 static boolean_t 812 i40e_alloc_rx_dma(i40e_rx_data_t *rxd) 813 { 814 int i, count; 815 size_t dmasz; 816 i40e_rx_control_block_t *rcb; 817 i40e_t *i40e = rxd->rxd_i40e; 818 819 /* 820 * First allocate the RX descriptor ring. 821 */ 822 dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; 823 VERIFY(dmasz > 0); 824 if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area, 825 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, 826 B_TRUE, dmasz) == B_FALSE) { 827 i40e_error(i40e, "failed to allocate DMA resources " 828 "for RX descriptor ring"); 829 return (B_FALSE); 830 } 831 rxd->rxd_desc_ring = 832 (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address; 833 rxd->rxd_desc_next = 0; 834 835 count = rxd->rxd_ring_size + rxd->rxd_free_list_size; 836 rcb = rxd->rxd_rcb_area; 837 838 dmasz = i40e->i40e_rx_buf_size; 839 VERIFY(dmasz > 0); 840 for (i = 0; i < count; i++, rcb++) { 841 i40e_dma_buffer_t *dmap; 842 VERIFY(rcb != NULL); 843 844 if (i < rxd->rxd_ring_size) { 845 rxd->rxd_work_list[i] = rcb; 846 } else { 847 rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb; 848 } 849 850 dmap = &rcb->rcb_dma; 851 if (i40e_alloc_dma_buffer(i40e, dmap, 852 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, 853 B_TRUE, B_FALSE, dmasz) == B_FALSE) { 854 i40e_error(i40e, "failed to allocate RX dma buffer"); 855 return (B_FALSE); 856 } 857 858 /* 859 * Initialize the control block and offset the DMA address. See 860 * the note in the big theory statement that explains how this 861 * helps IP deal with alignment. Note, we don't worry about 862 * whether or not we successfully get an mblk_t from desballoc, 863 * it's a common case that we have to handle later on in the 864 * system. 
865 */ 866 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT; 867 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT; 868 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT; 869 870 rcb->rcb_ref = 1; 871 rcb->rcb_rxd = rxd; 872 rcb->rcb_free_rtn.free_func = i40e_rx_recycle; 873 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb; 874 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address, 875 dmap->dmab_size, 0, &rcb->rcb_free_rtn); 876 } 877 878 return (B_TRUE); 879 } 880 881 static void 882 i40e_free_tx_dma(i40e_trqpair_t *itrq) 883 { 884 size_t fsz; 885 886 if (itrq->itrq_tcb_area != NULL) { 887 uint32_t i; 888 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area; 889 890 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { 891 i40e_free_dma_buffer(&tcb->tcb_dma); 892 if (tcb->tcb_dma_handle != NULL) { 893 ddi_dma_free_handle(&tcb->tcb_dma_handle); 894 tcb->tcb_dma_handle = NULL; 895 } 896 if (tcb->tcb_lso_dma_handle != NULL) { 897 ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); 898 tcb->tcb_lso_dma_handle = NULL; 899 } 900 } 901 902 fsz = sizeof (i40e_tx_control_block_t) * 903 itrq->itrq_tx_free_list_size; 904 kmem_free(itrq->itrq_tcb_area, fsz); 905 itrq->itrq_tcb_area = NULL; 906 } 907 908 if (itrq->itrq_tcb_free_list != NULL) { 909 fsz = sizeof (i40e_tx_control_block_t *) * 910 itrq->itrq_tx_free_list_size; 911 kmem_free(itrq->itrq_tcb_free_list, fsz); 912 itrq->itrq_tcb_free_list = NULL; 913 } 914 915 if (itrq->itrq_tcb_work_list != NULL) { 916 fsz = sizeof (i40e_tx_control_block_t *) * 917 itrq->itrq_tx_ring_size; 918 kmem_free(itrq->itrq_tcb_work_list, fsz); 919 itrq->itrq_tcb_work_list = NULL; 920 } 921 922 i40e_free_dma_buffer(&itrq->itrq_desc_area); 923 itrq->itrq_desc_ring = NULL; 924 925 } 926 927 static boolean_t 928 i40e_alloc_tx_dma(i40e_trqpair_t *itrq) 929 { 930 int i, ret; 931 size_t dmasz; 932 i40e_tx_control_block_t *tcb; 933 i40e_t *i40e = itrq->itrq_i40e; 934 935 itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size; 936 itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size + 937 (i40e->i40e_tx_ring_size >> 1); 938 939 /* 940 * Allocate an additional TX descriptor for the writeback head. 
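 * The hardware reports TX completions by DMA'ing the index of the most
 * recently completed descriptor into this extra 4-byte slot (itrq_desc_wbhead
 * below), rather than by writing status back into each descriptor; see
 * 'Managing the TX Ring' in the theory statement above.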
 */
	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
	dmasz += sizeof (i40e_tx_desc_t);

	VERIFY(dmasz > 0);
	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
		i40e_error(i40e, "failed to allocate DMA resources for TX "
		    "descriptor ring");
		return (B_FALSE);
	}
	itrq->itrq_desc_ring =
	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
	    itrq->itrq_tx_ring_size);
	itrq->itrq_desc_head = 0;
	itrq->itrq_desc_tail = 0;
	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;

	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
	if (itrq->itrq_tcb_work_list == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry TX work list "
		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
		goto cleanup;
	}

	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
	if (itrq->itrq_tcb_free_list == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry TX free list "
		    "for ring %d", itrq->itrq_tx_free_list_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	/*
	 * We allocate enough TX control blocks to cover the free list.
	 */
	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
	if (itrq->itrq_tcb_area == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
		goto cleanup;
	}

	/*
	 * For each tcb, allocate DMA memory.
	 */
	dmasz = i40e->i40e_tx_buf_size;
	VERIFY(dmasz > 0);
	tcb = itrq->itrq_tcb_area;
	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
		VERIFY(tcb != NULL);

		/*
		 * Allocate both a DMA buffer which we'll use for when we copy
		 * packets for transmission and allocate a DMA handle which
		 * we'll use when we bind data.
		 */
		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
		    &tcb->tcb_dma_handle);
		if (ret != DDI_SUCCESS) {
			i40e_error(i40e, "failed to allocate DMA handle for TX "
			    "data binding on ring %d: %d", itrq->itrq_index,
			    ret);
			tcb->tcb_dma_handle = NULL;
			goto cleanup;
		}

		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
		    &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
		    &tcb->tcb_lso_dma_handle);
		if (ret != DDI_SUCCESS) {
			i40e_error(i40e, "failed to allocate DMA handle for TX "
			    "LSO data binding on ring %d: %d", itrq->itrq_index,
			    ret);
			tcb->tcb_lso_dma_handle = NULL;
			goto cleanup;
		}

		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
			i40e_error(i40e, "failed to allocate %ld bytes of "
			    "DMA for TX data binding on ring %d", dmasz,
			    itrq->itrq_index);
			goto cleanup;
		}

		itrq->itrq_tcb_free_list[i] = tcb;
	}

	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;

	return (B_TRUE);

cleanup:
	i40e_free_tx_dma(itrq);
	return (B_FALSE);
}

/*
 * Free all memory associated with a ring.
Note, this is done as part of 1048 * the GLDv3 ring stop routine. 1049 */ 1050 void 1051 i40e_free_ring_mem(i40e_trqpair_t *itrq, boolean_t failed_init) 1052 { 1053 i40e_t *i40e = itrq->itrq_i40e; 1054 i40e_rx_data_t *rxd = itrq->itrq_rxdata; 1055 1056 /* 1057 * In some cases i40e_alloc_rx_data() may have failed 1058 * and in that case there is no rxd to free. 1059 */ 1060 if (rxd == NULL) 1061 return; 1062 1063 /* 1064 * Clean up our RX data. We have to free DMA resources first and 1065 * then if we have no more pending RCB's, then we'll go ahead 1066 * and clean things up. Note, we can't set the stopped flag on 1067 * the RX data until after we've done the first pass of the 1068 * pending resources. Otherwise we might race with 1069 * i40e_rx_recycle on determining who should free the 1070 * i40e_rx_data_t above. 1071 */ 1072 i40e_free_rx_dma(rxd, failed_init); 1073 1074 mutex_enter(&i40e->i40e_rx_pending_lock); 1075 rxd->rxd_shutdown = B_TRUE; 1076 if (rxd->rxd_rcb_pending == 0) { 1077 i40e_free_rx_data(rxd); 1078 itrq->itrq_rxdata = NULL; 1079 } 1080 mutex_exit(&i40e->i40e_rx_pending_lock); 1081 1082 i40e_free_tx_dma(itrq); 1083 } 1084 1085 /* 1086 * Allocate all of the resources associated with a ring. 1087 * Note this is done as part of the GLDv3 ring start routine. 1088 * This takes care of both DMA and non-DMA related resources. 1089 */ 1090 boolean_t 1091 i40e_alloc_ring_mem(i40e_trqpair_t *itrq) 1092 { 1093 if (!i40e_alloc_rx_data(itrq->itrq_i40e, itrq)) 1094 goto free; 1095 1096 if (!i40e_alloc_rx_dma(itrq->itrq_rxdata)) 1097 goto free; 1098 1099 if (!i40e_alloc_tx_dma(itrq)) 1100 goto free; 1101 1102 return (B_TRUE); 1103 1104 free: 1105 i40e_free_ring_mem(itrq, B_TRUE); 1106 return (B_FALSE); 1107 } 1108 1109 1110 /* 1111 * Because every instance of i40e may have different support for FMA 1112 * capabilities, we copy the DMA attributes into the i40e_t and set them that 1113 * way and use them for determining attributes. 
1114 */ 1115 void 1116 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) 1117 { 1118 bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr, 1119 sizeof (ddi_dma_attr_t)); 1120 bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, 1121 sizeof (ddi_dma_attr_t)); 1122 bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, 1123 sizeof (ddi_dma_attr_t)); 1124 bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, 1125 sizeof (ddi_device_acc_attr_t)); 1126 bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, 1127 sizeof (ddi_device_acc_attr_t)); 1128 1129 if (fma == B_TRUE) { 1130 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 1131 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 1132 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= 1133 DDI_DMA_FLAGERR; 1134 } else { 1135 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; 1136 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; 1137 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= 1138 ~DDI_DMA_FLAGERR; 1139 } 1140 } 1141 1142 static void 1143 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb) 1144 { 1145 mutex_enter(&rxd->rxd_free_lock); 1146 ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size); 1147 ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL); 1148 rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb; 1149 rxd->rxd_rcb_free++; 1150 mutex_exit(&rxd->rxd_free_lock); 1151 } 1152 1153 static i40e_rx_control_block_t * 1154 i40e_rcb_alloc(i40e_rx_data_t *rxd) 1155 { 1156 i40e_rx_control_block_t *rcb; 1157 1158 mutex_enter(&rxd->rxd_free_lock); 1159 if (rxd->rxd_rcb_free == 0) { 1160 mutex_exit(&rxd->rxd_free_lock); 1161 return (NULL); 1162 } 1163 rxd->rxd_rcb_free--; 1164 rcb = rxd->rxd_free_list[rxd->rxd_rcb_free]; 1165 VERIFY(rcb != NULL); 1166 rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL; 1167 mutex_exit(&rxd->rxd_free_lock); 1168 1169 return (rcb); 1170 } 1171 1172 /* 1173 * This is the callback that we get from the OS when freemsg(9F) has been called 1174 * on a loaned descriptor. In addition, if we take the last reference count 1175 * here, then we have to tear down all of the RX data. 1176 */ 1177 void 1178 i40e_rx_recycle(caddr_t arg) 1179 { 1180 uint32_t ref; 1181 i40e_rx_control_block_t *rcb; 1182 i40e_rx_data_t *rxd; 1183 i40e_t *i40e; 1184 1185 /* LINTED: E_BAD_PTR_CAST_ALIGN */ 1186 rcb = (i40e_rx_control_block_t *)arg; 1187 rxd = rcb->rcb_rxd; 1188 i40e = rxd->rxd_i40e; 1189 1190 /* 1191 * It's possible for this to be called with a reference count of zero. 1192 * That will happen when we're doing the freemsg after taking the last 1193 * reference because we're tearing down everything and this rcb is not 1194 * outstanding. 1195 */ 1196 if (rcb->rcb_ref == 0) 1197 return; 1198 1199 /* 1200 * Don't worry about failure of desballoc here. It'll only become fatal 1201 * if we're trying to use it and we can't in i40e_rx_bind(). 1202 */ 1203 rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address, 1204 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); 1205 i40e_rcb_free(rxd, rcb); 1206 1207 /* 1208 * It's possible that the rcb was being used while we are shutting down 1209 * the device. In that case, we'll take the final reference from the 1210 * device here. 
1211 */ 1212 ref = atomic_dec_32_nv(&rcb->rcb_ref); 1213 if (ref == 0) { 1214 freemsg(rcb->rcb_mp); 1215 rcb->rcb_mp = NULL; 1216 i40e_free_dma_buffer(&rcb->rcb_dma); 1217 1218 mutex_enter(&i40e->i40e_rx_pending_lock); 1219 atomic_dec_32(&rxd->rxd_rcb_pending); 1220 atomic_dec_32(&i40e->i40e_rx_pending); 1221 1222 /* 1223 * If this was the last block and it's been indicated that we've 1224 * passed the shutdown point, we should clean up. 1225 */ 1226 if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) { 1227 i40e_free_rx_data(rxd); 1228 cv_broadcast(&i40e->i40e_rx_pending_cv); 1229 } 1230 1231 mutex_exit(&i40e->i40e_rx_pending_lock); 1232 } 1233 } 1234 1235 static mblk_t * 1236 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, 1237 uint32_t plen) 1238 { 1239 mblk_t *mp; 1240 i40e_t *i40e = rxd->rxd_i40e; 1241 i40e_rx_control_block_t *rcb, *rep_rcb; 1242 1243 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); 1244 1245 if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) { 1246 itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++; 1247 return (NULL); 1248 } 1249 1250 rcb = rxd->rxd_work_list[index]; 1251 1252 /* 1253 * Check to make sure we have a mblk_t. If we don't, this is our last 1254 * chance to try and get one. 1255 */ 1256 if (rcb->rcb_mp == NULL) { 1257 rcb->rcb_mp = 1258 desballoc((unsigned char *)rcb->rcb_dma.dmab_address, 1259 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); 1260 if (rcb->rcb_mp == NULL) { 1261 itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++; 1262 i40e_rcb_free(rxd, rcb); 1263 return (NULL); 1264 } 1265 } 1266 1267 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); 1268 1269 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { 1270 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1271 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1272 i40e_rcb_free(rxd, rcb); 1273 return (NULL); 1274 } 1275 1276 /* 1277 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT. 1278 */ 1279 mp = rcb->rcb_mp; 1280 atomic_inc_32(&rcb->rcb_ref); 1281 mp->b_wptr = mp->b_rptr + plen; 1282 mp->b_next = mp->b_cont = NULL; 1283 1284 rxd->rxd_work_list[index] = rep_rcb; 1285 return (mp); 1286 } 1287 1288 /* 1289 * We're going to allocate a new message block for this frame and attempt to 1290 * receive it. See the big theory statement for more information on when we copy 1291 * versus bind. 1292 */ 1293 static mblk_t * 1294 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, 1295 uint32_t plen) 1296 { 1297 i40e_t *i40e = rxd->rxd_i40e; 1298 i40e_rx_control_block_t *rcb; 1299 mblk_t *mp; 1300 1301 ASSERT(index < rxd->rxd_ring_size); 1302 rcb = rxd->rxd_work_list[index]; 1303 1304 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); 1305 1306 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { 1307 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1308 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1309 return (NULL); 1310 } 1311 1312 mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0); 1313 if (mp == NULL) { 1314 itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++; 1315 return (NULL); 1316 } 1317 1318 mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT; 1319 bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen); 1320 mp->b_wptr = mp->b_rptr + plen; 1321 1322 return (mp); 1323 } 1324 1325 /* 1326 * Determine if the device has enabled any checksum flags for us. The level of 1327 * checksum computed will depend on the type packet that we have, which is 1328 * contained in ptype. 
For example, the checksum logic it does will vary 1329 * depending on whether or not the packet is considered tunneled, whether it 1330 * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are 1331 * valid. 1332 * 1333 * While there are additional checksums that we could recognize here, we'll need 1334 * to get some additional GLDv3 enhancements to be able to properly describe 1335 * them. 1336 */ 1337 static void 1338 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err, 1339 uint32_t ptype) 1340 { 1341 uint32_t cksum; 1342 struct i40e_rx_ptype_decoded pinfo; 1343 1344 ASSERT(ptype <= 255); 1345 pinfo = decode_rx_desc_ptype(ptype); 1346 1347 cksum = 0; 1348 1349 /* 1350 * If the ptype isn't something that we know in the driver, then we 1351 * shouldn't even consider moving forward. 1352 */ 1353 if (pinfo.known == 0) { 1354 itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++; 1355 return; 1356 } 1357 1358 /* 1359 * If hardware didn't set the L3L4P bit on the frame, then there is no 1360 * checksum offload to consider. 1361 */ 1362 if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) { 1363 itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++; 1364 return; 1365 } 1366 1367 /* 1368 * The device tells us that IPv6 checksums where a Destination Options 1369 * Header or a Routing header shouldn't be trusted. Discard all 1370 * checksums in this case. 1371 */ 1372 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1373 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 && 1374 (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) { 1375 itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++; 1376 return; 1377 } 1378 1379 /* 1380 * The hardware denotes three kinds of possible errors. Two are reserved 1381 * for inner and outer IP checksum errors (IPE and EIPE) and the latter 1382 * is for L4 checksum errors (L4E). If there is only one IP header, then 1383 * the only thing that we care about is IPE. Note that since we don't 1384 * support inner checksums, we will ignore IPE being set on tunneled 1385 * packets and only care about EIPE. 1386 */ 1387 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1388 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) { 1389 if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) { 1390 if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) { 1391 itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++; 1392 } else { 1393 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; 1394 cksum |= HCK_IPV4_HDRCKSUM_OK; 1395 } 1396 } else { 1397 if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) { 1398 itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++; 1399 } else { 1400 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; 1401 cksum |= HCK_IPV4_HDRCKSUM_OK; 1402 } 1403 } 1404 } 1405 1406 /* 1407 * We only have meaningful L4 checksums in the case of IP->L4 and 1408 * IP->IP->L4. There is not outer L4 checksum data available in any 1409 * other case. Further, we don't bother reporting the valid checksum in 1410 * the case of IP->IP->L4 set. 
1411 */ 1412 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1413 pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE && 1414 (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP || 1415 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP || 1416 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP || 1417 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) { 1418 ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4); 1419 if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) { 1420 itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++; 1421 } else { 1422 itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++; 1423 cksum |= HCK_FULLCKSUM_OK; 1424 } 1425 } 1426 1427 if (cksum != 0) { 1428 itrq->itrq_rxstat.irxs_hck_set.value.ui64++; 1429 mac_hcksum_set(mp, 0, 0, 0, 0, cksum); 1430 } else { 1431 itrq->itrq_rxstat.irxs_hck_miss.value.ui64++; 1432 } 1433 } 1434 1435 mblk_t * 1436 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes) 1437 { 1438 i40e_t *i40e; 1439 i40e_hw_t *hw; 1440 i40e_rx_data_t *rxd; 1441 uint32_t cur_head; 1442 i40e_rx_desc_t *cur_desc; 1443 i40e_rx_control_block_t *rcb; 1444 uint64_t rx_bytes, rx_frames; 1445 uint64_t stword; 1446 mblk_t *mp, *mp_head, **mp_tail; 1447 1448 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); 1449 rxd = itrq->itrq_rxdata; 1450 i40e = itrq->itrq_i40e; 1451 hw = &i40e->i40e_hw_space; 1452 1453 if (!(i40e->i40e_state & I40E_STARTED) || 1454 (i40e->i40e_state & I40E_OVERTEMP) || 1455 (i40e->i40e_state & I40E_SUSPENDED) || 1456 (i40e->i40e_state & I40E_ERROR)) 1457 return (NULL); 1458 1459 /* 1460 * Before we do anything else, we have to make sure that all of the DMA 1461 * buffers are synced up and then check to make sure that they're 1462 * actually good from an FM perspective. 1463 */ 1464 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL); 1465 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != 1466 DDI_FM_OK) { 1467 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1468 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1469 return (NULL); 1470 } 1471 1472 /* 1473 * Prepare our stats. We do a limited amount of processing in both 1474 * polling and interrupt context. The limit in interrupt context is 1475 * based on frames, in polling context based on bytes. 1476 */ 1477 rx_bytes = rx_frames = 0; 1478 mp_head = NULL; 1479 mp_tail = &mp_head; 1480 1481 /* 1482 * At this point, the descriptor ring is available to check. We'll try 1483 * and process until we either run out of poll_bytes or descriptors. 1484 */ 1485 cur_head = rxd->rxd_desc_next; 1486 cur_desc = &rxd->rxd_desc_ring[cur_head]; 1487 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); 1488 1489 /* 1490 * Note, the primary invariant of this loop should be that cur_head, 1491 * cur_desc, and stword always point to the currently processed 1492 * descriptor. When we leave the loop, it should point to a descriptor 1493 * that HAS NOT been processed. Meaning, that if we haven't consumed the 1494 * frame, the descriptor should not be advanced. 1495 */ 1496 while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) { 1497 uint32_t error, eop, plen, ptype; 1498 1499 /* 1500 * The DD, PLEN, and EOP bits are the only ones that are valid 1501 * in every frame. The error information is only valid when EOP 1502 * is set in the same frame. 1503 * 1504 * At this time, because we don't do any LRO or header 1505 * splitting. We expect that every frame should have EOP set in 1506 * it. When later functionality comes in, we'll want to 1507 * re-evaluate this. 
1508 */ 1509 eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT); 1510 VERIFY(eop != 0); 1511 1512 error = (stword & I40E_RXD_QW1_ERROR_MASK) >> 1513 I40E_RXD_QW1_ERROR_SHIFT; 1514 if (error & I40E_RX_ERR_BITS) { 1515 itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++; 1516 goto discard; 1517 } 1518 1519 plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> 1520 I40E_RXD_QW1_LENGTH_PBUF_SHIFT; 1521 1522 ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >> 1523 I40E_RXD_QW1_PTYPE_SHIFT; 1524 1525 /* 1526 * This packet contains valid data. We should check to see if 1527 * we're actually going to consume it based on its length (to 1528 * ensure that we don't overshoot our quota). We determine 1529 * whether to bcopy or bind the DMA resources based on the size 1530 * of the frame. However, if on debug, we allow it to be 1531 * overridden for testing purposes. 1532 * 1533 * We should be smarter about this and do DMA binding for 1534 * larger frames, but for now, it's really more important that 1535 * we actually just get something simple working. 1536 */ 1537 1538 /* 1539 * Ensure we don't exceed our polling quota by reading this 1540 * frame. Note we only bump bytes now, we bump frames later. 1541 */ 1542 if ((poll_bytes != I40E_POLL_NULL) && 1543 (rx_bytes + plen) > poll_bytes) 1544 break; 1545 rx_bytes += plen; 1546 1547 mp = NULL; 1548 if (plen >= i40e->i40e_rx_dma_min) 1549 mp = i40e_rx_bind(itrq, rxd, cur_head, plen); 1550 if (mp == NULL) 1551 mp = i40e_rx_copy(itrq, rxd, cur_head, plen); 1552 1553 if (mp != NULL) { 1554 if (i40e->i40e_rx_hcksum_enable) 1555 i40e_rx_hcksum(itrq, mp, stword, error, ptype); 1556 *mp_tail = mp; 1557 mp_tail = &mp->b_next; 1558 } 1559 1560 /* 1561 * Now we need to prepare this frame for use again. See the 1562 * discussion in the big theory statements. 1563 * 1564 * However, right now we're doing the simple version of this. 1565 * Normally what we'd do would depend on whether or not we were 1566 * doing DMA binding or bcopying. But because we're always doing 1567 * bcopying, we can just always use the current index as a key 1568 * for what to do and reassign the buffer based on the ring. 1569 */ 1570 discard: 1571 rcb = rxd->rxd_work_list[cur_head]; 1572 cur_desc->read.pkt_addr = 1573 CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address); 1574 cur_desc->read.hdr_addr = 0; 1575 1576 /* 1577 * Finally, update our loop invariants. 1578 */ 1579 cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size); 1580 cur_desc = &rxd->rxd_desc_ring[cur_head]; 1581 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); 1582 1583 /* 1584 * To help provide liveness, we limit the amount of data that 1585 * we'll end up counting. Note that in these cases, an interrupt 1586 * is not dissimilar from a polling request. 1587 */ 1588 rx_frames++; 1589 if (rx_frames > i40e->i40e_rx_limit_per_intr) { 1590 itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++; 1591 break; 1592 } 1593 } 1594 1595 /* 1596 * As we've modified the ring, we need to make sure that we sync the 1597 * descriptor ring for the device. Next, we update the hardware and 1598 * update our notion of where the head for us to read from hardware is 1599 * next. 
1600 */ 1601 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV); 1602 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != 1603 DDI_FM_OK) { 1604 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1605 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1606 } 1607 1608 if (rx_frames != 0) { 1609 uint32_t tail; 1610 ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle; 1611 rxd->rxd_desc_next = cur_head; 1612 tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size); 1613 1614 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail); 1615 if (i40e_check_acc_handle(rh) != DDI_FM_OK) { 1616 ddi_fm_service_impact(i40e->i40e_dip, 1617 DDI_SERVICE_DEGRADED); 1618 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1619 } 1620 1621 itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes; 1622 itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames; 1623 } 1624 1625 #ifdef DEBUG 1626 if (rx_frames == 0) { 1627 ASSERT(rx_bytes == 0); 1628 } 1629 #endif 1630 1631 return (mp_head); 1632 } 1633 1634 /* 1635 * This function is called by the GLDv3 when it wants to poll on a ring. The 1636 * only primary difference from when we call this during an interrupt is that we 1637 * have a limit on the number of bytes that we should consume. 1638 */ 1639 mblk_t * 1640 i40e_ring_rx_poll(void *arg, int poll_bytes) 1641 { 1642 i40e_trqpair_t *itrq = arg; 1643 mblk_t *mp; 1644 1645 ASSERT(poll_bytes > 0); 1646 if (poll_bytes == 0) 1647 return (NULL); 1648 1649 mutex_enter(&itrq->itrq_rx_lock); 1650 mp = i40e_ring_rx(itrq, poll_bytes); 1651 mutex_exit(&itrq->itrq_rx_lock); 1652 1653 return (mp); 1654 } 1655 1656 /* 1657 * Attempt to put togther the information we'll need to feed into a descriptor 1658 * to properly program the hardware for checksum offload as well as the 1659 * generally required flags. 1660 * 1661 * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to 1662 * 'or' into the descriptor based on the checksum flags for this mblk_t and the 1663 * actual information we care about. 1664 * 1665 * If the mblk requires LSO then we'll also gather the information that will be 1666 * used to construct the Transmit Context Descriptor. 1667 */ 1668 static int 1669 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, 1670 mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx) 1671 { 1672 uint32_t chkflags, start, mss, lsoflags; 1673 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 1674 1675 bzero(tctx, sizeof (i40e_tx_context_t)); 1676 1677 if (i40e->i40e_tx_hcksum_enable != B_TRUE) 1678 return (0); 1679 1680 mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); 1681 mac_lso_get(mp, &mss, &lsoflags); 1682 1683 if (chkflags == 0 && lsoflags == 0) 1684 return (0); 1685 1686 /* 1687 * Have we been asked to checksum an IPv4 header. If so, verify that we 1688 * have sufficient information and then set the proper fields in the 1689 * command structure. 
1690 */ 1691 if (chkflags & HCK_IPV4_HDRCKSUM) { 1692 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { 1693 txs->itxs_hck_nol2info.value.ui64++; 1694 return (-1); 1695 } 1696 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { 1697 txs->itxs_hck_nol3info.value.ui64++; 1698 return (-1); 1699 } 1700 if (meo->meoi_l3proto != ETHERTYPE_IP) { 1701 txs->itxs_hck_badl3.value.ui64++; 1702 return (-1); 1703 } 1704 tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; 1705 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << 1706 I40E_TX_DESC_LENGTH_MACLEN_SHIFT; 1707 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << 1708 I40E_TX_DESC_LENGTH_IPLEN_SHIFT; 1709 } 1710 1711 /* 1712 * We've been asked to provide an L4 header, first, set up the IP 1713 * information in the descriptor if we haven't already before moving 1714 * onto seeing if we have enough information for the L4 checksum 1715 * offload. 1716 */ 1717 if (chkflags & HCK_PARTIALCKSUM) { 1718 if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) { 1719 txs->itxs_hck_nol4info.value.ui64++; 1720 return (-1); 1721 } 1722 1723 if (!(chkflags & HCK_IPV4_HDRCKSUM)) { 1724 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { 1725 txs->itxs_hck_nol2info.value.ui64++; 1726 return (-1); 1727 } 1728 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { 1729 txs->itxs_hck_nol3info.value.ui64++; 1730 return (-1); 1731 } 1732 1733 if (meo->meoi_l3proto == ETHERTYPE_IP) { 1734 tctx->itc_data_cmdflags |= 1735 I40E_TX_DESC_CMD_IIPT_IPV4; 1736 } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) { 1737 tctx->itc_data_cmdflags |= 1738 I40E_TX_DESC_CMD_IIPT_IPV6; 1739 } else { 1740 txs->itxs_hck_badl3.value.ui64++; 1741 return (-1); 1742 } 1743 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << 1744 I40E_TX_DESC_LENGTH_MACLEN_SHIFT; 1745 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << 1746 I40E_TX_DESC_LENGTH_IPLEN_SHIFT; 1747 } 1748 1749 switch (meo->meoi_l4proto) { 1750 case IPPROTO_TCP: 1751 tctx->itc_data_cmdflags |= 1752 I40E_TX_DESC_CMD_L4T_EOFT_TCP; 1753 break; 1754 case IPPROTO_UDP: 1755 tctx->itc_data_cmdflags |= 1756 I40E_TX_DESC_CMD_L4T_EOFT_UDP; 1757 break; 1758 case IPPROTO_SCTP: 1759 tctx->itc_data_cmdflags |= 1760 I40E_TX_DESC_CMD_L4T_EOFT_SCTP; 1761 break; 1762 default: 1763 txs->itxs_hck_badl4.value.ui64++; 1764 return (-1); 1765 } 1766 1767 tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) << 1768 I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; 1769 } 1770 1771 if (lsoflags & HW_LSO) { 1772 /* 1773 * LSO requires that checksum offloads are enabled. If for 1774 * some reason they're not we bail out with an error. 1775 * 1776 * Fulfilling this requirement also ensures that the L4 info was 1777 * parsed by meoi, which is also necessary for LSO. 
1778 */ 1779 if ((meo->meoi_l3proto == ETHERTYPE_IP && 1780 (chkflags & HCK_IPV4_HDRCKSUM) == 0) || 1781 (chkflags & HCK_PARTIALCKSUM) == 0) { 1782 txs->itxs_lso_nohck.value.ui64++; 1783 return (-1); 1784 } 1785 1786 tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; 1787 tctx->itc_ctx_mss = mss; 1788 tctx->itc_ctx_tsolen = msgsize(mp) - 1789 (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen); 1790 } 1791 1792 return (0); 1793 } 1794 1795 static void 1796 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb) 1797 { 1798 ASSERT(tcb != NULL); 1799 1800 mutex_enter(&itrq->itrq_tcb_lock); 1801 ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size); 1802 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb; 1803 itrq->itrq_tcb_free++; 1804 mutex_exit(&itrq->itrq_tcb_lock); 1805 } 1806 1807 static i40e_tx_control_block_t * 1808 i40e_tcb_alloc(i40e_trqpair_t *itrq) 1809 { 1810 i40e_tx_control_block_t *ret; 1811 1812 mutex_enter(&itrq->itrq_tcb_lock); 1813 if (itrq->itrq_tcb_free == 0) { 1814 mutex_exit(&itrq->itrq_tcb_lock); 1815 return (NULL); 1816 } 1817 1818 itrq->itrq_tcb_free--; 1819 ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free]; 1820 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL; 1821 mutex_exit(&itrq->itrq_tcb_lock); 1822 1823 ASSERT(ret != NULL); 1824 return (ret); 1825 } 1826 1827 /* 1828 * This should be used to free any DMA resources, associated mblk_t's, etc. It's 1829 * used as part of recycling the message blocks when we have either an interrupt 1830 * or other activity that indicates that we need to take a look. 1831 */ 1832 static void 1833 i40e_tcb_reset(i40e_tx_control_block_t *tcb) 1834 { 1835 switch (tcb->tcb_type) { 1836 case I40E_TX_COPY: 1837 tcb->tcb_dma.dmab_len = 0; 1838 break; 1839 case I40E_TX_DMA: 1840 if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) 1841 (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); 1842 else if (tcb->tcb_bind_ncookies > 0) 1843 (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); 1844 if (tcb->tcb_bind_info != NULL) { 1845 kmem_free(tcb->tcb_bind_info, 1846 tcb->tcb_bind_ncookies * 1847 sizeof (struct i40e_dma_bind_info)); 1848 } 1849 tcb->tcb_bind_info = NULL; 1850 tcb->tcb_bind_ncookies = 0; 1851 tcb->tcb_used_lso = B_FALSE; 1852 break; 1853 case I40E_TX_DESC: 1854 break; 1855 case I40E_TX_NONE: 1856 /* Cast to pacify lint */ 1857 panic("trying to free tcb %p with bad type none", (void *)tcb); 1858 default: 1859 panic("unknown i40e tcb type: %d", tcb->tcb_type); 1860 } 1861 1862 tcb->tcb_type = I40E_TX_NONE; 1863 if (tcb->tcb_mp != NULL) { 1864 freemsg(tcb->tcb_mp); 1865 tcb->tcb_mp = NULL; 1866 } 1867 tcb->tcb_next = NULL; 1868 } 1869 1870 /* 1871 * This is called as part of shutting down to clean up all outstanding 1872 * descriptors. Similar to recycle, except we don't re-arm anything and instead 1873 * just return control blocks to the free list. 1874 */ 1875 void 1876 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) 1877 { 1878 uint32_t index; 1879 1880 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); 1881 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); 1882 1883 /* 1884 * Because we should have shut down the chip at this point, it should be 1885 * safe to just clean up all the entries between our head and tail. 
1886 */ 1887 #ifdef DEBUG 1888 index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space, 1889 I40E_QTX_ENA(itrq->itrq_index)); 1890 VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK | 1891 I40E_QTX_ENA_QENA_STAT_MASK)); 1892 #endif 1893 1894 index = itrq->itrq_desc_head; 1895 while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) { 1896 i40e_tx_control_block_t *tcb; 1897 1898 tcb = itrq->itrq_tcb_work_list[index]; 1899 if (tcb != NULL) { 1900 itrq->itrq_tcb_work_list[index] = NULL; 1901 i40e_tcb_reset(tcb); 1902 i40e_tcb_free(itrq, tcb); 1903 } 1904 1905 bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); 1906 index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); 1907 itrq->itrq_desc_free++; 1908 } 1909 1910 ASSERT(index == itrq->itrq_desc_tail); 1911 itrq->itrq_desc_head = index; 1912 } 1913 1914 /* 1915 * We're here either by hook or by crook. We need to see if there are transmit 1916 * descriptors available for us to go and clean up and return to the hardware. 1917 * We may also be blocked, and if so, we should make sure that we let it know 1918 * we're good to go. 1919 */ 1920 void 1921 i40e_tx_recycle_ring(i40e_trqpair_t *itrq) 1922 { 1923 uint32_t wbhead, toclean, count; 1924 i40e_tx_control_block_t *tcbhead; 1925 i40e_t *i40e = itrq->itrq_i40e; 1926 uint_t desc_per_tcb, i; 1927 1928 mutex_enter(&itrq->itrq_tx_lock); 1929 1930 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); 1931 if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) { 1932 if (itrq->itrq_tx_blocked == B_TRUE) { 1933 itrq->itrq_tx_blocked = B_FALSE; 1934 mac_tx_ring_update(i40e->i40e_mac_hdl, 1935 itrq->itrq_mactxring); 1936 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++; 1937 } 1938 mutex_exit(&itrq->itrq_tx_lock); 1939 return; 1940 } 1941 1942 /* 1943 * Now we need to try and see if there's anything available. The driver 1944 * will write to the head location and it guarantees that it does not 1945 * use relaxed ordering. 1946 */ 1947 VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle, 1948 (uintptr_t)itrq->itrq_desc_wbhead, 1949 sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL)); 1950 1951 if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) != 1952 DDI_FM_OK) { 1953 mutex_exit(&itrq->itrq_tx_lock); 1954 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1955 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1956 return; 1957 } 1958 1959 wbhead = *itrq->itrq_desc_wbhead; 1960 toclean = itrq->itrq_desc_head; 1961 count = 0; 1962 tcbhead = NULL; 1963 1964 while (toclean != wbhead) { 1965 i40e_tx_control_block_t *tcb; 1966 1967 tcb = itrq->itrq_tcb_work_list[toclean]; 1968 itrq->itrq_tcb_work_list[toclean] = NULL; 1969 ASSERT(tcb != NULL); 1970 tcb->tcb_next = tcbhead; 1971 tcbhead = tcb; 1972 1973 /* 1974 * In the DMA bind case, there may not necessarily be a 1:1 1975 * mapping between tcb's and descriptors. If the tcb type 1976 * indicates a DMA binding then check the number of DMA 1977 * cookies to determine how many entries to clean in the 1978 * descriptor ring. 1979 */ 1980 if (tcb->tcb_type == I40E_TX_DMA) 1981 desc_per_tcb = tcb->tcb_bind_ncookies; 1982 else 1983 desc_per_tcb = 1; 1984 1985 for (i = 0; i < desc_per_tcb; i++) { 1986 /* 1987 * We zero this out for sanity purposes. 
1988 */ 1989 bzero(&itrq->itrq_desc_ring[toclean], 1990 sizeof (i40e_tx_desc_t)); 1991 toclean = i40e_next_desc(toclean, 1, 1992 itrq->itrq_tx_ring_size); 1993 count++; 1994 } 1995 } 1996 1997 itrq->itrq_desc_head = wbhead; 1998 itrq->itrq_desc_free += count; 1999 itrq->itrq_txstat.itxs_recycled.value.ui64 += count; 2000 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); 2001 2002 if (itrq->itrq_tx_blocked == B_TRUE && 2003 itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) { 2004 itrq->itrq_tx_blocked = B_FALSE; 2005 2006 mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring); 2007 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++; 2008 } 2009 2010 mutex_exit(&itrq->itrq_tx_lock); 2011 2012 /* 2013 * Now clean up the tcb. 2014 */ 2015 while (tcbhead != NULL) { 2016 i40e_tx_control_block_t *tcb = tcbhead; 2017 2018 tcbhead = tcb->tcb_next; 2019 i40e_tcb_reset(tcb); 2020 i40e_tcb_free(itrq, tcb); 2021 } 2022 2023 DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); 2024 } 2025 2026 static void 2027 i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp, 2028 const size_t off, const size_t len) 2029 { 2030 const void *soff = mp->b_rptr + off; 2031 void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; 2032 2033 ASSERT3U(len, >, 0); 2034 ASSERT3P(soff, >=, mp->b_rptr); 2035 ASSERT3P(soff, <=, mp->b_wptr); 2036 ASSERT3U(len, <=, MBLKL(mp)); 2037 ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr); 2038 ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len); 2039 bcopy(soff, doff, len); 2040 tcb->tcb_type = I40E_TX_COPY; 2041 tcb->tcb_dma.dmab_len += len; 2042 I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); 2043 } 2044 2045 static i40e_tx_control_block_t * 2046 i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, 2047 size_t off, boolean_t use_lso) 2048 { 2049 ddi_dma_handle_t dma_handle; 2050 ddi_dma_cookie_t dma_cookie; 2051 uint_t i = 0, ncookies = 0, dmaflags; 2052 i40e_tx_control_block_t *tcb; 2053 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2054 2055 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2056 txs->itxs_err_notcb.value.ui64++; 2057 return (NULL); 2058 } 2059 tcb->tcb_type = I40E_TX_DMA; 2060 2061 if (use_lso == B_TRUE) 2062 dma_handle = tcb->tcb_lso_dma_handle; 2063 else 2064 dma_handle = tcb->tcb_dma_handle; 2065 2066 dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; 2067 if (ddi_dma_addr_bind_handle(dma_handle, NULL, 2068 (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags, 2069 DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { 2070 txs->itxs_bind_fails.value.ui64++; 2071 goto bffail; 2072 } 2073 2074 tcb->tcb_bind_ncookies = ncookies; 2075 tcb->tcb_used_lso = use_lso; 2076 2077 tcb->tcb_bind_info = 2078 kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), 2079 KM_NOSLEEP); 2080 if (tcb->tcb_bind_info == NULL) 2081 goto bffail; 2082 2083 while (i < ncookies) { 2084 if (i > 0) 2085 ddi_dma_nextcookie(dma_handle, &dma_cookie); 2086 2087 tcb->tcb_bind_info[i].dbi_paddr = 2088 (caddr_t)dma_cookie.dmac_laddress; 2089 tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; 2090 } 2091 2092 return (tcb); 2093 2094 bffail: 2095 i40e_tcb_reset(tcb); 2096 i40e_tcb_free(itrq, tcb); 2097 return (NULL); 2098 } 2099 2100 static void 2101 i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, 2102 caddr_t buff, size_t len, boolean_t last_desc) 2103 { 2104 i40e_tx_desc_t *txdesc; 2105 int cmd; 2106 2107 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); 2108 itrq->itrq_desc_free--; 2109 
txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; 2110 itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, 2111 itrq->itrq_tx_ring_size); 2112 2113 cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; 2114 2115 /* 2116 * The last data descriptor needs the EOP bit set, so that the HW knows 2117 * that we're ready to send. Additionally, we set the RS (Report 2118 * Status) bit, so that we are notified when the transmit engine has 2119 * completed DMA'ing all of the data descriptors and data buffers 2120 * associated with this frame. 2121 */ 2122 if (last_desc == B_TRUE) { 2123 cmd |= I40E_TX_DESC_CMD_EOP; 2124 cmd |= I40E_TX_DESC_CMD_RS; 2125 } 2126 2127 /* 2128 * Per the X710 manual, section 8.4.2.1.1, the buffer size 2129 * must be a value from 1 to 16K minus 1, inclusive. 2130 */ 2131 ASSERT3U(len, >=, 1); 2132 ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ); 2133 2134 txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff); 2135 txdesc->cmd_type_offset_bsz = 2136 LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | 2137 ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | 2138 ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | 2139 ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); 2140 } 2141 2142 /* 2143 * Place 'tcb' on the tail of the list represented by 'head'/'tail'. 2144 */ 2145 static inline void 2146 tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail, 2147 i40e_tx_control_block_t *tcb) 2148 { 2149 if (*head == NULL) { 2150 *head = tcb; 2151 *tail = *head; 2152 } else { 2153 ASSERT3P(*tail, !=, NULL); 2154 ASSERT3P((*tail)->tcb_next, ==, NULL); 2155 (*tail)->tcb_next = tcb; 2156 *tail = tcb; 2157 } 2158 } 2159 2160 /* 2161 * This function takes a single packet, possibly consisting of 2162 * multiple mblks, and creates a TCB chain to send to the controller. 2163 * This TCB chain may span up to a maximum of 8 descriptors. A copy 2164 * TCB consumes one descriptor, whereas a DMA TCB may consume one or 2165 * more, depending on several factors. For each fragment (individual 2166 * mblk making up the packet), we determine if its size dictates a 2167 * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a 2168 * count of descriptors used; when that count reaches the max we force 2169 * all remaining fragments into a single TCB buffer. We have a 2170 * guarantee that the TCB buffer is always larger than the MTU -- so 2171 * there is always enough room. Consecutive fragments below the DMA 2172 * threshold are copied into a single TCB. In the event of an error 2173 * this function returns NULL but leaves 'mp' alone. 2174 */ 2175 static i40e_tx_control_block_t * 2176 i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc) 2177 { 2178 const mblk_t *nmp = mp; 2179 uint_t needed_desc = 0; 2180 boolean_t force_copy = B_FALSE; 2181 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; 2182 i40e_t *i40e = itrq->itrq_i40e; 2183 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2184 2185 /* TCB buffer is always larger than MTU. */ 2186 ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size); 2187 2188 while (nmp != NULL) { 2189 const size_t nmp_len = MBLKL(nmp); 2190 2191 /* Ignore zero-length mblks. */ 2192 if (nmp_len == 0) { 2193 nmp = nmp->b_cont; 2194 continue; 2195 } 2196 2197 if (nmp_len < i40e->i40e_tx_dma_min || force_copy) { 2198 /* Compress consecutive copies into one TCB.
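		 * As an illustrative example: three consecutive fragments of,
		 * say, 40, 60, and 200 bytes, each below i40e_tx_dma_min, are
		 * appended to the same TCB's pre-allocated buffer and together
		 * consume a single descriptor rather than three.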
*/ 2199 if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) { 2200 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); 2201 nmp = nmp->b_cont; 2202 continue; 2203 } 2204 2205 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2206 txs->itxs_err_notcb.value.ui64++; 2207 goto fail; 2208 } 2209 2210 /* 2211 * TCB DMA buffer is guaranteed to be one 2212 * cookie by i40e_alloc_dma_buffer(). 2213 */ 2214 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); 2215 needed_desc++; 2216 tcb_list_append(&tcbhead, &tcbtail, tcb); 2217 } else { 2218 uint_t total_desc; 2219 2220 tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE); 2221 if (tcb == NULL) { 2222 i40e_error(i40e, "dma bind failed!"); 2223 goto fail; 2224 } 2225 2226 /* 2227 * If the new total exceeds the max or we've 2228 * reached the limit and there's data left, 2229 * then give up binding and copy the rest into 2230 * the pre-allocated TCB buffer. 2231 */ 2232 total_desc = needed_desc + tcb->tcb_bind_ncookies; 2233 if ((total_desc > I40E_TX_MAX_COOKIE) || 2234 (total_desc == I40E_TX_MAX_COOKIE && 2235 nmp->b_cont != NULL)) { 2236 i40e_tcb_reset(tcb); 2237 i40e_tcb_free(itrq, tcb); 2238 2239 if (tcbtail != NULL && 2240 tcbtail->tcb_type == I40E_TX_COPY) { 2241 tcb = tcbtail; 2242 } else { 2243 tcb = NULL; 2244 } 2245 2246 force_copy = B_TRUE; 2247 txs->itxs_force_copy.value.ui64++; 2248 continue; 2249 } 2250 2251 needed_desc += tcb->tcb_bind_ncookies; 2252 tcb_list_append(&tcbhead, &tcbtail, tcb); 2253 } 2254 2255 nmp = nmp->b_cont; 2256 } 2257 2258 ASSERT3P(nmp, ==, NULL); 2259 ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE); 2260 ASSERT3P(tcbhead, !=, NULL); 2261 *ndesc += needed_desc; 2262 return (tcbhead); 2263 2264 fail: 2265 tcb = tcbhead; 2266 while (tcb != NULL) { 2267 i40e_tx_control_block_t *next = tcb->tcb_next; 2268 2269 ASSERT(tcb->tcb_type == I40E_TX_DMA || 2270 tcb->tcb_type == I40E_TX_COPY); 2271 2272 tcb->tcb_mp = NULL; 2273 i40e_tcb_reset(tcb); 2274 i40e_tcb_free(itrq, tcb); 2275 tcb = next; 2276 } 2277 2278 return (NULL); 2279 } 2280 2281 /* 2282 * Section 8.4.1 of the 700-series programming guide states that a 2283 * segment may span up to 8 data descriptors; including both header 2284 * and payload data. However, empirical evidence shows that the 2285 * controller freezes the Tx queue when presented with a segment of 8 2286 * descriptors. Or, at least, when the first segment contains 8 2287 * descriptors. One explanation is that the controller counts the 2288 * context descriptor against the first segment, even though the 2289 * programming guide makes no mention of such a constraint. In any 2290 * case, we limit TSO segments to 7 descriptors to prevent Tx queue 2291 * freezes. We still allow non-TSO segments to utilize all 8 2292 * descriptors as they have not demonstrated the faulty behavior. 2293 */ 2294 uint_t i40e_lso_num_descs = 7; 2295 2296 #define I40E_TCB_LEFT(tcb) \ 2297 ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len) 2298 2299 /* 2300 * This function is similar in spirit to i40e_non_lso_chain(), but 2301 * much more complicated in reality. Like the previous function, it 2302 * takes a packet (an LSO packet) as input and returns a chain of 2303 * TCBs. The complication comes with the fact that we are no longer 2304 * trying to fit the entire packet into 8 descriptors, but rather we 2305 * must fit each MSS-size segment of the LSO packet into 8 descriptors. 2306 * Except it's really 7 descriptors, see i40e_lso_num_descs. 
2307 * 2308 * Your first inclination might be to verify that a given segment 2309 * spans no more than 7 mblks, but it's actually much more subtle than 2310 * that. First, let's describe what the hardware expects, and then we 2311 * can expound on the software side of things. 2312 * 2313 * For an LSO packet the hardware expects the following: 2314 * 2315 * o Each MSS-sized segment must span no more than 7 descriptors. 2316 * 2317 * o The header size does not count towards the segment size. 2318 * 2319 * o If header and payload share the first descriptor, then the 2320 * controller will count the descriptor twice. 2321 * 2322 * The most important thing to keep in mind is that the hardware does 2323 * not view the segments in terms of mblks, like we do. The hardware 2324 * only sees descriptors. It will iterate each descriptor in turn, 2325 * keeping a tally of bytes seen and descriptors visited. If the byte 2326 * count hasn't reached MSS by the time the descriptor count reaches 2327 * 7, then the controller freezes the queue and we are stuck. 2328 * Furthermore, the hardware picks up its tally where it left off. So 2329 * if it reached MSS in the middle of a descriptor, it will start 2330 * tallying the next segment in the middle of that descriptor. The 2331 * hardware's view is entirely removed from the mblk chain or even the 2332 * descriptor layout. Consider these facts: 2333 * 2334 * o The MSS will vary depending on MTU and other factors. 2335 * 2336 * o The dblk allocation will sit at various offsets within a 2337 * memory page. 2338 * 2339 * o The page size itself could vary in the future (i.e. not 2340 * always 4K). 2341 * 2342 * o Just because a dblk is virtually contiguous doesn't mean 2343 * it's physically contiguous. The number of cookies 2344 * (descriptors) required by a DMA bind of a single dblk is at 2345 * the mercy of the page size and physical layout. 2346 * 2347 * o The descriptors will most often NOT start/end on an MSS 2348 * boundary. Thus the hardware will often start counting the 2349 * MSS mid descriptor and finish mid descriptor. 2350 * 2351 * The upshot of all this is that the driver must learn to think like 2352 * the controller, and verify that none of the constraints are broken. 2353 * It does this by tallying up the segment just like the hardware 2354 * would. This is handled by the two variables 'segsz' and 'segdesc'. 2355 * After each attempt to bind a dblk, we check the constraints. If 2356 * violated, we undo the DMA and force a copy until MSS is met. We 2357 * have a guarantee that the TCB buffer is larger than the MTU, thus 2358 * ensuring we can always meet the MSS with a single copy buffer. We 2359 * also copy consecutive non-DMA fragments into the same TCB buffer. 2360 */ 2361 static i40e_tx_control_block_t * 2362 i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp, 2363 const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx, 2364 uint_t *ndesc) 2365 { 2366 size_t mp_len = MBLKL(mp); 2367 /* 2368 * The cpoff (copy offset) variable tracks the offset inside 2369 * the current mp. There are cases where the entire mp is not 2370 * fully copied in one go, such as the header copy followed by 2371 * a non-DMA mblk, or a TCB buffer that only has enough space 2372 * to copy part of the current mp. 2373 */ 2374 size_t cpoff = 0; 2375 /* 2376 * The segsz and segdesc variables track the controller's view 2377 * of the segment. The needed_desc variable tracks the total 2378 * number of data descriptors used by the driver.
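 *
 * To illustrate the tally with made-up numbers: with an MSS of 1460,
 * bound cookies of 600, 600, and 400 bytes bring the running segsz to
 * 1600. Since that meets the MSS, the counters restart as
 * segsz = 1600 % 1460 = 140 and segdesc = 1, because the 140 leftover
 * bytes already occupy a descriptor and belong to the next segment. Had
 * segdesc instead reached i40e_lso_num_descs before segsz reached the
 * MSS, we would undo the bind and fall back to copying until the MSS
 * was met.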
2379 */ 2380 size_t segsz = 0; 2381 uint_t segdesc = 0; 2382 uint_t needed_desc = 0; 2383 size_t hdrcopied = 0; 2384 const size_t hdrlen = 2385 meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen; 2386 const size_t mss = tctx->itc_ctx_mss; 2387 boolean_t force_copy = B_FALSE; 2388 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; 2389 i40e_t *i40e = itrq->itrq_i40e; 2390 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2391 2392 /* 2393 * We always copy the header in order to avoid more 2394 * complicated code dealing with various edge cases. 2395 */ 2396 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2397 txs->itxs_err_notcb.value.ui64++; 2398 goto fail; 2399 } 2400 2401 needed_desc++; 2402 tcb_list_append(&tcbhead, &tcbtail, tcb); 2403 2404 while (hdrcopied < hdrlen) { 2405 const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len); 2406 i40e_tx_copy_fragment(tcb, mp, 0, tocopy); 2407 hdrcopied += tocopy; 2408 cpoff += tocopy; 2409 if (tocopy == mp_len) { 2410 /* 2411 * This is a bit of defensive programming. We 2412 * should never have a chain too short to 2413 * satisfy the headers -- but just in case. 2414 */ 2415 if ((mp = mp->b_cont) == NULL) { 2416 txs->itxs_tx_short.value.ui64++; 2417 goto fail; 2418 } 2419 2420 while ((mp_len = MBLKL(mp)) == 0) { 2421 if ((mp = mp->b_cont) == NULL) { 2422 txs->itxs_tx_short.value.ui64++; 2423 goto fail; 2424 } 2425 } 2426 cpoff = 0; 2427 } 2428 } 2429 ASSERT3U(hdrcopied, ==, hdrlen); 2430 2431 /* 2432 * A single descriptor containing both header and data is 2433 * counted twice by the controller. 2434 */ 2435 if (mp_len < i40e->i40e_tx_dma_min) { 2436 segdesc = 2; 2437 } else { 2438 segdesc = 1; 2439 } 2440 2441 while (mp != NULL) { 2442 mp_len = MBLKL(mp); 2443 force_copy: 2444 /* Ignore zero-length mblks. */ 2445 if (mp_len == 0) { 2446 mp = mp->b_cont; 2447 cpoff = 0; 2448 continue; 2449 } 2450 2451 /* 2452 * We copy into the preallocated TCB buffer when the 2453 * current fragment is less than the DMA threshold OR 2454 * when the DMA bind can't meet the controller's 2455 * segment descriptor limit. 2456 */ 2457 if (mp_len < i40e->i40e_tx_dma_min || force_copy) { 2458 size_t tocopy; 2459 2460 /* 2461 * Our objective here is to compress 2462 * consecutive copies into one TCB (until it 2463 * is full). If there is no current TCB, or if 2464 * it is a DMA TCB, then allocate a new one. 2465 */ 2466 if (tcb == NULL || 2467 (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) { 2468 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2469 txs->itxs_err_notcb.value.ui64++; 2470 goto fail; 2471 } 2472 2473 /* 2474 * The TCB DMA buffer is guaranteed to 2475 * be one cookie by i40e_alloc_dma_buffer(). 2476 */ 2477 needed_desc++; 2478 segdesc++; 2479 ASSERT3U(segdesc, <=, i40e_lso_num_descs); 2480 tcb_list_append(&tcbhead, &tcbtail, tcb); 2481 } else if (segdesc == 0) { 2482 /* 2483 * We are copying into an existing TCB 2484 * but we just crossed the MSS 2485 * boundary. Make sure to increment 2486 * segdesc to track the descriptor 2487 * count as the hardware would. 2488 */ 2489 segdesc++; 2490 } 2491 2492 tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff); 2493 i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy); 2494 cpoff += tocopy; 2495 segsz += tocopy; 2496 2497 /* We have consumed the current mp. */ 2498 if (cpoff == mp_len) { 2499 mp = mp->b_cont; 2500 cpoff = 0; 2501 } 2502 2503 /* We have consumed the current TCB buffer. 
*/ 2504 if (I40E_TCB_LEFT(tcb) == 0) { 2505 tcb = NULL; 2506 } 2507 2508 /* 2509 * We have met MSS with this copy; restart the 2510 * counters. 2511 */ 2512 if (segsz >= mss) { 2513 segsz = segsz % mss; 2514 segdesc = segsz == 0 ? 0 : 1; 2515 force_copy = B_FALSE; 2516 } 2517 2518 /* 2519 * We are at the controller's descriptor 2520 * limit; we must copy into the current TCB 2521 * until MSS is reached. The TCB buffer is 2522 * always bigger than the MTU so we know it is 2523 * big enough to meet the MSS. 2524 */ 2525 if (segdesc == i40e_lso_num_descs) { 2526 force_copy = B_TRUE; 2527 } 2528 } else { 2529 uint_t tsegdesc = segdesc; 2530 size_t tsegsz = segsz; 2531 2532 ASSERT(force_copy == B_FALSE); 2533 ASSERT3U(tsegdesc, <, i40e_lso_num_descs); 2534 2535 tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE); 2536 if (tcb == NULL) { 2537 i40e_error(i40e, "dma bind failed!"); 2538 goto fail; 2539 } 2540 2541 for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) { 2542 struct i40e_dma_bind_info dbi = 2543 tcb->tcb_bind_info[i]; 2544 2545 tsegsz += dbi.dbi_len; 2546 tsegdesc++; 2547 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); 2548 2549 /* 2550 * We've met the MSS with this portion 2551 * of the DMA. 2552 */ 2553 if (tsegsz >= mss) { 2554 tsegsz = tsegsz % mss; 2555 tsegdesc = tsegsz == 0 ? 0 : 1; 2556 } 2557 2558 /* 2559 * We've reached max descriptors but 2560 * have not met the MSS. Undo the bind 2561 * and instead copy. 2562 */ 2563 if (tsegdesc == i40e_lso_num_descs) { 2564 i40e_tcb_reset(tcb); 2565 i40e_tcb_free(itrq, tcb); 2566 2567 if (tcbtail != NULL && 2568 I40E_TCB_LEFT(tcb) > 0 && 2569 tcbtail->tcb_type == I40E_TX_COPY) { 2570 tcb = tcbtail; 2571 } else { 2572 tcb = NULL; 2573 } 2574 2575 /* 2576 * Remember, we are still on 2577 * the same mp. 2578 */ 2579 force_copy = B_TRUE; 2580 txs->itxs_tso_force_copy.value.ui64++; 2581 goto force_copy; 2582 } 2583 } 2584 2585 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); 2586 ASSERT3U(tsegsz, <, mss); 2587 2588 /* 2589 * We've made if through the loop without 2590 * breaking the segment descriptor contract 2591 * with the controller -- replace the segment 2592 * tracking values with the temporary ones. 2593 */ 2594 segdesc = tsegdesc; 2595 segsz = tsegsz; 2596 needed_desc += tcb->tcb_bind_ncookies; 2597 cpoff = 0; 2598 tcb_list_append(&tcbhead, &tcbtail, tcb); 2599 mp = mp->b_cont; 2600 } 2601 } 2602 2603 ASSERT3P(mp, ==, NULL); 2604 ASSERT3P(tcbhead, !=, NULL); 2605 *ndesc += needed_desc; 2606 return (tcbhead); 2607 2608 fail: 2609 tcb = tcbhead; 2610 while (tcb != NULL) { 2611 i40e_tx_control_block_t *next = tcb->tcb_next; 2612 2613 ASSERT(tcb->tcb_type == I40E_TX_DMA || 2614 tcb->tcb_type == I40E_TX_COPY); 2615 2616 tcb->tcb_mp = NULL; 2617 i40e_tcb_reset(tcb); 2618 i40e_tcb_free(itrq, tcb); 2619 tcb = next; 2620 } 2621 2622 return (NULL); 2623 } 2624 2625 /* 2626 * Keep track of activity through the transmit data path. 2627 * 2628 * We need to ensure we don't try and transmit when a trqpair has been 2629 * stopped, nor do we want to stop a trqpair whilst transmitting. 
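 *
 * The expected usage pattern, sketched here purely for illustration
 * (this is not additional driver code), is:
 *
 *	if (!i40e_ring_tx_enter(itrq))
 *		return (...);
 *	... build and post descriptors ...
 *	i40e_ring_tx_exit(itrq);
 *
 * while the teardown path calls i40e_ring_tx_quiesce(), which sets
 * itrq_tx_quiesce and then waits on itrq_tx_cv until itrq_tx_active
 * drops to zero.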
2630 */ 2631 static boolean_t 2632 i40e_ring_tx_enter(i40e_trqpair_t *itrq) 2633 { 2634 boolean_t allow; 2635 2636 mutex_enter(&itrq->itrq_tx_lock); 2637 allow = !itrq->itrq_tx_quiesce; 2638 if (allow) 2639 itrq->itrq_tx_active++; 2640 mutex_exit(&itrq->itrq_tx_lock); 2641 2642 return (allow); 2643 } 2644 2645 static void 2646 i40e_ring_tx_exit_nolock(i40e_trqpair_t *itrq) 2647 { 2648 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); 2649 2650 itrq->itrq_tx_active--; 2651 if (itrq->itrq_tx_quiesce) 2652 cv_signal(&itrq->itrq_tx_cv); 2653 } 2654 2655 static void 2656 i40e_ring_tx_exit(i40e_trqpair_t *itrq) 2657 { 2658 mutex_enter(&itrq->itrq_tx_lock); 2659 i40e_ring_tx_exit_nolock(itrq); 2660 mutex_exit(&itrq->itrq_tx_lock); 2661 } 2662 2663 2664 /* 2665 * Tell the transmit path to quiesce and wait until there is no 2666 * more activity. 2667 * Will return B_TRUE if the transmit path is already quiesced, B_FALSE 2668 * otherwise. 2669 */ 2670 boolean_t 2671 i40e_ring_tx_quiesce(i40e_trqpair_t *itrq) 2672 { 2673 mutex_enter(&itrq->itrq_tx_lock); 2674 if (itrq->itrq_tx_quiesce) { 2675 /* 2676 * When itrq_tx_quiesce is set, then the ring has already 2677 * been shutdown. 2678 */ 2679 mutex_exit(&itrq->itrq_tx_lock); 2680 return (B_TRUE); 2681 } 2682 2683 /* 2684 * Tell any threads in transmit path this trqpair is quiesced and 2685 * wait until they've all exited the critical code path. 2686 */ 2687 itrq->itrq_tx_quiesce = B_TRUE; 2688 while (itrq->itrq_tx_active > 0) 2689 cv_wait(&itrq->itrq_tx_cv, &itrq->itrq_tx_lock); 2690 2691 mutex_exit(&itrq->itrq_tx_lock); 2692 2693 return (B_FALSE); 2694 } 2695 2696 /* 2697 * We've been asked to send a message block on the wire. We'll only have a 2698 * single chain. There will not be any b_next pointers; however, there may be 2699 * multiple b_cont blocks. The number of b_cont blocks may exceed the 2700 * controller's Tx descriptor limit. 2701 * 2702 * We may do one of three things with any given mblk_t chain: 2703 * 2704 * 1) Drop it 2705 * 2) Transmit it 2706 * 3) Return it 2707 * 2708 * If we return it to MAC, then MAC will flow control on our behalf. In other 2709 * words, it won't send us anything until we tell it that it's okay to send us 2710 * something. 2711 */ 2712 mblk_t * 2713 i40e_ring_tx(void *arg, mblk_t *mp) 2714 { 2715 size_t msglen; 2716 i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL; 2717 i40e_tx_context_desc_t *ctxdesc; 2718 mac_ether_offload_info_t meo; 2719 i40e_tx_context_t tctx; 2720 int type; 2721 uint_t needed_desc = 0; 2722 boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE; 2723 2724 i40e_trqpair_t *itrq = arg; 2725 i40e_t *i40e = itrq->itrq_i40e; 2726 i40e_hw_t *hw = &i40e->i40e_hw_space; 2727 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2728 2729 ASSERT(mp->b_next == NULL); 2730 2731 if (!(i40e->i40e_state & I40E_STARTED) || 2732 (i40e->i40e_state & I40E_OVERTEMP) || 2733 (i40e->i40e_state & I40E_SUSPENDED) || 2734 (i40e->i40e_state & I40E_ERROR) || 2735 (i40e->i40e_link_state != LINK_STATE_UP) || 2736 !i40e_ring_tx_enter(itrq)) { 2737 freemsg(mp); 2738 return (NULL); 2739 } 2740 2741 /* 2742 * Parse packet headers for use by any requested offloads. That offload 2743 * logic will later determine if the results here were adequate. 2744 */ 2745 mac_ether_offload_info(mp, &meo); 2746 2747 /* 2748 * Figure out the relevant context about this frame that we might need 2749 * for enabling checksum, LSO, etc. This also fills in information that 2750 * we might set around the packet type, etc. 
2751 */ 2752 if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) { 2753 freemsg(mp); 2754 itrq->itrq_txstat.itxs_err_context.value.ui64++; 2755 i40e_ring_tx_exit(itrq); 2756 return (NULL); 2757 } 2758 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { 2759 use_lso = B_TRUE; 2760 do_ctx_desc = B_TRUE; 2761 } 2762 2763 /* 2764 * For the primordial driver we can punt on doing any recycling right 2765 * now; however, longer term we need to probably do some more pro-active 2766 * recycling to cut back on stalls in the TX path. 2767 */ 2768 2769 msglen = msgsize(mp); 2770 2771 if (do_ctx_desc) { 2772 /* 2773 * If we're doing tunneling or LSO, then we'll need a TX 2774 * context descriptor in addition to one or more TX data 2775 * descriptors. Since there's no data DMA block or handle 2776 * associated with the context descriptor, we create a special 2777 * control block that behaves effectively like a NOP. 2778 */ 2779 if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { 2780 txs->itxs_err_notcb.value.ui64++; 2781 goto txfail; 2782 } 2783 tcb_ctx->tcb_type = I40E_TX_DESC; 2784 needed_desc++; 2785 } 2786 2787 if (!use_lso) { 2788 tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc); 2789 } else { 2790 tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc); 2791 } 2792 2793 if (tcbhead == NULL) 2794 goto txfail; 2795 2796 tcbhead->tcb_mp = mp; 2797 2798 /* 2799 * The second condition ensures that 'itrq_desc_tail' never 2800 * equals 'itrq_desc_head'. This enforces the rule found in 2801 * the second bullet point of section 8.4.3.1.5 of the XL710 2802 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should 2803 * never overlap with the head. This means that we only ever 2804 * have 'itrq_tx_ring_size - 1' total available descriptors. 2805 */ 2806 mutex_enter(&itrq->itrq_tx_lock); 2807 if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || 2808 (itrq->itrq_desc_free - 1) < needed_desc) { 2809 txs->itxs_err_nodescs.value.ui64++; 2810 mutex_exit(&itrq->itrq_tx_lock); 2811 goto txfail; 2812 } 2813 2814 if (do_ctx_desc) { 2815 /* 2816 * If we're enabling any offloads for this frame, then we'll 2817 * need to build up a transmit context descriptor, first. The 2818 * context descriptor needs to be placed in the TX ring before 2819 * the data descriptor(s). 
See section 8.4.2, table 8-16 2820 */ 2821 uint_t tail = itrq->itrq_desc_tail; 2822 itrq->itrq_desc_free--; 2823 ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; 2824 itrq->itrq_tcb_work_list[tail] = tcb_ctx; 2825 itrq->itrq_desc_tail = i40e_next_desc(tail, 1, 2826 itrq->itrq_tx_ring_size); 2827 2828 /* QW0 */ 2829 type = I40E_TX_DESC_DTYPE_CONTEXT; 2830 ctxdesc->tunneling_params = 0; 2831 ctxdesc->l2tag2 = 0; 2832 2833 /* QW1 */ 2834 ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); 2835 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { 2836 ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) 2837 ((uint64_t)tctx.itc_ctx_cmdflags << 2838 I40E_TXD_CTX_QW1_CMD_SHIFT) | 2839 ((uint64_t)tctx.itc_ctx_tsolen << 2840 I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | 2841 ((uint64_t)tctx.itc_ctx_mss << 2842 I40E_TXD_CTX_QW1_MSS_SHIFT)); 2843 } 2844 } 2845 2846 tcb = tcbhead; 2847 while (tcb != NULL) { 2848 2849 itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; 2850 if (tcb->tcb_type == I40E_TX_COPY) { 2851 boolean_t last_desc = (tcb->tcb_next == NULL); 2852 2853 i40e_tx_set_data_desc(itrq, &tctx, 2854 (caddr_t)tcb->tcb_dma.dmab_dma_address, 2855 tcb->tcb_dma.dmab_len, last_desc); 2856 } else { 2857 boolean_t last_desc = B_FALSE; 2858 ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA); 2859 2860 for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) { 2861 last_desc = (c == tcb->tcb_bind_ncookies - 1) && 2862 (tcb->tcb_next == NULL); 2863 2864 i40e_tx_set_data_desc(itrq, &tctx, 2865 tcb->tcb_bind_info[c].dbi_paddr, 2866 tcb->tcb_bind_info[c].dbi_len, 2867 last_desc); 2868 } 2869 } 2870 2871 tcb = tcb->tcb_next; 2872 } 2873 2874 /* 2875 * Now, finally, sync the DMA data and alert hardware. 2876 */ 2877 I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV); 2878 2879 I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), 2880 itrq->itrq_desc_tail); 2881 2882 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != 2883 DDI_FM_OK) { 2884 /* 2885 * Note, we can't really go through and clean this up very well, 2886 * because the memory has been given to the device, so just 2887 * indicate it's been transmitted. 2888 */ 2889 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 2890 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 2891 } 2892 2893 txs->itxs_bytes.value.ui64 += msglen; 2894 txs->itxs_packets.value.ui64++; 2895 txs->itxs_descriptors.value.ui64 += needed_desc; 2896 2897 i40e_ring_tx_exit_nolock(itrq); 2898 2899 mutex_exit(&itrq->itrq_tx_lock); 2900 2901 return (NULL); 2902 2903 txfail: 2904 /* 2905 * We ran out of resources. Return it to MAC and indicate that we'll 2906 * need to signal MAC. If there are allocated tcb's, return them now. 2907 * Make sure to reset their message block's, since we'll return them 2908 * back to MAC. 2909 */ 2910 if (tcb_ctx != NULL) { 2911 tcb_ctx->tcb_mp = NULL; 2912 i40e_tcb_reset(tcb_ctx); 2913 i40e_tcb_free(itrq, tcb_ctx); 2914 } 2915 2916 tcb = tcbhead; 2917 while (tcb != NULL) { 2918 i40e_tx_control_block_t *next = tcb->tcb_next; 2919 2920 ASSERT(tcb->tcb_type == I40E_TX_DMA || 2921 tcb->tcb_type == I40E_TX_COPY); 2922 2923 tcb->tcb_mp = NULL; 2924 i40e_tcb_reset(tcb); 2925 i40e_tcb_free(itrq, tcb); 2926 tcb = next; 2927 } 2928 2929 mutex_enter(&itrq->itrq_tx_lock); 2930 i40e_ring_tx_exit_nolock(itrq); 2931 itrq->itrq_tx_blocked = B_TRUE; 2932 mutex_exit(&itrq->itrq_tx_lock); 2933 2934 return (mp); 2935 } 2936