1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. 14 * Copyright 2019 Joyent, Inc. 15 */ 16 17 #include "i40e_sw.h" 18 19 /* 20 * --------------------------------------------------------- 21 * Buffer and Memory Management, Receiving, and Transmitting 22 * --------------------------------------------------------- 23 * 24 * Each physical function (PF), which is what we think of as an instance of the 25 * device driver, has a series of associated transmit and receive queue pairs. 26 * Effectively, what we think of in MAC as rings. Each of these has their own 27 * ring of descriptors which is used as part of doing DMA activity. 28 * 29 * The transmit ring of descriptors are 16-byte entries which are used to send 30 * packets, program filters, etc. The receive ring of descriptors are either 31 * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor 32 * format so that we're in a better position if we ever want to leverage that 33 * information later on. 34 * 35 * However, these rings are just for descriptors, they don't talk or deal with 36 * how we actually store the memory that we need for DMA or the associated 37 * information that we need for keeping track of message blocks. To correspond 38 * to the hardware descriptor ring which is how we communicate with hardware, we 39 * introduce a control block which keeps track of our required metadata like DMA 40 * mappings. 41 * 42 * There are two main considerations that dictate how much memory and buffers 43 * we end up allocating. Those are: 44 * 45 * o The size of the ring (controlled through the driver.conf file) 46 * 47 * o The maximum size frame we can receive. 48 * 49 * The size of the rings currently defaults to 1024 descriptors and is stored in 50 * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size. 51 * 52 * While the size of the rings is controlled by the driver.conf, the maximum 53 * size frame is informed primarily through the use of dladm and the setting of 54 * the MTU property on the device. From the MTU, we then go and do some 55 * machinations. The first thing we do is we then have to add in space for the 56 * Ethernet header, potentially a VLAN header, and the FCS check. This value is 57 * what's stored as i40e_t`i40e_frame_max and is derived any time 58 * i40e_t`i40e_sdu changes. 59 * 60 * This size is then rounded up to the nearest 1k chunk, which represents the 61 * actual amount of memory that we'll allocate for a single frame. 62 * 63 * Note, that for RX, we do something that might be unexpected. We always add 64 * an extra two bytes to the frame size that we allocate. We then offset the DMA 65 * address that we receive a packet into by two bytes. This ensures that the IP 66 * header will always be 4 byte aligned because the MAC header is either 14 or 67 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's 68 * and MAC's lives easier. 
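 *
 * To make that sizing concrete, the following is an illustrative sketch (not
 * the driver's actual code; the real derivation lives in the MTU/SDU update
 * path elsewhere in the driver) of how the frame and RX buffer sizes could be
 * computed, assuming P2ROUNDUP()-style rounding and the two byte
 * I40E_BUF_IPHDR_ALIGNMENT described above:
 *
 *	frame_max = sdu + sizeof (struct ether_vlan_header) + ETHERFCSL;
 *	rx_buf_size = P2ROUNDUP(frame_max + I40E_BUF_IPHDR_ALIGNMENT, 1024);
 *
 * With the default 1500 byte MTU that works out to a 1522 byte maximum frame,
 * which lands in a 2048 byte buffer once the two alignment bytes and the 1k
 * rounding are applied.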
69 * 70 * Both the RX and TX descriptor rings (which are what we use to communicate 71 * with hardware) are allocated as a single region of DMA memory which is the 72 * size of the descriptor (32 bytes and 16 bytes respectively) times the total 73 * number of descriptors for an RX and TX ring. 74 * 75 * While the RX and TX descriptors are allocated using DMA-based memory, the 76 * control blocks for each of them are allocated using normal kernel memory. 77 * They aren't special from a DMA perspective. We'll go over the design of both 78 * receiving and transmitting separately, as they have slightly different 79 * control blocks and different ways that we manage the relationship between 80 * control blocks and descriptors. 81 * 82 * --------------------------------- 83 * RX Descriptors and Control Blocks 84 * --------------------------------- 85 * 86 * For every descriptor in the ring that the driver has, we need some associated 87 * memory, which means that we need to have the receive specific control block. 88 * We have a couple of different, but related goals: 89 * 90 * o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do 91 * not want to do any additional memory allocations or DMA allocations if 92 * we don't have to. 93 * 94 * o We'd like to try and do as much zero-copy as possible, while taking into 95 * account the cost of mapping in DMA resources. 96 * 97 * o We'd like to have every receive descriptor available. 98 * 99 * Now, these rules are a bit in tension with one another. The act of mapping in 100 * DMA resources is an exercise in trying to find the break-even point between page table 101 * updates and bcopy. We currently start by using the same metrics that ixgbe 102 * used; however, it should be known that this value has effectively been 103 * cargo-culted across to yet another driver, sorry. 104 * 105 * If we receive a packet which is larger than our copy threshold, we'll create 106 * a message block out of the DMA memory via desballoc(9F) and send that up to 107 * MAC that way. This will cause us to be notified when the message block is 108 * then freed because it has been consumed, dropped, or otherwise. If instead 109 * it's less than the threshold, we'll try to use allocb and bcopy it into the 110 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug 111 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override 112 * the behavior and always do a bcopy or a DMA bind. 113 * 114 * To try and ensure that the device always has blocks that it can receive data 115 * into, we maintain two lists of control blocks, a working list and a free 116 * list. Each list is sized equal to the number of descriptors in the RX ring. 117 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks 118 * equal to twice the number of descriptors in the ring and we assign them 119 * equally to the free list and to the working list. Each control block also has 120 * DMA memory allocated and associated with it, which will be used to receive the 121 * actual packet data. All of a received frame's data will end up in a single 122 * DMA buffer. 123 * 124 * During operation, we always maintain the invariant that each RX descriptor 125 * has an associated RX control block which lives in the working list. If we 126 * feel that we should loan up DMA memory to MAC in the form of a message block, 127 * we can only do so if we can maintain this invariant. To do that, we swap in 128 * one of the buffers from the free list.
If none are available, then we resort 129 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the 130 * size. 131 * 132 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is 133 * called on the block, at which point we restore the RX control block to the 134 * free list and are able to reuse the DMA memory again. While the scheme may 135 * seem odd, it importantly keeps us from having to do any DMA allocations in 136 * the normal path of operation, even though we may still have to allocate 137 * message blocks and copy. 138 * 139 * The following state machine describes the lifetime of an RX control block. In 140 * the diagram we abbreviate the RX ring descriptor entry as rxd and the RX 141 * control block entry as rcb. 142 * 143 * | | 144 * * ... 1/2 of all initial rcb's ... * 145 * | | 146 * v v 147 * +------------------+ +------------------+ 148 * | rcb on free list |---*---------->| rcb on work list | 149 * +------------------+ . +------------------+ 150 * ^ . moved to | 151 * | replace rcb * . . Frame received, 152 * | loaned to | entry on free list 153 * | MAC + co. | available. rcb's 154 * | | memory made into mblk_t 155 * * . freemsg(9F) | and sent up to MAC. 156 * | called on | 157 * | loaned rcb | 158 * | and it is v 159 * | recycled. +-------------------+ 160 * +--------------------<-----| rcb loaned to MAC | 161 * +-------------------+ 162 * 163 * Finally, note that every RX control block has a reference count on it. One 164 * reference is added as long as the driver has had the GLDv3 mc_start endpoint 165 * called. If the GLDv3 mc_stop entry point is called (IP has been unplumbed and 166 * no other DLPI consumers remain), then we'll decrement the reference count by 167 * one. Whenever we loan up the RX control block and associated buffer to MAC, 168 * then we bump the reference count again. Even though the device is stopped, 169 * there may still be loaned frames in upper levels that we'll want to account 170 * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure 171 * that it is cleaned up. 172 * 173 * -------------------- 174 * Managing the RX Ring 175 * -------------------- 176 * 177 * The receive ring descriptors are arranged in a circular buffer with a head 178 * and a tail pointer. These conventional head and tail pointers 179 * are used to partition the ring into two portions: a portion that we, 180 * the operating system, manage and a portion that is managed by hardware. When 181 * hardware owns a descriptor in the ring, it means that it is waiting for data 182 * to be filled in. However, when a portion of the ring is owned by the driver, 183 * then that means that those descriptors have been consumed and we need to go take 184 * a look at them. 185 * 186 * The initial head is configured to be zero by writing it as such in the 187 * receive queue context in the FPM (function private memory from the host). The 188 * initial tail is written to be the last descriptor. This is written to via the 189 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between 190 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD, 191 * the only values we ever consult ourselves are the TAIL register and our own 192 * state tracking. Effectively, we cache the HEAD register and then update it 193 * ourselves based on our work.
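 *
 * As a minimal sketch of that initial hand-off (this is not the actual queue
 * setup code, which lives in the ring configuration path, but a restatement of
 * the paragraph above using the helpers seen later in this file), the cached
 * HEAD starts at zero and the TAIL register is pointed at the last descriptor:
 *
 *	rxd->rxd_desc_next = 0;
 *	I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index),
 *	    rxd->rxd_ring_size - 1);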
194 * 195 * When we iterate over the RX descriptors and thus the received frames, we are 196 * either in an interrupt context or we've been asked by MAC to poll on the 197 * ring. If we've been asked to poll on the ring, we have a maximum number of 198 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to 199 * exceed that count, then we do not process it. When in interrupt context, we 200 * don't have a strict byte count. However, to ensure liveness, we limit the 201 * amount of data based on a configuration value 202 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this 203 * is based on similar numbers that are used for ixgbe. After some additional 204 * time in the field, we'll have a sense as to whether or not it should be 205 * changed. 206 * 207 * When processing, we start at our own HEAD pointer 208 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start 209 * processing. Every RX descriptor has what's described as the DD bit. This bit 210 * (the LSB of the second 8-byte word) indicates whether or not the descriptor 211 * is done. When we give descriptors to the hardware, this value is always 212 * zero. When the hardware has finished a descriptor, it will always be one. 213 * 214 * The first thing that we check is whether the DD bit indicates that the 215 * current HEAD is ready. If it isn't, then we're done. That's the primary 216 * invariant of processing a frame. If it's done, then there are a few other 217 * things that we want to look at. In the same status word as the DD bit, there 218 * are two other important bits: 219 * 220 * o End of Packet (EOP) 221 * o Error bits 222 * 223 * The End of Packet bit indicates that we have reached the last descriptor. Now, 224 * you might ask when there would be more than one descriptor. The reason for 225 * that might be large receive offload (LRO) or header splitting 226 * functionality, which presently isn't supported in the driver. The error bits 227 * in the frame are only valid when EOP is set. 228 * 229 * If error bits are set on the frame, then we still consume it; however, we 230 * will not generate an mblk_t to send up to MAC. If there are no error bits 231 * set, then we'll consume the descriptor either using bcopy or DMA binding. See 232 * the earlier section 'RX Descriptors and Control Blocks' for more information 233 * on how that selection is made. 234 * 235 * Regardless of whether we construct an mblk_t or encounter an error, we end up 236 * resetting the descriptor. This re-arms the descriptor for hardware and, in the 237 * process, we may end up assigning it a new receive control block. After we do 238 * this, we always update our HEAD pointer, no matter what. 239 * 240 * Finally, once we've consumed as much as we will in a given window, we go and 241 * update the TAIL register to indicate all the frames we've consumed. We only 242 * do a single bulk write for the ring. 243 * 244 * --------------------------------- 245 * TX Descriptors and Control Blocks 246 * --------------------------------- 247 * 248 * While the transmit path is similar in spirit to the receive path, it works 249 * differently due to the fact that all data is originated by the operating 250 * system and not by the device. 251 * 252 * Like RX, there is both a descriptor ring that we use to communicate to the 253 * controller and which points to the memory used to transmit a frame.
Similarly, 254 * there is a corresponding transmit control block; however, the correspondence 255 * between descriptors and control blocks is more complex and not necessarily 256 * 1-to-1. 257 * 258 * The driver is asked to process a single frame at a time. That message block 259 * may be made up of multiple fragments linked together by the mblk_t`b_cont 260 * member. The device has a hard limit of up to 8 buffers being allowed for use 261 * for a single non-LSO packet or LSO segment. The number of TX ring entries 262 * (and thus TX control blocks) used depends on the fragment sizes and DMA 263 * layout, as explained below. 264 * 265 * We alter our DMA strategy based on a threshold tied to the fragment size. 266 * This threshold is configurable via the tx_dma_threshold property. If the 267 * fragment is above the threshold, we DMA bind it -- consuming one TCB and 268 * potentially several data descriptors. The exact number of descriptors (equal 269 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset 270 * into page, b_wptr offset into page, and the physical layout of the dblk's 271 * memory (contiguous or not). Essentially, we are at the mercy of the DMA 272 * engine and the dblk's memory allocation. Knowing the exact number of 273 * descriptors up front is a task best not taken on by the driver itself. 274 * Instead, we attempt to DMA bind the fragment and verify the descriptor 275 * layout meets hardware constraints. If the proposed DMA bind does not satisfy 276 * the hardware constraints, then we discard it and instead copy the entire 277 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is 278 * larger than the TCB buffer). 279 * 280 * If the fragment is below or at the threshold, we copy it to the pre-allocated 281 * buffer of a TCB. We compress consecutive copy fragments into a single TCB to 282 * conserve resources. We are guaranteed that the TCB buffer is made up of only 283 * 1 DMA cookie and therefore consumes only one descriptor on the controller. 284 * 285 * Furthermore, if the frame requires HW offloads such as LSO, tunneling or 286 * filtering, then the TX data descriptors must be preceded by a single TX 287 * context descriptor. Because there is no DMA transfer associated with the 288 * context descriptor, we allocate a control block with a special type which 289 * indicates to the TX ring recycle code that there are no associated DMA 290 * resources to unbind when the control block is freed. 291 * 292 * If we don't have enough space in the ring or TX control blocks available, 293 * then we'll return the unprocessed message block to MAC. This will induce flow 294 * control and once we recycle enough entries, we'll once again enable sending 295 * on the ring. 296 * 297 * We size the working list as equal to the number of descriptors in the ring. 298 * We size the free list as equal to 1.5 times the number of descriptors in the 299 * ring. We'll allocate a number of TX control block entries equal to the number 300 * of entries in the free list. By default, all entries are placed in the free 301 * list. As we come along and try to send something, we'll allocate entries from 302 * the free list and add them to the working list, where they'll stay until the 303 * hardware indicates that all of the data has been written back to us. The 304 * reason that we start with 1.5x is to help facilitate having more than one TX 305 * buffer associated with the DMA activity.
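 *
 * To summarize the per-fragment decision described above, here is a rough
 * sketch of the transmit path's choice. This is illustrative only; the real
 * code handles descriptor accounting, LSO, and the context descriptor, and
 * the threshold field name used here (i40e_tx_dma_min) is an assumption made
 * by analogy with the RX side's i40e_rx_dma_min:
 *
 *	for (mblk_t *nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 *		size_t len = MBLKL(nmp);
 *
 *		if (len > i40e->i40e_tx_dma_min) {
 *			... DMA bind: one tcb, one data descriptor per
 *			... cookie, falling back to a copy if the bind
 *			... would violate the 8 descriptor limit.
 *		} else {
 *			... bcopy into the current tcb's pre-allocated
 *			... buffer, merging consecutive small fragments.
 *		}
 *	}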
306 * 307 * -------------------- 308 * Managing the TX Ring 309 * -------------------- 310 * 311 * The transmit descriptor ring is driven by us. We maintain our own notion of a 312 * HEAD and TAIL register and we update the hardware with updates to the TAIL 313 * register. When the hardware is done writing out data, it updates us by 314 * writing back to a specific address, not by updating the individual 315 * descriptors. That address is a 4-byte region after the main transmit 316 * descriptor ring. This is why the descriptor ring has an extra descriptor's 317 * worth allocated to it. 318 * 319 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and 320 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames, 321 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various 322 * points in time, through both interrupts, and our own internal checks, we'll 323 * sync the write-back head portion of the DMA space. Based on the index it 324 * reports back, we'll free everything between our current HEAD and the 325 * indicated index and update HEAD to the new index. 326 * 327 * When a frame comes in, we try to use a number of transmit control blocks and 328 * we'll transition them from the free list to the work list. They'll get moved 329 * to the entry on the work list that corresponds with the transmit descriptor 330 * they correspond to. Once we are indicated that the corresponding descriptor 331 * has been freed, we'll return it to the list. 332 * 333 * The transmit control block free list is managed by keeping track of the 334 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to 335 * index into the free list and add things to it. In effect, we always push and 336 * pop from the tail and protect it with a single lock, 337 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not 338 * stand up to further performance testing; however, it does allow us to get off 339 * the ground with the device driver. 340 * 341 * The following image describes where a given transmit control block lives in 342 * its lifetime: 343 * 344 * | 345 * * ... Initial placement for all tcb's 346 * | 347 * v 348 * +------------------+ +------------------+ 349 * | tcb on free list |---*------------------>| tcb on work list | 350 * +------------------+ . +------------------+ 351 * ^ . N tcbs allocated[1] | 352 * | to send frame v 353 * | or fragment on | 354 * | wire, mblk from | 355 * | MAC associated. | 356 * | | 357 * +------*-------------------------------<----+ 358 * . 359 * . Hardware indicates 360 * entry transmitted. 361 * tcbs recycled, mblk 362 * from MAC freed. 363 * 364 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context 365 * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA 366 * bind case, N can be 1 context descriptor plus 1 data descriptor per 367 * b_cont in the mblk. In this case, the mblk is associated with the first 368 * data descriptor and freed as part of freeing that data descriptor. 369 * 370 * ------------ 371 * Blocking MAC 372 * ------------ 373 * 374 * When performing transmit, we can run out of descriptors and ring entries. 375 * When such a case happens, we return the mblk_t to MAC to indicate that we've 376 * been blocked. At that point in time, MAC becomes blocked and will not 377 * transmit anything out that specific ring until we notify MAC. 
To indicate 378 * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member 379 * to B_TRUE. 380 * 381 * When we recycle TX descriptors then we'll end up signaling MAC by calling 382 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to 383 * start sending frames out to us again. 384 */ 385 386 /* 387 * We set our DMA alignment requests based on the smallest supported page size 388 * of the corresponding platform. 389 */ 390 #if defined(__sparc) 391 #define I40E_DMA_ALIGNMENT 0x2000ull 392 #elif defined(__x86) 393 #define I40E_DMA_ALIGNMENT 0x1000ull 394 #else 395 #error "unknown architecture for i40e" 396 #endif 397 398 /* 399 * This structure is used to maintain information and flags related to 400 * transmitting a frame. These fields are ultimately used to construct the 401 * TX data descriptor(s) and, if necessary, the TX context descriptor. 402 */ 403 typedef struct i40e_tx_context { 404 enum i40e_tx_desc_cmd_bits itc_data_cmdflags; 405 uint32_t itc_data_offsets; 406 enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; 407 uint32_t itc_ctx_tsolen; 408 uint32_t itc_ctx_mss; 409 } i40e_tx_context_t; 410 411 /* 412 * Toggles on debug builds which can be used to override our RX behaviour based 413 * on thresholds. 414 */ 415 #ifdef DEBUG 416 typedef enum { 417 I40E_DEBUG_RX_DEFAULT = 0, 418 I40E_DEBUG_RX_BCOPY = 1, 419 I40E_DEBUG_RX_DMABIND = 2 420 } i40e_debug_rx_t; 421 422 i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; 423 #endif /* DEBUG */ 424 425 /* 426 * Notes on the following pair of DMA attributes. The first attribute, 427 * i40e_static_dma_attr, is designed to be used for both the descriptor rings 428 * and the static buffers that we associate with control blocks. For this 429 * reason, we force an SGL length of one. While technically the driver supports 430 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our 431 * management here. In addition, when the Intel common code wants to allocate 432 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage 433 * the static dma attr. 434 * 435 * The latter two sets of attributes, are what we use when we're binding a 436 * bunch of mblk_t fragments to go out the door. Note that the main difference 437 * here is that we're allowed a larger SGL length. For non-LSO TX, we 438 * restrict the SGL length to match the number of TX buffers available to the 439 * PF (8). For the LSO case we can go much larger, with the caveat that each 440 * MSS-sized chunk (segment) must not span more than 8 data descriptors and 441 * hence must not span more than 8 cookies. 442 * 443 * Note, we default to setting ourselves to be DMA capable here. However, 444 * because we could have multiple instances which have different FMA error 445 * checking capabilities, or end up on different buses, we make these static 446 * and const and copy them into the i40e_t for the given device with the actual 447 * values that reflect the actual capabilities. 
448 */ 449 static const ddi_dma_attr_t i40e_g_static_dma_attr = { 450 DMA_ATTR_V0, /* version number */ 451 0x0000000000000000ull, /* low address */ 452 0xFFFFFFFFFFFFFFFFull, /* high address */ 453 0x00000000FFFFFFFFull, /* dma counter max */ 454 I40E_DMA_ALIGNMENT, /* alignment */ 455 0x00000FFF, /* burst sizes */ 456 0x00000001, /* minimum transfer size */ 457 0x00000000FFFFFFFFull, /* maximum transfer size */ 458 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 459 1, /* scatter/gather list length */ 460 0x00000001, /* granularity */ 461 DDI_DMA_FLAGERR /* DMA flags */ 462 }; 463 464 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { 465 DMA_ATTR_V0, /* version number */ 466 0x0000000000000000ull, /* low address */ 467 0xFFFFFFFFFFFFFFFFull, /* high address */ 468 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ 469 I40E_DMA_ALIGNMENT, /* alignment */ 470 0x00000FFF, /* burst sizes */ 471 0x00000001, /* minimum transfer size */ 472 0x00000000FFFFFFFFull, /* maximum transfer size */ 473 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 474 I40E_TX_MAX_COOKIE, /* scatter/gather list length */ 475 0x00000001, /* granularity */ 476 DDI_DMA_FLAGERR /* DMA flags */ 477 }; 478 479 static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { 480 DMA_ATTR_V0, /* version number */ 481 0x0000000000000000ull, /* low address */ 482 0xFFFFFFFFFFFFFFFFull, /* high address */ 483 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ 484 I40E_DMA_ALIGNMENT, /* alignment */ 485 0x00000FFF, /* burst sizes */ 486 0x00000001, /* minimum transfer size */ 487 0x00000000FFFFFFFFull, /* maximum transfer size */ 488 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 489 I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ 490 0x00000001, /* granularity */ 491 DDI_DMA_FLAGERR /* DMA flags */ 492 }; 493 494 /* 495 * Next, we have the attributes for these structures. The descriptor rings are 496 * all strictly little endian, while the data buffers are just arrays of bytes 497 * representing frames. Because of this, we purposefully simplify the driver 498 * programming life by programming the descriptor ring as little endian, while 499 * for the buffer data we keep it as unstructured. 500 * 501 * Note, that to keep the Intel common code operating in a reasonable way, when 502 * we allocate DMA memory for it, we do not use byte swapping and thus use the 503 * standard i40e_buf_acc_attr. 504 */ 505 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = { 506 DDI_DEVICE_ATTR_V0, 507 DDI_STRUCTURE_LE_ACC, 508 DDI_STRICTORDER_ACC 509 }; 510 511 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = { 512 DDI_DEVICE_ATTR_V0, 513 DDI_NEVERSWAP_ACC, 514 DDI_STRICTORDER_ACC 515 }; 516 517 /* 518 * The next two functions are designed to be type-safe versions of macros that 519 * are used to increment and decrement a descriptor index in the loop. Note, 520 * these are marked inline to try and keep the data path hot and they were 521 * effectively inlined in their previous life as macros. 
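 *
 * For example, with a 1024 entry ring:
 *
 *	i40e_next_desc(1023, 1, 1024) == 0	wraps forward to the start
 *	i40e_prev_desc(0, 1, 1024) == 1023	wraps backward to the end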
522 */ 523 static inline int 524 i40e_next_desc(int base, int count, int size) 525 { 526 int out; 527 528 ASSERT(base >= 0); 529 ASSERT(count > 0); 530 ASSERT(size > 0); 531 532 if (base + count < size) { 533 out = base + count; 534 } else { 535 out = base + count - size; 536 } 537 538 ASSERT(out >= 0 && out < size); 539 return (out); 540 } 541 542 static inline int 543 i40e_prev_desc(int base, int count, int size) 544 { 545 int out; 546 547 ASSERT(base >= 0); 548 ASSERT(count > 0); 549 ASSERT(size > 0); 550 551 if (base >= count) { 552 out = base - count; 553 } else { 554 out = base - count + size; 555 } 556 557 ASSERT(out >= 0 && out < size); 558 return (out); 559 } 560 561 /* 562 * Free DMA memory that is represented by a i40e_dma_buffer_t. 563 */ 564 static void 565 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap) 566 { 567 if (dmap->dmab_dma_address != 0) { 568 VERIFY(dmap->dmab_dma_handle != NULL); 569 (void) ddi_dma_unbind_handle(dmap->dmab_dma_handle); 570 dmap->dmab_dma_address = 0; 571 dmap->dmab_size = 0; 572 } 573 574 if (dmap->dmab_acc_handle != NULL) { 575 ddi_dma_mem_free(&dmap->dmab_acc_handle); 576 dmap->dmab_acc_handle = NULL; 577 dmap->dmab_address = NULL; 578 } 579 580 if (dmap->dmab_dma_handle != NULL) { 581 ddi_dma_free_handle(&dmap->dmab_dma_handle); 582 dmap->dmab_dma_handle = NULL; 583 } 584 585 /* 586 * These should only be set if we have valid handles allocated and 587 * therefore should always be NULLed out due to the above code. This 588 * is here to catch us acting sloppy. 589 */ 590 ASSERT(dmap->dmab_dma_address == 0); 591 ASSERT(dmap->dmab_address == NULL); 592 ASSERT(dmap->dmab_size == 0); 593 dmap->dmab_len = 0; 594 } 595 596 /* 597 * Allocate size bytes of DMA memory based on the passed in attributes. This 598 * fills in the information in dmap and is designed for all of our single cookie 599 * allocations. 
600 */ 601 static boolean_t 602 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap, 603 ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream, 604 boolean_t zero, size_t size) 605 { 606 int ret; 607 uint_t flags; 608 size_t len; 609 ddi_dma_cookie_t cookie; 610 uint_t ncookies; 611 612 if (stream == B_TRUE) 613 flags = DDI_DMA_STREAMING; 614 else 615 flags = DDI_DMA_CONSISTENT; 616 617 /* 618 * Step one: Allocate the DMA handle 619 */ 620 ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT, 621 NULL, &dmap->dmab_dma_handle); 622 if (ret != DDI_SUCCESS) { 623 i40e_error(i40e, "failed to allocate dma handle for I/O " 624 "buffers: %d", ret); 625 dmap->dmab_dma_handle = NULL; 626 return (B_FALSE); 627 } 628 629 /* 630 * Step two: Allocate the DMA memory 631 */ 632 ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags, 633 DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len, 634 &dmap->dmab_acc_handle); 635 if (ret != DDI_SUCCESS) { 636 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " 637 "buffers", size); 638 dmap->dmab_address = NULL; 639 dmap->dmab_acc_handle = NULL; 640 i40e_free_dma_buffer(dmap); 641 return (B_FALSE); 642 } 643 644 /* 645 * Step three: Optionally zero 646 */ 647 if (zero == B_TRUE) 648 bzero(dmap->dmab_address, len); 649 650 /* 651 * Step four: Bind the memory 652 */ 653 ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL, 654 dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, 655 NULL, &cookie, &ncookies); 656 if (ret != DDI_DMA_MAPPED) { 657 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " 658 "buffers: %d", size, ret); 659 i40e_free_dma_buffer(dmap); 660 return (B_FALSE); 661 } 662 663 VERIFY(ncookies == 1); 664 dmap->dmab_dma_address = cookie.dmac_laddress; 665 dmap->dmab_size = len; 666 dmap->dmab_len = 0; 667 return (B_TRUE); 668 } 669 670 /* 671 * This function is called once the last pending rcb has been freed by the upper 672 * levels of the system. 
673 */ 674 static void 675 i40e_free_rx_data(i40e_rx_data_t *rxd) 676 { 677 VERIFY(rxd->rxd_rcb_pending == 0); 678 679 if (rxd->rxd_rcb_area != NULL) { 680 kmem_free(rxd->rxd_rcb_area, 681 sizeof (i40e_rx_control_block_t) * 682 (rxd->rxd_free_list_size + rxd->rxd_ring_size)); 683 rxd->rxd_rcb_area = NULL; 684 } 685 686 if (rxd->rxd_free_list != NULL) { 687 kmem_free(rxd->rxd_free_list, 688 sizeof (i40e_rx_control_block_t *) * 689 rxd->rxd_free_list_size); 690 rxd->rxd_free_list = NULL; 691 } 692 693 if (rxd->rxd_work_list != NULL) { 694 kmem_free(rxd->rxd_work_list, 695 sizeof (i40e_rx_control_block_t *) * 696 rxd->rxd_ring_size); 697 rxd->rxd_work_list = NULL; 698 } 699 700 kmem_free(rxd, sizeof (i40e_rx_data_t)); 701 } 702 703 static boolean_t 704 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) 705 { 706 i40e_rx_data_t *rxd; 707 708 rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP); 709 if (rxd == NULL) 710 return (B_FALSE); 711 itrq->itrq_rxdata = rxd; 712 rxd->rxd_i40e = i40e; 713 714 rxd->rxd_ring_size = i40e->i40e_rx_ring_size; 715 rxd->rxd_free_list_size = i40e->i40e_rx_ring_size; 716 717 rxd->rxd_rcb_free = rxd->rxd_free_list_size; 718 719 rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * 720 rxd->rxd_ring_size, KM_NOSLEEP); 721 if (rxd->rxd_work_list == NULL) { 722 i40e_error(i40e, "failed to allocate RX work list for a ring " 723 "of %d entries for ring %d", rxd->rxd_ring_size, 724 itrq->itrq_index); 725 goto cleanup; 726 } 727 728 rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * 729 rxd->rxd_free_list_size, KM_NOSLEEP); 730 if (rxd->rxd_free_list == NULL) { 731 i40e_error(i40e, "failed to allocate a %d entry RX free list " 732 "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); 733 goto cleanup; 734 } 735 736 rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) * 737 (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP); 738 if (rxd->rxd_rcb_area == NULL) { 739 i40e_error(i40e, "failed to allocate a %d entry rcb area for " 740 "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size, 741 itrq->itrq_index); 742 goto cleanup; 743 } 744 745 return (B_TRUE); 746 747 cleanup: 748 i40e_free_rx_data(rxd); 749 itrq->itrq_rxdata = NULL; 750 return (B_FALSE); 751 } 752 753 /* 754 * Free all of the memory that we've allocated for DMA. Note that we may have 755 * buffers that we've loaned up to the OS which are still outstanding. We'll 756 * always free up the descriptor ring, because we no longer need that. For each 757 * rcb, we'll iterate over it and if we send the reference count to zero, then 758 * we'll free the message block and DMA related resources. However, if we don't 759 * take the last one, then we'll go ahead and keep track that we'll have pending 760 * data and clean it up when we get there. 761 */ 762 static void 763 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init) 764 { 765 uint32_t i, count, ref; 766 767 i40e_rx_control_block_t *rcb; 768 i40e_t *i40e = rxd->rxd_i40e; 769 770 i40e_free_dma_buffer(&rxd->rxd_desc_area); 771 rxd->rxd_desc_ring = NULL; 772 rxd->rxd_desc_next = 0; 773 774 mutex_enter(&i40e->i40e_rx_pending_lock); 775 776 rcb = rxd->rxd_rcb_area; 777 count = rxd->rxd_ring_size + rxd->rxd_free_list_size; 778 779 for (i = 0; i < count; i++, rcb++) { 780 VERIFY(rcb != NULL); 781 782 /* 783 * If we're cleaning up from a failed creation attempt, then an 784 * entry may never have been assembled which would mean that 785 * it's reference count is zero. 
If we find that, we leave it 786 * be, because nothing else should be modifying it at this 787 * point. We're not at the point that any more references can be 788 * added, just removed. 789 */ 790 if (failed_init == B_TRUE && rcb->rcb_ref == 0) 791 continue; 792 793 ref = atomic_dec_32_nv(&rcb->rcb_ref); 794 if (ref == 0) { 795 freemsg(rcb->rcb_mp); 796 rcb->rcb_mp = NULL; 797 i40e_free_dma_buffer(&rcb->rcb_dma); 798 } else { 799 atomic_inc_32(&rxd->rxd_rcb_pending); 800 atomic_inc_32(&i40e->i40e_rx_pending); 801 } 802 } 803 mutex_exit(&i40e->i40e_rx_pending_lock); 804 } 805 806 /* 807 * Initialize the DMA memory for the descriptor ring and for each frame in the 808 * control block list. 809 */ 810 static boolean_t 811 i40e_alloc_rx_dma(i40e_rx_data_t *rxd) 812 { 813 int i, count; 814 size_t dmasz; 815 i40e_rx_control_block_t *rcb; 816 i40e_t *i40e = rxd->rxd_i40e; 817 818 /* 819 * First allocate the RX descriptor ring. 820 */ 821 dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; 822 VERIFY(dmasz > 0); 823 if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area, 824 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, 825 B_TRUE, dmasz) == B_FALSE) { 826 i40e_error(i40e, "failed to allocate DMA resources " 827 "for RX descriptor ring"); 828 return (B_FALSE); 829 } 830 rxd->rxd_desc_ring = 831 (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address; 832 rxd->rxd_desc_next = 0; 833 834 count = rxd->rxd_ring_size + rxd->rxd_free_list_size; 835 rcb = rxd->rxd_rcb_area; 836 837 dmasz = i40e->i40e_rx_buf_size; 838 VERIFY(dmasz > 0); 839 for (i = 0; i < count; i++, rcb++) { 840 i40e_dma_buffer_t *dmap; 841 VERIFY(rcb != NULL); 842 843 if (i < rxd->rxd_ring_size) { 844 rxd->rxd_work_list[i] = rcb; 845 } else { 846 rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb; 847 } 848 849 dmap = &rcb->rcb_dma; 850 if (i40e_alloc_dma_buffer(i40e, dmap, 851 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, 852 B_TRUE, B_FALSE, dmasz) == B_FALSE) { 853 i40e_error(i40e, "failed to allocate RX dma buffer"); 854 return (B_FALSE); 855 } 856 857 /* 858 * Initialize the control block and offset the DMA address. See 859 * the note in the big theory statement that explains how this 860 * helps IP deal with alignment. Note, we don't worry about 861 * whether or not we successfully get an mblk_t from desballoc, 862 * it's a common case that we have to handle later on in the 863 * system. 
864 */ 865 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT; 866 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT; 867 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT; 868 869 rcb->rcb_ref = 1; 870 rcb->rcb_rxd = rxd; 871 rcb->rcb_free_rtn.free_func = i40e_rx_recycle; 872 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb; 873 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address, 874 dmap->dmab_size, 0, &rcb->rcb_free_rtn); 875 } 876 877 return (B_TRUE); 878 } 879 880 static void 881 i40e_free_tx_dma(i40e_trqpair_t *itrq) 882 { 883 size_t fsz; 884 885 if (itrq->itrq_tcb_area != NULL) { 886 uint32_t i; 887 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area; 888 889 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { 890 i40e_free_dma_buffer(&tcb->tcb_dma); 891 if (tcb->tcb_dma_handle != NULL) { 892 ddi_dma_free_handle(&tcb->tcb_dma_handle); 893 tcb->tcb_dma_handle = NULL; 894 } 895 if (tcb->tcb_lso_dma_handle != NULL) { 896 ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); 897 tcb->tcb_lso_dma_handle = NULL; 898 } 899 } 900 901 fsz = sizeof (i40e_tx_control_block_t) * 902 itrq->itrq_tx_free_list_size; 903 kmem_free(itrq->itrq_tcb_area, fsz); 904 itrq->itrq_tcb_area = NULL; 905 } 906 907 if (itrq->itrq_tcb_free_list != NULL) { 908 fsz = sizeof (i40e_tx_control_block_t *) * 909 itrq->itrq_tx_free_list_size; 910 kmem_free(itrq->itrq_tcb_free_list, fsz); 911 itrq->itrq_tcb_free_list = NULL; 912 } 913 914 if (itrq->itrq_tcb_work_list != NULL) { 915 fsz = sizeof (i40e_tx_control_block_t *) * 916 itrq->itrq_tx_ring_size; 917 kmem_free(itrq->itrq_tcb_work_list, fsz); 918 itrq->itrq_tcb_work_list = NULL; 919 } 920 921 i40e_free_dma_buffer(&itrq->itrq_desc_area); 922 itrq->itrq_desc_ring = NULL; 923 924 } 925 926 static boolean_t 927 i40e_alloc_tx_dma(i40e_trqpair_t *itrq) 928 { 929 int i, ret; 930 size_t dmasz; 931 i40e_tx_control_block_t *tcb; 932 i40e_t *i40e = itrq->itrq_i40e; 933 934 itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size; 935 itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size + 936 (i40e->i40e_tx_ring_size >> 1); 937 938 /* 939 * Allocate an additional TX descriptor for the writeback head. 
940 */ 941 dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; 942 dmasz += sizeof (i40e_tx_desc_t); 943 944 VERIFY(dmasz > 0); 945 if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, 946 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, 947 B_FALSE, B_TRUE, dmasz) == B_FALSE) { 948 i40e_error(i40e, "failed to allocate DMA resources for TX " 949 "descriptor ring"); 950 return (B_FALSE); 951 } 952 itrq->itrq_desc_ring = 953 (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address; 954 itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring + 955 itrq->itrq_tx_ring_size); 956 itrq->itrq_desc_head = 0; 957 itrq->itrq_desc_tail = 0; 958 itrq->itrq_desc_free = itrq->itrq_tx_ring_size; 959 960 itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * 961 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); 962 if (itrq->itrq_tcb_work_list == NULL) { 963 i40e_error(i40e, "failed to allocate a %d entry TX work list " 964 "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); 965 goto cleanup; 966 } 967 968 itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * 969 sizeof (i40e_tx_control_block_t *), KM_SLEEP); 970 if (itrq->itrq_tcb_free_list == NULL) { 971 i40e_error(i40e, "failed to allocate a %d entry TX free list " 972 "for ring %d", itrq->itrq_tx_free_list_size, 973 itrq->itrq_index); 974 goto cleanup; 975 } 976 977 /* 978 * We allocate enough TX control blocks to cover the free list. 979 */ 980 itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * 981 itrq->itrq_tx_free_list_size, KM_NOSLEEP); 982 if (itrq->itrq_tcb_area == NULL) { 983 i40e_error(i40e, "failed to allocate a %d entry tcb area for " 984 "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); 985 goto cleanup; 986 } 987 988 /* 989 * For each tcb, allocate DMA memory. 990 */ 991 dmasz = i40e->i40e_tx_buf_size; 992 VERIFY(dmasz > 0); 993 tcb = itrq->itrq_tcb_area; 994 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { 995 VERIFY(tcb != NULL); 996 997 /* 998 * Allocate both a DMA buffer which we'll use for when we copy 999 * packets for transmission and allocate a DMA handle which 1000 * we'll use when we bind data. 1001 */ 1002 ret = ddi_dma_alloc_handle(i40e->i40e_dip, 1003 &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, 1004 &tcb->tcb_dma_handle); 1005 if (ret != DDI_SUCCESS) { 1006 i40e_error(i40e, "failed to allocate DMA handle for TX " 1007 "data binding on ring %d: %d", itrq->itrq_index, 1008 ret); 1009 tcb->tcb_dma_handle = NULL; 1010 goto cleanup; 1011 } 1012 1013 ret = ddi_dma_alloc_handle(i40e->i40e_dip, 1014 &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, 1015 &tcb->tcb_lso_dma_handle); 1016 if (ret != DDI_SUCCESS) { 1017 i40e_error(i40e, "failed to allocate DMA handle for TX " 1018 "LSO data binding on ring %d: %d", itrq->itrq_index, 1019 ret); 1020 tcb->tcb_lso_dma_handle = NULL; 1021 goto cleanup; 1022 } 1023 1024 if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, 1025 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, 1026 B_TRUE, B_FALSE, dmasz) == B_FALSE) { 1027 i40e_error(i40e, "failed to allocate %ld bytes of " 1028 "DMA for TX data binding on ring %d", dmasz, 1029 itrq->itrq_index); 1030 goto cleanup; 1031 } 1032 1033 itrq->itrq_tcb_free_list[i] = tcb; 1034 } 1035 1036 itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size; 1037 1038 return (B_TRUE); 1039 1040 cleanup: 1041 i40e_free_tx_dma(itrq); 1042 return (B_FALSE); 1043 } 1044 1045 /* 1046 * Free all memory associated with all of the rings on this i40e instance. 
Note, 1047 * this is done as part of the GLDv3 stop routine. 1048 */ 1049 void 1050 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) 1051 { 1052 int i; 1053 1054 for (i = 0; i < i40e->i40e_num_trqpairs; i++) { 1055 i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; 1056 1057 /* 1058 * In some cases i40e_alloc_rx_data() may have failed 1059 * and in that case there is no rxd to free. 1060 */ 1061 if (rxd == NULL) 1062 continue; 1063 1064 /* 1065 * Clean up our RX data. We have to free DMA resources first and 1066 * then if we have no more pending RCB's, then we'll go ahead 1067 * and clean things up. Note, we can't set the stopped flag on 1068 * the RX data until after we've done the first pass of the 1069 * pending resources. Otherwise we might race with 1070 * i40e_rx_recycle on determining who should free the 1071 * i40e_rx_data_t above. 1072 */ 1073 i40e_free_rx_dma(rxd, failed_init); 1074 1075 mutex_enter(&i40e->i40e_rx_pending_lock); 1076 rxd->rxd_shutdown = B_TRUE; 1077 if (rxd->rxd_rcb_pending == 0) { 1078 i40e_free_rx_data(rxd); 1079 i40e->i40e_trqpairs[i].itrq_rxdata = NULL; 1080 } 1081 mutex_exit(&i40e->i40e_rx_pending_lock); 1082 1083 i40e_free_tx_dma(&i40e->i40e_trqpairs[i]); 1084 } 1085 } 1086 1087 /* 1088 * Allocate all of the resources associated with all of the rings on this i40e 1089 * instance. Note this is done as part of the GLDv3 start routine and thus we 1090 * should not use blocking allocations. This takes care of both DMA and non-DMA 1091 * related resources. 1092 */ 1093 boolean_t 1094 i40e_alloc_ring_mem(i40e_t *i40e) 1095 { 1096 int i; 1097 1098 for (i = 0; i < i40e->i40e_num_trqpairs; i++) { 1099 if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) == 1100 B_FALSE) 1101 goto unwind; 1102 1103 if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) == 1104 B_FALSE) 1105 goto unwind; 1106 1107 if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE) 1108 goto unwind; 1109 } 1110 1111 return (B_TRUE); 1112 1113 unwind: 1114 i40e_free_ring_mem(i40e, B_TRUE); 1115 return (B_FALSE); 1116 } 1117 1118 1119 /* 1120 * Because every instance of i40e may have different support for FMA 1121 * capabilities, we copy the DMA attributes into the i40e_t and set them that 1122 * way and use them for determining attributes. 
1123 */ 1124 void 1125 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) 1126 { 1127 bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr, 1128 sizeof (ddi_dma_attr_t)); 1129 bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, 1130 sizeof (ddi_dma_attr_t)); 1131 bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, 1132 sizeof (ddi_dma_attr_t)); 1133 bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, 1134 sizeof (ddi_device_acc_attr_t)); 1135 bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, 1136 sizeof (ddi_device_acc_attr_t)); 1137 1138 if (fma == B_TRUE) { 1139 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 1140 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 1141 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= 1142 DDI_DMA_FLAGERR; 1143 } else { 1144 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; 1145 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; 1146 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= 1147 ~DDI_DMA_FLAGERR; 1148 } 1149 } 1150 1151 static void 1152 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb) 1153 { 1154 mutex_enter(&rxd->rxd_free_lock); 1155 ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size); 1156 ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL); 1157 rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb; 1158 rxd->rxd_rcb_free++; 1159 mutex_exit(&rxd->rxd_free_lock); 1160 } 1161 1162 static i40e_rx_control_block_t * 1163 i40e_rcb_alloc(i40e_rx_data_t *rxd) 1164 { 1165 i40e_rx_control_block_t *rcb; 1166 1167 mutex_enter(&rxd->rxd_free_lock); 1168 if (rxd->rxd_rcb_free == 0) { 1169 mutex_exit(&rxd->rxd_free_lock); 1170 return (NULL); 1171 } 1172 rxd->rxd_rcb_free--; 1173 rcb = rxd->rxd_free_list[rxd->rxd_rcb_free]; 1174 VERIFY(rcb != NULL); 1175 rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL; 1176 mutex_exit(&rxd->rxd_free_lock); 1177 1178 return (rcb); 1179 } 1180 1181 /* 1182 * This is the callback that we get from the OS when freemsg(9F) has been called 1183 * on a loaned descriptor. In addition, if we take the last reference count 1184 * here, then we have to tear down all of the RX data. 1185 */ 1186 void 1187 i40e_rx_recycle(caddr_t arg) 1188 { 1189 uint32_t ref; 1190 i40e_rx_control_block_t *rcb; 1191 i40e_rx_data_t *rxd; 1192 i40e_t *i40e; 1193 1194 /* LINTED: E_BAD_PTR_CAST_ALIGN */ 1195 rcb = (i40e_rx_control_block_t *)arg; 1196 rxd = rcb->rcb_rxd; 1197 i40e = rxd->rxd_i40e; 1198 1199 /* 1200 * It's possible for this to be called with a reference count of zero. 1201 * That will happen when we're doing the freemsg after taking the last 1202 * reference because we're tearing down everything and this rcb is not 1203 * outstanding. 1204 */ 1205 if (rcb->rcb_ref == 0) 1206 return; 1207 1208 /* 1209 * Don't worry about failure of desballoc here. It'll only become fatal 1210 * if we're trying to use it and we can't in i40e_rx_bind(). 1211 */ 1212 rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address, 1213 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); 1214 i40e_rcb_free(rxd, rcb); 1215 1216 /* 1217 * It's possible that the rcb was being used while we are shutting down 1218 * the device. In that case, we'll take the final reference from the 1219 * device here. 
1220 */ 1221 ref = atomic_dec_32_nv(&rcb->rcb_ref); 1222 if (ref == 0) { 1223 freemsg(rcb->rcb_mp); 1224 rcb->rcb_mp = NULL; 1225 i40e_free_dma_buffer(&rcb->rcb_dma); 1226 1227 mutex_enter(&i40e->i40e_rx_pending_lock); 1228 atomic_dec_32(&rxd->rxd_rcb_pending); 1229 atomic_dec_32(&i40e->i40e_rx_pending); 1230 1231 /* 1232 * If this was the last block and it's been indicated that we've 1233 * passed the shutdown point, we should clean up. 1234 */ 1235 if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) { 1236 i40e_free_rx_data(rxd); 1237 cv_broadcast(&i40e->i40e_rx_pending_cv); 1238 } 1239 1240 mutex_exit(&i40e->i40e_rx_pending_lock); 1241 } 1242 } 1243 1244 static mblk_t * 1245 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, 1246 uint32_t plen) 1247 { 1248 mblk_t *mp; 1249 i40e_t *i40e = rxd->rxd_i40e; 1250 i40e_rx_control_block_t *rcb, *rep_rcb; 1251 1252 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); 1253 1254 if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) { 1255 itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++; 1256 return (NULL); 1257 } 1258 1259 rcb = rxd->rxd_work_list[index]; 1260 1261 /* 1262 * Check to make sure we have a mblk_t. If we don't, this is our last 1263 * chance to try and get one. 1264 */ 1265 if (rcb->rcb_mp == NULL) { 1266 rcb->rcb_mp = 1267 desballoc((unsigned char *)rcb->rcb_dma.dmab_address, 1268 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); 1269 if (rcb->rcb_mp == NULL) { 1270 itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++; 1271 i40e_rcb_free(rxd, rcb); 1272 return (NULL); 1273 } 1274 } 1275 1276 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); 1277 1278 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { 1279 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1280 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1281 i40e_rcb_free(rxd, rcb); 1282 return (NULL); 1283 } 1284 1285 /* 1286 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT. 1287 */ 1288 mp = rcb->rcb_mp; 1289 atomic_inc_32(&rcb->rcb_ref); 1290 mp->b_wptr = mp->b_rptr + plen; 1291 mp->b_next = mp->b_cont = NULL; 1292 1293 rxd->rxd_work_list[index] = rep_rcb; 1294 return (mp); 1295 } 1296 1297 /* 1298 * We're going to allocate a new message block for this frame and attempt to 1299 * receive it. See the big theory statement for more information on when we copy 1300 * versus bind. 1301 */ 1302 static mblk_t * 1303 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, 1304 uint32_t plen) 1305 { 1306 i40e_t *i40e = rxd->rxd_i40e; 1307 i40e_rx_control_block_t *rcb; 1308 mblk_t *mp; 1309 1310 ASSERT(index < rxd->rxd_ring_size); 1311 rcb = rxd->rxd_work_list[index]; 1312 1313 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); 1314 1315 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { 1316 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1317 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1318 return (NULL); 1319 } 1320 1321 mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0); 1322 if (mp == NULL) { 1323 itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++; 1324 return (NULL); 1325 } 1326 1327 mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT; 1328 bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen); 1329 mp->b_wptr = mp->b_rptr + plen; 1330 1331 return (mp); 1332 } 1333 1334 /* 1335 * Determine if the device has enabled any checksum flags for us. The level of 1336 * checksum computed will depend on the type packet that we have, which is 1337 * contained in ptype. 
For example, the checksum logic it does will vary 1338 * depending on whether or not the packet is considered tunneled, whether it 1339 * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are 1340 * valid. 1341 * 1342 * While there are additional checksums that we could recognize here, we'll need 1343 * to get some additional GLDv3 enhancements to be able to properly describe 1344 * them. 1345 */ 1346 static void 1347 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err, 1348 uint32_t ptype) 1349 { 1350 uint32_t cksum; 1351 struct i40e_rx_ptype_decoded pinfo; 1352 1353 ASSERT(ptype <= 255); 1354 pinfo = decode_rx_desc_ptype(ptype); 1355 1356 cksum = 0; 1357 1358 /* 1359 * If the ptype isn't something that we know in the driver, then we 1360 * shouldn't even consider moving forward. 1361 */ 1362 if (pinfo.known == 0) { 1363 itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++; 1364 return; 1365 } 1366 1367 /* 1368 * If hardware didn't set the L3L4P bit on the frame, then there is no 1369 * checksum offload to consider. 1370 */ 1371 if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) { 1372 itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++; 1373 return; 1374 } 1375 1376 /* 1377 * The device tells us that IPv6 checksums where a Destination Options 1378 * Header or a Routing header shouldn't be trusted. Discard all 1379 * checksums in this case. 1380 */ 1381 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1382 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 && 1383 (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) { 1384 itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++; 1385 return; 1386 } 1387 1388 /* 1389 * The hardware denotes three kinds of possible errors. Two are reserved 1390 * for inner and outer IP checksum errors (IPE and EIPE) and the latter 1391 * is for L4 checksum errors (L4E). If there is only one IP header, then 1392 * the only thing that we care about is IPE. Note that since we don't 1393 * support inner checksums, we will ignore IPE being set on tunneled 1394 * packets and only care about EIPE. 1395 */ 1396 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1397 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) { 1398 if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) { 1399 if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) { 1400 itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++; 1401 } else { 1402 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; 1403 cksum |= HCK_IPV4_HDRCKSUM_OK; 1404 } 1405 } else { 1406 if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) { 1407 itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++; 1408 } else { 1409 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; 1410 cksum |= HCK_IPV4_HDRCKSUM_OK; 1411 } 1412 } 1413 } 1414 1415 /* 1416 * We only have meaningful L4 checksums in the case of IP->L4 and 1417 * IP->IP->L4. There is not outer L4 checksum data available in any 1418 * other case. Further, we don't bother reporting the valid checksum in 1419 * the case of IP->IP->L4 set. 
1420 */ 1421 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1422 pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE && 1423 (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP || 1424 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP || 1425 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP || 1426 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) { 1427 ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4); 1428 if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) { 1429 itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++; 1430 } else { 1431 itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++; 1432 cksum |= HCK_FULLCKSUM_OK; 1433 } 1434 } 1435 1436 if (cksum != 0) { 1437 itrq->itrq_rxstat.irxs_hck_set.value.ui64++; 1438 mac_hcksum_set(mp, 0, 0, 0, 0, cksum); 1439 } else { 1440 itrq->itrq_rxstat.irxs_hck_miss.value.ui64++; 1441 } 1442 } 1443 1444 mblk_t * 1445 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes) 1446 { 1447 i40e_t *i40e; 1448 i40e_hw_t *hw; 1449 i40e_rx_data_t *rxd; 1450 uint32_t cur_head; 1451 i40e_rx_desc_t *cur_desc; 1452 i40e_rx_control_block_t *rcb; 1453 uint64_t rx_bytes, rx_frames; 1454 uint64_t stword; 1455 mblk_t *mp, *mp_head, **mp_tail; 1456 1457 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); 1458 rxd = itrq->itrq_rxdata; 1459 i40e = itrq->itrq_i40e; 1460 hw = &i40e->i40e_hw_space; 1461 1462 if (!(i40e->i40e_state & I40E_STARTED) || 1463 (i40e->i40e_state & I40E_OVERTEMP) || 1464 (i40e->i40e_state & I40E_SUSPENDED) || 1465 (i40e->i40e_state & I40E_ERROR)) 1466 return (NULL); 1467 1468 /* 1469 * Before we do anything else, we have to make sure that all of the DMA 1470 * buffers are synced up and then check to make sure that they're 1471 * actually good from an FM perspective. 1472 */ 1473 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL); 1474 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != 1475 DDI_FM_OK) { 1476 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1477 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1478 return (NULL); 1479 } 1480 1481 /* 1482 * Prepare our stats. We do a limited amount of processing in both 1483 * polling and interrupt context. The limit in interrupt context is 1484 * based on frames, in polling context based on bytes. 1485 */ 1486 rx_bytes = rx_frames = 0; 1487 mp_head = NULL; 1488 mp_tail = &mp_head; 1489 1490 /* 1491 * At this point, the descriptor ring is available to check. We'll try 1492 * and process until we either run out of poll_bytes or descriptors. 1493 */ 1494 cur_head = rxd->rxd_desc_next; 1495 cur_desc = &rxd->rxd_desc_ring[cur_head]; 1496 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); 1497 1498 /* 1499 * Note, the primary invariant of this loop should be that cur_head, 1500 * cur_desc, and stword always point to the currently processed 1501 * descriptor. When we leave the loop, it should point to a descriptor 1502 * that HAS NOT been processed. Meaning, that if we haven't consumed the 1503 * frame, the descriptor should not be advanced. 1504 */ 1505 while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) { 1506 uint32_t error, eop, plen, ptype; 1507 1508 /* 1509 * The DD, PLEN, and EOP bits are the only ones that are valid 1510 * in every frame. The error information is only valid when EOP 1511 * is set in the same frame. 1512 * 1513 * At this time, because we don't do any LRO or header 1514 * splitting. We expect that every frame should have EOP set in 1515 * it. When later functionality comes in, we'll want to 1516 * re-evaluate this. 
1517 */ 1518 eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT); 1519 VERIFY(eop != 0); 1520 1521 error = (stword & I40E_RXD_QW1_ERROR_MASK) >> 1522 I40E_RXD_QW1_ERROR_SHIFT; 1523 if (error & I40E_RX_ERR_BITS) { 1524 itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++; 1525 goto discard; 1526 } 1527 1528 plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> 1529 I40E_RXD_QW1_LENGTH_PBUF_SHIFT; 1530 1531 ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >> 1532 I40E_RXD_QW1_PTYPE_SHIFT; 1533 1534 /* 1535 * This packet contains valid data. We should check to see if 1536 * we're actually going to consume it based on its length (to 1537 * ensure that we don't overshoot our quota). We determine 1538 * whether to bcopy or bind the DMA resources based on the size 1539 * of the frame. However, on debug builds, we allow this to be 1540 * overridden for testing purposes. 1541 * 1542 * We should be smarter about choosing the copy-versus-bind 1543 * threshold (i40e_rx_dma_min), but for now, it's really more 1544 * important that we keep something simple working. 1545 */ 1546 1547 /* 1548 * Ensure we don't exceed our polling quota by reading this 1549 * frame. Note that we only bump bytes now; we bump frames later. 1550 */ 1551 if ((poll_bytes != I40E_POLL_NULL) && 1552 (rx_bytes + plen) > poll_bytes) 1553 break; 1554 rx_bytes += plen; 1555 1556 mp = NULL; 1557 if (plen >= i40e->i40e_rx_dma_min) 1558 mp = i40e_rx_bind(itrq, rxd, cur_head, plen); 1559 if (mp == NULL) 1560 mp = i40e_rx_copy(itrq, rxd, cur_head, plen); 1561 1562 if (mp != NULL) { 1563 if (i40e->i40e_rx_hcksum_enable) 1564 i40e_rx_hcksum(itrq, mp, stword, error, ptype); 1565 *mp_tail = mp; 1566 mp_tail = &mp->b_next; 1567 } 1568 1569 /* 1570 * Now we need to prepare this frame for use again. See the 1571 * discussion in the big theory statements. 1572 * 1573 * Whether we bound the DMA memory to a message block or bcopied 1574 * it out, the work list entry at this index refers to the control 1575 * block that now owns the receive buffer (a bind will have 1576 * swapped in a block from the free list), so we can always use 1577 * the current index to reprogram the descriptor from the work list. 1578 */ 1579 discard: 1580 rcb = rxd->rxd_work_list[cur_head]; 1581 cur_desc->read.pkt_addr = 1582 CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address); 1583 cur_desc->read.hdr_addr = 0; 1584 1585 /* 1586 * Finally, update our loop invariants. 1587 */ 1588 cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size); 1589 cur_desc = &rxd->rxd_desc_ring[cur_head]; 1590 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); 1591 1592 /* 1593 * To help provide liveness, we limit the amount of data that 1594 * we'll end up counting. Note that in these cases, an interrupt 1595 * is not dissimilar from a polling request. 1596 */ 1597 rx_frames++; 1598 if (rx_frames > i40e->i40e_rx_limit_per_intr) { 1599 itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++; 1600 break; 1601 } 1602 } 1603 1604 /* 1605 * As we've modified the ring, we need to make sure that we sync the 1606 * descriptor ring for the device. Next, we update the hardware's tail 1607 * and our own notion of which descriptor we will read back from 1608 * hardware next.
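 *
 * A small worked example with hypothetical values: on a 1024-entry ring
 * with rxd_desc_next previously at index 2, consuming three descriptors
 * this pass leaves cur_head at 5. We record rxd_desc_next = 5 and write
 * i40e_prev_desc(5, 1, 1024) == 4 to QRX_TAIL, handing the three
 * re-armed descriptors back to the hardware.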
1609 */ 1610 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV); 1611 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != 1612 DDI_FM_OK) { 1613 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1614 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1615 } 1616 1617 if (rx_frames != 0) { 1618 uint32_t tail; 1619 ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle; 1620 rxd->rxd_desc_next = cur_head; 1621 tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size); 1622 1623 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail); 1624 if (i40e_check_acc_handle(rh) != DDI_FM_OK) { 1625 ddi_fm_service_impact(i40e->i40e_dip, 1626 DDI_SERVICE_DEGRADED); 1627 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1628 } 1629 1630 itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes; 1631 itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames; 1632 } 1633 1634 #ifdef DEBUG 1635 if (rx_frames == 0) { 1636 ASSERT(rx_bytes == 0); 1637 } 1638 #endif 1639 1640 return (mp_head); 1641 } 1642 1643 /* 1644 * This function is called by the GLDv3 when it wants to poll on a ring. The 1645 * only primary difference from when we call this during an interrupt is that we 1646 * have a limit on the number of bytes that we should consume. 1647 */ 1648 mblk_t * 1649 i40e_ring_rx_poll(void *arg, int poll_bytes) 1650 { 1651 i40e_trqpair_t *itrq = arg; 1652 mblk_t *mp; 1653 1654 ASSERT(poll_bytes > 0); 1655 if (poll_bytes == 0) 1656 return (NULL); 1657 1658 mutex_enter(&itrq->itrq_rx_lock); 1659 mp = i40e_ring_rx(itrq, poll_bytes); 1660 mutex_exit(&itrq->itrq_rx_lock); 1661 1662 return (mp); 1663 } 1664 1665 /* 1666 * This is a structure I wish someone would fill out for me for dorking with the 1667 * checksums. When we get some more experience with this, we should go ahead and 1668 * consider adding this to MAC. 1669 */ 1670 typedef enum mac_ether_offload_flags { 1671 MEOI_L2INFO_SET = 0x01, 1672 MEOI_VLAN_TAGGED = 0x02, 1673 MEOI_L3INFO_SET = 0x04, 1674 MEOI_L3CKSUM_SET = 0x08, 1675 MEOI_L4INFO_SET = 0x10, 1676 MEOI_L4CKSUM_SET = 0x20 1677 } mac_ether_offload_flags_t; 1678 1679 typedef struct mac_ether_offload_info { 1680 mac_ether_offload_flags_t meoi_flags; 1681 uint8_t meoi_l2hlen; /* How long is the Ethernet header? */ 1682 uint16_t meoi_l3proto; /* What's the Ethertype */ 1683 uint8_t meoi_l3hlen; /* How long is the header? */ 1684 uint8_t meoi_l4proto; /* What is the payload type? */ 1685 uint8_t meoi_l4hlen; /* How long is the L4 header */ 1686 mblk_t *meoi_l3ckmp; /* Which mblk has the l3 checksum */ 1687 off_t meoi_l3ckoff; /* What's the offset to it */ 1688 mblk_t *meoi_l4ckmp; /* Which mblk has the L4 checksum */ 1689 off_t meoi_l4off; /* What is the offset to it? */ 1690 } mac_ether_offload_info_t; 1691 1692 /* 1693 * This is something that we'd like to make a general MAC function. Before we do 1694 * that, we should add support for TSO. 1695 * 1696 * We should really keep track of our offset and not walk everything every 1697 * time. I can't imagine that this will be kind to us at high packet rates; 1698 * however, for the moment, let's leave that. 1699 * 1700 * This walks a message block chain without pulling up to fill in the context 1701 * information. Note that the data we care about could be hidden across more 1702 * than one mblk_t. 
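 *
 * For example, with a hypothetical chain whose first mblk holds only 15
 * bytes of a VLAN-tagged frame, the 2-byte inner ether_type at offset 16
 * lives entirely in the second mblk, while a 2-byte read at offset 14
 * finds its first byte at the end of the first mblk and its second byte
 * at the start of the next one. The helpers below handle both cases
 * without pulling up the chain.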
1703 */ 1704 static int 1705 i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out) 1706 { 1707 size_t mpsize; 1708 uint8_t *bp; 1709 1710 mpsize = msgsize(mp); 1711 /* Check for overflow */ 1712 if (off + sizeof (uint8_t) > mpsize) 1713 return (-1); 1714 1715 mpsize = MBLKL(mp); 1716 while (off >= mpsize) { 1717 mp = mp->b_cont; 1718 off -= mpsize; 1719 mpsize = MBLKL(mp); 1720 } 1721 1722 bp = mp->b_rptr + off; 1723 *out = *bp; 1724 return (0); 1725 1726 } 1727 1728 static int 1729 i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out) 1730 { 1731 size_t mpsize; 1732 uint8_t *bp; 1733 1734 mpsize = msgsize(mp); 1735 /* Check for overflow */ 1736 if (off + sizeof (uint16_t) > mpsize) 1737 return (-1); 1738 1739 mpsize = MBLKL(mp); 1740 while (off >= mpsize) { 1741 mp = mp->b_cont; 1742 off -= mpsize; 1743 mpsize = MBLKL(mp); 1744 } 1745 1746 /* 1747 * Data is in network order. Note the second byte of data might be in 1748 * the next mp. 1749 */ 1750 bp = mp->b_rptr + off; 1751 *out = *bp << 8; 1752 if (off + 1 == mpsize) { 1753 mp = mp->b_cont; 1754 bp = mp->b_rptr; 1755 } else { 1756 bp++; 1757 } 1758 1759 *out |= *bp; 1760 return (0); 1761 1762 } 1763 1764 static int 1765 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) 1766 { 1767 size_t off; 1768 uint16_t ether; 1769 uint8_t ipproto, iplen, l4len, maclen; 1770 1771 bzero(meoi, sizeof (mac_ether_offload_info_t)); 1772 1773 off = offsetof(struct ether_header, ether_type); 1774 if (i40e_meoi_get_uint16(mp, off, &ether) != 0) 1775 return (-1); 1776 1777 if (ether == ETHERTYPE_VLAN) { 1778 off = offsetof(struct ether_vlan_header, ether_type); 1779 if (i40e_meoi_get_uint16(mp, off, &ether) != 0) 1780 return (-1); 1781 meoi->meoi_flags |= MEOI_VLAN_TAGGED; 1782 maclen = sizeof (struct ether_vlan_header); 1783 } else { 1784 maclen = sizeof (struct ether_header); 1785 } 1786 meoi->meoi_flags |= MEOI_L2INFO_SET; 1787 meoi->meoi_l2hlen = maclen; 1788 meoi->meoi_l3proto = ether; 1789 1790 switch (ether) { 1791 case ETHERTYPE_IP: 1792 /* 1793 * For IPv4 we need to get the length of the header, as it can 1794 * be variable.
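 *
 * For example, a version-and-header-length byte of 0x45 (a typical,
 * hypothetical value) decodes below as version 4 with an IHL of 5:
 * masking with 0x0f and multiplying by 4 gives the common 20-byte
 * header, while 0x4f would give the 60-byte maximum.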
1795 */ 1796 off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen; 1797 if (i40e_meoi_get_uint8(mp, off, &iplen) != 0) 1798 return (-1); 1799 iplen &= 0x0f; 1800 if (iplen < 5 || iplen > 0x0f) 1801 return (-1); 1802 iplen *= 4; 1803 off = offsetof(ipha_t, ipha_protocol) + maclen; 1804 if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1) 1805 return (-1); 1806 break; 1807 case ETHERTYPE_IPV6: 1808 iplen = 40; 1809 off = offsetof(ip6_t, ip6_nxt) + maclen; 1810 if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1) 1811 return (-1); 1812 break; 1813 default: 1814 return (0); 1815 } 1816 meoi->meoi_l3hlen = iplen; 1817 meoi->meoi_l4proto = ipproto; 1818 meoi->meoi_flags |= MEOI_L3INFO_SET; 1819 1820 switch (ipproto) { 1821 case IPPROTO_TCP: 1822 off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen; 1823 if (i40e_meoi_get_uint8(mp, off, &l4len) == -1) 1824 return (-1); 1825 l4len = (l4len & 0xf0) >> 4; 1826 if (l4len < 5 || l4len > 0xf) 1827 return (-1); 1828 l4len *= 4; 1829 break; 1830 case IPPROTO_UDP: 1831 l4len = sizeof (struct udphdr); 1832 break; 1833 case IPPROTO_SCTP: 1834 l4len = sizeof (sctp_hdr_t); 1835 break; 1836 default: 1837 return (0); 1838 } 1839 1840 meoi->meoi_l4hlen = l4len; 1841 meoi->meoi_flags |= MEOI_L4INFO_SET; 1842 return (0); 1843 } 1844 1845 /* 1846 * Attempt to put together the information we'll need to feed into a descriptor 1847 * to properly program the hardware for checksum offload as well as the 1848 * generally required flags. 1849 * 1850 * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to 1851 * 'or' into the descriptor based on the checksum flags for this mblk_t and the 1852 * actual information we care about. 1853 * 1854 * If the mblk requires LSO then we'll also gather the information that will be 1855 * used to construct the Transmit Context Descriptor. 1856 */ 1857 static int 1858 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, 1859 mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx) 1860 { 1861 uint32_t chkflags, start, mss, lsoflags; 1862 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 1863 1864 bzero(tctx, sizeof (i40e_tx_context_t)); 1865 1866 if (i40e->i40e_tx_hcksum_enable != B_TRUE) 1867 return (0); 1868 1869 mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); 1870 mac_lso_get(mp, &mss, &lsoflags); 1871 1872 if (chkflags == 0 && lsoflags == 0) 1873 return (0); 1874 1875 /* 1876 * Have we been asked to checksum an IPv4 header? If so, verify that we 1877 * have sufficient information and then set the proper fields in the 1878 * command structure. 1879 */ 1880 if (chkflags & HCK_IPV4_HDRCKSUM) { 1881 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { 1882 txs->itxs_hck_nol2info.value.ui64++; 1883 return (-1); 1884 } 1885 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { 1886 txs->itxs_hck_nol3info.value.ui64++; 1887 return (-1); 1888 } 1889 if (meo->meoi_l3proto != ETHERTYPE_IP) { 1890 txs->itxs_hck_badl3.value.ui64++; 1891 return (-1); 1892 } 1893 tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; 1894 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << 1895 I40E_TX_DESC_LENGTH_MACLEN_SHIFT; 1896 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << 1897 I40E_TX_DESC_LENGTH_IPLEN_SHIFT; 1898 } 1899 1900 /* 1901 * We've been asked to provide an L4 checksum. First, set up the IP 1902 * information in the descriptor if we haven't already, before moving 1903 * on to checking whether we have enough information for the L4 checksum 1904 * offload.
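 *
 * To make the offset packing concrete (hypothetical frame): an untagged
 * IPv4/TCP packet with a 14-byte MAC header and a 20-byte IP header is
 * described to the hardware as MACLEN = 14 >> 1 = 7 and IPLEN = 20 >> 2
 * = 5 (2-byte and 4-byte units, respectively), and a 20-byte TCP header
 * later becomes L4LEN = 20 >> 2 = 5 in the same offsets field.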
1905 */ 1906 if (chkflags & HCK_PARTIALCKSUM) { 1907 if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) { 1908 txs->itxs_hck_nol4info.value.ui64++; 1909 return (-1); 1910 } 1911 1912 if (!(chkflags & HCK_IPV4_HDRCKSUM)) { 1913 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { 1914 txs->itxs_hck_nol2info.value.ui64++; 1915 return (-1); 1916 } 1917 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { 1918 txs->itxs_hck_nol3info.value.ui64++; 1919 return (-1); 1920 } 1921 1922 if (meo->meoi_l3proto == ETHERTYPE_IP) { 1923 tctx->itc_data_cmdflags |= 1924 I40E_TX_DESC_CMD_IIPT_IPV4; 1925 } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) { 1926 tctx->itc_data_cmdflags |= 1927 I40E_TX_DESC_CMD_IIPT_IPV6; 1928 } else { 1929 txs->itxs_hck_badl3.value.ui64++; 1930 return (-1); 1931 } 1932 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << 1933 I40E_TX_DESC_LENGTH_MACLEN_SHIFT; 1934 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << 1935 I40E_TX_DESC_LENGTH_IPLEN_SHIFT; 1936 } 1937 1938 switch (meo->meoi_l4proto) { 1939 case IPPROTO_TCP: 1940 tctx->itc_data_cmdflags |= 1941 I40E_TX_DESC_CMD_L4T_EOFT_TCP; 1942 break; 1943 case IPPROTO_UDP: 1944 tctx->itc_data_cmdflags |= 1945 I40E_TX_DESC_CMD_L4T_EOFT_UDP; 1946 break; 1947 case IPPROTO_SCTP: 1948 tctx->itc_data_cmdflags |= 1949 I40E_TX_DESC_CMD_L4T_EOFT_SCTP; 1950 break; 1951 default: 1952 txs->itxs_hck_badl4.value.ui64++; 1953 return (-1); 1954 } 1955 1956 tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) << 1957 I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; 1958 } 1959 1960 if (lsoflags & HW_LSO) { 1961 /* 1962 * LSO requires that checksum offloads are enabled. If for 1963 * some reason they're not we bail out with an error. 1964 */ 1965 if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 || 1966 (chkflags & HCK_PARTIALCKSUM) == 0) { 1967 txs->itxs_lso_nohck.value.ui64++; 1968 return (-1); 1969 } 1970 1971 tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; 1972 tctx->itc_ctx_mss = mss; 1973 tctx->itc_ctx_tsolen = msgsize(mp) - 1974 (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen); 1975 } 1976 1977 return (0); 1978 } 1979 1980 static void 1981 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb) 1982 { 1983 ASSERT(tcb != NULL); 1984 1985 mutex_enter(&itrq->itrq_tcb_lock); 1986 ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size); 1987 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb; 1988 itrq->itrq_tcb_free++; 1989 mutex_exit(&itrq->itrq_tcb_lock); 1990 } 1991 1992 static i40e_tx_control_block_t * 1993 i40e_tcb_alloc(i40e_trqpair_t *itrq) 1994 { 1995 i40e_tx_control_block_t *ret; 1996 1997 mutex_enter(&itrq->itrq_tcb_lock); 1998 if (itrq->itrq_tcb_free == 0) { 1999 mutex_exit(&itrq->itrq_tcb_lock); 2000 return (NULL); 2001 } 2002 2003 itrq->itrq_tcb_free--; 2004 ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free]; 2005 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL; 2006 mutex_exit(&itrq->itrq_tcb_lock); 2007 2008 ASSERT(ret != NULL); 2009 return (ret); 2010 } 2011 2012 /* 2013 * This should be used to free any DMA resources, associated mblk_t's, etc. It's 2014 * used as part of recycling the message blocks when we have either an interrupt 2015 * or other activity that indicates that we need to take a look. 
2016 */ 2017 static void 2018 i40e_tcb_reset(i40e_tx_control_block_t *tcb) 2019 { 2020 switch (tcb->tcb_type) { 2021 case I40E_TX_COPY: 2022 tcb->tcb_dma.dmab_len = 0; 2023 break; 2024 case I40E_TX_DMA: 2025 if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) 2026 (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); 2027 else if (tcb->tcb_bind_ncookies > 0) 2028 (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); 2029 if (tcb->tcb_bind_info != NULL) { 2030 kmem_free(tcb->tcb_bind_info, 2031 tcb->tcb_bind_ncookies * 2032 sizeof (struct i40e_dma_bind_info)); 2033 } 2034 tcb->tcb_bind_info = NULL; 2035 tcb->tcb_bind_ncookies = 0; 2036 tcb->tcb_used_lso = B_FALSE; 2037 break; 2038 case I40E_TX_DESC: 2039 break; 2040 case I40E_TX_NONE: 2041 /* Cast to pacify lint */ 2042 panic("trying to free tcb %p with bad type none", (void *)tcb); 2043 default: 2044 panic("unknown i40e tcb type: %d", tcb->tcb_type); 2045 } 2046 2047 tcb->tcb_type = I40E_TX_NONE; 2048 if (tcb->tcb_mp != NULL) { 2049 freemsg(tcb->tcb_mp); 2050 tcb->tcb_mp = NULL; 2051 } 2052 tcb->tcb_next = NULL; 2053 } 2054 2055 /* 2056 * This is called as part of shutting down to clean up all outstanding 2057 * descriptors. Similar to recycle, except we don't re-arm anything and instead 2058 * just return control blocks to the free list. 2059 */ 2060 void 2061 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) 2062 { 2063 uint32_t index; 2064 2065 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); 2066 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); 2067 2068 /* 2069 * Because we should have shut down the chip at this point, it should be 2070 * safe to just clean up all the entries between our head and tail. 2071 */ 2072 #ifdef DEBUG 2073 index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space, 2074 I40E_QTX_ENA(itrq->itrq_index)); 2075 VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK | 2076 I40E_QTX_ENA_QENA_STAT_MASK)); 2077 #endif 2078 2079 index = itrq->itrq_desc_head; 2080 while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) { 2081 i40e_tx_control_block_t *tcb; 2082 2083 tcb = itrq->itrq_tcb_work_list[index]; 2084 if (tcb != NULL) { 2085 itrq->itrq_tcb_work_list[index] = NULL; 2086 i40e_tcb_reset(tcb); 2087 i40e_tcb_free(itrq, tcb); 2088 } 2089 2090 bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); 2091 index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); 2092 itrq->itrq_desc_free++; 2093 } 2094 2095 ASSERT(index == itrq->itrq_desc_tail); 2096 itrq->itrq_desc_head = index; 2097 } 2098 2099 /* 2100 * We're here either by hook or by crook. We need to see if there are transmit 2101 * descriptors available for us to go and clean up and return to the hardware. 2102 * We may also be blocked, and if so, we should make sure that we let it know 2103 * we're good to go. 2104 */ 2105 void 2106 i40e_tx_recycle_ring(i40e_trqpair_t *itrq) 2107 { 2108 uint32_t wbhead, toclean, count; 2109 i40e_tx_control_block_t *tcbhead; 2110 i40e_t *i40e = itrq->itrq_i40e; 2111 uint_t desc_per_tcb, i; 2112 2113 mutex_enter(&itrq->itrq_tx_lock); 2114 2115 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); 2116 if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) { 2117 if (itrq->itrq_tx_blocked == B_TRUE) { 2118 itrq->itrq_tx_blocked = B_FALSE; 2119 mac_tx_ring_update(i40e->i40e_mac_hdl, 2120 itrq->itrq_mactxring); 2121 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++; 2122 } 2123 mutex_exit(&itrq->itrq_tx_lock); 2124 return; 2125 } 2126 2127 /* 2128 * Now we need to try and see if there's anything available. 
The driver 2129 * will write to the head location and it guarantees that it does not 2130 * use relaxed ordering. 2131 */ 2132 VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle, 2133 (uintptr_t)itrq->itrq_desc_wbhead, 2134 sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL)); 2135 2136 if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) != 2137 DDI_FM_OK) { 2138 mutex_exit(&itrq->itrq_tx_lock); 2139 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 2140 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 2141 return; 2142 } 2143 2144 wbhead = *itrq->itrq_desc_wbhead; 2145 toclean = itrq->itrq_desc_head; 2146 count = 0; 2147 tcbhead = NULL; 2148 2149 while (toclean != wbhead) { 2150 i40e_tx_control_block_t *tcb; 2151 2152 tcb = itrq->itrq_tcb_work_list[toclean]; 2153 itrq->itrq_tcb_work_list[toclean] = NULL; 2154 ASSERT(tcb != NULL); 2155 tcb->tcb_next = tcbhead; 2156 tcbhead = tcb; 2157 2158 /* 2159 * In the DMA bind case, there may not necessarily be a 1:1 2160 * mapping between tcb's and descriptors. If the tcb type 2161 * indicates a DMA binding then check the number of DMA 2162 * cookies to determine how many entries to clean in the 2163 * descriptor ring. 2164 */ 2165 if (tcb->tcb_type == I40E_TX_DMA) 2166 desc_per_tcb = tcb->tcb_bind_ncookies; 2167 else 2168 desc_per_tcb = 1; 2169 2170 for (i = 0; i < desc_per_tcb; i++) { 2171 /* 2172 * We zero this out for sanity purposes. 2173 */ 2174 bzero(&itrq->itrq_desc_ring[toclean], 2175 sizeof (i40e_tx_desc_t)); 2176 toclean = i40e_next_desc(toclean, 1, 2177 itrq->itrq_tx_ring_size); 2178 count++; 2179 } 2180 } 2181 2182 itrq->itrq_desc_head = wbhead; 2183 itrq->itrq_desc_free += count; 2184 itrq->itrq_txstat.itxs_recycled.value.ui64 += count; 2185 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); 2186 2187 if (itrq->itrq_tx_blocked == B_TRUE && 2188 itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) { 2189 itrq->itrq_tx_blocked = B_FALSE; 2190 2191 mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring); 2192 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++; 2193 } 2194 2195 mutex_exit(&itrq->itrq_tx_lock); 2196 2197 /* 2198 * Now clean up the tcb. 
2199 */ 2200 while (tcbhead != NULL) { 2201 i40e_tx_control_block_t *tcb = tcbhead; 2202 2203 tcbhead = tcb->tcb_next; 2204 i40e_tcb_reset(tcb); 2205 i40e_tcb_free(itrq, tcb); 2206 } 2207 2208 DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); 2209 } 2210 2211 static void 2212 i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp, 2213 const size_t off, const size_t len) 2214 { 2215 const void *soff = mp->b_rptr + off; 2216 void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; 2217 2218 ASSERT3U(len, >, 0); 2219 ASSERT3P(soff, >=, mp->b_rptr); 2220 ASSERT3P(soff, <=, mp->b_wptr); 2221 ASSERT3U(len, <=, MBLKL(mp)); 2222 ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr); 2223 ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len); 2224 bcopy(soff, doff, len); 2225 tcb->tcb_type = I40E_TX_COPY; 2226 tcb->tcb_dma.dmab_len += len; 2227 I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); 2228 } 2229 2230 static i40e_tx_control_block_t * 2231 i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, 2232 size_t off, boolean_t use_lso) 2233 { 2234 ddi_dma_handle_t dma_handle; 2235 ddi_dma_cookie_t dma_cookie; 2236 uint_t i = 0, ncookies = 0, dmaflags; 2237 i40e_tx_control_block_t *tcb; 2238 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2239 2240 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2241 txs->itxs_err_notcb.value.ui64++; 2242 return (NULL); 2243 } 2244 tcb->tcb_type = I40E_TX_DMA; 2245 2246 if (use_lso == B_TRUE) 2247 dma_handle = tcb->tcb_lso_dma_handle; 2248 else 2249 dma_handle = tcb->tcb_dma_handle; 2250 2251 dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; 2252 if (ddi_dma_addr_bind_handle(dma_handle, NULL, 2253 (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags, 2254 DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { 2255 txs->itxs_bind_fails.value.ui64++; 2256 goto bffail; 2257 } 2258 2259 tcb->tcb_bind_ncookies = ncookies; 2260 tcb->tcb_used_lso = use_lso; 2261 2262 tcb->tcb_bind_info = 2263 kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), 2264 KM_NOSLEEP); 2265 if (tcb->tcb_bind_info == NULL) 2266 goto bffail; 2267 2268 while (i < ncookies) { 2269 if (i > 0) 2270 ddi_dma_nextcookie(dma_handle, &dma_cookie); 2271 2272 tcb->tcb_bind_info[i].dbi_paddr = 2273 (caddr_t)dma_cookie.dmac_laddress; 2274 tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; 2275 } 2276 2277 return (tcb); 2278 2279 bffail: 2280 i40e_tcb_reset(tcb); 2281 i40e_tcb_free(itrq, tcb); 2282 return (NULL); 2283 } 2284 2285 static void 2286 i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, 2287 caddr_t buff, size_t len, boolean_t last_desc) 2288 { 2289 i40e_tx_desc_t *txdesc; 2290 int cmd; 2291 2292 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); 2293 itrq->itrq_desc_free--; 2294 txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; 2295 itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, 2296 itrq->itrq_tx_ring_size); 2297 2298 cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; 2299 2300 /* 2301 * The last data descriptor needs the EOP bit set, so that the HW knows 2302 * that we're ready to send. Additionally, we set the RS (Report 2303 * Status) bit, so that we are notified when the transmit engine has 2304 * completed DMA'ing all of the data descriptors and data buffers 2305 * associated with this frame. 
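 *
 * As a rough, hypothetical sketch of the end result: a 1514-byte frame
 * that fits in a single buffer and needs no offloads is described by one
 * descriptor whose command bits are I40E_TX_DESC_CMD_ICRC |
 * I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS and whose buffer-size field
 * is 1514, all packed into cmd_type_offset_bsz below along with
 * I40E_TX_DESC_DTYPE_DATA.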
2306 */ 2307 if (last_desc == B_TRUE) { 2308 cmd |= I40E_TX_DESC_CMD_EOP; 2309 cmd |= I40E_TX_DESC_CMD_RS; 2310 } 2311 2312 /* 2313 * Per the X710 manual, section 8.4.2.1.1, the buffer size 2314 * must be a value from 1 to 16K minus 1, inclusive. 2315 */ 2316 ASSERT3U(len, >=, 1); 2317 ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ); 2318 2319 txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff); 2320 txdesc->cmd_type_offset_bsz = 2321 LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | 2322 ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | 2323 ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | 2324 ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); 2325 } 2326 2327 /* 2328 * Place 'tcb' on the tail of the list represented by 'head'/'tail'. 2329 */ 2330 static inline void 2331 tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail, 2332 i40e_tx_control_block_t *tcb) 2333 { 2334 if (*head == NULL) { 2335 *head = tcb; 2336 *tail = *head; 2337 } else { 2338 ASSERT3P(*tail, !=, NULL); 2339 ASSERT3P((*tail)->tcb_next, ==, NULL); 2340 (*tail)->tcb_next = tcb; 2341 *tail = tcb; 2342 } 2343 } 2344 2345 /* 2346 * This function takes a single packet, possibly consisting of 2347 * multiple mblks, and creates a TCB chain to send to the controller. 2348 * This TCB chain may span up to a maximum of 8 descriptors. A copy 2349 * TCB consumes one descriptor, whereas a DMA TCB may consume 1 or 2350 * more, depending on several factors. For each fragment (individual 2351 * mblk making up the packet), we determine if its size dictates a 2352 * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a 2353 * count of descriptors used; when that count reaches the max we force 2354 * all remaining fragments into a single TCB buffer. We have a 2355 * guarantee that the TCB buffer is always larger than the MTU -- so 2356 * there is always enough room. Consecutive fragments below the DMA 2357 * threshold are copied into a single TCB. In the event of an error 2358 * this function returns NULL but leaves 'mp' alone. 2359 */ 2360 static i40e_tx_control_block_t * 2361 i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc) 2362 { 2363 const mblk_t *nmp = mp; 2364 uint_t needed_desc = 0; 2365 boolean_t force_copy = B_FALSE; 2366 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; 2367 i40e_t *i40e = itrq->itrq_i40e; 2368 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2369 2370 /* TCB buffer is always larger than MTU. */ 2371 ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size); 2372 2373 while (nmp != NULL) { 2374 const size_t nmp_len = MBLKL(nmp); 2375 2376 /* Ignore zero-length mblks. */ 2377 if (nmp_len == 0) { 2378 nmp = nmp->b_cont; 2379 continue; 2380 } 2381 2382 if (nmp_len < i40e->i40e_tx_dma_min || force_copy) { 2383 /* Compress consecutive copies into one TCB. */ 2384 if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) { 2385 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); 2386 nmp = nmp->b_cont; 2387 continue; 2388 } 2389 2390 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2391 txs->itxs_err_notcb.value.ui64++; 2392 goto fail; 2393 } 2394 2395 /* 2396 * TCB DMA buffer is guaranteed to be one 2397 * cookie by i40e_alloc_dma_buffer().
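 *
 * This is also why each copy TCB costs exactly one descriptor. As a
 * hypothetical illustration of the accounting in this function: a chain
 * made of a 42-byte mblk, a 1448-byte mblk whose bind yields two
 * cookies, and a 60-byte mblk consumes a copy TCB, a DMA TCB, and
 * another copy TCB for a total of four descriptors, assuming both small
 * fragments fall below i40e_tx_dma_min.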
2398 */ 2399 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); 2400 needed_desc++; 2401 tcb_list_append(&tcbhead, &tcbtail, tcb); 2402 } else { 2403 uint_t total_desc; 2404 2405 tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE); 2406 if (tcb == NULL) { 2407 i40e_error(i40e, "dma bind failed!"); 2408 goto fail; 2409 } 2410 2411 /* 2412 * If the new total exceeds the max or we've 2413 * reached the limit and there's data left, 2414 * then give up binding and copy the rest into 2415 * the pre-allocated TCB buffer. 2416 */ 2417 total_desc = needed_desc + tcb->tcb_bind_ncookies; 2418 if ((total_desc > I40E_TX_MAX_COOKIE) || 2419 (total_desc == I40E_TX_MAX_COOKIE && 2420 nmp->b_cont != NULL)) { 2421 i40e_tcb_reset(tcb); 2422 i40e_tcb_free(itrq, tcb); 2423 2424 if (tcbtail != NULL && 2425 tcbtail->tcb_type == I40E_TX_COPY) { 2426 tcb = tcbtail; 2427 } else { 2428 tcb = NULL; 2429 } 2430 2431 force_copy = B_TRUE; 2432 txs->itxs_force_copy.value.ui64++; 2433 continue; 2434 } 2435 2436 needed_desc += tcb->tcb_bind_ncookies; 2437 tcb_list_append(&tcbhead, &tcbtail, tcb); 2438 } 2439 2440 nmp = nmp->b_cont; 2441 } 2442 2443 ASSERT3P(nmp, ==, NULL); 2444 ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE); 2445 ASSERT3P(tcbhead, !=, NULL); 2446 *ndesc += needed_desc; 2447 return (tcbhead); 2448 2449 fail: 2450 tcb = tcbhead; 2451 while (tcb != NULL) { 2452 i40e_tx_control_block_t *next = tcb->tcb_next; 2453 2454 ASSERT(tcb->tcb_type == I40E_TX_DMA || 2455 tcb->tcb_type == I40E_TX_COPY); 2456 2457 tcb->tcb_mp = NULL; 2458 i40e_tcb_reset(tcb); 2459 i40e_tcb_free(itrq, tcb); 2460 tcb = next; 2461 } 2462 2463 return (NULL); 2464 } 2465 2466 /* 2467 * Section 8.4.1 of the 700-series programming guide states that a 2468 * segment may span up to 8 data descriptors; including both header 2469 * and payload data. However, empirical evidence shows that the 2470 * controller freezes the Tx queue when presented with a segment of 8 2471 * descriptors. Or, at least, when the first segment contains 8 2472 * descriptors. One explanation is that the controller counts the 2473 * context descriptor against the first segment, even though the 2474 * programming guide makes no mention of such a constraint. In any 2475 * case, we limit TSO segments to 7 descriptors to prevent Tx queue 2476 * freezes. We still allow non-TSO segments to utilize all 8 2477 * descriptors as they have not demonstrated the faulty behavior. 2478 */ 2479 uint_t i40e_lso_num_descs = 7; 2480 2481 #define I40E_TCB_LEFT(tcb) \ 2482 ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len) 2483 2484 /* 2485 * This function is similar in spirit to i40e_non_lso_chain(), but 2486 * much more complicated in reality. Like the previous function, it 2487 * takes a packet (an LSO packet) as input and returns a chain of 2488 * TCBs. The complication comes with the fact that we are no longer 2489 * trying to fit the entire packet into 8 descriptors, but rather we 2490 * must fit each MSS-size segment of the LSO packet into 8 descriptors. 2491 * Except it's really 7 descriptors, see i40e_lso_num_descs. 2492 * 2493 * Your first inclination might be to verify that a given segment 2494 * spans no more than 7 mblks; but it's actually much more subtle than 2495 * that. First, let's describe what the hardware expects, and then we 2496 * can expound on the software side of things. 2497 * 2498 * For an LSO packet the hardware expects the following: 2499 * 2500 * o Each MSS-sized segment must span no more than 7 descriptors. 
2501 * 2502 * o The header size does not count towards the segment size. 2503 * 2504 * o If header and payload share the first descriptor, then the 2505 * controller will count the descriptor twice. 2506 * 2507 * The most important thing to keep in mind is that the hardware does 2508 * not view the segments in terms of mblks, like we do. The hardware 2509 * only sees descriptors. It will iterate each descriptor in turn, 2510 * keeping a tally of bytes seen and descriptors visited. If the byte 2511 * count hasn't reached MSS by the time the descriptor count reaches 2512 * 7, then the controller freezes the queue and we are stuck. 2513 * Furthermore, the hardware picks up its tally where it left off. So 2514 * if it reached MSS in the middle of a descriptor, it will start 2515 * tallying the next segment in the middle of that descriptor. The 2516 * hardware's view is entirely removed from the mblk chain or even the 2517 * descriptor layout. Consider these facts: 2518 * 2519 * o The MSS will vary dpeneding on MTU and other factors. 2520 * 2521 * o The dblk allocation will sit at various offsets within a 2522 * memory page. 2523 * 2524 * o The page size itself could vary in the future (i.e. not 2525 * always 4K). 2526 * 2527 * o Just because a dblk is virtually contiguous doesn't mean 2528 * it's physically contiguous. The number of cookies 2529 * (descriptors) required by a DMA bind of a single dblk is at 2530 * the mercy of the page size and physical layout. 2531 * 2532 * o The descriptors will most often NOT start/end on a MSS 2533 * boundary. Thus the hardware will often start counting the 2534 * MSS mid descriptor and finish mid descriptor. 2535 * 2536 * The upshot of all this is that the driver must learn to think like 2537 * the controller; and verify that none of the constraints are broken. 2538 * It does this by tallying up the segment just like the hardware 2539 * would. This is handled by the two variables 'segsz' and 'segdesc'. 2540 * After each attempt to bind a dblk, we check the constaints. If 2541 * violated, we undo the DMA and force a copy until MSS is met. We 2542 * have a guarantee that the TCB buffer is larger than MTU; thus 2543 * ensuring we can always meet the MSS with a single copy buffer. We 2544 * also copy consecutive non-DMA fragments into the same TCB buffer. 2545 */ 2546 static i40e_tx_control_block_t * 2547 i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp, 2548 const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx, 2549 uint_t *ndesc) 2550 { 2551 size_t mp_len = MBLKL(mp); 2552 /* 2553 * The cpoff (copy offset) variable tracks the offset inside 2554 * the current mp. There are cases where the entire mp is not 2555 * fully copied in one go: such as the header copy followed by 2556 * a non-DMA mblk, or a TCB buffer that only has enough space 2557 * to copy part of the current mp. 2558 */ 2559 size_t cpoff = 0; 2560 /* 2561 * The segsz and segdesc variables track the controller's view 2562 * of the segment. The needed_desc variable tracks the total 2563 * number of data descriptors used by the driver. 
2564 */ 2565 size_t segsz = 0; 2566 uint_t segdesc = 0; 2567 uint_t needed_desc = 0; 2568 size_t hdrcopied = 0; 2569 const size_t hdrlen = 2570 meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen; 2571 const size_t mss = tctx->itc_ctx_mss; 2572 boolean_t force_copy = B_FALSE; 2573 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; 2574 i40e_t *i40e = itrq->itrq_i40e; 2575 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2576 2577 /* 2578 * We always copy the header in order to avoid more 2579 * complicated code dealing with various edge cases. 2580 */ 2581 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2582 txs->itxs_err_notcb.value.ui64++; 2583 goto fail; 2584 } 2585 2586 needed_desc++; 2587 tcb_list_append(&tcbhead, &tcbtail, tcb); 2588 2589 while (hdrcopied < hdrlen) { 2590 const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len); 2591 i40e_tx_copy_fragment(tcb, mp, 0, tocopy); 2592 hdrcopied += tocopy; 2593 cpoff += tocopy; 2594 if (tocopy == mp_len) { 2595 /* 2596 * This is a bit of defensive programming. We 2597 * should never have a chain too short to 2598 * satisfy the headers -- but just in case. 2599 */ 2600 if ((mp = mp->b_cont) == NULL) { 2601 txs->itxs_tx_short.value.ui64++; 2602 goto fail; 2603 } 2604 2605 while ((mp_len = MBLKL(mp)) == 0) { 2606 if ((mp = mp->b_cont) == NULL) { 2607 txs->itxs_tx_short.value.ui64++; 2608 goto fail; 2609 } 2610 } 2611 cpoff = 0; 2612 } 2613 } 2614 ASSERT3U(hdrcopied, ==, hdrlen); 2615 2616 /* 2617 * A single descriptor containing both header and data is 2618 * counted twice by the controller. 2619 */ 2620 if (mp_len < i40e->i40e_tx_dma_min) { 2621 segdesc = 2; 2622 } else { 2623 segdesc = 1; 2624 } 2625 2626 while (mp != NULL) { 2627 mp_len = MBLKL(mp); 2628 force_copy: 2629 /* Ignore zero-length mblks. */ 2630 if (mp_len == 0) { 2631 mp = mp->b_cont; 2632 cpoff = 0; 2633 continue; 2634 } 2635 2636 /* 2637 * We copy into the preallocated TCB buffer when the 2638 * current fragment is less than the DMA threshold OR 2639 * when the DMA bind can't meet the controller's 2640 * segment descriptor limit. 2641 */ 2642 if (mp_len < i40e->i40e_tx_dma_min || force_copy) { 2643 size_t tocopy; 2644 2645 /* 2646 * Our objective here is to compress 2647 * consecutive copies into one TCB (until it 2648 * is full). If there is no current TCB, or if 2649 * it is a DMA TCB, then allocate a new one. 2650 */ 2651 if (tcb == NULL || 2652 (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) { 2653 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { 2654 txs->itxs_err_notcb.value.ui64++; 2655 goto fail; 2656 } 2657 2658 /* 2659 * The TCB DMA buffer is guaranteed to 2660 * be one cookie by i40e_alloc_dma_buffer(). 2661 */ 2662 needed_desc++; 2663 segdesc++; 2664 ASSERT3U(segdesc, <=, i40e_lso_num_descs); 2665 tcb_list_append(&tcbhead, &tcbtail, tcb); 2666 } else if (segdesc == 0) { 2667 /* 2668 * We are copying into an existing TCB 2669 * but we just crossed the MSS 2670 * boundary. Make sure to increment 2671 * segdesc to track the descriptor 2672 * count as the hardware would. 2673 */ 2674 segdesc++; 2675 } 2676 2677 tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff); 2678 i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy); 2679 cpoff += tocopy; 2680 segsz += tocopy; 2681 2682 /* We have consumed the current mp. */ 2683 if (cpoff == mp_len) { 2684 mp = mp->b_cont; 2685 cpoff = 0; 2686 } 2687 2688 /* We have consumed the current TCB buffer. 
*/ 2689 if (I40E_TCB_LEFT(tcb) == 0) { 2690 tcb = NULL; 2691 } 2692 2693 /* 2694 * We have met MSS with this copy; restart the 2695 * counters. 2696 */ 2697 if (segsz >= mss) { 2698 segsz = segsz % mss; 2699 segdesc = segsz == 0 ? 0 : 1; 2700 force_copy = B_FALSE; 2701 } 2702 2703 /* 2704 * We are at the controller's descriptor 2705 * limit; we must copy into the current TCB 2706 * until MSS is reached. The TCB buffer is 2707 * always bigger than the MTU so we know it is 2708 * big enough to meet the MSS. 2709 */ 2710 if (segdesc == i40e_lso_num_descs) { 2711 force_copy = B_TRUE; 2712 } 2713 } else { 2714 uint_t tsegdesc = segdesc; 2715 size_t tsegsz = segsz; 2716 2717 ASSERT(force_copy == B_FALSE); 2718 ASSERT3U(tsegdesc, <, i40e_lso_num_descs); 2719 2720 tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE); 2721 if (tcb == NULL) { 2722 i40e_error(i40e, "dma bind failed!"); 2723 goto fail; 2724 } 2725 2726 for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) { 2727 struct i40e_dma_bind_info dbi = 2728 tcb->tcb_bind_info[i]; 2729 2730 tsegsz += dbi.dbi_len; 2731 tsegdesc++; 2732 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); 2733 2734 /* 2735 * We've met the MSS with this portion 2736 * of the DMA. 2737 */ 2738 if (tsegsz >= mss) { 2739 tsegsz = tsegsz % mss; 2740 tsegdesc = tsegsz == 0 ? 0 : 1; 2741 } 2742 2743 /* 2744 * We've reached max descriptors but 2745 * have not met the MSS. Undo the bind 2746 * and instead copy. 2747 */ 2748 if (tsegdesc == i40e_lso_num_descs) { 2749 i40e_tcb_reset(tcb); 2750 i40e_tcb_free(itrq, tcb); 2751 2752 if (tcbtail != NULL && 2753 I40E_TCB_LEFT(tcbtail) > 0 && 2754 tcbtail->tcb_type == I40E_TX_COPY) { 2755 tcb = tcbtail; 2756 } else { 2757 tcb = NULL; 2758 } 2759 2760 /* 2761 * Remember, we are still on 2762 * the same mp. 2763 */ 2764 force_copy = B_TRUE; 2765 txs->itxs_tso_force_copy.value.ui64++; 2766 goto force_copy; 2767 } 2768 } 2769 2770 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); 2771 ASSERT3U(tsegsz, <, mss); 2772 2773 /* 2774 * We've made it through the loop without 2775 * breaking the segment descriptor contract 2776 * with the controller -- replace the segment 2777 * tracking values with the temporary ones. 2778 */ 2779 segdesc = tsegdesc; 2780 segsz = tsegsz; 2781 needed_desc += tcb->tcb_bind_ncookies; 2782 cpoff = 0; 2783 tcb_list_append(&tcbhead, &tcbtail, tcb); 2784 mp = mp->b_cont; 2785 } 2786 } 2787 2788 ASSERT3P(mp, ==, NULL); 2789 ASSERT3P(tcbhead, !=, NULL); 2790 *ndesc += needed_desc; 2791 return (tcbhead); 2792 2793 fail: 2794 tcb = tcbhead; 2795 while (tcb != NULL) { 2796 i40e_tx_control_block_t *next = tcb->tcb_next; 2797 2798 ASSERT(tcb->tcb_type == I40E_TX_DMA || 2799 tcb->tcb_type == I40E_TX_COPY); 2800 2801 tcb->tcb_mp = NULL; 2802 i40e_tcb_reset(tcb); 2803 i40e_tcb_free(itrq, tcb); 2804 tcb = next; 2805 } 2806 2807 return (NULL); 2808 } 2809 2810 /* 2811 * We've been asked to send a message block on the wire. We'll only have a 2812 * single chain. There will not be any b_next pointers; however, there may be 2813 * multiple b_cont blocks. The number of b_cont blocks may exceed the 2814 * controller's Tx descriptor limit. 2815 * 2816 * We may do one of three things with any given mblk_t chain: 2817 * 2818 * 1) Drop it 2819 * 2) Transmit it 2820 * 3) Return it 2821 * 2822 * If we return it to MAC, then MAC will flow control on our behalf. In other 2823 * words, it won't send us anything until we tell it that it's okay to send us 2824 * something.
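 *
 * As a hypothetical sketch of option 3: if a frame needs more
 * descriptors than we currently have free, we return the mblk_t
 * untouched and mark itrq_tx_blocked; i40e_tx_recycle_ring() later calls
 * mac_tx_ring_update() once enough descriptors have been reclaimed, and
 * MAC can then resend the chain it was asked to hold.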
2825 */ 2826 mblk_t * 2827 i40e_ring_tx(void *arg, mblk_t *mp) 2828 { 2829 size_t msglen; 2830 i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL; 2831 i40e_tx_context_desc_t *ctxdesc; 2832 mac_ether_offload_info_t meo; 2833 i40e_tx_context_t tctx; 2834 int type; 2835 uint_t needed_desc = 0; 2836 boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE; 2837 2838 i40e_trqpair_t *itrq = arg; 2839 i40e_t *i40e = itrq->itrq_i40e; 2840 i40e_hw_t *hw = &i40e->i40e_hw_space; 2841 i40e_txq_stat_t *txs = &itrq->itrq_txstat; 2842 2843 ASSERT(mp->b_next == NULL); 2844 2845 if (!(i40e->i40e_state & I40E_STARTED) || 2846 (i40e->i40e_state & I40E_OVERTEMP) || 2847 (i40e->i40e_state & I40E_SUSPENDED) || 2848 (i40e->i40e_state & I40E_ERROR) || 2849 (i40e->i40e_link_state != LINK_STATE_UP)) { 2850 freemsg(mp); 2851 return (NULL); 2852 } 2853 2854 if (mac_ether_offload_info(mp, &meo) != 0) { 2855 freemsg(mp); 2856 itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++; 2857 return (NULL); 2858 } 2859 2860 /* 2861 * Figure out the relevant context about this frame that we might need 2862 * for enabling checksum, LSO, etc. This also fills in information that 2863 * we might set around the packet type, etc. 2864 */ 2865 if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) { 2866 freemsg(mp); 2867 itrq->itrq_txstat.itxs_err_context.value.ui64++; 2868 return (NULL); 2869 } 2870 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { 2871 use_lso = B_TRUE; 2872 do_ctx_desc = B_TRUE; 2873 } 2874 2875 /* 2876 * For the primordial driver we can punt on doing any recycling right 2877 * now; however, longer term we need to probably do some more pro-active 2878 * recycling to cut back on stalls in the TX path. 2879 */ 2880 2881 msglen = msgsize(mp); 2882 2883 if (do_ctx_desc) { 2884 /* 2885 * If we're doing tunneling or LSO, then we'll need a TX 2886 * context descriptor in addition to one or more TX data 2887 * descriptors. Since there's no data DMA block or handle 2888 * associated with the context descriptor, we create a special 2889 * control block that behaves effectively like a NOP. 2890 */ 2891 if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { 2892 txs->itxs_err_notcb.value.ui64++; 2893 goto txfail; 2894 } 2895 tcb_ctx->tcb_type = I40E_TX_DESC; 2896 needed_desc++; 2897 } 2898 2899 if (!use_lso) { 2900 tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc); 2901 } else { 2902 tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc); 2903 } 2904 2905 if (tcbhead == NULL) 2906 goto txfail; 2907 2908 tcbhead->tcb_mp = mp; 2909 2910 /* 2911 * The second condition ensures that 'itrq_desc_tail' never 2912 * equals 'itrq_desc_head'. This enforces the rule found in 2913 * the second bullet point of section 8.4.3.1.5 of the XL710 2914 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should 2915 * never overlap with the head. This means that we only ever 2916 * have 'itrq_tx_ring_size - 1' total available descriptors. 2917 */ 2918 mutex_enter(&itrq->itrq_tx_lock); 2919 if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || 2920 (itrq->itrq_desc_free - 1) < needed_desc) { 2921 txs->itxs_err_nodescs.value.ui64++; 2922 mutex_exit(&itrq->itrq_tx_lock); 2923 goto txfail; 2924 } 2925 2926 if (do_ctx_desc) { 2927 /* 2928 * If we're enabling any offloads for this frame, then we'll 2929 * need to build up a transmit context descriptor, first. The 2930 * context descriptor needs to be placed in the TX ring before 2931 * the data descriptor(s). 
See section 8.4.2, table 8-16 2932 */ 2933 uint_t tail = itrq->itrq_desc_tail; 2934 itrq->itrq_desc_free--; 2935 ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; 2936 itrq->itrq_tcb_work_list[tail] = tcb_ctx; 2937 itrq->itrq_desc_tail = i40e_next_desc(tail, 1, 2938 itrq->itrq_tx_ring_size); 2939 2940 /* QW0 */ 2941 type = I40E_TX_DESC_DTYPE_CONTEXT; 2942 ctxdesc->tunneling_params = 0; 2943 ctxdesc->l2tag2 = 0; 2944 2945 /* QW1 */ 2946 ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); 2947 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { 2948 ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) 2949 ((uint64_t)tctx.itc_ctx_cmdflags << 2950 I40E_TXD_CTX_QW1_CMD_SHIFT) | 2951 ((uint64_t)tctx.itc_ctx_tsolen << 2952 I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | 2953 ((uint64_t)tctx.itc_ctx_mss << 2954 I40E_TXD_CTX_QW1_MSS_SHIFT)); 2955 } 2956 } 2957 2958 tcb = tcbhead; 2959 while (tcb != NULL) { 2960 2961 itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; 2962 if (tcb->tcb_type == I40E_TX_COPY) { 2963 boolean_t last_desc = (tcb->tcb_next == NULL); 2964 2965 i40e_tx_set_data_desc(itrq, &tctx, 2966 (caddr_t)tcb->tcb_dma.dmab_dma_address, 2967 tcb->tcb_dma.dmab_len, last_desc); 2968 } else { 2969 boolean_t last_desc = B_FALSE; 2970 ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA); 2971 2972 for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) { 2973 last_desc = (c == tcb->tcb_bind_ncookies - 1) && 2974 (tcb->tcb_next == NULL); 2975 2976 i40e_tx_set_data_desc(itrq, &tctx, 2977 tcb->tcb_bind_info[c].dbi_paddr, 2978 tcb->tcb_bind_info[c].dbi_len, 2979 last_desc); 2980 } 2981 } 2982 2983 tcb = tcb->tcb_next; 2984 } 2985 2986 /* 2987 * Now, finally, sync the DMA data and alert hardware. 2988 */ 2989 I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV); 2990 2991 I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), 2992 itrq->itrq_desc_tail); 2993 2994 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != 2995 DDI_FM_OK) { 2996 /* 2997 * Note, we can't really go through and clean this up very well, 2998 * because the memory has been given to the device, so just 2999 * indicate it's been transmitted. 3000 */ 3001 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 3002 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 3003 } 3004 3005 txs->itxs_bytes.value.ui64 += msglen; 3006 txs->itxs_packets.value.ui64++; 3007 txs->itxs_descriptors.value.ui64 += needed_desc; 3008 3009 mutex_exit(&itrq->itrq_tx_lock); 3010 3011 return (NULL); 3012 3013 txfail: 3014 /* 3015 * We ran out of resources. Return it to MAC and indicate that we'll 3016 * need to signal MAC. If there are allocated tcb's, return them now. 3017 * Make sure to reset their message block's, since we'll return them 3018 * back to MAC. 3019 */ 3020 if (tcb_ctx != NULL) { 3021 tcb_ctx->tcb_mp = NULL; 3022 i40e_tcb_reset(tcb_ctx); 3023 i40e_tcb_free(itrq, tcb_ctx); 3024 } 3025 3026 tcb = tcbhead; 3027 while (tcb != NULL) { 3028 i40e_tx_control_block_t *next = tcb->tcb_next; 3029 3030 ASSERT(tcb->tcb_type == I40E_TX_DMA || 3031 tcb->tcb_type == I40E_TX_COPY); 3032 3033 tcb->tcb_mp = NULL; 3034 i40e_tcb_reset(tcb); 3035 i40e_tcb_free(itrq, tcb); 3036 tcb = next; 3037 } 3038 3039 mutex_enter(&itrq->itrq_tx_lock); 3040 itrq->itrq_tx_blocked = B_TRUE; 3041 mutex_exit(&itrq->itrq_tx_lock); 3042 3043 return (mp); 3044 } 3045