1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. 14 * Copyright 2016 Joyent, Inc. 15 */ 16 17 #include "i40e_sw.h" 18 19 /* 20 * --------------------------------------------------------- 21 * Buffer and Memory Management, Receiving, and Transmitting 22 * --------------------------------------------------------- 23 * 24 * Each physical function (PF), which is what we think of as an instance of the 25 * device driver, has a series of associated transmit and receive queue pairs. 26 * Effectively, what we think of in MAC as rings. Each of these has their own 27 * ring of descriptors which is used as part of doing DMA activity. 28 * 29 * The transmit ring of descriptors are 16-byte entries which are used to send 30 * packets, program filters, etc. The receive ring of descriptors are either 31 * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor 32 * format so that we're in a better position if we ever want to leverage that 33 * information later on. 34 * 35 * However, these rings are just for descriptors, they don't talk or deal with 36 * how we actually store the memory that we need for DMA or the associated 37 * information that we need for keeping track of message blocks. To correspond 38 * to the hardware descriptor ring which is how we communicate with hardware, we 39 * introduce a control block which keeps track of our required metadata like DMA 40 * mappings. 41 * 42 * There are two main considerations that dictate how much memory and buffers 43 * we end up allocating. Those are: 44 * 45 * o The size of the ring (controlled through the driver.conf file) 46 * 47 * o The maximum size frame we can receive. 48 * 49 * The size of the rings currently defaults to 1024 descriptors and is stored in 50 * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size. 51 * 52 * While the size of the rings is controlled by the driver.conf, the maximum 53 * size frame is informed primarily through the use of dladm and the setting of 54 * the MTU property on the device. From the MTU, we then go and do some 55 * machinations. The first thing we do is we then have to add in space for the 56 * Ethernet header, potentially a VLAN header, and the FCS check. This value is 57 * what's stored as i40e_t`i40e_frame_max and is derived any time 58 * i40e_t`i40e_sdu changes. 59 * 60 * This size is then rounded up to the nearest 1k chunk, which represents the 61 * actual amount of memory that we'll allocate for a single frame. 62 * 63 * Note, that for rx, we do something that might be unexpected. We always add 64 * an extra two bytes to the frame size that we allocate. We then offset the DMA 65 * address that we receive a packet into by two bytes. This ensures that the IP 66 * header will always be 4 byte aligned because the MAC header is either 14 or 67 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's 68 * and MAC's lives easier. 
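 *
 * As a rough sketch of that arithmetic (illustrative only; the authoritative
 * computation lives in the driver's MTU handling code, and the local names
 * used here are made up for the example, though I40E_BUF_IPHDR_ALIGNMENT is
 * the real constant used later in this file), the sizing works out to
 * something like:
 *
 *	size_t frame_max, buf_size;
 *
 *	frame_max = sdu + sizeof (struct ether_vlan_header) + ETHERFCSL;
 *	buf_size = P2ROUNDUP(frame_max + I40E_BUF_IPHDR_ALIGNMENT, 1024);
 *
 * and then, when an rx buffer is set up, its addresses are shifted by those
 * two bytes so the IP header lands on a 4-byte boundary:
 *
 *	dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
 *	dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;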
69 * 70 * Both the rx and tx descriptor rings (which are what we use to communicate 71 * with hardware) are allocated as a single region of DMA memory which is the 72 * size of the descriptor (32 bytes and 16 bytes, respectively) times the total 73 * number of descriptors for an rx and tx ring. 74 * 75 * While the rx and tx descriptors are allocated using DMA-based memory, the 76 * control blocks for each of them are allocated using normal kernel memory. 77 * They aren't special from a DMA perspective. We'll go over the design of both 78 * receiving and transmitting separately, as they have slightly different 79 * control blocks and different ways that we manage the relationship between 80 * control blocks and descriptors. 81 * 82 * --------------------------------- 83 * RX Descriptors and Control Blocks 84 * --------------------------------- 85 * 86 * For every descriptor in the ring that the driver has, we need some associated 87 * memory, which means that we need to have a receive-specific control block. 88 * We have a couple different, but related goals: 89 * 90 * o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do 91 * not want to do any additional memory allocations or DMA allocations if 92 * we don't have to. 93 * 94 * o We'd like to try and do as much zero-copy as possible, while taking into 95 * account the cost of mapping in DMA resources. 96 * 97 * o We'd like to have every receive descriptor available. 98 * 99 * Now, these rules are a bit in tension with one another. The act of mapping in 100 * DMA memory is an exercise in trying to find the break-even point between page table 101 * updates and bcopy. We currently start by using the same metrics that ixgbe 102 * used; however, it should be known that this value has effectively been 103 * cargo-culted across to yet another driver, sorry. 104 * 105 * If we receive a packet which is larger than our copy threshold, we'll create 106 * a message block out of the DMA memory via desballoc(9F) and send that up to 107 * MAC that way. This will cause us to be notified when the message block is 108 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if 109 * it's less than the threshold, we'll try to use allocb and bcopy it into the 110 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug 111 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override 112 * the behavior and always do a bcopy or a DMA bind. 113 * 114 * To try and ensure that the device always has blocks that it can receive data 115 * into, we maintain two lists of control blocks, a working list and a free 116 * list. Each list is sized equal to the number of descriptors in the rx ring. 117 * During the GLDv3 mc_start routine, we allocate a number of rx control blocks 118 * equal to twice the number of descriptors in the ring and we assign them 119 * equally to the free list and to the working list. Each control block also has 120 * DMA memory allocated and associated with it, which will be used to receive the 121 * actual packet data. All of a received frame's data will end up in a single 122 * DMA buffer. 123 * 124 * During operation, we always maintain the invariant that each rx descriptor 125 * has an associated rx control block which lives in the working list. If we 126 * feel that we should loan up DMA memory to MAC in the form of a message block, 127 * we can only do so if we can maintain this invariant. To do that, we swap in 128 * one of the buffers from the free list.
If none are available, then we resort 129 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the 130 * size. 131 * 132 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is 133 * called on the block, at which point we restore the rx control block to the 134 * free list and are able to reuse the DMA memory again. While the scheme may 135 * seem odd, it importantly keeps us out of trying to do any DMA allocations in 136 * the normal path of operation, even though we may still have to allocate 137 * message blocks and copy. 138 * 139 * The following state machine describes the lifetime of an rx control block. In 140 * the diagram we abbreviate the rx ring descriptor entry as rxd and the rx 141 * control block entry as rcb. 142 * 143 * | | 144 * * ... 1/2 of all initial rcb's ... * 145 * | | 146 * v v 147 * +------------------+ +------------------+ 148 * | rcb on free list |---*---------->| rcb on work list | 149 * +------------------+ . +------------------+ 150 * ^ . moved to | 151 * | replace rcb * . . Frame received, 152 * | loaned to | entry on free list 153 * | MAC + co. | available. rcb's 154 * | | memory made into mblk_t 155 * * . freemsg(9F) | and sent up to MAC. 156 * | called on | 157 * | loaned rcb | 158 * | and it is v 159 * | recycled. +-------------------+ 160 * +--------------------<-----| rcb loaned to MAC | 161 * +-------------------+ 162 * 163 * Finally, note that every rx control block has a reference count on it. One 164 * reference is added as long as the driver has had the GLDv3 mc_start endpoint 165 * called. If the GLDv3 mc_stop entry point is called (IP has been unplumbed and 166 * no other DLPI consumers remain), then we'll decrement the reference count by 167 * one. Whenever we loan up the rx control block and associated buffer to MAC, 168 * then we bump the reference count again. Even though the device is stopped, 169 * there may still be loaned frames in upper levels that we'll want to account 170 * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure 171 * that it is cleaned up. 172 * 173 * -------------------- 174 * Managing the RX Ring 175 * -------------------- 176 * 177 * The receive ring descriptors are arranged in a circular buffer with a head 178 * and tail pointer. There are both the conventional head and tail pointers 179 * which are used to partition the ring into two portions, a portion that we, 180 * the operating system, manage and a portion that is managed by hardware. When 181 * hardware owns a descriptor in the ring, it means that it is waiting for data 182 * to be filled in. However, when a portion of the ring is owned by the driver, 183 * then that means that the descriptor has been consumed and we need to go take 184 * a look at it. 185 * 186 * The initial head is configured to be zero by writing it as such in the 187 * receive queue context in the FPM (function private memory from the host). The 188 * initial tail is written to be the last descriptor. This is written to via the 189 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between 190 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD, 191 * the only values we ever consult ourselves are the TAIL register and our own 192 * state tracking. Effectively, we cache the HEAD register and then update it 193 * ourselves based on our work.
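 *
 * As a hedged sketch of what that caching looks like (the real logic is in
 * i40e_ring_rx() further down; this only distills the index bookkeeping and
 * assumes the i40e_next_desc()/i40e_prev_desc() helpers defined below), once
 * we have consumed every descriptor up to, but not including, cur_head we do
 * roughly:
 *
 *	rxd->rxd_desc_next = cur_head;
 *	tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
 *	I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
 *
 * That single TAIL write hands all of the re-armed descriptors back to
 * hardware in one go.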
194 * 195 * When we iterate over the rx descriptors and thus the received frames, we are 196 * either in an interrupt context or we've been asked by MAC to poll on the 197 * ring. If we've been asked to poll on the ring, we have a maximum number of 198 * bytes of mblk_t's to return. If processing an rx descriptor would cause us to 199 * exceed that count, then we do not process it. When in interrupt context, we 200 * don't have a strict byte count. However, to ensure liveness, we limit the 201 * amount of data based on a configuration value 202 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this 203 * is based on similar numbers that are used for ixgbe. After some additional 204 * time in the field, we'll have a sense as to whether or not it should be 205 * changed. 206 * 207 * When processing, we start at our own HEAD pointer 208 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start 209 * processing. Every RX descriptor has what's described as the DD bit. This bit 210 * (the LSB of the second 8-byte word) indicates whether or not the descriptor 211 * is done. When we give descriptors to the hardware, this value is always 212 * zero. When the hardware has finished a descriptor, it will always be one. 213 * 214 * The first thing that we check is whether the DD bit indicates that the 215 * current HEAD is ready. If it isn't, then we're done. That's the primary 216 * invariant of processing a frame. If it's done, then there are a few other 217 * things that we want to look at. In the same status word as the DD bit, there 218 * are two other important bits: 219 * 220 * o End of Packet (EOP) 221 * o Error bits 222 * 223 * The end of packet indicates that we have reached the last descriptor. Now, 224 * you might ask when there would be more than one descriptor. The reason for 225 * that might be due to large receive offload (lro) or header splitting 226 * functionality, which presently isn't supported in the driver. The error bits 227 * in the frame are only valid when EOP is set. 228 * 229 * If error bits are set on the frame, then we still consume it; however, we 230 * will not generate an mblk_t to send up to MAC. If there are no error bits 231 * set, then we'll consume the descriptor either using bcopy or DMA binding. See 232 * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information 233 * on how that selection is made. 234 * 235 * Regardless of whether we construct an mblk_t or encounter an error, we end up 236 * resetting the descriptor. This re-arms the descriptor for hardware and in the 237 * process, we may end up assigning it a new receive control block. After we do 238 * this, we always update our HEAD pointer, no matter what. 239 * 240 * Finally, once we've consumed as much as we will in a given window, we go and 241 * update the TAIL register to indicate all the frames we've consumed. We only 242 * do a single bulk write for the ring. 243 * 244 * --------------------------------- 245 * TX Descriptors and Control Blocks 246 * --------------------------------- 247 * 248 * While the transmit path is similar in spirit to the receive path, it works 249 * differently due to the fact that all data is originated by the operating 250 * system and not by the device. 251 * 252 * Like rx, there is both a descriptor ring that we use to communicate with the 253 * hardware and which points to the memory used to transmit a frame. Similarly, 254 * there is a corresponding transmit control block.
Each transmit control block 255 * has a region of DMA memory allocated to it; however, the way we use it 256 * varies. 257 * 258 * The driver is asked to process a single frame at a time. That message block 259 * may be made up of multiple fragments linked together by the mblk_t`b_cont 260 * member. The device has a hard limit of up to 8 buffers being allowed for use 261 * for a single logical frame. For each fragment, we'll try and use an entry 262 * from the tx descriptor ring and then we'll allocate a corresponding tx 263 * control block. Depending on the size of the fragment, we may copy it around 264 * or we might instead try to do DMA binding of the fragment. 265 * 266 * If we exceed the number of blocks that fit, we'll try to pull up the block 267 * and then we'll do a DMA bind and send it out. 268 * 269 * If we don't have enough space in the ring or tx control blocks available, 270 * then we'll return the unprocessed message block to MAC. This will induce flow 271 * control and once we recycle enough entries, we'll once again enable sending 272 * on the ring. 273 * 274 * We size the working list as equal to the number of descriptors in the ring. 275 * We size the free list as equal to 1.5 times the number of descriptors in the 276 * ring. We'll allocate a number of tx control block entries equal to the number 277 * of entries in the free list. By default, all entries are placed in the free 278 * list. As we come along and try to send something, we'll allocate entries from 279 * the free list and add them to the working list, where they'll stay until the 280 * hardware indicates that all of the data has been written back to us. The 281 * reason that we start with 1.5x is to help facilitate having more than one TX 282 * buffer associated with the DMA activity. 283 * 284 * -------------------- 285 * Managing the TX Ring 286 * -------------------- 287 * 288 * The transmit descriptor ring is driven by us. We maintain our own notion of a 289 * HEAD and TAIL register and we update the hardware with updates to the TAIL 290 * register. When the hardware is done writing out data, it updates us by 291 * writing back to a specific address, not by updating the individual 292 * descriptors. That address is a 4-byte region after the main transmit 293 * descriptor ring. This is why the descriptor ring has an extra descriptor's 294 * worth allocated to it. 295 * 296 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and 297 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames, 298 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various 299 * points in time, through both interrupts, and our own internal checks, we'll 300 * sync the write-back head portion of the DMA space. Based on the index it 301 * reports back, we'll free everything between our current HEAD and the 302 * indicated index and update HEAD to the new index. 303 * 304 * When a frame comes in, we try to use a number of transmit control blocks and 305 * we'll transition them from the free list to the work list. They'll get moved 306 * to the entry on the work list that corresponds with the transmit descriptor 307 * they correspond to. Once we are indicated that the corresponding descriptor 308 * has been freed, we'll return it to the list. 309 * 310 * The transmit control block free list is managed by keeping track of the 311 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to 312 * index into the free list and add things to it. 
In effect, we always push and 313 * pop from the tail and protect it with a single lock, 314 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not 315 * stand up to further performance testing; however, it does allow us to get off 316 * the ground with the device driver. 317 * 318 * The following image describes where a given transmit control block lives in 319 * its lifetime: 320 * 321 * | 322 * * ... Initial placement for all tcb's 323 * | 324 * v 325 * +------------------+ +------------------+ 326 * | tcb on free list |---*------------------>| tcb on work list | 327 * +------------------+ . +------------------+ 328 * ^ . tcb allocated | 329 * | to send frame v 330 * | or fragment on | 331 * | wire, mblk from | 332 * | MAC associated. | 333 * | | 334 * +------*-------------------------------<----+ 335 * . 336 * . Hardware indicates 337 * entry transmitted. 338 * tcb recycled, mblk 339 * from MAC freed. 340 * 341 * ------------ 342 * Blocking MAC 343 * ------------ 344 * 345 * When performing transmit, we can run out of descriptors and ring entries. When 346 * such a case happens, we return the mblk_t to MAC to indicate that we've been 347 * blocked. At that point in time, MAC becomes blocked and will not transmit 348 * anything out that specific ring until we notify MAC. To indicate that we're 349 * in such a situation, we set the i40e_trqpair_t`itrq_tx_blocked member to B_TRUE. 350 * 351 * When we recycle tx descriptors, we'll end up signaling MAC by calling 352 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to 353 * start sending frames out to us again. 354 */ 355 356 /* 357 * We set our DMA alignment requests based on the smallest supported page size 358 * of the corresponding platform. 359 */ 360 #if defined(__sparc) 361 #define I40E_DMA_ALIGNMENT 0x2000ull 362 #elif defined(__x86) 363 #define I40E_DMA_ALIGNMENT 0x1000ull 364 #else 365 #error "unknown architecture for i40e" 366 #endif 367 368 /* 369 * This structure is used to maintain information and flags related to 370 * transmitting a frame. The first member is the set of flags we need to OR into 371 * the command word (generally checksumming related). The second member controls 372 * the word offsets, which are required for IP and L4 checksumming. 373 */ 374 typedef struct i40e_tx_context { 375 enum i40e_tx_desc_cmd_bits itc_cmdflags; 376 uint32_t itc_offsets; 377 } i40e_tx_context_t; 378 379 /* 380 * Toggles on debug builds which can be used to override our RX behavior based 381 * on thresholds. 382 */ 383 #ifdef DEBUG 384 typedef enum { 385 I40E_DEBUG_RX_DEFAULT = 0, 386 I40E_DEBUG_RX_BCOPY = 1, 387 I40E_DEBUG_RX_DMABIND = 2 388 } i40e_debug_rx_t; 389 390 i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; 391 #endif /* DEBUG */ 392 393 /* 394 * Notes on the following pair of DMA attributes. The first attribute, 395 * i40e_static_dma_attr, is designed to be used for both the descriptor rings 396 * and the static buffers that we associate with control blocks. For this 397 * reason, we force an SGL length of one. While technically the driver supports 398 * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our 399 * management here. In addition, when the Intel common code wants to allocate 400 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage 401 * the static dma attr. 402 * 403 * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're 404 * binding a bunch of mblk_t fragments to go out the door.
Note that the main 405 * difference here is that we're allowed a larger SGL length -- eight. 406 * 407 * Note, we default to setting ourselves to be DMA capable here. However, 408 * because we could have multiple instances which have different FMA error 409 * checking capabilities, or end up on different buses, we make these static 410 * and const and copy them into the i40e_t for the given device with the actual 411 * values that reflect the actual capabilities. 412 */ 413 static const ddi_dma_attr_t i40e_g_static_dma_attr = { 414 DMA_ATTR_V0, /* version number */ 415 0x0000000000000000ull, /* low address */ 416 0xFFFFFFFFFFFFFFFFull, /* high address */ 417 0x00000000FFFFFFFFull, /* dma counter max */ 418 I40E_DMA_ALIGNMENT, /* alignment */ 419 0x00000FFF, /* burst sizes */ 420 0x00000001, /* minimum transfer size */ 421 0x00000000FFFFFFFFull, /* maximum transfer size */ 422 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 423 1, /* scatter/gather list length */ 424 0x00000001, /* granularity */ 425 DDI_DMA_FLAGERR /* DMA flags */ 426 }; 427 428 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { 429 DMA_ATTR_V0, /* version number */ 430 0x0000000000000000ull, /* low address */ 431 0xFFFFFFFFFFFFFFFFull, /* high address */ 432 0x00000000FFFFFFFFull, /* dma counter max */ 433 I40E_DMA_ALIGNMENT, /* alignment */ 434 0x00000FFF, /* burst sizes */ 435 0x00000001, /* minimum transfer size */ 436 0x00000000FFFFFFFFull, /* maximum transfer size */ 437 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 438 I40E_TX_MAX_COOKIE, /* scatter/gather list length */ 439 0x00000001, /* granularity */ 440 DDI_DMA_FLAGERR /* DMA flags */ 441 }; 442 443 /* 444 * Next, we have the attributes for these structures. The descriptor rings are 445 * all strictly little endian, while the data buffers are just arrays of bytes 446 * representing frames. Because of this, we purposefully simplify the driver 447 * programming life by programming the descriptor ring as little endian, while 448 * for the buffer data we keep it as unstructured. 449 * 450 * Note, that to keep the Intel common code operating in a reasonable way, when 451 * we allocate DMA memory for it, we do not use byte swapping and thus use the 452 * standard i40e_buf_acc_attr. 453 */ 454 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = { 455 DDI_DEVICE_ATTR_V0, 456 DDI_STRUCTURE_LE_ACC, 457 DDI_STRICTORDER_ACC 458 }; 459 460 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = { 461 DDI_DEVICE_ATTR_V0, 462 DDI_NEVERSWAP_ACC, 463 DDI_STRICTORDER_ACC 464 }; 465 466 /* 467 * The next two functions are designed to be type-safe versions of macros that 468 * are used to increment and decrement a descriptor index in the loop. Note, 469 * these are marked inline to try and keep the data path hot and they were 470 * effectively inlined in their previous life as macros. 
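 *
 * For example, with the default 1024 entry ring, i40e_next_desc(1022, 4, 1024)
 * wraps around to 2, while i40e_prev_desc(1, 2, 1024) wraps back to 1023.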
471 */ 472 static inline int 473 i40e_next_desc(int base, int count, int size) 474 { 475 int out; 476 477 ASSERT(base >= 0); 478 ASSERT(count > 0); 479 ASSERT(size > 0); 480 481 if (base + count < size) { 482 out = base + count; 483 } else { 484 out = base + count - size; 485 } 486 487 ASSERT(out >= 0 && out < size); 488 return (out); 489 } 490 491 static inline int 492 i40e_prev_desc(int base, int count, int size) 493 { 494 int out; 495 496 ASSERT(base >= 0); 497 ASSERT(count > 0); 498 ASSERT(size > 0); 499 500 if (base >= count) { 501 out = base - count; 502 } else { 503 out = base - count + size; 504 } 505 506 ASSERT(out >= 0 && out < size); 507 return (out); 508 } 509 510 /* 511 * Free DMA memory that is represented by a i40e_dma_buffer_t. 512 */ 513 static void 514 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap) 515 { 516 if (dmap->dmab_dma_address != 0) { 517 VERIFY(dmap->dmab_dma_handle != NULL); 518 (void) ddi_dma_unbind_handle(dmap->dmab_dma_handle); 519 dmap->dmab_dma_address = 0; 520 dmap->dmab_size = 0; 521 } 522 523 if (dmap->dmab_acc_handle != NULL) { 524 ddi_dma_mem_free(&dmap->dmab_acc_handle); 525 dmap->dmab_acc_handle = NULL; 526 dmap->dmab_address = NULL; 527 } 528 529 if (dmap->dmab_dma_handle != NULL) { 530 ddi_dma_free_handle(&dmap->dmab_dma_handle); 531 dmap->dmab_dma_handle = NULL; 532 } 533 534 /* 535 * These should only be set if we have valid handles allocated and 536 * therefore should always be NULLed out due to the above code. This 537 * is here to catch us acting sloppy. 538 */ 539 ASSERT(dmap->dmab_dma_address == 0); 540 ASSERT(dmap->dmab_address == NULL); 541 ASSERT(dmap->dmab_size == 0); 542 dmap->dmab_len = 0; 543 } 544 545 /* 546 * Allocate size bytes of DMA memory based on the passed in attributes. This 547 * fills in the information in dmap and is designed for all of our single cookie 548 * allocations. 
549 */ 550 static boolean_t 551 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap, 552 ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream, 553 boolean_t zero, size_t size) 554 { 555 int ret; 556 uint_t flags; 557 size_t len; 558 ddi_dma_cookie_t cookie; 559 uint_t ncookies; 560 561 if (stream == B_TRUE) 562 flags = DDI_DMA_STREAMING; 563 else 564 flags = DDI_DMA_CONSISTENT; 565 566 /* 567 * Step one: Allocate the DMA handle 568 */ 569 ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT, 570 NULL, &dmap->dmab_dma_handle); 571 if (ret != DDI_SUCCESS) { 572 i40e_error(i40e, "failed to allocate dma handle for I/O " 573 "buffers: %d", ret); 574 dmap->dmab_dma_handle = NULL; 575 return (B_FALSE); 576 } 577 578 /* 579 * Step two: Allocate the DMA memory 580 */ 581 ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags, 582 DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len, 583 &dmap->dmab_acc_handle); 584 if (ret != DDI_SUCCESS) { 585 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " 586 "buffers", size); 587 dmap->dmab_address = NULL; 588 dmap->dmab_acc_handle = NULL; 589 i40e_free_dma_buffer(dmap); 590 return (B_FALSE); 591 } 592 593 /* 594 * Step three: Optionally zero 595 */ 596 if (zero == B_TRUE) 597 bzero(dmap->dmab_address, len); 598 599 /* 600 * Step four: Bind the memory 601 */ 602 ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL, 603 dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, 604 NULL, &cookie, &ncookies); 605 if (ret != DDI_DMA_MAPPED) { 606 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " 607 "buffers: %d", size, ret); 608 i40e_free_dma_buffer(dmap); 609 return (B_FALSE); 610 } 611 612 VERIFY(ncookies == 1); 613 dmap->dmab_dma_address = cookie.dmac_laddress; 614 dmap->dmab_size = len; 615 dmap->dmab_len = 0; 616 return (B_TRUE); 617 } 618 619 /* 620 * This function is called once the last pending rcb has been freed by the upper 621 * levels of the system. 
622 */ 623 static void 624 i40e_free_rx_data(i40e_rx_data_t *rxd) 625 { 626 VERIFY(rxd->rxd_rcb_pending == 0); 627 628 if (rxd->rxd_rcb_area != NULL) { 629 kmem_free(rxd->rxd_rcb_area, 630 sizeof (i40e_rx_control_block_t) * 631 (rxd->rxd_free_list_size + rxd->rxd_ring_size)); 632 rxd->rxd_rcb_area = NULL; 633 } 634 635 if (rxd->rxd_free_list != NULL) { 636 kmem_free(rxd->rxd_free_list, 637 sizeof (i40e_rx_control_block_t *) * 638 rxd->rxd_free_list_size); 639 rxd->rxd_free_list = NULL; 640 } 641 642 if (rxd->rxd_work_list != NULL) { 643 kmem_free(rxd->rxd_work_list, 644 sizeof (i40e_rx_control_block_t *) * 645 rxd->rxd_ring_size); 646 rxd->rxd_work_list = NULL; 647 } 648 649 kmem_free(rxd, sizeof (i40e_rx_data_t)); 650 } 651 652 static boolean_t 653 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) 654 { 655 i40e_rx_data_t *rxd; 656 657 rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP); 658 if (rxd == NULL) 659 return (B_FALSE); 660 itrq->itrq_rxdata = rxd; 661 rxd->rxd_i40e = i40e; 662 663 rxd->rxd_ring_size = i40e->i40e_rx_ring_size; 664 rxd->rxd_free_list_size = i40e->i40e_rx_ring_size; 665 666 rxd->rxd_rcb_free = rxd->rxd_free_list_size; 667 668 rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * 669 rxd->rxd_ring_size, KM_NOSLEEP); 670 if (rxd->rxd_work_list == NULL) { 671 i40e_error(i40e, "failed to allocate rx work list for a ring " 672 "of %d entries for ring %d", rxd->rxd_ring_size, 673 itrq->itrq_index); 674 goto cleanup; 675 } 676 677 rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * 678 rxd->rxd_free_list_size, KM_NOSLEEP); 679 if (rxd->rxd_free_list == NULL) { 680 i40e_error(i40e, "failed to allocate a %d entry rx free list " 681 "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); 682 goto cleanup; 683 } 684 685 rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) * 686 (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP); 687 if (rxd->rxd_rcb_area == NULL) { 688 i40e_error(i40e, "failed to allocate a %d entry rcb area for " 689 "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size, 690 itrq->itrq_index); 691 goto cleanup; 692 } 693 694 return (B_TRUE); 695 696 cleanup: 697 i40e_free_rx_data(rxd); 698 itrq->itrq_rxdata = NULL; 699 return (B_FALSE); 700 } 701 702 /* 703 * Free all of the memory that we've allocated for DMA. Note that we may have 704 * buffers that we've loaned up to the OS which are still outstanding. We'll 705 * always free up the descriptor ring, because we no longer need that. For each 706 * rcb, we'll iterate over it and if we send the reference count to zero, then 707 * we'll free the message block and DMA related resources. However, if we don't 708 * take the last one, then we'll go ahead and keep track that we'll have pending 709 * data and clean it up when we get there. 710 */ 711 static void 712 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init) 713 { 714 uint32_t i, count, ref; 715 716 i40e_rx_control_block_t *rcb; 717 i40e_t *i40e = rxd->rxd_i40e; 718 719 i40e_free_dma_buffer(&rxd->rxd_desc_area); 720 rxd->rxd_desc_ring = NULL; 721 rxd->rxd_desc_next = 0; 722 723 mutex_enter(&i40e->i40e_rx_pending_lock); 724 725 rcb = rxd->rxd_rcb_area; 726 count = rxd->rxd_ring_size + rxd->rxd_free_list_size; 727 728 for (i = 0; i < count; i++, rcb++) { 729 VERIFY(rcb != NULL); 730 731 /* 732 * If we're cleaning up from a failed creation attempt, then an 733 * entry may never have been assembled which would mean that 734 * it's reference count is zero. 
If we find that, we leave it 735 * be, because nothing else should be modifying it at this 736 * point. We're not at the point that any more references can be 737 * added, just removed. 738 */ 739 if (failed_init == B_TRUE && rcb->rcb_ref == 0) 740 continue; 741 742 ref = atomic_dec_32_nv(&rcb->rcb_ref); 743 if (ref == 0) { 744 freemsg(rcb->rcb_mp); 745 rcb->rcb_mp = NULL; 746 i40e_free_dma_buffer(&rcb->rcb_dma); 747 } else { 748 atomic_inc_32(&rxd->rxd_rcb_pending); 749 atomic_inc_32(&i40e->i40e_rx_pending); 750 } 751 } 752 mutex_exit(&i40e->i40e_rx_pending_lock); 753 } 754 755 /* 756 * Initialize the DMA memory for the descriptor ring and for each frame in the 757 * control block list. 758 */ 759 static boolean_t 760 i40e_alloc_rx_dma(i40e_rx_data_t *rxd) 761 { 762 int i, count; 763 size_t dmasz; 764 i40e_rx_control_block_t *rcb; 765 i40e_t *i40e = rxd->rxd_i40e; 766 767 /* 768 * First allocate the rx descriptor ring. 769 */ 770 dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; 771 VERIFY(dmasz > 0); 772 if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area, 773 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, 774 B_TRUE, dmasz) == B_FALSE) { 775 i40e_error(i40e, "failed to allocate DMA resources " 776 "for rx descriptor ring"); 777 return (B_FALSE); 778 } 779 rxd->rxd_desc_ring = 780 (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address; 781 rxd->rxd_desc_next = 0; 782 783 count = rxd->rxd_ring_size + rxd->rxd_free_list_size; 784 rcb = rxd->rxd_rcb_area; 785 786 dmasz = i40e->i40e_rx_buf_size; 787 VERIFY(dmasz > 0); 788 for (i = 0; i < count; i++, rcb++) { 789 i40e_dma_buffer_t *dmap; 790 VERIFY(rcb != NULL); 791 792 if (i < rxd->rxd_ring_size) { 793 rxd->rxd_work_list[i] = rcb; 794 } else { 795 rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb; 796 } 797 798 dmap = &rcb->rcb_dma; 799 if (i40e_alloc_dma_buffer(i40e, dmap, 800 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, 801 B_TRUE, B_FALSE, dmasz) == B_FALSE) { 802 i40e_error(i40e, "failed to allocate rx dma buffer"); 803 return (B_FALSE); 804 } 805 806 /* 807 * Initialize the control block and offset the DMA address. See 808 * the note in the big theory statement that explains how this 809 * helps IP deal with alignment. Note, we don't worry about 810 * whether or not we successfully get an mblk_t from desballoc, 811 * it's a common case that we have to handle later on in the 812 * system. 
813 */ 814 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT; 815 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT; 816 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT; 817 818 rcb->rcb_ref = 1; 819 rcb->rcb_rxd = rxd; 820 rcb->rcb_free_rtn.free_func = i40e_rx_recycle; 821 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb; 822 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address, 823 dmap->dmab_size, 0, &rcb->rcb_free_rtn); 824 } 825 826 return (B_TRUE); 827 } 828 829 static void 830 i40e_free_tx_dma(i40e_trqpair_t *itrq) 831 { 832 size_t fsz; 833 834 if (itrq->itrq_tcb_area != NULL) { 835 uint32_t i; 836 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area; 837 838 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { 839 i40e_free_dma_buffer(&tcb->tcb_dma); 840 if (tcb->tcb_dma_handle != NULL) { 841 ddi_dma_free_handle(&tcb->tcb_dma_handle); 842 tcb->tcb_dma_handle = NULL; 843 } 844 } 845 846 fsz = sizeof (i40e_tx_control_block_t) * 847 itrq->itrq_tx_free_list_size; 848 kmem_free(itrq->itrq_tcb_area, fsz); 849 itrq->itrq_tcb_area = NULL; 850 } 851 852 if (itrq->itrq_tcb_free_list != NULL) { 853 fsz = sizeof (i40e_tx_control_block_t *) * 854 itrq->itrq_tx_free_list_size; 855 kmem_free(itrq->itrq_tcb_free_list, fsz); 856 itrq->itrq_tcb_free_list = NULL; 857 } 858 859 if (itrq->itrq_tcb_work_list != NULL) { 860 fsz = sizeof (i40e_tx_control_block_t *) * 861 itrq->itrq_tx_ring_size; 862 kmem_free(itrq->itrq_tcb_work_list, fsz); 863 itrq->itrq_tcb_work_list = NULL; 864 } 865 866 i40e_free_dma_buffer(&itrq->itrq_desc_area); 867 itrq->itrq_desc_ring = NULL; 868 869 } 870 871 static boolean_t 872 i40e_alloc_tx_dma(i40e_trqpair_t *itrq) 873 { 874 int i, ret; 875 size_t dmasz; 876 i40e_tx_control_block_t *tcb; 877 i40e_t *i40e = itrq->itrq_i40e; 878 879 itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size; 880 itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size + 881 (i40e->i40e_tx_ring_size >> 1); 882 883 /* 884 * Allocate an additional tx descriptor for the writeback head. 885 */ 886 dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; 887 dmasz += sizeof (i40e_tx_desc_t); 888 889 VERIFY(dmasz > 0); 890 if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, 891 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, 892 B_FALSE, B_TRUE, dmasz) == B_FALSE) { 893 i40e_error(i40e, "failed to allocate DMA resources for tx " 894 "descriptor ring"); 895 return (B_FALSE); 896 } 897 itrq->itrq_desc_ring = 898 (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address; 899 itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring + 900 itrq->itrq_tx_ring_size); 901 itrq->itrq_desc_head = 0; 902 itrq->itrq_desc_tail = 0; 903 itrq->itrq_desc_free = itrq->itrq_tx_ring_size; 904 905 itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * 906 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); 907 if (itrq->itrq_tcb_work_list == NULL) { 908 i40e_error(i40e, "failed to allocate a %d entry tx work list " 909 "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); 910 goto cleanup; 911 } 912 913 itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * 914 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); 915 if (itrq->itrq_tcb_free_list == NULL) { 916 i40e_error(i40e, "failed to allocate a %d entry tx free list " 917 "for ring %d", itrq->itrq_tx_free_list_size, 918 itrq->itrq_index); 919 goto cleanup; 920 } 921 922 /* 923 * We allocate enough tx control blocks to cover the free list.
924 */ 925 itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * 926 itrq->itrq_tx_free_list_size, KM_NOSLEEP); 927 if (itrq->itrq_tcb_area == NULL) { 928 i40e_error(i40e, "failed to allocate a %d entry tcb area for " 929 "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); 930 goto cleanup; 931 } 932 933 /* 934 * For each tcb, allocate DMA memory. 935 */ 936 dmasz = i40e->i40e_tx_buf_size; 937 VERIFY(dmasz > 0); 938 tcb = itrq->itrq_tcb_area; 939 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { 940 VERIFY(tcb != NULL); 941 942 /* 943 * Allocate both a DMA buffer which we'll use for when we copy 944 * packets for transmission and allocate a DMA handle which 945 * we'll use when we bind data. 946 */ 947 ret = ddi_dma_alloc_handle(i40e->i40e_dip, 948 &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, 949 &tcb->tcb_dma_handle); 950 if (ret != DDI_SUCCESS) { 951 i40e_error(i40e, "failed to allocate DMA handle for tx " 952 "data binding on ring %d: %d", itrq->itrq_index, 953 ret); 954 tcb->tcb_dma_handle = NULL; 955 goto cleanup; 956 } 957 958 if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, 959 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, 960 B_TRUE, B_FALSE, dmasz) == B_FALSE) { 961 i40e_error(i40e, "failed to allocate %ld bytes of " 962 "DMA for tx data binding on ring %d", dmasz, 963 itrq->itrq_index); 964 goto cleanup; 965 } 966 967 itrq->itrq_tcb_free_list[i] = tcb; 968 } 969 970 itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size; 971 972 return (B_TRUE); 973 974 cleanup: 975 i40e_free_tx_dma(itrq); 976 return (B_FALSE); 977 } 978 979 /* 980 * Free all memory associated with all of the rings on this i40e instance. Note, 981 * this is done as part of the GLDv3 stop routine. 982 */ 983 void 984 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) 985 { 986 int i; 987 988 for (i = 0; i < i40e->i40e_num_trqpairs; i++) { 989 i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; 990 991 /* 992 * Clean up our rx data. We have to free DMA resources first and 993 * then if we have no more pending RCB's, then we'll go ahead 994 * and clean things up. Note, we can't set the stopped flag on 995 * the rx data until after we've done the first pass of the 996 * pending resources. Otherwise we might race with 997 * i40e_rx_recycle on determining who should free the 998 * i40e_rx_data_t above. 999 */ 1000 i40e_free_rx_dma(rxd, failed_init); 1001 1002 mutex_enter(&i40e->i40e_rx_pending_lock); 1003 rxd->rxd_shutdown = B_TRUE; 1004 if (rxd->rxd_rcb_pending == 0) { 1005 i40e_free_rx_data(rxd); 1006 i40e->i40e_trqpairs[i].itrq_rxdata = NULL; 1007 } 1008 mutex_exit(&i40e->i40e_rx_pending_lock); 1009 1010 i40e_free_tx_dma(&i40e->i40e_trqpairs[i]); 1011 } 1012 } 1013 1014 /* 1015 * Allocate all of the resources associated with all of the rings on this i40e 1016 * instance. Note this is done as part of the GLDv3 start routine and thus we 1017 * should not use blocking allocations. This takes care of both DMA and non-DMA 1018 * related resources. 
1019 */ 1020 boolean_t 1021 i40e_alloc_ring_mem(i40e_t *i40e) 1022 { 1023 int i; 1024 1025 for (i = 0; i < i40e->i40e_num_trqpairs; i++) { 1026 if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) == 1027 B_FALSE) 1028 goto unwind; 1029 1030 if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) == 1031 B_FALSE) 1032 goto unwind; 1033 1034 if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE) 1035 goto unwind; 1036 } 1037 1038 return (B_TRUE); 1039 1040 unwind: 1041 i40e_free_ring_mem(i40e, B_TRUE); 1042 return (B_FALSE); 1043 } 1044 1045 1046 /* 1047 * Because every instance of i40e may have different support for FMA 1048 * capabilities, we copy the DMA attributes into the i40e_t and set them that 1049 * way and use them for determining attributes. 1050 */ 1051 void 1052 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) 1053 { 1054 bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr, 1055 sizeof (ddi_dma_attr_t)); 1056 bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, 1057 sizeof (ddi_dma_attr_t)); 1058 bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, 1059 sizeof (ddi_device_acc_attr_t)); 1060 bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, 1061 sizeof (ddi_device_acc_attr_t)); 1062 1063 if (fma == B_TRUE) { 1064 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 1065 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 1066 } else { 1067 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; 1068 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; 1069 } 1070 } 1071 1072 static void 1073 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb) 1074 { 1075 mutex_enter(&rxd->rxd_free_lock); 1076 ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size); 1077 ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL); 1078 rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb; 1079 rxd->rxd_rcb_free++; 1080 mutex_exit(&rxd->rxd_free_lock); 1081 } 1082 1083 static i40e_rx_control_block_t * 1084 i40e_rcb_alloc(i40e_rx_data_t *rxd) 1085 { 1086 i40e_rx_control_block_t *rcb; 1087 1088 mutex_enter(&rxd->rxd_free_lock); 1089 if (rxd->rxd_rcb_free == 0) { 1090 mutex_exit(&rxd->rxd_free_lock); 1091 return (NULL); 1092 } 1093 rxd->rxd_rcb_free--; 1094 rcb = rxd->rxd_free_list[rxd->rxd_rcb_free]; 1095 VERIFY(rcb != NULL); 1096 rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL; 1097 mutex_exit(&rxd->rxd_free_lock); 1098 1099 return (rcb); 1100 } 1101 1102 /* 1103 * This is the callback that we get from the OS when freemsg(9F) has been called 1104 * on a loaned descriptor. In addition, if we take the last reference count 1105 * here, then we have to tear down all of the rx data. 1106 */ 1107 void 1108 i40e_rx_recycle(caddr_t arg) 1109 { 1110 uint32_t ref; 1111 i40e_rx_control_block_t *rcb; 1112 i40e_rx_data_t *rxd; 1113 i40e_t *i40e; 1114 1115 /* LINTED: E_BAD_PTR_CAST_ALIGN */ 1116 rcb = (i40e_rx_control_block_t *)arg; 1117 rxd = rcb->rcb_rxd; 1118 i40e = rxd->rxd_i40e; 1119 1120 /* 1121 * It's possible for this to be called with a reference count of zero. 1122 * That will happen when we're doing the freemsg after taking the last 1123 * reference because we're tearing down everything and this rcb is not 1124 * outstanding. 1125 */ 1126 if (rcb->rcb_ref == 0) 1127 return; 1128 1129 /* 1130 * Don't worry about failure of desballoc here. It'll only become fatal 1131 * if we're trying to use it and we can't in i40e_rx_bind(). 
1132 */ 1133 rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address, 1134 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); 1135 i40e_rcb_free(rxd, rcb); 1136 1137 /* 1138 * It's possible that the rcb was being used while we are shutting down 1139 * the device. In that case, we'll take the final reference from the 1140 * device here. 1141 */ 1142 ref = atomic_dec_32_nv(&rcb->rcb_ref); 1143 if (ref == 0) { 1144 freemsg(rcb->rcb_mp); 1145 rcb->rcb_mp = NULL; 1146 i40e_free_dma_buffer(&rcb->rcb_dma); 1147 1148 mutex_enter(&i40e->i40e_rx_pending_lock); 1149 atomic_dec_32(&rxd->rxd_rcb_pending); 1150 atomic_dec_32(&i40e->i40e_rx_pending); 1151 1152 /* 1153 * If this was the last block and it's been indicated that we've 1154 * passed the shutdown point, we should clean up. 1155 */ 1156 if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) { 1157 i40e_free_rx_data(rxd); 1158 cv_broadcast(&i40e->i40e_rx_pending_cv); 1159 } 1160 1161 mutex_exit(&i40e->i40e_rx_pending_lock); 1162 } 1163 } 1164 1165 static mblk_t * 1166 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, 1167 uint32_t plen) 1168 { 1169 mblk_t *mp; 1170 i40e_t *i40e = rxd->rxd_i40e; 1171 i40e_rx_control_block_t *rcb, *rep_rcb; 1172 1173 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); 1174 1175 if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) { 1176 itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++; 1177 return (NULL); 1178 } 1179 1180 rcb = rxd->rxd_work_list[index]; 1181 1182 /* 1183 * Check to make sure we have a mblk_t. If we don't, this is our last 1184 * chance to try and get one. 1185 */ 1186 if (rcb->rcb_mp == NULL) { 1187 rcb->rcb_mp = 1188 desballoc((unsigned char *)rcb->rcb_dma.dmab_address, 1189 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); 1190 if (rcb->rcb_mp == NULL) { 1191 itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++; 1192 i40e_rcb_free(rxd, rcb); 1193 return (NULL); 1194 } 1195 } 1196 1197 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); 1198 1199 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { 1200 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1201 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1202 i40e_rcb_free(rxd, rcb); 1203 return (NULL); 1204 } 1205 1206 /* 1207 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT. 1208 */ 1209 mp = rcb->rcb_mp; 1210 atomic_inc_32(&rcb->rcb_ref); 1211 mp->b_wptr = mp->b_rptr + plen; 1212 mp->b_next = mp->b_cont = NULL; 1213 1214 rxd->rxd_work_list[index] = rep_rcb; 1215 return (mp); 1216 } 1217 1218 /* 1219 * We're going to allocate a new message block for this frame and attempt to 1220 * receive it. See the big theory statement for more information on when we copy 1221 * versus bind. 
1222 */ 1223 static mblk_t * 1224 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, 1225 uint32_t plen) 1226 { 1227 i40e_t *i40e = rxd->rxd_i40e; 1228 i40e_rx_control_block_t *rcb; 1229 mblk_t *mp; 1230 1231 ASSERT(index < rxd->rxd_ring_size); 1232 rcb = rxd->rxd_work_list[index]; 1233 1234 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); 1235 1236 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { 1237 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1238 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1239 return (NULL); 1240 } 1241 1242 mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0); 1243 if (mp == NULL) { 1244 itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++; 1245 return (NULL); 1246 } 1247 1248 mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT; 1249 bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen); 1250 mp->b_wptr = mp->b_rptr + plen; 1251 1252 return (mp); 1253 } 1254 1255 /* 1256 * Determine if the device has enabled any checksum flags for us. The level of 1257 * checksum computed will depend on the type packet that we have, which is 1258 * contained in ptype. For example, the checksum logic it does will vary 1259 * depending on whether or not the packet is considered tunneled, whether it 1260 * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are 1261 * valid. 1262 * 1263 * While there are additional checksums that we could recognize here, we'll need 1264 * to get some additional GLDv3 enhancements to be able to properly describe 1265 * them. 1266 */ 1267 static void 1268 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err, 1269 uint32_t ptype) 1270 { 1271 uint32_t cksum; 1272 struct i40e_rx_ptype_decoded pinfo; 1273 1274 ASSERT(ptype <= 255); 1275 pinfo = decode_rx_desc_ptype(ptype); 1276 1277 cksum = 0; 1278 1279 /* 1280 * If the ptype isn't something that we know in the driver, then we 1281 * shouldn't even consider moving forward. 1282 */ 1283 if (pinfo.known == 0) { 1284 itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++; 1285 return; 1286 } 1287 1288 /* 1289 * If hardware didn't set the L3L4P bit on the frame, then there is no 1290 * checksum offload to consider. 1291 */ 1292 if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) { 1293 itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++; 1294 return; 1295 } 1296 1297 /* 1298 * The device tells us that IPv6 checksums where a Destination Options 1299 * Header or a Routing header shouldn't be trusted. Discard all 1300 * checksums in this case. 1301 */ 1302 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1303 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 && 1304 (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) { 1305 itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++; 1306 return; 1307 } 1308 1309 /* 1310 * The hardware denotes three kinds of possible errors. Two are reserved 1311 * for inner and outer IP checksum errors (IPE and EIPE) and the latter 1312 * is for L4 checksum errors (L4E). If there is only one IP header, then 1313 * the only thing that we care about is IPE. Note that since we don't 1314 * support inner checksums, we will ignore IPE being set on tunneled 1315 * packets and only care about EIPE. 
1316 */ 1317 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1318 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) { 1319 if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) { 1320 if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) { 1321 itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++; 1322 } else { 1323 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; 1324 cksum |= HCK_IPV4_HDRCKSUM_OK; 1325 } 1326 } else { 1327 if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) { 1328 itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++; 1329 } else { 1330 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; 1331 cksum |= HCK_IPV4_HDRCKSUM_OK; 1332 } 1333 } 1334 } 1335 1336 /* 1337 * We only have meaningful L4 checksums in the case of IP->L4 and 1338 * IP->IP->L4. There is not outer L4 checksum data available in any 1339 * other case. Further, we don't bother reporting the valid checksum in 1340 * the case of IP->IP->L4 set. 1341 */ 1342 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && 1343 pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE && 1344 (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP || 1345 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP || 1346 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP || 1347 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) { 1348 ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4); 1349 if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) { 1350 itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++; 1351 } else { 1352 itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++; 1353 cksum |= HCK_FULLCKSUM_OK; 1354 } 1355 } 1356 1357 if (cksum != 0) { 1358 itrq->itrq_rxstat.irxs_hck_set.value.ui64++; 1359 mac_hcksum_set(mp, 0, 0, 0, 0, cksum); 1360 } else { 1361 itrq->itrq_rxstat.irxs_hck_miss.value.ui64++; 1362 } 1363 } 1364 1365 mblk_t * 1366 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes) 1367 { 1368 i40e_t *i40e; 1369 i40e_hw_t *hw; 1370 i40e_rx_data_t *rxd; 1371 uint32_t cur_head; 1372 i40e_rx_desc_t *cur_desc; 1373 i40e_rx_control_block_t *rcb; 1374 uint64_t rx_bytes, rx_frames; 1375 uint64_t stword; 1376 mblk_t *mp, *mp_head, **mp_tail; 1377 1378 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); 1379 rxd = itrq->itrq_rxdata; 1380 i40e = itrq->itrq_i40e; 1381 hw = &i40e->i40e_hw_space; 1382 1383 if (!(i40e->i40e_state & I40E_STARTED) || 1384 (i40e->i40e_state & I40E_OVERTEMP) || 1385 (i40e->i40e_state & I40E_SUSPENDED) || 1386 (i40e->i40e_state & I40E_ERROR)) 1387 return (NULL); 1388 1389 /* 1390 * Before we do anything else, we have to make sure that all of the DMA 1391 * buffers are synced up and then check to make sure that they're 1392 * actually good from an FM perspective. 1393 */ 1394 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL); 1395 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != 1396 DDI_FM_OK) { 1397 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1398 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1399 return (NULL); 1400 } 1401 1402 /* 1403 * Prepare our stats. We do a limited amount of processing in both 1404 * polling and interrupt context. The limit in interrupt context is 1405 * based on frames, in polling context based on bytes. 1406 */ 1407 rx_bytes = rx_frames = 0; 1408 mp_head = NULL; 1409 mp_tail = &mp_head; 1410 1411 /* 1412 * At this point, the descriptor ring is available to check. We'll try 1413 * and process until we either run out of poll_bytes or descriptors. 
1414 */ 1415 cur_head = rxd->rxd_desc_next; 1416 cur_desc = &rxd->rxd_desc_ring[cur_head]; 1417 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); 1418 1419 /* 1420 * Note, the primary invariant of this loop should be that cur_head, 1421 * cur_desc, and stword always point to the currently processed 1422 * descriptor. When we leave the loop, it should point to a descriptor 1423 * that HAS NOT been processed. Meaning, that if we haven't consumed the 1424 * frame, the descriptor should not be advanced. 1425 */ 1426 while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) { 1427 uint32_t error, eop, plen, ptype; 1428 1429 /* 1430 * The DD, PLEN, and EOP bits are the only ones that are valid 1431 * in every frame. The error information is only valid when EOP 1432 * is set in the same frame. 1433 * 1434 * At this time, because we don't do any LRO or header 1435 * splitting. We expect that every frame should have EOP set in 1436 * it. When later functionality comes in, we'll want to 1437 * re-evaluate this. 1438 */ 1439 eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT); 1440 VERIFY(eop != 0); 1441 1442 error = (stword & I40E_RXD_QW1_ERROR_MASK) >> 1443 I40E_RXD_QW1_ERROR_SHIFT; 1444 if (error & I40E_RX_ERR_BITS) { 1445 itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++; 1446 goto discard; 1447 } 1448 1449 plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> 1450 I40E_RXD_QW1_LENGTH_PBUF_SHIFT; 1451 1452 ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >> 1453 I40E_RXD_QW1_PTYPE_SHIFT; 1454 1455 /* 1456 * This packet contains valid data. We should check to see if 1457 * we're actually going to consume it based on its length (to 1458 * ensure that we don't overshoot our quota). We determine 1459 * whether to bcopy or bind the DMA resources based on the size 1460 * of the frame. However, if on debug, we allow it to be 1461 * overridden for testing purposes. 1462 * 1463 * We should be smarter about this and do DMA binding for 1464 * larger frames, but for now, it's really more important that 1465 * we actually just get something simple working. 1466 */ 1467 1468 /* 1469 * Ensure we don't exceed our polling quota by reading this 1470 * frame. Note we only bump bytes now, we bump frames later. 1471 */ 1472 if ((poll_bytes != I40E_POLL_NULL) && 1473 (rx_bytes + plen) > poll_bytes) 1474 break; 1475 rx_bytes += plen; 1476 1477 mp = NULL; 1478 if (plen >= i40e->i40e_rx_dma_min) 1479 mp = i40e_rx_bind(itrq, rxd, cur_head, plen); 1480 if (mp == NULL) 1481 mp = i40e_rx_copy(itrq, rxd, cur_head, plen); 1482 1483 if (mp != NULL) { 1484 if (i40e->i40e_rx_hcksum_enable) 1485 i40e_rx_hcksum(itrq, mp, stword, error, ptype); 1486 *mp_tail = mp; 1487 mp_tail = &mp->b_next; 1488 } 1489 1490 /* 1491 * Now we need to prepare this frame for use again. See the 1492 * discussion in the big theory statements. 1493 * 1494 * However, right now we're doing the simple version of this. 1495 * Normally what we'd do would depend on whether or not we were 1496 * doing DMA binding or bcopying. But because we're always doing 1497 * bcopying, we can just always use the current index as a key 1498 * for what to do and reassign the buffer based on the ring. 1499 */ 1500 discard: 1501 rcb = rxd->rxd_work_list[cur_head]; 1502 cur_desc->read.pkt_addr = 1503 CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address); 1504 cur_desc->read.hdr_addr = 0; 1505 1506 /* 1507 * Finally, update our loop invariants. 
1508 */ 1509 cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size); 1510 cur_desc = &rxd->rxd_desc_ring[cur_head]; 1511 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); 1512 1513 /* 1514 * To help provide liveness, we limit the amount of data that 1515 * we'll end up counting. Note that in these cases, an interrupt 1516 * is not dissimilar from a polling request. 1517 */ 1518 rx_frames++; 1519 if (rx_frames > i40e->i40e_rx_limit_per_intr) { 1520 itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++; 1521 break; 1522 } 1523 } 1524 1525 /* 1526 * As we've modified the ring, we need to make sure that we sync the 1527 * descriptor ring for the device. Next, we update the hardware and 1528 * update our notion of where the head for us to read from hardware is 1529 * next. 1530 */ 1531 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV); 1532 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != 1533 DDI_FM_OK) { 1534 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); 1535 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1536 } 1537 1538 if (rx_frames != 0) { 1539 uint32_t tail; 1540 ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle; 1541 rxd->rxd_desc_next = cur_head; 1542 tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size); 1543 1544 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail); 1545 if (i40e_check_acc_handle(rh) != DDI_FM_OK) { 1546 ddi_fm_service_impact(i40e->i40e_dip, 1547 DDI_SERVICE_DEGRADED); 1548 atomic_or_32(&i40e->i40e_state, I40E_ERROR); 1549 } 1550 1551 itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes; 1552 itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames; 1553 } 1554 1555 #ifdef DEBUG 1556 if (rx_frames == 0) { 1557 ASSERT(rx_bytes == 0); 1558 } 1559 #endif 1560 1561 return (mp_head); 1562 } 1563 1564 /* 1565 * This function is called by the GLDv3 when it wants to poll on a ring. The 1566 * only primary difference from when we call this during an interrupt is that we 1567 * have a limit on the number of bytes that we should consume. 1568 */ 1569 mblk_t * 1570 i40e_ring_rx_poll(void *arg, int poll_bytes) 1571 { 1572 i40e_trqpair_t *itrq = arg; 1573 mblk_t *mp; 1574 1575 ASSERT(poll_bytes > 0); 1576 if (poll_bytes == 0) 1577 return (NULL); 1578 1579 mutex_enter(&itrq->itrq_rx_lock); 1580 mp = i40e_ring_rx(itrq, poll_bytes); 1581 mutex_exit(&itrq->itrq_rx_lock); 1582 1583 return (mp); 1584 } 1585 1586 /* 1587 * This is a structure I wish someone would fill out for me for dorking with the 1588 * checksums. When we get some more experience with this, we should go ahead and 1589 * consider adding this to MAC. 1590 */ 1591 typedef enum mac_ether_offload_flags { 1592 MEOI_L2INFO_SET = 0x01, 1593 MEOI_VLAN_TAGGED = 0x02, 1594 MEOI_L3INFO_SET = 0x04, 1595 MEOI_L3CKSUM_SET = 0x08, 1596 MEOI_L4INFO_SET = 0x10, 1597 MEOI_L4CKSUM_SET = 0x20 1598 } mac_ether_offload_flags_t; 1599 1600 typedef struct mac_ether_offload_info { 1601 mac_ether_offload_flags_t meoi_flags; 1602 uint8_t meoi_l2hlen; /* How long is the Ethernet header? */ 1603 uint16_t meoi_l3proto; /* What's the Ethertype */ 1604 uint8_t meoi_l3hlen; /* How long is the header? */ 1605 uint8_t meoi_l4proto; /* What is the payload type? */ 1606 uint8_t meoi_l4hlen; /* How long is the L4 header */ 1607 mblk_t *meoi_l3ckmp; /* Which mblk has the l3 checksum */ 1608 off_t meoi_l3ckoff; /* What's the offset to it */ 1609 mblk_t *meoi_l4ckmp; /* Which mblk has the L4 checksum */ 1610 off_t meoi_l4off; /* What is the offset to it? 

/*
 * This is something that we'd like to make a general MAC function. Before we
 * do that, we should add support for TSO.
 *
 * We should really keep track of our offset and not walk everything every
 * time. I can't imagine that this will be kind to us at high packet rates;
 * however, for the moment, let's leave that.
 *
 * This code walks a message block chain, without pulling it up, to fill in the
 * context information. Note that the data we care about could be split across
 * more than one mblk_t.
 */
static int
i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
{
	size_t mpsize;
	uint8_t *bp;

	mpsize = msgsize(mp);
	/* Check for overflow */
	if (off + sizeof (uint8_t) > mpsize)
		return (-1);

	mpsize = MBLKL(mp);
	while (off >= mpsize) {
		mp = mp->b_cont;
		off -= mpsize;
		mpsize = MBLKL(mp);
	}

	bp = mp->b_rptr + off;
	*out = *bp;
	return (0);
}

static int
i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
{
	size_t mpsize;
	uint8_t *bp;

	mpsize = msgsize(mp);
	/* Check for overflow */
	if (off + sizeof (uint16_t) > mpsize)
		return (-1);

	mpsize = MBLKL(mp);
	while (off >= mpsize) {
		mp = mp->b_cont;
		off -= mpsize;
		mpsize = MBLKL(mp);
	}

	/*
	 * Data is in network order. Note that the second byte of data might be
	 * in the next mblk.
	 */
	bp = mp->b_rptr + off;
	*out = *bp << 8;
	if (off + 1 == mpsize) {
		mp = mp->b_cont;
		bp = mp->b_rptr;
	} else {
		bp++;
	}

	*out |= *bp;
	return (0);
}
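
/*
 * A brief illustration of the split-read handling above: suppose the Ethertype
 * lives at offset 12, but the first mblk_t happens to hold only 13 bytes of
 * the frame. The walk leaves off pointing at the last byte of that mblk, which
 * supplies the high-order byte of *out; since off + 1 then equals mpsize, the
 * low-order byte is taken from b_rptr of the next mblk in the b_cont chain.
 * The 13-byte split is contrived, but it is exactly the case that the
 * off + 1 == mpsize test exists for.
 */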

static int
mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
{
	size_t off;
	uint16_t ether;
	uint8_t ipproto, iplen, l4len, maclen;

	bzero(meoi, sizeof (mac_ether_offload_info_t));

	off = offsetof(struct ether_header, ether_type);
	if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
		return (-1);

	if (ether == ETHERTYPE_VLAN) {
		off = offsetof(struct ether_vlan_header, ether_type);
		if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
			return (-1);
		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
		maclen = sizeof (struct ether_vlan_header);
	} else {
		maclen = sizeof (struct ether_header);
	}
	meoi->meoi_flags |= MEOI_L2INFO_SET;
	meoi->meoi_l2hlen = maclen;
	meoi->meoi_l3proto = ether;

	switch (ether) {
	case ETHERTYPE_IP:
		/*
		 * For IPv4 we need to get the length of the header, as it can
		 * be variable.
		 */
		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
		if (i40e_meoi_get_uint8(mp, off, &iplen) != 0)
			return (-1);
		iplen &= 0x0f;
		if (iplen < 5 || iplen > 0x0f)
			return (-1);
		iplen *= 4;
		off = offsetof(ipha_t, ipha_protocol) + maclen;
		if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
			return (-1);
		break;
	case ETHERTYPE_IPV6:
		iplen = 40;
		off = offsetof(ip6_t, ip6_nxt) + maclen;
		if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
			return (-1);
		break;
	default:
		return (0);
	}
	meoi->meoi_l3hlen = iplen;
	meoi->meoi_l4proto = ipproto;
	meoi->meoi_flags |= MEOI_L3INFO_SET;

	switch (ipproto) {
	case IPPROTO_TCP:
		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
		if (i40e_meoi_get_uint8(mp, off, &l4len) == -1)
			return (-1);
		l4len = (l4len & 0xf0) >> 4;
		if (l4len < 5 || l4len > 0xf)
			return (-1);
		l4len *= 4;
		break;
	case IPPROTO_UDP:
		l4len = sizeof (struct udphdr);
		break;
	case IPPROTO_SCTP:
		l4len = sizeof (sctp_hdr_t);
		break;
	default:
		return (0);
	}

	meoi->meoi_l4hlen = l4len;
	meoi->meoi_flags |= MEOI_L4INFO_SET;
	return (0);
}
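
/*
 * To illustrate the parse above with a VLAN-tagged TCP/IPv4 frame that has no
 * IP or TCP options: the first Ethertype read returns ETHERTYPE_VLAN, so we
 * re-read the Ethertype at the ether_vlan_header offset and maclen becomes 18;
 * the IPv4 header-length nibble is 5, so iplen becomes 20; and a TCP data
 * offset of 5 yields an l4len of 20. An IPv6/UDP frame would instead produce
 * an iplen of 40 (the fixed IPv6 header; extension headers are not walked
 * here) and an l4len of sizeof (struct udphdr).
 */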

/*
 * Attempt to put together the information we'll need to feed into a descriptor
 * to properly program the hardware for checksum offload as well as the
 * generally required flags.
 *
 * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to OR
 * into the descriptor based on the checksum flags for this mblk_t and the
 * actual information we care about.
 */
static int
i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
    i40e_tx_context_t *tctx)
{
	int ret;
	uint32_t flags, start;
	mac_ether_offload_info_t meo;
	i40e_txq_stat_t *txs = &itrq->itrq_txstat;

	bzero(tctx, sizeof (i40e_tx_context_t));

	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
		return (0);

	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
	if (flags == 0)
		return (0);

	if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
		txs->itxs_hck_meoifail.value.ui64++;
		return (ret);
	}

	/*
	 * Have we been asked to checksum an IPv4 header? If so, verify that we
	 * have sufficient information and then set the proper fields in the
	 * command structure.
	 */
	if (flags & HCK_IPV4_HDRCKSUM) {
		if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
			txs->itxs_hck_nol2info.value.ui64++;
			return (-1);
		}
		if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
			txs->itxs_hck_nol3info.value.ui64++;
			return (-1);
		}
		if (meo.meoi_l3proto != ETHERTYPE_IP) {
			txs->itxs_hck_badl3.value.ui64++;
			return (-1);
		}
		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
		tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
		tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
	}

	/*
	 * If we've been asked to provide an L4 checksum, then first set up the
	 * IP information in the descriptor if we haven't already done so,
	 * before moving on to checking whether we have enough information for
	 * the L4 checksum offload.
	 */
	if (flags & HCK_PARTIALCKSUM) {
		if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
			txs->itxs_hck_nol4info.value.ui64++;
			return (-1);
		}

		if (!(flags & HCK_IPV4_HDRCKSUM)) {
			if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
				txs->itxs_hck_nol2info.value.ui64++;
				return (-1);
			}
			if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
				txs->itxs_hck_nol3info.value.ui64++;
				return (-1);
			}

			if (meo.meoi_l3proto == ETHERTYPE_IP) {
				tctx->itc_cmdflags |=
				    I40E_TX_DESC_CMD_IIPT_IPV4;
			} else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
				tctx->itc_cmdflags |=
				    I40E_TX_DESC_CMD_IIPT_IPV6;
			} else {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
			tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
			tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
		}

		switch (meo.meoi_l4proto) {
		case IPPROTO_TCP:
			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
			break;
		case IPPROTO_UDP:
			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
			break;
		case IPPROTO_SCTP:
			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
			break;
		default:
			txs->itxs_hck_badl4.value.ui64++;
			return (-1);
		}

		tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
		    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
	}

	return (0);
}
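
/*
 * For concreteness, these are the offsets the code above would program for an
 * untagged TCP/IPv4 frame with no IP or TCP options (meoi_l2hlen of 14,
 * meoi_l3hlen of 20, meoi_l4hlen of 20). The descriptor expresses these
 * lengths in the units implied by the shifts used above:
 *
 *	MACLEN:	14 >> 1 = 7	(units of 2 bytes)
 *	IPLEN:	20 >> 2 = 5	(units of 4 bytes)
 *	L4LEN:	20 >> 2 = 5	(units of 4 bytes)
 *
 * each of which is then shifted into place in itc_offsets via the
 * corresponding I40E_TX_DESC_LENGTH_* definition.
 */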

static void
i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
{
	ASSERT(tcb != NULL);

	mutex_enter(&itrq->itrq_tcb_lock);
	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
	itrq->itrq_tcb_free++;
	mutex_exit(&itrq->itrq_tcb_lock);
}

static i40e_tx_control_block_t *
i40e_tcb_alloc(i40e_trqpair_t *itrq)
{
	i40e_tx_control_block_t *ret;

	mutex_enter(&itrq->itrq_tcb_lock);
	if (itrq->itrq_tcb_free == 0) {
		mutex_exit(&itrq->itrq_tcb_lock);
		return (NULL);
	}

	itrq->itrq_tcb_free--;
	ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
	mutex_exit(&itrq->itrq_tcb_lock);

	ASSERT(ret != NULL);
	return (ret);
}

/*
 * This should be used to free any DMA resources, associated mblk_t's, etc.
 * It's used as part of recycling the message blocks when we have either an
 * interrupt or other activity that indicates that we need to take a look.
 */
static void
i40e_tcb_reset(i40e_tx_control_block_t *tcb)
{
	switch (tcb->tcb_type) {
	case I40E_TX_COPY:
		tcb->tcb_dma.dmab_len = 0;
		break;
	case I40E_TX_DMA:
		(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
		break;
	case I40E_TX_NONE:
		/* Cast to pacify lint */
		panic("trying to free tcb %p with bad type none", (void *)tcb);
	default:
		panic("unknown i40e tcb type: %d", tcb->tcb_type);
	}

	tcb->tcb_type = I40E_TX_NONE;
	freemsg(tcb->tcb_mp);
	tcb->tcb_mp = NULL;
	tcb->tcb_next = NULL;
}

/*
 * This is called as part of shutting down to clean up all outstanding
 * descriptors. Similar to recycle, except we don't re-arm anything and instead
 * just return control blocks to the free list.
 */
void
i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
{
	uint32_t index;

	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);

	/*
	 * Because we should have shut down the chip at this point, it should
	 * be safe to just clean up all the entries between our head and tail.
	 */
#ifdef DEBUG
	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
	    I40E_QTX_ENA(itrq->itrq_index));
	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
	    I40E_QTX_ENA_QENA_STAT_MASK));
#endif

	index = itrq->itrq_desc_head;
	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
		i40e_tx_control_block_t *tcb;

		tcb = itrq->itrq_tcb_work_list[index];
		VERIFY(tcb != NULL);
		itrq->itrq_tcb_work_list[index] = NULL;
		i40e_tcb_reset(tcb);
		i40e_tcb_free(itrq, tcb);

		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
		itrq->itrq_desc_free++;
	}

	ASSERT(index == itrq->itrq_desc_tail);
	itrq->itrq_desc_head = index;
}
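
/*
 * A quick sketch of the cleanup walk above, assuming that i40e_next_desc()
 * advances an index modulo the ring size: starting at itrq_desc_head (say
 * 1020 on a 1024-entry ring), the loop visits 1020, 1021, 1022, 1023, 0, 1,
 * and so on, returning each control block it finds to the free list, until
 * itrq_desc_free reaches the ring size; at that point the index has come back
 * around to itrq_desc_tail, which the ASSERT after the loop checks. The work
 * list is indexed the same way as the descriptor ring, which is why a single
 * index is enough to find both a descriptor and its control block.
 */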

/*
 * We're here either by hook or by crook. We need to see if there are transmit
 * descriptors available for us to go and clean up and return to the hardware.
 * We may also be blocked, and if so, we should make sure that we let MAC know
 * that we're good to go again.
 */
void
i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
{
	uint32_t wbhead, toclean, count;
	i40e_tx_control_block_t *tcbhead;
	i40e_t *i40e = itrq->itrq_i40e;

	mutex_enter(&itrq->itrq_tx_lock);

	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
		if (itrq->itrq_tx_blocked == B_TRUE) {
			itrq->itrq_tx_blocked = B_FALSE;
			mac_tx_ring_update(i40e->i40e_mac_hdl,
			    itrq->itrq_mactxring);
			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
		}
		mutex_exit(&itrq->itrq_tx_lock);
		return;
	}

	/*
	 * Now we need to try and see if there's anything available. The
	 * hardware writes the ring's head back to this memory location, and it
	 * guarantees that the write does not use relaxed ordering.
	 */
	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
	    (uintptr_t)itrq->itrq_desc_wbhead,
	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));

	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
	    DDI_FM_OK) {
		mutex_exit(&itrq->itrq_tx_lock);
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
		return;
	}

	wbhead = *itrq->itrq_desc_wbhead;
	toclean = itrq->itrq_desc_head;
	count = 0;
	tcbhead = NULL;

	while (toclean != wbhead) {
		i40e_tx_control_block_t *tcb;

		tcb = itrq->itrq_tcb_work_list[toclean];
		itrq->itrq_tcb_work_list[toclean] = NULL;
		ASSERT(tcb != NULL);
		tcb->tcb_next = tcbhead;
		tcbhead = tcb;

		/*
		 * We zero this out for sanity purposes.
		 */
		bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
		toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
		count++;
	}

	itrq->itrq_desc_head = wbhead;
	itrq->itrq_desc_free += count;
	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);

	if (itrq->itrq_tx_blocked == B_TRUE &&
	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
		itrq->itrq_tx_blocked = B_FALSE;

		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
	}

	mutex_exit(&itrq->itrq_tx_lock);

	/*
	 * Now clean up the tcbs.
	 */
	while (tcbhead != NULL) {
		i40e_tx_control_block_t *tcb = tcbhead;

		tcbhead = tcb->tcb_next;
		i40e_tcb_reset(tcb);
		i40e_tcb_free(itrq, tcb);
	}

	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
}
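
/*
 * To illustrate the recycle arithmetic above with made-up indices: if
 * itrq_desc_head were 1020 on a 1024-entry ring and the hardware's write-back
 * head read 4, the loop would reclaim the control blocks at indices 1020
 * through 1023 and 0 through 3, for a count of 8, chaining them through
 * tcb_next so that the actual i40e_tcb_reset() and i40e_tcb_free() work
 * happens only after itrq_tx_lock has been dropped.
 */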

/*
 * We've been asked to send a message block on the wire. We'll only have a
 * single chain. There will not be any b_next pointers; however, there may be
 * multiple b_cont blocks.
 *
 * We may do one of three things with any given mblk_t chain:
 *
 *	1) Drop it
 *	2) Transmit it
 *	3) Return it
 *
 * If we return it to MAC, then MAC will flow control on our behalf. In other
 * words, it won't send us anything until we tell it that it's okay to send us
 * something.
 */
mblk_t *
i40e_ring_tx(void *arg, mblk_t *mp)
{
	const mblk_t *nmp;
	size_t mpsize;
	i40e_tx_control_block_t *tcb;
	i40e_tx_desc_t *txdesc;
	i40e_tx_context_t tctx;
	int cmd, type;

	i40e_trqpair_t *itrq = arg;
	i40e_t *i40e = itrq->itrq_i40e;
	i40e_hw_t *hw = &i40e->i40e_hw_space;
	i40e_txq_stat_t *txs = &itrq->itrq_txstat;

	ASSERT(mp->b_next == NULL);

	if (!(i40e->i40e_state & I40E_STARTED) ||
	    (i40e->i40e_state & I40E_OVERTEMP) ||
	    (i40e->i40e_state & I40E_SUSPENDED) ||
	    (i40e->i40e_state & I40E_ERROR) ||
	    (i40e->i40e_link_state != LINK_STATE_UP)) {
		freemsg(mp);
		return (NULL);
	}

	/*
	 * Figure out the relevant context about this frame that we might need
	 * for enabling checksum, LSO, etc. This also fills in information that
	 * we might set around the packet type, etc.
	 */
	if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
		freemsg(mp);
		itrq->itrq_txstat.itxs_err_context.value.ui64++;
		return (NULL);
	}

	/*
	 * For the primordial driver we can punt on doing any recycling right
	 * now; however, longer term we probably need to do some more proactive
	 * recycling to cut back on stalls in the tx path.
	 */

	/*
	 * Do a quick size check to make sure the frame fits into what we think
	 * it should for this device. Note that longer term this will be false,
	 * particularly when we have the world of TSO.
	 */
	mpsize = 0;
	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		mpsize += MBLKL(nmp);
	}

	/*
	 * First we allocate our tx control block and prepare the packet for
	 * transmit before we do a final check for descriptors. We do it this
	 * way to minimize the time under the tx lock.
	 */
	tcb = i40e_tcb_alloc(itrq);
	if (tcb == NULL) {
		txs->itxs_err_notcb.value.ui64++;
		goto txfail;
	}

	/*
	 * For transmitting a block, we're currently going to use just a
	 * single control block and bcopy all of the fragments into it. We
	 * should be more intelligent about doing DMA binding or otherwise, but
	 * for getting off the ground this will have to do.
	 */
	ASSERT(tcb->tcb_dma.dmab_len == 0);
	ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
		size_t clen = MBLKL(nmp);
		void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;

		bcopy(nmp->b_rptr, coff, clen);
		tcb->tcb_dma.dmab_len += clen;
	}
	ASSERT(tcb->tcb_dma.dmab_len == mpsize);

	/*
	 * There's really no need to keep the mp here, but let's just do it to
	 * help with our own debugging for now.
	 */
	tcb->tcb_mp = mp;
	tcb->tcb_type = I40E_TX_COPY;
	I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);

	mutex_enter(&itrq->itrq_tx_lock);
	if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
		txs->itxs_err_nodescs.value.ui64++;
		mutex_exit(&itrq->itrq_tx_lock);
		goto txfail;
	}

	/*
	 * Build up the descriptor and send it out. Thankfully at the moment
	 * we only need a single desc, because we're not doing anything fancy
	 * yet.
	 */
	ASSERT(itrq->itrq_desc_free > 0);
	itrq->itrq_desc_free--;
	txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
	itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
	itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
	    itrq->itrq_tx_ring_size);

	/*
	 * Note, we always set EOP and RS, which indicate that this is the last
	 * descriptor of the frame and that we want its completion status
	 * reported back to us. We also must always set ICRC, because that is
	 * an internal bit that must be set to one for data descriptors. The
	 * remaining bits in the command descriptor depend on checksumming and
	 * are determined based on the information set up in i40e_tx_context().
	 */
	type = I40E_TX_DESC_DTYPE_DATA;
	cmd = I40E_TX_DESC_CMD_EOP |
	    I40E_TX_DESC_CMD_RS |
	    I40E_TX_DESC_CMD_ICRC |
	    tctx.itc_cmdflags;
	txdesc->buffer_addr =
	    CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
	txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
	    ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
	    ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));

	/*
	 * Now, finally, sync the DMA data and alert hardware.
	 */
	I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);

	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
	    itrq->itrq_desc_tail);
	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
	    DDI_FM_OK) {
		/*
		 * Note, we can't really go through and clean this up very
		 * well, because the memory has been given to the device, so
		 * just indicate that it's been transmitted.
		 */
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
	}

	txs->itxs_bytes.value.ui64 += mpsize;
	txs->itxs_packets.value.ui64++;
	txs->itxs_descriptors.value.ui64++;

	mutex_exit(&itrq->itrq_tx_lock);

	return (NULL);

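	/*
	 * The failure path below is the other half of the flow-control
	 * handshake with MAC: by returning the original mblk_t and setting
	 * itrq_tx_blocked, we tell MAC to stop sending us work. It is then
	 * i40e_tx_recycle_ring()'s job to notice that descriptors have freed
	 * up, clear itrq_tx_blocked, and call mac_tx_ring_update() so that
	 * MAC resumes handing us packets for this ring.
	 */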
txfail:
	/*
	 * We ran out of resources. Return the mblk_t to MAC and mark ourselves
	 * blocked so that we remember to signal MAC once resources free up
	 * again. If we allocated a tcb, return it now; make sure to clear its
	 * message block pointer first, since the mblk_t itself is going back
	 * to MAC rather than being freed.
	 */
	if (tcb != NULL) {
		tcb->tcb_mp = NULL;
		i40e_tcb_reset(tcb);
		i40e_tcb_free(itrq, tcb);
	}

	mutex_enter(&itrq->itrq_tx_lock);
	itrq->itrq_tx_blocked = B_TRUE;
	mutex_exit(&itrq->itrq_tx_lock);

	return (mp);
}