xref: /illumos-gate/usr/src/uts/common/io/i40e/i40e_transceiver.c (revision a2876d03ca2556102e024ae4a50bb4db8fe562b0)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14  * Copyright 2019 Joyent, Inc.
15  */
16 
17 #include "i40e_sw.h"
18 
19 /*
20  * ---------------------------------------------------------
21  * Buffer and Memory Management, Receiving, and Transmitting
22  * ---------------------------------------------------------
23  *
24  * Each physical function (PF), which is what we think of as an instance of the
25  * device driver, has a series of associated transmit and receive queue pairs.
26  * Effectively, what we think of in MAC as rings. Each of these has their own
27  * ring of descriptors which is used as part of doing DMA activity.
28  *
29  * The transmit ring of descriptors are 16-byte entries which are used to send
30  * packets, program filters, etc. The receive ring of descriptors are either
31  * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
32  * format so that we're in a better position if we ever want to leverage that
33  * information later on.
34  *
35  * However, these rings are just for descriptors, they don't talk or deal with
36  * how we actually store the memory that we need for DMA or the associated
37  * information that we need for keeping track of message blocks. To correspond
38  * to the hardware descriptor ring which is how we communicate with hardware, we
39  * introduce a control block which keeps track of our required metadata like DMA
40  * mappings.
41  *
42  * There are two main considerations that dictate how much memory and buffers
43  * we end up allocating. Those are:
44  *
45  *   o The size of the ring (controlled through the driver.conf file)
46  *
47  *   o The maximum size frame we can receive.
48  *
49  * The size of the rings currently defaults to 1024 descriptors and is stored in
50  * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
51  *
52  * While the size of the rings is controlled by the driver.conf, the maximum
53  * size frame is informed primarily through the use of dladm and the setting of
54  * the MTU property on the device. From the MTU, we then go and do some
55  * machinations. The first thing we do is we then have to add in space for the
56  * Ethernet header, potentially a VLAN header, and the FCS check. This value is
57  * what's stored as i40e_t`i40e_frame_max and is derived any time
58  * i40e_t`i40e_sdu changes.
59  *
60  * This size is then rounded up to the nearest 1k chunk, which represents the
61  * actual amount of memory that we'll allocate for a single frame.
62  *
63  * Note, that for RX, we do something that might be unexpected. We always add
64  * an extra two bytes to the frame size that we allocate. We then offset the DMA
65  * address that we receive a packet into by two bytes. This ensures that the IP
66  * header will always be 4 byte aligned because the MAC header is either 14 or
67  * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
68  * and MAC's lives easier.
69  *
70  * Both the RX and TX descriptor rings (which are what we use to communicate
71  * with hardware) are allocated as a single region of DMA memory which is the
72  * size of the descriptor (32 bytes and 16 bytes respectively) times the total
73  * number of descriptors for an RX and TX ring.
74  *
75  * While the RX and TX descriptors are allocated using DMA-based memory, the
76  * control blocks for each of them are allocated using normal kernel memory.
77  * They aren't special from a DMA perspective. We'll go over the design of both
78  * receiving and transmitting separately, as they have slightly different
79  * control blocks and different ways that we manage the relationship between
80  * control blocks and descriptors.
81  *
82  * ---------------------------------
83  * RX Descriptors and Control Blocks
84  * ---------------------------------
85  *
86  * For every descriptor in the ring that the driver has, we need some associated
87  * memory, which means that we need to have the receive specific control block.
88  * We have a couple different, but related goals:
89  *
90  *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
91  *     not want to do any additional memory allocations or DMA allocations if
92  *     we don't have to.
93  *
94  *   o We'd like to try and do as much zero-copy as possible, while taking into
95  *     account the cost of mapping in DMA resources.
96  *
97  *   o We'd like to have every receive descriptor available.
98  *
99  * Now, these rules are a bit in tension with one another. The act of mapping in
100  * is an exercise of trying to find the break-even point between page table
101  * updates and bcopy. We currently start by using the same metrics that ixgbe
102  * used; however, it should be known that this value has effectively been
103  * cargo-culted across to yet another driver, sorry.
104  *
105  * If we receive a packet which is larger than our copy threshold, we'll create
106  * a message block out of the DMA memory via desballoc(9F) and send that up to
107  * MAC that way. This will cause us to be notified when the message block is
108  * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
109  * it's less than the threshold, we'll try to use allocb and bcopy it into the
110  * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
111  * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
112  * the behavior and always do a bcopy or a DMA bind.
113  *
114  * To try and ensure that the device always has blocks that it can receive data
115  * into, we maintain two lists of control blocks, a working list and a free
116  * list. Each list is sized equal to the number of descriptors in the RX ring.
117  * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
118  * equal to twice the number of descriptors in the ring and we assign them
119  * equally to the free list and to the working list. Each control block also has
120  * DMA memory allocated and associated with it, which will be used to receive
121  * the actual packet data. All of a received frame's data will end up in a
122  * single DMA buffer.
123  *
124  * During operation, we always maintain the invariant that each RX descriptor
125  * has an associated RX control block which lives in the working list. If we
126  * feel that we should loan up DMA memory to MAC in the form of a message block,
127  * we can only do so if we can maintain this invariant. To do that, we swap in
128  * one of the buffers from the free list. If none are available, then we resort
129  * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
130  * size.
131  *
132  * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
133  * called on the block, at which point we restore the RX control block to the
134  * free list and are able to reuse the DMA memory again. While the scheme may
135  * seem odd, it importantly keeps us out of trying to do any DMA allocations in
136  * the normal path of operation, even though we may still have to allocate
137  * message blocks and copy.
138  *
139  * The following state machine describes the lifetime of an RX control block. In
140  * the diagram we abbreviate the RX ring descriptor entry as rxd and the rx
141  * control block entry as rcb.
142  *
143  *             |                                   |
144  *             * ... 1/2 of all initial rcb's  ... *
145  *             |                                   |
146  *             v                                   v
147  *     +------------------+               +------------------+
148  *     | rcb on free list |---*---------->| rcb on work list |
149  *     +------------------+   .           +------------------+
150  *             ^              . moved to          |
151  *             |                replace rcb       * . . Frame received,
152  *             |                loaned to         |     entry on free list
153  *             |                MAC + co.         |     available. rcb's
154  *             |                                  |     memory made into mblk_t
155  *             * . freemsg(9F)                    |     and sent up to MAC.
156  *             |   called on                      |
157  *             |   loaned rcb                     |
158  *             |   and it is                      v
159  *             |   recycled.              +-------------------+
160  *             +--------------------<-----| rcb loaned to MAC |
161  *                                        +-------------------+
162  *
163  * Finally, note that every RX control block has a reference count on it. One
164  * reference is added as long as the driver has had the GLDv3 mc_start endpoint
165  * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
166  * no other DLPI consumers remain, then we'll decrement the reference count by
167  * one. Whenever we loan up the RX control block and associated buffer to MAC,
168  * then we bump the reference count again. Even though the device is stopped,
169  * there may still be loaned frames in upper levels that we'll want to account
170  * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
171  * that it is cleaned up.
172  *
173  * --------------------
174  * Managing the RX Ring
175  * --------------------
176  *
177  * The receive ring descriptors are arranged in a circular buffer with a head
178  * and tail pointer. There are both the conventional head and tail pointers
179  * which are used to partition the ring into two portions, a portion that we,
180  * the operating system, manage and a portion that is managed by hardware. When
181  * hardware owns a descriptor in the ring, it means that it is waiting for data
182  * to be filled in. However, when a portion of the ring is owned by the driver,
183  * then that means that the descriptor has been consumed and we need to go take
184  * a look at it.
185  *
186  * The initial head is configured to be zero by writing it as such in the
187  * receive queue context in the FPM (function private memory from the host). The
188  * initial tail is written to be the last descriptor. This is written to via the
189  * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
190  * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
191  * the only values we ever consult ourselves are the TAIL register and our own
192  * state tracking. Effectively, we cache the HEAD register and then update it
193  * ourselves based on our work.
194  *
195  * When we iterate over the RX descriptors and thus the received frames, we are
196  * either in an interrupt context or we've been asked by MAC to poll on the
197  * ring. If we've been asked to poll on the ring, we have a maximum number of
198  * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
199  * exceed that count, then we do not process it. When in interrupt context, we
200  * don't have a strict byte count. However, to ensure liveness, we limit the
201  * amount of data based on a configuration value
202  * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
203  * is based on similar numbers that are used for ixgbe. After some additional
204  * time in the field, we'll have a sense as to whether or not it should be
205  * changed.
206  *
207  * When processing, we start at our own HEAD pointer
208  * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
209  * processing. Every RX descriptor has what's described as the DD bit. This bit
210  * (the LSB of the second 8-byte word), indicates whether or not the descriptor
211  * is done.  When we give descriptors to the hardware, this value is always
212  * zero. When the hardware has finished a descriptor, it will always be one.
213  *
214  * The first thing that we check is whether the DD bit indicates that the
215  * current HEAD is ready. If it isn't, then we're done. That's the primary
216  * invariant of processing a frame. If it's done, then there are a few other
217  * things that we want to look at. In the same status word as the DD bit, there
218  * are two other important bits:
219  *
220  *   o End of Packet (EOP)
221  *   o Error bits
222  *
223  * The end of packet indicates that we have reached the last descriptor. Now,
224  * you might ask when would there be more than one descriptor. The reason for
225  * that might be due to large receive offload (lro) or header splitting
226  * functionality, which presently isn't supported in the driver. The error bits
227  * in the frame are only valid when EOP is set.
228  *
229  * If error bits are set on the frame, then we still consume it; however, we
230  * will not generate an mblk_t to send up to MAC. If there are no error bits
231  * set, then we'll consume the descriptor either using bcopy or DMA binding. See
232  * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
233  * on how that selection is made.
234  *
235  * Regardless of whether we construct an mblk_t or encounter an error, we end up
236  * resetting the descriptor. This re-arms the descriptor for hardware and in the
237  * process, we may end up assigning it a new receive control block. After we do
238  * this, we always update our HEAD pointer, no matter what.
239  *
240  * Finally, once we've consumed as much as we will in a given window, we go and
241  * update the TAIL register to indicate all the frames we've consumed. We only
242  * do a single bulk write for the ring.
243  *
244  * ---------------------------------
245  * TX Descriptors and Control Blocks
246  * ---------------------------------
247  *
248  * While the transmit path is similar in spirit to the receive path, it works
249  * differently due to the fact that all data is originated by the operating
250  * system and not by the device.
251  *
252  * Like RX, there is both a descriptor ring that we use to communicate with the
253  * device and which points to the memory used to transmit a frame.  Similarly,
254  * there is a corresponding transmit control block, however, the correspondence
255  * between descriptors and control blocks is more complex and not necessarily
256  * 1-to-1.
257  *
258  * The driver is asked to process a single frame at a time. That message block
259  * may be made up of multiple fragments linked together by the mblk_t`b_cont
260  * member. The device has a hard limit of up to 8 buffers being allowed for use
261  * for a single non-LSO packet or LSO segment. The number of TX ring entries
262  * (and thus TX control blocks) used depends on the fragment sizes and DMA
263  * layout, as explained below.
264  *
265  * We alter our DMA strategy based on a threshold tied to the fragment size.
266  * This threshold is configurable via the tx_dma_threshold property. If the
267  * fragment is above the threshold, we DMA bind it -- consuming one TCB and
268  * potentially several data descriptors. The exact number of descriptors (equal
269  * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
270  * into page, b_wptr offset into page, and the physical layout of the dblk's
271  * memory (contiguous or not). Essentially, we are at the mercy of the DMA
272  * engine and the dblk's memory allocation. Knowing the exact number of
273  * descriptors up front is a task best not taken on by the driver itself.
274  * Instead, we attempt to DMA bind the fragment and verify the descriptor
275  * layout meets hardware constraints. If the proposed DMA bind does not satisfy
276  * the hardware constraints, then we discard it and instead copy the entire
277  * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
278  * larger than the TCB buffer).
279  *
280  * If the fragment is below or at the threshold, we copy it to the pre-allocated
281  * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
282  * conserve resources. We are guaranteed that the TCB buffer is made up of only
283  * 1 DMA cookie; and therefore consumes only one descriptor on the controller.
284  *
285  * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
286  * filtering, then the TX data descriptors must be preceded by a single TX
287  * context descriptor.  Because there is no DMA transfer associated with the
288  * context descriptor, we allocate a control block with a special type which
289  * indicates to the TX ring recycle code that there are no associated DMA
290  * resources to unbind when the control block is free'd.
291  *
292  * If we don't have enough space in the ring or TX control blocks available,
293  * then we'll return the unprocessed message block to MAC. This will induce flow
294  * control and once we recycle enough entries, we'll once again enable sending
295  * on the ring.
296  *
297  * We size the working list as equal to the number of descriptors in the ring.
298  * We size the free list as equal to 1.5 times the number of descriptors in the
299  * ring. We'll allocate a number of TX control block entries equal to the number
300  * of entries in the free list. By default, all entries are placed in the free
301  * list. As we come along and try to send something, we'll allocate entries from
302  * the free list and add them to the working list, where they'll stay until the
303  * hardware indicates that all of the data has been written back to us. The
304  * reason that we start with 1.5x is to help facilitate having more than one TX
305  * buffer associated with the DMA activity.
306  *
307  * --------------------
308  * Managing the TX Ring
309  * --------------------
310  *
311  * The transmit descriptor ring is driven by us. We maintain our own notion of a
312  * HEAD and TAIL register and we update the hardware with updates to the TAIL
313  * register. When the hardware is done writing out data, it updates us by
314  * writing back to a specific address, not by updating the individual
315  * descriptors. That address is a 4-byte region after the main transmit
316  * descriptor ring. This is why the descriptor ring has an extra descriptor's
317  * worth allocated to it.
318  *
319  * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
320  * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
321  * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
322  * points in time, through both interrupts, and our own internal checks, we'll
323  * sync the write-back head portion of the DMA space. Based on the index it
324  * reports back, we'll free everything between our current HEAD and the
325  * indicated index and update HEAD to the new index.
326  *
327  * When a frame comes in, we try to use a number of transmit control blocks and
328  * we'll transition them from the free list to the work list. They'll get moved
329  * to the entry on the work list that corresponds with the transmit descriptor
330  * they correspond to. Once hardware indicates that the corresponding descriptor
331  * has been freed, we'll return it to the free list.
332  *
333  * The transmit control block free list is managed by keeping track of the
334  * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
335  * index into the free list and add things to it. In effect, we always push and
336  * pop from the tail and protect it with a single lock,
337  * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
338  * stand up to further performance testing; however, it does allow us to get off
339  * the ground with the device driver.
340  *
341  * The following image describes where a given transmit control block lives in
342  * its lifetime:
343  *
344  *             |
345  *             * ... Initial placement for all tcb's
346  *             |
347  *             v
348  *    +------------------+                       +------------------+
349  *    | tcb on free list |---*------------------>| tcb on work list |
350  *    +------------------+   .                   +------------------+
351  *             ^             . N tcbs allocated[1]         |
352  *             |               to send frame               v
353  *             |               or fragment on              |
354  *             |               wire, mblk from             |
355  *             |               MAC associated.             |
356  *             |                                           |
357  *             +------*-------------------------------<----+
358  *                    .
359  *                    . Hardware indicates
360  *                      entry transmitted.
361  *                      tcbs recycled, mblk
362  *                      from MAC freed.
363  *
364  * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
365  *     descriptor plus 1 data descriptor, in the non-DMA-bind case.  In the DMA
366  *     bind case, N can be 1 context descriptor plus 1 data descriptor per
367  *     b_cont in the mblk.  In this case, the mblk is associated with the first
368  *     data descriptor and freed as part of freeing that data descriptor.
369  *
370  * ------------
371  * Blocking MAC
372  * ------------
373  *
374  * When performing transmit, we can run out of descriptors and ring entries.
375  * When such a case happens, we return the mblk_t to MAC to indicate that we've
376  * been blocked. At that point in time, MAC becomes blocked and will not
377  * transmit anything out that specific ring until we notify MAC. To indicate
378  * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
379  * to B_TRUE.
380  *
381  * When we recycle TX descriptors then we'll end up signaling MAC by calling
382  * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
383  * start sending frames out to us again.
384  */
385 
/*
 * We set our DMA alignment requests based on the smallest supported page size
 * of the corresponding platform: 0x2000 (8 KiB) when building for sparc and
 * 0x1000 (4 KiB) when building for x86. Building for any other architecture
 * fails with an #error rather than silently picking a value.
 */
#if	defined(__sparc)
#define	I40E_DMA_ALIGNMENT 0x2000ull
#elif defined(__x86)
#define	I40E_DMA_ALIGNMENT 0x1000ull
#else
#error	"unknown architecture for i40e"
#endif
397 
/*
 * This structure is used to maintain information and flags related to
 * transmitting a frame.  These fields are ultimately used to construct the
 * TX data descriptor(s) and, if necessary, the TX context descriptor. The
 * itc_data_* members feed the data descriptor(s) and the itc_ctx_* members
 * feed the context descriptor.
 */
typedef struct i40e_tx_context {
	/* Command bits for the TX data descriptor(s). */
	enum i40e_tx_desc_cmd_bits	itc_data_cmdflags;
	/* NOTE(review): presumably the data descriptor offset field; confirm. */
	uint32_t			itc_data_offsets;
	/* Command bits for the TX context descriptor. */
	enum i40e_tx_ctx_desc_cmd_bits	itc_ctx_cmdflags;
	/* NOTE(review): presumably the LSO payload length; confirm at the use. */
	uint32_t			itc_ctx_tsolen;
	/* NOTE(review): presumably the LSO maximum segment size; confirm. */
	uint32_t			itc_ctx_mss;
} i40e_tx_context_t;
410 
/*
 * Toggles on debug builds which can be used to override our RX behaviour based
 * on thresholds. As described in the theory statement at the top of this file,
 * i40e_debug_rx_mode may be changed to force the RX path to always bcopy or to
 * always DMA bind. Both the type and the variable only exist when DEBUG is
 * defined.
 */
#ifdef	DEBUG
typedef enum {
	/* No override: leave the normal threshold-based choice in place. */
	I40E_DEBUG_RX_DEFAULT	= 0,
	/* Always copy received data (bcopy). */
	I40E_DEBUG_RX_BCOPY	= 1,
	/* Always DMA bind (loan the buffer up). */
	I40E_DEBUG_RX_DMABIND	= 2
} i40e_debug_rx_t;

/* Not static and not const, so it can be changed on a running debug build. */
i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* DEBUG */
424 
425 /*
426  * Notes on the following sets of DMA attributes. The first attribute,
427  * i40e_static_dma_attr, is designed to be used for both the descriptor rings
428  * and the static buffers that we associate with control blocks. For this
429  * reason, we force an SGL length of one. While technically the driver supports
430  * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
431  * management here. In addition, when the Intel common code wants to allocate
432  * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
433  * the static dma attr.
434  *
435  * The latter two sets of attributes are what we use when we're binding a
436  * bunch of mblk_t fragments to go out the door. Note that the main difference
437  * here is that we're allowed a larger SGL length.  For non-LSO TX, we
438  * restrict the SGL length to match the number of TX buffers available to the
439  * PF (8).  For the LSO case we can go much larger, with the caveat that each
440  * MSS-sized chunk (segment) must not span more than 8 data descriptors and
441  * hence must not span more than 8 cookies.
442  *
443  * Note, we default to setting ourselves to be DMA capable here. However,
444  * because we could have multiple instances which have different FMA error
445  * checking capabilities, or end up on different buses, we make these static
446  * and const and copy them into the i40e_t for the given device with the actual
447  * values that reflect the actual capabilities.
448  */
449 static const ddi_dma_attr_t i40e_g_static_dma_attr = {
450 	DMA_ATTR_V0,			/* version number */
451 	0x0000000000000000ull,		/* low address */
452 	0xFFFFFFFFFFFFFFFFull,		/* high address */
453 	0x00000000FFFFFFFFull,		/* dma counter max */
454 	I40E_DMA_ALIGNMENT,		/* alignment */
455 	0x00000FFF,			/* burst sizes */
456 	0x00000001,			/* minimum transfer size */
457 	0x00000000FFFFFFFFull,		/* maximum transfer size */
458 	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
459 	1,				/* scatter/gather list length */
460 	0x00000001,			/* granularity */
461 	DDI_DMA_FLAGERR			/* DMA flags */
462 };
463 
464 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
465 	DMA_ATTR_V0,			/* version number */
466 	0x0000000000000000ull,		/* low address */
467 	0xFFFFFFFFFFFFFFFFull,		/* high address */
468 	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
469 	I40E_DMA_ALIGNMENT,		/* alignment */
470 	0x00000FFF,			/* burst sizes */
471 	0x00000001,			/* minimum transfer size */
472 	0x00000000FFFFFFFFull,		/* maximum transfer size */
473 	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size	 */
474 	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
475 	0x00000001,			/* granularity */
476 	DDI_DMA_FLAGERR			/* DMA flags */
477 };
478 
479 static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
480 	DMA_ATTR_V0,			/* version number */
481 	0x0000000000000000ull,		/* low address */
482 	0xFFFFFFFFFFFFFFFFull,		/* high address */
483 	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
484 	I40E_DMA_ALIGNMENT,		/* alignment */
485 	0x00000FFF,			/* burst sizes */
486 	0x00000001,			/* minimum transfer size */
487 	0x00000000FFFFFFFFull,		/* maximum transfer size */
488 	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size	 */
489 	I40E_TX_LSO_MAX_COOKIE,		/* scatter/gather list length */
490 	0x00000001,			/* granularity */
491 	DDI_DMA_FLAGERR			/* DMA flags */
492 };
493 
494 /*
495  * Next, we have the attributes for these structures. The descriptor rings are
496  * all strictly little endian, while the data buffers are just arrays of bytes
497  * representing frames. Because of this, we purposefully simplify the driver
498  * programming life by programming the descriptor ring as little endian, while
499  * for the buffer data we keep it as unstructured.
500  *
501  * Note, that to keep the Intel common code operating in a reasonable way, when
502  * we allocate DMA memory for it, we do not use byte swapping and thus use the
503  * standard i40e_buf_acc_attr.
504  */
505 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
506 	DDI_DEVICE_ATTR_V0,
507 	DDI_STRUCTURE_LE_ACC,
508 	DDI_STRICTORDER_ACC
509 };
510 
511 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
512 	DDI_DEVICE_ATTR_V0,
513 	DDI_NEVERSWAP_ACC,
514 	DDI_STRICTORDER_ACC
515 };
516 
/*
 * The next two functions are type-safe replacements for the macros that used
 * to advance and rewind a descriptor index within a ring. They are marked
 * inline to keep the data path hot, matching how the macros behaved.
 *
 * i40e_next_desc() returns the index that is count entries after base in a
 * ring of size entries, wrapping past the end of the ring at most once.
 */
static inline int
i40e_next_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	/* Step forward, then fold back into the ring if we ran off the end. */
	idx = base + count;
	if (idx >= size)
		idx -= size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
541 
/*
 * Return the index that is count entries before base in a ring of size
 * entries, wrapping past the start of the ring at most once.
 */
static inline int
i40e_prev_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	/* Step backward, then fold back into the ring if we went negative. */
	idx = base - count;
	if (idx < 0)
		idx += size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
560 
561 /*
562  * Free DMA memory that is represented by a i40e_dma_buffer_t.
563  */
564 static void
565 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
566 {
567 	if (dmap->dmab_dma_address != 0) {
568 		VERIFY(dmap->dmab_dma_handle != NULL);
569 		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
570 		dmap->dmab_dma_address = 0;
571 		dmap->dmab_size = 0;
572 	}
573 
574 	if (dmap->dmab_acc_handle != NULL) {
575 		ddi_dma_mem_free(&dmap->dmab_acc_handle);
576 		dmap->dmab_acc_handle = NULL;
577 		dmap->dmab_address = NULL;
578 	}
579 
580 	if (dmap->dmab_dma_handle != NULL) {
581 		ddi_dma_free_handle(&dmap->dmab_dma_handle);
582 		dmap->dmab_dma_handle = NULL;
583 	}
584 
585 	/*
586 	 * These should only be set if we have valid handles allocated and
587 	 * therefore should always be NULLed out due to the above code. This
588 	 * is here to catch us acting sloppy.
589 	 */
590 	ASSERT(dmap->dmab_dma_address == 0);
591 	ASSERT(dmap->dmab_address == NULL);
592 	ASSERT(dmap->dmab_size == 0);
593 	dmap->dmab_len = 0;
594 }
595 
596 /*
597  * Allocate size bytes of DMA memory based on the passed in attributes. This
598  * fills in the information in dmap and is designed for all of our single cookie
599  * allocations.
600  */
601 static boolean_t
602 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
603     ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
604     boolean_t zero, size_t size)
605 {
606 	int ret;
607 	uint_t flags;
608 	size_t len;
609 	ddi_dma_cookie_t cookie;
610 	uint_t ncookies;
611 
612 	if (stream == B_TRUE)
613 		flags = DDI_DMA_STREAMING;
614 	else
615 		flags = DDI_DMA_CONSISTENT;
616 
617 	/*
618 	 * Step one: Allocate the DMA handle
619 	 */
620 	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
621 	    NULL, &dmap->dmab_dma_handle);
622 	if (ret != DDI_SUCCESS) {
623 		i40e_error(i40e, "failed to allocate dma handle for I/O "
624 		    "buffers: %d", ret);
625 		dmap->dmab_dma_handle = NULL;
626 		return (B_FALSE);
627 	}
628 
629 	/*
630 	 * Step two: Allocate the DMA memory
631 	 */
632 	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
633 	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
634 	    &dmap->dmab_acc_handle);
635 	if (ret != DDI_SUCCESS) {
636 		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
637 		    "buffers", size);
638 		dmap->dmab_address = NULL;
639 		dmap->dmab_acc_handle = NULL;
640 		i40e_free_dma_buffer(dmap);
641 		return (B_FALSE);
642 	}
643 
644 	/*
645 	 * Step three: Optionally zero
646 	 */
647 	if (zero == B_TRUE)
648 		bzero(dmap->dmab_address, len);
649 
650 	/*
651 	 * Step four: Bind the memory
652 	 */
653 	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
654 	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
655 	    NULL, &cookie, &ncookies);
656 	if (ret != DDI_DMA_MAPPED) {
657 		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
658 		    "buffers: %d", size, ret);
659 		i40e_free_dma_buffer(dmap);
660 		return (B_FALSE);
661 	}
662 
663 	VERIFY(ncookies == 1);
664 	dmap->dmab_dma_address = cookie.dmac_laddress;
665 	dmap->dmab_size = len;
666 	dmap->dmab_len = 0;
667 	return (B_TRUE);
668 }
669 
670 /*
671  * This function is called once the last pending rcb has been freed by the upper
672  * levels of the system.
673  */
674 static void
675 i40e_free_rx_data(i40e_rx_data_t *rxd)
676 {
677 	VERIFY(rxd->rxd_rcb_pending == 0);
678 
679 	if (rxd->rxd_rcb_area != NULL) {
680 		kmem_free(rxd->rxd_rcb_area,
681 		    sizeof (i40e_rx_control_block_t) *
682 		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
683 		rxd->rxd_rcb_area = NULL;
684 	}
685 
686 	if (rxd->rxd_free_list != NULL) {
687 		kmem_free(rxd->rxd_free_list,
688 		    sizeof (i40e_rx_control_block_t *) *
689 		    rxd->rxd_free_list_size);
690 		rxd->rxd_free_list = NULL;
691 	}
692 
693 	if (rxd->rxd_work_list != NULL) {
694 		kmem_free(rxd->rxd_work_list,
695 		    sizeof (i40e_rx_control_block_t *) *
696 		    rxd->rxd_ring_size);
697 		rxd->rxd_work_list = NULL;
698 	}
699 
700 	kmem_free(rxd, sizeof (i40e_rx_data_t));
701 }
702 
703 static boolean_t
704 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
705 {
706 	i40e_rx_data_t *rxd;
707 
708 	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
709 	if (rxd == NULL)
710 		return (B_FALSE);
711 	itrq->itrq_rxdata = rxd;
712 	rxd->rxd_i40e = i40e;
713 
714 	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
715 	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
716 
717 	rxd->rxd_rcb_free = rxd->rxd_free_list_size;
718 
719 	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
720 	    rxd->rxd_ring_size, KM_NOSLEEP);
721 	if (rxd->rxd_work_list == NULL) {
722 		i40e_error(i40e, "failed to allocate RX work list for a ring "
723 		    "of %d entries for ring %d", rxd->rxd_ring_size,
724 		    itrq->itrq_index);
725 		goto cleanup;
726 	}
727 
728 	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
729 	    rxd->rxd_free_list_size, KM_NOSLEEP);
730 	if (rxd->rxd_free_list == NULL) {
731 		i40e_error(i40e, "failed to allocate a %d entry RX free list "
732 		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
733 		goto cleanup;
734 	}
735 
736 	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
737 	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
738 	if (rxd->rxd_rcb_area == NULL) {
739 		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
740 		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
741 		    itrq->itrq_index);
742 		goto cleanup;
743 	}
744 
745 	return (B_TRUE);
746 
747 cleanup:
748 	i40e_free_rx_data(rxd);
749 	itrq->itrq_rxdata = NULL;
750 	return (B_FALSE);
751 }
752 
753 /*
754  * Free all of the memory that we've allocated for DMA. Note that we may have
755  * buffers that we've loaned up to the OS which are still outstanding. We'll
756  * always free up the descriptor ring, because we no longer need that. For each
757  * rcb, we'll iterate over it and if we send the reference count to zero, then
758  * we'll free the message block and DMA related resources. However, if we don't
759  * take the last one, then we'll go ahead and keep track that we'll have pending
760  * data and clean it up when we get there.
761  */
762 static void
763 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
764 {
765 	uint32_t i, count, ref;
766 
767 	i40e_rx_control_block_t *rcb;
768 	i40e_t *i40e = rxd->rxd_i40e;
769 
770 	i40e_free_dma_buffer(&rxd->rxd_desc_area);
771 	rxd->rxd_desc_ring = NULL;
772 	rxd->rxd_desc_next = 0;
773 
774 	mutex_enter(&i40e->i40e_rx_pending_lock);
775 
776 	rcb = rxd->rxd_rcb_area;
777 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
778 
779 	for (i = 0; i < count; i++, rcb++) {
780 		VERIFY(rcb != NULL);
781 
782 		/*
783 		 * If we're cleaning up from a failed creation attempt, then an
784 		 * entry may never have been assembled which would mean that
785 		 * it's reference count is zero. If we find that, we leave it
786 		 * be, because nothing else should be modifying it at this
787 		 * point. We're not at the point that any more references can be
788 		 * added, just removed.
789 		 */
790 		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
791 			continue;
792 
793 		ref = atomic_dec_32_nv(&rcb->rcb_ref);
794 		if (ref == 0) {
795 			freemsg(rcb->rcb_mp);
796 			rcb->rcb_mp = NULL;
797 			i40e_free_dma_buffer(&rcb->rcb_dma);
798 		} else {
799 			atomic_inc_32(&rxd->rxd_rcb_pending);
800 			atomic_inc_32(&i40e->i40e_rx_pending);
801 		}
802 	}
803 	mutex_exit(&i40e->i40e_rx_pending_lock);
804 }
805 
806 /*
807  * Initialize the DMA memory for the descriptor ring and for each frame in the
808  * control block list.
809  */
810 static boolean_t
811 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
812 {
813 	int i, count;
814 	size_t dmasz;
815 	i40e_rx_control_block_t *rcb;
816 	i40e_t *i40e = rxd->rxd_i40e;
817 
818 	/*
819 	 * First allocate the RX descriptor ring.
820 	 */
821 	dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
822 	VERIFY(dmasz > 0);
823 	if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
824 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
825 	    B_TRUE, dmasz) == B_FALSE) {
826 		i40e_error(i40e, "failed to allocate DMA resources "
827 		    "for RX descriptor ring");
828 		return (B_FALSE);
829 	}
830 	rxd->rxd_desc_ring =
831 	    (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
832 	rxd->rxd_desc_next = 0;
833 
834 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
835 	rcb = rxd->rxd_rcb_area;
836 
837 	dmasz = i40e->i40e_rx_buf_size;
838 	VERIFY(dmasz > 0);
839 	for (i = 0; i < count; i++, rcb++) {
840 		i40e_dma_buffer_t *dmap;
841 		VERIFY(rcb != NULL);
842 
843 		if (i < rxd->rxd_ring_size) {
844 			rxd->rxd_work_list[i] = rcb;
845 		} else {
846 			rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
847 		}
848 
849 		dmap = &rcb->rcb_dma;
850 		if (i40e_alloc_dma_buffer(i40e, dmap,
851 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
852 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
853 			i40e_error(i40e, "failed to allocate RX dma buffer");
854 			return (B_FALSE);
855 		}
856 
857 		/*
858 		 * Initialize the control block and offset the DMA address. See
859 		 * the note in the big theory statement that explains how this
860 		 * helps IP deal with alignment. Note, we don't worry about
861 		 * whether or not we successfully get an mblk_t from desballoc,
862 		 * it's a common case that we have to handle later on in the
863 		 * system.
864 		 */
865 		dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
866 		dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
867 		dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
868 
869 		rcb->rcb_ref = 1;
870 		rcb->rcb_rxd = rxd;
871 		rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
872 		rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
873 		rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
874 		    dmap->dmab_size, 0, &rcb->rcb_free_rtn);
875 	}
876 
877 	return (B_TRUE);
878 }
879 
880 static void
881 i40e_free_tx_dma(i40e_trqpair_t *itrq)
882 {
883 	size_t fsz;
884 
885 	if (itrq->itrq_tcb_area != NULL) {
886 		uint32_t i;
887 		i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
888 
889 		for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
890 			i40e_free_dma_buffer(&tcb->tcb_dma);
891 			if (tcb->tcb_dma_handle != NULL) {
892 				ddi_dma_free_handle(&tcb->tcb_dma_handle);
893 				tcb->tcb_dma_handle = NULL;
894 			}
895 			if (tcb->tcb_lso_dma_handle != NULL) {
896 				ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
897 				tcb->tcb_lso_dma_handle = NULL;
898 			}
899 		}
900 
901 		fsz = sizeof (i40e_tx_control_block_t) *
902 		    itrq->itrq_tx_free_list_size;
903 		kmem_free(itrq->itrq_tcb_area, fsz);
904 		itrq->itrq_tcb_area = NULL;
905 	}
906 
907 	if (itrq->itrq_tcb_free_list != NULL) {
908 		fsz = sizeof (i40e_tx_control_block_t *) *
909 		    itrq->itrq_tx_free_list_size;
910 		kmem_free(itrq->itrq_tcb_free_list, fsz);
911 		itrq->itrq_tcb_free_list = NULL;
912 	}
913 
914 	if (itrq->itrq_tcb_work_list != NULL) {
915 		fsz = sizeof (i40e_tx_control_block_t *) *
916 		    itrq->itrq_tx_ring_size;
917 		kmem_free(itrq->itrq_tcb_work_list, fsz);
918 		itrq->itrq_tcb_work_list = NULL;
919 	}
920 
921 	i40e_free_dma_buffer(&itrq->itrq_desc_area);
922 	itrq->itrq_desc_ring = NULL;
923 
924 }
925 
926 static boolean_t
927 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
928 {
929 	int i, ret;
930 	size_t dmasz;
931 	i40e_tx_control_block_t *tcb;
932 	i40e_t *i40e = itrq->itrq_i40e;
933 
934 	itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
935 	itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
936 	    (i40e->i40e_tx_ring_size >> 1);
937 
938 	/*
939 	 * Allocate an additional TX descriptor for the writeback head.
940 	 */
941 	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
942 	dmasz += sizeof (i40e_tx_desc_t);
943 
944 	VERIFY(dmasz > 0);
945 	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
946 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
947 	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
948 		i40e_error(i40e, "failed to allocate DMA resources for TX "
949 		    "descriptor ring");
950 		return (B_FALSE);
951 	}
952 	itrq->itrq_desc_ring =
953 	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
954 	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
955 	    itrq->itrq_tx_ring_size);
956 	itrq->itrq_desc_head = 0;
957 	itrq->itrq_desc_tail = 0;
958 	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
959 
960 	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
961 	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
962 	if (itrq->itrq_tcb_work_list == NULL) {
963 		i40e_error(i40e, "failed to allocate a %d entry TX work list "
964 		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
965 		goto cleanup;
966 	}
967 
968 	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
969 	    sizeof (i40e_tx_control_block_t *), KM_SLEEP);
970 	if (itrq->itrq_tcb_free_list == NULL) {
971 		i40e_error(i40e, "failed to allocate a %d entry TX free list "
972 		    "for ring %d", itrq->itrq_tx_free_list_size,
973 		    itrq->itrq_index);
974 		goto cleanup;
975 	}
976 
977 	/*
978 	 * We allocate enough TX control blocks to cover the free list.
979 	 */
980 	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
981 	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
982 	if (itrq->itrq_tcb_area == NULL) {
983 		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
984 		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
985 		goto cleanup;
986 	}
987 
988 	/*
989 	 * For each tcb, allocate DMA memory.
990 	 */
991 	dmasz = i40e->i40e_tx_buf_size;
992 	VERIFY(dmasz > 0);
993 	tcb = itrq->itrq_tcb_area;
994 	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
995 		VERIFY(tcb != NULL);
996 
997 		/*
998 		 * Allocate both a DMA buffer which we'll use for when we copy
999 		 * packets for transmission and allocate a DMA handle which
1000 		 * we'll use when we bind data.
1001 		 */
1002 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1003 		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
1004 		    &tcb->tcb_dma_handle);
1005 		if (ret != DDI_SUCCESS) {
1006 			i40e_error(i40e, "failed to allocate DMA handle for TX "
1007 			    "data binding on ring %d: %d", itrq->itrq_index,
1008 			    ret);
1009 			tcb->tcb_dma_handle = NULL;
1010 			goto cleanup;
1011 		}
1012 
1013 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1014 		    &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
1015 		    &tcb->tcb_lso_dma_handle);
1016 		if (ret != DDI_SUCCESS) {
1017 			i40e_error(i40e, "failed to allocate DMA handle for TX "
1018 			    "LSO data binding on ring %d: %d", itrq->itrq_index,
1019 			    ret);
1020 			tcb->tcb_lso_dma_handle = NULL;
1021 			goto cleanup;
1022 		}
1023 
1024 		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
1025 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
1026 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
1027 			i40e_error(i40e, "failed to allocate %ld bytes of "
1028 			    "DMA for TX data binding on ring %d", dmasz,
1029 			    itrq->itrq_index);
1030 			goto cleanup;
1031 		}
1032 
1033 		itrq->itrq_tcb_free_list[i] = tcb;
1034 	}
1035 
1036 	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
1037 
1038 	return (B_TRUE);
1039 
1040 cleanup:
1041 	i40e_free_tx_dma(itrq);
1042 	return (B_FALSE);
1043 }
1044 
1045 /*
1046  * Free all memory associated with all of the rings on this i40e instance. Note,
1047  * this is done as part of the GLDv3 stop routine.
1048  */
1049 void
1050 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
1051 {
1052 	int i;
1053 
1054 	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1055 		i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
1056 
1057 		/*
1058 		 * In some cases i40e_alloc_rx_data() may have failed
1059 		 * and in that case there is no rxd to free.
1060 		 */
1061 		if (rxd == NULL)
1062 			continue;
1063 
1064 		/*
1065 		 * Clean up our RX data. We have to free DMA resources first and
1066 		 * then if we have no more pending RCB's, then we'll go ahead
1067 		 * and clean things up. Note, we can't set the stopped flag on
1068 		 * the RX data until after we've done the first pass of the
1069 		 * pending resources. Otherwise we might race with
1070 		 * i40e_rx_recycle on determining who should free the
1071 		 * i40e_rx_data_t above.
1072 		 */
1073 		i40e_free_rx_dma(rxd, failed_init);
1074 
1075 		mutex_enter(&i40e->i40e_rx_pending_lock);
1076 		rxd->rxd_shutdown = B_TRUE;
1077 		if (rxd->rxd_rcb_pending == 0) {
1078 			i40e_free_rx_data(rxd);
1079 			i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
1080 		}
1081 		mutex_exit(&i40e->i40e_rx_pending_lock);
1082 
1083 		i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
1084 	}
1085 }
1086 
1087 /*
1088  * Allocate all of the resources associated with all of the rings on this i40e
1089  * instance. Note this is done as part of the GLDv3 start routine and thus we
1090  * should not use blocking allocations. This takes care of both DMA and non-DMA
1091  * related resources.
1092  */
1093 boolean_t
1094 i40e_alloc_ring_mem(i40e_t *i40e)
1095 {
1096 	int i;
1097 
1098 	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1099 		if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) ==
1100 		    B_FALSE)
1101 			goto unwind;
1102 
1103 		if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) ==
1104 		    B_FALSE)
1105 			goto unwind;
1106 
1107 		if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE)
1108 			goto unwind;
1109 	}
1110 
1111 	return (B_TRUE);
1112 
1113 unwind:
1114 	i40e_free_ring_mem(i40e, B_TRUE);
1115 	return (B_FALSE);
1116 }
1117 
1118 
1119 /*
1120  * Because every instance of i40e may have different support for FMA
1121  * capabilities, we copy the DMA attributes into the i40e_t and set them that
1122  * way and use them for determining attributes.
1123  */
1124 void
1125 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1126 {
1127 	bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1128 	    sizeof (ddi_dma_attr_t));
1129 	bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1130 	    sizeof (ddi_dma_attr_t));
1131 	bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
1132 	    sizeof (ddi_dma_attr_t));
1133 	bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1134 	    sizeof (ddi_device_acc_attr_t));
1135 	bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1136 	    sizeof (ddi_device_acc_attr_t));
1137 
1138 	if (fma == B_TRUE) {
1139 		i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1140 		i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1141 		i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
1142 		    DDI_DMA_FLAGERR;
1143 	} else {
1144 		i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1145 		i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1146 		i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
1147 		    ~DDI_DMA_FLAGERR;
1148 	}
1149 }
1150 
1151 static void
1152 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1153 {
1154 	mutex_enter(&rxd->rxd_free_lock);
1155 	ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1156 	ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1157 	rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1158 	rxd->rxd_rcb_free++;
1159 	mutex_exit(&rxd->rxd_free_lock);
1160 }
1161 
1162 static i40e_rx_control_block_t *
1163 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1164 {
1165 	i40e_rx_control_block_t *rcb;
1166 
1167 	mutex_enter(&rxd->rxd_free_lock);
1168 	if (rxd->rxd_rcb_free == 0) {
1169 		mutex_exit(&rxd->rxd_free_lock);
1170 		return (NULL);
1171 	}
1172 	rxd->rxd_rcb_free--;
1173 	rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1174 	VERIFY(rcb != NULL);
1175 	rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1176 	mutex_exit(&rxd->rxd_free_lock);
1177 
1178 	return (rcb);
1179 }
1180 
1181 /*
1182  * This is the callback that we get from the OS when freemsg(9F) has been called
1183  * on a loaned descriptor. In addition, if we take the last reference count
1184  * here, then we have to tear down all of the RX data.
1185  */
1186 void
1187 i40e_rx_recycle(caddr_t arg)
1188 {
1189 	uint32_t ref;
1190 	i40e_rx_control_block_t *rcb;
1191 	i40e_rx_data_t *rxd;
1192 	i40e_t *i40e;
1193 
1194 	/* LINTED: E_BAD_PTR_CAST_ALIGN */
1195 	rcb = (i40e_rx_control_block_t *)arg;
1196 	rxd = rcb->rcb_rxd;
1197 	i40e = rxd->rxd_i40e;
1198 
1199 	/*
1200 	 * It's possible for this to be called with a reference count of zero.
1201 	 * That will happen when we're doing the freemsg after taking the last
1202 	 * reference because we're tearing down everything and this rcb is not
1203 	 * outstanding.
1204 	 */
1205 	if (rcb->rcb_ref == 0)
1206 		return;
1207 
1208 	/*
1209 	 * Don't worry about failure of desballoc here. It'll only become fatal
1210 	 * if we're trying to use it and we can't in i40e_rx_bind().
1211 	 */
1212 	rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1213 	    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1214 	i40e_rcb_free(rxd, rcb);
1215 
1216 	/*
1217 	 * It's possible that the rcb was being used while we are shutting down
1218 	 * the device. In that case, we'll take the final reference from the
1219 	 * device here.
1220 	 */
1221 	ref = atomic_dec_32_nv(&rcb->rcb_ref);
1222 	if (ref == 0) {
1223 		freemsg(rcb->rcb_mp);
1224 		rcb->rcb_mp = NULL;
1225 		i40e_free_dma_buffer(&rcb->rcb_dma);
1226 
1227 		mutex_enter(&i40e->i40e_rx_pending_lock);
1228 		atomic_dec_32(&rxd->rxd_rcb_pending);
1229 		atomic_dec_32(&i40e->i40e_rx_pending);
1230 
1231 		/*
1232 		 * If this was the last block and it's been indicated that we've
1233 		 * passed the shutdown point, we should clean up.
1234 		 */
1235 		if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
1236 			i40e_free_rx_data(rxd);
1237 			cv_broadcast(&i40e->i40e_rx_pending_cv);
1238 		}
1239 
1240 		mutex_exit(&i40e->i40e_rx_pending_lock);
1241 	}
1242 }
1243 
1244 static mblk_t *
1245 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1246     uint32_t plen)
1247 {
1248 	mblk_t *mp;
1249 	i40e_t *i40e = rxd->rxd_i40e;
1250 	i40e_rx_control_block_t *rcb, *rep_rcb;
1251 
1252 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1253 
1254 	if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
1255 		itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
1256 		return (NULL);
1257 	}
1258 
1259 	rcb = rxd->rxd_work_list[index];
1260 
1261 	/*
1262 	 * Check to make sure we have a mblk_t. If we don't, this is our last
1263 	 * chance to try and get one.
1264 	 */
1265 	if (rcb->rcb_mp == NULL) {
1266 		rcb->rcb_mp =
1267 		    desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1268 		    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1269 		if (rcb->rcb_mp == NULL) {
1270 			itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
1271 			i40e_rcb_free(rxd, rcb);
1272 			return (NULL);
1273 		}
1274 	}
1275 
1276 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1277 
1278 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1279 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1280 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1281 		i40e_rcb_free(rxd, rcb);
1282 		return (NULL);
1283 	}
1284 
1285 	/*
1286 	 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
1287 	 */
1288 	mp = rcb->rcb_mp;
1289 	atomic_inc_32(&rcb->rcb_ref);
1290 	mp->b_wptr = mp->b_rptr + plen;
1291 	mp->b_next = mp->b_cont = NULL;
1292 
1293 	rxd->rxd_work_list[index] = rep_rcb;
1294 	return (mp);
1295 }
1296 
1297 /*
1298  * We're going to allocate a new message block for this frame and attempt to
1299  * receive it. See the big theory statement for more information on when we copy
1300  * versus bind.
1301  */
1302 static mblk_t *
1303 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1304     uint32_t plen)
1305 {
1306 	i40e_t *i40e = rxd->rxd_i40e;
1307 	i40e_rx_control_block_t *rcb;
1308 	mblk_t *mp;
1309 
1310 	ASSERT(index < rxd->rxd_ring_size);
1311 	rcb = rxd->rxd_work_list[index];
1312 
1313 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1314 
1315 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1316 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1317 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1318 		return (NULL);
1319 	}
1320 
1321 	mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
1322 	if (mp == NULL) {
1323 		itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
1324 		return (NULL);
1325 	}
1326 
1327 	mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
1328 	bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
1329 	mp->b_wptr = mp->b_rptr + plen;
1330 
1331 	return (mp);
1332 }
1333 
1334 /*
1335  * Determine if the device has enabled any checksum flags for us. The level of
1336  * checksum computed will depend on the type packet that we have, which is
1337  * contained in ptype. For example, the checksum logic it does will vary
1338  * depending on whether or not the packet is considered tunneled, whether it
1339  * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
1340  * valid.
1341  *
1342  * While there are additional checksums that we could recognize here, we'll need
1343  * to get some additional GLDv3 enhancements to be able to properly describe
1344  * them.
1345  */
1346 static void
1347 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
1348     uint32_t ptype)
1349 {
1350 	uint32_t cksum;
1351 	struct i40e_rx_ptype_decoded pinfo;
1352 
1353 	ASSERT(ptype <= 255);
1354 	pinfo = decode_rx_desc_ptype(ptype);
1355 
1356 	cksum = 0;
1357 
1358 	/*
1359 	 * If the ptype isn't something that we know in the driver, then we
1360 	 * shouldn't even consider moving forward.
1361 	 */
1362 	if (pinfo.known == 0) {
1363 		itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
1364 		return;
1365 	}
1366 
1367 	/*
1368 	 * If hardware didn't set the L3L4P bit on the frame, then there is no
1369 	 * checksum offload to consider.
1370 	 */
1371 	if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
1372 		itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
1373 		return;
1374 	}
1375 
1376 	/*
1377 	 * The device tells us that IPv6 checksums where a Destination Options
1378 	 * Header or a Routing header shouldn't be trusted. Discard all
1379 	 * checksums in this case.
1380 	 */
1381 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1382 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
1383 	    (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
1384 		itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
1385 		return;
1386 	}
1387 
1388 	/*
1389 	 * The hardware denotes three kinds of possible errors. Two are reserved
1390 	 * for inner and outer IP checksum errors (IPE and EIPE) and the latter
1391 	 * is for L4 checksum errors (L4E). If there is only one IP header, then
1392 	 * the only thing that we care about is IPE. Note that since we don't
1393 	 * support inner checksums, we will ignore IPE being set on tunneled
1394 	 * packets and only care about EIPE.
1395 	 */
1396 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1397 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1398 		if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
1399 			if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
1400 				itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
1401 			} else {
1402 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1403 				cksum |= HCK_IPV4_HDRCKSUM_OK;
1404 			}
1405 		} else {
1406 			if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
1407 				itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
1408 			} else {
1409 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1410 				cksum |= HCK_IPV4_HDRCKSUM_OK;
1411 			}
1412 		}
1413 	}
1414 
1415 	/*
1416 	 * We only have meaningful L4 checksums in the case of IP->L4 and
1417 	 * IP->IP->L4. There is not outer L4 checksum data available in any
1418 	 * other case. Further, we don't bother reporting the valid checksum in
1419 	 * the case of IP->IP->L4 set.
1420 	 */
1421 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1422 	    pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
1423 	    (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
1424 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
1425 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
1426 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
1427 		ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
1428 		if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
1429 			itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
1430 		} else {
1431 			itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
1432 			cksum |= HCK_FULLCKSUM_OK;
1433 		}
1434 	}
1435 
1436 	if (cksum != 0) {
1437 		itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
1438 		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
1439 	} else {
1440 		itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
1441 	}
1442 }
1443 
1444 mblk_t *
1445 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
1446 {
1447 	i40e_t *i40e;
1448 	i40e_hw_t *hw;
1449 	i40e_rx_data_t *rxd;
1450 	uint32_t cur_head;
1451 	i40e_rx_desc_t *cur_desc;
1452 	i40e_rx_control_block_t *rcb;
1453 	uint64_t rx_bytes, rx_frames;
1454 	uint64_t stword;
1455 	mblk_t *mp, *mp_head, **mp_tail;
1456 
1457 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1458 	rxd = itrq->itrq_rxdata;
1459 	i40e = itrq->itrq_i40e;
1460 	hw = &i40e->i40e_hw_space;
1461 
1462 	if (!(i40e->i40e_state & I40E_STARTED) ||
1463 	    (i40e->i40e_state & I40E_OVERTEMP) ||
1464 	    (i40e->i40e_state & I40E_SUSPENDED) ||
1465 	    (i40e->i40e_state & I40E_ERROR))
1466 		return (NULL);
1467 
1468 	/*
1469 	 * Before we do anything else, we have to make sure that all of the DMA
1470 	 * buffers are synced up and then check to make sure that they're
1471 	 * actually good from an FM perspective.
1472 	 */
1473 	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
1474 	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1475 	    DDI_FM_OK) {
1476 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1477 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1478 		return (NULL);
1479 	}
1480 
1481 	/*
1482 	 * Prepare our stats. We do a limited amount of processing in both
1483 	 * polling and interrupt context. The limit in interrupt context is
1484 	 * based on frames, in polling context based on bytes.
1485 	 */
1486 	rx_bytes = rx_frames = 0;
1487 	mp_head = NULL;
1488 	mp_tail = &mp_head;
1489 
1490 	/*
1491 	 * At this point, the descriptor ring is available to check. We'll try
1492 	 * and process until we either run out of poll_bytes or descriptors.
1493 	 */
1494 	cur_head = rxd->rxd_desc_next;
1495 	cur_desc = &rxd->rxd_desc_ring[cur_head];
1496 	stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1497 
1498 	/*
1499 	 * Note, the primary invariant of this loop should be that cur_head,
1500 	 * cur_desc, and stword always point to the currently processed
1501 	 * descriptor. When we leave the loop, it should point to a descriptor
1502 	 * that HAS NOT been processed. Meaning, that if we haven't consumed the
1503 	 * frame, the descriptor should not be advanced.
1504 	 */
1505 	while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
1506 		uint32_t error, eop, plen, ptype;
1507 
1508 		/*
1509 		 * The DD, PLEN, and EOP bits are the only ones that are valid
1510 		 * in every frame. The error information is only valid when EOP
1511 		 * is set in the same frame.
1512 		 *
1513 		 * At this time, because we don't do any LRO or header
1514 		 * splitting. We expect that every frame should have EOP set in
1515 		 * it. When later functionality comes in, we'll want to
1516 		 * re-evaluate this.
1517 		 */
1518 		eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
1519 		VERIFY(eop != 0);
1520 
1521 		error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
1522 		    I40E_RXD_QW1_ERROR_SHIFT;
1523 		if (error & I40E_RX_ERR_BITS) {
1524 			itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
1525 			goto discard;
1526 		}
1527 
1528 		plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1529 		    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1530 
1531 		ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
1532 		    I40E_RXD_QW1_PTYPE_SHIFT;
1533 
1534 		/*
1535 		 * This packet contains valid data. We should check to see if
1536 		 * we're actually going to consume it based on its length (to
1537 		 * ensure that we don't overshoot our quota). We determine
1538 		 * whether to bcopy or bind the DMA resources based on the size
1539 		 * of the frame. However, if on debug, we allow it to be
1540 		 * overridden for testing purposes.
1541 		 *
1542 		 * We should be smarter about this and do DMA binding for
1543 		 * larger frames, but for now, it's really more important that
1544 		 * we actually just get something simple working.
1545 		 */
1546 
1547 		/*
1548 		 * Ensure we don't exceed our polling quota by reading this
1549 		 * frame. Note we only bump bytes now, we bump frames later.
1550 		 */
1551 		if ((poll_bytes != I40E_POLL_NULL) &&
1552 		    (rx_bytes + plen) > poll_bytes)
1553 			break;
1554 		rx_bytes += plen;
1555 
1556 		mp = NULL;
1557 		if (plen >= i40e->i40e_rx_dma_min)
1558 			mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
1559 		if (mp == NULL)
1560 			mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
1561 
1562 		if (mp != NULL) {
1563 			if (i40e->i40e_rx_hcksum_enable)
1564 				i40e_rx_hcksum(itrq, mp, stword, error, ptype);
1565 			*mp_tail = mp;
1566 			mp_tail = &mp->b_next;
1567 		}
1568 
1569 		/*
1570 		 * Now we need to prepare this frame for use again. See the
1571 		 * discussion in the big theory statements.
1572 		 *
1573 		 * However, right now we're doing the simple version of this.
1574 		 * Normally what we'd do would depend on whether or not we were
1575 		 * doing DMA binding or bcopying. But because we're always doing
1576 		 * bcopying, we can just always use the current index as a key
1577 		 * for what to do and reassign the buffer based on the ring.
1578 		 */
1579 discard:
1580 		rcb = rxd->rxd_work_list[cur_head];
1581 		cur_desc->read.pkt_addr =
1582 		    CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
1583 		cur_desc->read.hdr_addr = 0;
1584 
1585 		/*
1586 		 * Finally, update our loop invariants.
1587 		 */
1588 		cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
1589 		cur_desc = &rxd->rxd_desc_ring[cur_head];
1590 		stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1591 
1592 		/*
1593 		 * To help provide liveness, we limit the amount of data that
1594 		 * we'll end up counting. Note that in these cases, an interrupt
1595 		 * is not dissimilar from a polling request.
1596 		 */
1597 		rx_frames++;
1598 		if (rx_frames > i40e->i40e_rx_limit_per_intr) {
1599 			itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
1600 			break;
1601 		}
1602 	}
1603 
1604 	/*
1605 	 * As we've modified the ring, we need to make sure that we sync the
1606 	 * descriptor ring for the device. Next, we update the hardware and
1607 	 * update our notion of where the head for us to read from hardware is
1608 	 * next.
1609 	 */
1610 	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
1611 	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1612 	    DDI_FM_OK) {
1613 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1614 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1615 	}
1616 
1617 	if (rx_frames != 0) {
1618 		uint32_t tail;
1619 		ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
1620 		rxd->rxd_desc_next = cur_head;
1621 		tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
1622 
1623 		I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
1624 		if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
1625 			ddi_fm_service_impact(i40e->i40e_dip,
1626 			    DDI_SERVICE_DEGRADED);
1627 			atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1628 		}
1629 
1630 		itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
1631 		itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
1632 	}
1633 
1634 #ifdef DEBUG
1635 	if (rx_frames == 0) {
1636 		ASSERT(rx_bytes == 0);
1637 	}
1638 #endif
1639 
1640 	return (mp_head);
1641 }
1642 
1643 /*
1644  * This function is called by the GLDv3 when it wants to poll on a ring. The
1645  * only primary difference from when we call this during an interrupt is that we
1646  * have a limit on the number of bytes that we should consume.
1647  */
1648 mblk_t *
1649 i40e_ring_rx_poll(void *arg, int poll_bytes)
1650 {
1651 	i40e_trqpair_t *itrq = arg;
1652 	mblk_t *mp;
1653 
1654 	ASSERT(poll_bytes > 0);
1655 	if (poll_bytes == 0)
1656 		return (NULL);
1657 
1658 	mutex_enter(&itrq->itrq_rx_lock);
1659 	mp = i40e_ring_rx(itrq, poll_bytes);
1660 	mutex_exit(&itrq->itrq_rx_lock);
1661 
1662 	return (mp);
1663 }
1664 
1665 /*
1666  * Attempt to put togther the information we'll need to feed into a descriptor
1667  * to properly program the hardware for checksum offload as well as the
1668  * generally required flags.
1669  *
1670  * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
1671  * 'or' into the descriptor based on the checksum flags for this mblk_t and the
1672  * actual information we care about.
1673  *
1674  * If the mblk requires LSO then we'll also gather the information that will be
1675  * used to construct the Transmit Context Descriptor.
1676  */
1677 static int
1678 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1679     mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
1680 {
1681 	uint32_t chkflags, start, mss, lsoflags;
1682 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1683 
1684 	bzero(tctx, sizeof (i40e_tx_context_t));
1685 
1686 	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1687 		return (0);
1688 
1689 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
1690 	mac_lso_get(mp, &mss, &lsoflags);
1691 
1692 	if (chkflags == 0 && lsoflags == 0)
1693 		return (0);
1694 
1695 	/*
1696 	 * Have we been asked to checksum an IPv4 header. If so, verify that we
1697 	 * have sufficient information and then set the proper fields in the
1698 	 * command structure.
1699 	 */
1700 	if (chkflags & HCK_IPV4_HDRCKSUM) {
1701 		if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1702 			txs->itxs_hck_nol2info.value.ui64++;
1703 			return (-1);
1704 		}
1705 		if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1706 			txs->itxs_hck_nol3info.value.ui64++;
1707 			return (-1);
1708 		}
1709 		if (meo->meoi_l3proto != ETHERTYPE_IP) {
1710 			txs->itxs_hck_badl3.value.ui64++;
1711 			return (-1);
1712 		}
1713 		tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1714 		tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1715 		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1716 		tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1717 		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1718 	}
1719 
1720 	/*
1721 	 * We've been asked to provide an L4 header, first, set up the IP
1722 	 * information in the descriptor if we haven't already before moving
1723 	 * onto seeing if we have enough information for the L4 checksum
1724 	 * offload.
1725 	 */
1726 	if (chkflags & HCK_PARTIALCKSUM) {
1727 		if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
1728 			txs->itxs_hck_nol4info.value.ui64++;
1729 			return (-1);
1730 		}
1731 
1732 		if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
1733 			if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1734 				txs->itxs_hck_nol2info.value.ui64++;
1735 				return (-1);
1736 			}
1737 			if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1738 				txs->itxs_hck_nol3info.value.ui64++;
1739 				return (-1);
1740 			}
1741 
1742 			if (meo->meoi_l3proto == ETHERTYPE_IP) {
1743 				tctx->itc_data_cmdflags |=
1744 				    I40E_TX_DESC_CMD_IIPT_IPV4;
1745 			} else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
1746 				tctx->itc_data_cmdflags |=
1747 				    I40E_TX_DESC_CMD_IIPT_IPV6;
1748 			} else {
1749 				txs->itxs_hck_badl3.value.ui64++;
1750 				return (-1);
1751 			}
1752 			tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1753 			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1754 			tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1755 			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1756 		}
1757 
1758 		switch (meo->meoi_l4proto) {
1759 		case IPPROTO_TCP:
1760 			tctx->itc_data_cmdflags |=
1761 			    I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1762 			break;
1763 		case IPPROTO_UDP:
1764 			tctx->itc_data_cmdflags |=
1765 			    I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1766 			break;
1767 		case IPPROTO_SCTP:
1768 			tctx->itc_data_cmdflags |=
1769 			    I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1770 			break;
1771 		default:
1772 			txs->itxs_hck_badl4.value.ui64++;
1773 			return (-1);
1774 		}
1775 
1776 		tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
1777 		    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1778 	}
1779 
1780 	if (lsoflags & HW_LSO) {
1781 		/*
1782 		 * LSO requires that checksum offloads are enabled.  If for
1783 		 * some reason they're not we bail out with an error.
1784 		 */
1785 		if ((meo->meoi_l3proto == ETHERTYPE_IP &&
1786 		    (chkflags & HCK_IPV4_HDRCKSUM) == 0) ||
1787 		    (chkflags & HCK_PARTIALCKSUM) == 0) {
1788 			txs->itxs_lso_nohck.value.ui64++;
1789 			return (-1);
1790 		}
1791 
1792 		tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
1793 		tctx->itc_ctx_mss = mss;
1794 		tctx->itc_ctx_tsolen = msgsize(mp) -
1795 		    (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
1796 	}
1797 
1798 	return (0);
1799 }
1800 
1801 static void
1802 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1803 {
1804 	ASSERT(tcb != NULL);
1805 
1806 	mutex_enter(&itrq->itrq_tcb_lock);
1807 	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1808 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1809 	itrq->itrq_tcb_free++;
1810 	mutex_exit(&itrq->itrq_tcb_lock);
1811 }
1812 
1813 static i40e_tx_control_block_t *
1814 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1815 {
1816 	i40e_tx_control_block_t *ret;
1817 
1818 	mutex_enter(&itrq->itrq_tcb_lock);
1819 	if (itrq->itrq_tcb_free == 0) {
1820 		mutex_exit(&itrq->itrq_tcb_lock);
1821 		return (NULL);
1822 	}
1823 
1824 	itrq->itrq_tcb_free--;
1825 	ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
1826 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
1827 	mutex_exit(&itrq->itrq_tcb_lock);
1828 
1829 	ASSERT(ret != NULL);
1830 	return (ret);
1831 }
1832 
1833 /*
1834  * This should be used to free any DMA resources, associated mblk_t's, etc. It's
1835  * used as part of recycling the message blocks when we have either an interrupt
1836  * or other activity that indicates that we need to take a look.
1837  */
1838 static void
1839 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1840 {
1841 	switch (tcb->tcb_type) {
1842 	case I40E_TX_COPY:
1843 		tcb->tcb_dma.dmab_len = 0;
1844 		break;
1845 	case I40E_TX_DMA:
1846 		if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
1847 			(void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
1848 		else if (tcb->tcb_bind_ncookies > 0)
1849 			(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
1850 		if (tcb->tcb_bind_info != NULL) {
1851 			kmem_free(tcb->tcb_bind_info,
1852 			    tcb->tcb_bind_ncookies *
1853 			    sizeof (struct i40e_dma_bind_info));
1854 		}
1855 		tcb->tcb_bind_info = NULL;
1856 		tcb->tcb_bind_ncookies = 0;
1857 		tcb->tcb_used_lso = B_FALSE;
1858 		break;
1859 	case I40E_TX_DESC:
1860 		break;
1861 	case I40E_TX_NONE:
1862 		/* Cast to pacify lint */
1863 		panic("trying to free tcb %p with bad type none", (void *)tcb);
1864 	default:
1865 		panic("unknown i40e tcb type: %d", tcb->tcb_type);
1866 	}
1867 
1868 	tcb->tcb_type = I40E_TX_NONE;
1869 	if (tcb->tcb_mp != NULL) {
1870 		freemsg(tcb->tcb_mp);
1871 		tcb->tcb_mp = NULL;
1872 	}
1873 	tcb->tcb_next = NULL;
1874 }
1875 
1876 /*
1877  * This is called as part of shutting down to clean up all outstanding
1878  * descriptors. Similar to recycle, except we don't re-arm anything and instead
1879  * just return control blocks to the free list.
1880  */
1881 void
1882 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
1883 {
1884 	uint32_t index;
1885 
1886 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
1887 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1888 
1889 	/*
1890 	 * Because we should have shut down the chip at this point, it should be
1891 	 * safe to just clean up all the entries between our head and tail.
1892 	 */
1893 #ifdef	DEBUG
1894 	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
1895 	    I40E_QTX_ENA(itrq->itrq_index));
1896 	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1897 	    I40E_QTX_ENA_QENA_STAT_MASK));
1898 #endif
1899 
1900 	index = itrq->itrq_desc_head;
1901 	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1902 		i40e_tx_control_block_t *tcb;
1903 
1904 		tcb = itrq->itrq_tcb_work_list[index];
1905 		if (tcb != NULL) {
1906 			itrq->itrq_tcb_work_list[index] = NULL;
1907 			i40e_tcb_reset(tcb);
1908 			i40e_tcb_free(itrq, tcb);
1909 		}
1910 
1911 		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1912 		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1913 		itrq->itrq_desc_free++;
1914 	}
1915 
1916 	ASSERT(index == itrq->itrq_desc_tail);
1917 	itrq->itrq_desc_head = index;
1918 }
1919 
1920 /*
1921  * We're here either by hook or by crook. We need to see if there are transmit
1922  * descriptors available for us to go and clean up and return to the hardware.
1923  * We may also be blocked, and if so, we should make sure that we let it know
1924  * we're good to go.
1925  */
1926 void
1927 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1928 {
1929 	uint32_t wbhead, toclean, count;
1930 	i40e_tx_control_block_t *tcbhead;
1931 	i40e_t *i40e = itrq->itrq_i40e;
1932 	uint_t desc_per_tcb, i;
1933 
1934 	mutex_enter(&itrq->itrq_tx_lock);
1935 
1936 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1937 	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
1938 		if (itrq->itrq_tx_blocked == B_TRUE) {
1939 			itrq->itrq_tx_blocked = B_FALSE;
1940 			mac_tx_ring_update(i40e->i40e_mac_hdl,
1941 			    itrq->itrq_mactxring);
1942 			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
1943 		}
1944 		mutex_exit(&itrq->itrq_tx_lock);
1945 		return;
1946 	}
1947 
1948 	/*
1949 	 * Now we need to try and see if there's anything available. The driver
1950 	 * will write to the head location and it guarantees that it does not
1951 	 * use relaxed ordering.
1952 	 */
1953 	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
1954 	    (uintptr_t)itrq->itrq_desc_wbhead,
1955 	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
1956 
1957 	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
1958 	    DDI_FM_OK) {
1959 		mutex_exit(&itrq->itrq_tx_lock);
1960 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1961 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1962 		return;
1963 	}
1964 
1965 	wbhead = *itrq->itrq_desc_wbhead;
1966 	toclean = itrq->itrq_desc_head;
1967 	count = 0;
1968 	tcbhead = NULL;
1969 
1970 	while (toclean != wbhead) {
1971 		i40e_tx_control_block_t *tcb;
1972 
1973 		tcb = itrq->itrq_tcb_work_list[toclean];
1974 		itrq->itrq_tcb_work_list[toclean] = NULL;
1975 		ASSERT(tcb != NULL);
1976 		tcb->tcb_next = tcbhead;
1977 		tcbhead = tcb;
1978 
1979 		/*
1980 		 * In the DMA bind case, there may not necessarily be a 1:1
1981 		 * mapping between tcb's and descriptors.  If the tcb type
1982 		 * indicates a DMA binding then check the number of DMA
1983 		 * cookies to determine how many entries to clean in the
1984 		 * descriptor ring.
1985 		 */
1986 		if (tcb->tcb_type == I40E_TX_DMA)
1987 			desc_per_tcb = tcb->tcb_bind_ncookies;
1988 		else
1989 			desc_per_tcb = 1;
1990 
1991 		for (i = 0; i < desc_per_tcb; i++) {
1992 			/*
1993 			 * We zero this out for sanity purposes.
1994 			 */
1995 			bzero(&itrq->itrq_desc_ring[toclean],
1996 			    sizeof (i40e_tx_desc_t));
1997 			toclean = i40e_next_desc(toclean, 1,
1998 			    itrq->itrq_tx_ring_size);
1999 			count++;
2000 		}
2001 	}
2002 
2003 	itrq->itrq_desc_head = wbhead;
2004 	itrq->itrq_desc_free += count;
2005 	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2006 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2007 
2008 	if (itrq->itrq_tx_blocked == B_TRUE &&
2009 	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2010 		itrq->itrq_tx_blocked = B_FALSE;
2011 
2012 		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2013 		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2014 	}
2015 
2016 	mutex_exit(&itrq->itrq_tx_lock);
2017 
2018 	/*
2019 	 * Now clean up the tcb.
2020 	 */
2021 	while (tcbhead != NULL) {
2022 		i40e_tx_control_block_t *tcb = tcbhead;
2023 
2024 		tcbhead = tcb->tcb_next;
2025 		i40e_tcb_reset(tcb);
2026 		i40e_tcb_free(itrq, tcb);
2027 	}
2028 
2029 	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2030 }
2031 
2032 static void
2033 i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
2034     const size_t off, const size_t len)
2035 {
2036 	const void *soff = mp->b_rptr + off;
2037 	void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2038 
2039 	ASSERT3U(len, >, 0);
2040 	ASSERT3P(soff, >=, mp->b_rptr);
2041 	ASSERT3P(soff, <=, mp->b_wptr);
2042 	ASSERT3U(len, <=, MBLKL(mp));
2043 	ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
2044 	ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
2045 	bcopy(soff, doff, len);
2046 	tcb->tcb_type = I40E_TX_COPY;
2047 	tcb->tcb_dma.dmab_len += len;
2048 	I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2049 }
2050 
2051 static i40e_tx_control_block_t *
2052 i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
2053     size_t off, boolean_t use_lso)
2054 {
2055 	ddi_dma_handle_t dma_handle;
2056 	ddi_dma_cookie_t dma_cookie;
2057 	uint_t i = 0, ncookies = 0, dmaflags;
2058 	i40e_tx_control_block_t *tcb;
2059 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2060 
2061 	if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2062 		txs->itxs_err_notcb.value.ui64++;
2063 		return (NULL);
2064 	}
2065 	tcb->tcb_type = I40E_TX_DMA;
2066 
2067 	if (use_lso == B_TRUE)
2068 		dma_handle = tcb->tcb_lso_dma_handle;
2069 	else
2070 		dma_handle = tcb->tcb_dma_handle;
2071 
2072 	dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
2073 	if (ddi_dma_addr_bind_handle(dma_handle, NULL,
2074 	    (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
2075 	    DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
2076 		txs->itxs_bind_fails.value.ui64++;
2077 		goto bffail;
2078 	}
2079 
2080 	tcb->tcb_bind_ncookies = ncookies;
2081 	tcb->tcb_used_lso = use_lso;
2082 
2083 	tcb->tcb_bind_info =
2084 	    kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
2085 	    KM_NOSLEEP);
2086 	if (tcb->tcb_bind_info == NULL)
2087 		goto bffail;
2088 
2089 	while (i < ncookies) {
2090 		if (i > 0)
2091 			ddi_dma_nextcookie(dma_handle, &dma_cookie);
2092 
2093 		tcb->tcb_bind_info[i].dbi_paddr =
2094 		    (caddr_t)dma_cookie.dmac_laddress;
2095 		tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
2096 	}
2097 
2098 	return (tcb);
2099 
2100 bffail:
2101 	i40e_tcb_reset(tcb);
2102 	i40e_tcb_free(itrq, tcb);
2103 	return (NULL);
2104 }
2105 
2106 static void
2107 i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
2108     caddr_t buff, size_t len, boolean_t last_desc)
2109 {
2110 	i40e_tx_desc_t *txdesc;
2111 	int cmd;
2112 
2113 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2114 	itrq->itrq_desc_free--;
2115 	txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2116 	itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2117 	    itrq->itrq_tx_ring_size);
2118 
2119 	cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
2120 
2121 	/*
2122 	 * The last data descriptor needs the EOP bit set, so that the HW knows
2123 	 * that we're ready to send.  Additionally, we set the RS (Report
2124 	 * Status) bit, so that we are notified when the transmit engine has
2125 	 * completed DMA'ing all of the data descriptors and data buffers
2126 	 * associated with this frame.
2127 	 */
2128 	if (last_desc == B_TRUE) {
2129 		cmd |= I40E_TX_DESC_CMD_EOP;
2130 		cmd |= I40E_TX_DESC_CMD_RS;
2131 	}
2132 
2133 	/*
2134 	 * Per the X710 manual, section 8.4.2.1.1, the buffer size
2135 	 * must be a value from 1 to 16K minus 1, inclusive.
2136 	 */
2137 	ASSERT3U(len, >=, 1);
2138 	ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ);
2139 
2140 	txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
2141 	txdesc->cmd_type_offset_bsz =
2142 	    LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
2143 	    ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2144 	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2145 	    ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2146 }
2147 
2148 /*
2149  * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
2150  */
2151 static inline void
2152 tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
2153     i40e_tx_control_block_t *tcb)
2154 {
2155 	if (*head == NULL) {
2156 		*head = tcb;
2157 		*tail = *head;
2158 	} else {
2159 		ASSERT3P(*tail, !=, NULL);
2160 		ASSERT3P((*tail)->tcb_next, ==, NULL);
2161 		(*tail)->tcb_next = tcb;
2162 		*tail = tcb;
2163 	}
2164 }
2165 
2166 /*
2167  * This function takes a single packet, possibly consisting of
2168  * multiple mblks, and creates a TCB chain to send to the controller.
2169  * This TCB chain may span up to a maximum of 8 descriptors. A copy
2170  * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
2171  * more, depending on several factors. For each fragment (invidual
2172  * mblk making up the packet), we determine if its size dictates a
2173  * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
2174  * count of descriptors used; when that count reaches the max we force
2175  * all remaining fragments into a single TCB buffer. We have a
2176  * guarantee that the TCB buffer is always larger than the MTU -- so
2177  * there is always enough room. Consecutive fragments below the DMA
2178  * threshold are copied into a single TCB. In the event of an error
2179  * this function returns NULL but leaves 'mp' alone.
2180  */
2181 static i40e_tx_control_block_t *
2182 i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
2183 {
2184 	const mblk_t *nmp = mp;
2185 	uint_t needed_desc = 0;
2186 	boolean_t force_copy = B_FALSE;
2187 	i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2188 	i40e_t *i40e = itrq->itrq_i40e;
2189 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2190 
2191 	/* TCB buffer is always larger than MTU. */
2192 	ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
2193 
2194 	while (nmp != NULL) {
2195 		const size_t nmp_len = MBLKL(nmp);
2196 
2197 		/* Ignore zero-length mblks. */
2198 		if (nmp_len == 0) {
2199 			nmp = nmp->b_cont;
2200 			continue;
2201 		}
2202 
2203 		if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
2204 			/* Compress consecutive copies into one TCB. */
2205 			if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
2206 				i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2207 				nmp = nmp->b_cont;
2208 				continue;
2209 			}
2210 
2211 			if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2212 				txs->itxs_err_notcb.value.ui64++;
2213 				goto fail;
2214 			}
2215 
2216 			/*
2217 			 * TCB DMA buffer is guaranteed to be one
2218 			 * cookie by i40e_alloc_dma_buffer().
2219 			 */
2220 			i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2221 			needed_desc++;
2222 			tcb_list_append(&tcbhead, &tcbtail, tcb);
2223 		} else {
2224 			uint_t total_desc;
2225 
2226 			tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
2227 			if (tcb == NULL) {
2228 				i40e_error(i40e, "dma bind failed!");
2229 				goto fail;
2230 			}
2231 
2232 			/*
2233 			 * If the new total exceeds the max or we've
2234 			 * reached the limit and there's data left,
2235 			 * then give up binding and copy the rest into
2236 			 * the pre-allocated TCB buffer.
2237 			 */
2238 			total_desc = needed_desc + tcb->tcb_bind_ncookies;
2239 			if ((total_desc > I40E_TX_MAX_COOKIE) ||
2240 			    (total_desc == I40E_TX_MAX_COOKIE &&
2241 			    nmp->b_cont != NULL)) {
2242 				i40e_tcb_reset(tcb);
2243 				i40e_tcb_free(itrq, tcb);
2244 
2245 				if (tcbtail != NULL &&
2246 				    tcbtail->tcb_type == I40E_TX_COPY) {
2247 					tcb = tcbtail;
2248 				} else {
2249 					tcb = NULL;
2250 				}
2251 
2252 				force_copy = B_TRUE;
2253 				txs->itxs_force_copy.value.ui64++;
2254 				continue;
2255 			}
2256 
2257 			needed_desc += tcb->tcb_bind_ncookies;
2258 			tcb_list_append(&tcbhead, &tcbtail, tcb);
2259 		}
2260 
2261 		nmp = nmp->b_cont;
2262 	}
2263 
2264 	ASSERT3P(nmp, ==, NULL);
2265 	ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
2266 	ASSERT3P(tcbhead, !=, NULL);
2267 	*ndesc += needed_desc;
2268 	return (tcbhead);
2269 
2270 fail:
2271 	tcb = tcbhead;
2272 	while (tcb != NULL) {
2273 		i40e_tx_control_block_t *next = tcb->tcb_next;
2274 
2275 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2276 		    tcb->tcb_type == I40E_TX_COPY);
2277 
2278 		tcb->tcb_mp = NULL;
2279 		i40e_tcb_reset(tcb);
2280 		i40e_tcb_free(itrq, tcb);
2281 		tcb = next;
2282 	}
2283 
2284 	return (NULL);
2285 }
2286 
2287 /*
2288  * Section 8.4.1 of the 700-series programming guide states that a
2289  * segment may span up to 8 data descriptors; including both header
2290  * and payload data. However, empirical evidence shows that the
2291  * controller freezes the Tx queue when presented with a segment of 8
2292  * descriptors. Or, at least, when the first segment contains 8
2293  * descriptors. One explanation is that the controller counts the
2294  * context descriptor against the first segment, even though the
2295  * programming guide makes no mention of such a constraint. In any
2296  * case, we limit TSO segments to 7 descriptors to prevent Tx queue
2297  * freezes. We still allow non-TSO segments to utilize all 8
2298  * descriptors as they have not demonstrated the faulty behavior.
2299  */
uint_t i40e_lso_num_descs = 7;

/*
 * Number of bytes still unused in a TCB's pre-allocated DMA (copy) buffer:
 * the buffer's size less the amount already copied into it. The argument is
 * evaluated twice, so it must not have side effects.
 */
#define	I40E_TCB_LEFT(tcb)				\
	((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
2304 
2305 /*
2306  * This function is similar in spirit to i40e_non_lso_chain(), but
2307  * much more complicated in reality. Like the previous function, it
2308  * takes a packet (an LSO packet) as input and returns a chain of
2309  * TCBs. The complication comes with the fact that we are no longer
2310  * trying to fit the entire packet into 8 descriptors, but rather we
2311  * must fit each MSS-size segment of the LSO packet into 8 descriptors.
2312  * Except it's really 7 descriptors, see i40e_lso_num_descs.
2313  *
2314  * Your first inclination might be to verify that a given segment
2315  * spans no more than 7 mblks; but it's actually much more subtle than
2316  * that. First, let's describe what the hardware expects, and then we
2317  * can expound on the software side of things.
2318  *
2319  * For an LSO packet the hardware expects the following:
2320  *
2321  *	o Each MSS-sized segment must span no more than 7 descriptors.
2322  *
2323  *	o The header size does not count towards the segment size.
2324  *
2325  *	o If header and payload share the first descriptor, then the
2326  *	  controller will count the descriptor twice.
2327  *
2328  * The most important thing to keep in mind is that the hardware does
2329  * not view the segments in terms of mblks, like we do. The hardware
2330  * only sees descriptors. It will iterate each descriptor in turn,
2331  * keeping a tally of bytes seen and descriptors visited. If the byte
2332  * count hasn't reached MSS by the time the descriptor count reaches
2333  * 7, then the controller freezes the queue and we are stuck.
2334  * Furthermore, the hardware picks up its tally where it left off. So
2335  * if it reached MSS in the middle of a descriptor, it will start
2336  * tallying the next segment in the middle of that descriptor. The
2337  * hardware's view is entirely removed from the mblk chain or even the
2338  * descriptor layout. Consider these facts:
2339  *
 *	o The MSS will vary depending on MTU and other factors.
2341  *
2342  *	o The dblk allocation will sit at various offsets within a
2343  *	  memory page.
2344  *
2345  *	o The page size itself could vary in the future (i.e. not
2346  *	  always 4K).
2347  *
2348  *	o Just because a dblk is virtually contiguous doesn't mean
2349  *	  it's physically contiguous. The number of cookies
2350  *	  (descriptors) required by a DMA bind of a single dblk is at
2351  *	  the mercy of the page size and physical layout.
2352  *
2353  *	o The descriptors will most often NOT start/end on a MSS
2354  *	  boundary. Thus the hardware will often start counting the
2355  *	  MSS mid descriptor and finish mid descriptor.
2356  *
2357  * The upshot of all this is that the driver must learn to think like
2358  * the controller; and verify that none of the constraints are broken.
2359  * It does this by tallying up the segment just like the hardware
2360  * would. This is handled by the two variables 'segsz' and 'segdesc'.
 * After each attempt to bind a dblk, we check the constraints. If
2362  * violated, we undo the DMA and force a copy until MSS is met. We
2363  * have a guarantee that the TCB buffer is larger than MTU; thus
2364  * ensuring we can always meet the MSS with a single copy buffer. We
2365  * also copy consecutive non-DMA fragments into the same TCB buffer.
2366  */
2367 static i40e_tx_control_block_t *
2368 i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
2369     const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
2370     uint_t *ndesc)
2371 {
2372 	size_t mp_len = MBLKL(mp);
2373 	/*
2374 	 * The cpoff (copy offset) variable tracks the offset inside
2375 	 * the current mp. There are cases where the entire mp is not
2376 	 * fully copied in one go: such as the header copy followed by
2377 	 * a non-DMA mblk, or a TCB buffer that only has enough space
2378 	 * to copy part of the current mp.
2379 	 */
2380 	size_t cpoff = 0;
2381 	/*
2382 	 * The segsz and segdesc variables track the controller's view
2383 	 * of the segment. The needed_desc variable tracks the total
2384 	 * number of data descriptors used by the driver.
2385 	 */
2386 	size_t segsz = 0;
2387 	uint_t segdesc = 0;
2388 	uint_t needed_desc = 0;
2389 	size_t hdrcopied = 0;
2390 	const size_t hdrlen =
2391 	    meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
2392 	const size_t mss = tctx->itc_ctx_mss;
2393 	boolean_t force_copy = B_FALSE;
2394 	i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2395 	i40e_t *i40e = itrq->itrq_i40e;
2396 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2397 
2398 	/*
2399 	 * We always copy the header in order to avoid more
2400 	 * complicated code dealing with various edge cases.
2401 	 */
2402 	if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2403 		txs->itxs_err_notcb.value.ui64++;
2404 		goto fail;
2405 	}
2406 
2407 	needed_desc++;
2408 	tcb_list_append(&tcbhead, &tcbtail, tcb);
2409 
2410 	while (hdrcopied < hdrlen) {
2411 		const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
2412 		i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
2413 		hdrcopied += tocopy;
2414 		cpoff += tocopy;
2415 		if (tocopy == mp_len) {
2416 			/*
2417 			 * This is a bit of defensive programming. We
2418 			 * should never have a chain too short to
2419 			 * satisfy the headers -- but just in case.
2420 			 */
2421 			if ((mp = mp->b_cont) == NULL) {
2422 				txs->itxs_tx_short.value.ui64++;
2423 				goto fail;
2424 			}
2425 
2426 			while ((mp_len = MBLKL(mp)) == 0) {
2427 				if ((mp = mp->b_cont) == NULL) {
2428 					txs->itxs_tx_short.value.ui64++;
2429 					goto fail;
2430 				}
2431 			}
2432 			cpoff = 0;
2433 		}
2434 	}
2435 	ASSERT3U(hdrcopied, ==, hdrlen);
2436 
2437 	/*
2438 	 * A single descriptor containing both header and data is
2439 	 * counted twice by the controller.
2440 	 */
2441 	if (mp_len < i40e->i40e_tx_dma_min) {
2442 		segdesc = 2;
2443 	} else {
2444 		segdesc = 1;
2445 	}
2446 
2447 	while (mp != NULL) {
2448 		mp_len = MBLKL(mp);
2449 force_copy:
2450 		/* Ignore zero-length mblks. */
2451 		if (mp_len == 0) {
2452 			mp = mp->b_cont;
2453 			cpoff = 0;
2454 			continue;
2455 		}
2456 
2457 		/*
2458 		 * We copy into the preallocated TCB buffer when the
2459 		 * current fragment is less than the DMA threshold OR
2460 		 * when the DMA bind can't meet the controller's
2461 		 * segment descriptor limit.
2462 		 */
2463 		if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
2464 			size_t tocopy;
2465 
2466 			/*
2467 			 * Our objective here is to compress
2468 			 * consecutive copies into one TCB (until it
2469 			 * is full). If there is no current TCB, or if
2470 			 * it is a DMA TCB, then allocate a new one.
2471 			 */
2472 			if (tcb == NULL ||
2473 			    (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) {
2474 				if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2475 					txs->itxs_err_notcb.value.ui64++;
2476 					goto fail;
2477 				}
2478 
2479 				/*
2480 				 * The TCB DMA buffer is guaranteed to
2481 				 * be one cookie by i40e_alloc_dma_buffer().
2482 				 */
2483 				needed_desc++;
2484 				segdesc++;
2485 				ASSERT3U(segdesc, <=, i40e_lso_num_descs);
2486 				tcb_list_append(&tcbhead, &tcbtail, tcb);
2487 			} else if (segdesc == 0) {
2488 				/*
2489 				 * We are copying into an existing TCB
2490 				 * but we just crossed the MSS
2491 				 * boundary. Make sure to increment
2492 				 * segdesc to track the descriptor
2493 				 * count as the hardware would.
2494 				 */
2495 				segdesc++;
2496 			}
2497 
2498 			tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
2499 			i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
2500 			cpoff += tocopy;
2501 			segsz += tocopy;
2502 
2503 			/* We have consumed the current mp. */
2504 			if (cpoff == mp_len) {
2505 				mp = mp->b_cont;
2506 				cpoff = 0;
2507 			}
2508 
2509 			/* We have consumed the current TCB buffer. */
2510 			if (I40E_TCB_LEFT(tcb) == 0) {
2511 				tcb = NULL;
2512 			}
2513 
2514 			/*
2515 			 * We have met MSS with this copy; restart the
2516 			 * counters.
2517 			 */
2518 			if (segsz >= mss) {
2519 				segsz = segsz % mss;
2520 				segdesc = segsz == 0 ? 0 : 1;
2521 				force_copy = B_FALSE;
2522 			}
2523 
2524 			/*
2525 			 * We are at the controller's descriptor
2526 			 * limit; we must copy into the current TCB
2527 			 * until MSS is reached. The TCB buffer is
2528 			 * always bigger than the MTU so we know it is
2529 			 * big enough to meet the MSS.
2530 			 */
2531 			if (segdesc == i40e_lso_num_descs) {
2532 				force_copy = B_TRUE;
2533 			}
2534 		} else {
2535 			uint_t tsegdesc = segdesc;
2536 			size_t tsegsz = segsz;
2537 
2538 			ASSERT(force_copy == B_FALSE);
2539 			ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
2540 
2541 			tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
2542 			if (tcb == NULL) {
2543 				i40e_error(i40e, "dma bind failed!");
2544 				goto fail;
2545 			}
2546 
2547 			for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
2548 				struct i40e_dma_bind_info dbi =
2549 				    tcb->tcb_bind_info[i];
2550 
2551 				tsegsz += dbi.dbi_len;
2552 				tsegdesc++;
2553 				ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2554 
2555 				/*
2556 				 * We've met the MSS with this portion
2557 				 * of the DMA.
2558 				 */
2559 				if (tsegsz >= mss) {
2560 					tsegsz = tsegsz % mss;
2561 					tsegdesc = tsegsz == 0 ? 0 : 1;
2562 				}
2563 
2564 				/*
2565 				 * We've reached max descriptors but
2566 				 * have not met the MSS. Undo the bind
2567 				 * and instead copy.
2568 				 */
2569 				if (tsegdesc == i40e_lso_num_descs) {
2570 					i40e_tcb_reset(tcb);
2571 					i40e_tcb_free(itrq, tcb);
2572 
2573 					if (tcbtail != NULL &&
2574 					    I40E_TCB_LEFT(tcb) > 0 &&
2575 					    tcbtail->tcb_type == I40E_TX_COPY) {
2576 						tcb = tcbtail;
2577 					} else {
2578 						tcb = NULL;
2579 					}
2580 
2581 					/*
2582 					 * Remember, we are still on
2583 					 * the same mp.
2584 					 */
2585 					force_copy = B_TRUE;
2586 					txs->itxs_tso_force_copy.value.ui64++;
2587 					goto force_copy;
2588 				}
2589 			}
2590 
2591 			ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2592 			ASSERT3U(tsegsz, <, mss);
2593 
2594 			/*
2595 			 * We've made it through the loop without
2596 			 * breaking the segment descriptor contract
2597 			 * with the controller -- replace the segment
2598 			 * tracking values with the temporary ones.
2599 			 */
2600 			segdesc = tsegdesc;
2601 			segsz = tsegsz;
2602 			needed_desc += tcb->tcb_bind_ncookies;
2603 			cpoff = 0;
2604 			tcb_list_append(&tcbhead, &tcbtail, tcb);
2605 			mp = mp->b_cont;
2606 		}
2607 	}
2608 
2609 	ASSERT3P(mp, ==, NULL);
2610 	ASSERT3P(tcbhead, !=, NULL);
2611 	*ndesc += needed_desc;
2612 	return (tcbhead);
2613 
2614 fail:
2615 	tcb = tcbhead;
2616 	while (tcb != NULL) {
2617 		i40e_tx_control_block_t *next = tcb->tcb_next;
2618 
2619 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2620 		    tcb->tcb_type == I40E_TX_COPY);
2621 
2622 		tcb->tcb_mp = NULL;
2623 		i40e_tcb_reset(tcb);
2624 		i40e_tcb_free(itrq, tcb);
2625 		tcb = next;
2626 	}
2627 
2628 	return (NULL);
2629 }
2630 
2631 /*
2632  * We've been asked to send a message block on the wire. We'll only have a
2633  * single chain. There will not be any b_next pointers; however, there may be
2634  * multiple b_cont blocks. The number of b_cont blocks may exceed the
2635  * controller's Tx descriptor limit.
2636  *
2637  * We may do one of three things with any given mblk_t chain:
2638  *
2639  *   1) Drop it
2640  *   2) Transmit it
2641  *   3) Return it
2642  *
2643  * If we return it to MAC, then MAC will flow control on our behalf. In other
2644  * words, it won't send us anything until we tell it that it's okay to send us
2645  * something.
2646  */
2647 mblk_t *
2648 i40e_ring_tx(void *arg, mblk_t *mp)
2649 {
2650 	size_t msglen;
2651 	i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
2652 	i40e_tx_context_desc_t *ctxdesc;
2653 	mac_ether_offload_info_t meo;
2654 	i40e_tx_context_t tctx;
2655 	int type;
2656 	uint_t needed_desc = 0;
2657 	boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
2658 
2659 	i40e_trqpair_t *itrq = arg;
2660 	i40e_t *i40e = itrq->itrq_i40e;
2661 	i40e_hw_t *hw = &i40e->i40e_hw_space;
2662 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2663 
2664 	ASSERT(mp->b_next == NULL);
2665 
2666 	if (!(i40e->i40e_state & I40E_STARTED) ||
2667 	    (i40e->i40e_state & I40E_OVERTEMP) ||
2668 	    (i40e->i40e_state & I40E_SUSPENDED) ||
2669 	    (i40e->i40e_state & I40E_ERROR) ||
2670 	    (i40e->i40e_link_state != LINK_STATE_UP)) {
2671 		freemsg(mp);
2672 		return (NULL);
2673 	}
2674 
2675 	if (mac_ether_offload_info(mp, &meo) != 0) {
2676 		freemsg(mp);
2677 		itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
2678 		return (NULL);
2679 	}
2680 
2681 	/*
2682 	 * Figure out the relevant context about this frame that we might need
2683 	 * for enabling checksum, LSO, etc. This also fills in information that
2684 	 * we might set around the packet type, etc.
2685 	 */
2686 	if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
2687 		freemsg(mp);
2688 		itrq->itrq_txstat.itxs_err_context.value.ui64++;
2689 		return (NULL);
2690 	}
2691 	if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2692 		use_lso = B_TRUE;
2693 		do_ctx_desc = B_TRUE;
2694 	}
2695 
2696 	/*
2697 	 * For the primordial driver we can punt on doing any recycling right
2698 	 * now; however, longer term we need to probably do some more pro-active
2699 	 * recycling to cut back on stalls in the TX path.
2700 	 */
2701 
2702 	msglen = msgsize(mp);
2703 
2704 	if (do_ctx_desc) {
2705 		/*
2706 		 * If we're doing tunneling or LSO, then we'll need a TX
2707 		 * context descriptor in addition to one or more TX data
2708 		 * descriptors.  Since there's no data DMA block or handle
2709 		 * associated with the context descriptor, we create a special
2710 		 * control block that behaves effectively like a NOP.
2711 		 */
2712 		if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
2713 			txs->itxs_err_notcb.value.ui64++;
2714 			goto txfail;
2715 		}
2716 		tcb_ctx->tcb_type = I40E_TX_DESC;
2717 		needed_desc++;
2718 	}
2719 
2720 	if (!use_lso) {
2721 		tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
2722 	} else {
2723 		tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
2724 	}
2725 
2726 	if (tcbhead == NULL)
2727 		goto txfail;
2728 
2729 	tcbhead->tcb_mp = mp;
2730 
2731 	/*
2732 	 * The second condition ensures that 'itrq_desc_tail' never
2733 	 * equals 'itrq_desc_head'. This enforces the rule found in
2734 	 * the second bullet point of section 8.4.3.1.5 of the XL710
2735 	 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
2736 	 * never overlap with the head. This means that we only ever
2737 	 * have 'itrq_tx_ring_size - 1' total available descriptors.
2738 	 */
2739 	mutex_enter(&itrq->itrq_tx_lock);
2740 	if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
2741 	    (itrq->itrq_desc_free - 1) < needed_desc) {
2742 		txs->itxs_err_nodescs.value.ui64++;
2743 		mutex_exit(&itrq->itrq_tx_lock);
2744 		goto txfail;
2745 	}
2746 
2747 	if (do_ctx_desc) {
2748 		/*
2749 		 * If we're enabling any offloads for this frame, then we'll
2750 		 * need to build up a transmit context descriptor, first.  The
2751 		 * context descriptor needs to be placed in the TX ring before
2752 		 * the data descriptor(s).  See section 8.4.2, table 8-16
2753 		 */
2754 		uint_t tail = itrq->itrq_desc_tail;
2755 		itrq->itrq_desc_free--;
2756 		ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
2757 		itrq->itrq_tcb_work_list[tail] = tcb_ctx;
2758 		itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
2759 		    itrq->itrq_tx_ring_size);
2760 
2761 		/* QW0 */
2762 		type = I40E_TX_DESC_DTYPE_CONTEXT;
2763 		ctxdesc->tunneling_params = 0;
2764 		ctxdesc->l2tag2 = 0;
2765 
2766 		/* QW1 */
2767 		ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
2768 		if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2769 			ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
2770 			    ((uint64_t)tctx.itc_ctx_cmdflags <<
2771 			    I40E_TXD_CTX_QW1_CMD_SHIFT) |
2772 			    ((uint64_t)tctx.itc_ctx_tsolen <<
2773 			    I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2774 			    ((uint64_t)tctx.itc_ctx_mss <<
2775 			    I40E_TXD_CTX_QW1_MSS_SHIFT));
2776 		}
2777 	}
2778 
2779 	tcb = tcbhead;
2780 	while (tcb != NULL) {
2781 
2782 		itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2783 		if (tcb->tcb_type == I40E_TX_COPY) {
2784 			boolean_t last_desc = (tcb->tcb_next == NULL);
2785 
2786 			i40e_tx_set_data_desc(itrq, &tctx,
2787 			    (caddr_t)tcb->tcb_dma.dmab_dma_address,
2788 			    tcb->tcb_dma.dmab_len, last_desc);
2789 		} else {
2790 			boolean_t last_desc = B_FALSE;
2791 			ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
2792 
2793 			for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
2794 				last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
2795 				    (tcb->tcb_next == NULL);
2796 
2797 				i40e_tx_set_data_desc(itrq, &tctx,
2798 				    tcb->tcb_bind_info[c].dbi_paddr,
2799 				    tcb->tcb_bind_info[c].dbi_len,
2800 				    last_desc);
2801 			}
2802 		}
2803 
2804 		tcb = tcb->tcb_next;
2805 	}
2806 
2807 	/*
2808 	 * Now, finally, sync the DMA data and alert hardware.
2809 	 */
2810 	I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2811 
2812 	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2813 	    itrq->itrq_desc_tail);
2814 
2815 	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2816 	    DDI_FM_OK) {
2817 		/*
2818 		 * Note, we can't really go through and clean this up very well,
2819 		 * because the memory has been given to the device, so just
2820 		 * indicate it's been transmitted.
2821 		 */
2822 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2823 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2824 	}
2825 
2826 	txs->itxs_bytes.value.ui64 += msglen;
2827 	txs->itxs_packets.value.ui64++;
2828 	txs->itxs_descriptors.value.ui64 += needed_desc;
2829 
2830 	mutex_exit(&itrq->itrq_tx_lock);
2831 
2832 	return (NULL);
2833 
2834 txfail:
2835 	/*
2836 	 * We ran out of resources. Return it to MAC and indicate that we'll
2837 	 * need to signal MAC. If there are allocated tcb's, return them now.
2838 	 * Make sure to reset their message block's, since we'll return them
2839 	 * back to MAC.
2840 	 */
2841 	if (tcb_ctx != NULL) {
2842 		tcb_ctx->tcb_mp = NULL;
2843 		i40e_tcb_reset(tcb_ctx);
2844 		i40e_tcb_free(itrq, tcb_ctx);
2845 	}
2846 
2847 	tcb = tcbhead;
2848 	while (tcb != NULL) {
2849 		i40e_tx_control_block_t *next = tcb->tcb_next;
2850 
2851 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2852 		    tcb->tcb_type == I40E_TX_COPY);
2853 
2854 		tcb->tcb_mp = NULL;
2855 		i40e_tcb_reset(tcb);
2856 		i40e_tcb_free(itrq, tcb);
2857 		tcb = next;
2858 	}
2859 
2860 	mutex_enter(&itrq->itrq_tx_lock);
2861 	itrq->itrq_tx_blocked = B_TRUE;
2862 	mutex_exit(&itrq->itrq_tx_lock);
2863 
2864 	return (mp);
2865 }
2866