xref: /illumos-gate/usr/src/uts/common/io/i40e/i40e_transceiver.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14  * Copyright 2016 Joyent, Inc.
15  */
16 
17 #include "i40e_sw.h"
18 
19 /*
20  * ---------------------------------------------------------
21  * Buffer and Memory Management, Receiving, and Transmitting
22  * ---------------------------------------------------------
23  *
24  * Each physical function (PF), which is what we think of as an instance of the
25  * device driver, has a series of associated transmit and receive queue pairs.
26  * Effectively, what we think of in MAC as rings. Each of these has their own
27  * ring of descriptors which is used as part of doing DMA activity.
28  *
29  * The transmit ring of descriptors are 16-byte entries which are used to send
30  * packets, program filters, etc. The receive ring of descriptors are either
31  * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
32  * format so that we're in a better position if we ever want to leverage that
33  * information later on.
34  *
35  * However, these rings are just for descriptors, they don't talk or deal with
36  * how we actually store the memory that we need for DMA or the associated
37  * information that we need for keeping track of message blocks. To correspond
38  * to the hardware descriptor ring which is how we communicate with hardware, we
39  * introduce a control block which keeps track of our required metadata like DMA
40  * mappings.
41  *
42  * There are two main considerations that dictate how much memory and buffers
43  * we end up allocating. Those are:
44  *
45  *   o The size of the ring (controlled through the driver.conf file)
46  *
47  *   o The maximum size frame we can receive.
48  *
49  * The size of the rings currently defaults to 1024 descriptors and is stored in
50  * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
51  *
52  * While the size of the rings is controlled by the driver.conf, the maximum
53  * size frame is informed primarily through the use of dladm and the setting of
54  * the MTU property on the device. From the MTU, we then go and do some
55  * machinations. The first thing we do is we then have to add in space for the
56  * Ethernet header, potentially a VLAN header, and the FCS check. This value is
57  * what's stored as i40e_t`i40e_frame_max and is derived any time
58  * i40e_t`i40e_sdu changes.
59  *
60  * This size is then rounded up to the nearest 1k chunk, which represents the
61  * actual amount of memory that we'll allocate for a single frame.
62  *
63  * Note, that for rx, we do something that might be unexpected. We always add
64  * an extra two bytes to the frame size that we allocate. We then offset the DMA
65  * address that we receive a packet into by two bytes. This ensures that the IP
66  * header will always be 4 byte aligned because the MAC header is either 14 or
67  * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
68  * and MAC's lives easier.
69  *
70  * Both the rx and tx descriptor rings (which are what we use to communicate
71  * with hardware) are allocated as a single region of DMA memory which is the
 * size of the descriptor (32 bytes and 16 bytes respectively) times the total
73  * number of descriptors for an rx and tx ring.
74  *
75  * While the rx and tx descriptors are allocated using DMA-based memory, the
76  * control blocks for each of them are allocated using normal kernel memory.
77  * They aren't special from a DMA perspective. We'll go over the design of both
78  * receiving and transmitting separately, as they have slightly different
79  * control blocks and different ways that we manage the relationship between
80  * control blocks and descriptors.
81  *
82  * ---------------------------------
83  * RX Descriptors and Control Blocks
84  * ---------------------------------
85  *
86  * For every descriptor in the ring that the driver has, we need some associated
87  * memory, which means that we need to have the receive specific control block.
88  * We have a couple different, but related goals:
89  *
90  *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
91  *     not want to do any additional memory allocations or DMA allocations if
92  *     we don't have to.
93  *
94  *   o We'd like to try and do as much zero-copy as possible, while taking into
95  *     account the cost of mapping in DMA resources.
96  *
97  *   o We'd like to have every receive descriptor available.
98  *
99  * Now, these rules are a bit in tension with one another. The act of mapping in
100  * is an exercise of trying to find the break-even point between page table
101  * updates and bcopy. We currently start by using the same metrics that ixgbe
102  * used; however, it should be known that this value has effectively been
103  * cargo-culted across to yet another driver, sorry.
104  *
105  * If we receive a packet which is larger than our copy threshold, we'll create
106  * a message block out of the DMA memory via desballoc(9F) and send that up to
107  * MAC that way. This will cause us to be notified when the message block is
108  * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
109  * it's less than the threshold, we'll try to use allocb and bcopy it into the
110  * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
111  * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
112  * the behavior and always do a bcopy or a DMA bind.
113  *
114  * To try and ensure that the device always has blocks that it can receive data
115  * into, we maintain two lists of control blocks, a working list and a free
116  * list. Each list is sized equal to the number of descriptors in the rx ring.
117  * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
118  * equal to twice the number of descriptors in the ring and we assign them
119  * equally to the free list and to the working list. Each control block also has
 * DMA memory allocated and associated with it, which will be used to receive
 * the actual packet data. All of a received frame's data will end up in a
 * single DMA buffer.
123  *
124  * During operation, we always maintain the invariant that each rx descriptor
125  * has an associated rx control block which lives in the working list. If we
126  * feel that we should loan up DMA memory to MAC in the form of a message block,
127  * we can only do so if we can maintain this invariant. To do that, we swap in
128  * one of the buffers from the free list. If none are available, then we resort
129  * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
130  * size.
131  *
 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
133  * called on the block, at which point we restore the rx control block to the
134  * free list and are able to reuse the DMA memory again. While the scheme may
135  * seem odd, it importantly keeps us out of trying to do any DMA allocations in
136  * the normal path of operation, even though we may still have to allocate
137  * message blocks and copy.
138  *
139  * The following state machine describes the life time of a rx control block. In
 * the diagram we abbreviate the rx ring descriptor entry as rxd and the rx
141  * control block entry as rcb.
142  *
143  *             |                                   |
144  *             * ... 1/2 of all initial rcb's  ... *
145  *             |                                   |
146  *             v                                   v
147  *     +------------------+               +------------------+
148  *     | rcb on free list |---*---------->| rcb on work list |
149  *     +------------------+   .           +------------------+
150  *             ^              . moved to          |
151  *             |                replace rcb       * . . Frame received,
152  *             |                loaned to         |     entry on free list
153  *             |                MAC + co.         |     available. rcb's
154  *             |                                  |     memory made into mblk_t
155  *             * . freemsg(9F)                    |     and sent up to MAC.
156  *             |   called on                      |
157  *             |   loaned rcb                     |
158  *             |   and it is                      v
159  *             |   recycled.              +-------------------+
160  *             +--------------------<-----| rcb loaned to MAC |
161  *                                        +-------------------+
162  *
163  * Finally, note that every rx control block has a reference count on it. One
164  * reference is added as long as the driver has had the GLDv3 mc_start endpoint
165  * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
166  * no other DLPI consumers remain, then we'll decrement the reference count by
167  * one. Whenever we loan up the rx control block and associated buffer to MAC,
168  * then we bump the reference count again. Even though the device is stopped,
169  * there may still be loaned frames in upper levels that we'll want to account
170  * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
171  * that it is cleaned up.
172  *
173  * --------------------
174  * Managing the RX Ring
175  * --------------------
176  *
177  * The receive ring descriptors are arranged in a circular buffer with a head
178  * and tail pointer. There are both the conventional head and tail pointers
179  * which are used to partition the ring into two portions, a portion that we,
180  * the operating system, manage and a portion that is managed by hardware. When
181  * hardware owns a descriptor in the ring, it means that it is waiting for data
182  * to be filled in. However, when a portion of the ring is owned by the driver,
183  * then that means that the descriptor has been consumed and we need to go take
184  * a look at it.
185  *
186  * The initial head is configured to be zero by writing it as such in the
187  * receive queue context in the FPM (function private memory from the host). The
188  * initial tail is written to be the last descriptor. This is written to via the
189  * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
190  * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
191  * the only values we ever consult ourselves are the TAIL register and our own
192  * state tracking. Effectively, we cache the HEAD register and then update it
193  * ourselves based on our work.
194  *
195  * When we iterate over the rx descriptors and thus the received frames, we are
196  * either in an interrupt context or we've been asked by MAC to poll on the
197  * ring. If we've been asked to poll on the ring, we have a maximum number of
198  * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
199  * exceed that count, then we do not process it. When in interrupt context, we
200  * don't have a strict byte count. However, to ensure liveness, we limit the
201  * amount of data based on a configuration value
202  * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
203  * is based on similar numbers that are used for ixgbe. After some additional
204  * time in the field, we'll have a sense as to whether or not it should be
205  * changed.
206  *
207  * When processing, we start at our own HEAD pointer
208  * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
209  * processing. Every RX descriptor has what's described as the DD bit. This bit
210  * (the LSB of the second 8-byte word), indicates whether or not the descriptor
211  * is done.  When we give descriptors to the hardware, this value is always
212  * zero. When the hardware has finished a descriptor, it will always be one.
213  *
214  * The first thing that we check is whether the DD bit indicates that the
215  * current HEAD is ready. If it isn't, then we're done. That's the primary
216  * invariant of processing a frame. If it's done, then there are a few other
217  * things that we want to look at. In the same status word as the DD bit, there
218  * are two other important bits:
219  *
220  *   o End of Packet (EOP)
221  *   o Error bits
222  *
223  * The end of packet indicates that we have reached the last descriptor. Now,
224  * you might ask when would there be more than one descriptor. The reason for
225  * that might be due to large receive offload (lro) or header splitting
226  * functionality, which presently isn't supported in the driver. The error bits
227  * in the frame are only valid when EOP is set.
228  *
229  * If error bits are set on the frame, then we still consume it; however, we
230  * will not generate an mblk_t to send up to MAC. If there are no error bits
231  * set, then we'll consume the descriptor either using bcopy or DMA binding. See
232  * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
233  * on how that selection is made.
234  *
235  * Regardless of whether we construct an mblk_t or encounter an error, we end up
236  * resetting the descriptor. This re-arms the descriptor for hardware and in the
 * process, we may end up assigning it a new receive control block. After we do
238  * this, we always update our HEAD pointer, no matter what.
239  *
240  * Finally, once we've consumed as much as we will in a given window, we go and
241  * update the TAIL register to indicate all the frames we've consumed. We only
242  * do a single bulk write for the ring.
243  *
244  * ---------------------------------
245  * TX Descriptors and Control Blocks
246  * ---------------------------------
247  *
248  * While the transmit path is similar in spirit to the receive path, it works
249  * differently due to the fact that all data is originated by the operating
250  * system and not by the device.
251  *
 * Like rx, there is both a descriptor ring that we use to communicate with the
 * hardware and which points to the memory used to transmit a frame. Similarly,
254  * there is a corresponding transmit control block. Each transmit control block
255  * has a region of DMA memory allocated to it; however, the way we use it
256  * varies.
257  *
258  * The driver is asked to process a single frame at a time. That message block
259  * may be made up of multiple fragments linked together by the mblk_t`b_cont
260  * member. The device has a hard limit of up to 8 buffers being allowed for use
261  * for a single logical frame. For each fragment, we'll try and use an entry
262  * from the tx descriptor ring and then we'll allocate a corresponding tx
263  * control block. Depending on the size of the fragment, we may copy it around
264  * or we might instead try to do DMA binding of the fragment.
265  *
266  * If we exceed the number of blocks that fit, we'll try to pull up the block
267  * and then we'll do a DMA bind and send it out.
268  *
269  * If we don't have enough space in the ring or tx control blocks available,
270  * then we'll return the unprocessed message block to MAC. This will induce flow
271  * control and once we recycle enough entries, we'll once again enable sending
272  * on the ring.
273  *
274  * We size the working list as equal to the number of descriptors in the ring.
275  * We size the free list as equal to 1.5 times the number of descriptors in the
276  * ring. We'll allocate a number of tx control block entries equal to the number
277  * of entries in the free list. By default, all entries are placed in the free
278  * list. As we come along and try to send something, we'll allocate entries from
279  * the free list and add them to the working list, where they'll stay until the
280  * hardware indicates that all of the data has been written back to us. The
281  * reason that we start with 1.5x is to help facilitate having more than one TX
282  * buffer associated with the DMA activity.
283  *
284  * --------------------
285  * Managing the TX Ring
286  * --------------------
287  *
288  * The transmit descriptor ring is driven by us. We maintain our own notion of a
289  * HEAD and TAIL register and we update the hardware with updates to the TAIL
290  * register. When the hardware is done writing out data, it updates us by
291  * writing back to a specific address, not by updating the individual
292  * descriptors. That address is a 4-byte region after the main transmit
293  * descriptor ring. This is why the descriptor ring has an extra descriptor's
294  * worth allocated to it.
295  *
296  * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
297  * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
298  * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
299  * points in time, through both interrupts, and our own internal checks, we'll
300  * sync the write-back head portion of the DMA space. Based on the index it
301  * reports back, we'll free everything between our current HEAD and the
302  * indicated index and update HEAD to the new index.
303  *
304  * When a frame comes in, we try to use a number of transmit control blocks and
305  * we'll transition them from the free list to the work list. They'll get moved
306  * to the entry on the work list that corresponds with the transmit descriptor
 * they correspond to. Once hardware indicates that the corresponding
 * descriptor has been freed, we'll return the tcb to the free list.
309  *
310  * The transmit control block free list is managed by keeping track of the
311  * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
312  * index into the free list and add things to it. In effect, we always push and
313  * pop from the tail and protect it with a single lock,
314  * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
315  * stand up to further performance testing; however, it does allow us to get off
316  * the ground with the device driver.
317  *
318  * The following image describes where a given transmit control block lives in
319  * its lifetime:
320  *
321  *             |
322  *             * ... Initial placement for all tcb's
323  *             |
324  *             v
325  *    +------------------+                       +------------------+
326  *    | tcb on free list |---*------------------>| tcb on work list |
327  *    +------------------+   .                   +------------------+
328  *             ^             . tcb allocated               |
329  *             |               to send frame               v
330  *             |               or fragment on              |
331  *             |               wire, mblk from             |
332  *             |               MAC associated.             |
333  *             |                                           |
334  *             +------*-------------------------------<----+
335  *                    .
336  *                    . Hardware indicates
337  *                      entry transmitted.
338  *                      tcb recycled, mblk
339  *                      from MAC freed.
340  *
341  * ------------
342  * Blocking MAC
343  * ------------
344  *
 * When performing transmit, we can run out of descriptors and ring entries. When
346  * such a case happens, we return the mblk_t to MAC to indicate that we've been
347  * blocked. At that point in time, MAC becomes blocked and will not transmit
348  * anything out that specific ring until we notify MAC. To indicate that we're
349  * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
350  *
351  * When we recycle tx descriptors then we'll end up signaling MAC by calling
352  * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
353  * start sending frames out to us again.
354  */
355 
/*
 * We set our DMA alignment requests based on the smallest supported page size
 * of the corresponding platform. The value is consumed as the alignment member
 * of the DMA attribute templates defined later in this file. Building for any
 * other architecture is a hard compile-time error.
 */
#if	defined(__sparc)
#define	I40E_DMA_ALIGNMENT 0x2000ull	/* 8 KiB */
#elif defined(__x86)
#define	I40E_DMA_ALIGNMENT 0x1000ull	/* 4 KiB */
#else
#error	"unknown architecture for i40e"
#endif
367 
/*
 * This structure is used to maintain information and flags related to
 * transmitting a frame. The first member is the set of flags we need to or into
 * the command word (generally checksumming related). The second member controls
 * the word offsets which is required for IP and L4 checksumming.
 */
typedef struct i40e_tx_context {
	/* Flags to OR into the tx descriptor command word. */
	enum i40e_tx_desc_cmd_bits	itc_cmdflags;
	/* Header offsets needed for IP and L4 checksum offload. */
	uint32_t			itc_offsets;
} i40e_tx_context_t;
378 
/*
 * Toggles on debug builds which can be used to override our RX behaviour based
 * on thresholds. As described in the theory statement at the top of this file,
 * i40e_debug_rx_mode may be whacked to force the rx path to always bcopy or
 * to always DMA bind, rather than choosing based on the copy threshold. This
 * is only compiled in when DEBUG is defined.
 */
#ifdef	DEBUG
typedef enum {
	I40E_DEBUG_RX_DEFAULT	= 0,	/* no override; use the threshold */
	I40E_DEBUG_RX_BCOPY	= 1,	/* always copy the frame */
	I40E_DEBUG_RX_DMABIND	= 2	/* always loan up the DMA buffer */
} i40e_debug_rx_t;

/* Current override; starts out with no override in effect. */
i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* DEBUG */
392 
/*
 * Notes on the following pair of DMA attributes. The first attribute,
 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
 * and the static buffers that we associate with control blocks. For this
 * reason, we force an SGL length of one. While technically the driver supports
 * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
 * management here. In addition, when the Intel common code wants to allocate
 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
 * the static dma attr.
 *
 * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
 * binding a bunch of mblk_t fragments to go out the door. Note that the main
 * difference here is that we're allowed a larger SGL length -- eight.
 *
 * Note, we default to setting ourselves to be DMA capable here. However,
 * because we could have multiple instances which have different FMA error
 * checking capabilities, or end up on different buses, we make these static
 * and const and copy them into the i40e_t for the given device with the actual
 * values that reflect the actual capabilities.
 *
 * NOTE(review): the templates defined here carry an i40e_g_ prefix; the
 * un-prefixed names used above presumably refer to the per-instance copies
 * kept in the i40e_t -- confirm against the i40e_t definition.
 */
static const ddi_dma_attr_t i40e_g_static_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	0x00000000FFFFFFFFull,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	1,				/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};
427 
/*
 * Template DMA attributes used when binding mblk_t fragments for transmit.
 * Unlike the static attributes, a bind made with these may produce up to
 * I40E_TX_MAX_COOKIE cookies.
 */
static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	0x00000000FFFFFFFFull,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};
442 
/*
 * Next, we have the attributes for these structures. The descriptor rings are
 * all strictly little endian, while the data buffers are just arrays of bytes
 * representing frames. Because of this, we purposefully simplify the driver
 * programming life by programming the descriptor ring as little endian, while
 * for the buffer data we keep it as unstructured.
 *
 * Note, that to keep the Intel common code operating in a reasonable way, when
 * we allocate DMA memory for it, we do not use byte swapping and thus use the
 * standard i40e_buf_acc_attr.
 */
static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
	DDI_DEVICE_ATTR_V0,		/* version number */
	DDI_STRUCTURE_LE_ACC,		/* descriptors are little endian */
	DDI_STRICTORDER_ACC		/* strict data ordering */
};
459 
/*
 * Access attributes for frame data buffers: the contents are treated as
 * unstructured bytes, so no byte swapping is ever requested.
 */
static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
	DDI_DEVICE_ATTR_V0,		/* version number */
	DDI_NEVERSWAP_ACC,		/* never byte swap */
	DDI_STRICTORDER_ACC		/* strict data ordering */
};
465 
/*
 * The next two functions are type-safe replacements for the macros that used
 * to advance and rewind a descriptor index within a ring. They are declared
 * inline to keep the data path hot, matching how the macros behaved.
 *
 * i40e_next_desc() returns the index that is count entries after base in a
 * ring of size entries, wrapping around the end of the ring at most once.
 */
static inline int
i40e_next_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	/* Step forward, then fold back into the ring if we ran off the end. */
	idx = base + count;
	if (idx >= size)
		idx -= size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
490 
/*
 * Return the index that is count entries before base in a ring of size
 * entries, wrapping around the start of the ring at most once.
 */
static inline int
i40e_prev_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	/* Step backward, then fold back into the ring if we went negative. */
	idx = base - count;
	if (idx < 0)
		idx += size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
509 
/*
 * Free DMA memory that is represented by a i40e_dma_buffer_t.
 *
 * Each stage of teardown is keyed off of the corresponding member of dmap, so
 * this may be called on a buffer that was only partially set up: a non-zero
 * dmab_dma_address means the handle is bound, a non-NULL dmab_acc_handle means
 * memory was allocated, and a non-NULL dmab_dma_handle means a handle exists.
 * The stages run in that order: unbind, free the memory, free the handle.
 */
static void
i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
{
	/* Undo the bind first; a bound address requires a valid handle. */
	if (dmap->dmab_dma_address != 0) {
		VERIFY(dmap->dmab_dma_handle != NULL);
		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
		dmap->dmab_dma_address = 0;
		dmap->dmab_size = 0;
	}

	/* Release the backing memory and forget its kernel address. */
	if (dmap->dmab_acc_handle != NULL) {
		ddi_dma_mem_free(&dmap->dmab_acc_handle);
		dmap->dmab_acc_handle = NULL;
		dmap->dmab_address = NULL;
	}

	/* Finally, release the DMA handle itself. */
	if (dmap->dmab_dma_handle != NULL) {
		ddi_dma_free_handle(&dmap->dmab_dma_handle);
		dmap->dmab_dma_handle = NULL;
	}

	/*
	 * These should only be set if we have valid handles allocated and
	 * therefore should always be NULLed out due to the above code. This
	 * is here to catch us acting sloppy.
	 */
	ASSERT(dmap->dmab_dma_address == 0);
	ASSERT(dmap->dmab_address == NULL);
	ASSERT(dmap->dmab_size == 0);
	dmap->dmab_len = 0;
}
544 
545 /*
546  * Allocate size bytes of DMA memory based on the passed in attributes. This
547  * fills in the information in dmap and is designed for all of our single cookie
548  * allocations.
549  */
550 static boolean_t
551 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
552     ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
553     boolean_t zero, size_t size)
554 {
555 	int ret;
556 	uint_t flags;
557 	size_t len;
558 	ddi_dma_cookie_t cookie;
559 	uint_t ncookies;
560 
561 	if (stream == B_TRUE)
562 		flags = DDI_DMA_STREAMING;
563 	else
564 		flags = DDI_DMA_CONSISTENT;
565 
566 	/*
567 	 * Step one: Allocate the DMA handle
568 	 */
569 	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
570 	    NULL, &dmap->dmab_dma_handle);
571 	if (ret != DDI_SUCCESS) {
572 		i40e_error(i40e, "failed to allocate dma handle for I/O "
573 		    "buffers: %d", ret);
574 		dmap->dmab_dma_handle = NULL;
575 		return (B_FALSE);
576 	}
577 
578 	/*
579 	 * Step two: Allocate the DMA memory
580 	 */
581 	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
582 	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
583 	    &dmap->dmab_acc_handle);
584 	if (ret != DDI_SUCCESS) {
585 		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
586 		    "buffers", size);
587 		dmap->dmab_address = NULL;
588 		dmap->dmab_acc_handle = NULL;
589 		i40e_free_dma_buffer(dmap);
590 		return (B_FALSE);
591 	}
592 
593 	/*
594 	 * Step three: Optionally zero
595 	 */
596 	if (zero == B_TRUE)
597 		bzero(dmap->dmab_address, len);
598 
599 	/*
600 	 * Step four: Bind the memory
601 	 */
602 	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
603 	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
604 	    NULL, &cookie, &ncookies);
605 	if (ret != DDI_DMA_MAPPED) {
606 		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
607 		    "buffers: %d", size, ret);
608 		i40e_free_dma_buffer(dmap);
609 		return (B_FALSE);
610 	}
611 
612 	VERIFY(ncookies == 1);
613 	dmap->dmab_dma_address = cookie.dmac_laddress;
614 	dmap->dmab_size = len;
615 	dmap->dmab_len = 0;
616 	return (B_TRUE);
617 }
618 
619 /*
620  * This function is called once the last pending rcb has been freed by the upper
621  * levels of the system.
622  */
623 static void
624 i40e_free_rx_data(i40e_rx_data_t *rxd)
625 {
626 	VERIFY(rxd->rxd_rcb_pending == 0);
627 
628 	if (rxd->rxd_rcb_area != NULL) {
629 		kmem_free(rxd->rxd_rcb_area,
630 		    sizeof (i40e_rx_control_block_t) *
631 		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
632 		rxd->rxd_rcb_area = NULL;
633 	}
634 
635 	if (rxd->rxd_free_list != NULL) {
636 		kmem_free(rxd->rxd_free_list,
637 		    sizeof (i40e_rx_control_block_t *) *
638 		    rxd->rxd_free_list_size);
639 		rxd->rxd_free_list = NULL;
640 	}
641 
642 	if (rxd->rxd_work_list != NULL) {
643 		kmem_free(rxd->rxd_work_list,
644 		    sizeof (i40e_rx_control_block_t *) *
645 		    rxd->rxd_ring_size);
646 		rxd->rxd_work_list = NULL;
647 	}
648 
649 	kmem_free(rxd, sizeof (i40e_rx_data_t));
650 }
651 
652 static boolean_t
653 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
654 {
655 	i40e_rx_data_t *rxd;
656 
657 	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
658 	if (rxd == NULL)
659 		return (B_FALSE);
660 	itrq->itrq_rxdata = rxd;
661 	rxd->rxd_i40e = i40e;
662 
663 	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
664 	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
665 
666 	rxd->rxd_rcb_free = rxd->rxd_free_list_size;
667 
668 	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
669 	    rxd->rxd_ring_size, KM_NOSLEEP);
670 	if (rxd->rxd_work_list == NULL) {
671 		i40e_error(i40e, "failed to allocate rx work list for a ring "
672 		    "of %d entries for ring %d", rxd->rxd_ring_size,
673 		    itrq->itrq_index);
674 		goto cleanup;
675 	}
676 
677 	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
678 	    rxd->rxd_free_list_size, KM_NOSLEEP);
679 	if (rxd->rxd_free_list == NULL) {
680 		i40e_error(i40e, "failed to allocate a %d entry rx free list "
681 		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
682 		goto cleanup;
683 	}
684 
685 	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
686 	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
687 	if (rxd->rxd_rcb_area == NULL) {
688 		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
689 		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
690 		    itrq->itrq_index);
691 		goto cleanup;
692 	}
693 
694 	return (B_TRUE);
695 
696 cleanup:
697 	i40e_free_rx_data(rxd);
698 	itrq->itrq_rxdata = NULL;
699 	return (B_FALSE);
700 }
701 
702 /*
703  * Free all of the memory that we've allocated for DMA. Note that we may have
704  * buffers that we've loaned up to the OS which are still outstanding. We'll
705  * always free up the descriptor ring, because we no longer need that. For each
706  * rcb, we'll iterate over it and if we send the reference count to zero, then
707  * we'll free the message block and DMA related resources. However, if we don't
708  * take the last one, then we'll go ahead and keep track that we'll have pending
709  * data and clean it up when we get there.
710  */
711 static void
712 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
713 {
714 	uint32_t i, count, ref;
715 
716 	i40e_rx_control_block_t *rcb;
717 	i40e_t *i40e = rxd->rxd_i40e;
718 
719 	i40e_free_dma_buffer(&rxd->rxd_desc_area);
720 	rxd->rxd_desc_ring = NULL;
721 	rxd->rxd_desc_next = 0;
722 
723 	mutex_enter(&i40e->i40e_rx_pending_lock);
724 
725 	rcb = rxd->rxd_rcb_area;
726 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
727 
728 	for (i = 0; i < count; i++, rcb++) {
729 		VERIFY(rcb != NULL);
730 
731 		/*
732 		 * If we're cleaning up from a failed creation attempt, then an
733 		 * entry may never have been assembled which would mean that
734 		 * it's reference count is zero. If we find that, we leave it
735 		 * be, because nothing else should be modifying it at this
736 		 * point. We're not at the point that any more references can be
737 		 * added, just removed.
738 		 */
739 		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
740 			continue;
741 
742 		ref = atomic_dec_32_nv(&rcb->rcb_ref);
743 		if (ref == 0) {
744 			freemsg(rcb->rcb_mp);
745 			rcb->rcb_mp = NULL;
746 			i40e_free_dma_buffer(&rcb->rcb_dma);
747 		} else {
748 			atomic_inc_32(&rxd->rxd_rcb_pending);
749 			atomic_inc_32(&i40e->i40e_rx_pending);
750 		}
751 	}
752 	mutex_exit(&i40e->i40e_rx_pending_lock);
753 }
754 
755 /*
756  * Initialize the DMA memory for the descriptor ring and for each frame in the
757  * control block list.
758  */
759 static boolean_t
760 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
761 {
762 	int i, count;
763 	size_t dmasz;
764 	i40e_rx_control_block_t *rcb;
765 	i40e_t *i40e = rxd->rxd_i40e;
766 
767 	/*
768 	 * First allocate the rx descriptor ring.
769 	 */
770 	dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
771 	VERIFY(dmasz > 0);
772 	if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
773 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
774 	    B_TRUE, dmasz) == B_FALSE) {
775 		i40e_error(i40e, "failed to allocate DMA resources "
776 		    "for rx descriptor ring");
777 		return (B_FALSE);
778 	}
779 	rxd->rxd_desc_ring =
780 	    (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
781 	rxd->rxd_desc_next = 0;
782 
783 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
784 	rcb = rxd->rxd_rcb_area;
785 
786 	dmasz = i40e->i40e_rx_buf_size;
787 	VERIFY(dmasz > 0);
788 	for (i = 0; i < count; i++, rcb++) {
789 		i40e_dma_buffer_t *dmap;
790 		VERIFY(rcb != NULL);
791 
792 		if (i < rxd->rxd_ring_size) {
793 			rxd->rxd_work_list[i] = rcb;
794 		} else {
795 			rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
796 		}
797 
798 		dmap = &rcb->rcb_dma;
799 		if (i40e_alloc_dma_buffer(i40e, dmap,
800 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
801 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
802 			i40e_error(i40e, "failed to allocate rx dma buffer");
803 			return (B_FALSE);
804 		}
805 
806 		/*
807 		 * Initialize the control block and offset the DMA address. See
808 		 * the note in the big theory statement that explains how this
809 		 * helps IP deal with alignment. Note, we don't worry about
810 		 * whether or not we successfully get an mblk_t from desballoc,
811 		 * it's a common case that we have to handle later on in the
812 		 * system.
813 		 */
814 		dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
815 		dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
816 		dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
817 
818 		rcb->rcb_ref = 1;
819 		rcb->rcb_rxd = rxd;
820 		rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
821 		rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
822 		rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
823 		    dmap->dmab_size, 0, &rcb->rcb_free_rtn);
824 	}
825 
826 	return (B_TRUE);
827 }
828 
829 static void
830 i40e_free_tx_dma(i40e_trqpair_t *itrq)
831 {
832 	size_t fsz;
833 
834 	if (itrq->itrq_tcb_area != NULL) {
835 		uint32_t i;
836 		i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
837 
838 		for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
839 			i40e_free_dma_buffer(&tcb->tcb_dma);
840 			if (tcb->tcb_dma_handle != NULL) {
841 				ddi_dma_free_handle(&tcb->tcb_dma_handle);
842 				tcb->tcb_dma_handle = NULL;
843 			}
844 		}
845 
846 		fsz = sizeof (i40e_tx_control_block_t) *
847 		    itrq->itrq_tx_free_list_size;
848 		kmem_free(itrq->itrq_tcb_area, fsz);
849 		itrq->itrq_tcb_area = NULL;
850 	}
851 
852 	if (itrq->itrq_tcb_free_list != NULL) {
853 		fsz = sizeof (i40e_tx_control_block_t *) *
854 		    itrq->itrq_tx_free_list_size;
855 		kmem_free(itrq->itrq_tcb_free_list, fsz);
856 		itrq->itrq_tcb_free_list = NULL;
857 	}
858 
859 	if (itrq->itrq_tcb_work_list != NULL) {
860 		fsz = sizeof (i40e_tx_control_block_t *) *
861 		    itrq->itrq_tx_ring_size;
862 		kmem_free(itrq->itrq_tcb_work_list, fsz);
863 		itrq->itrq_tcb_work_list = NULL;
864 	}
865 
866 	i40e_free_dma_buffer(&itrq->itrq_desc_area);
867 	itrq->itrq_desc_ring = NULL;
868 
869 }
870 
871 static boolean_t
872 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
873 {
874 	int i, ret;
875 	size_t dmasz;
876 	i40e_tx_control_block_t *tcb;
877 	i40e_t *i40e = itrq->itrq_i40e;
878 
879 	itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
880 	itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
881 	    (i40e->i40e_tx_ring_size >> 1);
882 
883 	/*
884 	 * Allocate an additional tx descriptor for the writeback head.
885 	 */
886 	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
887 	dmasz += sizeof (i40e_tx_desc_t);
888 
889 	VERIFY(dmasz > 0);
890 	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
891 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
892 	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
893 		i40e_error(i40e, "failed to allocate DMA resources for tx "
894 		    "descriptor ring");
895 		return (B_FALSE);
896 	}
897 	itrq->itrq_desc_ring =
898 	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
899 	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
900 	    itrq->itrq_tx_ring_size);
901 	itrq->itrq_desc_head = 0;
902 	itrq->itrq_desc_tail = 0;
903 	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
904 
905 	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
906 	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
907 	if (itrq->itrq_tcb_work_list == NULL) {
908 		i40e_error(i40e, "failed to allocate a %d entry tx work list "
909 		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
910 		goto cleanup;
911 	}
912 
913 	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
914 	    sizeof (i40e_tx_control_block_t *), KM_SLEEP);
915 	if (itrq->itrq_tcb_free_list == NULL) {
916 		i40e_error(i40e, "failed to allocate a %d entry tx free list "
917 		    "for ring %d", itrq->itrq_tx_free_list_size,
918 		    itrq->itrq_index);
919 		goto cleanup;
920 	}
921 
922 	/*
923 	 * We allocate enough tx control blocks to cover the free list.
924 	 */
925 	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
926 	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
927 	if (itrq->itrq_tcb_area == NULL) {
928 		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
929 		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
930 		goto cleanup;
931 	}
932 
933 	/*
934 	 * For each tcb, allocate DMA memory.
935 	 */
936 	dmasz = i40e->i40e_tx_buf_size;
937 	VERIFY(dmasz > 0);
938 	tcb = itrq->itrq_tcb_area;
939 	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
940 		VERIFY(tcb != NULL);
941 
942 		/*
943 		 * Allocate both a DMA buffer which we'll use for when we copy
944 		 * packets for transmission and allocate a DMA handle which
945 		 * we'll use when we bind data.
946 		 */
947 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
948 		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
949 		    &tcb->tcb_dma_handle);
950 		if (ret != DDI_SUCCESS) {
951 			i40e_error(i40e, "failed to allocate DMA handle for tx "
952 			    "data binding on ring %d: %d", itrq->itrq_index,
953 			    ret);
954 			tcb->tcb_dma_handle = NULL;
955 			goto cleanup;
956 		}
957 
958 		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
959 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
960 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
961 			i40e_error(i40e, "failed to allocate %ld bytes of "
962 			    "DMA for tx data binding on ring %d", dmasz,
963 			    itrq->itrq_index);
964 			goto cleanup;
965 		}
966 
967 		itrq->itrq_tcb_free_list[i] = tcb;
968 	}
969 
970 	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
971 
972 	return (B_TRUE);
973 
974 cleanup:
975 	i40e_free_tx_dma(itrq);
976 	return (B_FALSE);
977 }
978 
979 /*
980  * Free all memory associated with all of the rings on this i40e instance. Note,
981  * this is done as part of the GLDv3 stop routine.
982  */
983 void
984 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
985 {
986 	int i;
987 
988 	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
989 		i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
990 
991 		/*
992 		 * Clean up our rx data. We have to free DMA resources first and
993 		 * then if we have no more pending RCB's, then we'll go ahead
994 		 * and clean things up. Note, we can't set the stopped flag on
995 		 * the rx data until after we've done the first pass of the
996 		 * pending resources. Otherwise we might race with
997 		 * i40e_rx_recycle on determining who should free the
998 		 * i40e_rx_data_t above.
999 		 */
1000 		i40e_free_rx_dma(rxd, failed_init);
1001 
1002 		mutex_enter(&i40e->i40e_rx_pending_lock);
1003 		rxd->rxd_shutdown = B_TRUE;
1004 		if (rxd->rxd_rcb_pending == 0) {
1005 			i40e_free_rx_data(rxd);
1006 			i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
1007 		}
1008 		mutex_exit(&i40e->i40e_rx_pending_lock);
1009 
1010 		i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
1011 	}
1012 }
1013 
1014 /*
1015  * Allocate all of the resources associated with all of the rings on this i40e
1016  * instance. Note this is done as part of the GLDv3 start routine and thus we
1017  * should not use blocking allocations. This takes care of both DMA and non-DMA
1018  * related resources.
1019  */
1020 boolean_t
1021 i40e_alloc_ring_mem(i40e_t *i40e)
1022 {
1023 	int i;
1024 
1025 	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1026 		if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) ==
1027 		    B_FALSE)
1028 			goto unwind;
1029 
1030 		if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) ==
1031 		    B_FALSE)
1032 			goto unwind;
1033 
1034 		if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE)
1035 			goto unwind;
1036 	}
1037 
1038 	return (B_TRUE);
1039 
1040 unwind:
1041 	i40e_free_ring_mem(i40e, B_TRUE);
1042 	return (B_FALSE);
1043 }
1044 
1045 
1046 /*
1047  * Because every instance of i40e may have different support for FMA
1048  * capabilities, we copy the DMA attributes into the i40e_t and set them that
1049  * way and use them for determining attributes.
1050  */
1051 void
1052 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1053 {
1054 	bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1055 	    sizeof (ddi_dma_attr_t));
1056 	bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1057 	    sizeof (ddi_dma_attr_t));
1058 	bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1059 	    sizeof (ddi_device_acc_attr_t));
1060 	bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1061 	    sizeof (ddi_device_acc_attr_t));
1062 
1063 	if (fma == B_TRUE) {
1064 		i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1065 		i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1066 	} else {
1067 		i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1068 		i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1069 	}
1070 }
1071 
1072 static void
1073 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1074 {
1075 	mutex_enter(&rxd->rxd_free_lock);
1076 	ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1077 	ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1078 	rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1079 	rxd->rxd_rcb_free++;
1080 	mutex_exit(&rxd->rxd_free_lock);
1081 }
1082 
1083 static i40e_rx_control_block_t *
1084 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1085 {
1086 	i40e_rx_control_block_t *rcb;
1087 
1088 	mutex_enter(&rxd->rxd_free_lock);
1089 	if (rxd->rxd_rcb_free == 0) {
1090 		mutex_exit(&rxd->rxd_free_lock);
1091 		return (NULL);
1092 	}
1093 	rxd->rxd_rcb_free--;
1094 	rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1095 	VERIFY(rcb != NULL);
1096 	rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1097 	mutex_exit(&rxd->rxd_free_lock);
1098 
1099 	return (rcb);
1100 }
1101 
/*
 * This is the callback that we get from the OS when freemsg(9F) has been called
 * on a loaned descriptor. In addition, if we take the last reference count
 * here, then we have to tear down all of the rx data.
 *
 * arg is the i40e_rx_control_block_t that was registered as the free_arg of
 * the control block's free routine. In the normal case the control block gets
 * a fresh mblk_t, goes back onto the free list, and the reference that was
 * taken when the buffer was loaned out is dropped.
 */
void
i40e_rx_recycle(caddr_t arg)
{
	uint32_t ref;
	i40e_rx_control_block_t *rcb;
	i40e_rx_data_t *rxd;
	i40e_t *i40e;

	/* LINTED: E_BAD_PTR_CAST_ALIGN */
	rcb = (i40e_rx_control_block_t *)arg;
	rxd = rcb->rcb_rxd;
	/*
	 * Grab the i40e_t up front; rxd may be freed further down while we
	 * still need the i40e_t for its lock and condition variable.
	 */
	i40e = rxd->rxd_i40e;

	/*
	 * It's possible for this to be called with a reference count of zero.
	 * That will happen when we're doing the freemsg after taking the last
	 * reference because we're tearing down everything and this rcb is not
	 * outstanding.
	 */
	if (rcb->rcb_ref == 0)
		return;

	/*
	 * Don't worry about failure of desballoc here. It'll only become fatal
	 * if we're trying to use it and we can't in i40e_rx_bind().
	 */
	rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
	    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
	/*
	 * NOTE(review): the rcb is returned to the free list before its
	 * reference is dropped below; confirm that this ordering is intended.
	 */
	i40e_rcb_free(rxd, rcb);

	/*
	 * It's possible that the rcb was being used while we are shutting down
	 * the device. In that case, we'll take the final reference from the
	 * device here.
	 */
	ref = atomic_dec_32_nv(&rcb->rcb_ref);
	if (ref == 0) {
		/*
		 * This freemsg() ends up calling back into this function, but
		 * as the reference count is now zero that call returns
		 * immediately at the check above.
		 */
		freemsg(rcb->rcb_mp);
		rcb->rcb_mp = NULL;
		i40e_free_dma_buffer(&rcb->rcb_dma);

		/*
		 * This buffer was accounted as pending when the rx DMA
		 * resources were torn down; undo that accounting now.
		 */
		mutex_enter(&i40e->i40e_rx_pending_lock);
		atomic_dec_32(&rxd->rxd_rcb_pending);
		atomic_dec_32(&i40e->i40e_rx_pending);

		/*
		 * If this was the last block and it's been indicated that we've
		 * passed the shutdown point, we should clean up.
		 */
		if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
			/* rxd must not be touched after this call. */
			i40e_free_rx_data(rxd);
			cv_broadcast(&i40e->i40e_rx_pending_cv);
		}

		mutex_exit(&i40e->i40e_rx_pending_lock);
	}
}
1164 
1165 static mblk_t *
1166 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1167     uint32_t plen)
1168 {
1169 	mblk_t *mp;
1170 	i40e_t *i40e = rxd->rxd_i40e;
1171 	i40e_rx_control_block_t *rcb, *rep_rcb;
1172 
1173 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1174 
1175 	if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
1176 		itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
1177 		return (NULL);
1178 	}
1179 
1180 	rcb = rxd->rxd_work_list[index];
1181 
1182 	/*
1183 	 * Check to make sure we have a mblk_t. If we don't, this is our last
1184 	 * chance to try and get one.
1185 	 */
1186 	if (rcb->rcb_mp == NULL) {
1187 		rcb->rcb_mp =
1188 		    desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1189 		    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1190 		if (rcb->rcb_mp == NULL) {
1191 			itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
1192 			i40e_rcb_free(rxd, rcb);
1193 			return (NULL);
1194 		}
1195 	}
1196 
1197 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1198 
1199 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1200 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1201 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1202 		i40e_rcb_free(rxd, rcb);
1203 		return (NULL);
1204 	}
1205 
1206 	/*
1207 	 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
1208 	 */
1209 	mp = rcb->rcb_mp;
1210 	atomic_inc_32(&rcb->rcb_ref);
1211 	mp->b_wptr = mp->b_rptr + plen;
1212 	mp->b_next = mp->b_cont = NULL;
1213 
1214 	rxd->rxd_work_list[index] = rep_rcb;
1215 	return (mp);
1216 }
1217 
1218 /*
1219  * We're going to allocate a new message block for this frame and attempt to
1220  * receive it. See the big theory statement for more information on when we copy
1221  * versus bind.
1222  */
1223 static mblk_t *
1224 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1225     uint32_t plen)
1226 {
1227 	i40e_t *i40e = rxd->rxd_i40e;
1228 	i40e_rx_control_block_t *rcb;
1229 	mblk_t *mp;
1230 
1231 	ASSERT(index < rxd->rxd_ring_size);
1232 	rcb = rxd->rxd_work_list[index];
1233 
1234 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1235 
1236 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1237 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1238 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1239 		return (NULL);
1240 	}
1241 
1242 	mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
1243 	if (mp == NULL) {
1244 		itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
1245 		return (NULL);
1246 	}
1247 
1248 	mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
1249 	bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
1250 	mp->b_wptr = mp->b_rptr + plen;
1251 
1252 	return (mp);
1253 }
1254 
1255 /*
1256  * Determine if the device has enabled any checksum flags for us. The level of
1257  * checksum computed will depend on the type packet that we have, which is
1258  * contained in ptype. For example, the checksum logic it does will vary
1259  * depending on whether or not the packet is considered tunneled, whether it
1260  * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
1261  * valid.
1262  *
1263  * While there are additional checksums that we could recognize here, we'll need
1264  * to get some additional GLDv3 enhancements to be able to properly describe
1265  * them.
1266  */
1267 static void
1268 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
1269     uint32_t ptype)
1270 {
1271 	uint32_t cksum;
1272 	struct i40e_rx_ptype_decoded pinfo;
1273 
1274 	ASSERT(ptype <= 255);
1275 	pinfo = decode_rx_desc_ptype(ptype);
1276 
1277 	cksum = 0;
1278 
1279 	/*
1280 	 * If the ptype isn't something that we know in the driver, then we
1281 	 * shouldn't even consider moving forward.
1282 	 */
1283 	if (pinfo.known == 0) {
1284 		itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
1285 		return;
1286 	}
1287 
1288 	/*
1289 	 * If hardware didn't set the L3L4P bit on the frame, then there is no
1290 	 * checksum offload to consider.
1291 	 */
1292 	if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
1293 		itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
1294 		return;
1295 	}
1296 
1297 	/*
1298 	 * The device tells us that IPv6 checksums where a Destination Options
1299 	 * Header or a Routing header shouldn't be trusted. Discard all
1300 	 * checksums in this case.
1301 	 */
1302 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1303 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
1304 	    (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
1305 		itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
1306 		return;
1307 	}
1308 
1309 	/*
1310 	 * The hardware denotes three kinds of possible errors. Two are reserved
1311 	 * for inner and outer IP checksum errors (IPE and EIPE) and the latter
1312 	 * is for L4 checksum errors (L4E). If there is only one IP header, then
1313 	 * the only thing that we care about is IPE. Note that since we don't
1314 	 * support inner checksums, we will ignore IPE being set on tunneled
1315 	 * packets and only care about EIPE.
1316 	 */
1317 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1318 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1319 		if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
1320 			if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
1321 				itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
1322 			} else {
1323 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1324 				cksum |= HCK_IPV4_HDRCKSUM_OK;
1325 			}
1326 		} else {
1327 			if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
1328 				itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
1329 			} else {
1330 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1331 				cksum |= HCK_IPV4_HDRCKSUM_OK;
1332 			}
1333 		}
1334 	}
1335 
1336 	/*
1337 	 * We only have meaningful L4 checksums in the case of IP->L4 and
1338 	 * IP->IP->L4. There is not outer L4 checksum data available in any
1339 	 * other case. Further, we don't bother reporting the valid checksum in
1340 	 * the case of IP->IP->L4 set.
1341 	 */
1342 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1343 	    pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
1344 	    (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
1345 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
1346 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
1347 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
1348 		ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
1349 		if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
1350 			itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
1351 		} else {
1352 			itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
1353 			cksum |= HCK_FULLCKSUM_OK;
1354 		}
1355 	}
1356 
1357 	if (cksum != 0) {
1358 		itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
1359 		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
1360 	} else {
1361 		itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
1362 	}
1363 }
1364 
/*
 * Process the rx descriptors that hardware has completed on this ring and
 * return the received frames as a chain of message blocks linked through
 * b_next, or NULL if there is nothing to return. The caller must hold
 * itrq_rx_lock.
 *
 * poll_bytes is the byte budget for this call. The value I40E_POLL_NULL means
 * that there is no byte budget; in all cases processing is also bounded by
 * i40e_rx_limit_per_intr frames. Nothing is processed if the device isn't
 * started or is in an overtemp, suspended, or error state.
 */
mblk_t *
i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
{
	i40e_t *i40e;
	i40e_hw_t *hw;
	i40e_rx_data_t *rxd;
	uint32_t cur_head;
	i40e_rx_desc_t *cur_desc;
	i40e_rx_control_block_t *rcb;
	uint64_t rx_bytes, rx_frames;
	uint64_t stword;
	mblk_t *mp, *mp_head, **mp_tail;

	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
	rxd = itrq->itrq_rxdata;
	i40e = itrq->itrq_i40e;
	hw = &i40e->i40e_hw_space;

	if (!(i40e->i40e_state & I40E_STARTED) ||
	    (i40e->i40e_state & I40E_OVERTEMP) ||
	    (i40e->i40e_state & I40E_SUSPENDED) ||
	    (i40e->i40e_state & I40E_ERROR))
		return (NULL);

	/*
	 * Before we do anything else, we have to make sure that all of the DMA
	 * buffers are synced up and then check to make sure that they're
	 * actually good from an FM perspective.
	 */
	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
	    DDI_FM_OK) {
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
		return (NULL);
	}

	/*
	 * Prepare our stats. We do a limited amount of processing in both
	 * polling and interrupt context. The limit in interrupt context is
	 * based on frames, in polling context based on bytes.
	 *
	 * mp_tail always points at the b_next pointer where the next received
	 * frame should be linked in, which starts out as the head itself.
	 */
	rx_bytes = rx_frames = 0;
	mp_head = NULL;
	mp_tail = &mp_head;

	/*
	 * At this point, the descriptor ring is available to check. We'll try
	 * and process until we either run out of poll_bytes or descriptors.
	 */
	cur_head = rxd->rxd_desc_next;
	cur_desc = &rxd->rxd_desc_ring[cur_head];
	stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);

	/*
	 * Note, the primary invariant of this loop should be that cur_head,
	 * cur_desc, and stword always point to the currently processed
	 * descriptor. When we leave the loop, it should point to a descriptor
	 * that HAS NOT been processed. Meaning, that if we haven't consumed the
	 * frame, the descriptor should not be advanced.
	 */
	while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
		uint32_t error, eop, plen, ptype;

		/*
		 * The DD, PLEN, and EOP bits are the only ones that are valid
		 * in every frame. The error information is only valid when EOP
		 * is set in the same frame.
		 *
		 * At this time, because we don't do any LRO or header
		 * splitting, we expect that every frame should have EOP set in
		 * it. When later functionality comes in, we'll want to
		 * re-evaluate this.
		 */
		eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
		VERIFY(eop != 0);

		/*
		 * A descriptor with any of the error bits that we care about
		 * set is not received at all; we skip straight to recycling
		 * it. Note that such a descriptor still counts as a frame
		 * below.
		 */
		error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
		    I40E_RXD_QW1_ERROR_SHIFT;
		if (error & I40E_RX_ERR_BITS) {
			itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
			goto discard;
		}

		plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
		    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;

		ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
		    I40E_RXD_QW1_PTYPE_SHIFT;

		/*
		 * This packet contains valid data. We should check to see if
		 * we're actually going to consume it based on its length (to
		 * ensure that we don't overshoot our quota). We determine
		 * whether to bcopy or bind the DMA resources based on the size
		 * of the frame. However, if on debug, we allow it to be
		 * overridden for testing purposes.
		 *
		 * We should be smarter about this and do DMA binding for
		 * larger frames, but for now, it's really more important that
		 * we actually just get something simple working.
		 */

		/*
		 * Ensure we don't exceed our polling quota by reading this
		 * frame. Note we only bump bytes now, we bump frames later.
		 *
		 * NOTE(review): the bytes are counted before we know whether
		 * an mblk_t could be obtained for the frame, so a frame that
		 * fails both bind and copy below is still counted.
		 */
		if ((poll_bytes != I40E_POLL_NULL) &&
		    (rx_bytes + plen) > poll_bytes)
			break;
		rx_bytes += plen;

		/*
		 * Frames of at least i40e_rx_dma_min bytes are bound first;
		 * smaller frames, and any frame for which binding fails, are
		 * copied instead.
		 */
		mp = NULL;
		if (plen >= i40e->i40e_rx_dma_min)
			mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
		if (mp == NULL)
			mp = i40e_rx_copy(itrq, rxd, cur_head, plen);

		/*
		 * If we couldn't get a message block either way, the frame is
		 * dropped, but its descriptor is still recycled below.
		 */
		if (mp != NULL) {
			if (i40e->i40e_rx_hcksum_enable)
				i40e_rx_hcksum(itrq, mp, stword, error, ptype);
			*mp_tail = mp;
			mp_tail = &mp->b_next;
		}

		/*
		 * Now we need to prepare this frame for use again. See the
		 * discussion in the big theory statements.
		 *
		 * Whatever control block is now in the work list at this index
		 * is the one to hand back to hardware. If the frame was bound,
		 * i40e_rx_bind() has already put a replacement control block
		 * in the slot; if it was copied or discarded, the original
		 * control block is still there. Either way, we can just use
		 * the current index as the key and reprogram the descriptor
		 * with that buffer's DMA address.
		 */
discard:
		rcb = rxd->rxd_work_list[cur_head];
		cur_desc->read.pkt_addr =
		    CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
		cur_desc->read.hdr_addr = 0;

		/*
		 * Finally, update our loop invariants.
		 */
		cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
		cur_desc = &rxd->rxd_desc_ring[cur_head];
		stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);

		/*
		 * To help provide liveness, we limit the amount of data that
		 * we'll end up counting. Note that in these cases, an interrupt
		 * is not dissimilar from a polling request.
		 *
		 * NOTE(review): as the count is bumped before the comparison
		 * and the comparison is strict, this stops after
		 * i40e_rx_limit_per_intr + 1 frames; confirm that is intended.
		 */
		rx_frames++;
		if (rx_frames > i40e->i40e_rx_limit_per_intr) {
			itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
			break;
		}
	}

	/*
	 * As we've modified the ring, we need to make sure that we sync the
	 * descriptor ring for the device. Next, we update the hardware and
	 * update our notion of where the head for us to read from hardware is
	 * next.
	 */
	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
	    DDI_FM_OK) {
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
	}

	/*
	 * Only tell hardware about new buffers if we actually consumed any
	 * descriptors. The tail that we write is the descriptor just before
	 * the next one that we'll look at.
	 */
	if (rx_frames != 0) {
		uint32_t tail;
		ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
		rxd->rxd_desc_next = cur_head;
		tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);

		I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
		if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
			ddi_fm_service_impact(i40e->i40e_dip,
			    DDI_SERVICE_DEGRADED);
			atomic_or_32(&i40e->i40e_state, I40E_ERROR);
		}

		itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
		itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
	}

#ifdef DEBUG
	if (rx_frames == 0) {
		ASSERT(rx_bytes == 0);
	}
#endif

	return (mp_head);
}
1563 
1564 /*
1565  * This function is called by the GLDv3 when it wants to poll on a ring. The
1566  * only primary difference from when we call this during an interrupt is that we
1567  * have a limit on the number of bytes that we should consume.
1568  */
1569 mblk_t *
1570 i40e_ring_rx_poll(void *arg, int poll_bytes)
1571 {
1572 	i40e_trqpair_t *itrq = arg;
1573 	mblk_t *mp;
1574 
1575 	ASSERT(poll_bytes > 0);
1576 	if (poll_bytes == 0)
1577 		return (NULL);
1578 
1579 	mutex_enter(&itrq->itrq_rx_lock);
1580 	mp = i40e_ring_rx(itrq, poll_bytes);
1581 	mutex_exit(&itrq->itrq_rx_lock);
1582 
1583 	return (mp);
1584 }
1585 
/*
 * This is a structure I wish someone would fill out for me for dorking with the
 * checksums. When we get some more experience with this, we should go ahead and
 * consider adding this to MAC.
 *
 * The flags below are a bit field stored in meoi_flags of the
 * mac_ether_offload_info_t. They record which pieces of information about a
 * packet were found while parsing it.
 */
typedef enum mac_ether_offload_flags {
	/* The L2 header length and the ethertype have been filled in. */
	MEOI_L2INFO_SET		= 0x01,
	/* The frame's ethertype was ETHERTYPE_VLAN, i.e. it carries a tag. */
	MEOI_VLAN_TAGGED	= 0x02,
	/* Presumably: the L3 header information is valid -- TODO confirm. */
	MEOI_L3INFO_SET		= 0x04,
	/* Presumably: the L3 checksum location is valid -- TODO confirm. */
	MEOI_L3CKSUM_SET	= 0x08,
	/* Presumably: the L4 header information is valid -- TODO confirm. */
	MEOI_L4INFO_SET		= 0x10,
	/* Presumably: the L4 checksum location is valid -- TODO confirm. */
	MEOI_L4CKSUM_SET	= 0x20
} mac_ether_offload_flags_t;
1599 
/*
 * Summary of the headers found in an Ethernet frame, filled in by
 * mac_ether_offload_info(). The structure is zeroed before parsing begins, and
 * meoi_flags indicates which of the remaining members were filled in.
 */
typedef struct mac_ether_offload_info {
	mac_ether_offload_flags_t	meoi_flags;	/* What was found */
	uint8_t		meoi_l2hlen;	/* How long is the Ethernet header? */
	uint16_t	meoi_l3proto;	/* What's the Ethertype */
	uint8_t		meoi_l3hlen;	/* How long is the L3 header? */
	uint8_t		meoi_l4proto;	/* What is the payload type? */
	uint8_t		meoi_l4hlen;	/* How long is the L4 header */
	mblk_t		*meoi_l3ckmp;	/* Which mblk has the l3 checksum */
	off_t		meoi_l3ckoff;	/* What's the offset to it */
	mblk_t		*meoi_l4ckmp;	/* Which mblk has the L4 checksum */
	off_t		meoi_l4off;	/* What is the offset to it? */
} mac_ether_offload_info_t;
1612 
1613 /*
1614  * This is something that we'd like to make a general MAC function. Before we do
1615  * that, we should add support for TSO.
1616  *
1617  * We should really keep track of our offset and not walk everything every
1618  * time. I can't imagine that this will be kind to us at high packet rates;
1619  * however, for the moment, let's leave that.
1620  *
1621  * This walks a message block chain without pulling up to fill in the context
1622  * information. Note that the data we care about could be hidden across more
1623  * than one mblk_t.
1624  */
1625 static int
1626 i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1627 {
1628 	size_t mpsize;
1629 	uint8_t *bp;
1630 
1631 	mpsize = msgsize(mp);
1632 	/* Check for overflow */
1633 	if (off + sizeof (uint16_t) > mpsize)
1634 		return (-1);
1635 
1636 	mpsize = MBLKL(mp);
1637 	while (off >= mpsize) {
1638 		mp = mp->b_cont;
1639 		off -= mpsize;
1640 		mpsize = MBLKL(mp);
1641 	}
1642 
1643 	bp = mp->b_rptr + off;
1644 	*out = *bp;
1645 	return (0);
1646 
1647 }
1648 
1649 static int
1650 i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1651 {
1652 	size_t mpsize;
1653 	uint8_t *bp;
1654 
1655 	mpsize = msgsize(mp);
1656 	/* Check for overflow */
1657 	if (off + sizeof (uint16_t) > mpsize)
1658 		return (-1);
1659 
1660 	mpsize = MBLKL(mp);
1661 	while (off >= mpsize) {
1662 		mp = mp->b_cont;
1663 		off -= mpsize;
1664 		mpsize = MBLKL(mp);
1665 	}
1666 
1667 	/*
1668 	 * Data is in network order. Note the second byte of data might be in
1669 	 * the next mp.
1670 	 */
1671 	bp = mp->b_rptr + off;
1672 	*out = *bp << 8;
1673 	if (off + 1 == mpsize) {
1674 		mp = mp->b_cont;
1675 		bp = mp->b_rptr;
1676 	} else {
1677 		bp++;
1678 	}
1679 
1680 	*out |= *bp;
1681 	return (0);
1682 
1683 }
1684 
1685 static int
1686 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1687 {
1688 	size_t off;
1689 	uint16_t ether;
1690 	uint8_t ipproto, iplen, l4len, maclen;
1691 
1692 	bzero(meoi, sizeof (mac_ether_offload_info_t));
1693 
1694 	off = offsetof(struct ether_header, ether_type);
1695 	if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
1696 		return (-1);
1697 
1698 	if (ether == ETHERTYPE_VLAN) {
1699 		off = offsetof(struct ether_vlan_header, ether_type);
1700 		if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
1701 			return (-1);
1702 		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1703 		maclen = sizeof (struct ether_vlan_header);
1704 	} else {
1705 		maclen = sizeof (struct ether_header);
1706 	}
1707 	meoi->meoi_flags |= MEOI_L2INFO_SET;
1708 	meoi->meoi_l2hlen = maclen;
1709 	meoi->meoi_l3proto = ether;
1710 
1711 	switch (ether) {
1712 	case ETHERTYPE_IP:
1713 		/*
1714 		 * For IPv4 we need to get the length of the header, as it can
1715 		 * be variable.
1716 		 */
1717 		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1718 		if (i40e_meoi_get_uint8(mp, off, &iplen) != 0)
1719 			return (-1);
1720 		iplen &= 0x0f;
1721 		if (iplen < 5 || iplen > 0x0f)
1722 			return (-1);
1723 		iplen *= 4;
1724 		off = offsetof(ipha_t, ipha_protocol) + maclen;
1725 		if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
1726 			return (-1);
1727 		break;
1728 	case ETHERTYPE_IPV6:
1729 		iplen = 40;
1730 		off = offsetof(ip6_t, ip6_nxt) + maclen;
1731 		if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
1732 			return (-1);
1733 		break;
1734 	default:
1735 		return (0);
1736 	}
1737 	meoi->meoi_l3hlen = iplen;
1738 	meoi->meoi_l4proto = ipproto;
1739 	meoi->meoi_flags |= MEOI_L3INFO_SET;
1740 
1741 	switch (ipproto) {
1742 	case IPPROTO_TCP:
1743 		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1744 		if (i40e_meoi_get_uint8(mp, off, &l4len) == -1)
1745 			return (-1);
1746 		l4len = (l4len & 0xf0) >> 4;
1747 		if (l4len < 5 || l4len > 0xf)
1748 			return (-1);
1749 		l4len *= 4;
1750 		break;
1751 	case IPPROTO_UDP:
1752 		l4len = sizeof (struct udphdr);
1753 		break;
1754 	case IPPROTO_SCTP:
1755 		l4len = sizeof (sctp_hdr_t);
1756 		break;
1757 	default:
1758 		return (0);
1759 	}
1760 
1761 	meoi->meoi_l4hlen = l4len;
1762 	meoi->meoi_flags |= MEOI_L4INFO_SET;
1763 	return (0);
1764 }
1765 
1766 /*
1767  * Attempt to put togther the information we'll need to feed into a descriptor
1768  * to properly program the hardware for checksum offload as well as the
1769  * generally required flags.
1770  *
1771  * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or
1772  * into the descriptor based on the checksum flags for this mblk_t and the
1773  * actual information we care about.
1774  */
1775 static int
1776 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1777     i40e_tx_context_t *tctx)
1778 {
1779 	int ret;
1780 	uint32_t flags, start;
1781 	mac_ether_offload_info_t meo;
1782 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1783 
1784 	bzero(tctx, sizeof (i40e_tx_context_t));
1785 
1786 	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1787 		return (0);
1788 
1789 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
1790 	if (flags == 0)
1791 		return (0);
1792 
1793 	if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
1794 		txs->itxs_hck_meoifail.value.ui64++;
1795 		return (ret);
1796 	}
1797 
1798 	/*
1799 	 * Have we been asked to checksum an IPv4 header. If so, verify that we
1800 	 * have sufficient information and then set the proper fields in the
1801 	 * command structure.
1802 	 */
1803 	if (flags & HCK_IPV4_HDRCKSUM) {
1804 		if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
1805 			txs->itxs_hck_nol2info.value.ui64++;
1806 			return (-1);
1807 		}
1808 		if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
1809 			txs->itxs_hck_nol3info.value.ui64++;
1810 			return (-1);
1811 		}
1812 		if (meo.meoi_l3proto != ETHERTYPE_IP) {
1813 			txs->itxs_hck_badl3.value.ui64++;
1814 			return (-1);
1815 		}
1816 		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1817 		tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
1818 		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1819 		tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
1820 		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1821 	}
1822 
1823 	/*
1824 	 * We've been asked to provide an L4 header, first, set up the IP
1825 	 * information in the descriptor if we haven't already before moving
1826 	 * onto seeing if we have enough information for the L4 checksum
1827 	 * offload.
1828 	 */
1829 	if (flags & HCK_PARTIALCKSUM) {
1830 		if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
1831 			txs->itxs_hck_nol4info.value.ui64++;
1832 			return (-1);
1833 		}
1834 
1835 		if (!(flags & HCK_IPV4_HDRCKSUM)) {
1836 			if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
1837 				txs->itxs_hck_nol2info.value.ui64++;
1838 				return (-1);
1839 			}
1840 			if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
1841 				txs->itxs_hck_nol3info.value.ui64++;
1842 				return (-1);
1843 			}
1844 
1845 			if (meo.meoi_l3proto == ETHERTYPE_IP) {
1846 				tctx->itc_cmdflags |=
1847 				    I40E_TX_DESC_CMD_IIPT_IPV4;
1848 			} else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
1849 				tctx->itc_cmdflags |=
1850 				    I40E_TX_DESC_CMD_IIPT_IPV6;
1851 			} else {
1852 				txs->itxs_hck_badl3.value.ui64++;
1853 				return (-1);
1854 			}
1855 			tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
1856 			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1857 			tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
1858 			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1859 		}
1860 
1861 		switch (meo.meoi_l4proto) {
1862 		case IPPROTO_TCP:
1863 			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1864 			break;
1865 		case IPPROTO_UDP:
1866 			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1867 			break;
1868 		case IPPROTO_SCTP:
1869 			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1870 			break;
1871 		default:
1872 			txs->itxs_hck_badl4.value.ui64++;
1873 			return (-1);
1874 		}
1875 
1876 		tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
1877 		    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1878 	}
1879 
1880 	return (0);
1881 }
1882 
1883 static void
1884 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1885 {
1886 	ASSERT(tcb != NULL);
1887 
1888 	mutex_enter(&itrq->itrq_tcb_lock);
1889 	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1890 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1891 	itrq->itrq_tcb_free++;
1892 	mutex_exit(&itrq->itrq_tcb_lock);
1893 }
1894 
1895 static i40e_tx_control_block_t *
1896 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1897 {
1898 	i40e_tx_control_block_t *ret;
1899 
1900 	mutex_enter(&itrq->itrq_tcb_lock);
1901 	if (itrq->itrq_tcb_free == 0) {
1902 		mutex_exit(&itrq->itrq_tcb_lock);
1903 		return (NULL);
1904 	}
1905 
1906 	itrq->itrq_tcb_free--;
1907 	ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
1908 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
1909 	mutex_exit(&itrq->itrq_tcb_lock);
1910 
1911 	ASSERT(ret != NULL);
1912 	return (ret);
1913 }
1914 
1915 /*
1916  * This should be used to free any DMA resources, associated mblk_t's, etc. It's
1917  * used as part of recycling the message blocks when we have either an interrupt
1918  * or other activity that indicates that we need to take a look.
1919  */
1920 static void
1921 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1922 {
1923 	switch (tcb->tcb_type) {
1924 	case I40E_TX_COPY:
1925 		tcb->tcb_dma.dmab_len = 0;
1926 		break;
1927 	case I40E_TX_DMA:
1928 		(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
1929 		break;
1930 	case I40E_TX_NONE:
1931 		/* Cast to pacify lint */
1932 		panic("trying to free tcb %p with bad type none", (void *)tcb);
1933 	default:
1934 		panic("unknown i40e tcb type: %d", tcb->tcb_type);
1935 	}
1936 
1937 	tcb->tcb_type = I40E_TX_NONE;
1938 	freemsg(tcb->tcb_mp);
1939 	tcb->tcb_mp = NULL;
1940 	tcb->tcb_next = NULL;
1941 }
1942 
1943 /*
1944  * This is called as part of shutting down to clean up all outstanding
1945  * descriptors. Similar to recycle, except we don't re-arm anything and instead
1946  * just return control blocks to the free list.
1947  */
1948 void
1949 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
1950 {
1951 	uint32_t index;
1952 
1953 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
1954 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1955 
1956 	/*
1957 	 * Because we should have shut down the chip at this point, it should be
1958 	 * safe to just clean up all the entries between our head and tail.
1959 	 */
1960 #ifdef	DEBUG
1961 	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
1962 	    I40E_QTX_ENA(itrq->itrq_index));
1963 	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1964 	    I40E_QTX_ENA_QENA_STAT_MASK));
1965 #endif
1966 
1967 	index = itrq->itrq_desc_head;
1968 	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1969 		i40e_tx_control_block_t *tcb;
1970 
1971 		tcb = itrq->itrq_tcb_work_list[index];
1972 		VERIFY(tcb != NULL);
1973 		itrq->itrq_tcb_work_list[index] = NULL;
1974 		i40e_tcb_reset(tcb);
1975 		i40e_tcb_free(itrq, tcb);
1976 
1977 		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1978 		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1979 		itrq->itrq_desc_free++;
1980 	}
1981 
1982 	ASSERT(index == itrq->itrq_desc_tail);
1983 	itrq->itrq_desc_head = index;
1984 }
1985 
1986 /*
1987  * We're here either by hook or by crook. We need to see if there are transmit
1988  * descriptors available for us to go and clean up and return to the hardware.
1989  * We may also be blocked, and if so, we should make sure that we let it know
1990  * we're good to go.
1991  */
1992 void
1993 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1994 {
1995 	uint32_t wbhead, toclean, count;
1996 	i40e_tx_control_block_t *tcbhead;
1997 	i40e_t *i40e = itrq->itrq_i40e;
1998 
1999 	mutex_enter(&itrq->itrq_tx_lock);
2000 
2001 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2002 	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
2003 		if (itrq->itrq_tx_blocked == B_TRUE) {
2004 			itrq->itrq_tx_blocked = B_FALSE;
2005 			mac_tx_ring_update(i40e->i40e_mac_hdl,
2006 			    itrq->itrq_mactxring);
2007 			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2008 		}
2009 		mutex_exit(&itrq->itrq_tx_lock);
2010 		return;
2011 	}
2012 
2013 	/*
2014 	 * Now we need to try and see if there's anything available. The driver
2015 	 * will write to the head location and it guarantees that it does not
2016 	 * use relaxed ordering.
2017 	 */
2018 	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
2019 	    (uintptr_t)itrq->itrq_desc_wbhead,
2020 	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
2021 
2022 	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
2023 	    DDI_FM_OK) {
2024 		mutex_exit(&itrq->itrq_tx_lock);
2025 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2026 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2027 		return;
2028 	}
2029 
2030 	wbhead = *itrq->itrq_desc_wbhead;
2031 	toclean = itrq->itrq_desc_head;
2032 	count = 0;
2033 	tcbhead = NULL;
2034 
2035 	while (toclean != wbhead) {
2036 		i40e_tx_control_block_t *tcb;
2037 
2038 		tcb = itrq->itrq_tcb_work_list[toclean];
2039 		itrq->itrq_tcb_work_list[toclean] = NULL;
2040 		ASSERT(tcb != NULL);
2041 		tcb->tcb_next = tcbhead;
2042 		tcbhead = tcb;
2043 
2044 		/*
2045 		 * We zero this out for sanity purposes.
2046 		 */
2047 		bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
2048 		toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
2049 		count++;
2050 	}
2051 
2052 	itrq->itrq_desc_head = wbhead;
2053 	itrq->itrq_desc_free += count;
2054 	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2055 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2056 
2057 	if (itrq->itrq_tx_blocked == B_TRUE &&
2058 	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2059 		itrq->itrq_tx_blocked = B_FALSE;
2060 
2061 		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2062 		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2063 	}
2064 
2065 	mutex_exit(&itrq->itrq_tx_lock);
2066 
2067 	/*
2068 	 * Now clean up the tcb.
2069 	 */
2070 	while (tcbhead != NULL) {
2071 		i40e_tx_control_block_t *tcb = tcbhead;
2072 
2073 		tcbhead = tcb->tcb_next;
2074 		i40e_tcb_reset(tcb);
2075 		i40e_tcb_free(itrq, tcb);
2076 	}
2077 
2078 	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2079 }
2080 
2081 /*
2082  * We've been asked to send a message block on the wire. We'll only have a
2083  * single chain. There will not be any b_next pointers; however, there may be
2084  * multiple b_cont blocks.
2085  *
2086  * We may do one of three things with any given mblk_t chain:
2087  *
2088  *   1) Drop it
2089  *   2) Transmit it
2090  *   3) Return it
2091  *
2092  * If we return it to MAC, then MAC will flow control on our behalf. In other
2093  * words, it won't send us anything until we tell it that it's okay to send us
2094  * something.
2095  */
2096 mblk_t *
2097 i40e_ring_tx(void *arg, mblk_t *mp)
2098 {
2099 	const mblk_t *nmp;
2100 	size_t mpsize;
2101 	i40e_tx_control_block_t *tcb;
2102 	i40e_tx_desc_t *txdesc;
2103 	i40e_tx_context_t tctx;
2104 	int cmd, type;
2105 
2106 	i40e_trqpair_t *itrq = arg;
2107 	i40e_t *i40e = itrq->itrq_i40e;
2108 	i40e_hw_t *hw = &i40e->i40e_hw_space;
2109 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2110 
2111 	ASSERT(mp->b_next == NULL);
2112 
2113 	if (!(i40e->i40e_state & I40E_STARTED) ||
2114 	    (i40e->i40e_state & I40E_OVERTEMP) ||
2115 	    (i40e->i40e_state & I40E_SUSPENDED) ||
2116 	    (i40e->i40e_state & I40E_ERROR) ||
2117 	    (i40e->i40e_link_state != LINK_STATE_UP)) {
2118 		freemsg(mp);
2119 		return (NULL);
2120 	}
2121 
2122 	/*
2123 	 * Figure out the relevant context about this frame that we might need
2124 	 * for enabling checksum, lso, etc. This also fills in information that
2125 	 * we might set around the packet type, etc.
2126 	 */
2127 	if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
2128 		freemsg(mp);
2129 		itrq->itrq_txstat.itxs_err_context.value.ui64++;
2130 		return (NULL);
2131 	}
2132 
2133 	/*
2134 	 * For the primordial driver we can punt on doing any recycling right
2135 	 * now; however, longer term we need to probably do some more pro-active
2136 	 * recycling to cut back on stalls in the tx path.
2137 	 */
2138 
2139 	/*
2140 	 * Do a quick size check to make sure it fits into what we think it
2141 	 * should for this device. Note that longer term this will be false,
2142 	 * particularly when we have the world of TSO.
2143 	 */
2144 	mpsize = 0;
2145 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2146 		mpsize += MBLKL(nmp);
2147 	}
2148 
2149 	/*
2150 	 * First we allocate our tx control block and prepare the packet for
2151 	 * transmit before we do a final check for descriptors. We do it this
2152 	 * way to minimize the time under the tx lock.
2153 	 */
2154 	tcb = i40e_tcb_alloc(itrq);
2155 	if (tcb == NULL) {
2156 		txs->itxs_err_notcb.value.ui64++;
2157 		goto txfail;
2158 	}
2159 
2160 	/*
2161 	 * For transmitting a block, we're currently going to use just a
2162 	 * single control block and bcopy all of the fragments into it. We
2163 	 * should be more intelligent about doing DMA binding or otherwise, but
2164 	 * for getting off the ground this will have to do.
2165 	 */
2166 	ASSERT(tcb->tcb_dma.dmab_len == 0);
2167 	ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
2168 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2169 		size_t clen = MBLKL(nmp);
2170 		void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2171 
2172 		bcopy(nmp->b_rptr, coff, clen);
2173 		tcb->tcb_dma.dmab_len += clen;
2174 	}
2175 	ASSERT(tcb->tcb_dma.dmab_len == mpsize);
2176 
2177 	/*
2178 	 * While there's really no need to keep the mp here, but let's just do
2179 	 * it to help with our own debugging for now.
2180 	 */
2181 	tcb->tcb_mp = mp;
2182 	tcb->tcb_type = I40E_TX_COPY;
2183 	I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2184 
2185 	mutex_enter(&itrq->itrq_tx_lock);
2186 	if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
2187 		txs->itxs_err_nodescs.value.ui64++;
2188 		mutex_exit(&itrq->itrq_tx_lock);
2189 		goto txfail;
2190 	}
2191 
2192 	/*
2193 	 * Build up the descriptor and send it out. Thankfully at the moment
2194 	 * we only need a single desc, because we're not doing anything fancy
2195 	 * yet.
2196 	 */
2197 	ASSERT(itrq->itrq_desc_free > 0);
2198 	itrq->itrq_desc_free--;
2199 	txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2200 	itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2201 	itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2202 	    itrq->itrq_tx_ring_size);
2203 
2204 	/*
2205 	 * Note, we always set EOP and RS which indicates that this is the last
2206 	 * data frame and that we should ask for it to be transmitted. We also
2207 	 * must always set ICRC, because that is an internal bit that must be
2208 	 * set to one for data descriptors. The remaining bits in the command
2209 	 * descriptor depend on checksumming and are determined based on the
2210 	 * information set up in i40e_tx_context().
2211 	 */
2212 	type = I40E_TX_DESC_DTYPE_DATA;
2213 	cmd = I40E_TX_DESC_CMD_EOP |
2214 	    I40E_TX_DESC_CMD_RS |
2215 	    I40E_TX_DESC_CMD_ICRC |
2216 	    tctx.itc_cmdflags;
2217 	txdesc->buffer_addr =
2218 	    CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
2219 	txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
2220 	    ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2221 	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2222 	    ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2223 
2224 	/*
2225 	 * Now, finally, sync the DMA data and alert hardware.
2226 	 */
2227 	I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2228 
2229 	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2230 	    itrq->itrq_desc_tail);
2231 	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2232 	    DDI_FM_OK) {
2233 		/*
2234 		 * Note, we can't really go through and clean this up very well,
2235 		 * because the memory has been given to the device, so just
2236 		 * indicate it's been transmitted.
2237 		 */
2238 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2239 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2240 	}
2241 
2242 	txs->itxs_bytes.value.ui64 += mpsize;
2243 	txs->itxs_packets.value.ui64++;
2244 	txs->itxs_descriptors.value.ui64++;
2245 
2246 	mutex_exit(&itrq->itrq_tx_lock);
2247 
2248 	return (NULL);
2249 
2250 txfail:
2251 	/*
2252 	 * We ran out of resources. Return it to MAC and indicate that we'll
2253 	 * need to signal MAC. If there are allocated tcb's, return them now.
2254 	 * Make sure to reset their message block's, since we'll return them
2255 	 * back to MAC.
2256 	 */
2257 	if (tcb != NULL) {
2258 		tcb->tcb_mp = NULL;
2259 		i40e_tcb_reset(tcb);
2260 		i40e_tcb_free(itrq, tcb);
2261 	}
2262 
2263 	mutex_enter(&itrq->itrq_tx_lock);
2264 	itrq->itrq_tx_blocked = B_TRUE;
2265 	mutex_exit(&itrq->itrq_tx_lock);
2266 
2267 	return (mp);
2268 }
2269