xref: /illumos-gate/usr/src/uts/common/io/i40e/i40e_transceiver.c (revision ca28c3d8eab8b53ff145fd15cf80cdc2da3fc032)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 #include "i40e_sw.h"
19 
20 /*
21  * ---------------------------------------------------------
22  * Buffer and Memory Management, Receiving, and Transmitting
23  * ---------------------------------------------------------
24  *
25  * Each physical function (PF), which is what we think of as an instance of the
26  * device driver, has a series of associated transmit and receive queue pairs.
27  * Effectively, what we think of in MAC as rings. Each of these has their own
28  * ring of descriptors which is used as part of doing DMA activity.
29  *
30  * The transmit ring of descriptors are 16-byte entries which are used to send
31  * packets, program filters, etc. The receive ring of descriptors are either
32  * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
33  * format so that we're in a better position if we ever want to leverage that
34  * information later on.
35  *
36  * However, these rings are just for descriptors, they don't talk or deal with
37  * how we actually store the memory that we need for DMA or the associated
38  * information that we need for keeping track of message blocks. To correspond
39  * to the hardware descriptor ring which is how we communicate with hardware, we
40  * introduce a control block which keeps track of our required metadata like DMA
41  * mappings.
42  *
43  * There are two main considerations that dictate how much memory and buffers
44  * we end up allocating. Those are:
45  *
46  *   o The size of the ring (controlled through the driver.conf file)
47  *
48  *   o The maximum size frame we can receive.
49  *
50  * The size of the rings currently defaults to 1024 descriptors and is stored in
51  * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
52  *
53  * While the size of the rings is controlled by the driver.conf, the maximum
54  * size frame is informed primarily through the use of dladm and the setting of
55  * the MTU property on the device. From the MTU, we then go and do some
56  * machinations. The first thing we do is we then have to add in space for the
57  * Ethernet header, potentially a VLAN header, and the FCS check. This value is
58  * what's stored as i40e_t`i40e_frame_max and is derived any time
59  * i40e_t`i40e_sdu changes.
60  *
61  * This size is then rounded up to the nearest 1k chunk, which represents the
62  * actual amount of memory that we'll allocate for a single frame.
63  *
64  * Note, that for RX, we do something that might be unexpected. We always add
65  * an extra two bytes to the frame size that we allocate. We then offset the DMA
66  * address that we receive a packet into by two bytes. This ensures that the IP
67  * header will always be 4 byte aligned because the MAC header is either 14 or
68  * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
69  * and MAC's lives easier.
70  *
71  * Both the RX and TX descriptor rings (which are what we use to communicate
72  * with hardware) are allocated as a single region of DMA memory which is the
73  * size of the descriptor (32 bytes and 16 bytes respectively) times the total
74  * number of descriptors for an RX and TX ring.
75  *
76  * While the RX and TX descriptors are allocated using DMA-based memory, the
77  * control blocks for each of them are allocated using normal kernel memory.
78  * They aren't special from a DMA perspective. We'll go over the design of both
79  * receiving and transmitting separately, as they have slightly different
80  * control blocks and different ways that we manage the relationship between
81  * control blocks and descriptors.
82  *
83  * ---------------------------------
84  * RX Descriptors and Control Blocks
85  * ---------------------------------
86  *
87  * For every descriptor in the ring that the driver has, we need some associated
88  * memory, which means that we need to have the receive specific control block.
89  * We have a couple different, but related goals:
90  *
91  *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
92  *     not want to do any additional memory allocations or DMA allocations if
93  *     we don't have to.
94  *
95  *   o We'd like to try and do as much zero-copy as possible, while taking into
96  *     account the cost of mapping in DMA resources.
97  *
98  *   o We'd like to have every receive descriptor available.
99  *
100  * Now, these rules are a bit in tension with one another. The act of mapping in
101  * is an exercise of trying to find the break-even point between page table
102  * updates and bcopy. We currently start by using the same metrics that ixgbe
103  * used; however, it should be known that this value has effectively been
104  * cargo-culted across to yet another driver, sorry.
105  *
106  * If we receive a packet which is larger than our copy threshold, we'll create
107  * a message block out of the DMA memory via desballoc(9F) and send that up to
108  * MAC that way. This will cause us to be notified when the message block is
109  * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
110  * it's less than the threshold, we'll try to use allocb and bcopy it into the
111  * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
112  * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
113  * the behavior and always do a bcopy or a DMA bind.
114  *
115  * To try and ensure that the device always has blocks that it can receive data
116  * into, we maintain two lists of control blocks, a working list and a free
117  * list. Each list is sized equal to the number of descriptors in the RX ring.
118  * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
119  * equal to twice the number of descriptors in the ring and we assign them
120  * equally to the free list and to the working list. Each control block also has
121  * DMA memory allocated and associated with it, which will be used to receive
122  * the actual packet data. All of a received frame's data will end up in a
123  * single DMA buffer.
124  *
125  * During operation, we always maintain the invariant that each RX descriptor
126  * has an associated RX control block which lives in the working list. If we
127  * feel that we should loan up DMA memory to MAC in the form of a message block,
128  * we can only do so if we can maintain this invariant. To do that, we swap in
129  * one of the buffers from the free list. If none are available, then we resort
130  * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
131  * size.
132  *
133  * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
134  * called on the block, at which point we restore the RX control block to the
135  * free list and are able to reuse the DMA memory again. While the scheme may
136  * seem odd, it importantly keeps us out of trying to do any DMA allocations in
137  * the normal path of operation, even though we may still have to allocate
138  * message blocks and copy.
139  *
140  * The following state machine describes the lifetime of an RX control block.
141  * In the diagram we abbreviate the RX ring descriptor entry as rxd and the rx
142  * control block entry as rcb.
143  *
144  *             |                                   |
145  *             * ... 1/2 of all initial rcb's  ... *
146  *             |                                   |
147  *             v                                   v
148  *     +------------------+               +------------------+
149  *     | rcb on free list |---*---------->| rcb on work list |
150  *     +------------------+   .           +------------------+
151  *             ^              . moved to          |
152  *             |                replace rcb       * . . Frame received,
153  *             |                loaned to         |     entry on free list
154  *             |                MAC + co.         |     available. rcb's
155  *             |                                  |     memory made into mblk_t
156  *             * . freemsg(9F)                    |     and sent up to MAC.
157  *             |   called on                      |
158  *             |   loaned rcb                     |
159  *             |   and it is                      v
160  *             |   recycled.              +-------------------+
161  *             +--------------------<-----| rcb loaned to MAC |
162  *                                        +-------------------+
163  *
164  * Finally, note that every RX control block has a reference count on it. One
165  * reference is added as long as the driver has had the GLDv3 mc_start endpoint
166  * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
167  * no other DLPI consumers remain, then we'll decrement the reference count by
168  * one. Whenever we loan up the RX control block and associated buffer to MAC,
169  * then we bump the reference count again. Even though the device is stopped,
170  * there may still be loaned frames in upper levels that we'll want to account
171  * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
172  * that it is cleaned up.
173  *
174  * --------------------
175  * Managing the RX Ring
176  * --------------------
177  *
178  * The receive ring descriptors are arranged in a circular buffer with a head
179  * and tail pointer. There are both the conventional head and tail pointers
180  * which are used to partition the ring into two portions, a portion that we,
181  * the operating system, manage and a portion that is managed by hardware. When
182  * hardware owns a descriptor in the ring, it means that it is waiting for data
183  * to be filled in. However, when a portion of the ring is owned by the driver,
184  * then that means that the descriptor has been consumed and we need to go take
185  * a look at it.
186  *
187  * The initial head is configured to be zero by writing it as such in the
188  * receive queue context in the FPM (function private memory from the host). The
189  * initial tail is written to be the last descriptor. This is written to via the
190  * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
191  * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
192  * the only values we ever consult ourselves are the TAIL register and our own
193  * state tracking. Effectively, we cache the HEAD register and then update it
194  * ourselves based on our work.
195  *
196  * When we iterate over the RX descriptors and thus the received frames, we are
197  * either in an interrupt context or we've been asked by MAC to poll on the
198  * ring. If we've been asked to poll on the ring, we have a maximum number of
199  * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
200  * exceed that count, then we do not process it. When in interrupt context, we
201  * don't have a strict byte count. However, to ensure liveness, we limit the
202  * amount of data based on a configuration value
203  * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
204  * is based on similar numbers that are used for ixgbe. After some additional
205  * time in the field, we'll have a sense as to whether or not it should be
206  * changed.
207  *
208  * When processing, we start at our own HEAD pointer
209  * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
210  * processing. Every RX descriptor has what's described as the DD bit. This bit
211  * (the LSB of the second 8-byte word), indicates whether or not the descriptor
212  * is done.  When we give descriptors to the hardware, this value is always
213  * zero. When the hardware has finished a descriptor, it will always be one.
214  *
215  * The first thing that we check is whether the DD bit indicates that the
216  * current HEAD is ready. If it isn't, then we're done. That's the primary
217  * invariant of processing a frame. If it's done, then there are a few other
218  * things that we want to look at. In the same status word as the DD bit, there
219  * are two other important bits:
220  *
221  *   o End of Packet (EOP)
222  *   o Error bits
223  *
224  * The end of packet indicates that we have reached the last descriptor. Now,
225  * you might ask when would there be more than one descriptor. The reason for
226  * that might be due to large receive offload (lro) or header splitting
227  * functionality, which presently isn't supported in the driver. The error bits
228  * in the frame are only valid when EOP is set.
229  *
230  * If error bits are set on the frame, then we still consume it; however, we
231  * will not generate an mblk_t to send up to MAC. If there are no error bits
232  * set, then we'll consume the descriptor either using bcopy or DMA binding. See
233  * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
234  * on how that selection is made.
235  *
236  * Regardless of whether we construct an mblk_t or encounter an error, we end up
237  * resetting the descriptor. This re-arms the descriptor for hardware and in the
238  * process, we may end up assigning it a new receive control block. After we do
239  * this, we always update our HEAD pointer, no matter what.
240  *
241  * Finally, once we've consumed as much as we will in a given window, we go and
242  * update the TAIL register to indicate all the frames we've consumed. We only
243  * do a single bulk write for the ring.
244  *
245  * ---------------------------------
246  * TX Descriptors and Control Blocks
247  * ---------------------------------
248  *
249  * While the transmit path is similar in spirit to the receive path, it works
250  * differently due to the fact that all data is originated by the operating
251  * system and not by the device.
252  *
253  * Like RX, there is both a descriptor ring that we use to communicate to the
254  * device and which points to the memory used to transmit a frame.  Similarly,
255  * there is a corresponding transmit control block, however, the correspondence
256  * between descriptors and control blocks is more complex and not necessarily
257  * 1-to-1.
258  *
259  * The driver is asked to process a single frame at a time. That message block
260  * may be made up of multiple fragments linked together by the mblk_t`b_cont
261  * member. The device has a hard limit of up to 8 buffers being allowed for use
262  * for a single non-LSO packet or LSO segment. The number of TX ring entries
263  * (and thus TX control blocks) used depends on the fragment sizes and DMA
264  * layout, as explained below.
265  *
266  * We alter our DMA strategy based on a threshold tied to the fragment size.
267  * This threshold is configurable via the tx_dma_threshold property. If the
268  * fragment is above the threshold, we DMA bind it -- consuming one TCB and
269  * potentially several data descriptors. The exact number of descriptors (equal
270  * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
271  * into page, b_wptr offset into page, and the physical layout of the dblk's
272  * memory (contiguous or not). Essentially, we are at the mercy of the DMA
273  * engine and the dblk's memory allocation. Knowing the exact number of
274  * descriptors up front is a task best not taken on by the driver itself.
275  * Instead, we attempt to DMA bind the fragment and verify the descriptor
276  * layout meets hardware constraints. If the proposed DMA bind does not satisfy
277  * the hardware constraints, then we discard it and instead copy the entire
278  * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
279  * larger than the TCB buffer).
280  *
281  * If the fragment is below or at the threshold, we copy it to the pre-allocated
282  * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
283  * conserve resources. We are guaranteed that the TCB buffer is made up of only
284  * 1 DMA cookie; and therefore consumes only one descriptor on the controller.
285  *
286  * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
287  * filtering, then the TX data descriptors must be preceded by a single TX
288  * context descriptor.  Because there is no DMA transfer associated with the
289  * context descriptor, we allocate a control block with a special type which
290  * indicates to the TX ring recycle code that there are no associated DMA
291  * resources to unbind when the control block is free'd.
292  *
293  * If we don't have enough space in the ring or TX control blocks available,
294  * then we'll return the unprocessed message block to MAC. This will induce flow
295  * control and once we recycle enough entries, we'll once again enable sending
296  * on the ring.
297  *
298  * We size the working list as equal to the number of descriptors in the ring.
299  * We size the free list as equal to 1.5 times the number of descriptors in the
300  * ring. We'll allocate a number of TX control block entries equal to the number
301  * of entries in the free list. By default, all entries are placed in the free
302  * list. As we come along and try to send something, we'll allocate entries from
303  * the free list and add them to the working list, where they'll stay until the
304  * hardware indicates that all of the data has been written back to us. The
305  * reason that we start with 1.5x is to help facilitate having more than one TX
306  * buffer associated with the DMA activity.
307  *
308  * --------------------
309  * Managing the TX Ring
310  * --------------------
311  *
312  * The transmit descriptor ring is driven by us. We maintain our own notion of a
313  * HEAD and TAIL register and we update the hardware with updates to the TAIL
314  * register. When the hardware is done writing out data, it updates us by
315  * writing back to a specific address, not by updating the individual
316  * descriptors. That address is a 4-byte region after the main transmit
317  * descriptor ring. This is why the descriptor ring has an extra descriptor's
318  * worth allocated to it.
319  *
320  * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
321  * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
322  * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
323  * points in time, through both interrupts, and our own internal checks, we'll
324  * sync the write-back head portion of the DMA space. Based on the index it
325  * reports back, we'll free everything between our current HEAD and the
326  * indicated index and update HEAD to the new index.
327  *
328  * When a frame comes in, we try to use a number of transmit control blocks and
329  * we'll transition them from the free list to the work list. They'll get moved
330  * to the entry on the work list that corresponds with the transmit descriptor
331  * they correspond to. Once we are indicated that the corresponding descriptor
332  * has been freed, we'll return it to the list.
333  *
334  * The transmit control block free list is managed by keeping track of the
335  * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
336  * index into the free list and add things to it. In effect, we always push and
337  * pop from the tail and protect it with a single lock,
338  * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
339  * stand up to further performance testing; however, it does allow us to get off
340  * the ground with the device driver.
341  *
342  * The following image describes where a given transmit control block lives in
343  * its lifetime:
344  *
345  *             |
346  *             * ... Initial placement for all tcb's
347  *             |
348  *             v
349  *    +------------------+                       +------------------+
350  *    | tcb on free list |---*------------------>| tcb on work list |
351  *    +------------------+   .                   +------------------+
352  *             ^             . N tcbs allocated[1]         |
353  *             |               to send frame               v
354  *             |               or fragment on              |
355  *             |               wire, mblk from             |
356  *             |               MAC associated.             |
357  *             |                                           |
358  *             +------*-------------------------------<----+
359  *                    .
360  *                    . Hardware indicates
361  *                      entry transmitted.
362  *                      tcbs recycled, mblk
363  *                      from MAC freed.
364  *
365  * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
366  *     descriptor plus 1 data descriptor, in the non-DMA-bind case.  In the DMA
367  *     bind case, N can be 1 context descriptor plus 1 data descriptor per
368  *     b_cont in the mblk.  In this case, the mblk is associated with the first
369  *     data descriptor and freed as part of freeing that data descriptor.
370  *
371  * ------------
372  * Blocking MAC
373  * ------------
374  *
375  * When performing transmit, we can run out of descriptors and ring entries.
376  * When such a case happens, we return the mblk_t to MAC to indicate that we've
377  * been blocked. At that point in time, MAC becomes blocked and will not
378  * transmit anything out that specific ring until we notify MAC. To indicate
379  * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
380  * to B_TRUE.
381  *
382  * When we recycle TX descriptors then we'll end up signaling MAC by calling
383  * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
384  * start sending frames out to us again.
385  */
386 
387 /*
388  * We set our DMA alignment requests based on the smallest supported page size
389  * of the corresponding platform.
390  */
391 #if	defined(__sparc)
392 #define	I40E_DMA_ALIGNMENT 0x2000ull
393 #elif defined(__x86)
394 #define	I40E_DMA_ALIGNMENT 0x1000ull
395 #else
396 #error	"unknown architecture for i40e"
397 #endif
398 
399 /*
400  * This structure is used to maintain information and flags related to
401  * transmitting a frame.  These fields are ultimately used to construct the
402  * TX data descriptor(s) and, if necessary, the TX context descriptor.
403  */
404 typedef struct i40e_tx_context {
405 	enum i40e_tx_desc_cmd_bits	itc_data_cmdflags;
406 	uint32_t			itc_data_offsets;
407 	enum i40e_tx_ctx_desc_cmd_bits	itc_ctx_cmdflags;
408 	uint32_t			itc_ctx_tsolen;
409 	uint32_t			itc_ctx_mss;
410 } i40e_tx_context_t;
411 
/*
 * Debug-build-only knob that overrides the threshold-based choice of how a
 * received frame is handed up.  i40e_debug_rx_mode starts out as
 * I40E_DEBUG_RX_DEFAULT.
 */
#if	defined(DEBUG)
typedef enum {
	I40E_DEBUG_RX_DEFAULT	= 0,	/* no override; thresholds decide */
	I40E_DEBUG_RX_BCOPY	= 1,	/* override: use bcopy */
	I40E_DEBUG_RX_DMABIND	= 2	/* override: use DMA bind */
} i40e_debug_rx_t;

i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* defined(DEBUG) */
425 
426 /*
427  * Notes on the following three DMA attributes. The first attribute,
428  * i40e_static_dma_attr, is designed to be used for both the descriptor rings
429  * and the static buffers that we associate with control blocks. For this
430  * reason, we force an SGL length of one. While technically the driver supports
431  * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
432  * management here. In addition, when the Intel common code wants to allocate
433  * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
434  * the static dma attr.
435  *
436  * The latter two sets of attributes are what we use when we're binding a
437  * bunch of mblk_t fragments to go out the door. Note that the main difference
438  * here is that we're allowed a larger SGL length.  For non-LSO TX, we
439  * restrict the SGL length to match the number of TX buffers available to the
440  * PF (8).  For the LSO case we can go much larger, with the caveat that each
441  * MSS-sized chunk (segment) must not span more than 8 data descriptors and
442  * hence must not span more than 8 cookies.
443  *
444  * Note, we default to setting ourselves to be DMA capable here. However,
445  * because we could have multiple instances which have different FMA error
446  * checking capabilities, or end up on different buses, we make these static
447  * and const and copy them into the i40e_t for the given device with the actual
448  * values that reflect the actual capabilities.
449  */
450 static const ddi_dma_attr_t i40e_g_static_dma_attr = {
451 	DMA_ATTR_V0,			/* version number */
452 	0x0000000000000000ull,		/* low address */
453 	0xFFFFFFFFFFFFFFFFull,		/* high address */
454 	0x00000000FFFFFFFFull,		/* dma counter max */
455 	I40E_DMA_ALIGNMENT,		/* alignment */
456 	0x00000FFF,			/* burst sizes */
457 	0x00000001,			/* minimum transfer size */
458 	0x00000000FFFFFFFFull,		/* maximum transfer size */
459 	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
460 	1,				/* scatter/gather list length */
461 	0x00000001,			/* granularity */
462 	DDI_DMA_FLAGERR			/* DMA flags */
463 };
464 
465 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
466 	DMA_ATTR_V0,			/* version number */
467 	0x0000000000000000ull,		/* low address */
468 	0xFFFFFFFFFFFFFFFFull,		/* high address */
469 	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
470 	I40E_DMA_ALIGNMENT,		/* alignment */
471 	0x00000FFF,			/* burst sizes */
472 	0x00000001,			/* minimum transfer size */
473 	0x00000000FFFFFFFFull,		/* maximum transfer size */
474 	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size	 */
475 	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
476 	0x00000001,			/* granularity */
477 	DDI_DMA_FLAGERR			/* DMA flags */
478 };
479 
480 static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
481 	DMA_ATTR_V0,			/* version number */
482 	0x0000000000000000ull,		/* low address */
483 	0xFFFFFFFFFFFFFFFFull,		/* high address */
484 	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
485 	I40E_DMA_ALIGNMENT,		/* alignment */
486 	0x00000FFF,			/* burst sizes */
487 	0x00000001,			/* minimum transfer size */
488 	0x00000000FFFFFFFFull,		/* maximum transfer size */
489 	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size	 */
490 	I40E_TX_LSO_MAX_COOKIE,		/* scatter/gather list length */
491 	0x00000001,			/* granularity */
492 	DDI_DMA_FLAGERR			/* DMA flags */
493 };
494 
495 /*
496  * Next, we have the attributes for these structures. The descriptor rings are
497  * all strictly little endian, while the data buffers are just arrays of bytes
498  * representing frames. Because of this, we purposefully simplify the driver
499  * programming life by programming the descriptor ring as little endian, while
500  * for the buffer data we keep it as unstructured.
501  *
502  * Note, that to keep the Intel common code operating in a reasonable way, when
503  * we allocate DMA memory for it, we do not use byte swapping and thus use the
504  * standard i40e_buf_acc_attr.
505  */
506 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
507 	DDI_DEVICE_ATTR_V0,
508 	DDI_STRUCTURE_LE_ACC,
509 	DDI_STRICTORDER_ACC
510 };
511 
512 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
513 	DDI_DEVICE_ATTR_V0,
514 	DDI_NEVERSWAP_ACC,
515 	DDI_STRICTORDER_ACC
516 };
517 
518 /*
519  * The next two functions are designed to be type-safe versions of macros that
520  * are used to increment and decrement a descriptor index in the loop. Note,
521  * these are marked inline to try and keep the data path hot and they were
522  * effectively inlined in their previous life as macros.
523  */
524 static inline int
i40e_next_desc(int base,int count,int size)525 i40e_next_desc(int base, int count, int size)
526 {
527 	int out;
528 
529 	ASSERT(base >= 0);
530 	ASSERT(count > 0);
531 	ASSERT(size > 0);
532 
533 	if (base + count < size) {
534 		out = base + count;
535 	} else {
536 		out = base + count - size;
537 	}
538 
539 	ASSERT(out >= 0 && out < size);
540 	return (out);
541 }
542 
/*
 * Return the index that lies count entries before base in a ring of size
 * entries, wrapping past the start at most once.
 */
static inline int
i40e_prev_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	idx = base - count;
	if (idx < 0)
		idx += size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
561 
562 /*
563  * Free DMA memory that is represented by a i40e_dma_buffer_t.
564  */
565 static void
i40e_free_dma_buffer(i40e_dma_buffer_t * dmap)566 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
567 {
568 	if (dmap->dmab_dma_address != 0) {
569 		VERIFY(dmap->dmab_dma_handle != NULL);
570 		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
571 		dmap->dmab_dma_address = 0;
572 		dmap->dmab_size = 0;
573 	}
574 
575 	if (dmap->dmab_acc_handle != NULL) {
576 		ddi_dma_mem_free(&dmap->dmab_acc_handle);
577 		dmap->dmab_acc_handle = NULL;
578 		dmap->dmab_address = NULL;
579 	}
580 
581 	if (dmap->dmab_dma_handle != NULL) {
582 		ddi_dma_free_handle(&dmap->dmab_dma_handle);
583 		dmap->dmab_dma_handle = NULL;
584 	}
585 
586 	/*
587 	 * These should only be set if we have valid handles allocated and
588 	 * therefore should always be NULLed out due to the above code. This
589 	 * is here to catch us acting sloppy.
590 	 */
591 	ASSERT(dmap->dmab_dma_address == 0);
592 	ASSERT(dmap->dmab_address == NULL);
593 	ASSERT(dmap->dmab_size == 0);
594 	dmap->dmab_len = 0;
595 }
596 
597 /*
598  * Allocate size bytes of DMA memory based on the passed in attributes. This
599  * fills in the information in dmap and is designed for all of our single cookie
600  * allocations.
601  */
602 static boolean_t
i40e_alloc_dma_buffer(i40e_t * i40e,i40e_dma_buffer_t * dmap,ddi_dma_attr_t * attrsp,ddi_device_acc_attr_t * accp,boolean_t stream,boolean_t zero,size_t size)603 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
604     ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
605     boolean_t zero, size_t size)
606 {
607 	int ret;
608 	uint_t flags;
609 	size_t len;
610 	ddi_dma_cookie_t cookie;
611 	uint_t ncookies;
612 
613 	if (stream == B_TRUE)
614 		flags = DDI_DMA_STREAMING;
615 	else
616 		flags = DDI_DMA_CONSISTENT;
617 
618 	/*
619 	 * Step one: Allocate the DMA handle
620 	 */
621 	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
622 	    NULL, &dmap->dmab_dma_handle);
623 	if (ret != DDI_SUCCESS) {
624 		i40e_error(i40e, "failed to allocate dma handle for I/O "
625 		    "buffers: %d", ret);
626 		dmap->dmab_dma_handle = NULL;
627 		return (B_FALSE);
628 	}
629 
630 	/*
631 	 * Step two: Allocate the DMA memory
632 	 */
633 	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
634 	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
635 	    &dmap->dmab_acc_handle);
636 	if (ret != DDI_SUCCESS) {
637 		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
638 		    "buffers", size);
639 		dmap->dmab_address = NULL;
640 		dmap->dmab_acc_handle = NULL;
641 		i40e_free_dma_buffer(dmap);
642 		return (B_FALSE);
643 	}
644 
645 	/*
646 	 * Step three: Optionally zero
647 	 */
648 	if (zero == B_TRUE)
649 		bzero(dmap->dmab_address, len);
650 
651 	/*
652 	 * Step four: Bind the memory
653 	 */
654 	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
655 	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
656 	    NULL, &cookie, &ncookies);
657 	if (ret != DDI_DMA_MAPPED) {
658 		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
659 		    "buffers: %d", size, ret);
660 		i40e_free_dma_buffer(dmap);
661 		return (B_FALSE);
662 	}
663 
664 	VERIFY(ncookies == 1);
665 	dmap->dmab_dma_address = cookie.dmac_laddress;
666 	dmap->dmab_size = len;
667 	dmap->dmab_len = 0;
668 	return (B_TRUE);
669 }
670 
671 /*
672  * This function is called once the last pending rcb has been freed by the upper
673  * levels of the system.
674  */
675 static void
i40e_free_rx_data(i40e_rx_data_t * rxd)676 i40e_free_rx_data(i40e_rx_data_t *rxd)
677 {
678 	VERIFY(rxd->rxd_rcb_pending == 0);
679 
680 	if (rxd->rxd_rcb_area != NULL) {
681 		kmem_free(rxd->rxd_rcb_area,
682 		    sizeof (i40e_rx_control_block_t) *
683 		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
684 		rxd->rxd_rcb_area = NULL;
685 	}
686 
687 	if (rxd->rxd_free_list != NULL) {
688 		kmem_free(rxd->rxd_free_list,
689 		    sizeof (i40e_rx_control_block_t *) *
690 		    rxd->rxd_free_list_size);
691 		rxd->rxd_free_list = NULL;
692 	}
693 
694 	if (rxd->rxd_work_list != NULL) {
695 		kmem_free(rxd->rxd_work_list,
696 		    sizeof (i40e_rx_control_block_t *) *
697 		    rxd->rxd_ring_size);
698 		rxd->rxd_work_list = NULL;
699 	}
700 
701 	kmem_free(rxd, sizeof (i40e_rx_data_t));
702 }
703 
704 static boolean_t
i40e_alloc_rx_data(i40e_t * i40e,i40e_trqpair_t * itrq)705 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
706 {
707 	i40e_rx_data_t *rxd;
708 
709 	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
710 	if (rxd == NULL)
711 		return (B_FALSE);
712 	itrq->itrq_rxdata = rxd;
713 	rxd->rxd_i40e = i40e;
714 
715 	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
716 	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
717 
718 	rxd->rxd_rcb_free = rxd->rxd_free_list_size;
719 
720 	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
721 	    rxd->rxd_ring_size, KM_NOSLEEP);
722 	if (rxd->rxd_work_list == NULL) {
723 		i40e_error(i40e, "failed to allocate RX work list for a ring "
724 		    "of %d entries for ring %d", rxd->rxd_ring_size,
725 		    itrq->itrq_index);
726 		goto cleanup;
727 	}
728 
729 	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
730 	    rxd->rxd_free_list_size, KM_NOSLEEP);
731 	if (rxd->rxd_free_list == NULL) {
732 		i40e_error(i40e, "failed to allocate a %d entry RX free list "
733 		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
734 		goto cleanup;
735 	}
736 
737 	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
738 	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
739 	if (rxd->rxd_rcb_area == NULL) {
740 		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
741 		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
742 		    itrq->itrq_index);
743 		goto cleanup;
744 	}
745 
746 	return (B_TRUE);
747 
748 cleanup:
749 	i40e_free_rx_data(rxd);
750 	itrq->itrq_rxdata = NULL;
751 	return (B_FALSE);
752 }
753 
754 /*
755  * Free all of the memory that we've allocated for DMA. Note that we may have
756  * buffers that we've loaned up to the OS which are still outstanding. We'll
757  * always free up the descriptor ring, because we no longer need that. For each
758  * rcb, we'll iterate over it and if we send the reference count to zero, then
759  * we'll free the message block and DMA related resources. However, if we don't
760  * take the last one, then we'll go ahead and keep track that we'll have pending
761  * data and clean it up when we get there.
762  */
763 static void
i40e_free_rx_dma(i40e_rx_data_t * rxd,boolean_t failed_init)764 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
765 {
766 	uint32_t i, count, ref;
767 
768 	i40e_rx_control_block_t *rcb;
769 	i40e_t *i40e = rxd->rxd_i40e;
770 
771 	i40e_free_dma_buffer(&rxd->rxd_desc_area);
772 	rxd->rxd_desc_ring = NULL;
773 	rxd->rxd_desc_next = 0;
774 
775 	mutex_enter(&i40e->i40e_rx_pending_lock);
776 
777 	rcb = rxd->rxd_rcb_area;
778 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
779 
780 	for (i = 0; i < count; i++, rcb++) {
781 		VERIFY(rcb != NULL);
782 
783 		/*
784 		 * If we're cleaning up from a failed creation attempt, then an
785 		 * entry may never have been assembled which would mean that
786 		 * it's reference count is zero. If we find that, we leave it
787 		 * be, because nothing else should be modifying it at this
788 		 * point. We're not at the point that any more references can be
789 		 * added, just removed.
790 		 */
791 		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
792 			continue;
793 
794 		ref = atomic_dec_32_nv(&rcb->rcb_ref);
795 		if (ref == 0) {
796 			freemsg(rcb->rcb_mp);
797 			rcb->rcb_mp = NULL;
798 			i40e_free_dma_buffer(&rcb->rcb_dma);
799 		} else {
800 			atomic_inc_32(&rxd->rxd_rcb_pending);
801 			atomic_inc_32(&i40e->i40e_rx_pending);
802 		}
803 	}
804 	mutex_exit(&i40e->i40e_rx_pending_lock);
805 }
806 
807 /*
808  * Initialize the DMA memory for the descriptor ring and for each frame in the
809  * control block list.
810  */
811 static boolean_t
i40e_alloc_rx_dma(i40e_rx_data_t * rxd)812 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
813 {
814 	int i, count;
815 	size_t dmasz;
816 	i40e_rx_control_block_t *rcb;
817 	i40e_t *i40e = rxd->rxd_i40e;
818 
819 	/*
820 	 * First allocate the RX descriptor ring.
821 	 */
822 	dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
823 	VERIFY(dmasz > 0);
824 	if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
825 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
826 	    B_TRUE, dmasz) == B_FALSE) {
827 		i40e_error(i40e, "failed to allocate DMA resources "
828 		    "for RX descriptor ring");
829 		return (B_FALSE);
830 	}
831 	rxd->rxd_desc_ring =
832 	    (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
833 	rxd->rxd_desc_next = 0;
834 
835 	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
836 	rcb = rxd->rxd_rcb_area;
837 
838 	dmasz = i40e->i40e_rx_buf_size;
839 	VERIFY(dmasz > 0);
840 	for (i = 0; i < count; i++, rcb++) {
841 		i40e_dma_buffer_t *dmap;
842 		VERIFY(rcb != NULL);
843 
844 		if (i < rxd->rxd_ring_size) {
845 			rxd->rxd_work_list[i] = rcb;
846 		} else {
847 			rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
848 		}
849 
850 		dmap = &rcb->rcb_dma;
851 		if (i40e_alloc_dma_buffer(i40e, dmap,
852 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
853 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
854 			i40e_error(i40e, "failed to allocate RX dma buffer");
855 			return (B_FALSE);
856 		}
857 
858 		/*
859 		 * Initialize the control block and offset the DMA address. See
860 		 * the note in the big theory statement that explains how this
861 		 * helps IP deal with alignment. Note, we don't worry about
862 		 * whether or not we successfully get an mblk_t from desballoc,
863 		 * it's a common case that we have to handle later on in the
864 		 * system.
865 		 */
866 		dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
867 		dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
868 		dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
869 
870 		rcb->rcb_ref = 1;
871 		rcb->rcb_rxd = rxd;
872 		rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
873 		rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
874 		rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
875 		    dmap->dmab_size, 0, &rcb->rcb_free_rtn);
876 	}
877 
878 	return (B_TRUE);
879 }
880 
881 static void
i40e_free_tx_dma(i40e_trqpair_t * itrq)882 i40e_free_tx_dma(i40e_trqpair_t *itrq)
883 {
884 	size_t fsz;
885 
886 	if (itrq->itrq_tcb_area != NULL) {
887 		uint32_t i;
888 		i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
889 
890 		for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
891 			i40e_free_dma_buffer(&tcb->tcb_dma);
892 			if (tcb->tcb_dma_handle != NULL) {
893 				ddi_dma_free_handle(&tcb->tcb_dma_handle);
894 				tcb->tcb_dma_handle = NULL;
895 			}
896 			if (tcb->tcb_lso_dma_handle != NULL) {
897 				ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
898 				tcb->tcb_lso_dma_handle = NULL;
899 			}
900 		}
901 
902 		fsz = sizeof (i40e_tx_control_block_t) *
903 		    itrq->itrq_tx_free_list_size;
904 		kmem_free(itrq->itrq_tcb_area, fsz);
905 		itrq->itrq_tcb_area = NULL;
906 	}
907 
908 	if (itrq->itrq_tcb_free_list != NULL) {
909 		fsz = sizeof (i40e_tx_control_block_t *) *
910 		    itrq->itrq_tx_free_list_size;
911 		kmem_free(itrq->itrq_tcb_free_list, fsz);
912 		itrq->itrq_tcb_free_list = NULL;
913 	}
914 
915 	if (itrq->itrq_tcb_work_list != NULL) {
916 		fsz = sizeof (i40e_tx_control_block_t *) *
917 		    itrq->itrq_tx_ring_size;
918 		kmem_free(itrq->itrq_tcb_work_list, fsz);
919 		itrq->itrq_tcb_work_list = NULL;
920 	}
921 
922 	i40e_free_dma_buffer(&itrq->itrq_desc_area);
923 	itrq->itrq_desc_ring = NULL;
924 
925 }
926 
927 static boolean_t
i40e_alloc_tx_dma(i40e_trqpair_t * itrq)928 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
929 {
930 	int i, ret;
931 	size_t dmasz;
932 	i40e_tx_control_block_t *tcb;
933 	i40e_t *i40e = itrq->itrq_i40e;
934 
935 	itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
936 	itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
937 	    (i40e->i40e_tx_ring_size >> 1);
938 
939 	/*
940 	 * Allocate an additional TX descriptor for the writeback head.
941 	 */
942 	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
943 	dmasz += sizeof (i40e_tx_desc_t);
944 
945 	VERIFY(dmasz > 0);
946 	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
947 	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
948 	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
949 		i40e_error(i40e, "failed to allocate DMA resources for TX "
950 		    "descriptor ring");
951 		return (B_FALSE);
952 	}
953 	itrq->itrq_desc_ring =
954 	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
955 	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
956 	    itrq->itrq_tx_ring_size);
957 	itrq->itrq_desc_head = 0;
958 	itrq->itrq_desc_tail = 0;
959 	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
960 
961 	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
962 	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
963 	if (itrq->itrq_tcb_work_list == NULL) {
964 		i40e_error(i40e, "failed to allocate a %d entry TX work list "
965 		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
966 		goto cleanup;
967 	}
968 
969 	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
970 	    sizeof (i40e_tx_control_block_t *), KM_SLEEP);
971 	if (itrq->itrq_tcb_free_list == NULL) {
972 		i40e_error(i40e, "failed to allocate a %d entry TX free list "
973 		    "for ring %d", itrq->itrq_tx_free_list_size,
974 		    itrq->itrq_index);
975 		goto cleanup;
976 	}
977 
978 	/*
979 	 * We allocate enough TX control blocks to cover the free list.
980 	 */
981 	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
982 	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
983 	if (itrq->itrq_tcb_area == NULL) {
984 		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
985 		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
986 		goto cleanup;
987 	}
988 
989 	/*
990 	 * For each tcb, allocate DMA memory.
991 	 */
992 	dmasz = i40e->i40e_tx_buf_size;
993 	VERIFY(dmasz > 0);
994 	tcb = itrq->itrq_tcb_area;
995 	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
996 		VERIFY(tcb != NULL);
997 
998 		/*
999 		 * Allocate both a DMA buffer which we'll use for when we copy
1000 		 * packets for transmission and allocate a DMA handle which
1001 		 * we'll use when we bind data.
1002 		 */
1003 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1004 		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
1005 		    &tcb->tcb_dma_handle);
1006 		if (ret != DDI_SUCCESS) {
1007 			i40e_error(i40e, "failed to allocate DMA handle for TX "
1008 			    "data binding on ring %d: %d", itrq->itrq_index,
1009 			    ret);
1010 			tcb->tcb_dma_handle = NULL;
1011 			goto cleanup;
1012 		}
1013 
1014 		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1015 		    &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
1016 		    &tcb->tcb_lso_dma_handle);
1017 		if (ret != DDI_SUCCESS) {
1018 			i40e_error(i40e, "failed to allocate DMA handle for TX "
1019 			    "LSO data binding on ring %d: %d", itrq->itrq_index,
1020 			    ret);
1021 			tcb->tcb_lso_dma_handle = NULL;
1022 			goto cleanup;
1023 		}
1024 
1025 		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
1026 		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
1027 		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
1028 			i40e_error(i40e, "failed to allocate %ld bytes of "
1029 			    "DMA for TX data binding on ring %d", dmasz,
1030 			    itrq->itrq_index);
1031 			goto cleanup;
1032 		}
1033 
1034 		itrq->itrq_tcb_free_list[i] = tcb;
1035 	}
1036 
1037 	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
1038 
1039 	return (B_TRUE);
1040 
1041 cleanup:
1042 	i40e_free_tx_dma(itrq);
1043 	return (B_FALSE);
1044 }
1045 
1046 /*
1047  * Free all memory associated with a ring. Note, this is done as part of
1048  * the GLDv3 ring stop routine.
1049  */
1050 void
i40e_free_ring_mem(i40e_trqpair_t * itrq,boolean_t failed_init)1051 i40e_free_ring_mem(i40e_trqpair_t *itrq, boolean_t failed_init)
1052 {
1053 	i40e_t *i40e = itrq->itrq_i40e;
1054 	i40e_rx_data_t *rxd = itrq->itrq_rxdata;
1055 
1056 	/*
1057 	 * In some cases i40e_alloc_rx_data() may have failed
1058 	 * and in that case there is no rxd to free.
1059 	 */
1060 	if (rxd == NULL)
1061 		return;
1062 
1063 	/*
1064 	 * Clean up our RX data. We have to free DMA resources first and
1065 	 * then if we have no more pending RCB's, then we'll go ahead
1066 	 * and clean things up. Note, we can't set the stopped flag on
1067 	 * the RX data until after we've done the first pass of the
1068 	 * pending resources. Otherwise we might race with
1069 	 * i40e_rx_recycle on determining who should free the
1070 	 * i40e_rx_data_t above.
1071 	 */
1072 	i40e_free_rx_dma(rxd, failed_init);
1073 
1074 	mutex_enter(&i40e->i40e_rx_pending_lock);
1075 	rxd->rxd_shutdown = B_TRUE;
1076 	if (rxd->rxd_rcb_pending == 0) {
1077 		i40e_free_rx_data(rxd);
1078 		itrq->itrq_rxdata = NULL;
1079 	}
1080 	mutex_exit(&i40e->i40e_rx_pending_lock);
1081 
1082 	i40e_free_tx_dma(itrq);
1083 }
1084 
1085 /*
1086  * Allocate all of the resources associated with a ring.
1087  * Note this is done as part of the GLDv3 ring start routine.
1088  * This takes care of both DMA and non-DMA related resources.
1089  */
1090 boolean_t
i40e_alloc_ring_mem(i40e_trqpair_t * itrq)1091 i40e_alloc_ring_mem(i40e_trqpair_t *itrq)
1092 {
1093 	if (!i40e_alloc_rx_data(itrq->itrq_i40e, itrq))
1094 		goto free;
1095 
1096 	if (!i40e_alloc_rx_dma(itrq->itrq_rxdata))
1097 		goto free;
1098 
1099 	if (!i40e_alloc_tx_dma(itrq))
1100 		goto free;
1101 
1102 	return (B_TRUE);
1103 
1104 free:
1105 	i40e_free_ring_mem(itrq, B_TRUE);
1106 	return (B_FALSE);
1107 }
1108 
1109 
1110 /*
1111  * Because every instance of i40e may have different support for FMA
1112  * capabilities, we copy the DMA attributes into the i40e_t and set them that
1113  * way and use them for determining attributes.
1114  */
1115 void
i40e_init_dma_attrs(i40e_t * i40e,boolean_t fma)1116 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1117 {
1118 	bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1119 	    sizeof (ddi_dma_attr_t));
1120 	bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1121 	    sizeof (ddi_dma_attr_t));
1122 	bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
1123 	    sizeof (ddi_dma_attr_t));
1124 	bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1125 	    sizeof (ddi_device_acc_attr_t));
1126 	bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1127 	    sizeof (ddi_device_acc_attr_t));
1128 
1129 	if (fma == B_TRUE) {
1130 		i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1131 		i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1132 		i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
1133 		    DDI_DMA_FLAGERR;
1134 	} else {
1135 		i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1136 		i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1137 		i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
1138 		    ~DDI_DMA_FLAGERR;
1139 	}
1140 }
1141 
1142 static void
i40e_rcb_free(i40e_rx_data_t * rxd,i40e_rx_control_block_t * rcb)1143 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1144 {
1145 	mutex_enter(&rxd->rxd_free_lock);
1146 	ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1147 	ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1148 	rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1149 	rxd->rxd_rcb_free++;
1150 	mutex_exit(&rxd->rxd_free_lock);
1151 }
1152 
1153 static i40e_rx_control_block_t *
i40e_rcb_alloc(i40e_rx_data_t * rxd)1154 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1155 {
1156 	i40e_rx_control_block_t *rcb;
1157 
1158 	mutex_enter(&rxd->rxd_free_lock);
1159 	if (rxd->rxd_rcb_free == 0) {
1160 		mutex_exit(&rxd->rxd_free_lock);
1161 		return (NULL);
1162 	}
1163 	rxd->rxd_rcb_free--;
1164 	rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1165 	VERIFY(rcb != NULL);
1166 	rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1167 	mutex_exit(&rxd->rxd_free_lock);
1168 
1169 	return (rcb);
1170 }
1171 
1172 /*
1173  * This is the callback that we get from the OS when freemsg(9F) has been called
1174  * on a loaned descriptor. In addition, if we take the last reference count
1175  * here, then we have to tear down all of the RX data.
1176  */
1177 void
i40e_rx_recycle(caddr_t arg)1178 i40e_rx_recycle(caddr_t arg)
1179 {
1180 	uint32_t ref;
1181 	i40e_rx_control_block_t *rcb;
1182 	i40e_rx_data_t *rxd;
1183 	i40e_t *i40e;
1184 
1185 	/* LINTED: E_BAD_PTR_CAST_ALIGN */
1186 	rcb = (i40e_rx_control_block_t *)arg;
1187 	rxd = rcb->rcb_rxd;
1188 	i40e = rxd->rxd_i40e;
1189 
1190 	/*
1191 	 * It's possible for this to be called with a reference count of zero.
1192 	 * That will happen when we're doing the freemsg after taking the last
1193 	 * reference because we're tearing down everything and this rcb is not
1194 	 * outstanding.
1195 	 */
1196 	if (rcb->rcb_ref == 0)
1197 		return;
1198 
1199 	/*
1200 	 * Don't worry about failure of desballoc here. It'll only become fatal
1201 	 * if we're trying to use it and we can't in i40e_rx_bind().
1202 	 */
1203 	rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1204 	    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1205 	i40e_rcb_free(rxd, rcb);
1206 
1207 	/*
1208 	 * It's possible that the rcb was being used while we are shutting down
1209 	 * the device. In that case, we'll take the final reference from the
1210 	 * device here.
1211 	 */
1212 	ref = atomic_dec_32_nv(&rcb->rcb_ref);
1213 	if (ref == 0) {
1214 		freemsg(rcb->rcb_mp);
1215 		rcb->rcb_mp = NULL;
1216 		i40e_free_dma_buffer(&rcb->rcb_dma);
1217 
1218 		mutex_enter(&i40e->i40e_rx_pending_lock);
1219 		atomic_dec_32(&rxd->rxd_rcb_pending);
1220 		atomic_dec_32(&i40e->i40e_rx_pending);
1221 
1222 		/*
1223 		 * If this was the last block and it's been indicated that we've
1224 		 * passed the shutdown point, we should clean up.
1225 		 */
1226 		if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
1227 			i40e_free_rx_data(rxd);
1228 			cv_broadcast(&i40e->i40e_rx_pending_cv);
1229 		}
1230 
1231 		mutex_exit(&i40e->i40e_rx_pending_lock);
1232 	}
1233 }
1234 
1235 static mblk_t *
i40e_rx_bind(i40e_trqpair_t * itrq,i40e_rx_data_t * rxd,uint32_t index,uint32_t plen)1236 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1237     uint32_t plen)
1238 {
1239 	mblk_t *mp;
1240 	i40e_t *i40e = rxd->rxd_i40e;
1241 	i40e_rx_control_block_t *rcb, *rep_rcb;
1242 
1243 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1244 
1245 	if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
1246 		itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
1247 		return (NULL);
1248 	}
1249 
1250 	rcb = rxd->rxd_work_list[index];
1251 
1252 	/*
1253 	 * Check to make sure we have a mblk_t. If we don't, this is our last
1254 	 * chance to try and get one.
1255 	 */
1256 	if (rcb->rcb_mp == NULL) {
1257 		rcb->rcb_mp =
1258 		    desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1259 		    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1260 		if (rcb->rcb_mp == NULL) {
1261 			itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
1262 			i40e_rcb_free(rxd, rcb);
1263 			return (NULL);
1264 		}
1265 	}
1266 
1267 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1268 
1269 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1270 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1271 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1272 		i40e_rcb_free(rxd, rcb);
1273 		return (NULL);
1274 	}
1275 
1276 	/*
1277 	 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
1278 	 */
1279 	mp = rcb->rcb_mp;
1280 	atomic_inc_32(&rcb->rcb_ref);
1281 	mp->b_wptr = mp->b_rptr + plen;
1282 	mp->b_next = mp->b_cont = NULL;
1283 
1284 	rxd->rxd_work_list[index] = rep_rcb;
1285 	return (mp);
1286 }
1287 
1288 /*
1289  * We're going to allocate a new message block for this frame and attempt to
1290  * receive it. See the big theory statement for more information on when we copy
1291  * versus bind.
1292  */
1293 static mblk_t *
i40e_rx_copy(i40e_trqpair_t * itrq,i40e_rx_data_t * rxd,uint32_t index,uint32_t plen)1294 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1295     uint32_t plen)
1296 {
1297 	i40e_t *i40e = rxd->rxd_i40e;
1298 	i40e_rx_control_block_t *rcb;
1299 	mblk_t *mp;
1300 
1301 	ASSERT(index < rxd->rxd_ring_size);
1302 	rcb = rxd->rxd_work_list[index];
1303 
1304 	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1305 
1306 	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1307 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1308 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1309 		return (NULL);
1310 	}
1311 
1312 	mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
1313 	if (mp == NULL) {
1314 		itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
1315 		return (NULL);
1316 	}
1317 
1318 	mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
1319 	bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
1320 	mp->b_wptr = mp->b_rptr + plen;
1321 
1322 	return (mp);
1323 }
1324 
1325 /*
1326  * Determine if the device has enabled any checksum flags for us. The level of
1327  * checksum computed will depend on the type packet that we have, which is
1328  * contained in ptype. For example, the checksum logic it does will vary
1329  * depending on whether or not the packet is considered tunneled, whether it
1330  * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
1331  * valid.
1332  *
1333  * While there are additional checksums that we could recognize here, we'll need
1334  * to get some additional GLDv3 enhancements to be able to properly describe
1335  * them.
1336  */
1337 static void
i40e_rx_hcksum(i40e_trqpair_t * itrq,mblk_t * mp,uint64_t status,uint32_t err,uint32_t ptype)1338 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
1339     uint32_t ptype)
1340 {
1341 	uint32_t cksum;
1342 	struct i40e_rx_ptype_decoded pinfo;
1343 
1344 	ASSERT(ptype <= 255);
1345 	pinfo = decode_rx_desc_ptype(ptype);
1346 
1347 	cksum = 0;
1348 
1349 	/*
1350 	 * If the ptype isn't something that we know in the driver, then we
1351 	 * shouldn't even consider moving forward.
1352 	 */
1353 	if (pinfo.known == 0) {
1354 		itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
1355 		return;
1356 	}
1357 
1358 	/*
1359 	 * If hardware didn't set the L3L4P bit on the frame, then there is no
1360 	 * checksum offload to consider.
1361 	 */
1362 	if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
1363 		itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
1364 		return;
1365 	}
1366 
1367 	/*
1368 	 * The device tells us that IPv6 checksums where a Destination Options
1369 	 * Header or a Routing header shouldn't be trusted. Discard all
1370 	 * checksums in this case.
1371 	 */
1372 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1373 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
1374 	    (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
1375 		itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
1376 		return;
1377 	}
1378 
1379 	/*
1380 	 * The hardware denotes three kinds of possible errors. Two are reserved
1381 	 * for inner and outer IP checksum errors (IPE and EIPE) and the latter
1382 	 * is for L4 checksum errors (L4E). If there is only one IP header, then
1383 	 * the only thing that we care about is IPE. Note that since we don't
1384 	 * support inner checksums, we will ignore IPE being set on tunneled
1385 	 * packets and only care about EIPE.
1386 	 */
1387 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1388 	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1389 		if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
1390 			if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
1391 				itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
1392 			} else {
1393 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1394 				cksum |= HCK_IPV4_HDRCKSUM_OK;
1395 			}
1396 		} else {
1397 			if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
1398 				itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
1399 			} else {
1400 				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1401 				cksum |= HCK_IPV4_HDRCKSUM_OK;
1402 			}
1403 		}
1404 	}
1405 
1406 	/*
1407 	 * We only have meaningful L4 checksums in the case of IP->L4 and
1408 	 * IP->IP->L4. There is not outer L4 checksum data available in any
1409 	 * other case. Further, we don't bother reporting the valid checksum in
1410 	 * the case of IP->IP->L4 set.
1411 	 */
1412 	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1413 	    pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
1414 	    (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
1415 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
1416 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
1417 	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
1418 		ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
1419 		if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
1420 			itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
1421 		} else {
1422 			itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
1423 			cksum |= HCK_FULLCKSUM_OK;
1424 		}
1425 	}
1426 
1427 	if (cksum != 0) {
1428 		itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
1429 		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
1430 	} else {
1431 		itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
1432 	}
1433 }
1434 
1435 mblk_t *
i40e_ring_rx(i40e_trqpair_t * itrq,int poll_bytes)1436 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
1437 {
1438 	i40e_t *i40e;
1439 	i40e_hw_t *hw;
1440 	i40e_rx_data_t *rxd;
1441 	uint32_t cur_head;
1442 	i40e_rx_desc_t *cur_desc;
1443 	i40e_rx_control_block_t *rcb;
1444 	uint64_t rx_bytes, rx_frames;
1445 	uint64_t stword;
1446 	mblk_t *mp, *mp_head, **mp_tail;
1447 
1448 	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1449 	rxd = itrq->itrq_rxdata;
1450 	i40e = itrq->itrq_i40e;
1451 	hw = &i40e->i40e_hw_space;
1452 
1453 	if (!(i40e->i40e_state & I40E_STARTED) ||
1454 	    (i40e->i40e_state & I40E_OVERTEMP) ||
1455 	    (i40e->i40e_state & I40E_SUSPENDED) ||
1456 	    (i40e->i40e_state & I40E_ERROR))
1457 		return (NULL);
1458 
1459 	/*
1460 	 * Before we do anything else, we have to make sure that all of the DMA
1461 	 * buffers are synced up and then check to make sure that they're
1462 	 * actually good from an FM perspective.
1463 	 */
1464 	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
1465 	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1466 	    DDI_FM_OK) {
1467 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1468 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1469 		return (NULL);
1470 	}
1471 
1472 	/*
1473 	 * Prepare our stats. We do a limited amount of processing in both
1474 	 * polling and interrupt context. The limit in interrupt context is
1475 	 * based on frames, in polling context based on bytes.
1476 	 */
1477 	rx_bytes = rx_frames = 0;
1478 	mp_head = NULL;
1479 	mp_tail = &mp_head;
1480 
1481 	/*
1482 	 * At this point, the descriptor ring is available to check. We'll try
1483 	 * and process until we either run out of poll_bytes or descriptors.
1484 	 */
1485 	cur_head = rxd->rxd_desc_next;
1486 	cur_desc = &rxd->rxd_desc_ring[cur_head];
1487 	stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1488 
1489 	/*
1490 	 * Note, the primary invariant of this loop should be that cur_head,
1491 	 * cur_desc, and stword always point to the currently processed
1492 	 * descriptor. When we leave the loop, it should point to a descriptor
1493 	 * that HAS NOT been processed. Meaning, that if we haven't consumed the
1494 	 * frame, the descriptor should not be advanced.
1495 	 */
1496 	while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
1497 		uint32_t error, eop, plen, ptype;
1498 
1499 		/*
1500 		 * The DD, PLEN, and EOP bits are the only ones that are valid
1501 		 * in every frame. The error information is only valid when EOP
1502 		 * is set in the same frame.
1503 		 *
1504 		 * At this time, because we don't do any LRO or header
1505 		 * splitting. We expect that every frame should have EOP set in
1506 		 * it. When later functionality comes in, we'll want to
1507 		 * re-evaluate this.
1508 		 */
1509 		eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
1510 		VERIFY(eop != 0);
1511 
1512 		error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
1513 		    I40E_RXD_QW1_ERROR_SHIFT;
1514 		if (error & I40E_RX_ERR_BITS) {
1515 			itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
1516 			goto discard;
1517 		}
1518 
1519 		plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1520 		    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1521 
1522 		ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
1523 		    I40E_RXD_QW1_PTYPE_SHIFT;
1524 
1525 		/*
1526 		 * This packet contains valid data. We should check to see if
1527 		 * we're actually going to consume it based on its length (to
1528 		 * ensure that we don't overshoot our quota). We determine
1529 		 * whether to bcopy or bind the DMA resources based on the size
1530 		 * of the frame. However, if on debug, we allow it to be
1531 		 * overridden for testing purposes.
1532 		 *
1533 		 * We should be smarter about this and do DMA binding for
1534 		 * larger frames, but for now, it's really more important that
1535 		 * we actually just get something simple working.
1536 		 */
1537 
1538 		/*
1539 		 * Ensure we don't exceed our polling quota by reading this
1540 		 * frame. Note we only bump bytes now, we bump frames later.
1541 		 */
1542 		if ((poll_bytes != I40E_POLL_NULL) &&
1543 		    (rx_bytes + plen) > poll_bytes)
1544 			break;
1545 		rx_bytes += plen;
1546 
1547 		mp = NULL;
1548 		if (plen >= i40e->i40e_rx_dma_min)
1549 			mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
1550 		if (mp == NULL)
1551 			mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
1552 
1553 		if (mp != NULL) {
1554 			if (i40e->i40e_rx_hcksum_enable)
1555 				i40e_rx_hcksum(itrq, mp, stword, error, ptype);
1556 			*mp_tail = mp;
1557 			mp_tail = &mp->b_next;
1558 		}
1559 
1560 		/*
1561 		 * Now we need to prepare this frame for use again. See the
1562 		 * discussion in the big theory statements.
1563 		 *
1564 		 * However, right now we're doing the simple version of this.
1565 		 * Normally what we'd do would depend on whether or not we were
1566 		 * doing DMA binding or bcopying. But because we're always doing
1567 		 * bcopying, we can just always use the current index as a key
1568 		 * for what to do and reassign the buffer based on the ring.
1569 		 */
1570 discard:
1571 		rcb = rxd->rxd_work_list[cur_head];
1572 		cur_desc->read.pkt_addr =
1573 		    CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
1574 		cur_desc->read.hdr_addr = 0;
1575 
1576 		/*
1577 		 * Finally, update our loop invariants.
1578 		 */
1579 		cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
1580 		cur_desc = &rxd->rxd_desc_ring[cur_head];
1581 		stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1582 
1583 		/*
1584 		 * To help provide liveness, we limit the amount of data that
1585 		 * we'll end up counting. Note that in these cases, an interrupt
1586 		 * is not dissimilar from a polling request.
1587 		 */
1588 		rx_frames++;
1589 		if (rx_frames > i40e->i40e_rx_limit_per_intr) {
1590 			itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
1591 			break;
1592 		}
1593 	}
1594 
1595 	/*
1596 	 * As we've modified the ring, we need to make sure that we sync the
1597 	 * descriptor ring for the device. Next, we update the hardware and
1598 	 * update our notion of where the head for us to read from hardware is
1599 	 * next.
1600 	 */
1601 	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
1602 	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1603 	    DDI_FM_OK) {
1604 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1605 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1606 	}
1607 
1608 	if (rx_frames != 0) {
1609 		uint32_t tail;
1610 		ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
1611 		rxd->rxd_desc_next = cur_head;
1612 		tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
1613 
1614 		I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
1615 		if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
1616 			ddi_fm_service_impact(i40e->i40e_dip,
1617 			    DDI_SERVICE_DEGRADED);
1618 			atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1619 		}
1620 
1621 		itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
1622 		itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
1623 	}
1624 
1625 #ifdef DEBUG
1626 	if (rx_frames == 0) {
1627 		ASSERT(rx_bytes == 0);
1628 	}
1629 #endif
1630 
1631 	return (mp_head);
1632 }
1633 
1634 /*
1635  * This function is called by the GLDv3 when it wants to poll on a ring. The
1636  * only primary difference from when we call this during an interrupt is that we
1637  * have a limit on the number of bytes that we should consume.
1638  */
1639 mblk_t *
i40e_ring_rx_poll(void * arg,int poll_bytes)1640 i40e_ring_rx_poll(void *arg, int poll_bytes)
1641 {
1642 	i40e_trqpair_t *itrq = arg;
1643 	mblk_t *mp;
1644 
1645 	ASSERT(poll_bytes > 0);
1646 	if (poll_bytes == 0)
1647 		return (NULL);
1648 
1649 	mutex_enter(&itrq->itrq_rx_lock);
1650 	mp = i40e_ring_rx(itrq, poll_bytes);
1651 	mutex_exit(&itrq->itrq_rx_lock);
1652 
1653 	return (mp);
1654 }
1655 
1656 /*
1657  * Attempt to put togther the information we'll need to feed into a descriptor
1658  * to properly program the hardware for checksum offload as well as the
1659  * generally required flags.
1660  *
1661  * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
1662  * 'or' into the descriptor based on the checksum flags for this mblk_t and the
1663  * actual information we care about.
1664  *
1665  * If the mblk requires LSO then we'll also gather the information that will be
1666  * used to construct the Transmit Context Descriptor.
1667  */
1668 static int
i40e_tx_context(i40e_t * i40e,i40e_trqpair_t * itrq,mblk_t * mp,mac_ether_offload_info_t * meo,i40e_tx_context_t * tctx)1669 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1670     mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
1671 {
1672 	uint32_t chkflags, start, mss, lsoflags;
1673 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1674 
1675 	bzero(tctx, sizeof (i40e_tx_context_t));
1676 
1677 	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1678 		return (0);
1679 
1680 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
1681 	mac_lso_get(mp, &mss, &lsoflags);
1682 
1683 	if (chkflags == 0 && lsoflags == 0)
1684 		return (0);
1685 
1686 	/*
1687 	 * Have we been asked to checksum an IPv4 header. If so, verify that we
1688 	 * have sufficient information and then set the proper fields in the
1689 	 * command structure.
1690 	 */
1691 	if (chkflags & HCK_IPV4_HDRCKSUM) {
1692 		if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1693 			txs->itxs_hck_nol2info.value.ui64++;
1694 			return (-1);
1695 		}
1696 		if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1697 			txs->itxs_hck_nol3info.value.ui64++;
1698 			return (-1);
1699 		}
1700 		if (meo->meoi_l3proto != ETHERTYPE_IP) {
1701 			txs->itxs_hck_badl3.value.ui64++;
1702 			return (-1);
1703 		}
1704 		tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1705 		tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1706 		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1707 		tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1708 		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1709 	}
1710 
1711 	/*
1712 	 * We've been asked to provide an L4 header, first, set up the IP
1713 	 * information in the descriptor if we haven't already before moving
1714 	 * onto seeing if we have enough information for the L4 checksum
1715 	 * offload.
1716 	 */
1717 	if (chkflags & HCK_PARTIALCKSUM) {
1718 		if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
1719 			txs->itxs_hck_nol4info.value.ui64++;
1720 			return (-1);
1721 		}
1722 
1723 		if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
1724 			if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1725 				txs->itxs_hck_nol2info.value.ui64++;
1726 				return (-1);
1727 			}
1728 			if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1729 				txs->itxs_hck_nol3info.value.ui64++;
1730 				return (-1);
1731 			}
1732 
1733 			if (meo->meoi_l3proto == ETHERTYPE_IP) {
1734 				tctx->itc_data_cmdflags |=
1735 				    I40E_TX_DESC_CMD_IIPT_IPV4;
1736 			} else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
1737 				tctx->itc_data_cmdflags |=
1738 				    I40E_TX_DESC_CMD_IIPT_IPV6;
1739 			} else {
1740 				txs->itxs_hck_badl3.value.ui64++;
1741 				return (-1);
1742 			}
1743 			tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1744 			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1745 			tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1746 			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1747 		}
1748 
1749 		switch (meo->meoi_l4proto) {
1750 		case IPPROTO_TCP:
1751 			tctx->itc_data_cmdflags |=
1752 			    I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1753 			break;
1754 		case IPPROTO_UDP:
1755 			tctx->itc_data_cmdflags |=
1756 			    I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1757 			break;
1758 		case IPPROTO_SCTP:
1759 			tctx->itc_data_cmdflags |=
1760 			    I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1761 			break;
1762 		default:
1763 			txs->itxs_hck_badl4.value.ui64++;
1764 			return (-1);
1765 		}
1766 
1767 		tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
1768 		    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1769 	}
1770 
1771 	if (lsoflags & HW_LSO) {
1772 		/*
1773 		 * LSO requires that checksum offloads are enabled.  If for
1774 		 * some reason they're not we bail out with an error.
1775 		 *
1776 		 * Fulfilling this requirement also ensures that the L4 info was
1777 		 * parsed by meoi, which is also necessary for LSO.
1778 		 */
1779 		if ((meo->meoi_l3proto == ETHERTYPE_IP &&
1780 		    (chkflags & HCK_IPV4_HDRCKSUM) == 0) ||
1781 		    (chkflags & HCK_PARTIALCKSUM) == 0) {
1782 			txs->itxs_lso_nohck.value.ui64++;
1783 			return (-1);
1784 		}
1785 
1786 		tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
1787 		tctx->itc_ctx_mss = mss;
1788 		tctx->itc_ctx_tsolen = msgsize(mp) -
1789 		    (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
1790 	}
1791 
1792 	return (0);
1793 }
1794 
1795 static void
i40e_tcb_free(i40e_trqpair_t * itrq,i40e_tx_control_block_t * tcb)1796 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1797 {
1798 	ASSERT(tcb != NULL);
1799 
1800 	mutex_enter(&itrq->itrq_tcb_lock);
1801 	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1802 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1803 	itrq->itrq_tcb_free++;
1804 	mutex_exit(&itrq->itrq_tcb_lock);
1805 }
1806 
1807 static i40e_tx_control_block_t *
i40e_tcb_alloc(i40e_trqpair_t * itrq)1808 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1809 {
1810 	i40e_tx_control_block_t *ret;
1811 
1812 	mutex_enter(&itrq->itrq_tcb_lock);
1813 	if (itrq->itrq_tcb_free == 0) {
1814 		mutex_exit(&itrq->itrq_tcb_lock);
1815 		return (NULL);
1816 	}
1817 
1818 	itrq->itrq_tcb_free--;
1819 	ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
1820 	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
1821 	mutex_exit(&itrq->itrq_tcb_lock);
1822 
1823 	ASSERT(ret != NULL);
1824 	return (ret);
1825 }
1826 
1827 /*
1828  * This should be used to free any DMA resources, associated mblk_t's, etc. It's
1829  * used as part of recycling the message blocks when we have either an interrupt
1830  * or other activity that indicates that we need to take a look.
1831  */
1832 static void
i40e_tcb_reset(i40e_tx_control_block_t * tcb)1833 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1834 {
1835 	switch (tcb->tcb_type) {
1836 	case I40E_TX_COPY:
1837 		tcb->tcb_dma.dmab_len = 0;
1838 		break;
1839 	case I40E_TX_DMA:
1840 		if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
1841 			(void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
1842 		else if (tcb->tcb_bind_ncookies > 0)
1843 			(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
1844 		if (tcb->tcb_bind_info != NULL) {
1845 			kmem_free(tcb->tcb_bind_info,
1846 			    tcb->tcb_bind_ncookies *
1847 			    sizeof (struct i40e_dma_bind_info));
1848 		}
1849 		tcb->tcb_bind_info = NULL;
1850 		tcb->tcb_bind_ncookies = 0;
1851 		tcb->tcb_used_lso = B_FALSE;
1852 		break;
1853 	case I40E_TX_DESC:
1854 		break;
1855 	case I40E_TX_NONE:
1856 		/* Cast to pacify lint */
1857 		panic("trying to free tcb %p with bad type none", (void *)tcb);
1858 	default:
1859 		panic("unknown i40e tcb type: %d", tcb->tcb_type);
1860 	}
1861 
1862 	tcb->tcb_type = I40E_TX_NONE;
1863 	if (tcb->tcb_mp != NULL) {
1864 		freemsg(tcb->tcb_mp);
1865 		tcb->tcb_mp = NULL;
1866 	}
1867 	tcb->tcb_next = NULL;
1868 }
1869 
1870 /*
1871  * This is called as part of shutting down to clean up all outstanding
1872  * descriptors. Similar to recycle, except we don't re-arm anything and instead
1873  * just return control blocks to the free list.
1874  */
1875 void
i40e_tx_cleanup_ring(i40e_trqpair_t * itrq)1876 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
1877 {
1878 	uint32_t index;
1879 
1880 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
1881 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1882 
1883 	/*
1884 	 * Because we should have shut down the chip at this point, it should be
1885 	 * safe to just clean up all the entries between our head and tail.
1886 	 */
1887 #ifdef	DEBUG
1888 	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
1889 	    I40E_QTX_ENA(itrq->itrq_index));
1890 	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1891 	    I40E_QTX_ENA_QENA_STAT_MASK));
1892 #endif
1893 
1894 	index = itrq->itrq_desc_head;
1895 	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1896 		i40e_tx_control_block_t *tcb;
1897 
1898 		tcb = itrq->itrq_tcb_work_list[index];
1899 		if (tcb != NULL) {
1900 			itrq->itrq_tcb_work_list[index] = NULL;
1901 			i40e_tcb_reset(tcb);
1902 			i40e_tcb_free(itrq, tcb);
1903 		}
1904 
1905 		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1906 		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1907 		itrq->itrq_desc_free++;
1908 	}
1909 
1910 	ASSERT(index == itrq->itrq_desc_tail);
1911 	itrq->itrq_desc_head = index;
1912 }
1913 
1914 /*
1915  * We're here either by hook or by crook. We need to see if there are transmit
1916  * descriptors available for us to go and clean up and return to the hardware.
1917  * We may also be blocked, and if so, we should make sure that we let it know
1918  * we're good to go.
1919  */
1920 void
i40e_tx_recycle_ring(i40e_trqpair_t * itrq)1921 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1922 {
1923 	uint32_t wbhead, toclean, count;
1924 	i40e_tx_control_block_t *tcbhead;
1925 	i40e_t *i40e = itrq->itrq_i40e;
1926 	uint_t desc_per_tcb, i;
1927 
1928 	mutex_enter(&itrq->itrq_tx_lock);
1929 
1930 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1931 	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
1932 		if (itrq->itrq_tx_blocked == B_TRUE) {
1933 			itrq->itrq_tx_blocked = B_FALSE;
1934 			mac_tx_ring_update(i40e->i40e_mac_hdl,
1935 			    itrq->itrq_mactxring);
1936 			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
1937 		}
1938 		mutex_exit(&itrq->itrq_tx_lock);
1939 		return;
1940 	}
1941 
1942 	/*
1943 	 * Now we need to try and see if there's anything available. The driver
1944 	 * will write to the head location and it guarantees that it does not
1945 	 * use relaxed ordering.
1946 	 */
1947 	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
1948 	    (uintptr_t)itrq->itrq_desc_wbhead,
1949 	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
1950 
1951 	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
1952 	    DDI_FM_OK) {
1953 		mutex_exit(&itrq->itrq_tx_lock);
1954 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1955 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1956 		return;
1957 	}
1958 
1959 	wbhead = *itrq->itrq_desc_wbhead;
1960 	toclean = itrq->itrq_desc_head;
1961 	count = 0;
1962 	tcbhead = NULL;
1963 
1964 	while (toclean != wbhead) {
1965 		i40e_tx_control_block_t *tcb;
1966 
1967 		tcb = itrq->itrq_tcb_work_list[toclean];
1968 		itrq->itrq_tcb_work_list[toclean] = NULL;
1969 		ASSERT(tcb != NULL);
1970 		tcb->tcb_next = tcbhead;
1971 		tcbhead = tcb;
1972 
1973 		/*
1974 		 * In the DMA bind case, there may not necessarily be a 1:1
1975 		 * mapping between tcb's and descriptors.  If the tcb type
1976 		 * indicates a DMA binding then check the number of DMA
1977 		 * cookies to determine how many entries to clean in the
1978 		 * descriptor ring.
1979 		 */
1980 		if (tcb->tcb_type == I40E_TX_DMA)
1981 			desc_per_tcb = tcb->tcb_bind_ncookies;
1982 		else
1983 			desc_per_tcb = 1;
1984 
1985 		for (i = 0; i < desc_per_tcb; i++) {
1986 			/*
1987 			 * We zero this out for sanity purposes.
1988 			 */
1989 			bzero(&itrq->itrq_desc_ring[toclean],
1990 			    sizeof (i40e_tx_desc_t));
1991 			toclean = i40e_next_desc(toclean, 1,
1992 			    itrq->itrq_tx_ring_size);
1993 			count++;
1994 		}
1995 	}
1996 
1997 	itrq->itrq_desc_head = wbhead;
1998 	itrq->itrq_desc_free += count;
1999 	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2000 	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2001 
2002 	if (itrq->itrq_tx_blocked == B_TRUE &&
2003 	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2004 		itrq->itrq_tx_blocked = B_FALSE;
2005 
2006 		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2007 		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2008 	}
2009 
2010 	mutex_exit(&itrq->itrq_tx_lock);
2011 
2012 	/*
2013 	 * Now clean up the tcb.
2014 	 */
2015 	while (tcbhead != NULL) {
2016 		i40e_tx_control_block_t *tcb = tcbhead;
2017 
2018 		tcbhead = tcb->tcb_next;
2019 		i40e_tcb_reset(tcb);
2020 		i40e_tcb_free(itrq, tcb);
2021 	}
2022 
2023 	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2024 }
2025 
2026 static void
i40e_tx_copy_fragment(i40e_tx_control_block_t * tcb,const mblk_t * mp,const size_t off,const size_t len)2027 i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
2028     const size_t off, const size_t len)
2029 {
2030 	const void *soff = mp->b_rptr + off;
2031 	void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2032 
2033 	ASSERT3U(len, >, 0);
2034 	ASSERT3P(soff, >=, mp->b_rptr);
2035 	ASSERT3P(soff, <=, mp->b_wptr);
2036 	ASSERT3U(len, <=, MBLKL(mp));
2037 	ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
2038 	ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
2039 	bcopy(soff, doff, len);
2040 	tcb->tcb_type = I40E_TX_COPY;
2041 	tcb->tcb_dma.dmab_len += len;
2042 	I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2043 }
2044 
2045 static i40e_tx_control_block_t *
i40e_tx_bind_fragment(i40e_trqpair_t * itrq,const mblk_t * mp,size_t off,boolean_t use_lso)2046 i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
2047     size_t off, boolean_t use_lso)
2048 {
2049 	ddi_dma_handle_t dma_handle;
2050 	ddi_dma_cookie_t dma_cookie;
2051 	uint_t i = 0, ncookies = 0, dmaflags;
2052 	i40e_tx_control_block_t *tcb;
2053 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2054 
2055 	if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2056 		txs->itxs_err_notcb.value.ui64++;
2057 		return (NULL);
2058 	}
2059 	tcb->tcb_type = I40E_TX_DMA;
2060 
2061 	if (use_lso == B_TRUE)
2062 		dma_handle = tcb->tcb_lso_dma_handle;
2063 	else
2064 		dma_handle = tcb->tcb_dma_handle;
2065 
2066 	dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
2067 	if (ddi_dma_addr_bind_handle(dma_handle, NULL,
2068 	    (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
2069 	    DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
2070 		txs->itxs_bind_fails.value.ui64++;
2071 		goto bffail;
2072 	}
2073 
2074 	tcb->tcb_bind_ncookies = ncookies;
2075 	tcb->tcb_used_lso = use_lso;
2076 
2077 	tcb->tcb_bind_info =
2078 	    kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
2079 	    KM_NOSLEEP);
2080 	if (tcb->tcb_bind_info == NULL)
2081 		goto bffail;
2082 
2083 	while (i < ncookies) {
2084 		if (i > 0)
2085 			ddi_dma_nextcookie(dma_handle, &dma_cookie);
2086 
2087 		tcb->tcb_bind_info[i].dbi_paddr =
2088 		    (caddr_t)dma_cookie.dmac_laddress;
2089 		tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
2090 	}
2091 
2092 	return (tcb);
2093 
2094 bffail:
2095 	i40e_tcb_reset(tcb);
2096 	i40e_tcb_free(itrq, tcb);
2097 	return (NULL);
2098 }
2099 
2100 static void
i40e_tx_set_data_desc(i40e_trqpair_t * itrq,i40e_tx_context_t * tctx,caddr_t buff,size_t len,boolean_t last_desc)2101 i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
2102     caddr_t buff, size_t len, boolean_t last_desc)
2103 {
2104 	i40e_tx_desc_t *txdesc;
2105 	int cmd;
2106 
2107 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2108 	itrq->itrq_desc_free--;
2109 	txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2110 	itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2111 	    itrq->itrq_tx_ring_size);
2112 
2113 	cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
2114 
2115 	/*
2116 	 * The last data descriptor needs the EOP bit set, so that the HW knows
2117 	 * that we're ready to send.  Additionally, we set the RS (Report
2118 	 * Status) bit, so that we are notified when the transmit engine has
2119 	 * completed DMA'ing all of the data descriptors and data buffers
2120 	 * associated with this frame.
2121 	 */
2122 	if (last_desc == B_TRUE) {
2123 		cmd |= I40E_TX_DESC_CMD_EOP;
2124 		cmd |= I40E_TX_DESC_CMD_RS;
2125 	}
2126 
2127 	/*
2128 	 * Per the X710 manual, section 8.4.2.1.1, the buffer size
2129 	 * must be a value from 1 to 16K minus 1, inclusive.
2130 	 */
2131 	ASSERT3U(len, >=, 1);
2132 	ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ);
2133 
2134 	txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
2135 	txdesc->cmd_type_offset_bsz =
2136 	    LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
2137 	    ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2138 	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2139 	    ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2140 }
2141 
2142 /*
2143  * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
2144  */
2145 static inline void
tcb_list_append(i40e_tx_control_block_t ** head,i40e_tx_control_block_t ** tail,i40e_tx_control_block_t * tcb)2146 tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
2147     i40e_tx_control_block_t *tcb)
2148 {
2149 	if (*head == NULL) {
2150 		*head = tcb;
2151 		*tail = *head;
2152 	} else {
2153 		ASSERT3P(*tail, !=, NULL);
2154 		ASSERT3P((*tail)->tcb_next, ==, NULL);
2155 		(*tail)->tcb_next = tcb;
2156 		*tail = tcb;
2157 	}
2158 }
2159 
2160 /*
2161  * This function takes a single packet, possibly consisting of
2162  * multiple mblks, and creates a TCB chain to send to the controller.
2163  * This TCB chain may span up to a maximum of 8 descriptors. A copy
2164  * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
2165  * more, depending on several factors. For each fragment (invidual
2166  * mblk making up the packet), we determine if its size dictates a
2167  * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
2168  * count of descriptors used; when that count reaches the max we force
2169  * all remaining fragments into a single TCB buffer. We have a
2170  * guarantee that the TCB buffer is always larger than the MTU -- so
2171  * there is always enough room. Consecutive fragments below the DMA
2172  * threshold are copied into a single TCB. In the event of an error
2173  * this function returns NULL but leaves 'mp' alone.
2174  */
2175 static i40e_tx_control_block_t *
i40e_non_lso_chain(i40e_trqpair_t * itrq,mblk_t * mp,uint_t * ndesc)2176 i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
2177 {
2178 	const mblk_t *nmp = mp;
2179 	uint_t needed_desc = 0;
2180 	boolean_t force_copy = B_FALSE;
2181 	i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2182 	i40e_t *i40e = itrq->itrq_i40e;
2183 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2184 
2185 	/* TCB buffer is always larger than MTU. */
2186 	ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
2187 
2188 	while (nmp != NULL) {
2189 		const size_t nmp_len = MBLKL(nmp);
2190 
2191 		/* Ignore zero-length mblks. */
2192 		if (nmp_len == 0) {
2193 			nmp = nmp->b_cont;
2194 			continue;
2195 		}
2196 
2197 		if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
2198 			/* Compress consecutive copies into one TCB. */
2199 			if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
2200 				i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2201 				nmp = nmp->b_cont;
2202 				continue;
2203 			}
2204 
2205 			if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2206 				txs->itxs_err_notcb.value.ui64++;
2207 				goto fail;
2208 			}
2209 
2210 			/*
2211 			 * TCB DMA buffer is guaranteed to be one
2212 			 * cookie by i40e_alloc_dma_buffer().
2213 			 */
2214 			i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2215 			needed_desc++;
2216 			tcb_list_append(&tcbhead, &tcbtail, tcb);
2217 		} else {
2218 			uint_t total_desc;
2219 
2220 			tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
2221 			if (tcb == NULL) {
2222 				i40e_error(i40e, "dma bind failed!");
2223 				goto fail;
2224 			}
2225 
2226 			/*
2227 			 * If the new total exceeds the max or we've
2228 			 * reached the limit and there's data left,
2229 			 * then give up binding and copy the rest into
2230 			 * the pre-allocated TCB buffer.
2231 			 */
2232 			total_desc = needed_desc + tcb->tcb_bind_ncookies;
2233 			if ((total_desc > I40E_TX_MAX_COOKIE) ||
2234 			    (total_desc == I40E_TX_MAX_COOKIE &&
2235 			    nmp->b_cont != NULL)) {
2236 				i40e_tcb_reset(tcb);
2237 				i40e_tcb_free(itrq, tcb);
2238 
2239 				if (tcbtail != NULL &&
2240 				    tcbtail->tcb_type == I40E_TX_COPY) {
2241 					tcb = tcbtail;
2242 				} else {
2243 					tcb = NULL;
2244 				}
2245 
2246 				force_copy = B_TRUE;
2247 				txs->itxs_force_copy.value.ui64++;
2248 				continue;
2249 			}
2250 
2251 			needed_desc += tcb->tcb_bind_ncookies;
2252 			tcb_list_append(&tcbhead, &tcbtail, tcb);
2253 		}
2254 
2255 		nmp = nmp->b_cont;
2256 	}
2257 
2258 	ASSERT3P(nmp, ==, NULL);
2259 	ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
2260 	ASSERT3P(tcbhead, !=, NULL);
2261 	*ndesc += needed_desc;
2262 	return (tcbhead);
2263 
2264 fail:
2265 	tcb = tcbhead;
2266 	while (tcb != NULL) {
2267 		i40e_tx_control_block_t *next = tcb->tcb_next;
2268 
2269 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2270 		    tcb->tcb_type == I40E_TX_COPY);
2271 
2272 		tcb->tcb_mp = NULL;
2273 		i40e_tcb_reset(tcb);
2274 		i40e_tcb_free(itrq, tcb);
2275 		tcb = next;
2276 	}
2277 
2278 	return (NULL);
2279 }
2280 
/*
 * Section 8.4.1 of the 700-series programming guide states that a
 * segment may span up to 8 data descriptors; including both header
 * and payload data. However, empirical evidence shows that the
 * controller freezes the Tx queue when presented with a segment of 8
 * descriptors. Or, at least, when the first segment contains 8
 * descriptors. One explanation is that the controller counts the
 * context descriptor against the first segment, even though the
 * programming guide makes no mention of such a constraint. In any
 * case, we limit TSO segments to 7 descriptors to prevent Tx queue
 * freezes. We still allow non-TSO segments to utilize all 8
 * descriptors as they have not demonstrated the faulty behavior.
 *
 * This is the per-segment descriptor limit that i40e_lso_chain()
 * asserts its running segment descriptor tally against.
 */
uint_t i40e_lso_num_descs = 7;
2295 
/*
 * Number of bytes still unused in a control block's DMA buffer: the size of
 * the buffer less the amount of data already placed in it.
 */
#define	I40E_TCB_LEFT(t)				\
	((t)->tcb_dma.dmab_size - (t)->tcb_dma.dmab_len)
2298 
2299 /*
2300  * This function is similar in spirit to i40e_non_lso_chain(), but
2301  * much more complicated in reality. Like the previous function, it
2302  * takes a packet (an LSO packet) as input and returns a chain of
2303  * TCBs. The complication comes with the fact that we are no longer
2304  * trying to fit the entire packet into 8 descriptors, but rather we
2305  * must fit each MSS-size segment of the LSO packet into 8 descriptors.
2306  * Except it's really 7 descriptors, see i40e_lso_num_descs.
2307  *
2308  * Your first inclination might be to verify that a given segment
2309  * spans no more than 7 mblks; but it's actually much more subtle than
2310  * that. First, let's describe what the hardware expects, and then we
2311  * can expound on the software side of things.
2312  *
2313  * For an LSO packet the hardware expects the following:
2314  *
2315  *	o Each MSS-sized segment must span no more than 7 descriptors.
2316  *
2317  *	o The header size does not count towards the segment size.
2318  *
2319  *	o If header and payload share the first descriptor, then the
2320  *	  controller will count the descriptor twice.
2321  *
2322  * The most important thing to keep in mind is that the hardware does
2323  * not view the segments in terms of mblks, like we do. The hardware
2324  * only sees descriptors. It will iterate each descriptor in turn,
2325  * keeping a tally of bytes seen and descriptors visited. If the byte
2326  * count hasn't reached MSS by the time the descriptor count reaches
2327  * 7, then the controller freezes the queue and we are stuck.
2328  * Furthermore, the hardware picks up its tally where it left off. So
2329  * if it reached MSS in the middle of a descriptor, it will start
2330  * tallying the next segment in the middle of that descriptor. The
2331  * hardware's view is entirely removed from the mblk chain or even the
2332  * descriptor layout. Consider these facts:
2333  *
 *	o The MSS will vary depending on MTU and other factors.
2335  *
2336  *	o The dblk allocation will sit at various offsets within a
2337  *	  memory page.
2338  *
2339  *	o The page size itself could vary in the future (i.e. not
2340  *	  always 4K).
2341  *
2342  *	o Just because a dblk is virtually contiguous doesn't mean
2343  *	  it's physically contiguous. The number of cookies
2344  *	  (descriptors) required by a DMA bind of a single dblk is at
2345  *	  the mercy of the page size and physical layout.
2346  *
2347  *	o The descriptors will most often NOT start/end on a MSS
2348  *	  boundary. Thus the hardware will often start counting the
2349  *	  MSS mid descriptor and finish mid descriptor.
2350  *
2351  * The upshot of all this is that the driver must learn to think like
2352  * the controller; and verify that none of the constraints are broken.
2353  * It does this by tallying up the segment just like the hardware
2354  * would. This is handled by the two variables 'segsz' and 'segdesc'.
 * After each attempt to bind a dblk, we check the constraints. If
2356  * violated, we undo the DMA and force a copy until MSS is met. We
2357  * have a guarantee that the TCB buffer is larger than MTU; thus
2358  * ensuring we can always meet the MSS with a single copy buffer. We
2359  * also copy consecutive non-DMA fragments into the same TCB buffer.
2360  */
2361 static i40e_tx_control_block_t *
i40e_lso_chain(i40e_trqpair_t * itrq,const mblk_t * mp,const mac_ether_offload_info_t * meo,const i40e_tx_context_t * tctx,uint_t * ndesc)2362 i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
2363     const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
2364     uint_t *ndesc)
2365 {
2366 	size_t mp_len = MBLKL(mp);
2367 	/*
2368 	 * The cpoff (copy offset) variable tracks the offset inside
2369 	 * the current mp. There are cases where the entire mp is not
2370 	 * fully copied in one go: such as the header copy followed by
2371 	 * a non-DMA mblk, or a TCB buffer that only has enough space
2372 	 * to copy part of the current mp.
2373 	 */
2374 	size_t cpoff = 0;
2375 	/*
2376 	 * The segsz and segdesc variables track the controller's view
2377 	 * of the segment. The needed_desc variable tracks the total
2378 	 * number of data descriptors used by the driver.
2379 	 */
2380 	size_t segsz = 0;
2381 	uint_t segdesc = 0;
2382 	uint_t needed_desc = 0;
2383 	size_t hdrcopied = 0;
2384 	const size_t hdrlen =
2385 	    meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
2386 	const size_t mss = tctx->itc_ctx_mss;
2387 	boolean_t force_copy = B_FALSE;
2388 	i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2389 	i40e_t *i40e = itrq->itrq_i40e;
2390 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2391 
2392 	/*
2393 	 * We always copy the header in order to avoid more
2394 	 * complicated code dealing with various edge cases.
2395 	 */
2396 	if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2397 		txs->itxs_err_notcb.value.ui64++;
2398 		goto fail;
2399 	}
2400 
2401 	needed_desc++;
2402 	tcb_list_append(&tcbhead, &tcbtail, tcb);
2403 
2404 	while (hdrcopied < hdrlen) {
2405 		const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
2406 		i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
2407 		hdrcopied += tocopy;
2408 		cpoff += tocopy;
2409 		if (tocopy == mp_len) {
2410 			/*
2411 			 * This is a bit of defensive programming. We
2412 			 * should never have a chain too short to
2413 			 * satisfy the headers -- but just in case.
2414 			 */
2415 			if ((mp = mp->b_cont) == NULL) {
2416 				txs->itxs_tx_short.value.ui64++;
2417 				goto fail;
2418 			}
2419 
2420 			while ((mp_len = MBLKL(mp)) == 0) {
2421 				if ((mp = mp->b_cont) == NULL) {
2422 					txs->itxs_tx_short.value.ui64++;
2423 					goto fail;
2424 				}
2425 			}
2426 			cpoff = 0;
2427 		}
2428 	}
2429 	ASSERT3U(hdrcopied, ==, hdrlen);
2430 
2431 	/*
2432 	 * A single descriptor containing both header and data is
2433 	 * counted twice by the controller.
2434 	 */
2435 	if (mp_len < i40e->i40e_tx_dma_min) {
2436 		segdesc = 2;
2437 	} else {
2438 		segdesc = 1;
2439 	}
2440 
2441 	while (mp != NULL) {
2442 		mp_len = MBLKL(mp);
2443 force_copy:
2444 		/* Ignore zero-length mblks. */
2445 		if (mp_len == 0) {
2446 			mp = mp->b_cont;
2447 			cpoff = 0;
2448 			continue;
2449 		}
2450 
2451 		/*
2452 		 * We copy into the preallocated TCB buffer when the
2453 		 * current fragment is less than the DMA threshold OR
2454 		 * when the DMA bind can't meet the controller's
2455 		 * segment descriptor limit.
2456 		 */
2457 		if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
2458 			size_t tocopy;
2459 
2460 			/*
2461 			 * Our objective here is to compress
2462 			 * consecutive copies into one TCB (until it
2463 			 * is full). If there is no current TCB, or if
2464 			 * it is a DMA TCB, then allocate a new one.
2465 			 */
2466 			if (tcb == NULL ||
2467 			    (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) {
2468 				if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2469 					txs->itxs_err_notcb.value.ui64++;
2470 					goto fail;
2471 				}
2472 
2473 				/*
2474 				 * The TCB DMA buffer is guaranteed to
2475 				 * be one cookie by i40e_alloc_dma_buffer().
2476 				 */
2477 				needed_desc++;
2478 				segdesc++;
2479 				ASSERT3U(segdesc, <=, i40e_lso_num_descs);
2480 				tcb_list_append(&tcbhead, &tcbtail, tcb);
2481 			} else if (segdesc == 0) {
2482 				/*
2483 				 * We are copying into an existing TCB
2484 				 * but we just crossed the MSS
2485 				 * boundary. Make sure to increment
2486 				 * segdesc to track the descriptor
2487 				 * count as the hardware would.
2488 				 */
2489 				segdesc++;
2490 			}
2491 
2492 			tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
2493 			i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
2494 			cpoff += tocopy;
2495 			segsz += tocopy;
2496 
2497 			/* We have consumed the current mp. */
2498 			if (cpoff == mp_len) {
2499 				mp = mp->b_cont;
2500 				cpoff = 0;
2501 			}
2502 
2503 			/* We have consumed the current TCB buffer. */
2504 			if (I40E_TCB_LEFT(tcb) == 0) {
2505 				tcb = NULL;
2506 			}
2507 
2508 			/*
2509 			 * We have met MSS with this copy; restart the
2510 			 * counters.
2511 			 */
2512 			if (segsz >= mss) {
2513 				segsz = segsz % mss;
2514 				segdesc = segsz == 0 ? 0 : 1;
2515 				force_copy = B_FALSE;
2516 			}
2517 
2518 			/*
2519 			 * We are at the controller's descriptor
2520 			 * limit; we must copy into the current TCB
2521 			 * until MSS is reached. The TCB buffer is
2522 			 * always bigger than the MTU so we know it is
2523 			 * big enough to meet the MSS.
2524 			 */
2525 			if (segdesc == i40e_lso_num_descs) {
2526 				force_copy = B_TRUE;
2527 			}
2528 		} else {
2529 			uint_t tsegdesc = segdesc;
2530 			size_t tsegsz = segsz;
2531 
2532 			ASSERT(force_copy == B_FALSE);
2533 			ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
2534 
2535 			tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
2536 			if (tcb == NULL) {
2537 				i40e_error(i40e, "dma bind failed!");
2538 				goto fail;
2539 			}
2540 
2541 			for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
2542 				struct i40e_dma_bind_info dbi =
2543 				    tcb->tcb_bind_info[i];
2544 
2545 				tsegsz += dbi.dbi_len;
2546 				tsegdesc++;
2547 				ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2548 
2549 				/*
2550 				 * We've met the MSS with this portion
2551 				 * of the DMA.
2552 				 */
2553 				if (tsegsz >= mss) {
2554 					tsegsz = tsegsz % mss;
2555 					tsegdesc = tsegsz == 0 ? 0 : 1;
2556 				}
2557 
2558 				/*
2559 				 * We've reached max descriptors but
2560 				 * have not met the MSS. Undo the bind
2561 				 * and instead copy.
2562 				 */
2563 				if (tsegdesc == i40e_lso_num_descs) {
2564 					i40e_tcb_reset(tcb);
2565 					i40e_tcb_free(itrq, tcb);
2566 
2567 					if (tcbtail != NULL &&
2568 					    I40E_TCB_LEFT(tcb) > 0 &&
2569 					    tcbtail->tcb_type == I40E_TX_COPY) {
2570 						tcb = tcbtail;
2571 					} else {
2572 						tcb = NULL;
2573 					}
2574 
2575 					/*
2576 					 * Remember, we are still on
2577 					 * the same mp.
2578 					 */
2579 					force_copy = B_TRUE;
2580 					txs->itxs_tso_force_copy.value.ui64++;
2581 					goto force_copy;
2582 				}
2583 			}
2584 
2585 			ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2586 			ASSERT3U(tsegsz, <, mss);
2587 
2588 			/*
2589 			 * We've made it through the loop without
2590 			 * breaking the segment descriptor contract
2591 			 * with the controller -- replace the segment
2592 			 * tracking values with the temporary ones.
2593 			 */
2594 			segdesc = tsegdesc;
2595 			segsz = tsegsz;
2596 			needed_desc += tcb->tcb_bind_ncookies;
2597 			cpoff = 0;
2598 			tcb_list_append(&tcbhead, &tcbtail, tcb);
2599 			mp = mp->b_cont;
2600 		}
2601 	}
2602 
2603 	ASSERT3P(mp, ==, NULL);
2604 	ASSERT3P(tcbhead, !=, NULL);
2605 	*ndesc += needed_desc;
2606 	return (tcbhead);
2607 
2608 fail:
2609 	tcb = tcbhead;
2610 	while (tcb != NULL) {
2611 		i40e_tx_control_block_t *next = tcb->tcb_next;
2612 
2613 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2614 		    tcb->tcb_type == I40E_TX_COPY);
2615 
2616 		tcb->tcb_mp = NULL;
2617 		i40e_tcb_reset(tcb);
2618 		i40e_tcb_free(itrq, tcb);
2619 		tcb = next;
2620 	}
2621 
2622 	return (NULL);
2623 }
2624 
2625 /*
2626  * Keep track of activity through the transmit data path.
2627  *
2628  * We need to ensure we don't try and transmit when a trqpair has been
2629  * stopped, nor do we want to stop a trqpair whilst transmitting.
2630  */
2631 static boolean_t
i40e_ring_tx_enter(i40e_trqpair_t * itrq)2632 i40e_ring_tx_enter(i40e_trqpair_t *itrq)
2633 {
2634 	boolean_t allow;
2635 
2636 	mutex_enter(&itrq->itrq_tx_lock);
2637 	allow = !itrq->itrq_tx_quiesce;
2638 	if (allow)
2639 		itrq->itrq_tx_active++;
2640 	mutex_exit(&itrq->itrq_tx_lock);
2641 
2642 	return (allow);
2643 }
2644 
2645 static void
i40e_ring_tx_exit_nolock(i40e_trqpair_t * itrq)2646 i40e_ring_tx_exit_nolock(i40e_trqpair_t *itrq)
2647 {
2648 	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2649 
2650 	itrq->itrq_tx_active--;
2651 	if (itrq->itrq_tx_quiesce)
2652 		cv_signal(&itrq->itrq_tx_cv);
2653 }
2654 
2655 static void
i40e_ring_tx_exit(i40e_trqpair_t * itrq)2656 i40e_ring_tx_exit(i40e_trqpair_t *itrq)
2657 {
2658 	mutex_enter(&itrq->itrq_tx_lock);
2659 	i40e_ring_tx_exit_nolock(itrq);
2660 	mutex_exit(&itrq->itrq_tx_lock);
2661 }
2662 
2663 
2664 /*
2665  * Tell the transmit path to quiesce and wait until there is no
2666  * more activity.
2667  * Will return B_TRUE if the transmit path is already quiesced, B_FALSE
2668  * otherwise.
2669  */
2670 boolean_t
i40e_ring_tx_quiesce(i40e_trqpair_t * itrq)2671 i40e_ring_tx_quiesce(i40e_trqpair_t *itrq)
2672 {
2673 	mutex_enter(&itrq->itrq_tx_lock);
2674 	if (itrq->itrq_tx_quiesce) {
2675 		/*
2676 		 * When itrq_tx_quiesce is set, then the ring has already
2677 		 * been shutdown.
2678 		 */
2679 		mutex_exit(&itrq->itrq_tx_lock);
2680 		return (B_TRUE);
2681 	}
2682 
2683 	/*
2684 	 * Tell any threads in transmit path this trqpair is quiesced and
2685 	 * wait until they've all exited the critical code path.
2686 	 */
2687 	itrq->itrq_tx_quiesce = B_TRUE;
2688 	while (itrq->itrq_tx_active > 0)
2689 		cv_wait(&itrq->itrq_tx_cv, &itrq->itrq_tx_lock);
2690 
2691 	mutex_exit(&itrq->itrq_tx_lock);
2692 
2693 	return (B_FALSE);
2694 }
2695 
2696 /*
2697  * We've been asked to send a message block on the wire. We'll only have a
2698  * single chain. There will not be any b_next pointers; however, there may be
2699  * multiple b_cont blocks. The number of b_cont blocks may exceed the
2700  * controller's Tx descriptor limit.
2701  *
2702  * We may do one of three things with any given mblk_t chain:
2703  *
2704  *   1) Drop it
2705  *   2) Transmit it
2706  *   3) Return it
2707  *
2708  * If we return it to MAC, then MAC will flow control on our behalf. In other
2709  * words, it won't send us anything until we tell it that it's okay to send us
2710  * something.
2711  */
2712 mblk_t *
i40e_ring_tx(void * arg,mblk_t * mp)2713 i40e_ring_tx(void *arg, mblk_t *mp)
2714 {
2715 	size_t msglen;
2716 	i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
2717 	i40e_tx_context_desc_t *ctxdesc;
2718 	mac_ether_offload_info_t meo;
2719 	i40e_tx_context_t tctx;
2720 	int type;
2721 	uint_t needed_desc = 0;
2722 	boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
2723 
2724 	i40e_trqpair_t *itrq = arg;
2725 	i40e_t *i40e = itrq->itrq_i40e;
2726 	i40e_hw_t *hw = &i40e->i40e_hw_space;
2727 	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2728 
2729 	ASSERT(mp->b_next == NULL);
2730 
2731 	if (!(i40e->i40e_state & I40E_STARTED) ||
2732 	    (i40e->i40e_state & I40E_OVERTEMP) ||
2733 	    (i40e->i40e_state & I40E_SUSPENDED) ||
2734 	    (i40e->i40e_state & I40E_ERROR) ||
2735 	    (i40e->i40e_link_state != LINK_STATE_UP) ||
2736 	    !i40e_ring_tx_enter(itrq)) {
2737 		freemsg(mp);
2738 		return (NULL);
2739 	}
2740 
2741 	/*
2742 	 * Parse packet headers for use by any requested offloads.  That offload
2743 	 * logic will later determine if the results here were adequate.
2744 	 */
2745 	mac_ether_offload_info(mp, &meo);
2746 
2747 	/*
2748 	 * Figure out the relevant context about this frame that we might need
2749 	 * for enabling checksum, LSO, etc. This also fills in information that
2750 	 * we might set around the packet type, etc.
2751 	 */
2752 	if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
2753 		freemsg(mp);
2754 		itrq->itrq_txstat.itxs_err_context.value.ui64++;
2755 		i40e_ring_tx_exit(itrq);
2756 		return (NULL);
2757 	}
2758 	if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2759 		use_lso = B_TRUE;
2760 		do_ctx_desc = B_TRUE;
2761 	}
2762 
2763 	/*
2764 	 * For the primordial driver we can punt on doing any recycling right
2765 	 * now; however, longer term we need to probably do some more pro-active
2766 	 * recycling to cut back on stalls in the TX path.
2767 	 */
2768 
2769 	msglen = msgsize(mp);
2770 
2771 	if (do_ctx_desc) {
2772 		/*
2773 		 * If we're doing tunneling or LSO, then we'll need a TX
2774 		 * context descriptor in addition to one or more TX data
2775 		 * descriptors.  Since there's no data DMA block or handle
2776 		 * associated with the context descriptor, we create a special
2777 		 * control block that behaves effectively like a NOP.
2778 		 */
2779 		if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
2780 			txs->itxs_err_notcb.value.ui64++;
2781 			goto txfail;
2782 		}
2783 		tcb_ctx->tcb_type = I40E_TX_DESC;
2784 		needed_desc++;
2785 	}
2786 
2787 	if (!use_lso) {
2788 		tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
2789 	} else {
2790 		tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
2791 	}
2792 
2793 	if (tcbhead == NULL)
2794 		goto txfail;
2795 
2796 	tcbhead->tcb_mp = mp;
2797 
2798 	/*
2799 	 * The second condition ensures that 'itrq_desc_tail' never
2800 	 * equals 'itrq_desc_head'. This enforces the rule found in
2801 	 * the second bullet point of section 8.4.3.1.5 of the XL710
2802 	 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
2803 	 * never overlap with the head. This means that we only ever
2804 	 * have 'itrq_tx_ring_size - 1' total available descriptors.
2805 	 */
2806 	mutex_enter(&itrq->itrq_tx_lock);
2807 	if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
2808 	    (itrq->itrq_desc_free - 1) < needed_desc) {
2809 		txs->itxs_err_nodescs.value.ui64++;
2810 		mutex_exit(&itrq->itrq_tx_lock);
2811 		goto txfail;
2812 	}
2813 
2814 	if (do_ctx_desc) {
2815 		/*
2816 		 * If we're enabling any offloads for this frame, then we'll
2817 		 * need to build up a transmit context descriptor, first.  The
2818 		 * context descriptor needs to be placed in the TX ring before
2819 		 * the data descriptor(s).  See section 8.4.2, table 8-16
2820 		 */
2821 		uint_t tail = itrq->itrq_desc_tail;
2822 		itrq->itrq_desc_free--;
2823 		ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
2824 		itrq->itrq_tcb_work_list[tail] = tcb_ctx;
2825 		itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
2826 		    itrq->itrq_tx_ring_size);
2827 
2828 		/* QW0 */
2829 		type = I40E_TX_DESC_DTYPE_CONTEXT;
2830 		ctxdesc->tunneling_params = 0;
2831 		ctxdesc->l2tag2 = 0;
2832 
2833 		/* QW1 */
2834 		ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
2835 		if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2836 			ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
2837 			    ((uint64_t)tctx.itc_ctx_cmdflags <<
2838 			    I40E_TXD_CTX_QW1_CMD_SHIFT) |
2839 			    ((uint64_t)tctx.itc_ctx_tsolen <<
2840 			    I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2841 			    ((uint64_t)tctx.itc_ctx_mss <<
2842 			    I40E_TXD_CTX_QW1_MSS_SHIFT));
2843 		}
2844 	}
2845 
2846 	tcb = tcbhead;
2847 	while (tcb != NULL) {
2848 
2849 		itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2850 		if (tcb->tcb_type == I40E_TX_COPY) {
2851 			boolean_t last_desc = (tcb->tcb_next == NULL);
2852 
2853 			i40e_tx_set_data_desc(itrq, &tctx,
2854 			    (caddr_t)tcb->tcb_dma.dmab_dma_address,
2855 			    tcb->tcb_dma.dmab_len, last_desc);
2856 		} else {
2857 			boolean_t last_desc = B_FALSE;
2858 			ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
2859 
2860 			for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
2861 				last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
2862 				    (tcb->tcb_next == NULL);
2863 
2864 				i40e_tx_set_data_desc(itrq, &tctx,
2865 				    tcb->tcb_bind_info[c].dbi_paddr,
2866 				    tcb->tcb_bind_info[c].dbi_len,
2867 				    last_desc);
2868 			}
2869 		}
2870 
2871 		tcb = tcb->tcb_next;
2872 	}
2873 
2874 	/*
2875 	 * Now, finally, sync the DMA data and alert hardware.
2876 	 */
2877 	I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2878 
2879 	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2880 	    itrq->itrq_desc_tail);
2881 
2882 	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2883 	    DDI_FM_OK) {
2884 		/*
2885 		 * Note, we can't really go through and clean this up very well,
2886 		 * because the memory has been given to the device, so just
2887 		 * indicate it's been transmitted.
2888 		 */
2889 		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2890 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2891 	}
2892 
2893 	txs->itxs_bytes.value.ui64 += msglen;
2894 	txs->itxs_packets.value.ui64++;
2895 	txs->itxs_descriptors.value.ui64 += needed_desc;
2896 
2897 	i40e_ring_tx_exit_nolock(itrq);
2898 
2899 	mutex_exit(&itrq->itrq_tx_lock);
2900 
2901 	return (NULL);
2902 
2903 txfail:
2904 	/*
2905 	 * We ran out of resources. Return it to MAC and indicate that we'll
2906 	 * need to signal MAC. If there are allocated tcb's, return them now.
2907 	 * Make sure to reset their message block's, since we'll return them
2908 	 * back to MAC.
2909 	 */
2910 	if (tcb_ctx != NULL) {
2911 		tcb_ctx->tcb_mp = NULL;
2912 		i40e_tcb_reset(tcb_ctx);
2913 		i40e_tcb_free(itrq, tcb_ctx);
2914 	}
2915 
2916 	tcb = tcbhead;
2917 	while (tcb != NULL) {
2918 		i40e_tx_control_block_t *next = tcb->tcb_next;
2919 
2920 		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2921 		    tcb->tcb_type == I40E_TX_COPY);
2922 
2923 		tcb->tcb_mp = NULL;
2924 		i40e_tcb_reset(tcb);
2925 		i40e_tcb_free(itrq, tcb);
2926 		tcb = next;
2927 	}
2928 
2929 	mutex_enter(&itrq->itrq_tx_lock);
2930 	i40e_ring_tx_exit_nolock(itrq);
2931 	itrq->itrq_tx_blocked = B_TRUE;
2932 	mutex_exit(&itrq->itrq_tx_lock);
2933 
2934 	return (mp);
2935 }
2936