/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include "i40e_sw.h"

/*
 * ---------------------------------------------------------
 * Buffer and Memory Management, Receiving, and Transmitting
 * ---------------------------------------------------------
 *
 * Each physical function (PF), which is what we think of as an instance of the
 * device driver, has a series of associated transmit and receive queue pairs.
 * Effectively, these are what MAC thinks of as rings. Each of them has its own
 * ring of descriptors which is used as part of doing DMA activity.
 *
 * The transmit ring of descriptors is made up of 16-byte entries which are
 * used to send packets, program filters, etc. The receive ring of descriptors
 * is made up of either 16-byte or 32-byte entries. At the moment, we opt to
 * use the larger descriptor format so that we're in a better position if we
 * ever want to leverage that information later on.
 *
 * However, these rings are just for descriptors; they don't describe or deal
 * with how we actually store the memory that we need for DMA or the associated
 * information that we need for keeping track of message blocks. To correspond
 * to the hardware descriptor ring, which is how we communicate with hardware,
 * we introduce a control block which keeps track of our required metadata,
 * like DMA mappings.
 *
 * There are two main considerations that dictate how much memory and how many
 * buffers we end up allocating. Those are:
 *
 *   o The size of the ring (controlled through the driver.conf file)
 *
 *   o The maximum size frame we can receive.
 *
 * The size of the rings currently defaults to 1024 descriptors and is stored
 * in i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
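 *
 * For example, a hypothetical driver.conf fragment that doubles both ring
 * sizes might look like the following; the property names here are an
 * assumption based on the tunables described above, not a quote of the
 * shipped configuration file:
 *
 *	rx_ring_size = 2048;
 *	tx_ring_size = 2048;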
 *
 * While the size of the rings is controlled by driver.conf, the maximum frame
 * size is informed primarily through the use of dladm and the setting of the
 * MTU property on the device. From the MTU, we then go and do some
 * machinations. The first thing we do is add in space for the Ethernet
 * header, potentially a VLAN header, and the frame check sequence (FCS). This
 * value is what's stored as i40e_t`i40e_frame_max and is derived any time
 * i40e_t`i40e_sdu changes.
 *
 * This size is then rounded up to the nearest 1k chunk, which represents the
 * actual amount of memory that we'll allocate for a single frame.
 *
 * Note that for RX we do something that might be unexpected. We always add an
 * extra two bytes to the frame size that we allocate. We then offset the DMA
 * address that we receive a packet into by two bytes. This ensures that the IP
 * header will always be 4-byte aligned, because the MAC header is either 14 or
 * 18 bytes in length depending on the use of 802.1Q tagging, which makes IP's
 * and MAC's lives easier.
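 *
 * As a rough, illustrative sketch of the arithmetic (the constant and macro
 * names below are assumptions used for illustration, not a quote of the
 * driver), a default 1500-byte MTU works out as follows:
 *
 *	frame_max = 1500 (MTU) + 18 (VLAN-tagged Ethernet header) +
 *	    4 (FCS) = 1522
 *	rx buffer = P2ROUNDUP(1522 + 2 (IP header alignment), 1024) = 2048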
 *
 * Both the RX and TX descriptor rings (which are what we use to communicate
 * with hardware) are allocated as a single region of DMA memory which is the
 * size of the descriptor (32 bytes for RX, since we use the larger format, and
 * 16 bytes for TX) times the total number of descriptors for an RX or TX ring.
 *
 * While the RX and TX descriptors are allocated using DMA-based memory, the
 * control blocks for each of them are allocated using normal kernel memory.
 * They aren't special from a DMA perspective. We'll go over the design of both
 * receiving and transmitting separately, as they have slightly different
 * control blocks and different ways that we manage the relationship between
 * control blocks and descriptors.
 *
 * ---------------------------------
 * RX Descriptors and Control Blocks
 * ---------------------------------
 *
 * For every descriptor in the ring that the driver has, we need some associated
 * memory, which means that we need the receive-specific control block. We have
 * a couple of different, but related, goals:
 *
 *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
 *     not want to do any additional memory allocations or DMA allocations if
 *     we don't have to.
 *
 *   o We'd like to try and do as much zero-copy as possible, while taking into
 *     account the cost of mapping in DMA resources.
 *
 *   o We'd like to have every receive descriptor available.
 *
 * Now, these rules are a bit in tension with one another. The act of mapping in
 * DMA memory is an exercise in trying to find the break-even point between page
 * table updates and bcopy. We currently start by using the same metrics that
 * ixgbe used; however, it should be known that this value has effectively been
 * cargo-culted across to yet another driver, sorry.
 *
 * If we receive a packet which is larger than our copy threshold, we'll create
 * a message block out of the DMA memory via desballoc(9F) and send that up to
 * MAC that way. This will cause us to be notified when the message block is
 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
 * it's less than the threshold, we'll try to use allocb(9F) and bcopy(9F) it
 * into the block, thus allowing us to immediately reuse the DMA resource. Note,
 * on debug builds, we allow someone to whack the variable i40e_debug_rx_mode to
 * override the behavior and always do a bcopy or a DMA bind.
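 *
 * A minimal sketch of that decision, with illustrative (not verbatim) field
 * names and assuming a free rcb is available to swap into the working list:
 *
 *	if (len >= i40e->i40e_rx_dma_min) {
 *		swap in a replacement rcb from the free list;
 *		mp = desballoc((uchar_t *)rcb->rcb_dma.dmab_address, len,
 *		    0, &rcb->rcb_free_rtn);
 *	} else {
 *		mp = allocb(len + I40E_BUF_IPHDR_ALIGNMENT, 0);
 *		bcopy(rcb->rcb_dma.dmab_address, mp->b_wptr, len);
 *	}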
 *
 * To try and ensure that the device always has blocks that it can receive data
 * into, we maintain two lists of control blocks, a working list and a free
 * list. Each list is sized equal to the number of descriptors in the RX ring.
 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
 * equal to twice the number of descriptors in the ring and we assign them
 * equally to the free list and the working list. Each control block also has
 * DMA memory allocated and associated with it, which is used to receive the
 * actual packet data. All of a received frame's data will end up in a single
 * DMA buffer.
 *
 * During operation, we always maintain the invariant that each RX descriptor
 * has an associated RX control block which lives in the working list. If we
 * feel that we should loan up DMA memory to MAC in the form of a message block,
 * we can only do so if we can maintain this invariant. To do that, we swap in
 * one of the buffers from the free list. If none are available, then we resort
 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
 * size.
 *
 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
 * called on the block, at which point we restore the RX control block to the
 * free list and are able to reuse the DMA memory again. While the scheme may
 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
 * the normal path of operation, even though we may still have to allocate
 * message blocks and copy.
 *
 * The following state machine describes the lifetime of an RX control block.
 * In the diagram we abbreviate the RX ring descriptor entry as rxd and the RX
 * control block entry as rcb.
 *
 *           |                                   |
 *           * ... 1/2 of all initial rcb's ... *
 *           |                                   |
 *           v                                   v
 *  +------------------+            +------------------+
 *  | rcb on free list |---*------->| rcb on work list |
 *  +------------------+   .        +------------------+
 *           ^             . moved to         |
 *           |               replace rcb      * . . Frame received,
 *           |               loaned to        |     entry on free list
 *           |               MAC + co.        |     available. rcb's
 *           |                                |     memory made into mblk_t
 *           * . freemsg(9F)                  |     and sent up to MAC.
 *           |   called on                    |
 *           |   loaned rcb                   |
 *           |   and it is                    v
 *           |   recycled.        +-------------------+
 *           +-----------------<--| rcb loaned to MAC |
 *                                +-------------------+
 *
 * Finally, note that every RX control block has a reference count on it. One
 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
 * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed,
 * and no other DLPI consumers remain, then we'll decrement the reference count
 * by one. Whenever we loan up the RX control block and associated buffer to
 * MAC, then we bump the reference count again. Even though the device is
 * stopped, there may still be loaned frames in upper levels that we'll want to
 * account for. Our callback from freemsg(9F)/freeb(9F) will take care of
 * making sure that it is cleaned up.
 *
 * --------------------
 * Managing the RX Ring
 * --------------------
 *
 * The receive ring descriptors are arranged in a circular buffer with a head
 * and tail pointer. There are both the conventional head and tail pointers
 * which are used to partition the ring into two portions, a portion that we,
 * the operating system, manage and a portion that is managed by hardware. When
 * hardware owns a descriptor in the ring, it means that it is waiting for data
 * to be filled in. However, when a portion of the ring is owned by the driver,
 * then that means that the descriptor has been consumed and we need to go take
 * a look at it.
 *
 * The initial head is configured to be zero by writing it as such in the
 * receive queue context in the FPM (function private memory from the host).
 * The initial tail is written to be the last descriptor. This is written to
 * via the PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything
 * between the HEAD and TAIL, inclusive. Note that while we initially program
 * the HEAD, the only values we ever consult ourselves are the TAIL register
 * and our own state tracking. Effectively, we cache the HEAD register and then
 * update it ourselves based on our work.
 *
 * When we iterate over the RX descriptors and thus the received frames, we are
 * either in an interrupt context or we've been asked by MAC to poll on the
 * ring. If we've been asked to poll on the ring, we have a maximum number of
 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
 * exceed that count, then we do not process it. When in interrupt context, we
 * don't have a strict byte count. However, to ensure liveness, we limit the
 * amount of data based on a configuration value
 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
 * is based on similar numbers that are used for ixgbe. After some additional
 * time in the field, we'll have a sense as to whether or not it should be
 * changed.
 *
 * When processing, we start at our own HEAD pointer
 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
 * processing. Every RX descriptor has what's described as the DD bit. This bit
 * (the LSB of the second 8-byte word) indicates whether or not the descriptor
 * is done. When we give descriptors to the hardware, this value is always
 * zero. When the hardware has finished a descriptor, it will always be one.
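 *
 * A rough sketch of that check; the status field layout follows the Intel
 * shared code definitions, while the local variable names are illustrative:
 *
 *	rxdesc = &rxd->rxd_desc_ring[rxd->rxd_desc_next];
 *	stword = LE_64(rxdesc->wb.qword1.status_error_len);
 *	if ((stword & (1ULL << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0)
 *		stop, nothing further is ready to process;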
 *
 * The first thing that we check is whether the DD bit indicates that the
 * current HEAD is ready. If it isn't, then we're done. That's the primary
 * invariant of processing a frame. If it's done, then there are a few other
 * things that we want to look at. In the same status word as the DD bit, there
 * are two other important bits:
 *
 *   o End of Packet (EOP)
 *   o Error bits
 *
 * The end of packet indicates that we have reached the last descriptor. Now,
 * you might ask when there would be more than one descriptor. The reason for
 * that might be due to large receive offload (LRO) or header splitting
 * functionality, which presently isn't supported in the driver. The error bits
 * in the frame are only valid when EOP is set.
 *
 * If error bits are set on the frame, then we still consume it; however, we
 * will not generate an mblk_t to send up to MAC. If there are no error bits
 * set, then we'll consume the descriptor either using bcopy or DMA binding.
 * See the earlier section 'RX Descriptors and Control Blocks' for more
 * information on how that selection is made.
 *
 * Regardless of whether we construct an mblk_t or encounter an error, we end up
 * resetting the descriptor. This re-arms the descriptor for hardware and in the
 * process, we may end up assigning it a new receive control block. After we do
 * this, we always update our HEAD pointer, no matter what.
 *
 * Finally, once we've consumed as much as we will in a given window, we go and
 * update the TAIL register to indicate all the frames we've consumed. We only
 * do a single bulk write for the ring.
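 *
 * A sketch of that bulk update, assuming the register access macro used with
 * the Intel shared code; the value written is the last descriptor index that
 * we re-armed:
 *
 *	I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index),
 *	    i40e_prev_desc(rxd->rxd_desc_next, 1, rxd->rxd_ring_size));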
 *
 * ---------------------------------
 * TX Descriptors and Control Blocks
 * ---------------------------------
 *
 * While the transmit path is similar in spirit to the receive path, it works
 * differently due to the fact that all data is originated by the operating
 * system and not by the device.
 *
 * Like RX, there is a descriptor ring that we use to communicate with the
 * hardware, and its descriptors point to the memory used to transmit a frame.
 * Similarly, there is a corresponding transmit control block; however, the
 * correspondence between descriptors and control blocks is more complex and
 * not necessarily 1-to-1.
 *
 * The driver is asked to process a single frame at a time. That message block
 * may be made up of multiple fragments linked together by the mblk_t`b_cont
 * member. The device has a hard limit of up to 8 buffers being allowed for use
 * for a single non-LSO packet or LSO segment. The number of TX ring entries
 * (and thus TX control blocks) used depends on the fragment sizes and DMA
 * layout, as explained below.
 *
 * We alter our DMA strategy based on a threshold tied to the fragment size.
 * This threshold is configurable via the tx_dma_threshold property. If the
 * fragment is above the threshold, we DMA bind it -- consuming one TCB and
 * potentially several data descriptors. The exact number of descriptors (equal
 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
 * into page, b_wptr offset into page, and the physical layout of the dblk's
 * memory (contiguous or not). Essentially, we are at the mercy of the DMA
 * engine and the dblk's memory allocation. Knowing the exact number of
 * descriptors up front is a task best not taken on by the driver itself.
 * Instead, we attempt to DMA bind the fragment and verify that the descriptor
 * layout meets hardware constraints. If the proposed DMA bind does not satisfy
 * the hardware constraints, then we discard it and instead copy the entire
 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
 * larger than the TCB buffer).
 *
 * If the fragment is below or at the threshold, we copy it to the
 * pre-allocated buffer of a TCB. We compress consecutive copy fragments into a
 * single TCB to conserve resources. We are guaranteed that the TCB buffer is
 * made up of only one DMA cookie, and therefore it consumes only one
 * descriptor on the controller.
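 *
 * A condensed sketch of the per-fragment decision described above; the
 * threshold field name is illustrative of the tx_dma_threshold tunable rather
 * than a quote of the driver:
 *
 *	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 *		len = MBLKL(nmp);
 *		if (len > i40e->i40e_tx_dma_min)
 *			DMA bind the fragment (one tcb, one or more
 *			data descriptors);
 *		else
 *			bcopy into the current tcb's pre-allocated buffer,
 *			merging with any preceding copied fragment;
 *	}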
 *
 * Furthermore, if the frame requires HW offloads such as LSO, tunneling, or
 * filtering, then the TX data descriptors must be preceded by a single TX
 * context descriptor. Because there is no DMA transfer associated with the
 * context descriptor, we allocate a control block with a special type which
 * indicates to the TX ring recycle code that there are no associated DMA
 * resources to unbind when the control block is freed.
 *
 * If we don't have enough space in the ring or TX control blocks available,
 * then we'll return the unprocessed message block to MAC. This will induce
 * flow control; once we recycle enough entries, we'll once again enable
 * sending on the ring.
 *
 * We size the working list as equal to the number of descriptors in the ring.
 * We size the free list as equal to 1.5 times the number of descriptors in the
 * ring. We'll allocate a number of TX control block entries equal to the number
 * of entries in the free list. By default, all entries are placed in the free
 * list. As we come along and try to send something, we'll allocate entries from
 * the free list and add them to the working list, where they'll stay until the
 * hardware indicates that all of the data has been written back to us. The
 * reason that we start with 1.5x is to help facilitate having more than one TX
 * buffer associated with the DMA activity.
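 *
 * To make the sizing concrete: with the default ring size of 1024
 * descriptors, the working list holds 1024 entries, the free list holds 1536
 * entries, and 1536 TX control blocks (along with their pre-allocated
 * buffers) are allocated up front.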
 *
 * --------------------
 * Managing the TX Ring
 * --------------------
 *
 * The transmit descriptor ring is driven by us. We maintain our own notion of a
 * HEAD and TAIL register and we update the hardware with updates to the TAIL
 * register. When the hardware is done writing out data, it updates us by
 * writing back to a specific address, not by updating the individual
 * descriptors. That address is a 4-byte region after the main transmit
 * descriptor ring. This is why the descriptor ring has an extra descriptor's
 * worth allocated to it.
 *
 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
 * points in time, through both interrupts and our own internal checks, we'll
 * sync the write-back head portion of the DMA space. Based on the index it
 * reports back, we'll free everything between our current HEAD and the
 * indicated index and update HEAD to the new index.
 *
 * When a frame comes in, we try to use a number of transmit control blocks and
 * transition them from the free list to the work list. They're placed at the
 * entries on the work list that correspond to the transmit descriptors they're
 * associated with. Once the hardware indicates that a descriptor has been
 * written back, we return the associated control blocks to the free list.
 *
 * The transmit control block free list is managed by keeping track of the
 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
 * index into the free list and add things to it. In effect, we always push and
 * pop from the tail and protect it with a single lock,
 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
 * stand up to further performance testing; however, it does allow us to get off
 * the ground with the device driver.
 *
 * The following image describes where a given transmit control block lives in
 * its lifetime:
 *
 *             |
 *             * ... Initial placement for all tcb's
 *             |
 *             v
 *    +------------------+                   +------------------+
 *    | tcb on free list |---*-------------->| tcb on work list |
 *    +------------------+   .               +------------------+
 *             ^             . N tcbs allocated[1]    |
 *             |               to send frame          v
 *             |               or fragment on         |
 *             |               wire, mblk from        |
 *             |               MAC associated.        |
 *             |                                      |
 *             +------*---------------------------<---+
 *                    .
 *                    . Hardware indicates
 *                      entry transmitted.
 *                      tcbs recycled, mblk
 *                      from MAC freed.
 *
 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
 *     descriptor plus 1 data descriptor, in the non-DMA-bind case.  In the DMA
 *     bind case, N can be 1 context descriptor plus 1 data descriptor per
 *     b_cont in the mblk.  In this case, the mblk is associated with the first
 *     data descriptor and freed as part of freeing that data descriptor.
 *
 * ------------
 * Blocking MAC
 * ------------
 *
 * When performing transmit, we can run out of descriptors and ring entries.
 * When such a case happens, we return the mblk_t to MAC to indicate that we've
 * been blocked. At that point in time, MAC becomes blocked and will not
 * transmit anything out that specific ring until we notify MAC. To indicate
 * that we're in such a situation, we set the i40e_trqpair_t`itrq_tx_blocked
 * member to B_TRUE.
 *
 * When we recycle TX descriptors then we'll end up signaling MAC by calling
 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
 * start sending frames out to us again.
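 *
 * A condensed sketch of that unblock path in the recycle routine; the mac and
 * ring handle field names are illustrative rather than a quote of the driver:
 *
 *	mutex_enter(&itrq->itrq_tx_lock);
 *	blocked = itrq->itrq_tx_blocked;
 *	itrq->itrq_tx_blocked = B_FALSE;
 *	mutex_exit(&itrq->itrq_tx_lock);
 *	if (blocked)
 *		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);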
 */

/*
 * We set our DMA alignment requests based on the smallest supported page size
 * of the corresponding platform.
 */
#if defined(__sparc)
#define	I40E_DMA_ALIGNMENT	0x2000ull
#elif defined(__x86)
#define	I40E_DMA_ALIGNMENT	0x1000ull
#else
#error	"unknown architecture for i40e"
#endif

/*
 * This structure is used to maintain information and flags related to
 * transmitting a frame.  These fields are ultimately used to construct the
 * TX data descriptor(s) and, if necessary, the TX context descriptor.
 */
typedef struct i40e_tx_context {
	enum i40e_tx_desc_cmd_bits	itc_data_cmdflags;
	uint32_t			itc_data_offsets;
	enum i40e_tx_ctx_desc_cmd_bits	itc_ctx_cmdflags;
	uint32_t			itc_ctx_tsolen;
	uint32_t			itc_ctx_mss;
} i40e_tx_context_t;

/*
 * Toggles on debug builds which can be used to override our RX behaviour based
 * on thresholds.
 */
#ifdef	DEBUG
typedef enum {
	I40E_DEBUG_RX_DEFAULT	= 0,
	I40E_DEBUG_RX_BCOPY	= 1,
	I40E_DEBUG_RX_DMABIND	= 2
} i40e_debug_rx_t;

i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* DEBUG */
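
/*
 * For example, on a DEBUG kernel one could force the copy-only RX path with
 * mdb; this invocation is shown purely for illustration:
 *
 *	echo 'i40e_debug_rx_mode/W 1' | mdb -kw
 */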

/*
 * Notes on the following DMA attributes. The first attribute,
 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
 * and the static buffers that we associate with control blocks. For this
 * reason, we force an SGL length of one. While technically the driver supports
 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
 * management here. In addition, when the Intel common code wants to allocate
 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
 * the static dma attr.
 *
 * The latter two sets of attributes are what we use when we're binding a
 * bunch of mblk_t fragments to go out the door. Note that the main difference
 * here is that we're allowed a larger SGL length.  For non-LSO TX, we
 * restrict the SGL length to match the number of TX buffers available to the
 * PF (8).  For the LSO case we can go much larger, with the caveat that each
 * MSS-sized chunk (segment) must not span more than 8 data descriptors and
 * hence must not span more than 8 cookies.
 *
 * Note, we default to setting ourselves to be DMA capable here. However,
 * because we could have multiple instances which have different FMA error
 * checking capabilities, or end up on different buses, we make these static
 * and const and copy them into the i40e_t for the given device with the
 * values that reflect the actual capabilities.
 */
static const ddi_dma_attr_t i40e_g_static_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	0x00000000FFFFFFFFull,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	1,				/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
	DMA_ATTR_V0,			/* version number */
	0x0000000000000000ull,		/* low address */
	0xFFFFFFFFFFFFFFFFull,		/* high address */
	I40E_MAX_TX_BUFSZ - 1,		/* dma counter max */
	I40E_DMA_ALIGNMENT,		/* alignment */
	0x00000FFF,			/* burst sizes */
	0x00000001,			/* minimum transfer size */
	0x00000000FFFFFFFFull,		/* maximum transfer size */
	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
	I40E_TX_LSO_MAX_COOKIE,		/* scatter/gather list length */
	0x00000001,			/* granularity */
	DDI_DMA_FLAGERR			/* DMA flags */
};

/*
 * Next, we have the attributes for these structures. The descriptor rings are
 * all strictly little endian, while the data buffers are just arrays of bytes
 * representing frames. Because of this, we purposefully simplify the driver
 * programming life by programming the descriptor ring as little endian, while
 * for the buffer data we keep it as unstructured.
 *
 * Note that, to keep the Intel common code operating in a reasonable way, when
 * we allocate DMA memory for it, we do not use byte swapping and thus use the
 * standard i40e_buf_acc_attr.
 */
static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,
	DDI_STRICTORDER_ACC
};

static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};

/*
 * The next two functions are designed to be type-safe versions of macros that
 * are used to increment and decrement a descriptor index in the loop. Note,
 * these are marked inline to try and keep the data path hot and they were
 * effectively inlined in their previous life as macros.
 */
static inline int
i40e_next_desc(int base, int count, int size)
{
	int out;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	if (base + count < size) {
		out = base + count;
	} else {
		out = base + count - size;
	}

	ASSERT(out >= 0 && out < size);
	return (out);
}

static inline int
i40e_prev_desc(int base, int count, int size)
{
	int out;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	if (base >= count) {
		out = base - count;
	} else {
		out = base - count + size;
	}

	ASSERT(out >= 0 && out < size);
	return (out);
}
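
/*
 * For example (purely illustrative), with a 1024-entry ring an index of 1023
 * advanced by one wraps back to 0, and index 0 stepped back by one wraps to
 * 1023:
 *
 *	i40e_next_desc(1023, 1, 1024) == 0
 *	i40e_prev_desc(0, 1, 1024) == 1023
 */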

/*
 * Free DMA memory that is represented by an i40e_dma_buffer_t.
 */
static void
i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
{
	if (dmap->dmab_dma_address != 0) {
		VERIFY(dmap->dmab_dma_handle != NULL);
		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
		dmap->dmab_dma_address = 0;
		dmap->dmab_size = 0;
	}

	if (dmap->dmab_acc_handle != NULL) {
		ddi_dma_mem_free(&dmap->dmab_acc_handle);
		dmap->dmab_acc_handle = NULL;
		dmap->dmab_address = NULL;
	}

	if (dmap->dmab_dma_handle != NULL) {
		ddi_dma_free_handle(&dmap->dmab_dma_handle);
		dmap->dmab_dma_handle = NULL;
	}

	/*
	 * These should only be set if we have valid handles allocated and
	 * therefore should always be NULLed out due to the above code. This
	 * is here to catch us acting sloppy.
	 */
	ASSERT(dmap->dmab_dma_address == 0);
	ASSERT(dmap->dmab_address == NULL);
	ASSERT(dmap->dmab_size == 0);
	dmap->dmab_len = 0;
}

/*
 * Allocate size bytes of DMA memory based on the passed in attributes. This
 * fills in the information in dmap and is designed for all of our single
 * cookie allocations.
 */
static boolean_t
i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
    ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
    boolean_t zero, size_t size)
{
	int ret;
	uint_t flags;
	size_t len;
	ddi_dma_cookie_t cookie;
	uint_t ncookies;

	if (stream == B_TRUE)
		flags = DDI_DMA_STREAMING;
	else
		flags = DDI_DMA_CONSISTENT;

	/*
	 * Step one: Allocate the DMA handle
	 */
	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
	    NULL, &dmap->dmab_dma_handle);
	if (ret != DDI_SUCCESS) {
		i40e_error(i40e, "failed to allocate dma handle for I/O "
		    "buffers: %d", ret);
		dmap->dmab_dma_handle = NULL;
		return (B_FALSE);
	}

	/*
	 * Step two: Allocate the DMA memory
	 */
	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
	    &dmap->dmab_acc_handle);
	if (ret != DDI_SUCCESS) {
		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
		    "buffers", size);
		dmap->dmab_address = NULL;
		dmap->dmab_acc_handle = NULL;
		i40e_free_dma_buffer(dmap);
		return (B_FALSE);
	}

	/*
	 * Step three: Optionally zero
	 */
	if (zero == B_TRUE)
		bzero(dmap->dmab_address, len);

	/*
	 * Step four: Bind the memory
	 */
	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
	    NULL, &cookie, &ncookies);
	if (ret != DDI_DMA_MAPPED) {
		i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
		    "buffers: %d", size, ret);
		i40e_free_dma_buffer(dmap);
		return (B_FALSE);
	}

	VERIFY(ncookies == 1);
	dmap->dmab_dma_address = cookie.dmac_laddress;
	dmap->dmab_size = len;
	dmap->dmab_len = 0;
	return (B_TRUE);
}

/*
 * This function is called once the last pending rcb has been freed by the upper
 * levels of the system.
 */
static void
i40e_free_rx_data(i40e_rx_data_t *rxd)
{
	VERIFY(rxd->rxd_rcb_pending == 0);

	if (rxd->rxd_rcb_area != NULL) {
		kmem_free(rxd->rxd_rcb_area,
		    sizeof (i40e_rx_control_block_t) *
		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
		rxd->rxd_rcb_area = NULL;
	}

	if (rxd->rxd_free_list != NULL) {
		kmem_free(rxd->rxd_free_list,
		    sizeof (i40e_rx_control_block_t *) *
		    rxd->rxd_free_list_size);
		rxd->rxd_free_list = NULL;
	}

	if (rxd->rxd_work_list != NULL) {
		kmem_free(rxd->rxd_work_list,
		    sizeof (i40e_rx_control_block_t *) *
		    rxd->rxd_ring_size);
		rxd->rxd_work_list = NULL;
	}

	kmem_free(rxd, sizeof (i40e_rx_data_t));
}

static boolean_t
i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
{
	i40e_rx_data_t *rxd;

	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
	if (rxd == NULL)
		return (B_FALSE);
	itrq->itrq_rxdata = rxd;
	rxd->rxd_i40e = i40e;

	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;

	rxd->rxd_rcb_free = rxd->rxd_free_list_size;

	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
	    rxd->rxd_ring_size, KM_NOSLEEP);
	if (rxd->rxd_work_list == NULL) {
		i40e_error(i40e, "failed to allocate RX work list for a ring "
		    "of %d entries for ring %d", rxd->rxd_ring_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
	    rxd->rxd_free_list_size, KM_NOSLEEP);
	if (rxd->rxd_free_list == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry RX free list "
		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
		goto cleanup;
	}

	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
	if (rxd->rxd_rcb_area == NULL) {
		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
		    itrq->itrq_index);
		goto cleanup;
	}

	return (B_TRUE);

cleanup:
	i40e_free_rx_data(rxd);
	itrq->itrq_rxdata = NULL;
	return (B_FALSE);
}

/*
 * Free all of the memory that we've allocated for DMA. Note that we may have
 * buffers that we've loaned up to the OS which are still outstanding. We'll
 * always free up the descriptor ring, because we no longer need that. For each
 * rcb, we'll iterate over it and if we drop the reference count to zero, then
 * we'll free the message block and DMA related resources. However, if ours
 * wasn't the last reference, then we'll note that there is pending data and
 * clean it up when the loaned block finally comes back to us.
 */
static void
i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
{
	uint32_t i, count, ref;

	i40e_rx_control_block_t *rcb;
	i40e_t *i40e = rxd->rxd_i40e;

	i40e_free_dma_buffer(&rxd->rxd_desc_area);
	rxd->rxd_desc_ring = NULL;
	rxd->rxd_desc_next = 0;

	mutex_enter(&i40e->i40e_rx_pending_lock);

	rcb = rxd->rxd_rcb_area;
	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;

	for (i = 0; i < count; i++, rcb++) {
		VERIFY(rcb != NULL);

		/*
		 * If we're cleaning up from a failed creation attempt, then an
		 * entry may never have been assembled, which would mean that
		 * its reference count is zero. If we find that, we leave it
		 * be, because nothing else should be modifying it at this
		 * point. We're not at the point that any more references can
		 * be added, just removed.
		 */
		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
			continue;

		ref = atomic_dec_32_nv(&rcb->rcb_ref);
		if (ref == 0) {
			freemsg(rcb->rcb_mp);
			rcb->rcb_mp = NULL;
			i40e_free_dma_buffer(&rcb->rcb_dma);
		} else {
			atomic_inc_32(&rxd->rxd_rcb_pending);
			atomic_inc_32(&i40e->i40e_rx_pending);
		}
	}
	mutex_exit(&i40e->i40e_rx_pending_lock);
}
805da5577f0SRobert Mustacchi
806da5577f0SRobert Mustacchi /*
807da5577f0SRobert Mustacchi * Initialize the DMA memory for the descriptor ring and for each frame in the
808da5577f0SRobert Mustacchi * control block list.
809da5577f0SRobert Mustacchi */
810da5577f0SRobert Mustacchi static boolean_t
i40e_alloc_rx_dma(i40e_rx_data_t * rxd)811da5577f0SRobert Mustacchi i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
812da5577f0SRobert Mustacchi {
813da5577f0SRobert Mustacchi int i, count;
814da5577f0SRobert Mustacchi size_t dmasz;
815da5577f0SRobert Mustacchi i40e_rx_control_block_t *rcb;
816da5577f0SRobert Mustacchi i40e_t *i40e = rxd->rxd_i40e;
817da5577f0SRobert Mustacchi
818da5577f0SRobert Mustacchi /*
819*8d5069bcSRyan Zezeski * First allocate the RX descriptor ring.
820da5577f0SRobert Mustacchi */
821da5577f0SRobert Mustacchi dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
822da5577f0SRobert Mustacchi VERIFY(dmasz > 0);
823da5577f0SRobert Mustacchi if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
824da5577f0SRobert Mustacchi &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
825da5577f0SRobert Mustacchi B_TRUE, dmasz) == B_FALSE) {
826da5577f0SRobert Mustacchi i40e_error(i40e, "failed to allocate DMA resources "
827*8d5069bcSRyan Zezeski "for RX descriptor ring");
828da5577f0SRobert Mustacchi return (B_FALSE);
829da5577f0SRobert Mustacchi }
830da5577f0SRobert Mustacchi rxd->rxd_desc_ring =
831da5577f0SRobert Mustacchi (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
832da5577f0SRobert Mustacchi rxd->rxd_desc_next = 0;
833da5577f0SRobert Mustacchi
834da5577f0SRobert Mustacchi count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
835da5577f0SRobert Mustacchi rcb = rxd->rxd_rcb_area;
836da5577f0SRobert Mustacchi
837da5577f0SRobert Mustacchi dmasz = i40e->i40e_rx_buf_size;
838da5577f0SRobert Mustacchi VERIFY(dmasz > 0);
839da5577f0SRobert Mustacchi for (i = 0; i < count; i++, rcb++) {
840da5577f0SRobert Mustacchi i40e_dma_buffer_t *dmap;
841da5577f0SRobert Mustacchi VERIFY(rcb != NULL);
842da5577f0SRobert Mustacchi
843da5577f0SRobert Mustacchi if (i < rxd->rxd_ring_size) {
844da5577f0SRobert Mustacchi rxd->rxd_work_list[i] = rcb;
845da5577f0SRobert Mustacchi } else {
846da5577f0SRobert Mustacchi rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
847da5577f0SRobert Mustacchi }
848da5577f0SRobert Mustacchi
849da5577f0SRobert Mustacchi dmap = &rcb->rcb_dma;
850da5577f0SRobert Mustacchi if (i40e_alloc_dma_buffer(i40e, dmap,
851da5577f0SRobert Mustacchi &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
852da5577f0SRobert Mustacchi B_TRUE, B_FALSE, dmasz) == B_FALSE) {
853*8d5069bcSRyan Zezeski i40e_error(i40e, "failed to allocate RX dma buffer");
854da5577f0SRobert Mustacchi return (B_FALSE);
855da5577f0SRobert Mustacchi }
856da5577f0SRobert Mustacchi
857da5577f0SRobert Mustacchi /*
858da5577f0SRobert Mustacchi * Initialize the control block and offset the DMA address. See
859da5577f0SRobert Mustacchi * the note in the big theory statement that explains how this
860da5577f0SRobert Mustacchi * helps IP deal with alignment. Note, we don't worry about
861da5577f0SRobert Mustacchi 		 * whether or not we successfully get an mblk_t from desballoc;
862da5577f0SRobert Mustacchi 		 * failure is a common case that we have to handle later on in
863da5577f0SRobert Mustacchi 		 * the system.
864da5577f0SRobert Mustacchi */
865da5577f0SRobert Mustacchi dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
866da5577f0SRobert Mustacchi dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
867da5577f0SRobert Mustacchi dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
868da5577f0SRobert Mustacchi
869da5577f0SRobert Mustacchi rcb->rcb_ref = 1;
870da5577f0SRobert Mustacchi rcb->rcb_rxd = rxd;
871da5577f0SRobert Mustacchi rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
872da5577f0SRobert Mustacchi rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
873da5577f0SRobert Mustacchi rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
874da5577f0SRobert Mustacchi dmap->dmab_size, 0, &rcb->rcb_free_rtn);
875da5577f0SRobert Mustacchi }
876da5577f0SRobert Mustacchi
877da5577f0SRobert Mustacchi return (B_TRUE);
878da5577f0SRobert Mustacchi }
879da5577f0SRobert Mustacchi
880da5577f0SRobert Mustacchi static void
881da5577f0SRobert Mustacchi i40e_free_tx_dma(i40e_trqpair_t *itrq)
882da5577f0SRobert Mustacchi {
883da5577f0SRobert Mustacchi size_t fsz;
884da5577f0SRobert Mustacchi
885da5577f0SRobert Mustacchi if (itrq->itrq_tcb_area != NULL) {
886da5577f0SRobert Mustacchi uint32_t i;
887da5577f0SRobert Mustacchi i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
888da5577f0SRobert Mustacchi
889da5577f0SRobert Mustacchi for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
890da5577f0SRobert Mustacchi i40e_free_dma_buffer(&tcb->tcb_dma);
891da5577f0SRobert Mustacchi if (tcb->tcb_dma_handle != NULL) {
892da5577f0SRobert Mustacchi ddi_dma_free_handle(&tcb->tcb_dma_handle);
893da5577f0SRobert Mustacchi tcb->tcb_dma_handle = NULL;
894da5577f0SRobert Mustacchi }
895*8d5069bcSRyan Zezeski if (tcb->tcb_lso_dma_handle != NULL) {
896*8d5069bcSRyan Zezeski ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
897*8d5069bcSRyan Zezeski tcb->tcb_lso_dma_handle = NULL;
898*8d5069bcSRyan Zezeski }
899da5577f0SRobert Mustacchi }
900da5577f0SRobert Mustacchi
901da5577f0SRobert Mustacchi fsz = sizeof (i40e_tx_control_block_t) *
902da5577f0SRobert Mustacchi itrq->itrq_tx_free_list_size;
903da5577f0SRobert Mustacchi kmem_free(itrq->itrq_tcb_area, fsz);
904da5577f0SRobert Mustacchi itrq->itrq_tcb_area = NULL;
905da5577f0SRobert Mustacchi }
906da5577f0SRobert Mustacchi
907da5577f0SRobert Mustacchi if (itrq->itrq_tcb_free_list != NULL) {
908da5577f0SRobert Mustacchi fsz = sizeof (i40e_tx_control_block_t *) *
909da5577f0SRobert Mustacchi itrq->itrq_tx_free_list_size;
910da5577f0SRobert Mustacchi kmem_free(itrq->itrq_tcb_free_list, fsz);
911da5577f0SRobert Mustacchi itrq->itrq_tcb_free_list = NULL;
912da5577f0SRobert Mustacchi }
913da5577f0SRobert Mustacchi
914da5577f0SRobert Mustacchi if (itrq->itrq_tcb_work_list != NULL) {
915da5577f0SRobert Mustacchi fsz = sizeof (i40e_tx_control_block_t *) *
916da5577f0SRobert Mustacchi itrq->itrq_tx_ring_size;
917da5577f0SRobert Mustacchi kmem_free(itrq->itrq_tcb_work_list, fsz);
918da5577f0SRobert Mustacchi itrq->itrq_tcb_work_list = NULL;
919da5577f0SRobert Mustacchi }
920da5577f0SRobert Mustacchi
921da5577f0SRobert Mustacchi i40e_free_dma_buffer(&itrq->itrq_desc_area);
922da5577f0SRobert Mustacchi itrq->itrq_desc_ring = NULL;
923da5577f0SRobert Mustacchi
924da5577f0SRobert Mustacchi }
925da5577f0SRobert Mustacchi
926da5577f0SRobert Mustacchi static boolean_t
927da5577f0SRobert Mustacchi i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
928da5577f0SRobert Mustacchi {
929da5577f0SRobert Mustacchi int i, ret;
930da5577f0SRobert Mustacchi size_t dmasz;
931da5577f0SRobert Mustacchi i40e_tx_control_block_t *tcb;
932da5577f0SRobert Mustacchi i40e_t *i40e = itrq->itrq_i40e;
933da5577f0SRobert Mustacchi
934da5577f0SRobert Mustacchi itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
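	/*
	 * The TX free list is sized at 1.5x the ring size, so there are more
	 * control blocks than descriptors.
	 */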
935da5577f0SRobert Mustacchi itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
936da5577f0SRobert Mustacchi (i40e->i40e_tx_ring_size >> 1);
937da5577f0SRobert Mustacchi
938da5577f0SRobert Mustacchi /*
939*8d5069bcSRyan Zezeski * Allocate an additional TX descriptor for the writeback head.
940da5577f0SRobert Mustacchi */
941da5577f0SRobert Mustacchi dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
942da5577f0SRobert Mustacchi dmasz += sizeof (i40e_tx_desc_t);
943da5577f0SRobert Mustacchi
944da5577f0SRobert Mustacchi VERIFY(dmasz > 0);
945da5577f0SRobert Mustacchi if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
946da5577f0SRobert Mustacchi &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
947da5577f0SRobert Mustacchi B_FALSE, B_TRUE, dmasz) == B_FALSE) {
948*8d5069bcSRyan Zezeski i40e_error(i40e, "failed to allocate DMA resources for TX "
949da5577f0SRobert Mustacchi "descriptor ring");
950da5577f0SRobert Mustacchi return (B_FALSE);
951da5577f0SRobert Mustacchi }
952da5577f0SRobert Mustacchi itrq->itrq_desc_ring =
953da5577f0SRobert Mustacchi (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
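	/*
	 * The writeback head lives in the extra descriptor slot that was
	 * allocated just past the end of the ring above.
	 */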
954da5577f0SRobert Mustacchi itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
955da5577f0SRobert Mustacchi itrq->itrq_tx_ring_size);
956da5577f0SRobert Mustacchi itrq->itrq_desc_head = 0;
957da5577f0SRobert Mustacchi itrq->itrq_desc_tail = 0;
958da5577f0SRobert Mustacchi itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
959da5577f0SRobert Mustacchi
960da5577f0SRobert Mustacchi itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
961da5577f0SRobert Mustacchi sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
962da5577f0SRobert Mustacchi if (itrq->itrq_tcb_work_list == NULL) {
963*8d5069bcSRyan Zezeski i40e_error(i40e, "failed to allocate a %d entry TX work list "
964da5577f0SRobert Mustacchi "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
965da5577f0SRobert Mustacchi goto cleanup;
966da5577f0SRobert Mustacchi }
967da5577f0SRobert Mustacchi
968da5577f0SRobert Mustacchi itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
969da5577f0SRobert Mustacchi 	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
970da5577f0SRobert Mustacchi if (itrq->itrq_tcb_free_list == NULL) {
971*8d5069bcSRyan Zezeski i40e_error(i40e, "failed to allocate a %d entry TX free list "
972da5577f0SRobert Mustacchi "for ring %d", itrq->itrq_tx_free_list_size,
973da5577f0SRobert Mustacchi itrq->itrq_index);
974da5577f0SRobert Mustacchi goto cleanup;
975da5577f0SRobert Mustacchi }
976da5577f0SRobert Mustacchi
977da5577f0SRobert Mustacchi /*
978*8d5069bcSRyan Zezeski * We allocate enough TX control blocks to cover the free list.
979da5577f0SRobert Mustacchi */
980da5577f0SRobert Mustacchi itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
981da5577f0SRobert Mustacchi itrq->itrq_tx_free_list_size, KM_NOSLEEP);
982da5577f0SRobert Mustacchi if (itrq->itrq_tcb_area == NULL) {
983da5577f0SRobert Mustacchi i40e_error(i40e, "failed to allocate a %d entry tcb area for "
984da5577f0SRobert Mustacchi "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
985da5577f0SRobert Mustacchi goto cleanup;
986da5577f0SRobert Mustacchi }
987da5577f0SRobert Mustacchi
988da5577f0SRobert Mustacchi /*
989da5577f0SRobert Mustacchi * For each tcb, allocate DMA memory.
990da5577f0SRobert Mustacchi */
991da5577f0SRobert Mustacchi dmasz = i40e->i40e_tx_buf_size;
992da5577f0SRobert Mustacchi VERIFY(dmasz > 0);
993da5577f0SRobert Mustacchi tcb = itrq->itrq_tcb_area;
994da5577f0SRobert Mustacchi for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
995da5577f0SRobert Mustacchi VERIFY(tcb != NULL);
996da5577f0SRobert Mustacchi
997da5577f0SRobert Mustacchi /*
998da5577f0SRobert Mustacchi 		 * Allocate both a DMA buffer, which we'll use when we copy
999da5577f0SRobert Mustacchi 		 * packets for transmission, and a DMA handle, which we'll
1000da5577f0SRobert Mustacchi 		 * use when we bind data.
1001da5577f0SRobert Mustacchi */
1002da5577f0SRobert Mustacchi ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1003da5577f0SRobert Mustacchi &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
1004da5577f0SRobert Mustacchi &tcb->tcb_dma_handle);
1005da5577f0SRobert Mustacchi if (ret != DDI_SUCCESS) {
1006*8d5069bcSRyan Zezeski i40e_error(i40e, "failed to allocate DMA handle for TX "
1007da5577f0SRobert Mustacchi "data binding on ring %d: %d", itrq->itrq_index,
1008da5577f0SRobert Mustacchi ret);
1009da5577f0SRobert Mustacchi tcb->tcb_dma_handle = NULL;
1010da5577f0SRobert Mustacchi goto cleanup;
1011da5577f0SRobert Mustacchi }
1012da5577f0SRobert Mustacchi
1013*8d5069bcSRyan Zezeski ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1014*8d5069bcSRyan Zezeski &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
1015*8d5069bcSRyan Zezeski &tcb->tcb_lso_dma_handle);
1016*8d5069bcSRyan Zezeski if (ret != DDI_SUCCESS) {
1017*8d5069bcSRyan Zezeski i40e_error(i40e, "failed to allocate DMA handle for TX "
1018*8d5069bcSRyan Zezeski "LSO data binding on ring %d: %d", itrq->itrq_index,
1019*8d5069bcSRyan Zezeski ret);
1020*8d5069bcSRyan Zezeski tcb->tcb_lso_dma_handle = NULL;
1021*8d5069bcSRyan Zezeski goto cleanup;
1022*8d5069bcSRyan Zezeski }
1023*8d5069bcSRyan Zezeski
1024da5577f0SRobert Mustacchi if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
1025da5577f0SRobert Mustacchi &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
1026da5577f0SRobert Mustacchi B_TRUE, B_FALSE, dmasz) == B_FALSE) {
1027da5577f0SRobert Mustacchi i40e_error(i40e, "failed to allocate %ld bytes of "
1028*8d5069bcSRyan Zezeski "DMA for TX data binding on ring %d", dmasz,
1029da5577f0SRobert Mustacchi itrq->itrq_index);
1030da5577f0SRobert Mustacchi goto cleanup;
1031da5577f0SRobert Mustacchi }
1032da5577f0SRobert Mustacchi
1033da5577f0SRobert Mustacchi itrq->itrq_tcb_free_list[i] = tcb;
1034da5577f0SRobert Mustacchi }
1035da5577f0SRobert Mustacchi
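	/*
	 * Every control block starts out on the free list, so the free count
	 * begins at the full free list size.
	 */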
1036da5577f0SRobert Mustacchi itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
1037da5577f0SRobert Mustacchi
1038da5577f0SRobert Mustacchi return (B_TRUE);
1039da5577f0SRobert Mustacchi
1040da5577f0SRobert Mustacchi cleanup:
1041da5577f0SRobert Mustacchi i40e_free_tx_dma(itrq);
1042da5577f0SRobert Mustacchi return (B_FALSE);
1043da5577f0SRobert Mustacchi }
1044da5577f0SRobert Mustacchi
1045da5577f0SRobert Mustacchi /*
1046da5577f0SRobert Mustacchi * Free all memory associated with all of the rings on this i40e instance. Note,
1047da5577f0SRobert Mustacchi * this is done as part of the GLDv3 stop routine.
1048da5577f0SRobert Mustacchi */
1049da5577f0SRobert Mustacchi void
1050da5577f0SRobert Mustacchi i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
1051da5577f0SRobert Mustacchi {
1052da5577f0SRobert Mustacchi int i;
1053da5577f0SRobert Mustacchi
1054da5577f0SRobert Mustacchi for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1055da5577f0SRobert Mustacchi i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
1056da5577f0SRobert Mustacchi
1057da5577f0SRobert Mustacchi /*
1058*8d5069bcSRyan Zezeski * In some cases i40e_alloc_rx_data() may have failed
1059*8d5069bcSRyan Zezeski * and in that case there is no rxd to free.
1060*8d5069bcSRyan Zezeski */
1061*8d5069bcSRyan Zezeski if (rxd == NULL)
1062*8d5069bcSRyan Zezeski continue;
1063*8d5069bcSRyan Zezeski
1064*8d5069bcSRyan Zezeski /*
1065*8d5069bcSRyan Zezeski * Clean up our RX data. We have to free DMA resources first and
1066da5577f0SRobert Mustacchi 		 * then, once there are no more pending RCBs, we'll go ahead
1067da5577f0SRobert Mustacchi * and clean things up. Note, we can't set the stopped flag on
1068*8d5069bcSRyan Zezeski * the RX data until after we've done the first pass of the
1069da5577f0SRobert Mustacchi * pending resources. Otherwise we might race with
1070da5577f0SRobert Mustacchi * i40e_rx_recycle on determining who should free the
1071da5577f0SRobert Mustacchi * i40e_rx_data_t above.
1072da5577f0SRobert Mustacchi */
1073da5577f0SRobert Mustacchi i40e_free_rx_dma(rxd, failed_init);
1074da5577f0SRobert Mustacchi
1075da5577f0SRobert Mustacchi mutex_enter(&i40e->i40e_rx_pending_lock);
1076da5577f0SRobert Mustacchi rxd->rxd_shutdown = B_TRUE;
1077da5577f0SRobert Mustacchi if (rxd->rxd_rcb_pending == 0) {
1078da5577f0SRobert Mustacchi i40e_free_rx_data(rxd);
1079da5577f0SRobert Mustacchi i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
1080da5577f0SRobert Mustacchi }
1081da5577f0SRobert Mustacchi mutex_exit(&i40e->i40e_rx_pending_lock);
1082da5577f0SRobert Mustacchi
1083da5577f0SRobert Mustacchi i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
1084da5577f0SRobert Mustacchi }
1085da5577f0SRobert Mustacchi }
1086da5577f0SRobert Mustacchi
1087da5577f0SRobert Mustacchi /*
1088da5577f0SRobert Mustacchi * Allocate all of the resources associated with all of the rings on this i40e
1089da5577f0SRobert Mustacchi * instance. Note this is done as part of the GLDv3 start routine and thus we
1090da5577f0SRobert Mustacchi * should not use blocking allocations. This takes care of both DMA and non-DMA
1091da5577f0SRobert Mustacchi * related resources.
1092da5577f0SRobert Mustacchi */
1093da5577f0SRobert Mustacchi boolean_t
1094da5577f0SRobert Mustacchi i40e_alloc_ring_mem(i40e_t *i40e)
1095da5577f0SRobert Mustacchi {
1096da5577f0SRobert Mustacchi int i;
1097da5577f0SRobert Mustacchi
1098da5577f0SRobert Mustacchi for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1099da5577f0SRobert Mustacchi if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) ==
1100da5577f0SRobert Mustacchi B_FALSE)
1101da5577f0SRobert Mustacchi goto unwind;
1102da5577f0SRobert Mustacchi
1103da5577f0SRobert Mustacchi if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) ==
1104da5577f0SRobert Mustacchi B_FALSE)
1105da5577f0SRobert Mustacchi goto unwind;
1106da5577f0SRobert Mustacchi
1107da5577f0SRobert Mustacchi if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE)
1108da5577f0SRobert Mustacchi goto unwind;
1109da5577f0SRobert Mustacchi }
1110da5577f0SRobert Mustacchi
1111da5577f0SRobert Mustacchi return (B_TRUE);
1112da5577f0SRobert Mustacchi
1113da5577f0SRobert Mustacchi unwind:
1114da5577f0SRobert Mustacchi i40e_free_ring_mem(i40e, B_TRUE);
1115da5577f0SRobert Mustacchi return (B_FALSE);
1116da5577f0SRobert Mustacchi }
1117da5577f0SRobert Mustacchi
1118da5577f0SRobert Mustacchi
1119da5577f0SRobert Mustacchi /*
1120da5577f0SRobert Mustacchi * Because every instance of i40e may have different support for FMA
1121da5577f0SRobert Mustacchi  * capabilities, we copy the DMA attributes into the i40e_t, adjust them
1122da5577f0SRobert Mustacchi  * per instance, and use those copies when determining attributes.
1123da5577f0SRobert Mustacchi */
1124da5577f0SRobert Mustacchi void
1125da5577f0SRobert Mustacchi i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1126da5577f0SRobert Mustacchi {
1127da5577f0SRobert Mustacchi bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1128da5577f0SRobert Mustacchi sizeof (ddi_dma_attr_t));
1129da5577f0SRobert Mustacchi bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1130da5577f0SRobert Mustacchi sizeof (ddi_dma_attr_t));
1131*8d5069bcSRyan Zezeski bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
1132*8d5069bcSRyan Zezeski sizeof (ddi_dma_attr_t));
1133da5577f0SRobert Mustacchi bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1134da5577f0SRobert Mustacchi sizeof (ddi_device_acc_attr_t));
1135da5577f0SRobert Mustacchi bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1136da5577f0SRobert Mustacchi sizeof (ddi_device_acc_attr_t));
1137da5577f0SRobert Mustacchi
1138da5577f0SRobert Mustacchi if (fma == B_TRUE) {
1139da5577f0SRobert Mustacchi i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1140da5577f0SRobert Mustacchi i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1141*8d5069bcSRyan Zezeski i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
1142*8d5069bcSRyan Zezeski DDI_DMA_FLAGERR;
1143da5577f0SRobert Mustacchi } else {
1144da5577f0SRobert Mustacchi i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1145da5577f0SRobert Mustacchi i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1146*8d5069bcSRyan Zezeski i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
1147*8d5069bcSRyan Zezeski ~DDI_DMA_FLAGERR;
1148da5577f0SRobert Mustacchi }
1149da5577f0SRobert Mustacchi }
1150da5577f0SRobert Mustacchi
1151da5577f0SRobert Mustacchi static void
1152da5577f0SRobert Mustacchi i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1153da5577f0SRobert Mustacchi {
1154da5577f0SRobert Mustacchi mutex_enter(&rxd->rxd_free_lock);
1155da5577f0SRobert Mustacchi ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1156da5577f0SRobert Mustacchi ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1157da5577f0SRobert Mustacchi rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1158da5577f0SRobert Mustacchi rxd->rxd_rcb_free++;
1159da5577f0SRobert Mustacchi mutex_exit(&rxd->rxd_free_lock);
1160da5577f0SRobert Mustacchi }
1161da5577f0SRobert Mustacchi
1162da5577f0SRobert Mustacchi static i40e_rx_control_block_t *
1163da5577f0SRobert Mustacchi i40e_rcb_alloc(i40e_rx_data_t *rxd)
1164da5577f0SRobert Mustacchi {
1165da5577f0SRobert Mustacchi i40e_rx_control_block_t *rcb;
1166da5577f0SRobert Mustacchi
1167da5577f0SRobert Mustacchi mutex_enter(&rxd->rxd_free_lock);
1168da5577f0SRobert Mustacchi if (rxd->rxd_rcb_free == 0) {
1169da5577f0SRobert Mustacchi mutex_exit(&rxd->rxd_free_lock);
1170da5577f0SRobert Mustacchi return (NULL);
1171da5577f0SRobert Mustacchi }
1172da5577f0SRobert Mustacchi rxd->rxd_rcb_free--;
1173da5577f0SRobert Mustacchi rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1174da5577f0SRobert Mustacchi VERIFY(rcb != NULL);
1175da5577f0SRobert Mustacchi rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1176da5577f0SRobert Mustacchi mutex_exit(&rxd->rxd_free_lock);
1177da5577f0SRobert Mustacchi
1178da5577f0SRobert Mustacchi return (rcb);
1179da5577f0SRobert Mustacchi }
1180da5577f0SRobert Mustacchi
1181da5577f0SRobert Mustacchi /*
1182da5577f0SRobert Mustacchi * This is the callback that we get from the OS when freemsg(9F) has been called
1183da5577f0SRobert Mustacchi * on a loaned descriptor. In addition, if we take the last reference count
1184*8d5069bcSRyan Zezeski * here, then we have to tear down all of the RX data.
1185da5577f0SRobert Mustacchi */
1186da5577f0SRobert Mustacchi void
1187da5577f0SRobert Mustacchi i40e_rx_recycle(caddr_t arg)
1188da5577f0SRobert Mustacchi {
1189da5577f0SRobert Mustacchi uint32_t ref;
1190da5577f0SRobert Mustacchi i40e_rx_control_block_t *rcb;
1191da5577f0SRobert Mustacchi i40e_rx_data_t *rxd;
1192da5577f0SRobert Mustacchi i40e_t *i40e;
1193da5577f0SRobert Mustacchi
1194da5577f0SRobert Mustacchi /* LINTED: E_BAD_PTR_CAST_ALIGN */
1195da5577f0SRobert Mustacchi rcb = (i40e_rx_control_block_t *)arg;
1196da5577f0SRobert Mustacchi rxd = rcb->rcb_rxd;
1197da5577f0SRobert Mustacchi i40e = rxd->rxd_i40e;
1198da5577f0SRobert Mustacchi
1199da5577f0SRobert Mustacchi /*
1200da5577f0SRobert Mustacchi * It's possible for this to be called with a reference count of zero.
1201da5577f0SRobert Mustacchi * That will happen when we're doing the freemsg after taking the last
1202da5577f0SRobert Mustacchi * reference because we're tearing down everything and this rcb is not
1203da5577f0SRobert Mustacchi * outstanding.
1204da5577f0SRobert Mustacchi */
1205da5577f0SRobert Mustacchi if (rcb->rcb_ref == 0)
1206da5577f0SRobert Mustacchi return;
1207da5577f0SRobert Mustacchi
1208da5577f0SRobert Mustacchi /*
1209da5577f0SRobert Mustacchi * Don't worry about failure of desballoc here. It'll only become fatal
1210da5577f0SRobert Mustacchi * if we're trying to use it and we can't in i40e_rx_bind().
1211da5577f0SRobert Mustacchi */
1212da5577f0SRobert Mustacchi rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1213da5577f0SRobert Mustacchi rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1214da5577f0SRobert Mustacchi i40e_rcb_free(rxd, rcb);
1215da5577f0SRobert Mustacchi
1216da5577f0SRobert Mustacchi /*
1217da5577f0SRobert Mustacchi * It's possible that the rcb was being used while we are shutting down
1218da5577f0SRobert Mustacchi * the device. In that case, we'll take the final reference from the
1219da5577f0SRobert Mustacchi * device here.
1220da5577f0SRobert Mustacchi */
1221da5577f0SRobert Mustacchi ref = atomic_dec_32_nv(&rcb->rcb_ref);
1222da5577f0SRobert Mustacchi if (ref == 0) {
1223da5577f0SRobert Mustacchi freemsg(rcb->rcb_mp);
1224da5577f0SRobert Mustacchi rcb->rcb_mp = NULL;
1225da5577f0SRobert Mustacchi i40e_free_dma_buffer(&rcb->rcb_dma);
1226da5577f0SRobert Mustacchi
1227da5577f0SRobert Mustacchi mutex_enter(&i40e->i40e_rx_pending_lock);
1228da5577f0SRobert Mustacchi atomic_dec_32(&rxd->rxd_rcb_pending);
1229da5577f0SRobert Mustacchi atomic_dec_32(&i40e->i40e_rx_pending);
1230da5577f0SRobert Mustacchi
1231da5577f0SRobert Mustacchi /*
1232da5577f0SRobert Mustacchi * If this was the last block and it's been indicated that we've
1233da5577f0SRobert Mustacchi * passed the shutdown point, we should clean up.
1234da5577f0SRobert Mustacchi */
1235da5577f0SRobert Mustacchi if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
1236da5577f0SRobert Mustacchi i40e_free_rx_data(rxd);
1237da5577f0SRobert Mustacchi cv_broadcast(&i40e->i40e_rx_pending_cv);
1238da5577f0SRobert Mustacchi }
1239da5577f0SRobert Mustacchi
1240da5577f0SRobert Mustacchi mutex_exit(&i40e->i40e_rx_pending_lock);
1241da5577f0SRobert Mustacchi }
1242da5577f0SRobert Mustacchi }
1243da5577f0SRobert Mustacchi
1244da5577f0SRobert Mustacchi static mblk_t *
1245da5577f0SRobert Mustacchi i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1246da5577f0SRobert Mustacchi uint32_t plen)
1247da5577f0SRobert Mustacchi {
1248da5577f0SRobert Mustacchi mblk_t *mp;
1249da5577f0SRobert Mustacchi i40e_t *i40e = rxd->rxd_i40e;
1250da5577f0SRobert Mustacchi i40e_rx_control_block_t *rcb, *rep_rcb;
1251da5577f0SRobert Mustacchi
1252da5577f0SRobert Mustacchi ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1253da5577f0SRobert Mustacchi
1254da5577f0SRobert Mustacchi if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
1255da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
1256da5577f0SRobert Mustacchi return (NULL);
1257da5577f0SRobert Mustacchi }
1258da5577f0SRobert Mustacchi
1259da5577f0SRobert Mustacchi rcb = rxd->rxd_work_list[index];
1260da5577f0SRobert Mustacchi
1261da5577f0SRobert Mustacchi /*
1262da5577f0SRobert Mustacchi * Check to make sure we have a mblk_t. If we don't, this is our last
1263da5577f0SRobert Mustacchi * chance to try and get one.
1264da5577f0SRobert Mustacchi */
1265da5577f0SRobert Mustacchi if (rcb->rcb_mp == NULL) {
1266da5577f0SRobert Mustacchi rcb->rcb_mp =
1267da5577f0SRobert Mustacchi desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1268da5577f0SRobert Mustacchi rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1269da5577f0SRobert Mustacchi if (rcb->rcb_mp == NULL) {
1270da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
1271da5577f0SRobert Mustacchi i40e_rcb_free(rxd, rcb);
1272da5577f0SRobert Mustacchi return (NULL);
1273da5577f0SRobert Mustacchi }
1274da5577f0SRobert Mustacchi }
1275da5577f0SRobert Mustacchi
1276da5577f0SRobert Mustacchi I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1277da5577f0SRobert Mustacchi
1278da5577f0SRobert Mustacchi if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1279da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1280da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1281da5577f0SRobert Mustacchi i40e_rcb_free(rxd, rcb);
1282da5577f0SRobert Mustacchi return (NULL);
1283da5577f0SRobert Mustacchi }
1284da5577f0SRobert Mustacchi
1285da5577f0SRobert Mustacchi /*
1286da5577f0SRobert Mustacchi * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
1287da5577f0SRobert Mustacchi */
1288da5577f0SRobert Mustacchi mp = rcb->rcb_mp;
1289da5577f0SRobert Mustacchi atomic_inc_32(&rcb->rcb_ref);
1290da5577f0SRobert Mustacchi mp->b_wptr = mp->b_rptr + plen;
1291da5577f0SRobert Mustacchi mp->b_next = mp->b_cont = NULL;
1292da5577f0SRobert Mustacchi
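	/*
	 * Swap the replacement control block into the work list slot. The
	 * original buffer is now loaned upstream and will come back to us
	 * through i40e_rx_recycle() when the mblk_t is freed.
	 */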
1293da5577f0SRobert Mustacchi rxd->rxd_work_list[index] = rep_rcb;
1294da5577f0SRobert Mustacchi return (mp);
1295da5577f0SRobert Mustacchi }
1296da5577f0SRobert Mustacchi
1297da5577f0SRobert Mustacchi /*
1298da5577f0SRobert Mustacchi * We're going to allocate a new message block for this frame and attempt to
1299da5577f0SRobert Mustacchi * receive it. See the big theory statement for more information on when we copy
1300da5577f0SRobert Mustacchi * versus bind.
1301da5577f0SRobert Mustacchi */
1302da5577f0SRobert Mustacchi static mblk_t *
1303da5577f0SRobert Mustacchi i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1304da5577f0SRobert Mustacchi uint32_t plen)
1305da5577f0SRobert Mustacchi {
1306da5577f0SRobert Mustacchi i40e_t *i40e = rxd->rxd_i40e;
1307da5577f0SRobert Mustacchi i40e_rx_control_block_t *rcb;
1308da5577f0SRobert Mustacchi mblk_t *mp;
1309da5577f0SRobert Mustacchi
1310da5577f0SRobert Mustacchi ASSERT(index < rxd->rxd_ring_size);
1311da5577f0SRobert Mustacchi rcb = rxd->rxd_work_list[index];
1312da5577f0SRobert Mustacchi
1313da5577f0SRobert Mustacchi I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1314da5577f0SRobert Mustacchi
1315da5577f0SRobert Mustacchi if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1316da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1317da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1318da5577f0SRobert Mustacchi return (NULL);
1319da5577f0SRobert Mustacchi }
1320da5577f0SRobert Mustacchi
1321da5577f0SRobert Mustacchi mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
1322da5577f0SRobert Mustacchi if (mp == NULL) {
1323da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
1324da5577f0SRobert Mustacchi return (NULL);
1325da5577f0SRobert Mustacchi }
1326da5577f0SRobert Mustacchi
1327da5577f0SRobert Mustacchi mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
1328da5577f0SRobert Mustacchi bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
1329da5577f0SRobert Mustacchi mp->b_wptr = mp->b_rptr + plen;
1330da5577f0SRobert Mustacchi
1331da5577f0SRobert Mustacchi return (mp);
1332da5577f0SRobert Mustacchi }
1333da5577f0SRobert Mustacchi
1334da5577f0SRobert Mustacchi /*
1335da5577f0SRobert Mustacchi * Determine if the device has enabled any checksum flags for us. The level of
1336da5577f0SRobert Mustacchi  * checksum computed will depend on the type of packet that we have, which is
1337da5577f0SRobert Mustacchi  * contained in ptype. For example, the checksum logic will vary
1338da5577f0SRobert Mustacchi * depending on whether or not the packet is considered tunneled, whether it
1339da5577f0SRobert Mustacchi * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
1340da5577f0SRobert Mustacchi * valid.
1341da5577f0SRobert Mustacchi *
1342da5577f0SRobert Mustacchi * While there are additional checksums that we could recognize here, we'll need
1343da5577f0SRobert Mustacchi * to get some additional GLDv3 enhancements to be able to properly describe
1344da5577f0SRobert Mustacchi * them.
1345da5577f0SRobert Mustacchi */
1346da5577f0SRobert Mustacchi static void
1347da5577f0SRobert Mustacchi i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
1348da5577f0SRobert Mustacchi uint32_t ptype)
1349da5577f0SRobert Mustacchi {
1350da5577f0SRobert Mustacchi uint32_t cksum;
1351da5577f0SRobert Mustacchi struct i40e_rx_ptype_decoded pinfo;
1352da5577f0SRobert Mustacchi
1353da5577f0SRobert Mustacchi ASSERT(ptype <= 255);
1354da5577f0SRobert Mustacchi pinfo = decode_rx_desc_ptype(ptype);
1355da5577f0SRobert Mustacchi
1356da5577f0SRobert Mustacchi cksum = 0;
1357da5577f0SRobert Mustacchi
1358da5577f0SRobert Mustacchi /*
1359da5577f0SRobert Mustacchi * If the ptype isn't something that we know in the driver, then we
1360da5577f0SRobert Mustacchi * shouldn't even consider moving forward.
1361da5577f0SRobert Mustacchi */
1362da5577f0SRobert Mustacchi if (pinfo.known == 0) {
1363da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
1364da5577f0SRobert Mustacchi return;
1365da5577f0SRobert Mustacchi }
1366da5577f0SRobert Mustacchi
1367da5577f0SRobert Mustacchi /*
1368da5577f0SRobert Mustacchi * If hardware didn't set the L3L4P bit on the frame, then there is no
1369da5577f0SRobert Mustacchi * checksum offload to consider.
1370da5577f0SRobert Mustacchi */
1371da5577f0SRobert Mustacchi if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
1372da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
1373da5577f0SRobert Mustacchi return;
1374da5577f0SRobert Mustacchi }
1375da5577f0SRobert Mustacchi
1376da5577f0SRobert Mustacchi /*
1377da5577f0SRobert Mustacchi 	 * The device tells us that IPv6 checksums computed over packets with
1378da5577f0SRobert Mustacchi 	 * a Destination Options Header or a Routing Header shouldn't be
1379da5577f0SRobert Mustacchi 	 * trusted. Discard all checksums in this case.
1380da5577f0SRobert Mustacchi */
1381da5577f0SRobert Mustacchi if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1382da5577f0SRobert Mustacchi pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
1383da5577f0SRobert Mustacchi (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
1384da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
1385da5577f0SRobert Mustacchi return;
1386da5577f0SRobert Mustacchi }
1387da5577f0SRobert Mustacchi
1388da5577f0SRobert Mustacchi /*
1389da5577f0SRobert Mustacchi 	 * The hardware denotes three kinds of possible errors. Two are reserved
1390da5577f0SRobert Mustacchi 	 * for inner and outer IP checksum errors (IPE and EIPE) and the third
1391da5577f0SRobert Mustacchi 	 * is for L4 checksum errors (L4E). If there is only one IP header, then
1392da5577f0SRobert Mustacchi * the only thing that we care about is IPE. Note that since we don't
1393da5577f0SRobert Mustacchi * support inner checksums, we will ignore IPE being set on tunneled
1394da5577f0SRobert Mustacchi * packets and only care about EIPE.
1395da5577f0SRobert Mustacchi */
1396da5577f0SRobert Mustacchi if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1397da5577f0SRobert Mustacchi pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1398da5577f0SRobert Mustacchi if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
1399da5577f0SRobert Mustacchi if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
1400da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
1401da5577f0SRobert Mustacchi } else {
1402da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1403da5577f0SRobert Mustacchi cksum |= HCK_IPV4_HDRCKSUM_OK;
1404da5577f0SRobert Mustacchi }
1405da5577f0SRobert Mustacchi } else {
1406da5577f0SRobert Mustacchi if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
1407da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
1408da5577f0SRobert Mustacchi } else {
1409da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1410da5577f0SRobert Mustacchi cksum |= HCK_IPV4_HDRCKSUM_OK;
1411da5577f0SRobert Mustacchi }
1412da5577f0SRobert Mustacchi }
1413da5577f0SRobert Mustacchi }
1414da5577f0SRobert Mustacchi
1415da5577f0SRobert Mustacchi /*
1416da5577f0SRobert Mustacchi * We only have meaningful L4 checksums in the case of IP->L4 and
1417da5577f0SRobert Mustacchi 	 * IP->IP->L4. There is no outer L4 checksum data available in any
1418da5577f0SRobert Mustacchi * other case. Further, we don't bother reporting the valid checksum in
1419da5577f0SRobert Mustacchi * the case of IP->IP->L4 set.
1420da5577f0SRobert Mustacchi */
1421da5577f0SRobert Mustacchi if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1422da5577f0SRobert Mustacchi pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
1423da5577f0SRobert Mustacchi (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
1424da5577f0SRobert Mustacchi pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
1425da5577f0SRobert Mustacchi pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
1426da5577f0SRobert Mustacchi pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
1427da5577f0SRobert Mustacchi ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
1428da5577f0SRobert Mustacchi if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
1429da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
1430da5577f0SRobert Mustacchi } else {
1431da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
1432da5577f0SRobert Mustacchi cksum |= HCK_FULLCKSUM_OK;
1433da5577f0SRobert Mustacchi }
1434da5577f0SRobert Mustacchi }
1435da5577f0SRobert Mustacchi
1436da5577f0SRobert Mustacchi if (cksum != 0) {
1437da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
1438da5577f0SRobert Mustacchi mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
1439da5577f0SRobert Mustacchi } else {
1440da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
1441da5577f0SRobert Mustacchi }
1442da5577f0SRobert Mustacchi }
1443da5577f0SRobert Mustacchi
1444da5577f0SRobert Mustacchi mblk_t *
1445da5577f0SRobert Mustacchi i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
1446da5577f0SRobert Mustacchi {
1447da5577f0SRobert Mustacchi i40e_t *i40e;
1448da5577f0SRobert Mustacchi i40e_hw_t *hw;
1449da5577f0SRobert Mustacchi i40e_rx_data_t *rxd;
1450da5577f0SRobert Mustacchi uint32_t cur_head;
1451da5577f0SRobert Mustacchi i40e_rx_desc_t *cur_desc;
1452da5577f0SRobert Mustacchi i40e_rx_control_block_t *rcb;
1453da5577f0SRobert Mustacchi uint64_t rx_bytes, rx_frames;
1454da5577f0SRobert Mustacchi uint64_t stword;
1455da5577f0SRobert Mustacchi mblk_t *mp, *mp_head, **mp_tail;
1456da5577f0SRobert Mustacchi
1457da5577f0SRobert Mustacchi ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1458da5577f0SRobert Mustacchi rxd = itrq->itrq_rxdata;
1459da5577f0SRobert Mustacchi i40e = itrq->itrq_i40e;
1460da5577f0SRobert Mustacchi hw = &i40e->i40e_hw_space;
1461da5577f0SRobert Mustacchi
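	/*
	 * If the device isn't started, or it has hit an overtemp, suspend,
	 * or error condition, don't attempt to receive anything.
	 */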
1462da5577f0SRobert Mustacchi if (!(i40e->i40e_state & I40E_STARTED) ||
1463da5577f0SRobert Mustacchi (i40e->i40e_state & I40E_OVERTEMP) ||
1464da5577f0SRobert Mustacchi (i40e->i40e_state & I40E_SUSPENDED) ||
1465da5577f0SRobert Mustacchi (i40e->i40e_state & I40E_ERROR))
1466da5577f0SRobert Mustacchi return (NULL);
1467da5577f0SRobert Mustacchi
1468da5577f0SRobert Mustacchi /*
1469da5577f0SRobert Mustacchi * Before we do anything else, we have to make sure that all of the DMA
1470da5577f0SRobert Mustacchi * buffers are synced up and then check to make sure that they're
1471da5577f0SRobert Mustacchi * actually good from an FM perspective.
1472da5577f0SRobert Mustacchi */
1473da5577f0SRobert Mustacchi I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
1474da5577f0SRobert Mustacchi if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1475da5577f0SRobert Mustacchi DDI_FM_OK) {
1476da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1477da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1478da5577f0SRobert Mustacchi return (NULL);
1479da5577f0SRobert Mustacchi }
1480da5577f0SRobert Mustacchi
1481da5577f0SRobert Mustacchi /*
1482da5577f0SRobert Mustacchi * Prepare our stats. We do a limited amount of processing in both
1483da5577f0SRobert Mustacchi * polling and interrupt context. The limit in interrupt context is
1484da5577f0SRobert Mustacchi * based on frames, in polling context based on bytes.
1485da5577f0SRobert Mustacchi */
1486da5577f0SRobert Mustacchi rx_bytes = rx_frames = 0;
1487da5577f0SRobert Mustacchi mp_head = NULL;
1488da5577f0SRobert Mustacchi mp_tail = &mp_head;
1489da5577f0SRobert Mustacchi
1490da5577f0SRobert Mustacchi /*
1491da5577f0SRobert Mustacchi * At this point, the descriptor ring is available to check. We'll try
1492da5577f0SRobert Mustacchi * and process until we either run out of poll_bytes or descriptors.
1493da5577f0SRobert Mustacchi */
1494da5577f0SRobert Mustacchi cur_head = rxd->rxd_desc_next;
1495da5577f0SRobert Mustacchi cur_desc = &rxd->rxd_desc_ring[cur_head];
1496da5577f0SRobert Mustacchi stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1497da5577f0SRobert Mustacchi
1498da5577f0SRobert Mustacchi /*
1499da5577f0SRobert Mustacchi * Note, the primary invariant of this loop should be that cur_head,
1500da5577f0SRobert Mustacchi * cur_desc, and stword always point to the currently processed
1501da5577f0SRobert Mustacchi * descriptor. When we leave the loop, it should point to a descriptor
1502da5577f0SRobert Mustacchi * that HAS NOT been processed. Meaning, that if we haven't consumed the
1503da5577f0SRobert Mustacchi  * that HAS NOT been processed. Meaning that if we haven't consumed the
1504da5577f0SRobert Mustacchi */
1505da5577f0SRobert Mustacchi while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
1506da5577f0SRobert Mustacchi uint32_t error, eop, plen, ptype;
1507da5577f0SRobert Mustacchi
1508da5577f0SRobert Mustacchi /*
1509da5577f0SRobert Mustacchi * The DD, PLEN, and EOP bits are the only ones that are valid
1510da5577f0SRobert Mustacchi * in every frame. The error information is only valid when EOP
1511da5577f0SRobert Mustacchi * is set in the same frame.
1512da5577f0SRobert Mustacchi *
1513da5577f0SRobert Mustacchi 		 * At this time, because we don't do any LRO or header
1514da5577f0SRobert Mustacchi 		 * splitting, we expect that every frame will have EOP set in
1515da5577f0SRobert Mustacchi 		 * it. When later functionality comes in, we'll want to
1516da5577f0SRobert Mustacchi * re-evaluate this.
1517da5577f0SRobert Mustacchi */
1518da5577f0SRobert Mustacchi eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
1519da5577f0SRobert Mustacchi VERIFY(eop != 0);
1520da5577f0SRobert Mustacchi
1521da5577f0SRobert Mustacchi error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
1522da5577f0SRobert Mustacchi I40E_RXD_QW1_ERROR_SHIFT;
1523da5577f0SRobert Mustacchi if (error & I40E_RX_ERR_BITS) {
1524da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
1525da5577f0SRobert Mustacchi goto discard;
1526da5577f0SRobert Mustacchi }
1527da5577f0SRobert Mustacchi
1528da5577f0SRobert Mustacchi plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1529da5577f0SRobert Mustacchi I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1530da5577f0SRobert Mustacchi
1531da5577f0SRobert Mustacchi ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
1532da5577f0SRobert Mustacchi I40E_RXD_QW1_PTYPE_SHIFT;
1533da5577f0SRobert Mustacchi
1534da5577f0SRobert Mustacchi /*
1535da5577f0SRobert Mustacchi * This packet contains valid data. We should check to see if
1536da5577f0SRobert Mustacchi * we're actually going to consume it based on its length (to
1537da5577f0SRobert Mustacchi * ensure that we don't overshoot our quota). We determine
1538da5577f0SRobert Mustacchi * whether to bcopy or bind the DMA resources based on the size
1539da5577f0SRobert Mustacchi 		 * of the frame. However, on debug builds, we allow it to be
1540da5577f0SRobert Mustacchi 		 * overridden for testing purposes.
1541da5577f0SRobert Mustacchi *
1542da5577f0SRobert Mustacchi * We should be smarter about this and do DMA binding for
1543da5577f0SRobert Mustacchi * larger frames, but for now, it's really more important that
1544da5577f0SRobert Mustacchi * we actually just get something simple working.
1545da5577f0SRobert Mustacchi */
1546da5577f0SRobert Mustacchi
1547da5577f0SRobert Mustacchi /*
1548da5577f0SRobert Mustacchi * Ensure we don't exceed our polling quota by reading this
1549da5577f0SRobert Mustacchi * frame. Note we only bump bytes now, we bump frames later.
1550da5577f0SRobert Mustacchi */
1551da5577f0SRobert Mustacchi if ((poll_bytes != I40E_POLL_NULL) &&
1552da5577f0SRobert Mustacchi (rx_bytes + plen) > poll_bytes)
1553da5577f0SRobert Mustacchi break;
1554da5577f0SRobert Mustacchi rx_bytes += plen;
1555da5577f0SRobert Mustacchi
1556da5577f0SRobert Mustacchi mp = NULL;
1557da5577f0SRobert Mustacchi if (plen >= i40e->i40e_rx_dma_min)
1558da5577f0SRobert Mustacchi mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
1559da5577f0SRobert Mustacchi if (mp == NULL)
1560da5577f0SRobert Mustacchi mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
1561da5577f0SRobert Mustacchi
1562da5577f0SRobert Mustacchi if (mp != NULL) {
1563da5577f0SRobert Mustacchi if (i40e->i40e_rx_hcksum_enable)
1564da5577f0SRobert Mustacchi i40e_rx_hcksum(itrq, mp, stword, error, ptype);
1565da5577f0SRobert Mustacchi *mp_tail = mp;
1566da5577f0SRobert Mustacchi mp_tail = &mp->b_next;
1567da5577f0SRobert Mustacchi }
1568da5577f0SRobert Mustacchi
1569da5577f0SRobert Mustacchi /*
1570da5577f0SRobert Mustacchi * Now we need to prepare this frame for use again. See the
1571da5577f0SRobert Mustacchi * discussion in the big theory statements.
1572da5577f0SRobert Mustacchi *
1573da5577f0SRobert Mustacchi * However, right now we're doing the simple version of this.
1574da5577f0SRobert Mustacchi * Normally what we'd do would depend on whether or not we were
1575da5577f0SRobert Mustacchi 		 * doing DMA binding or bcopying. But because both paths leave a valid
1576da5577f0SRobert Mustacchi 		 * control block in the work list slot, we can always use the current
1577da5577f0SRobert Mustacchi 		 * index as a key for what to do and reassign the buffer based on the ring.
1578da5577f0SRobert Mustacchi */
1579da5577f0SRobert Mustacchi discard:
1580da5577f0SRobert Mustacchi rcb = rxd->rxd_work_list[cur_head];
1581da5577f0SRobert Mustacchi cur_desc->read.pkt_addr =
1582da5577f0SRobert Mustacchi CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
1583da5577f0SRobert Mustacchi cur_desc->read.hdr_addr = 0;
1584da5577f0SRobert Mustacchi
1585da5577f0SRobert Mustacchi /*
1586da5577f0SRobert Mustacchi * Finally, update our loop invariants.
1587da5577f0SRobert Mustacchi */
1588da5577f0SRobert Mustacchi cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
1589da5577f0SRobert Mustacchi cur_desc = &rxd->rxd_desc_ring[cur_head];
1590da5577f0SRobert Mustacchi stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1591da5577f0SRobert Mustacchi
1592da5577f0SRobert Mustacchi /*
1593da5577f0SRobert Mustacchi * To help provide liveness, we limit the amount of data that
1594da5577f0SRobert Mustacchi * we'll end up counting. Note that in these cases, an interrupt
1595da5577f0SRobert Mustacchi * is not dissimilar from a polling request.
1596da5577f0SRobert Mustacchi */
1597da5577f0SRobert Mustacchi rx_frames++;
1598da5577f0SRobert Mustacchi if (rx_frames > i40e->i40e_rx_limit_per_intr) {
1599da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
1600da5577f0SRobert Mustacchi break;
1601da5577f0SRobert Mustacchi }
1602da5577f0SRobert Mustacchi }
1603da5577f0SRobert Mustacchi
1604da5577f0SRobert Mustacchi /*
1605da5577f0SRobert Mustacchi * As we've modified the ring, we need to make sure that we sync the
1606da5577f0SRobert Mustacchi * descriptor ring for the device. Next, we update the hardware and
1607da5577f0SRobert Mustacchi * update our notion of where the head for us to read from hardware is
1608da5577f0SRobert Mustacchi * next.
1609da5577f0SRobert Mustacchi */
1610da5577f0SRobert Mustacchi I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
1611da5577f0SRobert Mustacchi if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1612da5577f0SRobert Mustacchi DDI_FM_OK) {
1613da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1614da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1615da5577f0SRobert Mustacchi }
1616da5577f0SRobert Mustacchi
1617da5577f0SRobert Mustacchi if (rx_frames != 0) {
1618da5577f0SRobert Mustacchi uint32_t tail;
1619da5577f0SRobert Mustacchi ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
1620da5577f0SRobert Mustacchi rxd->rxd_desc_next = cur_head;
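		/*
		 * The tail is set to the descriptor just behind the next one
		 * we'll process, i.e. the last descriptor we've refurbished
		 * and handed back to hardware.
		 */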
1621da5577f0SRobert Mustacchi tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
1622da5577f0SRobert Mustacchi
1623da5577f0SRobert Mustacchi I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
1624da5577f0SRobert Mustacchi if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
1625da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip,
1626da5577f0SRobert Mustacchi DDI_SERVICE_DEGRADED);
1627da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1628da5577f0SRobert Mustacchi }
1629da5577f0SRobert Mustacchi
1630da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
1631da5577f0SRobert Mustacchi itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
1632da5577f0SRobert Mustacchi }
1633da5577f0SRobert Mustacchi
1634da5577f0SRobert Mustacchi #ifdef DEBUG
1635da5577f0SRobert Mustacchi if (rx_frames == 0) {
1636da5577f0SRobert Mustacchi ASSERT(rx_bytes == 0);
1637da5577f0SRobert Mustacchi }
1638da5577f0SRobert Mustacchi #endif
1639da5577f0SRobert Mustacchi
1640da5577f0SRobert Mustacchi return (mp_head);
1641da5577f0SRobert Mustacchi }
1642da5577f0SRobert Mustacchi
1643da5577f0SRobert Mustacchi /*
1644da5577f0SRobert Mustacchi * This function is called by the GLDv3 when it wants to poll on a ring. The
1645da5577f0SRobert Mustacchi * only primary difference from when we call this during an interrupt is that we
1646da5577f0SRobert Mustacchi  * primary difference from when we call this during an interrupt is that we
1647da5577f0SRobert Mustacchi */
1648da5577f0SRobert Mustacchi mblk_t *
1649da5577f0SRobert Mustacchi i40e_ring_rx_poll(void *arg, int poll_bytes)
1650da5577f0SRobert Mustacchi {
1651da5577f0SRobert Mustacchi i40e_trqpair_t *itrq = arg;
1652da5577f0SRobert Mustacchi mblk_t *mp;
1653da5577f0SRobert Mustacchi
1654da5577f0SRobert Mustacchi ASSERT(poll_bytes > 0);
1655da5577f0SRobert Mustacchi if (poll_bytes == 0)
1656da5577f0SRobert Mustacchi return (NULL);
1657da5577f0SRobert Mustacchi
1658da5577f0SRobert Mustacchi mutex_enter(&itrq->itrq_rx_lock);
1659da5577f0SRobert Mustacchi mp = i40e_ring_rx(itrq, poll_bytes);
1660da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_rx_lock);
1661da5577f0SRobert Mustacchi
1662da5577f0SRobert Mustacchi return (mp);
1663da5577f0SRobert Mustacchi }
1664da5577f0SRobert Mustacchi
1665da5577f0SRobert Mustacchi /*
1666da5577f0SRobert Mustacchi * This is a structure I wish someone would fill out for me for dorking with the
1667da5577f0SRobert Mustacchi * checksums. When we get some more experience with this, we should go ahead and
1668da5577f0SRobert Mustacchi * consider adding this to MAC.
1669da5577f0SRobert Mustacchi */
1670da5577f0SRobert Mustacchi typedef enum mac_ether_offload_flags {
1671da5577f0SRobert Mustacchi MEOI_L2INFO_SET = 0x01,
1672da5577f0SRobert Mustacchi MEOI_VLAN_TAGGED = 0x02,
1673da5577f0SRobert Mustacchi MEOI_L3INFO_SET = 0x04,
1674da5577f0SRobert Mustacchi MEOI_L3CKSUM_SET = 0x08,
1675da5577f0SRobert Mustacchi MEOI_L4INFO_SET = 0x10,
1676da5577f0SRobert Mustacchi MEOI_L4CKSUM_SET = 0x20
1677da5577f0SRobert Mustacchi } mac_ether_offload_flags_t;
1678da5577f0SRobert Mustacchi
1679da5577f0SRobert Mustacchi typedef struct mac_ether_offload_info {
1680da5577f0SRobert Mustacchi mac_ether_offload_flags_t meoi_flags;
1681da5577f0SRobert Mustacchi uint8_t meoi_l2hlen; /* How long is the Ethernet header? */
1682da5577f0SRobert Mustacchi uint16_t meoi_l3proto; /* What's the Ethertype */
1683da5577f0SRobert Mustacchi uint8_t meoi_l3hlen; /* How long is the header? */
1684da5577f0SRobert Mustacchi uint8_t meoi_l4proto; /* What is the payload type? */
1685da5577f0SRobert Mustacchi uint8_t meoi_l4hlen; /* How long is the L4 header */
1686da5577f0SRobert Mustacchi mblk_t *meoi_l3ckmp; /* Which mblk has the l3 checksum */
1687da5577f0SRobert Mustacchi off_t meoi_l3ckoff; /* What's the offset to it */
1688da5577f0SRobert Mustacchi mblk_t *meoi_l4ckmp; /* Which mblk has the L4 checksum */
1689da5577f0SRobert Mustacchi off_t meoi_l4off; /* What is the offset to it? */
1690da5577f0SRobert Mustacchi } mac_ether_offload_info_t;
1691da5577f0SRobert Mustacchi
1692da5577f0SRobert Mustacchi /*
1693da5577f0SRobert Mustacchi * This is something that we'd like to make a general MAC function. Before we do
1694da5577f0SRobert Mustacchi * that, we should add support for TSO.
1695da5577f0SRobert Mustacchi *
1696da5577f0SRobert Mustacchi * We should really keep track of our offset and not walk everything every
1697da5577f0SRobert Mustacchi * time. I can't imagine that this will be kind to us at high packet rates;
1698da5577f0SRobert Mustacchi * however, for the moment, let's leave that.
1699da5577f0SRobert Mustacchi *
1700da5577f0SRobert Mustacchi * This walks a message block chain without pulling up to fill in the context
1701da5577f0SRobert Mustacchi * information. Note that the data we care about could be hidden across more
1702da5577f0SRobert Mustacchi * than one mblk_t.
1703da5577f0SRobert Mustacchi */
1704da5577f0SRobert Mustacchi static int
1705da5577f0SRobert Mustacchi i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1706da5577f0SRobert Mustacchi {
1707da5577f0SRobert Mustacchi size_t mpsize;
1708da5577f0SRobert Mustacchi uint8_t *bp;
1709da5577f0SRobert Mustacchi
1710da5577f0SRobert Mustacchi mpsize = msgsize(mp);
1711da5577f0SRobert Mustacchi /* Check for overflow */
1712da5577f0SRobert Mustacchi 	if (off + sizeof (uint8_t) > mpsize)
1713da5577f0SRobert Mustacchi return (-1);
1714da5577f0SRobert Mustacchi
1715da5577f0SRobert Mustacchi mpsize = MBLKL(mp);
1716da5577f0SRobert Mustacchi while (off >= mpsize) {
1717da5577f0SRobert Mustacchi mp = mp->b_cont;
1718da5577f0SRobert Mustacchi off -= mpsize;
1719da5577f0SRobert Mustacchi mpsize = MBLKL(mp);
1720da5577f0SRobert Mustacchi }
1721da5577f0SRobert Mustacchi
1722da5577f0SRobert Mustacchi bp = mp->b_rptr + off;
1723da5577f0SRobert Mustacchi *out = *bp;
1724da5577f0SRobert Mustacchi return (0);
1725da5577f0SRobert Mustacchi
1726da5577f0SRobert Mustacchi }
1727da5577f0SRobert Mustacchi
1728da5577f0SRobert Mustacchi static int
1729da5577f0SRobert Mustacchi i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1730da5577f0SRobert Mustacchi {
1731da5577f0SRobert Mustacchi size_t mpsize;
1732da5577f0SRobert Mustacchi uint8_t *bp;
1733da5577f0SRobert Mustacchi
1734da5577f0SRobert Mustacchi mpsize = msgsize(mp);
1735da5577f0SRobert Mustacchi /* Check for overflow */
1736da5577f0SRobert Mustacchi if (off + sizeof (uint16_t) > mpsize)
1737da5577f0SRobert Mustacchi return (-1);
1738da5577f0SRobert Mustacchi
1739da5577f0SRobert Mustacchi mpsize = MBLKL(mp);
1740da5577f0SRobert Mustacchi while (off >= mpsize) {
1741da5577f0SRobert Mustacchi mp = mp->b_cont;
1742da5577f0SRobert Mustacchi off -= mpsize;
1743da5577f0SRobert Mustacchi mpsize = MBLKL(mp);
1744da5577f0SRobert Mustacchi }
1745da5577f0SRobert Mustacchi
1746da5577f0SRobert Mustacchi /*
1747da5577f0SRobert Mustacchi * Data is in network order. Note the second byte of data might be in
1748da5577f0SRobert Mustacchi * the next mp.
1749da5577f0SRobert Mustacchi */
1750da5577f0SRobert Mustacchi bp = mp->b_rptr + off;
1751da5577f0SRobert Mustacchi *out = *bp << 8;
1752da5577f0SRobert Mustacchi if (off + 1 == mpsize) {
1753da5577f0SRobert Mustacchi mp = mp->b_cont;
1754da5577f0SRobert Mustacchi bp = mp->b_rptr;
1755da5577f0SRobert Mustacchi } else {
1756da5577f0SRobert Mustacchi bp++;
1757da5577f0SRobert Mustacchi }
1758da5577f0SRobert Mustacchi
1759da5577f0SRobert Mustacchi *out |= *bp;
1760da5577f0SRobert Mustacchi return (0);
1761da5577f0SRobert Mustacchi
1762da5577f0SRobert Mustacchi }
1763da5577f0SRobert Mustacchi
1764da5577f0SRobert Mustacchi static int
1765da5577f0SRobert Mustacchi mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1766da5577f0SRobert Mustacchi {
1767da5577f0SRobert Mustacchi size_t off;
1768da5577f0SRobert Mustacchi uint16_t ether;
1769da5577f0SRobert Mustacchi uint8_t ipproto, iplen, l4len, maclen;
1770da5577f0SRobert Mustacchi
1771da5577f0SRobert Mustacchi bzero(meoi, sizeof (mac_ether_offload_info_t));
1772da5577f0SRobert Mustacchi
1773da5577f0SRobert Mustacchi off = offsetof(struct ether_header, ether_type);
1774da5577f0SRobert Mustacchi if (i40e_meoi_get_uint16(mp, off, ðer) != 0)
1775da5577f0SRobert Mustacchi return (-1);
1776da5577f0SRobert Mustacchi
1777da5577f0SRobert Mustacchi if (ether == ETHERTYPE_VLAN) {
1778da5577f0SRobert Mustacchi off = offsetof(struct ether_vlan_header, ether_type);
1779da5577f0SRobert Mustacchi if (i40e_meoi_get_uint16(mp, off, ðer) != 0)
1780da5577f0SRobert Mustacchi return (-1);
1781da5577f0SRobert Mustacchi meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1782da5577f0SRobert Mustacchi maclen = sizeof (struct ether_vlan_header);
1783da5577f0SRobert Mustacchi } else {
1784da5577f0SRobert Mustacchi maclen = sizeof (struct ether_header);
1785da5577f0SRobert Mustacchi }
1786da5577f0SRobert Mustacchi meoi->meoi_flags |= MEOI_L2INFO_SET;
1787da5577f0SRobert Mustacchi meoi->meoi_l2hlen = maclen;
1788da5577f0SRobert Mustacchi meoi->meoi_l3proto = ether;
1789da5577f0SRobert Mustacchi
1790da5577f0SRobert Mustacchi switch (ether) {
1791da5577f0SRobert Mustacchi case ETHERTYPE_IP:
1792da5577f0SRobert Mustacchi /*
1793da5577f0SRobert Mustacchi * For IPv4 we need to get the length of the header, as it can
1794da5577f0SRobert Mustacchi * be variable.
1795da5577f0SRobert Mustacchi */
1796da5577f0SRobert Mustacchi off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1797da5577f0SRobert Mustacchi if (i40e_meoi_get_uint8(mp, off, &iplen) != 0)
1798da5577f0SRobert Mustacchi return (-1);
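		/*
		 * The low nibble of the version/header-length byte is the
		 * IHL, measured in 32-bit words, so convert it to bytes.
		 */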
1799da5577f0SRobert Mustacchi iplen &= 0x0f;
1800da5577f0SRobert Mustacchi if (iplen < 5 || iplen > 0x0f)
1801da5577f0SRobert Mustacchi return (-1);
1802da5577f0SRobert Mustacchi iplen *= 4;
1803da5577f0SRobert Mustacchi off = offsetof(ipha_t, ipha_protocol) + maclen;
1804da5577f0SRobert Mustacchi if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
1805da5577f0SRobert Mustacchi return (-1);
1806da5577f0SRobert Mustacchi break;
1807da5577f0SRobert Mustacchi case ETHERTYPE_IPV6:
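		/*
		 * Note that this only accounts for the fixed 40-byte IPv6
		 * header; extension headers are not walked here.
		 */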
1808da5577f0SRobert Mustacchi iplen = 40;
1809da5577f0SRobert Mustacchi off = offsetof(ip6_t, ip6_nxt) + maclen;
1810da5577f0SRobert Mustacchi if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
1811da5577f0SRobert Mustacchi return (-1);
1812da5577f0SRobert Mustacchi break;
1813da5577f0SRobert Mustacchi default:
1814da5577f0SRobert Mustacchi return (0);
1815da5577f0SRobert Mustacchi }
1816da5577f0SRobert Mustacchi meoi->meoi_l3hlen = iplen;
1817da5577f0SRobert Mustacchi meoi->meoi_l4proto = ipproto;
1818da5577f0SRobert Mustacchi meoi->meoi_flags |= MEOI_L3INFO_SET;
1819da5577f0SRobert Mustacchi
1820da5577f0SRobert Mustacchi switch (ipproto) {
1821da5577f0SRobert Mustacchi case IPPROTO_TCP:
1822da5577f0SRobert Mustacchi off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1823da5577f0SRobert Mustacchi if (i40e_meoi_get_uint8(mp, off, &l4len) == -1)
1824da5577f0SRobert Mustacchi return (-1);
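		/*
		 * The TCP data offset is the high nibble of this byte and is
		 * measured in 32-bit words, so convert it to bytes.
		 */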
1825da5577f0SRobert Mustacchi l4len = (l4len & 0xf0) >> 4;
1826da5577f0SRobert Mustacchi if (l4len < 5 || l4len > 0xf)
1827da5577f0SRobert Mustacchi return (-1);
1828da5577f0SRobert Mustacchi l4len *= 4;
1829da5577f0SRobert Mustacchi break;
1830da5577f0SRobert Mustacchi case IPPROTO_UDP:
1831da5577f0SRobert Mustacchi l4len = sizeof (struct udphdr);
1832da5577f0SRobert Mustacchi break;
1833da5577f0SRobert Mustacchi case IPPROTO_SCTP:
1834da5577f0SRobert Mustacchi l4len = sizeof (sctp_hdr_t);
1835da5577f0SRobert Mustacchi break;
1836da5577f0SRobert Mustacchi default:
1837da5577f0SRobert Mustacchi return (0);
1838da5577f0SRobert Mustacchi }
1839da5577f0SRobert Mustacchi
1840da5577f0SRobert Mustacchi meoi->meoi_l4hlen = l4len;
1841da5577f0SRobert Mustacchi meoi->meoi_flags |= MEOI_L4INFO_SET;
1842da5577f0SRobert Mustacchi return (0);
1843da5577f0SRobert Mustacchi }
1844da5577f0SRobert Mustacchi
1845da5577f0SRobert Mustacchi /*
1846da5577f0SRobert Mustacchi  * Attempt to put together the information we'll need to feed into a descriptor
1847da5577f0SRobert Mustacchi * to properly program the hardware for checksum offload as well as the
1848da5577f0SRobert Mustacchi * generally required flags.
1849da5577f0SRobert Mustacchi *
1850*8d5069bcSRyan Zezeski * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
1851*8d5069bcSRyan Zezeski * 'or' into the descriptor based on the checksum flags for this mblk_t and the
1852da5577f0SRobert Mustacchi * actual information we care about.
1853*8d5069bcSRyan Zezeski *
1854*8d5069bcSRyan Zezeski * If the mblk requires LSO then we'll also gather the information that will be
1855*8d5069bcSRyan Zezeski * used to construct the Transmit Context Descriptor.
1856da5577f0SRobert Mustacchi */
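/*
 * As an illustration of the encoding below (the frame is hypothetical):
 * an untagged IPv4/TCP packet with both HCK_IPV4_HDRCKSUM and
 * HCK_PARTIALCKSUM set, carrying 14/20/20-byte L2/L3/L4 headers, ends up
 * with
 *
 *	itc_data_cmdflags = I40E_TX_DESC_CMD_IIPT_IPV4_CSUM |
 *	    I40E_TX_DESC_CMD_L4T_EOFT_TCP
 *	itc_data_offsets = (14 >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT |
 *	    (20 >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT |
 *	    (20 >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT
 *
 * That is, the MAC header length is carried in 2-byte units and the IP and
 * L4 header lengths in 4-byte units, which is why the lengths are shifted
 * down before being merged into the offsets field.
 */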
1857da5577f0SRobert Mustacchi static int
1858da5577f0SRobert Mustacchi i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1859*8d5069bcSRyan Zezeski mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
1860da5577f0SRobert Mustacchi {
1861*8d5069bcSRyan Zezeski uint32_t chkflags, start, mss, lsoflags;
1862da5577f0SRobert Mustacchi i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1863da5577f0SRobert Mustacchi
1864da5577f0SRobert Mustacchi bzero(tctx, sizeof (i40e_tx_context_t));
1865da5577f0SRobert Mustacchi
1866da5577f0SRobert Mustacchi if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1867da5577f0SRobert Mustacchi return (0);
1868da5577f0SRobert Mustacchi
1869*8d5069bcSRyan Zezeski mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
1870*8d5069bcSRyan Zezeski mac_lso_get(mp, &mss, &lsoflags);
1871da5577f0SRobert Mustacchi
1872*8d5069bcSRyan Zezeski if (chkflags == 0 && lsoflags == 0)
1873*8d5069bcSRyan Zezeski return (0);
1874da5577f0SRobert Mustacchi
1875da5577f0SRobert Mustacchi /*
1876da5577f0SRobert Mustacchi 	 * Have we been asked to checksum an IPv4 header? If so, verify that we
1877da5577f0SRobert Mustacchi * have sufficient information and then set the proper fields in the
1878da5577f0SRobert Mustacchi * command structure.
1879da5577f0SRobert Mustacchi */
1880*8d5069bcSRyan Zezeski if (chkflags & HCK_IPV4_HDRCKSUM) {
1881*8d5069bcSRyan Zezeski if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1882da5577f0SRobert Mustacchi txs->itxs_hck_nol2info.value.ui64++;
1883da5577f0SRobert Mustacchi return (-1);
1884da5577f0SRobert Mustacchi }
1885*8d5069bcSRyan Zezeski if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1886da5577f0SRobert Mustacchi txs->itxs_hck_nol3info.value.ui64++;
1887da5577f0SRobert Mustacchi return (-1);
1888da5577f0SRobert Mustacchi }
1889*8d5069bcSRyan Zezeski if (meo->meoi_l3proto != ETHERTYPE_IP) {
1890da5577f0SRobert Mustacchi txs->itxs_hck_badl3.value.ui64++;
1891da5577f0SRobert Mustacchi return (-1);
1892da5577f0SRobert Mustacchi }
1893*8d5069bcSRyan Zezeski tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1894*8d5069bcSRyan Zezeski tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1895da5577f0SRobert Mustacchi I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1896*8d5069bcSRyan Zezeski tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1897da5577f0SRobert Mustacchi I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1898da5577f0SRobert Mustacchi }
1899da5577f0SRobert Mustacchi
1900da5577f0SRobert Mustacchi /*
1901da5577f0SRobert Mustacchi 	 * We've been asked to provide an L4 checksum. First, set up the IP
1902da5577f0SRobert Mustacchi 	 * information in the descriptor if we haven't already, before moving
1903da5577f0SRobert Mustacchi 	 * on to checking whether we have enough information for the L4
1904da5577f0SRobert Mustacchi 	 * checksum offload.
1905da5577f0SRobert Mustacchi */
1906*8d5069bcSRyan Zezeski if (chkflags & HCK_PARTIALCKSUM) {
1907*8d5069bcSRyan Zezeski if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
1908da5577f0SRobert Mustacchi txs->itxs_hck_nol4info.value.ui64++;
1909da5577f0SRobert Mustacchi return (-1);
1910da5577f0SRobert Mustacchi }
1911da5577f0SRobert Mustacchi
1912*8d5069bcSRyan Zezeski if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
1913*8d5069bcSRyan Zezeski if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1914da5577f0SRobert Mustacchi txs->itxs_hck_nol2info.value.ui64++;
1915da5577f0SRobert Mustacchi return (-1);
1916da5577f0SRobert Mustacchi }
1917*8d5069bcSRyan Zezeski if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1918da5577f0SRobert Mustacchi txs->itxs_hck_nol3info.value.ui64++;
1919da5577f0SRobert Mustacchi return (-1);
1920da5577f0SRobert Mustacchi }
1921da5577f0SRobert Mustacchi
1922*8d5069bcSRyan Zezeski if (meo->meoi_l3proto == ETHERTYPE_IP) {
1923*8d5069bcSRyan Zezeski tctx->itc_data_cmdflags |=
1924da5577f0SRobert Mustacchi I40E_TX_DESC_CMD_IIPT_IPV4;
1925*8d5069bcSRyan Zezeski } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
1926*8d5069bcSRyan Zezeski tctx->itc_data_cmdflags |=
1927da5577f0SRobert Mustacchi I40E_TX_DESC_CMD_IIPT_IPV6;
1928da5577f0SRobert Mustacchi } else {
1929da5577f0SRobert Mustacchi txs->itxs_hck_badl3.value.ui64++;
1930da5577f0SRobert Mustacchi return (-1);
1931da5577f0SRobert Mustacchi }
1932*8d5069bcSRyan Zezeski tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1933da5577f0SRobert Mustacchi I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1934*8d5069bcSRyan Zezeski tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1935da5577f0SRobert Mustacchi I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1936da5577f0SRobert Mustacchi }
1937da5577f0SRobert Mustacchi
1938*8d5069bcSRyan Zezeski switch (meo->meoi_l4proto) {
1939da5577f0SRobert Mustacchi case IPPROTO_TCP:
1940*8d5069bcSRyan Zezeski tctx->itc_data_cmdflags |=
1941*8d5069bcSRyan Zezeski I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1942da5577f0SRobert Mustacchi break;
1943da5577f0SRobert Mustacchi case IPPROTO_UDP:
1944*8d5069bcSRyan Zezeski tctx->itc_data_cmdflags |=
1945*8d5069bcSRyan Zezeski I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1946da5577f0SRobert Mustacchi break;
1947da5577f0SRobert Mustacchi case IPPROTO_SCTP:
1948*8d5069bcSRyan Zezeski tctx->itc_data_cmdflags |=
1949*8d5069bcSRyan Zezeski I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1950da5577f0SRobert Mustacchi break;
1951da5577f0SRobert Mustacchi default:
1952da5577f0SRobert Mustacchi txs->itxs_hck_badl4.value.ui64++;
1953da5577f0SRobert Mustacchi return (-1);
1954da5577f0SRobert Mustacchi }
1955da5577f0SRobert Mustacchi
1956*8d5069bcSRyan Zezeski tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
1957da5577f0SRobert Mustacchi I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1958da5577f0SRobert Mustacchi }
1959da5577f0SRobert Mustacchi
1960*8d5069bcSRyan Zezeski if (lsoflags & HW_LSO) {
1961*8d5069bcSRyan Zezeski /*
1962*8d5069bcSRyan Zezeski * LSO requires that checksum offloads are enabled. If for
1963*8d5069bcSRyan Zezeski * some reason they're not we bail out with an error.
1964*8d5069bcSRyan Zezeski */
1965*8d5069bcSRyan Zezeski if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 ||
1966*8d5069bcSRyan Zezeski (chkflags & HCK_PARTIALCKSUM) == 0) {
1967*8d5069bcSRyan Zezeski txs->itxs_lso_nohck.value.ui64++;
1968*8d5069bcSRyan Zezeski return (-1);
1969*8d5069bcSRyan Zezeski }
1970*8d5069bcSRyan Zezeski
1971*8d5069bcSRyan Zezeski tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
1972*8d5069bcSRyan Zezeski tctx->itc_ctx_mss = mss;
1973*8d5069bcSRyan Zezeski tctx->itc_ctx_tsolen = msgsize(mp) -
1974*8d5069bcSRyan Zezeski (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
1975*8d5069bcSRyan Zezeski }
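	/*
	 * As a hypothetical example of the math above: a 64240-byte LSO
	 * message with 14 + 20 + 20 bytes of headers ends up with
	 * itc_ctx_tsolen = 64240 - 54 = 64186, and the hardware will cut
	 * that payload into MSS-sized (e.g. 1460-byte) segments when it
	 * constructs the individual frames.
	 */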
1976*8d5069bcSRyan Zezeski
1977da5577f0SRobert Mustacchi return (0);
1978da5577f0SRobert Mustacchi }
1979da5577f0SRobert Mustacchi
1980da5577f0SRobert Mustacchi static void
1981da5577f0SRobert Mustacchi i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1982da5577f0SRobert Mustacchi {
1983da5577f0SRobert Mustacchi ASSERT(tcb != NULL);
1984da5577f0SRobert Mustacchi
1985da5577f0SRobert Mustacchi mutex_enter(&itrq->itrq_tcb_lock);
1986da5577f0SRobert Mustacchi ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1987da5577f0SRobert Mustacchi itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1988da5577f0SRobert Mustacchi itrq->itrq_tcb_free++;
1989da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tcb_lock);
1990da5577f0SRobert Mustacchi }
1991da5577f0SRobert Mustacchi
1992da5577f0SRobert Mustacchi static i40e_tx_control_block_t *
1993da5577f0SRobert Mustacchi i40e_tcb_alloc(i40e_trqpair_t *itrq)
1994da5577f0SRobert Mustacchi {
1995da5577f0SRobert Mustacchi i40e_tx_control_block_t *ret;
1996da5577f0SRobert Mustacchi
1997da5577f0SRobert Mustacchi mutex_enter(&itrq->itrq_tcb_lock);
1998da5577f0SRobert Mustacchi if (itrq->itrq_tcb_free == 0) {
1999da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tcb_lock);
2000da5577f0SRobert Mustacchi return (NULL);
2001da5577f0SRobert Mustacchi }
2002da5577f0SRobert Mustacchi
2003da5577f0SRobert Mustacchi itrq->itrq_tcb_free--;
2004da5577f0SRobert Mustacchi ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
2005da5577f0SRobert Mustacchi itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
2006da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tcb_lock);
2007da5577f0SRobert Mustacchi
2008da5577f0SRobert Mustacchi ASSERT(ret != NULL);
2009da5577f0SRobert Mustacchi return (ret);
2010da5577f0SRobert Mustacchi }
2011da5577f0SRobert Mustacchi
2012da5577f0SRobert Mustacchi /*
2013da5577f0SRobert Mustacchi * This should be used to free any DMA resources, associated mblk_t's, etc. It's
2014da5577f0SRobert Mustacchi * used as part of recycling the message blocks when we have either an interrupt
2015da5577f0SRobert Mustacchi * or other activity that indicates that we need to take a look.
2016da5577f0SRobert Mustacchi */
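/*
 * In short (see the switch below): a copy TCB just rewinds its pre-allocated
 * DMA buffer, while a DMA TCB unbinds whichever handle was used (the LSO
 * handle when tcb_used_lso is set, the regular one otherwise) and releases
 * the cached cookie array. In every case any attached mblk_t chain is freed
 * and the TCB is marked I40E_TX_NONE.
 */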
2017da5577f0SRobert Mustacchi static void
2018da5577f0SRobert Mustacchi i40e_tcb_reset(i40e_tx_control_block_t *tcb)
2019da5577f0SRobert Mustacchi {
2020da5577f0SRobert Mustacchi switch (tcb->tcb_type) {
2021da5577f0SRobert Mustacchi case I40E_TX_COPY:
2022da5577f0SRobert Mustacchi tcb->tcb_dma.dmab_len = 0;
2023da5577f0SRobert Mustacchi break;
2024da5577f0SRobert Mustacchi case I40E_TX_DMA:
2025*8d5069bcSRyan Zezeski if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
2026*8d5069bcSRyan Zezeski (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
2027*8d5069bcSRyan Zezeski else if (tcb->tcb_bind_ncookies > 0)
2028da5577f0SRobert Mustacchi (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
2029*8d5069bcSRyan Zezeski if (tcb->tcb_bind_info != NULL) {
2030*8d5069bcSRyan Zezeski kmem_free(tcb->tcb_bind_info,
2031*8d5069bcSRyan Zezeski tcb->tcb_bind_ncookies *
2032*8d5069bcSRyan Zezeski sizeof (struct i40e_dma_bind_info));
2033*8d5069bcSRyan Zezeski }
2034*8d5069bcSRyan Zezeski tcb->tcb_bind_info = NULL;
2035*8d5069bcSRyan Zezeski tcb->tcb_bind_ncookies = 0;
2036*8d5069bcSRyan Zezeski tcb->tcb_used_lso = B_FALSE;
2037*8d5069bcSRyan Zezeski break;
2038*8d5069bcSRyan Zezeski case I40E_TX_DESC:
2039da5577f0SRobert Mustacchi break;
2040da5577f0SRobert Mustacchi case I40E_TX_NONE:
2041da5577f0SRobert Mustacchi /* Cast to pacify lint */
2042da5577f0SRobert Mustacchi panic("trying to free tcb %p with bad type none", (void *)tcb);
2043da5577f0SRobert Mustacchi default:
2044da5577f0SRobert Mustacchi panic("unknown i40e tcb type: %d", tcb->tcb_type);
2045da5577f0SRobert Mustacchi }
2046da5577f0SRobert Mustacchi
2047da5577f0SRobert Mustacchi tcb->tcb_type = I40E_TX_NONE;
2048*8d5069bcSRyan Zezeski if (tcb->tcb_mp != NULL) {
2049da5577f0SRobert Mustacchi freemsg(tcb->tcb_mp);
2050da5577f0SRobert Mustacchi tcb->tcb_mp = NULL;
2051*8d5069bcSRyan Zezeski }
2052da5577f0SRobert Mustacchi tcb->tcb_next = NULL;
2053da5577f0SRobert Mustacchi }
2054da5577f0SRobert Mustacchi
2055da5577f0SRobert Mustacchi /*
2056da5577f0SRobert Mustacchi * This is called as part of shutting down to clean up all outstanding
2057da5577f0SRobert Mustacchi * descriptors. Similar to recycle, except we don't re-arm anything and instead
2058da5577f0SRobert Mustacchi * just return control blocks to the free list.
2059da5577f0SRobert Mustacchi */
2060da5577f0SRobert Mustacchi void
2061da5577f0SRobert Mustacchi i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
2062da5577f0SRobert Mustacchi {
2063da5577f0SRobert Mustacchi uint32_t index;
2064da5577f0SRobert Mustacchi
2065da5577f0SRobert Mustacchi ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2066da5577f0SRobert Mustacchi ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2067da5577f0SRobert Mustacchi
2068da5577f0SRobert Mustacchi /*
2069da5577f0SRobert Mustacchi * Because we should have shut down the chip at this point, it should be
2070da5577f0SRobert Mustacchi * safe to just clean up all the entries between our head and tail.
2071da5577f0SRobert Mustacchi */
2072da5577f0SRobert Mustacchi #ifdef DEBUG
2073da5577f0SRobert Mustacchi index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
2074da5577f0SRobert Mustacchi I40E_QTX_ENA(itrq->itrq_index));
2075da5577f0SRobert Mustacchi VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
2076da5577f0SRobert Mustacchi I40E_QTX_ENA_QENA_STAT_MASK));
2077da5577f0SRobert Mustacchi #endif
2078da5577f0SRobert Mustacchi
2079da5577f0SRobert Mustacchi index = itrq->itrq_desc_head;
2080da5577f0SRobert Mustacchi while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
2081da5577f0SRobert Mustacchi i40e_tx_control_block_t *tcb;
2082da5577f0SRobert Mustacchi
2083da5577f0SRobert Mustacchi tcb = itrq->itrq_tcb_work_list[index];
2084*8d5069bcSRyan Zezeski if (tcb != NULL) {
2085da5577f0SRobert Mustacchi itrq->itrq_tcb_work_list[index] = NULL;
2086da5577f0SRobert Mustacchi i40e_tcb_reset(tcb);
2087da5577f0SRobert Mustacchi i40e_tcb_free(itrq, tcb);
2088*8d5069bcSRyan Zezeski }
2089da5577f0SRobert Mustacchi
2090da5577f0SRobert Mustacchi bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
2091da5577f0SRobert Mustacchi index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
2092da5577f0SRobert Mustacchi itrq->itrq_desc_free++;
2093da5577f0SRobert Mustacchi }
2094da5577f0SRobert Mustacchi
2095da5577f0SRobert Mustacchi ASSERT(index == itrq->itrq_desc_tail);
2096da5577f0SRobert Mustacchi itrq->itrq_desc_head = index;
2097da5577f0SRobert Mustacchi }
2098da5577f0SRobert Mustacchi
2099da5577f0SRobert Mustacchi /*
2100da5577f0SRobert Mustacchi * We're here either by hook or by crook. We need to see if there are transmit
2101da5577f0SRobert Mustacchi * descriptors available for us to go and clean up and return to the hardware.
2102da5577f0SRobert Mustacchi * We may also be blocked, and if so, we should make sure that we let it know
2103da5577f0SRobert Mustacchi * we're good to go.
2104da5577f0SRobert Mustacchi */
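/*
 * As a hypothetical example of the accounting below: with a 1024-entry ring,
 * itrq_desc_head at 1000 and a write-back head of 8, the loop walks
 * descriptors 1000-1023 and then 0-7 (wrapping via i40e_next_desc), zeroing
 * each one, and ultimately returns 32 descriptors to itrq_desc_free.
 */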
2105da5577f0SRobert Mustacchi void
2106da5577f0SRobert Mustacchi i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
2107da5577f0SRobert Mustacchi {
2108da5577f0SRobert Mustacchi uint32_t wbhead, toclean, count;
2109da5577f0SRobert Mustacchi i40e_tx_control_block_t *tcbhead;
2110da5577f0SRobert Mustacchi i40e_t *i40e = itrq->itrq_i40e;
2111*8d5069bcSRyan Zezeski uint_t desc_per_tcb, i;
2112da5577f0SRobert Mustacchi
2113da5577f0SRobert Mustacchi mutex_enter(&itrq->itrq_tx_lock);
2114da5577f0SRobert Mustacchi
2115da5577f0SRobert Mustacchi ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2116da5577f0SRobert Mustacchi if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
2117da5577f0SRobert Mustacchi if (itrq->itrq_tx_blocked == B_TRUE) {
2118da5577f0SRobert Mustacchi itrq->itrq_tx_blocked = B_FALSE;
2119da5577f0SRobert Mustacchi mac_tx_ring_update(i40e->i40e_mac_hdl,
2120da5577f0SRobert Mustacchi itrq->itrq_mactxring);
2121da5577f0SRobert Mustacchi itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2122da5577f0SRobert Mustacchi }
2123da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tx_lock);
2124da5577f0SRobert Mustacchi return;
2125da5577f0SRobert Mustacchi }
2126da5577f0SRobert Mustacchi
2127da5577f0SRobert Mustacchi /*
2128da5577f0SRobert Mustacchi 	 * Now we need to see whether anything is available. The hardware
2129da5577f0SRobert Mustacchi 	 * writes the current head position back to this memory location and
2130da5577f0SRobert Mustacchi 	 * guarantees that it does not use relaxed ordering when doing so.
2131da5577f0SRobert Mustacchi */
2132da5577f0SRobert Mustacchi VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
2133da5577f0SRobert Mustacchi (uintptr_t)itrq->itrq_desc_wbhead,
2134da5577f0SRobert Mustacchi sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
2135da5577f0SRobert Mustacchi
2136da5577f0SRobert Mustacchi if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
2137da5577f0SRobert Mustacchi DDI_FM_OK) {
2138da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tx_lock);
2139da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2140da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2141da5577f0SRobert Mustacchi return;
2142da5577f0SRobert Mustacchi }
2143da5577f0SRobert Mustacchi
2144da5577f0SRobert Mustacchi wbhead = *itrq->itrq_desc_wbhead;
2145da5577f0SRobert Mustacchi toclean = itrq->itrq_desc_head;
2146da5577f0SRobert Mustacchi count = 0;
2147da5577f0SRobert Mustacchi tcbhead = NULL;
2148da5577f0SRobert Mustacchi
2149da5577f0SRobert Mustacchi while (toclean != wbhead) {
2150da5577f0SRobert Mustacchi i40e_tx_control_block_t *tcb;
2151da5577f0SRobert Mustacchi
2152da5577f0SRobert Mustacchi tcb = itrq->itrq_tcb_work_list[toclean];
2153da5577f0SRobert Mustacchi itrq->itrq_tcb_work_list[toclean] = NULL;
2154da5577f0SRobert Mustacchi ASSERT(tcb != NULL);
2155da5577f0SRobert Mustacchi tcb->tcb_next = tcbhead;
2156da5577f0SRobert Mustacchi tcbhead = tcb;
2157da5577f0SRobert Mustacchi
2158da5577f0SRobert Mustacchi /*
2159*8d5069bcSRyan Zezeski * In the DMA bind case, there may not necessarily be a 1:1
2160*8d5069bcSRyan Zezeski * mapping between tcb's and descriptors. If the tcb type
2161*8d5069bcSRyan Zezeski * indicates a DMA binding then check the number of DMA
2162*8d5069bcSRyan Zezeski * cookies to determine how many entries to clean in the
2163*8d5069bcSRyan Zezeski * descriptor ring.
2164*8d5069bcSRyan Zezeski */
2165*8d5069bcSRyan Zezeski if (tcb->tcb_type == I40E_TX_DMA)
2166*8d5069bcSRyan Zezeski desc_per_tcb = tcb->tcb_bind_ncookies;
2167*8d5069bcSRyan Zezeski else
2168*8d5069bcSRyan Zezeski desc_per_tcb = 1;
2169*8d5069bcSRyan Zezeski
2170*8d5069bcSRyan Zezeski for (i = 0; i < desc_per_tcb; i++) {
2171*8d5069bcSRyan Zezeski /*
2172da5577f0SRobert Mustacchi * We zero this out for sanity purposes.
2173da5577f0SRobert Mustacchi */
2174*8d5069bcSRyan Zezeski bzero(&itrq->itrq_desc_ring[toclean],
2175*8d5069bcSRyan Zezeski sizeof (i40e_tx_desc_t));
2176*8d5069bcSRyan Zezeski toclean = i40e_next_desc(toclean, 1,
2177*8d5069bcSRyan Zezeski itrq->itrq_tx_ring_size);
2178da5577f0SRobert Mustacchi count++;
2179da5577f0SRobert Mustacchi }
2180*8d5069bcSRyan Zezeski }
2181da5577f0SRobert Mustacchi
2182da5577f0SRobert Mustacchi itrq->itrq_desc_head = wbhead;
2183da5577f0SRobert Mustacchi itrq->itrq_desc_free += count;
2184da5577f0SRobert Mustacchi itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2185da5577f0SRobert Mustacchi ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2186da5577f0SRobert Mustacchi
2187da5577f0SRobert Mustacchi if (itrq->itrq_tx_blocked == B_TRUE &&
2188da5577f0SRobert Mustacchi itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2189da5577f0SRobert Mustacchi itrq->itrq_tx_blocked = B_FALSE;
2190da5577f0SRobert Mustacchi
2191da5577f0SRobert Mustacchi mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2192da5577f0SRobert Mustacchi itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2193da5577f0SRobert Mustacchi }
2194da5577f0SRobert Mustacchi
2195da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tx_lock);
2196da5577f0SRobert Mustacchi
2197da5577f0SRobert Mustacchi /*
2198da5577f0SRobert Mustacchi * Now clean up the tcb.
2199da5577f0SRobert Mustacchi */
2200da5577f0SRobert Mustacchi while (tcbhead != NULL) {
2201da5577f0SRobert Mustacchi i40e_tx_control_block_t *tcb = tcbhead;
2202da5577f0SRobert Mustacchi
2203da5577f0SRobert Mustacchi tcbhead = tcb->tcb_next;
2204da5577f0SRobert Mustacchi i40e_tcb_reset(tcb);
2205da5577f0SRobert Mustacchi i40e_tcb_free(itrq, tcb);
2206da5577f0SRobert Mustacchi }
2207da5577f0SRobert Mustacchi
2208da5577f0SRobert Mustacchi DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2209da5577f0SRobert Mustacchi }
2210da5577f0SRobert Mustacchi
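/*
 * Copy 'len' bytes of the fragment 'mp', starting at byte offset 'off', into
 * the TCB's pre-allocated DMA buffer, appending to whatever it already holds.
 * The caller must ensure the data fits (see the ASSERTs below); the TCB is
 * marked as a copy TCB and the buffer is synced for the device.
 */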
2211*8d5069bcSRyan Zezeski static void
2212*8d5069bcSRyan Zezeski i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
2213*8d5069bcSRyan Zezeski const size_t off, const size_t len)
2214*8d5069bcSRyan Zezeski {
2215*8d5069bcSRyan Zezeski const void *soff = mp->b_rptr + off;
2216*8d5069bcSRyan Zezeski void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2217*8d5069bcSRyan Zezeski
2218*8d5069bcSRyan Zezeski ASSERT3U(len, >, 0);
2219*8d5069bcSRyan Zezeski ASSERT3P(soff, >=, mp->b_rptr);
2220*8d5069bcSRyan Zezeski ASSERT3P(soff, <=, mp->b_wptr);
2221*8d5069bcSRyan Zezeski ASSERT3U(len, <=, MBLKL(mp));
2222*8d5069bcSRyan Zezeski ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
2223*8d5069bcSRyan Zezeski ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
2224*8d5069bcSRyan Zezeski bcopy(soff, doff, len);
2225*8d5069bcSRyan Zezeski tcb->tcb_type = I40E_TX_COPY;
2226*8d5069bcSRyan Zezeski tcb->tcb_dma.dmab_len += len;
2227*8d5069bcSRyan Zezeski I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2228*8d5069bcSRyan Zezeski }
2229*8d5069bcSRyan Zezeski
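/*
 * Allocate a TCB and DMA bind the remainder of 'mp' (from 'off' through
 * b_wptr), using the dedicated LSO handle when 'use_lso' is set. Each cookie
 * returned by the bind is recorded in tcb_bind_info so the caller knows how
 * many data descriptors this fragment will consume. On any failure the TCB
 * is returned to the free list and NULL is returned.
 */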
2230*8d5069bcSRyan Zezeski static i40e_tx_control_block_t *
2231*8d5069bcSRyan Zezeski i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
2232*8d5069bcSRyan Zezeski size_t off, boolean_t use_lso)
2233*8d5069bcSRyan Zezeski {
2234*8d5069bcSRyan Zezeski ddi_dma_handle_t dma_handle;
2235*8d5069bcSRyan Zezeski ddi_dma_cookie_t dma_cookie;
2236*8d5069bcSRyan Zezeski uint_t i = 0, ncookies = 0, dmaflags;
2237*8d5069bcSRyan Zezeski i40e_tx_control_block_t *tcb;
2238*8d5069bcSRyan Zezeski i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2239*8d5069bcSRyan Zezeski
2240*8d5069bcSRyan Zezeski if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2241*8d5069bcSRyan Zezeski txs->itxs_err_notcb.value.ui64++;
2242*8d5069bcSRyan Zezeski return (NULL);
2243*8d5069bcSRyan Zezeski }
2244*8d5069bcSRyan Zezeski tcb->tcb_type = I40E_TX_DMA;
2245*8d5069bcSRyan Zezeski
2246*8d5069bcSRyan Zezeski if (use_lso == B_TRUE)
2247*8d5069bcSRyan Zezeski dma_handle = tcb->tcb_lso_dma_handle;
2248*8d5069bcSRyan Zezeski else
2249*8d5069bcSRyan Zezeski dma_handle = tcb->tcb_dma_handle;
2250*8d5069bcSRyan Zezeski
2251*8d5069bcSRyan Zezeski dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
2252*8d5069bcSRyan Zezeski if (ddi_dma_addr_bind_handle(dma_handle, NULL,
2253*8d5069bcSRyan Zezeski (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
2254*8d5069bcSRyan Zezeski DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
2255*8d5069bcSRyan Zezeski txs->itxs_bind_fails.value.ui64++;
2256*8d5069bcSRyan Zezeski goto bffail;
2257*8d5069bcSRyan Zezeski }
2258*8d5069bcSRyan Zezeski
2259*8d5069bcSRyan Zezeski tcb->tcb_bind_ncookies = ncookies;
2260*8d5069bcSRyan Zezeski tcb->tcb_used_lso = use_lso;
2261*8d5069bcSRyan Zezeski
2262*8d5069bcSRyan Zezeski tcb->tcb_bind_info =
2263*8d5069bcSRyan Zezeski kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
2264*8d5069bcSRyan Zezeski KM_NOSLEEP);
2265*8d5069bcSRyan Zezeski if (tcb->tcb_bind_info == NULL)
2266*8d5069bcSRyan Zezeski goto bffail;
2267*8d5069bcSRyan Zezeski
2268*8d5069bcSRyan Zezeski while (i < ncookies) {
2269*8d5069bcSRyan Zezeski if (i > 0)
2270*8d5069bcSRyan Zezeski ddi_dma_nextcookie(dma_handle, &dma_cookie);
2271*8d5069bcSRyan Zezeski
2272*8d5069bcSRyan Zezeski tcb->tcb_bind_info[i].dbi_paddr =
2273*8d5069bcSRyan Zezeski (caddr_t)dma_cookie.dmac_laddress;
2274*8d5069bcSRyan Zezeski tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
2275*8d5069bcSRyan Zezeski }
2276*8d5069bcSRyan Zezeski
2277*8d5069bcSRyan Zezeski return (tcb);
2278*8d5069bcSRyan Zezeski
2279*8d5069bcSRyan Zezeski bffail:
2280*8d5069bcSRyan Zezeski i40e_tcb_reset(tcb);
2281*8d5069bcSRyan Zezeski i40e_tcb_free(itrq, tcb);
2282*8d5069bcSRyan Zezeski return (NULL);
2283*8d5069bcSRyan Zezeski }
2284*8d5069bcSRyan Zezeski
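/*
 * Consume the next free descriptor in the ring and fill it in as a data
 * descriptor pointing at 'buff'/'len', along with the command and offset bits
 * computed earlier in the Tx context. When 'last_desc' is set, EOP and RS are
 * added so the hardware knows the frame is complete and reports back once it
 * has finished DMA'ing it.
 */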
2285*8d5069bcSRyan Zezeski static void
2286*8d5069bcSRyan Zezeski i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
2287*8d5069bcSRyan Zezeski caddr_t buff, size_t len, boolean_t last_desc)
2288*8d5069bcSRyan Zezeski {
2289*8d5069bcSRyan Zezeski i40e_tx_desc_t *txdesc;
2290*8d5069bcSRyan Zezeski int cmd;
2291*8d5069bcSRyan Zezeski
2292*8d5069bcSRyan Zezeski ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2293*8d5069bcSRyan Zezeski itrq->itrq_desc_free--;
2294*8d5069bcSRyan Zezeski txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2295*8d5069bcSRyan Zezeski itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2296*8d5069bcSRyan Zezeski itrq->itrq_tx_ring_size);
2297*8d5069bcSRyan Zezeski
2298*8d5069bcSRyan Zezeski cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
2299*8d5069bcSRyan Zezeski
2300*8d5069bcSRyan Zezeski /*
2301*8d5069bcSRyan Zezeski * The last data descriptor needs the EOP bit set, so that the HW knows
2302*8d5069bcSRyan Zezeski * that we're ready to send. Additionally, we set the RS (Report
2303*8d5069bcSRyan Zezeski * Status) bit, so that we are notified when the transmit engine has
2304*8d5069bcSRyan Zezeski * completed DMA'ing all of the data descriptors and data buffers
2305*8d5069bcSRyan Zezeski * associated with this frame.
2306*8d5069bcSRyan Zezeski */
2307*8d5069bcSRyan Zezeski if (last_desc == B_TRUE) {
2308*8d5069bcSRyan Zezeski cmd |= I40E_TX_DESC_CMD_EOP;
2309*8d5069bcSRyan Zezeski cmd |= I40E_TX_DESC_CMD_RS;
2310*8d5069bcSRyan Zezeski }
2311*8d5069bcSRyan Zezeski
2312*8d5069bcSRyan Zezeski /*
2313*8d5069bcSRyan Zezeski * Per the X710 manual, section 8.4.2.1.1, the buffer size
2314*8d5069bcSRyan Zezeski * must be a value from 1 to 16K minus 1, inclusive.
2315*8d5069bcSRyan Zezeski */
2316*8d5069bcSRyan Zezeski ASSERT3U(len, >=, 1);
2317*8d5069bcSRyan Zezeski ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ);
2318*8d5069bcSRyan Zezeski
2319*8d5069bcSRyan Zezeski txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
2320*8d5069bcSRyan Zezeski txdesc->cmd_type_offset_bsz =
2321*8d5069bcSRyan Zezeski LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
2322*8d5069bcSRyan Zezeski ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2323*8d5069bcSRyan Zezeski ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2324*8d5069bcSRyan Zezeski ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2325*8d5069bcSRyan Zezeski }
2326*8d5069bcSRyan Zezeski
2327*8d5069bcSRyan Zezeski /*
2328*8d5069bcSRyan Zezeski * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
2329*8d5069bcSRyan Zezeski */
2330*8d5069bcSRyan Zezeski static inline void
2331*8d5069bcSRyan Zezeski tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
2332*8d5069bcSRyan Zezeski i40e_tx_control_block_t *tcb)
2333*8d5069bcSRyan Zezeski {
2334*8d5069bcSRyan Zezeski if (*head == NULL) {
2335*8d5069bcSRyan Zezeski *head = tcb;
2336*8d5069bcSRyan Zezeski *tail = *head;
2337*8d5069bcSRyan Zezeski } else {
2338*8d5069bcSRyan Zezeski ASSERT3P(*tail, !=, NULL);
2339*8d5069bcSRyan Zezeski ASSERT3P((*tail)->tcb_next, ==, NULL);
2340*8d5069bcSRyan Zezeski (*tail)->tcb_next = tcb;
2341*8d5069bcSRyan Zezeski *tail = tcb;
2342*8d5069bcSRyan Zezeski }
2343*8d5069bcSRyan Zezeski }
2344*8d5069bcSRyan Zezeski
2345*8d5069bcSRyan Zezeski /*
2346*8d5069bcSRyan Zezeski * This function takes a single packet, possibly consisting of
2347*8d5069bcSRyan Zezeski * multiple mblks, and creates a TCB chain to send to the controller.
2348*8d5069bcSRyan Zezeski * This TCB chain may span up to a maximum of 8 descriptors. A copy
2349*8d5069bcSRyan Zezeski * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
2350*8d5069bcSRyan Zezeski  * more, depending on several factors. For each fragment (individual
2351*8d5069bcSRyan Zezeski * mblk making up the packet), we determine if its size dictates a
2352*8d5069bcSRyan Zezeski * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
2353*8d5069bcSRyan Zezeski * count of descriptors used; when that count reaches the max we force
2354*8d5069bcSRyan Zezeski * all remaining fragments into a single TCB buffer. We have a
2355*8d5069bcSRyan Zezeski * guarantee that the TCB buffer is always larger than the MTU -- so
2356*8d5069bcSRyan Zezeski * there is always enough room. Consecutive fragments below the DMA
2357*8d5069bcSRyan Zezeski * threshold are copied into a single TCB. In the event of an error
2358*8d5069bcSRyan Zezeski * this function returns NULL but leaves 'mp' alone.
2359*8d5069bcSRyan Zezeski */
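/*
 * For illustration (a hypothetical packet, not a constraint of the code):
 * given a chain of three mblks of 14, 1448 and 60 bytes and a DMA-bind
 * threshold of 256 bytes, the loop below produces a copy TCB for the 14-byte
 * fragment, a DMA TCB for the 1448-byte fragment (one descriptor per cookie)
 * and a second copy TCB for the 60-byte tail -- i.e. two descriptors plus
 * however many cookies the bind produced.
 */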
2360*8d5069bcSRyan Zezeski static i40e_tx_control_block_t *
2361*8d5069bcSRyan Zezeski i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
2362*8d5069bcSRyan Zezeski {
2363*8d5069bcSRyan Zezeski const mblk_t *nmp = mp;
2364*8d5069bcSRyan Zezeski uint_t needed_desc = 0;
2365*8d5069bcSRyan Zezeski boolean_t force_copy = B_FALSE;
2366*8d5069bcSRyan Zezeski i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2367*8d5069bcSRyan Zezeski i40e_t *i40e = itrq->itrq_i40e;
2368*8d5069bcSRyan Zezeski i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2369*8d5069bcSRyan Zezeski
2370*8d5069bcSRyan Zezeski /* TCB buffer is always larger than MTU. */
2371*8d5069bcSRyan Zezeski ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
2372*8d5069bcSRyan Zezeski
2373*8d5069bcSRyan Zezeski while (nmp != NULL) {
2374*8d5069bcSRyan Zezeski const size_t nmp_len = MBLKL(nmp);
2375*8d5069bcSRyan Zezeski
2376*8d5069bcSRyan Zezeski /* Ignore zero-length mblks. */
2377*8d5069bcSRyan Zezeski if (nmp_len == 0) {
2378*8d5069bcSRyan Zezeski nmp = nmp->b_cont;
2379*8d5069bcSRyan Zezeski continue;
2380*8d5069bcSRyan Zezeski }
2381*8d5069bcSRyan Zezeski
2382*8d5069bcSRyan Zezeski if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
2383*8d5069bcSRyan Zezeski /* Compress consecutive copies into one TCB. */
2384*8d5069bcSRyan Zezeski if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
2385*8d5069bcSRyan Zezeski i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2386*8d5069bcSRyan Zezeski nmp = nmp->b_cont;
2387*8d5069bcSRyan Zezeski continue;
2388*8d5069bcSRyan Zezeski }
2389*8d5069bcSRyan Zezeski
2390*8d5069bcSRyan Zezeski if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2391*8d5069bcSRyan Zezeski txs->itxs_err_notcb.value.ui64++;
2392*8d5069bcSRyan Zezeski goto fail;
2393*8d5069bcSRyan Zezeski }
2394*8d5069bcSRyan Zezeski
2395*8d5069bcSRyan Zezeski /*
2396*8d5069bcSRyan Zezeski * TCB DMA buffer is guaranteed to be one
2397*8d5069bcSRyan Zezeski * cookie by i40e_alloc_dma_buffer().
2398*8d5069bcSRyan Zezeski */
2399*8d5069bcSRyan Zezeski i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2400*8d5069bcSRyan Zezeski needed_desc++;
2401*8d5069bcSRyan Zezeski tcb_list_append(&tcbhead, &tcbtail, tcb);
2402*8d5069bcSRyan Zezeski } else {
2403*8d5069bcSRyan Zezeski uint_t total_desc;
2404*8d5069bcSRyan Zezeski
2405*8d5069bcSRyan Zezeski tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
2406*8d5069bcSRyan Zezeski if (tcb == NULL) {
2407*8d5069bcSRyan Zezeski i40e_error(i40e, "dma bind failed!");
2408*8d5069bcSRyan Zezeski goto fail;
2409*8d5069bcSRyan Zezeski }
2410*8d5069bcSRyan Zezeski
2411*8d5069bcSRyan Zezeski /*
2412*8d5069bcSRyan Zezeski * If the new total exceeds the max or we've
2413*8d5069bcSRyan Zezeski * reached the limit and there's data left,
2414*8d5069bcSRyan Zezeski * then give up binding and copy the rest into
2415*8d5069bcSRyan Zezeski * the pre-allocated TCB buffer.
2416*8d5069bcSRyan Zezeski */
2417*8d5069bcSRyan Zezeski total_desc = needed_desc + tcb->tcb_bind_ncookies;
2418*8d5069bcSRyan Zezeski if ((total_desc > I40E_TX_MAX_COOKIE) ||
2419*8d5069bcSRyan Zezeski (total_desc == I40E_TX_MAX_COOKIE &&
2420*8d5069bcSRyan Zezeski nmp->b_cont != NULL)) {
2421*8d5069bcSRyan Zezeski i40e_tcb_reset(tcb);
2422*8d5069bcSRyan Zezeski i40e_tcb_free(itrq, tcb);
2423*8d5069bcSRyan Zezeski
2424*8d5069bcSRyan Zezeski if (tcbtail != NULL &&
2425*8d5069bcSRyan Zezeski tcbtail->tcb_type == I40E_TX_COPY) {
2426*8d5069bcSRyan Zezeski tcb = tcbtail;
2427*8d5069bcSRyan Zezeski } else {
2428*8d5069bcSRyan Zezeski tcb = NULL;
2429*8d5069bcSRyan Zezeski }
2430*8d5069bcSRyan Zezeski
2431*8d5069bcSRyan Zezeski force_copy = B_TRUE;
2432*8d5069bcSRyan Zezeski txs->itxs_force_copy.value.ui64++;
2433*8d5069bcSRyan Zezeski continue;
2434*8d5069bcSRyan Zezeski }
2435*8d5069bcSRyan Zezeski
2436*8d5069bcSRyan Zezeski needed_desc += tcb->tcb_bind_ncookies;
2437*8d5069bcSRyan Zezeski tcb_list_append(&tcbhead, &tcbtail, tcb);
2438*8d5069bcSRyan Zezeski }
2439*8d5069bcSRyan Zezeski
2440*8d5069bcSRyan Zezeski nmp = nmp->b_cont;
2441*8d5069bcSRyan Zezeski }
2442*8d5069bcSRyan Zezeski
2443*8d5069bcSRyan Zezeski ASSERT3P(nmp, ==, NULL);
2444*8d5069bcSRyan Zezeski ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
2445*8d5069bcSRyan Zezeski ASSERT3P(tcbhead, !=, NULL);
2446*8d5069bcSRyan Zezeski *ndesc += needed_desc;
2447*8d5069bcSRyan Zezeski return (tcbhead);
2448*8d5069bcSRyan Zezeski
2449*8d5069bcSRyan Zezeski fail:
2450*8d5069bcSRyan Zezeski tcb = tcbhead;
2451*8d5069bcSRyan Zezeski while (tcb != NULL) {
2452*8d5069bcSRyan Zezeski i40e_tx_control_block_t *next = tcb->tcb_next;
2453*8d5069bcSRyan Zezeski
2454*8d5069bcSRyan Zezeski ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2455*8d5069bcSRyan Zezeski tcb->tcb_type == I40E_TX_COPY);
2456*8d5069bcSRyan Zezeski
2457*8d5069bcSRyan Zezeski tcb->tcb_mp = NULL;
2458*8d5069bcSRyan Zezeski i40e_tcb_reset(tcb);
2459*8d5069bcSRyan Zezeski i40e_tcb_free(itrq, tcb);
2460*8d5069bcSRyan Zezeski tcb = next;
2461*8d5069bcSRyan Zezeski }
2462*8d5069bcSRyan Zezeski
2463*8d5069bcSRyan Zezeski return (NULL);
2464*8d5069bcSRyan Zezeski }
2465*8d5069bcSRyan Zezeski
2466*8d5069bcSRyan Zezeski /*
2467*8d5069bcSRyan Zezeski * Section 8.4.1 of the 700-series programming guide states that a
2468*8d5069bcSRyan Zezeski * segment may span up to 8 data descriptors; including both header
2469*8d5069bcSRyan Zezeski * and payload data. However, empirical evidence shows that the
2470*8d5069bcSRyan Zezeski * controller freezes the Tx queue when presented with a segment of 8
2471*8d5069bcSRyan Zezeski * descriptors. Or, at least, when the first segment contains 8
2472*8d5069bcSRyan Zezeski * descriptors. One explanation is that the controller counts the
2473*8d5069bcSRyan Zezeski * context descriptor against the first segment, even though the
2474*8d5069bcSRyan Zezeski * programming guide makes no mention of such a constraint. In any
2475*8d5069bcSRyan Zezeski * case, we limit TSO segments to 7 descriptors to prevent Tx queue
2476*8d5069bcSRyan Zezeski * freezes. We still allow non-TSO segments to utilize all 8
2477*8d5069bcSRyan Zezeski * descriptors as they have not demonstrated the faulty behavior.
2478*8d5069bcSRyan Zezeski */
2479*8d5069bcSRyan Zezeski uint_t i40e_lso_num_descs = 7;
2480*8d5069bcSRyan Zezeski
2481*8d5069bcSRyan Zezeski #define I40E_TCB_LEFT(tcb) \
2482*8d5069bcSRyan Zezeski ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
2483*8d5069bcSRyan Zezeski
2484*8d5069bcSRyan Zezeski /*
2485*8d5069bcSRyan Zezeski * This function is similar in spirit to i40e_non_lso_chain(), but
2486*8d5069bcSRyan Zezeski * much more complicated in reality. Like the previous function, it
2487*8d5069bcSRyan Zezeski * takes a packet (an LSO packet) as input and returns a chain of
2488*8d5069bcSRyan Zezeski * TCBs. The complication comes with the fact that we are no longer
2489*8d5069bcSRyan Zezeski * trying to fit the entire packet into 8 descriptors, but rather we
2490*8d5069bcSRyan Zezeski * must fit each MSS-size segment of the LSO packet into 8 descriptors.
2491*8d5069bcSRyan Zezeski * Except it's really 7 descriptors, see i40e_lso_num_descs.
2492*8d5069bcSRyan Zezeski *
2493*8d5069bcSRyan Zezeski * Your first inclination might be to verify that a given segment
2494*8d5069bcSRyan Zezeski * spans no more than 7 mblks; but it's actually much more subtle than
2495*8d5069bcSRyan Zezeski * that. First, let's describe what the hardware expects, and then we
2496*8d5069bcSRyan Zezeski * can expound on the software side of things.
2497*8d5069bcSRyan Zezeski *
2498*8d5069bcSRyan Zezeski * For an LSO packet the hardware expects the following:
2499*8d5069bcSRyan Zezeski *
2500*8d5069bcSRyan Zezeski * o Each MSS-sized segment must span no more than 7 descriptors.
2501*8d5069bcSRyan Zezeski *
2502*8d5069bcSRyan Zezeski * o The header size does not count towards the segment size.
2503*8d5069bcSRyan Zezeski *
2504*8d5069bcSRyan Zezeski * o If header and payload share the first descriptor, then the
2505*8d5069bcSRyan Zezeski * controller will count the descriptor twice.
2506*8d5069bcSRyan Zezeski *
2507*8d5069bcSRyan Zezeski * The most important thing to keep in mind is that the hardware does
2508*8d5069bcSRyan Zezeski * not view the segments in terms of mblks, like we do. The hardware
2509*8d5069bcSRyan Zezeski * only sees descriptors. It will iterate each descriptor in turn,
2510*8d5069bcSRyan Zezeski * keeping a tally of bytes seen and descriptors visited. If the byte
2511*8d5069bcSRyan Zezeski * count hasn't reached MSS by the time the descriptor count reaches
2512*8d5069bcSRyan Zezeski * 7, then the controller freezes the queue and we are stuck.
2513*8d5069bcSRyan Zezeski * Furthermore, the hardware picks up its tally where it left off. So
2514*8d5069bcSRyan Zezeski * if it reached MSS in the middle of a descriptor, it will start
2515*8d5069bcSRyan Zezeski * tallying the next segment in the middle of that descriptor. The
2516*8d5069bcSRyan Zezeski * hardware's view is entirely removed from the mblk chain or even the
2517*8d5069bcSRyan Zezeski * descriptor layout. Consider these facts:
2518*8d5069bcSRyan Zezeski *
2519*8d5069bcSRyan Zezeski  * o The MSS will vary depending on MTU and other factors.
2520*8d5069bcSRyan Zezeski *
2521*8d5069bcSRyan Zezeski * o The dblk allocation will sit at various offsets within a
2522*8d5069bcSRyan Zezeski * memory page.
2523*8d5069bcSRyan Zezeski *
2524*8d5069bcSRyan Zezeski * o The page size itself could vary in the future (i.e. not
2525*8d5069bcSRyan Zezeski * always 4K).
2526*8d5069bcSRyan Zezeski *
2527*8d5069bcSRyan Zezeski * o Just because a dblk is virtually contiguous doesn't mean
2528*8d5069bcSRyan Zezeski * it's physically contiguous. The number of cookies
2529*8d5069bcSRyan Zezeski * (descriptors) required by a DMA bind of a single dblk is at
2530*8d5069bcSRyan Zezeski * the mercy of the page size and physical layout.
2531*8d5069bcSRyan Zezeski *
2532*8d5069bcSRyan Zezeski * o The descriptors will most often NOT start/end on a MSS
2533*8d5069bcSRyan Zezeski * boundary. Thus the hardware will often start counting the
2534*8d5069bcSRyan Zezeski * MSS mid descriptor and finish mid descriptor.
2535*8d5069bcSRyan Zezeski *
2536*8d5069bcSRyan Zezeski * The upshot of all this is that the driver must learn to think like
2537*8d5069bcSRyan Zezeski * the controller; and verify that none of the constraints are broken.
2538*8d5069bcSRyan Zezeski * It does this by tallying up the segment just like the hardware
2539*8d5069bcSRyan Zezeski * would. This is handled by the two variables 'segsz' and 'segdesc'.
2540*8d5069bcSRyan Zezeski  * After each attempt to bind a dblk, we check the constraints. If
2541*8d5069bcSRyan Zezeski * violated, we undo the DMA and force a copy until MSS is met. We
2542*8d5069bcSRyan Zezeski * have a guarantee that the TCB buffer is larger than MTU; thus
2543*8d5069bcSRyan Zezeski * ensuring we can always meet the MSS with a single copy buffer. We
2544*8d5069bcSRyan Zezeski * also copy consecutive non-DMA fragments into the same TCB buffer.
2545*8d5069bcSRyan Zezeski */
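/*
 * A hypothetical walk-through of that tally (the numbers are made up):
 * assume an MSS of 1460, that the header copy left segdesc at 1 and segsz at
 * 0, and that a DMA bind produced three 900-byte cookies. The first cookie
 * brings the tally to 900 bytes over 2 descriptors; the second crosses the
 * MSS, so the counters restart at 340 bytes and 1 descriptor; the third ends
 * the fragment at 1240 bytes over 2 descriptors. Only if the descriptor
 * count hits i40e_lso_num_descs before the byte count reaches the MSS do we
 * undo the bind and fall back to copying.
 */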
2546*8d5069bcSRyan Zezeski static i40e_tx_control_block_t *
2547*8d5069bcSRyan Zezeski i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
2548*8d5069bcSRyan Zezeski const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
2549*8d5069bcSRyan Zezeski uint_t *ndesc)
2550*8d5069bcSRyan Zezeski {
2551*8d5069bcSRyan Zezeski size_t mp_len = MBLKL(mp);
2552*8d5069bcSRyan Zezeski /*
2553*8d5069bcSRyan Zezeski * The cpoff (copy offset) variable tracks the offset inside
2554*8d5069bcSRyan Zezeski * the current mp. There are cases where the entire mp is not
2555*8d5069bcSRyan Zezeski * fully copied in one go: such as the header copy followed by
2556*8d5069bcSRyan Zezeski * a non-DMA mblk, or a TCB buffer that only has enough space
2557*8d5069bcSRyan Zezeski * to copy part of the current mp.
2558*8d5069bcSRyan Zezeski */
2559*8d5069bcSRyan Zezeski size_t cpoff = 0;
2560*8d5069bcSRyan Zezeski /*
2561*8d5069bcSRyan Zezeski * The segsz and segdesc variables track the controller's view
2562*8d5069bcSRyan Zezeski * of the segment. The needed_desc variable tracks the total
2563*8d5069bcSRyan Zezeski * number of data descriptors used by the driver.
2564*8d5069bcSRyan Zezeski */
2565*8d5069bcSRyan Zezeski size_t segsz = 0;
2566*8d5069bcSRyan Zezeski uint_t segdesc = 0;
2567*8d5069bcSRyan Zezeski uint_t needed_desc = 0;
2568*8d5069bcSRyan Zezeski size_t hdrcopied = 0;
2569*8d5069bcSRyan Zezeski const size_t hdrlen =
2570*8d5069bcSRyan Zezeski meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
2571*8d5069bcSRyan Zezeski const size_t mss = tctx->itc_ctx_mss;
2572*8d5069bcSRyan Zezeski boolean_t force_copy = B_FALSE;
2573*8d5069bcSRyan Zezeski i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2574*8d5069bcSRyan Zezeski i40e_t *i40e = itrq->itrq_i40e;
2575*8d5069bcSRyan Zezeski i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2576*8d5069bcSRyan Zezeski
2577*8d5069bcSRyan Zezeski /*
2578*8d5069bcSRyan Zezeski * We always copy the header in order to avoid more
2579*8d5069bcSRyan Zezeski * complicated code dealing with various edge cases.
2580*8d5069bcSRyan Zezeski */
2581*8d5069bcSRyan Zezeski if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2582*8d5069bcSRyan Zezeski txs->itxs_err_notcb.value.ui64++;
2583*8d5069bcSRyan Zezeski goto fail;
2584*8d5069bcSRyan Zezeski }
2585*8d5069bcSRyan Zezeski
2586*8d5069bcSRyan Zezeski needed_desc++;
2587*8d5069bcSRyan Zezeski tcb_list_append(&tcbhead, &tcbtail, tcb);
2588*8d5069bcSRyan Zezeski
2589*8d5069bcSRyan Zezeski while (hdrcopied < hdrlen) {
2590*8d5069bcSRyan Zezeski const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
2591*8d5069bcSRyan Zezeski i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
2592*8d5069bcSRyan Zezeski hdrcopied += tocopy;
2593*8d5069bcSRyan Zezeski cpoff += tocopy;
2594*8d5069bcSRyan Zezeski if (tocopy == mp_len) {
2595*8d5069bcSRyan Zezeski /*
2596*8d5069bcSRyan Zezeski * This is a bit of defensive programming. We
2597*8d5069bcSRyan Zezeski * should never have a chain too short to
2598*8d5069bcSRyan Zezeski * satisfy the headers -- but just in case.
2599*8d5069bcSRyan Zezeski */
2600*8d5069bcSRyan Zezeski if ((mp = mp->b_cont) == NULL) {
2601*8d5069bcSRyan Zezeski txs->itxs_tx_short.value.ui64++;
2602*8d5069bcSRyan Zezeski goto fail;
2603*8d5069bcSRyan Zezeski }
2604*8d5069bcSRyan Zezeski
2605*8d5069bcSRyan Zezeski while ((mp_len = MBLKL(mp)) == 0) {
2606*8d5069bcSRyan Zezeski if ((mp = mp->b_cont) == NULL) {
2607*8d5069bcSRyan Zezeski txs->itxs_tx_short.value.ui64++;
2608*8d5069bcSRyan Zezeski goto fail;
2609*8d5069bcSRyan Zezeski }
2610*8d5069bcSRyan Zezeski }
2611*8d5069bcSRyan Zezeski cpoff = 0;
2612*8d5069bcSRyan Zezeski }
2613*8d5069bcSRyan Zezeski }
2614*8d5069bcSRyan Zezeski ASSERT3U(hdrcopied, ==, hdrlen);
2615*8d5069bcSRyan Zezeski
2616*8d5069bcSRyan Zezeski /*
2617*8d5069bcSRyan Zezeski * A single descriptor containing both header and data is
2618*8d5069bcSRyan Zezeski * counted twice by the controller.
2619*8d5069bcSRyan Zezeski */
2620*8d5069bcSRyan Zezeski if (mp_len < i40e->i40e_tx_dma_min) {
2621*8d5069bcSRyan Zezeski segdesc = 2;
2622*8d5069bcSRyan Zezeski } else {
2623*8d5069bcSRyan Zezeski segdesc = 1;
2624*8d5069bcSRyan Zezeski }
2625*8d5069bcSRyan Zezeski
2626*8d5069bcSRyan Zezeski while (mp != NULL) {
2627*8d5069bcSRyan Zezeski mp_len = MBLKL(mp);
2628*8d5069bcSRyan Zezeski force_copy:
2629*8d5069bcSRyan Zezeski /* Ignore zero-length mblks. */
2630*8d5069bcSRyan Zezeski if (mp_len == 0) {
2631*8d5069bcSRyan Zezeski mp = mp->b_cont;
2632*8d5069bcSRyan Zezeski cpoff = 0;
2633*8d5069bcSRyan Zezeski continue;
2634*8d5069bcSRyan Zezeski }
2635*8d5069bcSRyan Zezeski
2636*8d5069bcSRyan Zezeski /*
2637*8d5069bcSRyan Zezeski * We copy into the preallocated TCB buffer when the
2638*8d5069bcSRyan Zezeski * current fragment is less than the DMA threshold OR
2639*8d5069bcSRyan Zezeski * when the DMA bind can't meet the controller's
2640*8d5069bcSRyan Zezeski * segment descriptor limit.
2641*8d5069bcSRyan Zezeski */
2642*8d5069bcSRyan Zezeski if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
2643*8d5069bcSRyan Zezeski size_t tocopy;
2644*8d5069bcSRyan Zezeski
2645*8d5069bcSRyan Zezeski /*
2646*8d5069bcSRyan Zezeski * Our objective here is to compress
2647*8d5069bcSRyan Zezeski * consecutive copies into one TCB (until it
2648*8d5069bcSRyan Zezeski * is full). If there is no current TCB, or if
2649*8d5069bcSRyan Zezeski * it is a DMA TCB, then allocate a new one.
2650*8d5069bcSRyan Zezeski */
2651*8d5069bcSRyan Zezeski if (tcb == NULL ||
2652*8d5069bcSRyan Zezeski (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) {
2653*8d5069bcSRyan Zezeski if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2654*8d5069bcSRyan Zezeski txs->itxs_err_notcb.value.ui64++;
2655*8d5069bcSRyan Zezeski goto fail;
2656*8d5069bcSRyan Zezeski }
2657*8d5069bcSRyan Zezeski
2658*8d5069bcSRyan Zezeski /*
2659*8d5069bcSRyan Zezeski * The TCB DMA buffer is guaranteed to
2660*8d5069bcSRyan Zezeski * be one cookie by i40e_alloc_dma_buffer().
2661*8d5069bcSRyan Zezeski */
2662*8d5069bcSRyan Zezeski needed_desc++;
2663*8d5069bcSRyan Zezeski segdesc++;
2664*8d5069bcSRyan Zezeski ASSERT3U(segdesc, <=, i40e_lso_num_descs);
2665*8d5069bcSRyan Zezeski tcb_list_append(&tcbhead, &tcbtail, tcb);
2666*8d5069bcSRyan Zezeski } else if (segdesc == 0) {
2667*8d5069bcSRyan Zezeski /*
2668*8d5069bcSRyan Zezeski * We are copying into an existing TCB
2669*8d5069bcSRyan Zezeski * but we just crossed the MSS
2670*8d5069bcSRyan Zezeski * boundary. Make sure to increment
2671*8d5069bcSRyan Zezeski * segdesc to track the descriptor
2672*8d5069bcSRyan Zezeski * count as the hardware would.
2673*8d5069bcSRyan Zezeski */
2674*8d5069bcSRyan Zezeski segdesc++;
2675*8d5069bcSRyan Zezeski }
2676*8d5069bcSRyan Zezeski
2677*8d5069bcSRyan Zezeski tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
2678*8d5069bcSRyan Zezeski i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
2679*8d5069bcSRyan Zezeski cpoff += tocopy;
2680*8d5069bcSRyan Zezeski segsz += tocopy;
2681*8d5069bcSRyan Zezeski
2682*8d5069bcSRyan Zezeski /* We have consumed the current mp. */
2683*8d5069bcSRyan Zezeski if (cpoff == mp_len) {
2684*8d5069bcSRyan Zezeski mp = mp->b_cont;
2685*8d5069bcSRyan Zezeski cpoff = 0;
2686*8d5069bcSRyan Zezeski }
2687*8d5069bcSRyan Zezeski
2688*8d5069bcSRyan Zezeski /* We have consumed the current TCB buffer. */
2689*8d5069bcSRyan Zezeski if (I40E_TCB_LEFT(tcb) == 0) {
2690*8d5069bcSRyan Zezeski tcb = NULL;
2691*8d5069bcSRyan Zezeski }
2692*8d5069bcSRyan Zezeski
2693*8d5069bcSRyan Zezeski /*
2694*8d5069bcSRyan Zezeski * We have met MSS with this copy; restart the
2695*8d5069bcSRyan Zezeski * counters.
2696*8d5069bcSRyan Zezeski */
2697*8d5069bcSRyan Zezeski if (segsz >= mss) {
2698*8d5069bcSRyan Zezeski segsz = segsz % mss;
2699*8d5069bcSRyan Zezeski segdesc = segsz == 0 ? 0 : 1;
2700*8d5069bcSRyan Zezeski force_copy = B_FALSE;
2701*8d5069bcSRyan Zezeski }
2702*8d5069bcSRyan Zezeski
2703*8d5069bcSRyan Zezeski /*
2704*8d5069bcSRyan Zezeski * We are at the controller's descriptor
2705*8d5069bcSRyan Zezeski * limit; we must copy into the current TCB
2706*8d5069bcSRyan Zezeski * until MSS is reached. The TCB buffer is
2707*8d5069bcSRyan Zezeski * always bigger than the MTU so we know it is
2708*8d5069bcSRyan Zezeski * big enough to meet the MSS.
2709*8d5069bcSRyan Zezeski */
2710*8d5069bcSRyan Zezeski if (segdesc == i40e_lso_num_descs) {
2711*8d5069bcSRyan Zezeski force_copy = B_TRUE;
2712*8d5069bcSRyan Zezeski }
2713*8d5069bcSRyan Zezeski } else {
2714*8d5069bcSRyan Zezeski uint_t tsegdesc = segdesc;
2715*8d5069bcSRyan Zezeski size_t tsegsz = segsz;
2716*8d5069bcSRyan Zezeski
2717*8d5069bcSRyan Zezeski ASSERT(force_copy == B_FALSE);
2718*8d5069bcSRyan Zezeski ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
2719*8d5069bcSRyan Zezeski
2720*8d5069bcSRyan Zezeski tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
2721*8d5069bcSRyan Zezeski if (tcb == NULL) {
2722*8d5069bcSRyan Zezeski i40e_error(i40e, "dma bind failed!");
2723*8d5069bcSRyan Zezeski goto fail;
2724*8d5069bcSRyan Zezeski }
2725*8d5069bcSRyan Zezeski
2726*8d5069bcSRyan Zezeski for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
2727*8d5069bcSRyan Zezeski struct i40e_dma_bind_info dbi =
2728*8d5069bcSRyan Zezeski tcb->tcb_bind_info[i];
2729*8d5069bcSRyan Zezeski
2730*8d5069bcSRyan Zezeski tsegsz += dbi.dbi_len;
2731*8d5069bcSRyan Zezeski tsegdesc++;
2732*8d5069bcSRyan Zezeski ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2733*8d5069bcSRyan Zezeski
2734*8d5069bcSRyan Zezeski /*
2735*8d5069bcSRyan Zezeski * We've met the MSS with this portion
2736*8d5069bcSRyan Zezeski * of the DMA.
2737*8d5069bcSRyan Zezeski */
2738*8d5069bcSRyan Zezeski if (tsegsz >= mss) {
2739*8d5069bcSRyan Zezeski tsegsz = tsegsz % mss;
2740*8d5069bcSRyan Zezeski tsegdesc = tsegsz == 0 ? 0 : 1;
2741*8d5069bcSRyan Zezeski }
2742*8d5069bcSRyan Zezeski
2743*8d5069bcSRyan Zezeski /*
2744*8d5069bcSRyan Zezeski * We've reached max descriptors but
2745*8d5069bcSRyan Zezeski * have not met the MSS. Undo the bind
2746*8d5069bcSRyan Zezeski * and instead copy.
2747*8d5069bcSRyan Zezeski */
2748*8d5069bcSRyan Zezeski if (tsegdesc == i40e_lso_num_descs) {
2749*8d5069bcSRyan Zezeski i40e_tcb_reset(tcb);
2750*8d5069bcSRyan Zezeski i40e_tcb_free(itrq, tcb);
2751*8d5069bcSRyan Zezeski
2752*8d5069bcSRyan Zezeski if (tcbtail != NULL &&
2753*8d5069bcSRyan Zezeski 				    I40E_TCB_LEFT(tcbtail) > 0 &&
2754*8d5069bcSRyan Zezeski tcbtail->tcb_type == I40E_TX_COPY) {
2755*8d5069bcSRyan Zezeski tcb = tcbtail;
2756*8d5069bcSRyan Zezeski } else {
2757*8d5069bcSRyan Zezeski tcb = NULL;
2758*8d5069bcSRyan Zezeski }
2759*8d5069bcSRyan Zezeski
2760*8d5069bcSRyan Zezeski /*
2761*8d5069bcSRyan Zezeski * Remember, we are still on
2762*8d5069bcSRyan Zezeski * the same mp.
2763*8d5069bcSRyan Zezeski */
2764*8d5069bcSRyan Zezeski force_copy = B_TRUE;
2765*8d5069bcSRyan Zezeski txs->itxs_tso_force_copy.value.ui64++;
2766*8d5069bcSRyan Zezeski goto force_copy;
2767*8d5069bcSRyan Zezeski }
2768*8d5069bcSRyan Zezeski }
2769*8d5069bcSRyan Zezeski
2770*8d5069bcSRyan Zezeski ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2771*8d5069bcSRyan Zezeski ASSERT3U(tsegsz, <, mss);
2772*8d5069bcSRyan Zezeski
2773*8d5069bcSRyan Zezeski /*
2774*8d5069bcSRyan Zezeski 			 * We've made it through the loop without
2775*8d5069bcSRyan Zezeski * breaking the segment descriptor contract
2776*8d5069bcSRyan Zezeski * with the controller -- replace the segment
2777*8d5069bcSRyan Zezeski * tracking values with the temporary ones.
2778*8d5069bcSRyan Zezeski */
2779*8d5069bcSRyan Zezeski segdesc = tsegdesc;
2780*8d5069bcSRyan Zezeski segsz = tsegsz;
2781*8d5069bcSRyan Zezeski needed_desc += tcb->tcb_bind_ncookies;
2782*8d5069bcSRyan Zezeski cpoff = 0;
2783*8d5069bcSRyan Zezeski tcb_list_append(&tcbhead, &tcbtail, tcb);
2784*8d5069bcSRyan Zezeski mp = mp->b_cont;
2785*8d5069bcSRyan Zezeski }
2786*8d5069bcSRyan Zezeski }
2787*8d5069bcSRyan Zezeski
2788*8d5069bcSRyan Zezeski ASSERT3P(mp, ==, NULL);
2789*8d5069bcSRyan Zezeski ASSERT3P(tcbhead, !=, NULL);
2790*8d5069bcSRyan Zezeski *ndesc += needed_desc;
2791*8d5069bcSRyan Zezeski return (tcbhead);
2792*8d5069bcSRyan Zezeski
2793*8d5069bcSRyan Zezeski fail:
2794*8d5069bcSRyan Zezeski tcb = tcbhead;
2795*8d5069bcSRyan Zezeski while (tcb != NULL) {
2796*8d5069bcSRyan Zezeski i40e_tx_control_block_t *next = tcb->tcb_next;
2797*8d5069bcSRyan Zezeski
2798*8d5069bcSRyan Zezeski ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2799*8d5069bcSRyan Zezeski tcb->tcb_type == I40E_TX_COPY);
2800*8d5069bcSRyan Zezeski
2801*8d5069bcSRyan Zezeski tcb->tcb_mp = NULL;
2802*8d5069bcSRyan Zezeski i40e_tcb_reset(tcb);
2803*8d5069bcSRyan Zezeski i40e_tcb_free(itrq, tcb);
2804*8d5069bcSRyan Zezeski tcb = next;
2805*8d5069bcSRyan Zezeski }
2806*8d5069bcSRyan Zezeski
2807*8d5069bcSRyan Zezeski return (NULL);
2808*8d5069bcSRyan Zezeski }
2809*8d5069bcSRyan Zezeski
2810da5577f0SRobert Mustacchi /*
2811da5577f0SRobert Mustacchi * We've been asked to send a message block on the wire. We'll only have a
2812da5577f0SRobert Mustacchi * single chain. There will not be any b_next pointers; however, there may be
2813*8d5069bcSRyan Zezeski * multiple b_cont blocks. The number of b_cont blocks may exceed the
2814*8d5069bcSRyan Zezeski * controller's Tx descriptor limit.
2815da5577f0SRobert Mustacchi *
2816da5577f0SRobert Mustacchi * We may do one of three things with any given mblk_t chain:
2817da5577f0SRobert Mustacchi *
2818da5577f0SRobert Mustacchi * 1) Drop it
2819da5577f0SRobert Mustacchi * 2) Transmit it
2820da5577f0SRobert Mustacchi * 3) Return it
2821da5577f0SRobert Mustacchi *
2822da5577f0SRobert Mustacchi * If we return it to MAC, then MAC will flow control on our behalf. In other
2823da5577f0SRobert Mustacchi * words, it won't send us anything until we tell it that it's okay to send us
2824da5577f0SRobert Mustacchi * something.
2825da5577f0SRobert Mustacchi */
2826da5577f0SRobert Mustacchi mblk_t *
2827da5577f0SRobert Mustacchi i40e_ring_tx(void *arg, mblk_t *mp)
2828da5577f0SRobert Mustacchi {
2829*8d5069bcSRyan Zezeski size_t msglen;
2830*8d5069bcSRyan Zezeski i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
2831*8d5069bcSRyan Zezeski i40e_tx_context_desc_t *ctxdesc;
2832*8d5069bcSRyan Zezeski mac_ether_offload_info_t meo;
2833da5577f0SRobert Mustacchi i40e_tx_context_t tctx;
2834*8d5069bcSRyan Zezeski int type;
2835*8d5069bcSRyan Zezeski uint_t needed_desc = 0;
2836*8d5069bcSRyan Zezeski boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
2837da5577f0SRobert Mustacchi
2838da5577f0SRobert Mustacchi i40e_trqpair_t *itrq = arg;
2839da5577f0SRobert Mustacchi i40e_t *i40e = itrq->itrq_i40e;
2840da5577f0SRobert Mustacchi i40e_hw_t *hw = &i40e->i40e_hw_space;
2841da5577f0SRobert Mustacchi i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2842da5577f0SRobert Mustacchi
2843da5577f0SRobert Mustacchi ASSERT(mp->b_next == NULL);
2844da5577f0SRobert Mustacchi
2845da5577f0SRobert Mustacchi if (!(i40e->i40e_state & I40E_STARTED) ||
2846da5577f0SRobert Mustacchi (i40e->i40e_state & I40E_OVERTEMP) ||
2847da5577f0SRobert Mustacchi (i40e->i40e_state & I40E_SUSPENDED) ||
2848da5577f0SRobert Mustacchi (i40e->i40e_state & I40E_ERROR) ||
2849da5577f0SRobert Mustacchi (i40e->i40e_link_state != LINK_STATE_UP)) {
2850da5577f0SRobert Mustacchi freemsg(mp);
2851da5577f0SRobert Mustacchi return (NULL);
2852da5577f0SRobert Mustacchi }
2853da5577f0SRobert Mustacchi
2854*8d5069bcSRyan Zezeski if (mac_ether_offload_info(mp, &meo) != 0) {
2855*8d5069bcSRyan Zezeski freemsg(mp);
2856*8d5069bcSRyan Zezeski itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
2857*8d5069bcSRyan Zezeski return (NULL);
2858*8d5069bcSRyan Zezeski }
2859*8d5069bcSRyan Zezeski
2860da5577f0SRobert Mustacchi /*
2861da5577f0SRobert Mustacchi * Figure out the relevant offload context for this frame: whether we
2862*8d5069bcSRyan Zezeski * need checksum or LSO enabled, along with the packet type and header
2863da5577f0SRobert Mustacchi * information that the descriptors below depend on.
2864da5577f0SRobert Mustacchi */
2865*8d5069bcSRyan Zezeski if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
2866da5577f0SRobert Mustacchi freemsg(mp);
2867da5577f0SRobert Mustacchi itrq->itrq_txstat.itxs_err_context.value.ui64++;
2868da5577f0SRobert Mustacchi return (NULL);
2869da5577f0SRobert Mustacchi }
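/*
 * At present we only emit a context descriptor when LSO is in play;
 * plain checksum offload is carried in the data descriptors themselves.
 */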
2870*8d5069bcSRyan Zezeski if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2871*8d5069bcSRyan Zezeski use_lso = B_TRUE;
2872*8d5069bcSRyan Zezeski do_ctx_desc = B_TRUE;
2873*8d5069bcSRyan Zezeski }
2874da5577f0SRobert Mustacchi
2875da5577f0SRobert Mustacchi /*
2876da5577f0SRobert Mustacchi * For the primordial driver we can punt on doing any recycling right
2877da5577f0SRobert Mustacchi * now; however, longer term we will probably need to do more proactive
2878*8d5069bcSRyan Zezeski * recycling to cut back on stalls in the TX path.
2879da5577f0SRobert Mustacchi */
2880da5577f0SRobert Mustacchi
2881*8d5069bcSRyan Zezeski msglen = msgsize(mp);
2882da5577f0SRobert Mustacchi
2883*8d5069bcSRyan Zezeski if (do_ctx_desc) {
2884da5577f0SRobert Mustacchi /*
2885*8d5069bcSRyan Zezeski * If we're doing tunneling or LSO, then we'll need a TX
2886*8d5069bcSRyan Zezeski * context descriptor in addition to one or more TX data
2887*8d5069bcSRyan Zezeski * descriptors. Since there's no data DMA block or handle
2888*8d5069bcSRyan Zezeski * associated with the context descriptor, we create a special
2889*8d5069bcSRyan Zezeski * control block that behaves effectively like a NOP.
2890da5577f0SRobert Mustacchi */
2891*8d5069bcSRyan Zezeski if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
2892da5577f0SRobert Mustacchi txs->itxs_err_notcb.value.ui64++;
2893da5577f0SRobert Mustacchi goto txfail;
2894da5577f0SRobert Mustacchi }
2895*8d5069bcSRyan Zezeski tcb_ctx->tcb_type = I40E_TX_DESC;
2896*8d5069bcSRyan Zezeski needed_desc++;
2897da5577f0SRobert Mustacchi }
2898*8d5069bcSRyan Zezeski
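/*
 * Build the chain of TX control blocks covering every b_cont block in
 * this message, either by copying the data into pre-allocated buffers
 * or by DMA binding it, and tally the number of data descriptors the
 * chain will consume.
 */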
2899*8d5069bcSRyan Zezeski if (!use_lso) {
2900*8d5069bcSRyan Zezeski tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
2901*8d5069bcSRyan Zezeski } else {
2902*8d5069bcSRyan Zezeski tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
2903*8d5069bcSRyan Zezeski }
2904*8d5069bcSRyan Zezeski
2905*8d5069bcSRyan Zezeski if (tcbhead == NULL)
2906*8d5069bcSRyan Zezeski goto txfail;
2907*8d5069bcSRyan Zezeski
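/*
 * Only the head control block holds the mblk pointer, so the message is
 * freed exactly once when the frame's descriptors are later recycled.
 */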
2908*8d5069bcSRyan Zezeski tcbhead->tcb_mp = mp;
2909da5577f0SRobert Mustacchi
2910da5577f0SRobert Mustacchi /*
2911*8d5069bcSRyan Zezeski * The second condition ensures that 'itrq_desc_tail' never
2912*8d5069bcSRyan Zezeski * equals 'itrq_desc_head'. This enforces the rule found in
2913*8d5069bcSRyan Zezeski * the second bullet point of section 8.4.3.1.5 of the XL710
2914*8d5069bcSRyan Zezeski * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
2915*8d5069bcSRyan Zezeski * never overlap with the head. This means that we only ever
2916*8d5069bcSRyan Zezeski * have 'itrq_tx_ring_size - 1' total available descriptors.
2917da5577f0SRobert Mustacchi */
2918da5577f0SRobert Mustacchi mutex_enter(&itrq->itrq_tx_lock);
2919*8d5069bcSRyan Zezeski if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
2920*8d5069bcSRyan Zezeski (itrq->itrq_desc_free - 1) < needed_desc) {
2921da5577f0SRobert Mustacchi txs->itxs_err_nodescs.value.ui64++;
2922da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tx_lock);
2923da5577f0SRobert Mustacchi goto txfail;
2924da5577f0SRobert Mustacchi }
2925da5577f0SRobert Mustacchi
2926*8d5069bcSRyan Zezeski if (do_ctx_desc) {
2927da5577f0SRobert Mustacchi /*
2928*8d5069bcSRyan Zezeski * If we're enabling any offloads for this frame, then we need
2929*8d5069bcSRyan Zezeski * to build the transmit context descriptor first. It must be
2930*8d5069bcSRyan Zezeski * placed in the TX ring before the data descriptor(s). See
2931*8d5069bcSRyan Zezeski * section 8.4.2, table 8-16.
2932da5577f0SRobert Mustacchi */
2933*8d5069bcSRyan Zezeski uint_t tail = itrq->itrq_desc_tail;
2934da5577f0SRobert Mustacchi itrq->itrq_desc_free--;
2935*8d5069bcSRyan Zezeski ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
2936*8d5069bcSRyan Zezeski itrq->itrq_tcb_work_list[tail] = tcb_ctx;
2937*8d5069bcSRyan Zezeski itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
2938da5577f0SRobert Mustacchi itrq->itrq_tx_ring_size);
2939da5577f0SRobert Mustacchi
2940*8d5069bcSRyan Zezeski /* QW0 */
2941*8d5069bcSRyan Zezeski type = I40E_TX_DESC_DTYPE_CONTEXT;
2942*8d5069bcSRyan Zezeski ctxdesc->tunneling_params = 0;
2943*8d5069bcSRyan Zezeski ctxdesc->l2tag2 = 0;
2944*8d5069bcSRyan Zezeski
2945*8d5069bcSRyan Zezeski /* QW1 */
2946*8d5069bcSRyan Zezeski ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
2947*8d5069bcSRyan Zezeski if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2948*8d5069bcSRyan Zezeski ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
2949*8d5069bcSRyan Zezeski ((uint64_t)tctx.itc_ctx_cmdflags <<
2950*8d5069bcSRyan Zezeski I40E_TXD_CTX_QW1_CMD_SHIFT) |
2951*8d5069bcSRyan Zezeski ((uint64_t)tctx.itc_ctx_tsolen <<
2952*8d5069bcSRyan Zezeski I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2953*8d5069bcSRyan Zezeski ((uint64_t)tctx.itc_ctx_mss <<
2954*8d5069bcSRyan Zezeski I40E_TXD_CTX_QW1_MSS_SHIFT));
2955*8d5069bcSRyan Zezeski }
2956*8d5069bcSRyan Zezeski }
2957*8d5069bcSRyan Zezeski
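/*
 * Walk the control block chain and place a data descriptor on the ring
 * for each buffer: one per copied buffer, or one per DMA cookie for a
 * bound buffer. Only the final descriptor of the frame is flagged as
 * the last, so the hardware knows where the packet ends.
 */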
2958*8d5069bcSRyan Zezeski tcb = tcbhead;
2959*8d5069bcSRyan Zezeski while (tcb != NULL) {
2960*8d5069bcSRyan Zezeski
2961*8d5069bcSRyan Zezeski itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2962*8d5069bcSRyan Zezeski if (tcb->tcb_type == I40E_TX_COPY) {
2963*8d5069bcSRyan Zezeski boolean_t last_desc = (tcb->tcb_next == NULL);
2964*8d5069bcSRyan Zezeski
2965*8d5069bcSRyan Zezeski i40e_tx_set_data_desc(itrq, &tctx,
2966*8d5069bcSRyan Zezeski (caddr_t)tcb->tcb_dma.dmab_dma_address,
2967*8d5069bcSRyan Zezeski tcb->tcb_dma.dmab_len, last_desc);
2968*8d5069bcSRyan Zezeski } else {
2969*8d5069bcSRyan Zezeski boolean_t last_desc = B_FALSE;
2970*8d5069bcSRyan Zezeski ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
2971*8d5069bcSRyan Zezeski
2972*8d5069bcSRyan Zezeski for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
2973*8d5069bcSRyan Zezeski last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
2974*8d5069bcSRyan Zezeski (tcb->tcb_next == NULL);
2975*8d5069bcSRyan Zezeski
2976*8d5069bcSRyan Zezeski i40e_tx_set_data_desc(itrq, &tctx,
2977*8d5069bcSRyan Zezeski tcb->tcb_bind_info[c].dbi_paddr,
2978*8d5069bcSRyan Zezeski tcb->tcb_bind_info[c].dbi_len,
2979*8d5069bcSRyan Zezeski last_desc);
2980*8d5069bcSRyan Zezeski }
2981*8d5069bcSRyan Zezeski }
2982*8d5069bcSRyan Zezeski
2983*8d5069bcSRyan Zezeski tcb = tcb->tcb_next;
2984*8d5069bcSRyan Zezeski }
2985da5577f0SRobert Mustacchi
2986da5577f0SRobert Mustacchi /*
2987da5577f0SRobert Mustacchi * Now, finally, sync the DMA data and alert hardware.
2988da5577f0SRobert Mustacchi */
2989da5577f0SRobert Mustacchi I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2990da5577f0SRobert Mustacchi
2991da5577f0SRobert Mustacchi I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2992da5577f0SRobert Mustacchi itrq->itrq_desc_tail);
2993*8d5069bcSRyan Zezeski
2994da5577f0SRobert Mustacchi if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2995da5577f0SRobert Mustacchi DDI_FM_OK) {
2996da5577f0SRobert Mustacchi /*
2997da5577f0SRobert Mustacchi * Note, we can't really go through and clean this up very well,
2998da5577f0SRobert Mustacchi * because the memory has been given to the device, so just
2999da5577f0SRobert Mustacchi * indicate it's been transmitted.
3000da5577f0SRobert Mustacchi */
3001da5577f0SRobert Mustacchi ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
3002da5577f0SRobert Mustacchi atomic_or_32(&i40e->i40e_state, I40E_ERROR);
3003da5577f0SRobert Mustacchi }
3004da5577f0SRobert Mustacchi
3005*8d5069bcSRyan Zezeski txs->itxs_bytes.value.ui64 += msglen;
3006da5577f0SRobert Mustacchi txs->itxs_packets.value.ui64++;
3007*8d5069bcSRyan Zezeski txs->itxs_descriptors.value.ui64 += needed_desc;
3008da5577f0SRobert Mustacchi
3009da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tx_lock);
3010da5577f0SRobert Mustacchi
3011da5577f0SRobert Mustacchi return (NULL);
3012da5577f0SRobert Mustacchi
3013da5577f0SRobert Mustacchi txfail:
3014da5577f0SRobert Mustacchi /*
3015da5577f0SRobert Mustacchi * We ran out of resources. Return the mblk to MAC and mark the ring as
3016da5577f0SRobert Mustacchi * blocked so we know to signal MAC once descriptors free up. If any
3017da5577f0SRobert Mustacchi * tcbs were allocated, reset and free them now, clearing their mblk
3018da5577f0SRobert Mustacchi * pointers since the message itself goes back to MAC.
3019da5577f0SRobert Mustacchi */
3020*8d5069bcSRyan Zezeski if (tcb_ctx != NULL) {
3021*8d5069bcSRyan Zezeski tcb_ctx->tcb_mp = NULL;
3022*8d5069bcSRyan Zezeski i40e_tcb_reset(tcb_ctx);
3023*8d5069bcSRyan Zezeski i40e_tcb_free(itrq, tcb_ctx);
3024*8d5069bcSRyan Zezeski }
3025*8d5069bcSRyan Zezeski
3026*8d5069bcSRyan Zezeski tcb = tcbhead;
3027*8d5069bcSRyan Zezeski while (tcb != NULL) {
3028*8d5069bcSRyan Zezeski i40e_tx_control_block_t *next = tcb->tcb_next;
3029*8d5069bcSRyan Zezeski
3030*8d5069bcSRyan Zezeski ASSERT(tcb->tcb_type == I40E_TX_DMA ||
3031*8d5069bcSRyan Zezeski tcb->tcb_type == I40E_TX_COPY);
3032*8d5069bcSRyan Zezeski
3033da5577f0SRobert Mustacchi tcb->tcb_mp = NULL;
3034da5577f0SRobert Mustacchi i40e_tcb_reset(tcb);
3035da5577f0SRobert Mustacchi i40e_tcb_free(itrq, tcb);
3036*8d5069bcSRyan Zezeski tcb = next;
3037da5577f0SRobert Mustacchi }
3038da5577f0SRobert Mustacchi
3039da5577f0SRobert Mustacchi mutex_enter(&itrq->itrq_tx_lock);
3040da5577f0SRobert Mustacchi itrq->itrq_tx_blocked = B_TRUE;
3041da5577f0SRobert Mustacchi mutex_exit(&itrq->itrq_tx_lock);
3042da5577f0SRobert Mustacchi
3043da5577f0SRobert Mustacchi return (mp);
3044da5577f0SRobert Mustacchi }