1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14 * Copyright 2019 Joyent, Inc.
15 * Copyright 2020 RackTop Systems, Inc.
16 */
17
18 #include "i40e_sw.h"
19
20 /*
21 * ---------------------------------------------------------
22 * Buffer and Memory Management, Receiving, and Transmitting
23 * ---------------------------------------------------------
24 *
25 * Each physical function (PF), which is what we think of as an instance of the
26 * device driver, has a series of associated transmit and receive queue pairs.
27 * Effectively, what we think of in MAC as rings. Each of these has their own
28 * ring of descriptors which is used as part of doing DMA activity.
29 *
30 * The transmit ring of descriptors are 16-byte entries which are used to send
31 * packets, program filters, etc. The receive ring of descriptors are either
32 * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
33 * format so that we're in a better position if we ever want to leverage that
34 * information later on.
35 *
36 * However, these rings are just for descriptors, they don't talk or deal with
37 * how we actually store the memory that we need for DMA or the associated
38 * information that we need for keeping track of message blocks. To correspond
39 * to the hardware descriptor ring which is how we communicate with hardware, we
40 * introduce a control block which keeps track of our required metadata like DMA
41 * mappings.
42 *
43 * There are two main considerations that dictate how much memory and buffers
44 * we end up allocating. Those are:
45 *
46 * o The size of the ring (controlled through the driver.conf file)
47 *
48 * o The maximum size frame we can receive.
49 *
50 * The size of the rings currently defaults to 1024 descriptors and is stored in
51 * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
52 *
53 * While the size of the rings is controlled by the driver.conf, the maximum
54 * size frame is informed primarily through the use of dladm and the setting of
55 * the MTU property on the device. From the MTU, we then go and do some
56 * machinations. The first thing we do is we then have to add in space for the
57 * Ethernet header, potentially a VLAN header, and the FCS check. This value is
58 * what's stored as i40e_t`i40e_frame_max and is derived any time
59 * i40e_t`i40e_sdu changes.
60 *
61 * This size is then rounded up to the nearest 1k chunk, which represents the
62 * actual amount of memory that we'll allocate for a single frame.
63 *
64 * Note, that for RX, we do something that might be unexpected. We always add
65 * an extra two bytes to the frame size that we allocate. We then offset the DMA
66 * address that we receive a packet into by two bytes. This ensures that the IP
67 * header will always be 4 byte aligned because the MAC header is either 14 or
68 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
69 * and MAC's lives easier.
70 *
71 * Both the RX and TX descriptor rings (which are what we use to communicate
72 * with hardware) are allocated as a single region of DMA memory which is the
73 * size of the descriptor (32 bytes and 16 bytes respectively) times the total
74 * number of descriptors for an RX and TX ring.
75 *
76 * While the RX and TX descriptors are allocated using DMA-based memory, the
77 * control blocks for each of them are allocated using normal kernel memory.
78 * They aren't special from a DMA perspective. We'll go over the design of both
79 * receiving and transmitting separately, as they have slightly different
80 * control blocks and different ways that we manage the relationship between
81 * control blocks and descriptors.
82 *
83 * ---------------------------------
84 * RX Descriptors and Control Blocks
85 * ---------------------------------
86 *
87 * For every descriptor in the ring that the driver has, we need some associated
88 * memory, which means that we need to have the receive specific control block.
89 * We have a couple different, but related goals:
90 *
91 * o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
92 * not want to do any additional memory allocations or DMA allocations if
93 * we don't have to.
94 *
95 * o We'd like to try and do as much zero-copy as possible, while taking into
96 * account the cost of mapping in DMA resources.
97 *
98 * o We'd like to have every receive descriptor available.
99 *
100 * Now, these rules are a bit in tension with one another. The act of mapping in
101 * is an exercise of trying to find the break-even point between page table
102 * updates and bcopy. We currently start by using the same metrics that ixgbe
103 * used; however, it should be known that this value has effectively been
104 * cargo-culted across to yet another driver, sorry.
105 *
106 * If we receive a packet which is larger than our copy threshold, we'll create
107 * a message block out of the DMA memory via desballoc(9F) and send that up to
108 * MAC that way. This will cause us to be notified when the message block is
109 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
110 * it's less than the threshold, we'll try to use allocb and bcopy it into the
111 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
112 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
113 * the behavior and always do a bcopy or a DMA bind.
114 *
115 * To try and ensure that the device always has blocks that it can receive data
116 * into, we maintain two lists of control blocks, a working list and a free
117 * list. Each list is sized equal to the number of descriptors in the RX ring.
118 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
119 * equal to twice the number of descriptors in the ring and we assign them
120 * equally to the free list and to the working list. Each control block also has
121 * DMA memory allocated and associated with it which will be used to receive the
122 * actual packet data. All of a received frame's data will end up in a single
123 * DMA buffer.
124 *
125 * During operation, we always maintain the invariant that each RX descriptor
126 * has an associated RX control block which lives in the working list. If we
127 * feel that we should loan up DMA memory to MAC in the form of a message block,
128 * we can only do so if we can maintain this invariant. To do that, we swap in
129 * one of the buffers from the free list. If none are available, then we resort
130 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
131 * size.
132 *
133 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
134 * called on the block, at which point we restore the RX control block to the
135 * free list and are able to reuse the DMA memory again. While the scheme may
136 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
137 * the normal path of operation, even though we may still have to allocate
138 * message blocks and copy.
139 *
140 * The following state machine describes the life time of a RX control block. In
141 * the diagram we abbreviate the RX ring descriptor entry as rxd and the rx
142 * control block entry as rcb.
143 *
144 * | |
145 * * ... 1/2 of all initial rcb's ... *
146 * | |
147 * v v
148 * +------------------+ +------------------+
149 * | rcb on free list |---*---------->| rcb on work list |
150 * +------------------+ . +------------------+
151 * ^ . moved to |
152 * | replace rcb * . . Frame received,
153 * | loaned to | entry on free list
154 * | MAC + co. | available. rcb's
155 * | | memory made into mblk_t
156 * * . freemsg(9F) | and sent up to MAC.
157 * | called on |
158 * | loaned rcb |
159 * | and it is v
160 * | recycled. +-------------------+
161 * +--------------------<-----| rcb loaned to MAC |
162 * +-------------------+
163 *
164 * Finally, note that every RX control block has a reference count on it. One
165 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
166 * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
167 * no other DLPI consumers remain, then we'll decrement the reference count by
168 * one. Whenever we loan up the RX control block and associated buffer to MAC,
169 * then we bump the reference count again. Even though the device is stopped,
170 * there may still be loaned frames in upper levels that we'll want to account
171 * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
172 * that it is cleaned up.
173 *
174 * --------------------
175 * Managing the RX Ring
176 * --------------------
177 *
178 * The receive ring descriptors are arranged in a circular buffer with a head
179 * and tail pointer. There are both the conventional head and tail pointers
180 * which are used to partition the ring into two portions, a portion that we,
181 * the operating system, manage and a portion that is managed by hardware. When
182 * hardware owns a descriptor in the ring, it means that it is waiting for data
183 * to be filled in. However, when a portion of the ring is owned by the driver,
184 * then that means that the descriptor has been consumed and we need to go take
185 * a look at it.
186 *
187 * The initial head is configured to be zero by writing it as such in the
188 * receive queue context in the FPM (function private memory from the host). The
189 * initial tail is written to be the last descriptor. This is written to via the
190 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
191 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
192 * the only values we ever consult ourselves are the TAIL register and our own
193 * state tracking. Effectively, we cache the HEAD register and then update it
194 * ourselves based on our work.
195 *
196 * When we iterate over the RX descriptors and thus the received frames, we are
197 * either in an interrupt context or we've been asked by MAC to poll on the
198 * ring. If we've been asked to poll on the ring, we have a maximum number of
199 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
200 * exceed that count, then we do not process it. When in interrupt context, we
201 * don't have a strict byte count. However, to ensure liveness, we limit the
202 * amount of data based on a configuration value
203 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
204 * is based on similar numbers that are used for ixgbe. After some additional
205 * time in the field, we'll have a sense as to whether or not it should be
206 * changed.
207 *
208 * When processing, we start at our own HEAD pointer
209 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
210 * processing. Every RX descriptor has what's described as the DD bit. This bit
211 * (the LSB of the second 8-byte word), indicates whether or not the descriptor
212 * is done. When we give descriptors to the hardware, this value is always
213 * zero. When the hardware has finished a descriptor, it will always be one.
214 *
215 * The first thing that we check is whether the DD bit indicates that the
216 * current HEAD is ready. If it isn't, then we're done. That's the primary
217 * invariant of processing a frame. If it's done, then there are a few other
218 * things that we want to look at. In the same status word as the DD bit, there
219 * are two other important bits:
220 *
221 * o End of Packet (EOP)
222 * o Error bits
223 *
224 * The end of packet indicates that we have reached the last descriptor. Now,
225 * you might ask when would there be more than one descriptor. The reason for
226 * that might be due to large receive offload (lro) or header splitting
227 * functionality, which presently isn't supported in the driver. The error bits
228 * in the frame are only valid when EOP is set.
229 *
230 * If error bits are set on the frame, then we still consume it; however, we
231 * will not generate an mblk_t to send up to MAC. If there are no error bits
232 * set, then we'll consume the descriptor either using bcopy or DMA binding. See
233 * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
234 * on how that selection is made.
235 *
236 * Regardless of whether we construct an mblk_t or encounter an error, we end up
237 * resetting the descriptor. This re-arms the descriptor for hardware and in the
238 * process, we may end up assigning it a new receive control block. After we do
239 * this, we always update our HEAD pointer, no matter what.
240 *
241 * Finally, once we've consumed as much as we will in a given window, we go and
242 * update the TAIL register to indicate all the frames we've consumed. We only
243 * do a single bulk write for the ring.
244 *
245 * ---------------------------------
246 * TX Descriptors and Control Blocks
247 * ---------------------------------
248 *
249 * While the transmit path is similar in spirit to the receive path, it works
250 * differently due to the fact that all data is originated by the operating
251 * system and not by the device.
252 *
253 * Like RX, there is both a descriptor ring that we use to communicate with the
254 * hardware and which points to the memory used to transmit a frame. Similarly,
255 * there is a corresponding transmit control block, however, the correspondence
256 * between descriptors and control blocks is more complex and not necessarily
257 * 1-to-1.
258 *
259 * The driver is asked to process a single frame at a time. That message block
260 * may be made up of multiple fragments linked together by the mblk_t`b_cont
261 * member. The device has a hard limit of up to 8 buffers being allowed for use
262 * for a single non-LSO packet or LSO segment. The number of TX ring entries
263 * (and thus TX control blocks) used depends on the fragment sizes and DMA
264 * layout, as explained below.
265 *
266 * We alter our DMA strategy based on a threshold tied to the fragment size.
267 * This threshold is configurable via the tx_dma_threshold property. If the
268 * fragment is above the threshold, we DMA bind it -- consuming one TCB and
269 * potentially several data descriptors. The exact number of descriptors (equal
270 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
271 * into page, b_wptr offset into page, and the physical layout of the dblk's
272 * memory (contiguous or not). Essentially, we are at the mercy of the DMA
273 * engine and the dblk's memory allocation. Knowing the exact number of
274 * descriptors up front is a task best not taken on by the driver itself.
275 * Instead, we attempt to DMA bind the fragment and verify the descriptor
276 * layout meets hardware constraints. If the proposed DMA bind does not satisfy
277 * the hardware constraints, then we discard it and instead copy the entire
278 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
279 * larger than the TCB buffer).
280 *
281 * If the fragment is below or at the threshold, we copy it to the pre-allocated
282 * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
283 * conserve resources. We are guaranteed that the TCB buffer is made up of only
284 * 1 DMA cookie; and therefore consumes only one descriptor on the controller.
285 *
286 * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
287 * filtering, then the TX data descriptors must be preceded by a single TX
288 * context descriptor. Because there is no DMA transfer associated with the
289 * context descriptor, we allocate a control block with a special type which
290 * indicates to the TX ring recycle code that there are no associated DMA
291 * resources to unbind when the control block is free'd.
292 *
293 * If we don't have enough space in the ring or TX control blocks available,
294 * then we'll return the unprocessed message block to MAC. This will induce flow
295 * control and once we recycle enough entries, we'll once again enable sending
296 * on the ring.
297 *
298 * We size the working list as equal to the number of descriptors in the ring.
299 * We size the free list as equal to 1.5 times the number of descriptors in the
300 * ring. We'll allocate a number of TX control block entries equal to the number
301 * of entries in the free list. By default, all entries are placed in the free
302 * list. As we come along and try to send something, we'll allocate entries from
303 * the free list and add them to the working list, where they'll stay until the
304 * hardware indicates that all of the data has been written back to us. The
305 * reason that we start with 1.5x is to help facilitate having more than one TX
306 * buffer associated with the DMA activity.
307 *
308 * --------------------
309 * Managing the TX Ring
310 * --------------------
311 *
312 * The transmit descriptor ring is driven by us. We maintain our own notion of a
313 * HEAD and TAIL register and we update the hardware with updates to the TAIL
314 * register. When the hardware is done writing out data, it updates us by
315 * writing back to a specific address, not by updating the individual
316 * descriptors. That address is a 4-byte region after the main transmit
317 * descriptor ring. This is why the descriptor ring has an extra descriptor's
318 * worth allocated to it.
319 *
320 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
321 * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
322 * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
323 * points in time, through both interrupts, and our own internal checks, we'll
324 * sync the write-back head portion of the DMA space. Based on the index it
325 * reports back, we'll free everything between our current HEAD and the
326 * indicated index and update HEAD to the new index.
327 *
328 * When a frame comes in, we try to use a number of transmit control blocks and
329 * we'll transition them from the free list to the work list. They'll get moved
330 * to the entry on the work list that corresponds with the transmit descriptor
331 * they correspond to. Once we are indicated that the corresponding descriptor
332 * has been freed, we'll return it to the list.
333 *
334 * The transmit control block free list is managed by keeping track of the
335 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
336 * index into the free list and add things to it. In effect, we always push and
337 * pop from the tail and protect it with a single lock,
338 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
339 * stand up to further performance testing; however, it does allow us to get off
340 * the ground with the device driver.
341 *
342 * The following image describes where a given transmit control block lives in
343 * its lifetime:
344 *
345 * |
346 * * ... Initial placement for all tcb's
347 * |
348 * v
349 * +------------------+ +------------------+
350 * | tcb on free list |---*------------------>| tcb on work list |
351 * +------------------+ . +------------------+
352 * ^ . N tcbs allocated[1] |
353 * | to send frame v
354 * | or fragment on |
355 * | wire, mblk from |
356 * | MAC associated. |
357 * | |
358 * +------*-------------------------------<----+
359 * .
360 * . Hardware indicates
361 * entry transmitted.
362 * tcbs recycled, mblk
363 * from MAC freed.
364 *
365 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
366 * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA
367 * bind case, N can be 1 context descriptor plus 1 data descriptor per
368 * b_cont in the mblk. In this case, the mblk is associated with the first
369 * data descriptor and freed as part of freeing that data descriptor.
370 *
371 * ------------
372 * Blocking MAC
373 * ------------
374 *
375 * When performing transmit, we can run out of descriptors and ring entries.
376 * When such a case happens, we return the mblk_t to MAC to indicate that we've
377 * been blocked. At that point in time, MAC becomes blocked and will not
378 * transmit anything out that specific ring until we notify MAC. To indicate
379 * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
380 * to B_TRUE.
381 *
382 * When we recycle TX descriptors then we'll end up signaling MAC by calling
383 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
384 * start sending frames out to us again.
385 */
386
387 /*
388 * We set our DMA alignment requests based on the smallest supported page size
389 * of the corresponding platform.
390 */
391 #if defined(__sparc)
392 #define I40E_DMA_ALIGNMENT 0x2000ull
393 #elif defined(__x86)
394 #define I40E_DMA_ALIGNMENT 0x1000ull
395 #else
396 #error "unknown architecture for i40e"
397 #endif
398
399 /*
400 * This structure is used to maintain information and flags related to
401 * transmitting a frame. These fields are ultimately used to construct the
402 * TX data descriptor(s) and, if necessary, the TX context descriptor.
403 */
404 typedef struct i40e_tx_context {
405 enum i40e_tx_desc_cmd_bits itc_data_cmdflags;
406 uint32_t itc_data_offsets;
407 enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags;
408 uint32_t itc_ctx_tsolen;
409 uint32_t itc_ctx_mss;
410 } i40e_tx_context_t;
411
/*
 * DEBUG-only knob that overrides the threshold-based RX behaviour. The global
 * i40e_debug_rx_mode starts out as I40E_DEBUG_RX_DEFAULT, which leaves the
 * normal threshold logic in charge; the other two values name the bcopy and
 * DMA bind alternatives.
 */
#if defined(DEBUG)
typedef enum i40e_debug_rx {
	I40E_DEBUG_RX_DEFAULT	= 0,
	I40E_DEBUG_RX_BCOPY	= 1,
	I40E_DEBUG_RX_DMABIND	= 2
} i40e_debug_rx_t;

i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
#endif	/* DEBUG */
425
426 /*
427 * Notes on the following three DMA attributes. The first attribute,
428 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
429 * and the static buffers that we associate with control blocks. For this
430 * reason, we force an SGL length of one. While technically the driver supports
431 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
432 * management here. In addition, when the Intel common code wants to allocate
433 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
434 * the static dma attr.
435 *
436 * The latter two sets of attributes, are what we use when we're binding a
437 * bunch of mblk_t fragments to go out the door. Note that the main difference
438 * here is that we're allowed a larger SGL length. For non-LSO TX, we
439 * restrict the SGL length to match the number of TX buffers available to the
440 * PF (8). For the LSO case we can go much larger, with the caveat that each
441 * MSS-sized chunk (segment) must not span more than 8 data descriptors and
442 * hence must not span more than 8 cookies.
443 *
444 * Note, we default to setting ourselves to be DMA capable here. However,
445 * because we could have multiple instances which have different FMA error
446 * checking capabilities, or end up on different buses, we make these static
447 * and const and copy them into the i40e_t for the given device with the actual
448 * values that reflect the actual capabilities.
449 */
450 static const ddi_dma_attr_t i40e_g_static_dma_attr = {
451 DMA_ATTR_V0, /* version number */
452 0x0000000000000000ull, /* low address */
453 0xFFFFFFFFFFFFFFFFull, /* high address */
454 0x00000000FFFFFFFFull, /* dma counter max */
455 I40E_DMA_ALIGNMENT, /* alignment */
456 0x00000FFF, /* burst sizes */
457 0x00000001, /* minimum transfer size */
458 0x00000000FFFFFFFFull, /* maximum transfer size */
459 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
460 1, /* scatter/gather list length */
461 0x00000001, /* granularity */
462 DDI_DMA_FLAGERR /* DMA flags */
463 };
464
465 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
466 DMA_ATTR_V0, /* version number */
467 0x0000000000000000ull, /* low address */
468 0xFFFFFFFFFFFFFFFFull, /* high address */
469 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */
470 I40E_DMA_ALIGNMENT, /* alignment */
471 0x00000FFF, /* burst sizes */
472 0x00000001, /* minimum transfer size */
473 0x00000000FFFFFFFFull, /* maximum transfer size */
474 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
475 I40E_TX_MAX_COOKIE, /* scatter/gather list length */
476 0x00000001, /* granularity */
477 DDI_DMA_FLAGERR /* DMA flags */
478 };
479
480 static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
481 DMA_ATTR_V0, /* version number */
482 0x0000000000000000ull, /* low address */
483 0xFFFFFFFFFFFFFFFFull, /* high address */
484 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */
485 I40E_DMA_ALIGNMENT, /* alignment */
486 0x00000FFF, /* burst sizes */
487 0x00000001, /* minimum transfer size */
488 0x00000000FFFFFFFFull, /* maximum transfer size */
489 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
490 I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */
491 0x00000001, /* granularity */
492 DDI_DMA_FLAGERR /* DMA flags */
493 };
494
495 /*
496 * Next, we have the attributes for these structures. The descriptor rings are
497 * all strictly little endian, while the data buffers are just arrays of bytes
498 * representing frames. Because of this, we purposefully simplify the driver
499 * programming life by programming the descriptor ring as little endian, while
500 * for the buffer data we keep it as unstructured.
501 *
502 * Note, that to keep the Intel common code operating in a reasonable way, when
503 * we allocate DMA memory for it, we do not use byte swapping and thus use the
504 * standard i40e_buf_acc_attr.
505 */
506 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
507 DDI_DEVICE_ATTR_V0,
508 DDI_STRUCTURE_LE_ACC,
509 DDI_STRICTORDER_ACC
510 };
511
512 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
513 DDI_DEVICE_ATTR_V0,
514 DDI_NEVERSWAP_ACC,
515 DDI_STRICTORDER_ACC
516 };
517
/*
 * Descriptor index helpers. This function and i40e_prev_desc() are type-safe
 * stand-ins for what used to be macros; they remain inline to keep the data
 * path hot.
 *
 * i40e_next_desc() returns the index that lies count entries past base in a
 * ring of size entries. At most one wrap around the end of the ring is
 * applied; the inputs and the resulting index are sanity checked with
 * ASSERT().
 */
static inline int
i40e_next_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	/* Step forward, then fold back into the ring if we ran off the end. */
	idx = base + count;
	if (idx >= size)
		idx -= size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
542
/*
 * Return the index that lies count entries before base in a ring of size
 * entries. At most one wrap around the start of the ring is applied; the
 * inputs and the resulting index are sanity checked with ASSERT().
 */
static inline int
i40e_prev_desc(int base, int count, int size)
{
	int idx;

	ASSERT(base >= 0);
	ASSERT(count > 0);
	ASSERT(size > 0);

	/* Step backward, then fold into the ring if we fell off the front. */
	idx = base - count;
	if (idx < 0)
		idx += size;

	ASSERT(idx >= 0 && idx < size);
	return (idx);
}
561
562 /*
563 * Free DMA memory that is represented by a i40e_dma_buffer_t.
564 */
565 static void
i40e_free_dma_buffer(i40e_dma_buffer_t * dmap)566 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
567 {
568 if (dmap->dmab_dma_address != 0) {
569 VERIFY(dmap->dmab_dma_handle != NULL);
570 (void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
571 dmap->dmab_dma_address = 0;
572 dmap->dmab_size = 0;
573 }
574
575 if (dmap->dmab_acc_handle != NULL) {
576 ddi_dma_mem_free(&dmap->dmab_acc_handle);
577 dmap->dmab_acc_handle = NULL;
578 dmap->dmab_address = NULL;
579 }
580
581 if (dmap->dmab_dma_handle != NULL) {
582 ddi_dma_free_handle(&dmap->dmab_dma_handle);
583 dmap->dmab_dma_handle = NULL;
584 }
585
586 /*
587 * These should only be set if we have valid handles allocated and
588 * therefore should always be NULLed out due to the above code. This
589 * is here to catch us acting sloppy.
590 */
591 ASSERT(dmap->dmab_dma_address == 0);
592 ASSERT(dmap->dmab_address == NULL);
593 ASSERT(dmap->dmab_size == 0);
594 dmap->dmab_len = 0;
595 }
596
597 /*
598 * Allocate size bytes of DMA memory based on the passed in attributes. This
599 * fills in the information in dmap and is designed for all of our single cookie
600 * allocations.
601 */
602 static boolean_t
i40e_alloc_dma_buffer(i40e_t * i40e,i40e_dma_buffer_t * dmap,ddi_dma_attr_t * attrsp,ddi_device_acc_attr_t * accp,boolean_t stream,boolean_t zero,size_t size)603 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
604 ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
605 boolean_t zero, size_t size)
606 {
607 int ret;
608 uint_t flags;
609 size_t len;
610 ddi_dma_cookie_t cookie;
611 uint_t ncookies;
612
613 if (stream == B_TRUE)
614 flags = DDI_DMA_STREAMING;
615 else
616 flags = DDI_DMA_CONSISTENT;
617
618 /*
619 * Step one: Allocate the DMA handle
620 */
621 ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
622 NULL, &dmap->dmab_dma_handle);
623 if (ret != DDI_SUCCESS) {
624 i40e_error(i40e, "failed to allocate dma handle for I/O "
625 "buffers: %d", ret);
626 dmap->dmab_dma_handle = NULL;
627 return (B_FALSE);
628 }
629
630 /*
631 * Step two: Allocate the DMA memory
632 */
633 ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
634 DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
635 &dmap->dmab_acc_handle);
636 if (ret != DDI_SUCCESS) {
637 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
638 "buffers", size);
639 dmap->dmab_address = NULL;
640 dmap->dmab_acc_handle = NULL;
641 i40e_free_dma_buffer(dmap);
642 return (B_FALSE);
643 }
644
645 /*
646 * Step three: Optionally zero
647 */
648 if (zero == B_TRUE)
649 bzero(dmap->dmab_address, len);
650
651 /*
652 * Step four: Bind the memory
653 */
654 ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
655 dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
656 NULL, &cookie, &ncookies);
657 if (ret != DDI_DMA_MAPPED) {
658 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
659 "buffers: %d", size, ret);
660 i40e_free_dma_buffer(dmap);
661 return (B_FALSE);
662 }
663
664 VERIFY(ncookies == 1);
665 dmap->dmab_dma_address = cookie.dmac_laddress;
666 dmap->dmab_size = len;
667 dmap->dmab_len = 0;
668 return (B_TRUE);
669 }
670
671 /*
672 * This function is called once the last pending rcb has been freed by the upper
673 * levels of the system.
674 */
675 static void
i40e_free_rx_data(i40e_rx_data_t * rxd)676 i40e_free_rx_data(i40e_rx_data_t *rxd)
677 {
678 VERIFY(rxd->rxd_rcb_pending == 0);
679
680 if (rxd->rxd_rcb_area != NULL) {
681 kmem_free(rxd->rxd_rcb_area,
682 sizeof (i40e_rx_control_block_t) *
683 (rxd->rxd_free_list_size + rxd->rxd_ring_size));
684 rxd->rxd_rcb_area = NULL;
685 }
686
687 if (rxd->rxd_free_list != NULL) {
688 kmem_free(rxd->rxd_free_list,
689 sizeof (i40e_rx_control_block_t *) *
690 rxd->rxd_free_list_size);
691 rxd->rxd_free_list = NULL;
692 }
693
694 if (rxd->rxd_work_list != NULL) {
695 kmem_free(rxd->rxd_work_list,
696 sizeof (i40e_rx_control_block_t *) *
697 rxd->rxd_ring_size);
698 rxd->rxd_work_list = NULL;
699 }
700
701 kmem_free(rxd, sizeof (i40e_rx_data_t));
702 }
703
704 static boolean_t
i40e_alloc_rx_data(i40e_t * i40e,i40e_trqpair_t * itrq)705 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
706 {
707 i40e_rx_data_t *rxd;
708
709 rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
710 if (rxd == NULL)
711 return (B_FALSE);
712 itrq->itrq_rxdata = rxd;
713 rxd->rxd_i40e = i40e;
714
715 rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
716 rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
717
718 rxd->rxd_rcb_free = rxd->rxd_free_list_size;
719
720 rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
721 rxd->rxd_ring_size, KM_NOSLEEP);
722 if (rxd->rxd_work_list == NULL) {
723 i40e_error(i40e, "failed to allocate RX work list for a ring "
724 "of %d entries for ring %d", rxd->rxd_ring_size,
725 itrq->itrq_index);
726 goto cleanup;
727 }
728
729 rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
730 rxd->rxd_free_list_size, KM_NOSLEEP);
731 if (rxd->rxd_free_list == NULL) {
732 i40e_error(i40e, "failed to allocate a %d entry RX free list "
733 "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
734 goto cleanup;
735 }
736
737 rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
738 (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
739 if (rxd->rxd_rcb_area == NULL) {
740 i40e_error(i40e, "failed to allocate a %d entry rcb area for "
741 "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
742 itrq->itrq_index);
743 goto cleanup;
744 }
745
746 return (B_TRUE);
747
748 cleanup:
749 i40e_free_rx_data(rxd);
750 itrq->itrq_rxdata = NULL;
751 return (B_FALSE);
752 }
753
754 /*
755 * Free all of the memory that we've allocated for DMA. Note that we may have
756 * buffers that we've loaned up to the OS which are still outstanding. We'll
757 * always free up the descriptor ring, because we no longer need that. For each
758 * rcb, we'll iterate over it and if we send the reference count to zero, then
759 * we'll free the message block and DMA related resources. However, if we don't
760 * take the last one, then we'll go ahead and keep track that we'll have pending
761 * data and clean it up when we get there.
762 */
763 static void
i40e_free_rx_dma(i40e_rx_data_t * rxd,boolean_t failed_init)764 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
765 {
766 uint32_t i, count, ref;
767
768 i40e_rx_control_block_t *rcb;
769 i40e_t *i40e = rxd->rxd_i40e;
770
771 i40e_free_dma_buffer(&rxd->rxd_desc_area);
772 rxd->rxd_desc_ring = NULL;
773 rxd->rxd_desc_next = 0;
774
775 mutex_enter(&i40e->i40e_rx_pending_lock);
776
777 rcb = rxd->rxd_rcb_area;
778 count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
779
780 for (i = 0; i < count; i++, rcb++) {
781 VERIFY(rcb != NULL);
782
783 /*
784 * If we're cleaning up from a failed creation attempt, then an
785 * entry may never have been assembled which would mean that
786 * it's reference count is zero. If we find that, we leave it
787 * be, because nothing else should be modifying it at this
788 * point. We're not at the point that any more references can be
789 * added, just removed.
790 */
791 if (failed_init == B_TRUE && rcb->rcb_ref == 0)
792 continue;
793
794 ref = atomic_dec_32_nv(&rcb->rcb_ref);
795 if (ref == 0) {
796 freemsg(rcb->rcb_mp);
797 rcb->rcb_mp = NULL;
798 i40e_free_dma_buffer(&rcb->rcb_dma);
799 } else {
800 atomic_inc_32(&rxd->rxd_rcb_pending);
801 atomic_inc_32(&i40e->i40e_rx_pending);
802 }
803 }
804 mutex_exit(&i40e->i40e_rx_pending_lock);
805 }
806
807 /*
808 * Initialize the DMA memory for the descriptor ring and for each frame in the
809 * control block list.
810 */
811 static boolean_t
i40e_alloc_rx_dma(i40e_rx_data_t * rxd)812 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
813 {
814 int i, count;
815 size_t dmasz;
816 i40e_rx_control_block_t *rcb;
817 i40e_t *i40e = rxd->rxd_i40e;
818
819 /*
820 * First allocate the RX descriptor ring.
821 */
822 dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
823 VERIFY(dmasz > 0);
824 if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
825 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
826 B_TRUE, dmasz) == B_FALSE) {
827 i40e_error(i40e, "failed to allocate DMA resources "
828 "for RX descriptor ring");
829 return (B_FALSE);
830 }
831 rxd->rxd_desc_ring =
832 (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
833 rxd->rxd_desc_next = 0;
834
835 count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
836 rcb = rxd->rxd_rcb_area;
837
838 dmasz = i40e->i40e_rx_buf_size;
839 VERIFY(dmasz > 0);
840 for (i = 0; i < count; i++, rcb++) {
841 i40e_dma_buffer_t *dmap;
842 VERIFY(rcb != NULL);
843
844 if (i < rxd->rxd_ring_size) {
845 rxd->rxd_work_list[i] = rcb;
846 } else {
847 rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
848 }
849
850 dmap = &rcb->rcb_dma;
851 if (i40e_alloc_dma_buffer(i40e, dmap,
852 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
853 B_TRUE, B_FALSE, dmasz) == B_FALSE) {
854 i40e_error(i40e, "failed to allocate RX dma buffer");
855 return (B_FALSE);
856 }
857
858 /*
859 * Initialize the control block and offset the DMA address. See
860 * the note in the big theory statement that explains how this
861 * helps IP deal with alignment. Note, we don't worry about
862 * whether or not we successfully get an mblk_t from desballoc,
863 * it's a common case that we have to handle later on in the
864 * system.
865 */
866 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
867 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
868 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
869
870 rcb->rcb_ref = 1;
871 rcb->rcb_rxd = rxd;
872 rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
873 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
874 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
875 dmap->dmab_size, 0, &rcb->rcb_free_rtn);
876 }
877
878 return (B_TRUE);
879 }
880
881 static void
i40e_free_tx_dma(i40e_trqpair_t * itrq)882 i40e_free_tx_dma(i40e_trqpair_t *itrq)
883 {
884 size_t fsz;
885
886 if (itrq->itrq_tcb_area != NULL) {
887 uint32_t i;
888 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
889
890 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
891 i40e_free_dma_buffer(&tcb->tcb_dma);
892 if (tcb->tcb_dma_handle != NULL) {
893 ddi_dma_free_handle(&tcb->tcb_dma_handle);
894 tcb->tcb_dma_handle = NULL;
895 }
896 if (tcb->tcb_lso_dma_handle != NULL) {
897 ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
898 tcb->tcb_lso_dma_handle = NULL;
899 }
900 }
901
902 fsz = sizeof (i40e_tx_control_block_t) *
903 itrq->itrq_tx_free_list_size;
904 kmem_free(itrq->itrq_tcb_area, fsz);
905 itrq->itrq_tcb_area = NULL;
906 }
907
908 if (itrq->itrq_tcb_free_list != NULL) {
909 fsz = sizeof (i40e_tx_control_block_t *) *
910 itrq->itrq_tx_free_list_size;
911 kmem_free(itrq->itrq_tcb_free_list, fsz);
912 itrq->itrq_tcb_free_list = NULL;
913 }
914
915 if (itrq->itrq_tcb_work_list != NULL) {
916 fsz = sizeof (i40e_tx_control_block_t *) *
917 itrq->itrq_tx_ring_size;
918 kmem_free(itrq->itrq_tcb_work_list, fsz);
919 itrq->itrq_tcb_work_list = NULL;
920 }
921
922 i40e_free_dma_buffer(&itrq->itrq_desc_area);
923 itrq->itrq_desc_ring = NULL;
924
925 }
926
927 static boolean_t
i40e_alloc_tx_dma(i40e_trqpair_t * itrq)928 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
929 {
930 int i, ret;
931 size_t dmasz;
932 i40e_tx_control_block_t *tcb;
933 i40e_t *i40e = itrq->itrq_i40e;
934
935 itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
936 itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
937 (i40e->i40e_tx_ring_size >> 1);
938
939 /*
940 * Allocate an additional TX descriptor for the writeback head.
941 */
942 dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
943 dmasz += sizeof (i40e_tx_desc_t);
944
945 VERIFY(dmasz > 0);
946 if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
947 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
948 B_FALSE, B_TRUE, dmasz) == B_FALSE) {
949 i40e_error(i40e, "failed to allocate DMA resources for TX "
950 "descriptor ring");
951 return (B_FALSE);
952 }
953 itrq->itrq_desc_ring =
954 (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
955 itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
956 itrq->itrq_tx_ring_size);
957 itrq->itrq_desc_head = 0;
958 itrq->itrq_desc_tail = 0;
959 itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
960
961 itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
962 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
963 if (itrq->itrq_tcb_work_list == NULL) {
964 i40e_error(i40e, "failed to allocate a %d entry TX work list "
965 "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
966 goto cleanup;
967 }
968
969 itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
970 sizeof (i40e_tx_control_block_t *), KM_SLEEP);
971 if (itrq->itrq_tcb_free_list == NULL) {
972 i40e_error(i40e, "failed to allocate a %d entry TX free list "
973 "for ring %d", itrq->itrq_tx_free_list_size,
974 itrq->itrq_index);
975 goto cleanup;
976 }
977
978 /*
979 * We allocate enough TX control blocks to cover the free list.
980 */
981 itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
982 itrq->itrq_tx_free_list_size, KM_NOSLEEP);
983 if (itrq->itrq_tcb_area == NULL) {
984 i40e_error(i40e, "failed to allocate a %d entry tcb area for "
985 "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
986 goto cleanup;
987 }
988
989 /*
990 * For each tcb, allocate DMA memory.
991 */
992 dmasz = i40e->i40e_tx_buf_size;
993 VERIFY(dmasz > 0);
994 tcb = itrq->itrq_tcb_area;
995 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
996 VERIFY(tcb != NULL);
997
998 /*
999 * Allocate both a DMA buffer which we'll use for when we copy
1000 * packets for transmission and allocate a DMA handle which
1001 * we'll use when we bind data.
1002 */
1003 ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1004 &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
1005 &tcb->tcb_dma_handle);
1006 if (ret != DDI_SUCCESS) {
1007 i40e_error(i40e, "failed to allocate DMA handle for TX "
1008 "data binding on ring %d: %d", itrq->itrq_index,
1009 ret);
1010 tcb->tcb_dma_handle = NULL;
1011 goto cleanup;
1012 }
1013
1014 ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1015 &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
1016 &tcb->tcb_lso_dma_handle);
1017 if (ret != DDI_SUCCESS) {
1018 i40e_error(i40e, "failed to allocate DMA handle for TX "
1019 "LSO data binding on ring %d: %d", itrq->itrq_index,
1020 ret);
1021 tcb->tcb_lso_dma_handle = NULL;
1022 goto cleanup;
1023 }
1024
1025 if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
1026 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
1027 B_TRUE, B_FALSE, dmasz) == B_FALSE) {
1028 i40e_error(i40e, "failed to allocate %ld bytes of "
1029 "DMA for TX data binding on ring %d", dmasz,
1030 itrq->itrq_index);
1031 goto cleanup;
1032 }
1033
1034 itrq->itrq_tcb_free_list[i] = tcb;
1035 }
1036
1037 itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
1038
1039 return (B_TRUE);
1040
1041 cleanup:
1042 i40e_free_tx_dma(itrq);
1043 return (B_FALSE);
1044 }
1045
1046 /*
1047 * Free all memory associated with a ring. Note, this is done as part of
1048 * the GLDv3 ring stop routine.
1049 */
1050 void
i40e_free_ring_mem(i40e_trqpair_t * itrq,boolean_t failed_init)1051 i40e_free_ring_mem(i40e_trqpair_t *itrq, boolean_t failed_init)
1052 {
1053 i40e_t *i40e = itrq->itrq_i40e;
1054 i40e_rx_data_t *rxd = itrq->itrq_rxdata;
1055
1056 /*
1057 * In some cases i40e_alloc_rx_data() may have failed
1058 * and in that case there is no rxd to free.
1059 */
1060 if (rxd == NULL)
1061 return;
1062
1063 /*
1064 * Clean up our RX data. We have to free DMA resources first and
1065 * then if we have no more pending RCB's, then we'll go ahead
1066 * and clean things up. Note, we can't set the stopped flag on
1067 * the RX data until after we've done the first pass of the
1068 * pending resources. Otherwise we might race with
1069 * i40e_rx_recycle on determining who should free the
1070 * i40e_rx_data_t above.
1071 */
1072 i40e_free_rx_dma(rxd, failed_init);
1073
1074 mutex_enter(&i40e->i40e_rx_pending_lock);
1075 rxd->rxd_shutdown = B_TRUE;
1076 if (rxd->rxd_rcb_pending == 0) {
1077 i40e_free_rx_data(rxd);
1078 itrq->itrq_rxdata = NULL;
1079 }
1080 mutex_exit(&i40e->i40e_rx_pending_lock);
1081
1082 i40e_free_tx_dma(itrq);
1083 }
1084
1085 /*
1086 * Allocate all of the resources associated with a ring.
1087 * Note this is done as part of the GLDv3 ring start routine.
1088 * This takes care of both DMA and non-DMA related resources.
1089 */
1090 boolean_t
i40e_alloc_ring_mem(i40e_trqpair_t * itrq)1091 i40e_alloc_ring_mem(i40e_trqpair_t *itrq)
1092 {
1093 if (!i40e_alloc_rx_data(itrq->itrq_i40e, itrq))
1094 goto free;
1095
1096 if (!i40e_alloc_rx_dma(itrq->itrq_rxdata))
1097 goto free;
1098
1099 if (!i40e_alloc_tx_dma(itrq))
1100 goto free;
1101
1102 return (B_TRUE);
1103
1104 free:
1105 i40e_free_ring_mem(itrq, B_TRUE);
1106 return (B_FALSE);
1107 }
1108
1109
1110 /*
1111 * Because every instance of i40e may have different support for FMA
1112 * capabilities, we copy the DMA attributes into the i40e_t and set them that
1113 * way and use them for determining attributes.
1114 */
1115 void
i40e_init_dma_attrs(i40e_t * i40e,boolean_t fma)1116 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1117 {
1118 bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1119 sizeof (ddi_dma_attr_t));
1120 bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1121 sizeof (ddi_dma_attr_t));
1122 bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
1123 sizeof (ddi_dma_attr_t));
1124 bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1125 sizeof (ddi_device_acc_attr_t));
1126 bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1127 sizeof (ddi_device_acc_attr_t));
1128
1129 if (fma == B_TRUE) {
1130 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1131 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1132 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
1133 DDI_DMA_FLAGERR;
1134 } else {
1135 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1136 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1137 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
1138 ~DDI_DMA_FLAGERR;
1139 }
1140 }
1141
1142 static void
i40e_rcb_free(i40e_rx_data_t * rxd,i40e_rx_control_block_t * rcb)1143 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1144 {
1145 mutex_enter(&rxd->rxd_free_lock);
1146 ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1147 ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1148 rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1149 rxd->rxd_rcb_free++;
1150 mutex_exit(&rxd->rxd_free_lock);
1151 }
1152
1153 static i40e_rx_control_block_t *
i40e_rcb_alloc(i40e_rx_data_t * rxd)1154 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1155 {
1156 i40e_rx_control_block_t *rcb;
1157
1158 mutex_enter(&rxd->rxd_free_lock);
1159 if (rxd->rxd_rcb_free == 0) {
1160 mutex_exit(&rxd->rxd_free_lock);
1161 return (NULL);
1162 }
1163 rxd->rxd_rcb_free--;
1164 rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1165 VERIFY(rcb != NULL);
1166 rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1167 mutex_exit(&rxd->rxd_free_lock);
1168
1169 return (rcb);
1170 }
1171
1172 /*
1173 * This is the callback that we get from the OS when freemsg(9F) has been called
1174 * on a loaned descriptor. In addition, if we take the last reference count
1175 * here, then we have to tear down all of the RX data.
1176 */
1177 void
i40e_rx_recycle(caddr_t arg)1178 i40e_rx_recycle(caddr_t arg)
1179 {
1180 uint32_t ref;
1181 i40e_rx_control_block_t *rcb;
1182 i40e_rx_data_t *rxd;
1183 i40e_t *i40e;
1184
1185 /* LINTED: E_BAD_PTR_CAST_ALIGN */
1186 rcb = (i40e_rx_control_block_t *)arg;
1187 rxd = rcb->rcb_rxd;
1188 i40e = rxd->rxd_i40e;
1189
1190 /*
1191 * It's possible for this to be called with a reference count of zero.
1192 * That will happen when we're doing the freemsg after taking the last
1193 * reference because we're tearing down everything and this rcb is not
1194 * outstanding.
1195 */
1196 if (rcb->rcb_ref == 0)
1197 return;
1198
1199 /*
1200 * Don't worry about failure of desballoc here. It'll only become fatal
1201 * if we're trying to use it and we can't in i40e_rx_bind().
1202 */
1203 rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1204 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1205 i40e_rcb_free(rxd, rcb);
1206
1207 /*
1208 * It's possible that the rcb was being used while we are shutting down
1209 * the device. In that case, we'll take the final reference from the
1210 * device here.
1211 */
1212 ref = atomic_dec_32_nv(&rcb->rcb_ref);
1213 if (ref == 0) {
1214 freemsg(rcb->rcb_mp);
1215 rcb->rcb_mp = NULL;
1216 i40e_free_dma_buffer(&rcb->rcb_dma);
1217
1218 mutex_enter(&i40e->i40e_rx_pending_lock);
1219 atomic_dec_32(&rxd->rxd_rcb_pending);
1220 atomic_dec_32(&i40e->i40e_rx_pending);
1221
1222 /*
1223 * If this was the last block and it's been indicated that we've
1224 * passed the shutdown point, we should clean up.
1225 */
1226 if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
1227 i40e_free_rx_data(rxd);
1228 cv_broadcast(&i40e->i40e_rx_pending_cv);
1229 }
1230
1231 mutex_exit(&i40e->i40e_rx_pending_lock);
1232 }
1233 }
1234
1235 static mblk_t *
i40e_rx_bind(i40e_trqpair_t * itrq,i40e_rx_data_t * rxd,uint32_t index,uint32_t plen)1236 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1237 uint32_t plen)
1238 {
1239 mblk_t *mp;
1240 i40e_t *i40e = rxd->rxd_i40e;
1241 i40e_rx_control_block_t *rcb, *rep_rcb;
1242
1243 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1244
1245 if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
1246 itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
1247 return (NULL);
1248 }
1249
1250 rcb = rxd->rxd_work_list[index];
1251
1252 /*
1253 * Check to make sure we have a mblk_t. If we don't, this is our last
1254 * chance to try and get one.
1255 */
1256 if (rcb->rcb_mp == NULL) {
1257 rcb->rcb_mp =
1258 desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1259 rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1260 if (rcb->rcb_mp == NULL) {
1261 itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
1262 i40e_rcb_free(rxd, rcb);
1263 return (NULL);
1264 }
1265 }
1266
1267 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1268
1269 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1270 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1271 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1272 i40e_rcb_free(rxd, rcb);
1273 return (NULL);
1274 }
1275
1276 /*
1277 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
1278 */
1279 mp = rcb->rcb_mp;
1280 atomic_inc_32(&rcb->rcb_ref);
1281 mp->b_wptr = mp->b_rptr + plen;
1282 mp->b_next = mp->b_cont = NULL;
1283
1284 rxd->rxd_work_list[index] = rep_rcb;
1285 return (mp);
1286 }
1287
1288 /*
1289 * We're going to allocate a new message block for this frame and attempt to
1290 * receive it. See the big theory statement for more information on when we copy
1291 * versus bind.
1292 */
1293 static mblk_t *
i40e_rx_copy(i40e_trqpair_t * itrq,i40e_rx_data_t * rxd,uint32_t index,uint32_t plen)1294 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1295 uint32_t plen)
1296 {
1297 i40e_t *i40e = rxd->rxd_i40e;
1298 i40e_rx_control_block_t *rcb;
1299 mblk_t *mp;
1300
1301 ASSERT(index < rxd->rxd_ring_size);
1302 rcb = rxd->rxd_work_list[index];
1303
1304 I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1305
1306 if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1307 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1308 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1309 return (NULL);
1310 }
1311
1312 mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
1313 if (mp == NULL) {
1314 itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
1315 return (NULL);
1316 }
1317
1318 mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
1319 bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
1320 mp->b_wptr = mp->b_rptr + plen;
1321
1322 return (mp);
1323 }
1324
1325 /*
1326 * Determine if the device has enabled any checksum flags for us. The level of
1327 * checksum computed will depend on the type packet that we have, which is
1328 * contained in ptype. For example, the checksum logic it does will vary
1329 * depending on whether or not the packet is considered tunneled, whether it
1330 * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
1331 * valid.
1332 *
1333 * While there are additional checksums that we could recognize here, we'll need
1334 * to get some additional GLDv3 enhancements to be able to properly describe
1335 * them.
1336 */
1337 static void
i40e_rx_hcksum(i40e_trqpair_t * itrq,mblk_t * mp,uint64_t status,uint32_t err,uint32_t ptype)1338 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
1339 uint32_t ptype)
1340 {
1341 uint32_t cksum;
1342 struct i40e_rx_ptype_decoded pinfo;
1343
1344 ASSERT(ptype <= 255);
1345 pinfo = decode_rx_desc_ptype(ptype);
1346
1347 cksum = 0;
1348
1349 /*
1350 * If the ptype isn't something that we know in the driver, then we
1351 * shouldn't even consider moving forward.
1352 */
1353 if (pinfo.known == 0) {
1354 itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
1355 return;
1356 }
1357
1358 /*
1359 * If hardware didn't set the L3L4P bit on the frame, then there is no
1360 * checksum offload to consider.
1361 */
1362 if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
1363 itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
1364 return;
1365 }
1366
1367 /*
1368 * The device tells us that IPv6 checksums where a Destination Options
1369 * Header or a Routing header shouldn't be trusted. Discard all
1370 * checksums in this case.
1371 */
1372 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1373 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
1374 (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
1375 itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
1376 return;
1377 }
1378
1379 /*
1380 * The hardware denotes three kinds of possible errors. Two are reserved
1381 * for inner and outer IP checksum errors (IPE and EIPE) and the latter
1382 * is for L4 checksum errors (L4E). If there is only one IP header, then
1383 * the only thing that we care about is IPE. Note that since we don't
1384 * support inner checksums, we will ignore IPE being set on tunneled
1385 * packets and only care about EIPE.
1386 */
1387 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1388 pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1389 if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
1390 if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
1391 itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
1392 } else {
1393 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1394 cksum |= HCK_IPV4_HDRCKSUM_OK;
1395 }
1396 } else {
1397 if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
1398 itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
1399 } else {
1400 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1401 cksum |= HCK_IPV4_HDRCKSUM_OK;
1402 }
1403 }
1404 }
1405
1406 /*
1407 * We only have meaningful L4 checksums in the case of IP->L4 and
1408 * IP->IP->L4. There is not outer L4 checksum data available in any
1409 * other case. Further, we don't bother reporting the valid checksum in
1410 * the case of IP->IP->L4 set.
1411 */
1412 if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1413 pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
1414 (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
1415 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
1416 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
1417 pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
1418 ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
1419 if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
1420 itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
1421 } else {
1422 itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
1423 cksum |= HCK_FULLCKSUM_OK;
1424 }
1425 }
1426
1427 if (cksum != 0) {
1428 itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
1429 mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
1430 } else {
1431 itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
1432 }
1433 }
1434
1435 mblk_t *
i40e_ring_rx(i40e_trqpair_t * itrq,int poll_bytes)1436 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
1437 {
1438 i40e_t *i40e;
1439 i40e_hw_t *hw;
1440 i40e_rx_data_t *rxd;
1441 uint32_t cur_head;
1442 i40e_rx_desc_t *cur_desc;
1443 i40e_rx_control_block_t *rcb;
1444 uint64_t rx_bytes, rx_frames;
1445 uint64_t stword;
1446 mblk_t *mp, *mp_head, **mp_tail;
1447
1448 ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1449 rxd = itrq->itrq_rxdata;
1450 i40e = itrq->itrq_i40e;
1451 hw = &i40e->i40e_hw_space;
1452
1453 if (!(i40e->i40e_state & I40E_STARTED) ||
1454 (i40e->i40e_state & I40E_OVERTEMP) ||
1455 (i40e->i40e_state & I40E_SUSPENDED) ||
1456 (i40e->i40e_state & I40E_ERROR))
1457 return (NULL);
1458
1459 /*
1460 * Before we do anything else, we have to make sure that all of the DMA
1461 * buffers are synced up and then check to make sure that they're
1462 * actually good from an FM perspective.
1463 */
1464 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
1465 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1466 DDI_FM_OK) {
1467 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1468 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1469 return (NULL);
1470 }
1471
1472 /*
1473 * Prepare our stats. We do a limited amount of processing in both
1474 * polling and interrupt context. The limit in interrupt context is
1475 * based on frames, in polling context based on bytes.
1476 */
1477 rx_bytes = rx_frames = 0;
1478 mp_head = NULL;
1479 mp_tail = &mp_head;
1480
1481 /*
1482 * At this point, the descriptor ring is available to check. We'll try
1483 * and process until we either run out of poll_bytes or descriptors.
1484 */
1485 cur_head = rxd->rxd_desc_next;
1486 cur_desc = &rxd->rxd_desc_ring[cur_head];
1487 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1488
1489 /*
1490 * Note, the primary invariant of this loop should be that cur_head,
1491 * cur_desc, and stword always point to the currently processed
1492 * descriptor. When we leave the loop, it should point to a descriptor
1493 * that HAS NOT been processed. Meaning, that if we haven't consumed the
1494 * frame, the descriptor should not be advanced.
1495 */
1496 while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
1497 uint32_t error, eop, plen, ptype;
1498
1499 /*
1500 * The DD, PLEN, and EOP bits are the only ones that are valid
1501 * in every frame. The error information is only valid when EOP
1502 * is set in the same frame.
1503 *
1504 * At this time, because we don't do any LRO or header
1505 * splitting. We expect that every frame should have EOP set in
1506 * it. When later functionality comes in, we'll want to
1507 * re-evaluate this.
1508 */
1509 eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
1510 VERIFY(eop != 0);
1511
1512 error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
1513 I40E_RXD_QW1_ERROR_SHIFT;
1514 if (error & I40E_RX_ERR_BITS) {
1515 itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
1516 goto discard;
1517 }
1518
1519 plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1520 I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1521
1522 ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
1523 I40E_RXD_QW1_PTYPE_SHIFT;
1524
1525 /*
1526 * This packet contains valid data. We should check to see if
1527 * we're actually going to consume it based on its length (to
1528 * ensure that we don't overshoot our quota). We determine
1529 * whether to bcopy or bind the DMA resources based on the size
1530 * of the frame. However, if on debug, we allow it to be
1531 * overridden for testing purposes.
1532 *
1533 * We should be smarter about this and do DMA binding for
1534 * larger frames, but for now, it's really more important that
1535 * we actually just get something simple working.
1536 */
1537
1538 /*
1539 * Ensure we don't exceed our polling quota by reading this
1540 * frame. Note we only bump bytes now, we bump frames later.
1541 */
1542 if ((poll_bytes != I40E_POLL_NULL) &&
1543 (rx_bytes + plen) > poll_bytes)
1544 break;
1545 rx_bytes += plen;
1546
1547 mp = NULL;
1548 if (plen >= i40e->i40e_rx_dma_min)
1549 mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
1550 if (mp == NULL)
1551 mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
1552
1553 if (mp != NULL) {
1554 if (i40e->i40e_rx_hcksum_enable)
1555 i40e_rx_hcksum(itrq, mp, stword, error, ptype);
1556 *mp_tail = mp;
1557 mp_tail = &mp->b_next;
1558 }
1559
1560 /*
1561 * Now we need to prepare this frame for use again. See the
1562 * discussion in the big theory statements.
1563 *
1564 * However, right now we're doing the simple version of this.
1565 * Normally what we'd do would depend on whether or not we were
1566 * doing DMA binding or bcopying. But because we're always doing
1567 * bcopying, we can just always use the current index as a key
1568 * for what to do and reassign the buffer based on the ring.
1569 */
1570 discard:
1571 rcb = rxd->rxd_work_list[cur_head];
1572 cur_desc->read.pkt_addr =
1573 CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
1574 cur_desc->read.hdr_addr = 0;
1575
1576 /*
1577 * Finally, update our loop invariants.
1578 */
1579 cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
1580 cur_desc = &rxd->rxd_desc_ring[cur_head];
1581 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1582
1583 /*
1584 * To help provide liveness, we limit the amount of data that
1585 * we'll end up counting. Note that in these cases, an interrupt
1586 * is not dissimilar from a polling request.
1587 */
1588 rx_frames++;
1589 if (rx_frames > i40e->i40e_rx_limit_per_intr) {
1590 itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
1591 break;
1592 }
1593 }
1594
1595 /*
1596 * As we've modified the ring, we need to make sure that we sync the
1597 * descriptor ring for the device. Next, we update the hardware and
1598 * update our notion of where the head for us to read from hardware is
1599 * next.
1600 */
1601 I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
1602 if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1603 DDI_FM_OK) {
1604 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1605 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1606 }
1607
1608 if (rx_frames != 0) {
1609 uint32_t tail;
1610 ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
1611 rxd->rxd_desc_next = cur_head;
1612 tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
1613
1614 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
1615 if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
1616 ddi_fm_service_impact(i40e->i40e_dip,
1617 DDI_SERVICE_DEGRADED);
1618 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1619 }
1620
1621 itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
1622 itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
1623 }
1624
1625 #ifdef DEBUG
1626 if (rx_frames == 0) {
1627 ASSERT(rx_bytes == 0);
1628 }
1629 #endif
1630
1631 return (mp_head);
1632 }
1633
1634 /*
1635 * This function is called by the GLDv3 when it wants to poll on a ring. The
1636 * only primary difference from when we call this during an interrupt is that we
1637 * have a limit on the number of bytes that we should consume.
1638 */
1639 mblk_t *
i40e_ring_rx_poll(void * arg,int poll_bytes)1640 i40e_ring_rx_poll(void *arg, int poll_bytes)
1641 {
1642 i40e_trqpair_t *itrq = arg;
1643 mblk_t *mp;
1644
1645 ASSERT(poll_bytes > 0);
1646 if (poll_bytes == 0)
1647 return (NULL);
1648
1649 mutex_enter(&itrq->itrq_rx_lock);
1650 mp = i40e_ring_rx(itrq, poll_bytes);
1651 mutex_exit(&itrq->itrq_rx_lock);
1652
1653 return (mp);
1654 }
1655
1656 /*
1657 * Attempt to put together the information we'll need to feed into a descriptor
1658 * to properly program the hardware for checksum offload as well as the
1659 * generally required flags.
1660 *
1661 * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
1662 * 'or' into the descriptor based on the checksum flags for this mblk_t and the
1663 * actual information we care about.
1664 *
1665 * If the mblk requires LSO then we'll also gather the information that will be
1666 * used to construct the Transmit Context Descriptor.
1667 */
1668 static int
i40e_tx_context(i40e_t * i40e,i40e_trqpair_t * itrq,mblk_t * mp,mac_ether_offload_info_t * meo,i40e_tx_context_t * tctx)1669 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1670 mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
1671 {
1672 uint32_t chkflags, start, mss, lsoflags;
1673 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1674
1675 bzero(tctx, sizeof (i40e_tx_context_t));
1676
1677 if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1678 return (0);
1679
1680 mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
1681 mac_lso_get(mp, &mss, &lsoflags);
1682
1683 if (chkflags == 0 && lsoflags == 0)
1684 return (0);
1685
1686 /*
1687 * Have we been asked to checksum an IPv4 header. If so, verify that we
1688 * have sufficient information and then set the proper fields in the
1689 * command structure.
1690 */
1691 if (chkflags & HCK_IPV4_HDRCKSUM) {
1692 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1693 txs->itxs_hck_nol2info.value.ui64++;
1694 return (-1);
1695 }
1696 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1697 txs->itxs_hck_nol3info.value.ui64++;
1698 return (-1);
1699 }
1700 if (meo->meoi_l3proto != ETHERTYPE_IP) {
1701 txs->itxs_hck_badl3.value.ui64++;
1702 return (-1);
1703 }
1704 tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1705 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1706 I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1707 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1708 I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1709 }
1710
1711 /*
1712 * We've been asked to provide an L4 header, first, set up the IP
1713 * information in the descriptor if we haven't already before moving
1714 * onto seeing if we have enough information for the L4 checksum
1715 * offload.
1716 */
1717 if (chkflags & HCK_PARTIALCKSUM) {
1718 if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
1719 txs->itxs_hck_nol4info.value.ui64++;
1720 return (-1);
1721 }
1722
1723 if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
1724 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1725 txs->itxs_hck_nol2info.value.ui64++;
1726 return (-1);
1727 }
1728 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1729 txs->itxs_hck_nol3info.value.ui64++;
1730 return (-1);
1731 }
1732
1733 if (meo->meoi_l3proto == ETHERTYPE_IP) {
1734 tctx->itc_data_cmdflags |=
1735 I40E_TX_DESC_CMD_IIPT_IPV4;
1736 } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
1737 tctx->itc_data_cmdflags |=
1738 I40E_TX_DESC_CMD_IIPT_IPV6;
1739 } else {
1740 txs->itxs_hck_badl3.value.ui64++;
1741 return (-1);
1742 }
1743 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1744 I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1745 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1746 I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1747 }
1748
1749 switch (meo->meoi_l4proto) {
1750 case IPPROTO_TCP:
1751 tctx->itc_data_cmdflags |=
1752 I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1753 break;
1754 case IPPROTO_UDP:
1755 tctx->itc_data_cmdflags |=
1756 I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1757 break;
1758 case IPPROTO_SCTP:
1759 tctx->itc_data_cmdflags |=
1760 I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1761 break;
1762 default:
1763 txs->itxs_hck_badl4.value.ui64++;
1764 return (-1);
1765 }
1766
1767 tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
1768 I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1769 }
1770
1771 if (lsoflags & HW_LSO) {
1772 /*
1773 * LSO requires that checksum offloads are enabled. If for
1774 * some reason they're not we bail out with an error.
1775 */
1776 if ((meo->meoi_l3proto == ETHERTYPE_IP &&
1777 (chkflags & HCK_IPV4_HDRCKSUM) == 0) ||
1778 (chkflags & HCK_PARTIALCKSUM) == 0) {
1779 txs->itxs_lso_nohck.value.ui64++;
1780 return (-1);
1781 }
1782
1783 tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
1784 tctx->itc_ctx_mss = mss;
1785 tctx->itc_ctx_tsolen = msgsize(mp) -
1786 (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
1787 }
1788
1789 return (0);
1790 }
1791
1792 static void
i40e_tcb_free(i40e_trqpair_t * itrq,i40e_tx_control_block_t * tcb)1793 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1794 {
1795 ASSERT(tcb != NULL);
1796
1797 mutex_enter(&itrq->itrq_tcb_lock);
1798 ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1799 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1800 itrq->itrq_tcb_free++;
1801 mutex_exit(&itrq->itrq_tcb_lock);
1802 }
1803
1804 static i40e_tx_control_block_t *
i40e_tcb_alloc(i40e_trqpair_t * itrq)1805 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1806 {
1807 i40e_tx_control_block_t *ret;
1808
1809 mutex_enter(&itrq->itrq_tcb_lock);
1810 if (itrq->itrq_tcb_free == 0) {
1811 mutex_exit(&itrq->itrq_tcb_lock);
1812 return (NULL);
1813 }
1814
1815 itrq->itrq_tcb_free--;
1816 ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
1817 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
1818 mutex_exit(&itrq->itrq_tcb_lock);
1819
1820 ASSERT(ret != NULL);
1821 return (ret);
1822 }
1823
1824 /*
1825 * This should be used to free any DMA resources, associated mblk_t's, etc. It's
1826 * used as part of recycling the message blocks when we have either an interrupt
1827 * or other activity that indicates that we need to take a look.
1828 */
1829 static void
i40e_tcb_reset(i40e_tx_control_block_t * tcb)1830 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1831 {
1832 switch (tcb->tcb_type) {
1833 case I40E_TX_COPY:
1834 tcb->tcb_dma.dmab_len = 0;
1835 break;
1836 case I40E_TX_DMA:
1837 if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
1838 (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
1839 else if (tcb->tcb_bind_ncookies > 0)
1840 (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
1841 if (tcb->tcb_bind_info != NULL) {
1842 kmem_free(tcb->tcb_bind_info,
1843 tcb->tcb_bind_ncookies *
1844 sizeof (struct i40e_dma_bind_info));
1845 }
1846 tcb->tcb_bind_info = NULL;
1847 tcb->tcb_bind_ncookies = 0;
1848 tcb->tcb_used_lso = B_FALSE;
1849 break;
1850 case I40E_TX_DESC:
1851 break;
1852 case I40E_TX_NONE:
1853 /* Cast to pacify lint */
1854 panic("trying to free tcb %p with bad type none", (void *)tcb);
1855 default:
1856 panic("unknown i40e tcb type: %d", tcb->tcb_type);
1857 }
1858
1859 tcb->tcb_type = I40E_TX_NONE;
1860 if (tcb->tcb_mp != NULL) {
1861 freemsg(tcb->tcb_mp);
1862 tcb->tcb_mp = NULL;
1863 }
1864 tcb->tcb_next = NULL;
1865 }
1866
1867 /*
1868 * This is called as part of shutting down to clean up all outstanding
1869 * descriptors. Similar to recycle, except we don't re-arm anything and instead
1870 * just return control blocks to the free list.
1871 */
1872 void
i40e_tx_cleanup_ring(i40e_trqpair_t * itrq)1873 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
1874 {
1875 uint32_t index;
1876
1877 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
1878 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1879
1880 /*
1881 * Because we should have shut down the chip at this point, it should be
1882 * safe to just clean up all the entries between our head and tail.
1883 */
1884 #ifdef DEBUG
1885 index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
1886 I40E_QTX_ENA(itrq->itrq_index));
1887 VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1888 I40E_QTX_ENA_QENA_STAT_MASK));
1889 #endif
1890
1891 index = itrq->itrq_desc_head;
1892 while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1893 i40e_tx_control_block_t *tcb;
1894
1895 tcb = itrq->itrq_tcb_work_list[index];
1896 if (tcb != NULL) {
1897 itrq->itrq_tcb_work_list[index] = NULL;
1898 i40e_tcb_reset(tcb);
1899 i40e_tcb_free(itrq, tcb);
1900 }
1901
1902 bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1903 index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1904 itrq->itrq_desc_free++;
1905 }
1906
1907 ASSERT(index == itrq->itrq_desc_tail);
1908 itrq->itrq_desc_head = index;
1909 }
1910
1911 /*
1912 * We're here either by hook or by crook. We need to see if there are transmit
1913 * descriptors available for us to go and clean up and return to the hardware.
1914 * We may also be blocked, and if so, we should make sure that we let it know
1915 * we're good to go.
1916 */
1917 void
i40e_tx_recycle_ring(i40e_trqpair_t * itrq)1918 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1919 {
1920 uint32_t wbhead, toclean, count;
1921 i40e_tx_control_block_t *tcbhead;
1922 i40e_t *i40e = itrq->itrq_i40e;
1923 uint_t desc_per_tcb, i;
1924
1925 mutex_enter(&itrq->itrq_tx_lock);
1926
1927 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1928 if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
1929 if (itrq->itrq_tx_blocked == B_TRUE) {
1930 itrq->itrq_tx_blocked = B_FALSE;
1931 mac_tx_ring_update(i40e->i40e_mac_hdl,
1932 itrq->itrq_mactxring);
1933 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
1934 }
1935 mutex_exit(&itrq->itrq_tx_lock);
1936 return;
1937 }
1938
1939 /*
1940 * Now we need to try and see if there's anything available. The driver
1941 * will write to the head location and it guarantees that it does not
1942 * use relaxed ordering.
1943 */
1944 VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
1945 (uintptr_t)itrq->itrq_desc_wbhead,
1946 sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
1947
1948 if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
1949 DDI_FM_OK) {
1950 mutex_exit(&itrq->itrq_tx_lock);
1951 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1952 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1953 return;
1954 }
1955
1956 wbhead = *itrq->itrq_desc_wbhead;
1957 toclean = itrq->itrq_desc_head;
1958 count = 0;
1959 tcbhead = NULL;
1960
1961 while (toclean != wbhead) {
1962 i40e_tx_control_block_t *tcb;
1963
1964 tcb = itrq->itrq_tcb_work_list[toclean];
1965 itrq->itrq_tcb_work_list[toclean] = NULL;
1966 ASSERT(tcb != NULL);
1967 tcb->tcb_next = tcbhead;
1968 tcbhead = tcb;
1969
1970 /*
1971 * In the DMA bind case, there may not necessarily be a 1:1
1972 * mapping between tcb's and descriptors. If the tcb type
1973 * indicates a DMA binding then check the number of DMA
1974 * cookies to determine how many entries to clean in the
1975 * descriptor ring.
1976 */
1977 if (tcb->tcb_type == I40E_TX_DMA)
1978 desc_per_tcb = tcb->tcb_bind_ncookies;
1979 else
1980 desc_per_tcb = 1;
1981
1982 for (i = 0; i < desc_per_tcb; i++) {
1983 /*
1984 * We zero this out for sanity purposes.
1985 */
1986 bzero(&itrq->itrq_desc_ring[toclean],
1987 sizeof (i40e_tx_desc_t));
1988 toclean = i40e_next_desc(toclean, 1,
1989 itrq->itrq_tx_ring_size);
1990 count++;
1991 }
1992 }
1993
1994 itrq->itrq_desc_head = wbhead;
1995 itrq->itrq_desc_free += count;
1996 itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
1997 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1998
1999 if (itrq->itrq_tx_blocked == B_TRUE &&
2000 itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2001 itrq->itrq_tx_blocked = B_FALSE;
2002
2003 mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2004 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2005 }
2006
2007 mutex_exit(&itrq->itrq_tx_lock);
2008
2009 /*
2010 * Now clean up the tcb.
2011 */
2012 while (tcbhead != NULL) {
2013 i40e_tx_control_block_t *tcb = tcbhead;
2014
2015 tcbhead = tcb->tcb_next;
2016 i40e_tcb_reset(tcb);
2017 i40e_tcb_free(itrq, tcb);
2018 }
2019
2020 DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2021 }
2022
2023 static void
i40e_tx_copy_fragment(i40e_tx_control_block_t * tcb,const mblk_t * mp,const size_t off,const size_t len)2024 i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
2025 const size_t off, const size_t len)
2026 {
2027 const void *soff = mp->b_rptr + off;
2028 void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2029
2030 ASSERT3U(len, >, 0);
2031 ASSERT3P(soff, >=, mp->b_rptr);
2032 ASSERT3P(soff, <=, mp->b_wptr);
2033 ASSERT3U(len, <=, MBLKL(mp));
2034 ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
2035 ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
2036 bcopy(soff, doff, len);
2037 tcb->tcb_type = I40E_TX_COPY;
2038 tcb->tcb_dma.dmab_len += len;
2039 I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2040 }
2041
2042 static i40e_tx_control_block_t *
i40e_tx_bind_fragment(i40e_trqpair_t * itrq,const mblk_t * mp,size_t off,boolean_t use_lso)2043 i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
2044 size_t off, boolean_t use_lso)
2045 {
2046 ddi_dma_handle_t dma_handle;
2047 ddi_dma_cookie_t dma_cookie;
2048 uint_t i = 0, ncookies = 0, dmaflags;
2049 i40e_tx_control_block_t *tcb;
2050 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2051
2052 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2053 txs->itxs_err_notcb.value.ui64++;
2054 return (NULL);
2055 }
2056 tcb->tcb_type = I40E_TX_DMA;
2057
2058 if (use_lso == B_TRUE)
2059 dma_handle = tcb->tcb_lso_dma_handle;
2060 else
2061 dma_handle = tcb->tcb_dma_handle;
2062
2063 dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
2064 if (ddi_dma_addr_bind_handle(dma_handle, NULL,
2065 (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
2066 DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
2067 txs->itxs_bind_fails.value.ui64++;
2068 goto bffail;
2069 }
2070
2071 tcb->tcb_bind_ncookies = ncookies;
2072 tcb->tcb_used_lso = use_lso;
2073
2074 tcb->tcb_bind_info =
2075 kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
2076 KM_NOSLEEP);
2077 if (tcb->tcb_bind_info == NULL)
2078 goto bffail;
2079
2080 while (i < ncookies) {
2081 if (i > 0)
2082 ddi_dma_nextcookie(dma_handle, &dma_cookie);
2083
2084 tcb->tcb_bind_info[i].dbi_paddr =
2085 (caddr_t)dma_cookie.dmac_laddress;
2086 tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
2087 }
2088
2089 return (tcb);
2090
2091 bffail:
2092 i40e_tcb_reset(tcb);
2093 i40e_tcb_free(itrq, tcb);
2094 return (NULL);
2095 }
2096
2097 static void
i40e_tx_set_data_desc(i40e_trqpair_t * itrq,i40e_tx_context_t * tctx,caddr_t buff,size_t len,boolean_t last_desc)2098 i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
2099 caddr_t buff, size_t len, boolean_t last_desc)
2100 {
2101 i40e_tx_desc_t *txdesc;
2102 int cmd;
2103
2104 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2105 itrq->itrq_desc_free--;
2106 txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2107 itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2108 itrq->itrq_tx_ring_size);
2109
2110 cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
2111
2112 /*
2113 * The last data descriptor needs the EOP bit set, so that the HW knows
2114 * that we're ready to send. Additionally, we set the RS (Report
2115 * Status) bit, so that we are notified when the transmit engine has
2116 * completed DMA'ing all of the data descriptors and data buffers
2117 * associated with this frame.
2118 */
2119 if (last_desc == B_TRUE) {
2120 cmd |= I40E_TX_DESC_CMD_EOP;
2121 cmd |= I40E_TX_DESC_CMD_RS;
2122 }
2123
2124 /*
2125 * Per the X710 manual, section 8.4.2.1.1, the buffer size
2126 * must be a value from 1 to 16K minus 1, inclusive.
2127 */
2128 ASSERT3U(len, >=, 1);
2129 ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ);
2130
2131 txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
2132 txdesc->cmd_type_offset_bsz =
2133 LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
2134 ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2135 ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2136 ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2137 }
2138
2139 /*
2140 * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
2141 */
2142 static inline void
tcb_list_append(i40e_tx_control_block_t ** head,i40e_tx_control_block_t ** tail,i40e_tx_control_block_t * tcb)2143 tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
2144 i40e_tx_control_block_t *tcb)
2145 {
2146 if (*head == NULL) {
2147 *head = tcb;
2148 *tail = *head;
2149 } else {
2150 ASSERT3P(*tail, !=, NULL);
2151 ASSERT3P((*tail)->tcb_next, ==, NULL);
2152 (*tail)->tcb_next = tcb;
2153 *tail = tcb;
2154 }
2155 }
2156
2157 /*
2158 * This function takes a single packet, possibly consisting of
2159 * multiple mblks, and creates a TCB chain to send to the controller.
2160 * This TCB chain may span up to a maximum of 8 descriptors. A copy
2161 * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
2162 * more, depending on several factors. For each fragment (invidual
2163 * mblk making up the packet), we determine if its size dictates a
2164 * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
2165 * count of descriptors used; when that count reaches the max we force
2166 * all remaining fragments into a single TCB buffer. We have a
2167 * guarantee that the TCB buffer is always larger than the MTU -- so
2168 * there is always enough room. Consecutive fragments below the DMA
2169 * threshold are copied into a single TCB. In the event of an error
2170 * this function returns NULL but leaves 'mp' alone.
2171 */
2172 static i40e_tx_control_block_t *
i40e_non_lso_chain(i40e_trqpair_t * itrq,mblk_t * mp,uint_t * ndesc)2173 i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
2174 {
2175 const mblk_t *nmp = mp;
2176 uint_t needed_desc = 0;
2177 boolean_t force_copy = B_FALSE;
2178 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2179 i40e_t *i40e = itrq->itrq_i40e;
2180 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2181
2182 /* TCB buffer is always larger than MTU. */
2183 ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
2184
2185 while (nmp != NULL) {
2186 const size_t nmp_len = MBLKL(nmp);
2187
2188 /* Ignore zero-length mblks. */
2189 if (nmp_len == 0) {
2190 nmp = nmp->b_cont;
2191 continue;
2192 }
2193
2194 if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
2195 /* Compress consecutive copies into one TCB. */
2196 if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
2197 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2198 nmp = nmp->b_cont;
2199 continue;
2200 }
2201
2202 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2203 txs->itxs_err_notcb.value.ui64++;
2204 goto fail;
2205 }
2206
2207 /*
2208 * TCB DMA buffer is guaranteed to be one
2209 * cookie by i40e_alloc_dma_buffer().
2210 */
2211 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2212 needed_desc++;
2213 tcb_list_append(&tcbhead, &tcbtail, tcb);
2214 } else {
2215 uint_t total_desc;
2216
2217 tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
2218 if (tcb == NULL) {
2219 i40e_error(i40e, "dma bind failed!");
2220 goto fail;
2221 }
2222
2223 /*
2224 * If the new total exceeds the max or we've
2225 * reached the limit and there's data left,
2226 * then give up binding and copy the rest into
2227 * the pre-allocated TCB buffer.
2228 */
2229 total_desc = needed_desc + tcb->tcb_bind_ncookies;
2230 if ((total_desc > I40E_TX_MAX_COOKIE) ||
2231 (total_desc == I40E_TX_MAX_COOKIE &&
2232 nmp->b_cont != NULL)) {
2233 i40e_tcb_reset(tcb);
2234 i40e_tcb_free(itrq, tcb);
2235
2236 if (tcbtail != NULL &&
2237 tcbtail->tcb_type == I40E_TX_COPY) {
2238 tcb = tcbtail;
2239 } else {
2240 tcb = NULL;
2241 }
2242
2243 force_copy = B_TRUE;
2244 txs->itxs_force_copy.value.ui64++;
2245 continue;
2246 }
2247
2248 needed_desc += tcb->tcb_bind_ncookies;
2249 tcb_list_append(&tcbhead, &tcbtail, tcb);
2250 }
2251
2252 nmp = nmp->b_cont;
2253 }
2254
2255 ASSERT3P(nmp, ==, NULL);
2256 ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
2257 ASSERT3P(tcbhead, !=, NULL);
2258 *ndesc += needed_desc;
2259 return (tcbhead);
2260
2261 fail:
2262 tcb = tcbhead;
2263 while (tcb != NULL) {
2264 i40e_tx_control_block_t *next = tcb->tcb_next;
2265
2266 ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2267 tcb->tcb_type == I40E_TX_COPY);
2268
2269 tcb->tcb_mp = NULL;
2270 i40e_tcb_reset(tcb);
2271 i40e_tcb_free(itrq, tcb);
2272 tcb = next;
2273 }
2274
2275 return (NULL);
2276 }
2277
2278 /*
2279 * Section 8.4.1 of the 700-series programming guide states that a
2280 * segment may span up to 8 data descriptors; including both header
2281 * and payload data. However, empirical evidence shows that the
2282 * controller freezes the Tx queue when presented with a segment of 8
2283 * descriptors. Or, at least, when the first segment contains 8
2284 * descriptors. One explanation is that the controller counts the
2285 * context descriptor against the first segment, even though the
2286 * programming guide makes no mention of such a constraint. In any
2287 * case, we limit TSO segments to 7 descriptors to prevent Tx queue
2288 * freezes. We still allow non-TSO segments to utilize all 8
2289 * descriptors as they have not demonstrated the faulty behavior.
2290 */
uint_t i40e_lso_num_descs = 7;

/*
 * The number of bytes that are still unused in the DMA buffer of the given
 * TCB, that is, its size minus the length of the data already placed in it.
 */
#define	I40E_TCB_LEFT(tcb)				\
	((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
2295
2296 /*
2297 * This function is similar in spirit to i40e_non_lso_chain(), but
2298 * much more complicated in reality. Like the previous function, it
2299 * takes a packet (an LSO packet) as input and returns a chain of
2300 * TCBs. The complication comes with the fact that we are no longer
2301 * trying to fit the entire packet into 8 descriptors, but rather we
2302 * must fit each MSS-size segment of the LSO packet into 8 descriptors.
2303 * Except it's really 7 descriptors, see i40e_lso_num_descs.
2304 *
2305 * Your first inclination might be to verify that a given segment
2306 * spans no more than 7 mblks; but it's actually much more subtle than
2307 * that. First, let's describe what the hardware expects, and then we
2308 * can expound on the software side of things.
2309 *
2310 * For an LSO packet the hardware expects the following:
2311 *
2312 * o Each MSS-sized segment must span no more than 7 descriptors.
2313 *
2314 * o The header size does not count towards the segment size.
2315 *
2316 * o If header and payload share the first descriptor, then the
2317 * controller will count the descriptor twice.
2318 *
2319 * The most important thing to keep in mind is that the hardware does
2320 * not view the segments in terms of mblks, like we do. The hardware
2321 * only sees descriptors. It will iterate each descriptor in turn,
2322 * keeping a tally of bytes seen and descriptors visited. If the byte
2323 * count hasn't reached MSS by the time the descriptor count reaches
2324 * 7, then the controller freezes the queue and we are stuck.
2325 * Furthermore, the hardware picks up its tally where it left off. So
2326 * if it reached MSS in the middle of a descriptor, it will start
2327 * tallying the next segment in the middle of that descriptor. The
2328 * hardware's view is entirely removed from the mblk chain or even the
2329 * descriptor layout. Consider these facts:
2330 *
 *    o The MSS will vary depending on MTU and other factors.
2332 *
2333 * o The dblk allocation will sit at various offsets within a
2334 * memory page.
2335 *
2336 * o The page size itself could vary in the future (i.e. not
2337 * always 4K).
2338 *
2339 * o Just because a dblk is virtually contiguous doesn't mean
2340 * it's physically contiguous. The number of cookies
2341 * (descriptors) required by a DMA bind of a single dblk is at
2342 * the mercy of the page size and physical layout.
2343 *
2344 * o The descriptors will most often NOT start/end on a MSS
2345 * boundary. Thus the hardware will often start counting the
2346 * MSS mid descriptor and finish mid descriptor.
2347 *
2348 * The upshot of all this is that the driver must learn to think like
2349 * the controller; and verify that none of the constraints are broken.
2350 * It does this by tallying up the segment just like the hardware
2351 * would. This is handled by the two variables 'segsz' and 'segdesc'.
 * After each attempt to bind a dblk, we check the constraints. If
2353 * violated, we undo the DMA and force a copy until MSS is met. We
2354 * have a guarantee that the TCB buffer is larger than MTU; thus
2355 * ensuring we can always meet the MSS with a single copy buffer. We
2356 * also copy consecutive non-DMA fragments into the same TCB buffer.
2357 */
2358 static i40e_tx_control_block_t *
i40e_lso_chain(i40e_trqpair_t * itrq,const mblk_t * mp,const mac_ether_offload_info_t * meo,const i40e_tx_context_t * tctx,uint_t * ndesc)2359 i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
2360 const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
2361 uint_t *ndesc)
2362 {
2363 size_t mp_len = MBLKL(mp);
2364 /*
2365 * The cpoff (copy offset) variable tracks the offset inside
2366 * the current mp. There are cases where the entire mp is not
2367 * fully copied in one go: such as the header copy followed by
2368 * a non-DMA mblk, or a TCB buffer that only has enough space
2369 * to copy part of the current mp.
2370 */
2371 size_t cpoff = 0;
2372 /*
2373 * The segsz and segdesc variables track the controller's view
2374 * of the segment. The needed_desc variable tracks the total
2375 * number of data descriptors used by the driver.
2376 */
2377 size_t segsz = 0;
2378 uint_t segdesc = 0;
2379 uint_t needed_desc = 0;
2380 size_t hdrcopied = 0;
2381 const size_t hdrlen =
2382 meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
2383 const size_t mss = tctx->itc_ctx_mss;
2384 boolean_t force_copy = B_FALSE;
2385 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2386 i40e_t *i40e = itrq->itrq_i40e;
2387 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2388
2389 /*
2390 * We always copy the header in order to avoid more
2391 * complicated code dealing with various edge cases.
2392 */
2393 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2394 txs->itxs_err_notcb.value.ui64++;
2395 goto fail;
2396 }
2397
2398 needed_desc++;
2399 tcb_list_append(&tcbhead, &tcbtail, tcb);
2400
2401 while (hdrcopied < hdrlen) {
2402 const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
2403 i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
2404 hdrcopied += tocopy;
2405 cpoff += tocopy;
2406 if (tocopy == mp_len) {
2407 /*
2408 * This is a bit of defensive programming. We
2409 * should never have a chain too short to
2410 * satisfy the headers -- but just in case.
2411 */
2412 if ((mp = mp->b_cont) == NULL) {
2413 txs->itxs_tx_short.value.ui64++;
2414 goto fail;
2415 }
2416
2417 while ((mp_len = MBLKL(mp)) == 0) {
2418 if ((mp = mp->b_cont) == NULL) {
2419 txs->itxs_tx_short.value.ui64++;
2420 goto fail;
2421 }
2422 }
2423 cpoff = 0;
2424 }
2425 }
2426 ASSERT3U(hdrcopied, ==, hdrlen);
2427
2428 /*
2429 * A single descriptor containing both header and data is
2430 * counted twice by the controller.
2431 */
2432 if (mp_len < i40e->i40e_tx_dma_min) {
2433 segdesc = 2;
2434 } else {
2435 segdesc = 1;
2436 }
2437
2438 while (mp != NULL) {
2439 mp_len = MBLKL(mp);
2440 force_copy:
2441 /* Ignore zero-length mblks. */
2442 if (mp_len == 0) {
2443 mp = mp->b_cont;
2444 cpoff = 0;
2445 continue;
2446 }
2447
2448 /*
2449 * We copy into the preallocated TCB buffer when the
2450 * current fragment is less than the DMA threshold OR
2451 * when the DMA bind can't meet the controller's
2452 * segment descriptor limit.
2453 */
2454 if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
2455 size_t tocopy;
2456
2457 /*
2458 * Our objective here is to compress
2459 * consecutive copies into one TCB (until it
2460 * is full). If there is no current TCB, or if
2461 * it is a DMA TCB, then allocate a new one.
2462 */
2463 if (tcb == NULL ||
2464 (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) {
2465 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2466 txs->itxs_err_notcb.value.ui64++;
2467 goto fail;
2468 }
2469
2470 /*
2471 * The TCB DMA buffer is guaranteed to
2472 * be one cookie by i40e_alloc_dma_buffer().
2473 */
2474 needed_desc++;
2475 segdesc++;
2476 ASSERT3U(segdesc, <=, i40e_lso_num_descs);
2477 tcb_list_append(&tcbhead, &tcbtail, tcb);
2478 } else if (segdesc == 0) {
2479 /*
2480 * We are copying into an existing TCB
2481 * but we just crossed the MSS
2482 * boundary. Make sure to increment
2483 * segdesc to track the descriptor
2484 * count as the hardware would.
2485 */
2486 segdesc++;
2487 }
2488
2489 tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
2490 i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
2491 cpoff += tocopy;
2492 segsz += tocopy;
2493
2494 /* We have consumed the current mp. */
2495 if (cpoff == mp_len) {
2496 mp = mp->b_cont;
2497 cpoff = 0;
2498 }
2499
2500 /* We have consumed the current TCB buffer. */
2501 if (I40E_TCB_LEFT(tcb) == 0) {
2502 tcb = NULL;
2503 }
2504
2505 /*
2506 * We have met MSS with this copy; restart the
2507 * counters.
2508 */
2509 if (segsz >= mss) {
2510 segsz = segsz % mss;
2511 segdesc = segsz == 0 ? 0 : 1;
2512 force_copy = B_FALSE;
2513 }
2514
2515 /*
2516 * We are at the controller's descriptor
2517 * limit; we must copy into the current TCB
2518 * until MSS is reached. The TCB buffer is
2519 * always bigger than the MTU so we know it is
2520 * big enough to meet the MSS.
2521 */
2522 if (segdesc == i40e_lso_num_descs) {
2523 force_copy = B_TRUE;
2524 }
2525 } else {
2526 uint_t tsegdesc = segdesc;
2527 size_t tsegsz = segsz;
2528
2529 ASSERT(force_copy == B_FALSE);
2530 ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
2531
2532 tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
2533 if (tcb == NULL) {
2534 i40e_error(i40e, "dma bind failed!");
2535 goto fail;
2536 }
2537
2538 for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
2539 struct i40e_dma_bind_info dbi =
2540 tcb->tcb_bind_info[i];
2541
2542 tsegsz += dbi.dbi_len;
2543 tsegdesc++;
2544 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2545
2546 /*
2547 * We've met the MSS with this portion
2548 * of the DMA.
2549 */
2550 if (tsegsz >= mss) {
2551 tsegsz = tsegsz % mss;
2552 tsegdesc = tsegsz == 0 ? 0 : 1;
2553 }
2554
2555 /*
2556 * We've reached max descriptors but
2557 * have not met the MSS. Undo the bind
2558 * and instead copy.
2559 */
2560 if (tsegdesc == i40e_lso_num_descs) {
2561 i40e_tcb_reset(tcb);
2562 i40e_tcb_free(itrq, tcb);
2563
2564 if (tcbtail != NULL &&
2565 I40E_TCB_LEFT(tcb) > 0 &&
2566 tcbtail->tcb_type == I40E_TX_COPY) {
2567 tcb = tcbtail;
2568 } else {
2569 tcb = NULL;
2570 }
2571
2572 /*
2573 * Remember, we are still on
2574 * the same mp.
2575 */
2576 force_copy = B_TRUE;
2577 txs->itxs_tso_force_copy.value.ui64++;
2578 goto force_copy;
2579 }
2580 }
2581
2582 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2583 ASSERT3U(tsegsz, <, mss);
2584
2585 /*
2586 * We've made if through the loop without
2587 * breaking the segment descriptor contract
2588 * with the controller -- replace the segment
2589 * tracking values with the temporary ones.
2590 */
2591 segdesc = tsegdesc;
2592 segsz = tsegsz;
2593 needed_desc += tcb->tcb_bind_ncookies;
2594 cpoff = 0;
2595 tcb_list_append(&tcbhead, &tcbtail, tcb);
2596 mp = mp->b_cont;
2597 }
2598 }
2599
2600 ASSERT3P(mp, ==, NULL);
2601 ASSERT3P(tcbhead, !=, NULL);
2602 *ndesc += needed_desc;
2603 return (tcbhead);
2604
2605 fail:
2606 tcb = tcbhead;
2607 while (tcb != NULL) {
2608 i40e_tx_control_block_t *next = tcb->tcb_next;
2609
2610 ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2611 tcb->tcb_type == I40E_TX_COPY);
2612
2613 tcb->tcb_mp = NULL;
2614 i40e_tcb_reset(tcb);
2615 i40e_tcb_free(itrq, tcb);
2616 tcb = next;
2617 }
2618
2619 return (NULL);
2620 }
2621
2622 /*
2623 * Keep track of activity through the transmit data path.
2624 *
2625 * We need to ensure we don't try and transmit when a trqpair has been
2626 * stopped, nor do we want to stop a trqpair whilst transmitting.
2627 */
2628 static boolean_t
i40e_ring_tx_enter(i40e_trqpair_t * itrq)2629 i40e_ring_tx_enter(i40e_trqpair_t *itrq)
2630 {
2631 boolean_t allow;
2632
2633 mutex_enter(&itrq->itrq_tx_lock);
2634 allow = !itrq->itrq_tx_quiesce;
2635 if (allow)
2636 itrq->itrq_tx_active++;
2637 mutex_exit(&itrq->itrq_tx_lock);
2638
2639 return (allow);
2640 }
2641
2642 static void
i40e_ring_tx_exit_nolock(i40e_trqpair_t * itrq)2643 i40e_ring_tx_exit_nolock(i40e_trqpair_t *itrq)
2644 {
2645 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2646
2647 itrq->itrq_tx_active--;
2648 if (itrq->itrq_tx_quiesce)
2649 cv_signal(&itrq->itrq_tx_cv);
2650 }
2651
/*
 * Note that a thread has left the transmit data path. This is the variant of
 * i40e_ring_tx_exit_nolock() for callers that do not hold itrq_tx_lock; the
 * lock is taken around the call and dropped again afterwards.
 */
static void
i40e_ring_tx_exit(i40e_trqpair_t *itrq)
{
	mutex_enter(&itrq->itrq_tx_lock);
	i40e_ring_tx_exit_nolock(itrq);
	mutex_exit(&itrq->itrq_tx_lock);
}
2659
2660
2661 /*
2662 * Tell the transmit path to quiesce and wait until there is no
2663 * more activity.
2664 * Will return B_TRUE if the transmit path is already quiesced, B_FALSE
2665 * otherwise.
2666 */
2667 boolean_t
i40e_ring_tx_quiesce(i40e_trqpair_t * itrq)2668 i40e_ring_tx_quiesce(i40e_trqpair_t *itrq)
2669 {
2670 mutex_enter(&itrq->itrq_tx_lock);
2671 if (itrq->itrq_tx_quiesce) {
2672 /*
2673 * When itrq_tx_quiesce is set, then the ring has already
2674 * been shutdown.
2675 */
2676 mutex_exit(&itrq->itrq_tx_lock);
2677 return (B_TRUE);
2678 }
2679
2680 /*
2681 * Tell any threads in transmit path this trqpair is quiesced and
2682 * wait until they've all exited the critical code path.
2683 */
2684 itrq->itrq_tx_quiesce = B_TRUE;
2685 while (itrq->itrq_tx_active > 0)
2686 cv_wait(&itrq->itrq_tx_cv, &itrq->itrq_tx_lock);
2687
2688 mutex_exit(&itrq->itrq_tx_lock);
2689
2690 return (B_FALSE);
2691 }
2692
2693 /*
2694 * We've been asked to send a message block on the wire. We'll only have a
2695 * single chain. There will not be any b_next pointers; however, there may be
2696 * multiple b_cont blocks. The number of b_cont blocks may exceed the
2697 * controller's Tx descriptor limit.
2698 *
2699 * We may do one of three things with any given mblk_t chain:
2700 *
2701 * 1) Drop it
2702 * 2) Transmit it
2703 * 3) Return it
2704 *
2705 * If we return it to MAC, then MAC will flow control on our behalf. In other
2706 * words, it won't send us anything until we tell it that it's okay to send us
2707 * something.
2708 */
2709 mblk_t *
i40e_ring_tx(void * arg,mblk_t * mp)2710 i40e_ring_tx(void *arg, mblk_t *mp)
2711 {
2712 size_t msglen;
2713 i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
2714 i40e_tx_context_desc_t *ctxdesc;
2715 mac_ether_offload_info_t meo;
2716 i40e_tx_context_t tctx;
2717 int type;
2718 uint_t needed_desc = 0;
2719 boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
2720
2721 i40e_trqpair_t *itrq = arg;
2722 i40e_t *i40e = itrq->itrq_i40e;
2723 i40e_hw_t *hw = &i40e->i40e_hw_space;
2724 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2725
2726 ASSERT(mp->b_next == NULL);
2727
2728 if (!(i40e->i40e_state & I40E_STARTED) ||
2729 (i40e->i40e_state & I40E_OVERTEMP) ||
2730 (i40e->i40e_state & I40E_SUSPENDED) ||
2731 (i40e->i40e_state & I40E_ERROR) ||
2732 (i40e->i40e_link_state != LINK_STATE_UP) ||
2733 !i40e_ring_tx_enter(itrq)) {
2734 freemsg(mp);
2735 return (NULL);
2736 }
2737
2738 if (mac_ether_offload_info(mp, &meo) != 0) {
2739 freemsg(mp);
2740 itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
2741 i40e_ring_tx_exit(itrq);
2742 return (NULL);
2743 }
2744
2745 /*
2746 * Figure out the relevant context about this frame that we might need
2747 * for enabling checksum, LSO, etc. This also fills in information that
2748 * we might set around the packet type, etc.
2749 */
2750 if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
2751 freemsg(mp);
2752 itrq->itrq_txstat.itxs_err_context.value.ui64++;
2753 i40e_ring_tx_exit(itrq);
2754 return (NULL);
2755 }
2756 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2757 use_lso = B_TRUE;
2758 do_ctx_desc = B_TRUE;
2759 }
2760
2761 /*
2762 * For the primordial driver we can punt on doing any recycling right
2763 * now; however, longer term we need to probably do some more pro-active
2764 * recycling to cut back on stalls in the TX path.
2765 */
2766
2767 msglen = msgsize(mp);
2768
2769 if (do_ctx_desc) {
2770 /*
2771 * If we're doing tunneling or LSO, then we'll need a TX
2772 * context descriptor in addition to one or more TX data
2773 * descriptors. Since there's no data DMA block or handle
2774 * associated with the context descriptor, we create a special
2775 * control block that behaves effectively like a NOP.
2776 */
2777 if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
2778 txs->itxs_err_notcb.value.ui64++;
2779 goto txfail;
2780 }
2781 tcb_ctx->tcb_type = I40E_TX_DESC;
2782 needed_desc++;
2783 }
2784
2785 if (!use_lso) {
2786 tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
2787 } else {
2788 tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
2789 }
2790
2791 if (tcbhead == NULL)
2792 goto txfail;
2793
2794 tcbhead->tcb_mp = mp;
2795
2796 /*
2797 * The second condition ensures that 'itrq_desc_tail' never
2798 * equals 'itrq_desc_head'. This enforces the rule found in
2799 * the second bullet point of section 8.4.3.1.5 of the XL710
2800 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
2801 * never overlap with the head. This means that we only ever
2802 * have 'itrq_tx_ring_size - 1' total available descriptors.
2803 */
2804 mutex_enter(&itrq->itrq_tx_lock);
2805 if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
2806 (itrq->itrq_desc_free - 1) < needed_desc) {
2807 txs->itxs_err_nodescs.value.ui64++;
2808 mutex_exit(&itrq->itrq_tx_lock);
2809 goto txfail;
2810 }
2811
2812 if (do_ctx_desc) {
2813 /*
2814 * If we're enabling any offloads for this frame, then we'll
2815 * need to build up a transmit context descriptor, first. The
2816 * context descriptor needs to be placed in the TX ring before
2817 * the data descriptor(s). See section 8.4.2, table 8-16
2818 */
2819 uint_t tail = itrq->itrq_desc_tail;
2820 itrq->itrq_desc_free--;
2821 ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
2822 itrq->itrq_tcb_work_list[tail] = tcb_ctx;
2823 itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
2824 itrq->itrq_tx_ring_size);
2825
2826 /* QW0 */
2827 type = I40E_TX_DESC_DTYPE_CONTEXT;
2828 ctxdesc->tunneling_params = 0;
2829 ctxdesc->l2tag2 = 0;
2830
2831 /* QW1 */
2832 ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
2833 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2834 ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
2835 ((uint64_t)tctx.itc_ctx_cmdflags <<
2836 I40E_TXD_CTX_QW1_CMD_SHIFT) |
2837 ((uint64_t)tctx.itc_ctx_tsolen <<
2838 I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2839 ((uint64_t)tctx.itc_ctx_mss <<
2840 I40E_TXD_CTX_QW1_MSS_SHIFT));
2841 }
2842 }
2843
2844 tcb = tcbhead;
2845 while (tcb != NULL) {
2846
2847 itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2848 if (tcb->tcb_type == I40E_TX_COPY) {
2849 boolean_t last_desc = (tcb->tcb_next == NULL);
2850
2851 i40e_tx_set_data_desc(itrq, &tctx,
2852 (caddr_t)tcb->tcb_dma.dmab_dma_address,
2853 tcb->tcb_dma.dmab_len, last_desc);
2854 } else {
2855 boolean_t last_desc = B_FALSE;
2856 ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
2857
2858 for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
2859 last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
2860 (tcb->tcb_next == NULL);
2861
2862 i40e_tx_set_data_desc(itrq, &tctx,
2863 tcb->tcb_bind_info[c].dbi_paddr,
2864 tcb->tcb_bind_info[c].dbi_len,
2865 last_desc);
2866 }
2867 }
2868
2869 tcb = tcb->tcb_next;
2870 }
2871
2872 /*
2873 * Now, finally, sync the DMA data and alert hardware.
2874 */
2875 I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2876
2877 I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2878 itrq->itrq_desc_tail);
2879
2880 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2881 DDI_FM_OK) {
2882 /*
2883 * Note, we can't really go through and clean this up very well,
2884 * because the memory has been given to the device, so just
2885 * indicate it's been transmitted.
2886 */
2887 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2888 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2889 }
2890
2891 txs->itxs_bytes.value.ui64 += msglen;
2892 txs->itxs_packets.value.ui64++;
2893 txs->itxs_descriptors.value.ui64 += needed_desc;
2894
2895 i40e_ring_tx_exit_nolock(itrq);
2896
2897 mutex_exit(&itrq->itrq_tx_lock);
2898
2899 return (NULL);
2900
2901 txfail:
2902 /*
2903 * We ran out of resources. Return it to MAC and indicate that we'll
2904 * need to signal MAC. If there are allocated tcb's, return them now.
2905 * Make sure to reset their message block's, since we'll return them
2906 * back to MAC.
2907 */
2908 if (tcb_ctx != NULL) {
2909 tcb_ctx->tcb_mp = NULL;
2910 i40e_tcb_reset(tcb_ctx);
2911 i40e_tcb_free(itrq, tcb_ctx);
2912 }
2913
2914 tcb = tcbhead;
2915 while (tcb != NULL) {
2916 i40e_tx_control_block_t *next = tcb->tcb_next;
2917
2918 ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2919 tcb->tcb_type == I40E_TX_COPY);
2920
2921 tcb->tcb_mp = NULL;
2922 i40e_tcb_reset(tcb);
2923 i40e_tcb_free(itrq, tcb);
2924 tcb = next;
2925 }
2926
2927 mutex_enter(&itrq->itrq_tx_lock);
2928 i40e_ring_tx_exit_nolock(itrq);
2929 itrq->itrq_tx_blocked = B_TRUE;
2930 mutex_exit(&itrq->itrq_tx_lock);
2931
2932 return (mp);
2933 }
2934