1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
29 * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
30 * Copyright 2021 Joyent, Inc.
31 */
32
33 #include "ixgbe_sw.h"
34
35 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
36 link_list_t *, const void *, size_t);
37 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
38 link_list_t *, uint8_t *, size_t);
39 static uint_t ixgbe_tcb_done(tx_control_block_t *);
40 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
41 ixgbe_tx_context_t *, size_t);
42 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
43 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
44 link_list_t *);
45
46 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
47 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
48 ixgbe_tx_context_t *);
49 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
50 ixgbe_tx_context_t *);
51
52 /*
53 * ixgbe_ring_tx
54 *
55 * To transmit one mblk through one specified ring.
56 *
 57  * One mblk can consist of several fragments; each fragment is
 58  * processed differently based on its size. Fragments with size
 59  * less than the bcopy threshold are processed using bcopy;
 60  * otherwise, they are processed using DMA binding.
 61  *
62 *
63 * To process the mblk, for each fragment, we pass a pointer to the location
64 * of the current transmit control block (tcb) (initialized to NULL) to either
65 * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
66 * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
 67  * tcb if possible, or close out the current tcb, allocate a new tcb, and update
68 * the passed location (tx_control_block_t **) to reflect the new current tcb.
69 *
70 * Since bound mblk fragments require their own tcb, the close, allocate new,
71 * and update steps occur on every call to ixgbe_tx_bind(), but since
72 * consecutive small mblk fragments can be combined into a single tcb, the
73 * close, allocate new, and update steps may not occur on every call to
 74  * ixgbe_tx_copy(). If we call ixgbe_tx_copy() while the current tcb is
 75  * already being used for copying and there is enough room in it for the
 76  * current mblk fragment, we simply append the fragment's data to it. If
 77  * instead the current tcb isn't being used for copying (i.e. the previous
 78  * iteration of the loop called ixgbe_tx_bind()), or doesn't have enough
 79  * space for the mblk fragment, we close out the current tcb, grab a new
 80  * tcb from the free list, and update the current tcb to the newly
 81  * obtained tcb.
82 *
83 * When LSO (large segment offload) is enabled, we first copy the packet
84 * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
85 * processing the remainder of the packet. The remaining bytes of the packet
86 * are then copied or mapped based on the fragment size as described above.
87 *
88 * Through the entire processing of a packet, we keep track of the number of
89 * DMA descriptors being used (either bound or pre-bound buffers used for
90 * copying) by this packet. Each tcb requires at least one DMA descriptor, but
91 * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
92 * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
93 * number of DMA descriptors that are closed (ready for the HW). Since the
94 * hardware limits the number of descriptors that can be used to transmit a
 95  * single packet, if the total number of DMA descriptors required to transmit
96 * this packet exceeds this limit, we perform a msgpullup() and try again.
97 * Since our DMA attributes limit the number of DMA cookies allowed to
98 * map a single span of memory to a value (MAX_COOKIE) less than the
99 * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
100 * as long as sufficient tcbs are available, we should always be able to
101 * process a packet that's contained in a single mblk_t (no additional
102 * fragments).
103 *
 104  * Once all of the tcbs have been set up, ixgbe_tx_fill_ring() is called to
 105  * set up the tx ring to transmit the tcbs and then tell the HW to start
 106  * transmitting. When transmission is complete, an interrupt is triggered
 107  * which calls the appropriate recycle routine to place the tcbs that were
 108  * used in transmission back in the free list. We may also try to
 109  * recycle any available tcbs when the size of the tcb free list gets low
 110  * or if the watchdog timer triggers.
111 *
112 */
113 mblk_t *
114 ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
115 {
116 ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
117 ixgbe_t *ixgbe = tx_ring->ixgbe;
118 mblk_t *mp = orig_mp;
119 mblk_t *pull_mp = NULL;
120 tx_control_block_t *tcb;
121 size_t mbsize, offset, len;
122 uint32_t desc_total;
123 uint32_t copy_thresh;
124 int desc_num;
125 ixgbe_tx_context_t tx_context, *ctx = NULL;
126 link_list_t pending_list;
127 boolean_t limit_retry = B_FALSE;
128
129 ASSERT(mp->b_next == NULL);
130
131 if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
132 (ixgbe->ixgbe_state & IXGBE_ERROR) ||
133 (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
134 !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
135 ixgbe->link_state != LINK_STATE_UP) {
136 freemsg(mp);
137 return (NULL);
138 }
139
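	/*
	 * Fragments no larger than this threshold are copied into a
	 * pre-bound tcb buffer; larger fragments are DMA-bound directly.
	 */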
140 copy_thresh = ixgbe->tx_copy_thresh;
141
142 mbsize = msgsize(mp);
143
144 if (ixgbe->tx_hcksum_enable) {
145 /*
146 * Retrieve checksum context information from the mblk
147 * that will be used to decide whether/how to fill the
148 * context descriptor.
149 */
150 ctx = &tx_context;
151 if (ixgbe_get_context(mp, ctx) < 0) {
152 freemsg(mp);
153 return (NULL);
154 }
155
156 /*
 157 		 * If the mblk size exceeds the max size ixgbe can
 158 		 * process, then discard this mblk and return NULL.
159 */
160 if ((ctx->lso_flag &&
161 ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
162 (!ctx->lso_flag &&
163 (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
164 freemsg(mp);
165 IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
166 return (NULL);
167 }
168 }
169
170 /*
171 * If we use too many descriptors (see comments below), we may do
 172  * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
 173  * any time we return an error past this point, we must check and
 174  * free pull_mp if it is non-NULL.
175 */
176 retry:
177 /*
178 * Check and recycle tx descriptors.
179 * The recycle threshold here should be selected carefully
180 */
181 if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
182 tx_ring->tx_recycle(tx_ring);
183 }
184
185 /*
 186 	 * After recycling, if tbd_free is still less than the
 187 	 * overload threshold, assert overload and return orig_mp;
 188 	 * the transmit will then need to be rescheduled.
189 */
190 if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
191 tx_ring->reschedule = B_TRUE;
192 tx_ring->stat_overload++;
193 if (pull_mp != NULL)
194 freemsg(pull_mp);
195 return (orig_mp);
196 }
197
198 /*
 199 	 * The pending_list is a linked list used to save the tx control
 200 	 * blocks whose packet data has been processed but has not yet
 201 	 * been placed on the tx descriptor ring. It is used to reduce
 202 	 * contention on the tx_lock.
203 */
204 LINK_LIST_INIT(&pending_list);
205
206 tcb = NULL;
207 desc_num = 0;
208 desc_total = 0;
209 offset = 0;
210
211 /*
212 * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
213 * into a single descriptor separate from the remaining data.
214 */
215 if ((ctx != NULL) && ctx->lso_flag) {
216 size_t hdr_len;
217
218 hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
219
220 /*
221 * copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
222 * and TCP/UDP headers) into tcb.
223 */
224 for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
225 size_t mlen = MBLKL(mp);
226 size_t amt = MIN(mlen, len);
227 int ret;
228
229 ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
230 mp->b_rptr, amt);
231 /*
232 * Since we're trying to copy all of the headers into
233 * a single buffer in a single tcb, if ixgbe_tx_copy()
234 * returns anything but 0, it means either no tcbs
235 * are available (< 0), or while copying, we spilled
236 * over and couldn't fit all the headers into a
237 * single tcb.
238 */
239 if (ret != 0) {
240 if (ret > 0)
241 tx_ring->stat_lso_header_fail++;
242 goto tx_failure;
243 }
244
245 len -= amt;
246
247 /*
248 * If we copy less than the full amount of this
249 * mblk_t, we have some amount to copy below.
250 */
251 if (amt < mlen) {
252 offset = amt;
253 break;
254 }
255 }
256
257 ASSERT0(len);
258
259 /*
260 * Finish off the header tcb, and start anew for the
261 * rest of the packet.
262 */
263 desc_total += ixgbe_tcb_done(tcb);
264 tcb = NULL;
265 }
266
267 /*
268 * Process each remaining segment in the packet -- either binding
269 * the dblk_t or copying the contents of the dblk_t to an already
270 * bound buffer. When we copy, we will accumulate consecutive small
271 * (less than copy_thresh bytes) segments into a single tcb buffer
272 * until no more can fit (or we encounter a segment larger than
273 * copy_thresh and bind the dblk_t).
274 *
275 * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
276 * transmit control blocks (tcb)s as needed (and append them onto
277 * 'pending_list'). Both functions also replace 'tcb' with the new
278 * tcb when they allocate a new tcb.
279 *
280 * We stop trying to process the packet once the number of descriptors
281 * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
282 * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
283 * context descriptor (since we're already at the limit), so there's
284 * no point in continuing. We'll pull up the mblk_t (see below)
285 * and try again.
286 */
287 while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
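		/*
		 * 'offset' is non-zero only on the first iteration, when the
		 * LSO header copy above stopped partway through this mblk.
		 */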
288 uint8_t *rptr = mp->b_rptr + offset;
289 int ret;
290
291 len = MBLKL(mp) - offset;
292 offset = 0;
293
294 if (len > copy_thresh) {
295 ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
296 len);
297 } else {
298 ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
299 len);
300 }
301
302 if (ret < 0)
303 goto tx_failure;
304
305 desc_total += ret;
306 mp = mp->b_cont;
307 }
308
309 /* Finish off the last tcb */
310 desc_total += ixgbe_tcb_done(tcb);
311
312 /*
 313 	 * The 82598/82599 chipsets have a limitation that no more than 32 tx
 314 	 * descriptors can be transmitted out at one time. As noted above,
 315 	 * we need to include space for a context descriptor in case it's
 316 	 * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
 317 	 * as well as when it exceeds the limit.
318 *
319 * If we exceed this limit, we take the hit, do a msgpullup(), and
320 * then try again. Our DMA attributes guarantee we should never use
321 * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
322 * should only need to retry once.
323 */
324 if (desc_total >= IXGBE_TX_DESC_LIMIT) {
325 /* We shouldn't hit this path twice */
326 VERIFY0(limit_retry);
327
328 tx_ring->stat_break_tbd_limit++;
329
330 /* Release all the tcbs we used previously */
331 ixgbe_put_free_list(tx_ring, &pending_list);
332 desc_total = 0;
333 offset = 0;
334
335 pull_mp = msgpullup(orig_mp, -1);
336 if (pull_mp == NULL) {
337 tx_ring->reschedule = B_TRUE;
338 return (orig_mp);
339 }
340
341 mp = pull_mp;
342 limit_retry = B_TRUE;
343 goto retry;
344 }
345
346 /*
347 * Before filling the tx descriptor ring with the data, we need to
348 * ensure there are adequate free descriptors for transmit
349 * (including one context descriptor).
350 * Do not use up all the tx descriptors.
 351 	 * Otherwise tx recycle will fail and cause a false hang.
352 */
353 if (tx_ring->tbd_free <= (desc_total + 1)) {
354 tx_ring->tx_recycle(tx_ring);
355 }
356
357 mutex_enter(&tx_ring->tx_lock);
358 /*
 359 	 * If the number of free tx descriptors is still not enough for
 360 	 * the transmit, bail out and return the original mblk.
361 *
362 * Note: we must put this check under the mutex protection to
363 * ensure the correctness when multiple threads access it in
364 * parallel.
365 */
366 if (tx_ring->tbd_free <= (desc_total + 1)) {
367 tx_ring->stat_fail_no_tbd++;
368 mutex_exit(&tx_ring->tx_lock);
369 goto tx_failure;
370 }
371
372 /*
373 * Attach the mblk_t we've setup to the last control block.
374 * This is only done once we know there are enough free descriptors
375 * to transmit so that the cleanup in tx_failure doesn't try to
376 * call freemsg() on mp (since we will want to return it).
377 */
378 tcb->mp = (pull_mp != NULL) ? pull_mp : orig_mp;
379
380 desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
381 mbsize);
382
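	/* The extra descriptor, if present, is the context descriptor. */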
383 ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
384
385 tx_ring->stat_obytes += mbsize;
386 tx_ring->stat_opackets++;
387
388 mutex_exit(&tx_ring->tx_lock);
389
390 /*
391 * Now that tx is done, if we pulled up the original message, we
392 * can free the original message since it is no longer being
393 * used.
394 */
395 if (pull_mp != NULL) {
396 freemsg(orig_mp);
397 }
398
399 return (NULL);
400
401 tx_failure:
402 /*
 403 	 * If the transmit fails, we need to free the pulled-up mblk.
404 */
405 if (pull_mp) {
406 freemsg(pull_mp);
407 }
408
409 /*
410 * Return the tx control blocks in the pending list to the free list.
411 */
412 ixgbe_put_free_list(tx_ring, &pending_list);
413
 414 	/* Transmit failed, do not drop the mblk, reschedule the transmit */
415 tx_ring->reschedule = B_TRUE;
416
417 return (orig_mp);
418 }
419
420 /*
421 * ixgbe_tx_copy
422 *
423 * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
424 * otherwise return the number of descriptors we've completed in this call.
425 */
426 static int
427 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
428 link_list_t *pending_list, const void *buf, size_t len)
429 {
430 tx_control_block_t *tcb = *tcbp;
431 dma_buffer_t *tx_buf;
432 uint32_t desc_num = 0;
433
434 /*
435 * We need a new tcb -- either the current one (tcb) is NULL because
436 * we just started, tcb is being used for DMA, or tcb isn't large enough
437 * to hold the contents we need to copy.
438 */
439 if (tcb == NULL || tcb->tx_type == USE_DMA ||
440 tcb->tx_buf.len + len > tcb->tx_buf.size) {
441 tx_control_block_t *newtcb;
442
443 newtcb = ixgbe_get_free_list(tx_ring, pending_list);
444 if (newtcb == NULL)
445 return (-1);
446
447 newtcb->tx_type = USE_COPY;
448
449 if (tcb != NULL)
450 desc_num += ixgbe_tcb_done(tcb);
451 *tcbp = tcb = newtcb;
452 }
453
454 ASSERT3S(tcb->tx_type, ==, USE_COPY);
455 tx_buf = &tcb->tx_buf;
456
457 /*
458 * Copy the packet data of the mblk fragment into the
459 * pre-allocated tx buffer, which is maintained by the
460 * tx control block.
461 *
462 * Several mblk fragments can be copied into one tx buffer.
463 * The destination address of the current copied fragment in
464 * the tx buffer is next to the end of the previous copied
465 * fragment.
466 */
467 if (len > 0) {
468 bcopy(buf, tx_buf->address + tx_buf->len, len);
469
470 tx_buf->len += len;
471 tcb->frag_num++;
472 }
473
474 return (desc_num);
475 }
476
477 /*
478 * ixgbe_tx_bind
479 *
480 * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
481 * returns the number of descriptors completed in this call. This count
482 * can include descriptors that weren't filled in by the current call to
483 * ixgbe_tx_bind() but were being used (but not yet completed) in previous
484 * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
485 */
486 static int
487 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
488 link_list_t *pending_list, uint8_t *buf, size_t len)
489 {
490 tx_control_block_t *tcb = NULL;
491 uint_t desc_num = 0;
492 int status;
493
494 tcb = ixgbe_get_free_list(tx_ring, pending_list);
495 if (tcb == NULL)
496 return (-1);
497
498 /*
499 * Use DMA binding to process the mblk fragment
500 */
501 status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
502 (caddr_t)buf, len,
503 DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
504 0, NULL, NULL);
505
506 if (status != DDI_DMA_MAPPED) {
507 tx_ring->stat_fail_dma_bind++;
508 return (-1);
509 }
510
511 tcb->frag_num++;
512 tcb->tx_type = USE_DMA;
513
514 /*
515 * If there was an old tcb, we're about to replace it. Finish
516 * setting up the old tcb so we can replace it with the new one.
517 */
518 if (*tcbp != NULL)
519 desc_num += ixgbe_tcb_done(*tcbp);
520
521 *tcbp = tcb;
522 return (desc_num);
523 }
524
525 /*
526 * Once we're done populating a tcb (either by binding or copying into
527 * a buffer in the tcb), get it ready for tx and return the number of
528 * descriptors used.
529 */
530 static uint_t
531 ixgbe_tcb_done(tx_control_block_t *tcb)
532 {
533 uint_t desc_num = 0;
534
535 if (tcb->tx_type == USE_DMA) {
536 const ddi_dma_cookie_t *c;
537
538 for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
539 c != NULL;
540 c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
541 /*
542 * Save the address and length to the private data
543 * structure of the tx control block, which will be
544 * used to fill the tx descriptor ring after all the
545 * fragments are processed.
546 */
547 ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
548 desc_num++;
549 }
550 } else if (tcb->tx_type == USE_COPY) {
551 dma_buffer_t *tx_buf = &tcb->tx_buf;
552
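		/*
		 * Flush the copied data to the device; the pre-bound copy
		 * buffer maps to a single descriptor.
		 */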
553 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
554 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
555 desc_num++;
556 } else {
557 panic("invalid tcb type");
558 }
559
560 return (desc_num);
561 }
562
563 /*
564 * ixgbe_get_context
565 *
566 * Get the context information from the mblk
567 */
568 static int
569 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
570 {
571 uint32_t start;
572 uint32_t hckflags;
573 uint32_t lsoflags;
574 uint32_t lsocksum;
575 uint32_t mss;
576 uint32_t len;
577 uint32_t size;
578 uint32_t offset;
579 unsigned char *pos;
580 ushort_t etype;
581 uint32_t mac_hdr_len;
582 uint32_t l4_proto;
583 uint32_t l4_hdr_len;
584
585 ASSERT(mp != NULL);
586
587 mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
588 bzero(ctx, sizeof (ixgbe_tx_context_t));
589
590 if (hckflags == 0) {
591 return (0);
592 }
593
594 ctx->hcksum_flags = hckflags;
595
596 mac_lso_get(mp, &mss, &lsoflags);
597 ctx->mss = mss;
598 ctx->lso_flag = (lsoflags == HW_LSO);
599
600 etype = 0;
601 mac_hdr_len = 0;
602 l4_proto = 0;
603
604 /*
 605 	 * First, get the position of the ether_type/ether_tpid.
 606 	 * Here we don't assume the ether (VLAN) header is fully included
 607 	 * in one mblk fragment, so we go through the fragments to parse
 608 	 * the ether type.
609 */
610 size = len = MBLKL(mp);
611 offset = offsetof(struct ether_header, ether_type);
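	/*
	 * In the scan loops below, 'size' accumulates the total bytes seen
	 * so far across fragments, so the current mblk starts at absolute
	 * offset (size - len). The field at absolute offset 'offset' is
	 * therefore at b_rptr + offset - (size - len), i.e.
	 * b_rptr + offset + len - size.
	 */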
612 while (size <= offset) {
613 mp = mp->b_cont;
614 ASSERT(mp != NULL);
615 len = MBLKL(mp);
616 size += len;
617 }
618 pos = mp->b_rptr + offset + len - size;
619
620 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
621 if (etype == ETHERTYPE_VLAN) {
622 /*
623 * Get the position of the ether_type in VLAN header
624 */
625 offset = offsetof(struct ether_vlan_header, ether_type);
626 while (size <= offset) {
627 mp = mp->b_cont;
628 ASSERT(mp != NULL);
629 len = MBLKL(mp);
630 size += len;
631 }
632 pos = mp->b_rptr + offset + len - size;
633
634 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
635 mac_hdr_len = sizeof (struct ether_vlan_header);
636 } else {
637 mac_hdr_len = sizeof (struct ether_header);
638 }
639
640 /*
641 * Here we don't assume the IP(V6) header is fully included in
642 * one mblk fragment.
643 */
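	/*
	 * Checksum flags that LSO requires; HCK_IPV4_HDRCKSUM is added
	 * below for the IPv4 case.
	 */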
644 lsocksum = HCK_PARTIALCKSUM;
645 ctx->l3_proto = etype;
646 switch (etype) {
647 case ETHERTYPE_IP:
648 if (ctx->lso_flag) {
649 offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
650 while (size <= offset) {
651 mp = mp->b_cont;
652 ASSERT(mp != NULL);
653 len = MBLKL(mp);
654 size += len;
655 }
656 pos = mp->b_rptr + offset + len - size;
657 *((uint16_t *)(uintptr_t)(pos)) = 0;
658
659 offset = offsetof(ipha_t, ipha_hdr_checksum) +
660 mac_hdr_len;
661 while (size <= offset) {
662 mp = mp->b_cont;
663 ASSERT(mp != NULL);
664 len = MBLKL(mp);
665 size += len;
666 }
667 pos = mp->b_rptr + offset + len - size;
668 *((uint16_t *)(uintptr_t)(pos)) = 0;
669
670 /*
 671 			 * To perform ixgbe LSO, the TCP checksum field of
 672 			 * the packet must also be filled with the
 673 			 * pseudo-header checksum computed over
 674 			 * (ip_source_addr, ip_destination_addr, l4_proto).
 675 			 * The TCP/IP stack has already done this for us.
676 */
677 lsocksum |= HCK_IPV4_HDRCKSUM;
678 }
679
680 offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
681 while (size <= offset) {
682 mp = mp->b_cont;
683 ASSERT(mp != NULL);
684 len = MBLKL(mp);
685 size += len;
686 }
687 pos = mp->b_rptr + offset + len - size;
688
689 l4_proto = *(uint8_t *)pos;
690 break;
691 case ETHERTYPE_IPV6:
692 /*
693 * We need to zero out the length in the header.
694 */
695 if (ctx->lso_flag) {
696 offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
697 while (size <= offset) {
698 mp = mp->b_cont;
699 ASSERT(mp != NULL);
700 len = MBLKL(mp);
701 size += len;
702 }
703 pos = mp->b_rptr + offset + len - size;
704 *((uint16_t *)(uintptr_t)(pos)) = 0;
705 }
706
707 offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
708 while (size <= offset) {
709 mp = mp->b_cont;
710 ASSERT(mp != NULL);
711 len = MBLKL(mp);
712 size += len;
713 }
714 pos = mp->b_rptr + offset + len - size;
715
716 l4_proto = *(uint8_t *)pos;
717 break;
718 default:
719 /* Unrecoverable error */
720 IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
721 return (-2);
722 }
723
724 if (ctx->lso_flag) {
725 /*
 726 		 * LSO relies on tx h/w checksum, so we drop the packet here
 727 		 * if the required h/w checksum flags are not set.
728 */
729 if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
730 IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
731 "are not set for LSO, found 0x%x, needed bits 0x%x",
732 ctx->hcksum_flags, lsocksum);
733 return (-1);
734 }
735
736
737 offset = mac_hdr_len + start;
738 while (size <= offset) {
739 mp = mp->b_cont;
740 ASSERT(mp != NULL);
741 len = MBLKL(mp);
742 size += len;
743 }
744 pos = mp->b_rptr + offset + len - size;
745
746 l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
747 } else {
748 /*
749 * l4 header length is only required for LSO
750 */
751 l4_hdr_len = 0;
752 }
753
754 ctx->mac_hdr_len = mac_hdr_len;
755 ctx->ip_hdr_len = start;
756 ctx->l4_proto = l4_proto;
757 ctx->l4_hdr_len = l4_hdr_len;
758
759 return (0);
760 }
761
762 /*
763 * ixgbe_check_context
764 *
765 * Check if a new context descriptor is needed
766 */
767 static boolean_t
768 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
769 {
770 ixgbe_tx_context_t *last;
771
772 if (ctx == NULL)
773 return (B_FALSE);
774
775 /*
776 * Compare the context data retrieved from the mblk and the
 777 	 * stored data of the last context descriptor. The data that
 778 	 * need to be checked are:
779 * hcksum_flags
780 * l4_proto
781 * mac_hdr_len
782 * ip_hdr_len
783 * lso_flag
784 * mss (only checked for LSO)
 785 	 *	l4_hdr_len (only checked for LSO)
 786 	 * If any of the above data has changed, a new context descriptor
787 * will be needed.
788 */
789 last = &tx_ring->tx_context;
790
791 if ((ctx->hcksum_flags != last->hcksum_flags) ||
792 (ctx->l4_proto != last->l4_proto) ||
793 (ctx->l3_proto != last->l3_proto) ||
794 (ctx->mac_hdr_len != last->mac_hdr_len) ||
795 (ctx->ip_hdr_len != last->ip_hdr_len) ||
796 (ctx->lso_flag != last->lso_flag) ||
797 (ctx->lso_flag && ((ctx->mss != last->mss) ||
798 (ctx->l4_hdr_len != last->l4_hdr_len)))) {
799 return (B_TRUE);
800 }
801
802 return (B_FALSE);
803 }
804
805 /*
806 * ixgbe_fill_context
807 *
 808  * Fill the context descriptor with hardware checksum information
809 */
810 static void
811 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
812 ixgbe_tx_context_t *ctx)
813 {
814 /*
815 * Fill the context descriptor with the checksum
816 * context information we've got.
817 */
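	/*
	 * The low bits of vlan_macip_lens hold the IP header length; the
	 * MAC header length is shifted into the MACLEN field above it.
	 */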
818 ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
819 ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
820 IXGBE_ADVTXD_MACLEN_SHIFT;
821
822 ctx_tbd->type_tucmd_mlhl =
823 IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
824 /*
825 * When we have a TX context set up, we enforce that the ethertype is
 826 	 * either IPv4 or IPv6 in ixgbe_get_context().
827 */
828 if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
829 if (ctx->l3_proto == ETHERTYPE_IP) {
830 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
831 } else {
832 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
833 }
834 }
835
836 if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
837 switch (ctx->l4_proto) {
838 case IPPROTO_TCP:
839 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
840 break;
841 case IPPROTO_UDP:
842 /*
843 * We don't have to explicitly set:
844 * ctx_tbd->type_tucmd_mlhl |=
845 * IXGBE_ADVTXD_TUCMD_L4T_UDP;
 846 			 * because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0.
847 */
848 break;
849 default:
850 /* Unrecoverable error */
851 IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
852 break;
853 }
854 }
855
856 ctx_tbd->seqnum_seed = 0;
857
858 if (ctx->lso_flag) {
859 ctx_tbd->mss_l4len_idx =
860 (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
861 (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
862 } else {
863 ctx_tbd->mss_l4len_idx = 0;
864 }
865 }
866
867 /*
868 * ixgbe_tx_fill_ring
869 *
870 * Fill the tx descriptor ring with the data
871 */
872 static int
873 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
874 ixgbe_tx_context_t *ctx, size_t mbsize)
875 {
876 struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
877 boolean_t load_context;
878 uint32_t index, tcb_index, desc_num;
879 union ixgbe_adv_tx_desc *tbd, *first_tbd;
880 tx_control_block_t *tcb, *first_tcb;
881 uint32_t hcksum_flags;
882 int i;
883
884 ASSERT(mutex_owned(&tx_ring->tx_lock));
885
886 tbd = NULL;
887 first_tbd = NULL;
888 first_tcb = NULL;
889 desc_num = 0;
890 hcksum_flags = 0;
891 load_context = B_FALSE;
892
893 /*
894 * Get the index of the first tx descriptor that will be filled,
895 * and the index of the first work list item that will be attached
896 * with the first used tx control block in the pending list.
897 * Note: the two indexes are the same.
898 */
899 index = tx_ring->tbd_tail;
900 tcb_index = tx_ring->tbd_tail;
901
902 if (ctx != NULL) {
903 hcksum_flags = ctx->hcksum_flags;
904
905 /*
906 * Check if a new context descriptor is needed for this packet
907 */
908 load_context = ixgbe_check_context(tx_ring, ctx);
909
910 if (load_context) {
911 tbd = &tx_ring->tbd_ring[index];
912
913 /*
914 * Fill the context descriptor with the
 915 			 * hardware checksum offload information.
916 */
917 ixgbe_fill_context(
918 (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
919
920 index = NEXT_INDEX(index, 1, tx_ring->ring_size);
921 desc_num++;
922
923 /*
924 * Store the checksum context data if
925 * a new context descriptor is added
926 */
927 tx_ring->tx_context = *ctx;
928 }
929 }
930
931 first_tbd = &tx_ring->tbd_ring[index];
932
933 /*
934 * Fill tx data descriptors with the data saved in the pending list.
935 * The tx control blocks in the pending list are added to the work list
936 * at the same time.
937 *
938 * The work list is strictly 1:1 corresponding to the descriptor ring.
939 * One item of the work list corresponds to one tx descriptor. Because
940 * one tx control block can span multiple tx descriptors, the tx
941 * control block will be added to the first work list item that
942 * corresponds to the first tx descriptor generated from that tx
943 * control block.
944 */
945 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
946 first_tcb = tcb;
947 while (tcb != NULL) {
948
949 for (i = 0; i < tcb->desc_num; i++) {
950 tbd = &tx_ring->tbd_ring[index];
951
952 tbd->read.buffer_addr = tcb->desc[i].address;
953 tbd->read.cmd_type_len = tcb->desc[i].length;
954
955 tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
956 | IXGBE_ADVTXD_DTYP_DATA;
957
958 tbd->read.olinfo_status = 0;
959
960 index = NEXT_INDEX(index, 1, tx_ring->ring_size);
961 desc_num++;
962 }
963
964 /*
965 * Add the tx control block to the work list
966 */
967 ASSERT(tx_ring->work_list[tcb_index] == NULL);
968 tx_ring->work_list[tcb_index] = tcb;
969
970 tcb_index = index;
971 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
972 }
973
974 if (load_context) {
975 /*
976 * Count the context descriptor for
977 * the first tx control block.
978 */
979 first_tcb->desc_num++;
980 }
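	/*
	 * Record the index of this packet's last descriptor in the first
	 * tcb; the legacy recycle routine uses it to check the Descriptor
	 * Done status for the whole packet.
	 */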
981 first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
982
983 /*
 984 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
 985 	 * valid in the first descriptor of the packet.
 986 	 * The paylen is set in the first_tbd in all cases:
 987 	 * 82599, X540 and X550 require the packet length in the paylen field
 988 	 * with or without LSO, while 82598 ignores it in non-LSO mode.
989 */
990 ASSERT(first_tbd != NULL);
991 first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
992
993 switch (hw->mac.type) {
994 case ixgbe_mac_82598EB:
995 if (ctx != NULL && ctx->lso_flag) {
996 first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
997 first_tbd->read.olinfo_status |=
998 (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
999 - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1000 }
1001 break;
1002
1003 case ixgbe_mac_82599EB:
1004 case ixgbe_mac_X540:
1005 case ixgbe_mac_X550:
1006 case ixgbe_mac_X550EM_x:
1007 case ixgbe_mac_X550EM_a:
1008 if (ctx != NULL && ctx->lso_flag) {
1009 first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1010 first_tbd->read.olinfo_status |=
1011 (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1012 - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1013 } else {
1014 first_tbd->read.olinfo_status |=
1015 (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1016 }
1017 break;
1018
1019 default:
1020 break;
1021 }
1022
1023 /* Set hardware checksum bits */
1024 if (hcksum_flags != 0) {
1025 if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1026 first_tbd->read.olinfo_status |=
1027 IXGBE_ADVTXD_POPTS_IXSM;
1028 if (hcksum_flags & HCK_PARTIALCKSUM)
1029 first_tbd->read.olinfo_status |=
1030 IXGBE_ADVTXD_POPTS_TXSM;
1031 }
1032
1033 /*
 1034 	 * The last descriptor of the packet needs the End Of Packet (EOP)
 1035 	 * and Report Status (RS) bits set.
1036 */
1037 ASSERT(tbd != NULL);
1038 tbd->read.cmd_type_len |=
1039 IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1040
1041 /*
1042 * Sync the DMA buffer of the tx descriptor ring
1043 */
1044 DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1045
1046 /*
1047 * Update the number of the free tx descriptors.
1048 * The mutual exclusion between the transmission and the recycling
1049 * (for the tx descriptor ring and the work list) is implemented
1050 * with the atomic operation on the number of the free tx descriptors.
1051 *
 1052 	 * Note: we must always decrement the counter tbd_free before
 1053 	 * advancing the hardware TDT pointer, to avoid the race where
 1054 	 * the transmit of the descriptors completes and tbd_free is
 1055 	 * increased by the tx recycling before we have decremented it
 1056 	 * here.
1057 */
1058 i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1059 ASSERT(i >= 0);
1060
1061 tx_ring->tbd_tail = index;
1062
1063 /*
1064 * Advance the hardware TDT pointer of the tx descriptor ring
1065 */
1066 IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1067
1068 if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1069 DDI_FM_OK) {
1070 ddi_fm_service_impact(tx_ring->ixgbe->dip,
1071 DDI_SERVICE_DEGRADED);
1072 atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1073 }
1074
1075 return (desc_num);
1076 }
1077
1078 /*
1079 * ixgbe_save_desc
1080 *
1081 * Save the address/length pair to the private array
1082 * of the tx control block. The address/length pairs
1083 * will be filled into the tx descriptor ring later.
1084 */
1085 static void
1086 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1087 {
1088 sw_desc_t *desc;
1089
1090 desc = &tcb->desc[tcb->desc_num];
1091 desc->address = address;
1092 desc->length = length;
1093
1094 tcb->desc_num++;
1095 }
1096
1097 /*
1098 * ixgbe_tx_recycle_legacy
1099 *
1100 * Recycle the tx descriptors and tx control blocks.
1101 *
1102 * The work list is traversed to check if the corresponding
1103 * tx descriptors have been transmitted. If so, the resources
1104 * bound to the tx control blocks will be freed, and those
1105 * tx control blocks will be returned to the free list.
1106 */
1107 uint32_t
1108 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1109 {
1110 uint32_t index, last_index, prev_index;
1111 int desc_num;
1112 boolean_t desc_done;
1113 tx_control_block_t *tcb;
1114 link_list_t pending_list;
1115 ixgbe_t *ixgbe = tx_ring->ixgbe;
1116
1117 mutex_enter(&tx_ring->recycle_lock);
1118
1119 ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1120
1121 if (tx_ring->tbd_free == tx_ring->ring_size) {
1122 tx_ring->recycle_fail = 0;
1123 tx_ring->stall_watchdog = 0;
1124 if (tx_ring->reschedule) {
1125 tx_ring->reschedule = B_FALSE;
1126 mac_tx_ring_update(ixgbe->mac_hdl,
1127 tx_ring->ring_handle);
1128 }
1129 mutex_exit(&tx_ring->recycle_lock);
1130 return (0);
1131 }
1132
1133 /*
1134 * Sync the DMA buffer of the tx descriptor ring
1135 */
1136 DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1137
1138 if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1139 mutex_exit(&tx_ring->recycle_lock);
1140 ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1141 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1142 return (0);
1143 }
1144
1145 LINK_LIST_INIT(&pending_list);
1146 desc_num = 0;
1147 index = tx_ring->tbd_head; /* Index of next tbd/tcb to recycle */
1148
1149 tcb = tx_ring->work_list[index];
1150 ASSERT(tcb != NULL);
1151
1152 while (tcb != NULL) {
1153 /*
1154 * Get the last tx descriptor of this packet.
1155 * If the last tx descriptor is done, then
1156 * we can recycle all descriptors of a packet
1157 * which usually includes several tx control blocks.
 1158 		 * For 82599, LSO descriptors cannot be recycled
 1159 		 * unless the whole packet's transmission is done.
 1160 		 * That's why packet-level recycling is used here.
 1161 		 * For 82598, there's no such limit.
1162 */
1163 last_index = tcb->last_index;
1164 /*
1165 * MAX_TX_RING_SIZE is used to judge whether
1166 * the index is a valid value or not.
1167 */
1168 if (last_index == MAX_TX_RING_SIZE)
1169 break;
1170
1171 /*
1172 * Check if the Descriptor Done bit is set
1173 */
1174 desc_done = tx_ring->tbd_ring[last_index].wb.status &
1175 IXGBE_TXD_STAT_DD;
1176 if (desc_done) {
1177 /*
1178 * recycle all descriptors of the packet
1179 */
1180 while (tcb != NULL) {
1181 /*
1182 * Strip off the tx control block from
1183 * the work list, and add it to the
1184 * pending list.
1185 */
1186 tx_ring->work_list[index] = NULL;
1187 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1188
1189 /*
1190 * Count the total number of the tx
1191 * descriptors recycled
1192 */
1193 desc_num += tcb->desc_num;
1194
1195 index = NEXT_INDEX(index, tcb->desc_num,
1196 tx_ring->ring_size);
1197
1198 tcb = tx_ring->work_list[index];
1199
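				/*
				 * If the descriptor just before the new index
				 * is this packet's last descriptor, we have
				 * stripped off all of its tcbs; go back and
				 * check the next packet's DD status.
				 */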
1200 prev_index = PREV_INDEX(index, 1,
1201 tx_ring->ring_size);
1202 if (prev_index == last_index)
1203 break;
1204 }
1205 } else {
1206 break;
1207 }
1208 }
1209
1210 /*
1211 * If no tx descriptors are recycled, no need to do more processing
1212 */
1213 if (desc_num == 0) {
1214 tx_ring->recycle_fail++;
1215 mutex_exit(&tx_ring->recycle_lock);
1216 return (0);
1217 }
1218
1219 tx_ring->recycle_fail = 0;
1220 tx_ring->stall_watchdog = 0;
1221
1222 /*
1223 * Update the head index of the tx descriptor ring
1224 */
1225 tx_ring->tbd_head = index;
1226
1227 /*
1228 * Update the number of the free tx descriptors with atomic operations
1229 */
1230 atomic_add_32(&tx_ring->tbd_free, desc_num);
1231
1232 if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1233 (tx_ring->reschedule)) {
1234 tx_ring->reschedule = B_FALSE;
1235 mac_tx_ring_update(ixgbe->mac_hdl,
1236 tx_ring->ring_handle);
1237 }
1238 mutex_exit(&tx_ring->recycle_lock);
1239
1240 /*
1241 * Add the tx control blocks in the pending list to the free list.
1242 */
1243 ixgbe_put_free_list(tx_ring, &pending_list);
1244
1245 return (desc_num);
1246 }
1247
1248 /*
1249 * ixgbe_tx_recycle_head_wb
1250 *
1251 * Check the head write-back, and recycle all the transmitted
1252 * tx descriptors and tx control blocks.
1253 */
1254 uint32_t
1255 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1256 {
1257 uint32_t index;
1258 uint32_t head_wb;
1259 int desc_num;
1260 tx_control_block_t *tcb;
1261 link_list_t pending_list;
1262 ixgbe_t *ixgbe = tx_ring->ixgbe;
1263
1264 mutex_enter(&tx_ring->recycle_lock);
1265
1266 ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1267
1268 if (tx_ring->tbd_free == tx_ring->ring_size) {
1269 tx_ring->recycle_fail = 0;
1270 tx_ring->stall_watchdog = 0;
1271 if (tx_ring->reschedule) {
1272 tx_ring->reschedule = B_FALSE;
1273 mac_tx_ring_update(ixgbe->mac_hdl,
1274 tx_ring->ring_handle);
1275 }
1276 mutex_exit(&tx_ring->recycle_lock);
1277 return (0);
1278 }
1279
1280 /*
1281 * Sync the DMA buffer of the tx descriptor ring
1282 *
 1283 	 * Note: For head write-back mode, the tx descriptors will not
 1284 	 * be written back, but the head write-back value is stored in
 1285 	 * the extra tbd at the end of the DMA area, so we still need
 1286 	 * to sync that value for the kernel.
1287 *
1288 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1289 */
1290 (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1291 sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1292 sizeof (uint32_t),
1293 DDI_DMA_SYNC_FORKERNEL);
1294
1295 if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1296 mutex_exit(&tx_ring->recycle_lock);
1297 ddi_fm_service_impact(ixgbe->dip,
1298 DDI_SERVICE_DEGRADED);
1299 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1300 return (0);
1301 }
1302
1303 LINK_LIST_INIT(&pending_list);
1304 desc_num = 0;
1305 index = tx_ring->tbd_head; /* Next index to clean */
1306
1307 /*
1308 * Get the value of head write-back
1309 */
1310 head_wb = *tx_ring->tbd_head_wb;
1311 while (index != head_wb) {
1312 tcb = tx_ring->work_list[index];
1313 ASSERT(tcb != NULL);
1314
1315 if (OFFSET(index, head_wb, tx_ring->ring_size) <
1316 tcb->desc_num) {
1317 /*
1318 * The current tx control block is not
1319 * completely transmitted, stop recycling
1320 */
1321 break;
1322 }
1323
1324 /*
1325 * Strip off the tx control block from the work list,
1326 * and add it to the pending list.
1327 */
1328 tx_ring->work_list[index] = NULL;
1329 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1330
1331 /*
1332 * Advance the index of the tx descriptor ring
1333 */
1334 index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1335
1336 /*
1337 * Count the total number of the tx descriptors recycled
1338 */
1339 desc_num += tcb->desc_num;
1340 }
1341
1342 /*
1343 * If no tx descriptors are recycled, no need to do more processing
1344 */
1345 if (desc_num == 0) {
1346 tx_ring->recycle_fail++;
1347 mutex_exit(&tx_ring->recycle_lock);
1348 return (0);
1349 }
1350
1351 tx_ring->recycle_fail = 0;
1352 tx_ring->stall_watchdog = 0;
1353
1354 /*
1355 * Update the head index of the tx descriptor ring
1356 */
1357 tx_ring->tbd_head = index;
1358
1359 /*
1360 * Update the number of the free tx descriptors with atomic operations
1361 */
1362 atomic_add_32(&tx_ring->tbd_free, desc_num);
1363
1364 if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1365 (tx_ring->reschedule)) {
1366 tx_ring->reschedule = B_FALSE;
1367 mac_tx_ring_update(ixgbe->mac_hdl,
1368 tx_ring->ring_handle);
1369 }
1370 mutex_exit(&tx_ring->recycle_lock);
1371
1372 /*
1373 * Add the tx control blocks in the pending list to the free list.
1374 */
1375 ixgbe_put_free_list(tx_ring, &pending_list);
1376
1377 return (desc_num);
1378 }
1379
1380 /*
1381 * ixgbe_free_tcb - free up the tx control block
1382 *
1383 * Free the resources of the tx control block, including
1384 * unbind the previously bound DMA handle, and reset other
1385 * control fields.
1386 */
1387 void
1388 ixgbe_free_tcb(tx_control_block_t *tcb)
1389 {
1390 if (tcb == NULL)
1391 return;
1392
1393 switch (tcb->tx_type) {
1394 case USE_COPY:
1395 /*
1396 * Reset the buffer length that is used for copy
1397 */
1398 tcb->tx_buf.len = 0;
1399 break;
1400 case USE_DMA:
1401 /*
1402 * Release the DMA resource that is used for
1403 * DMA binding.
1404 */
1405 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1406 break;
1407 default:
1408 break;
1409 }
1410
1411 /*
1412 * Free the mblk
1413 */
1414 if (tcb->mp != NULL) {
1415 freemsg(tcb->mp);
1416 tcb->mp = NULL;
1417 }
1418
1419 tcb->tx_type = USE_NONE;
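	/* MAX_TX_RING_SIZE marks last_index as invalid (see the recycle code). */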
1420 tcb->last_index = MAX_TX_RING_SIZE;
1421 tcb->frag_num = 0;
1422 tcb->desc_num = 0;
1423 }
1424
1425 /*
1426 * ixgbe_get_free_list - Get a free tx control block from the free list.
1427 * Returns the tx control block and appends it to list.
1428 *
 1429  * The atomic operation on the number of available tx control blocks
 1430  * in the free list is used to keep this routine mutually exclusive
 1431  * with the routine ixgbe_put_free_list().
1432 */
1433 static tx_control_block_t *
1434 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
1435 {
1436 tx_control_block_t *tcb;
1437
1438 /*
 1439 	 * Check and update the number of free tx control blocks
1440 * in the free list.
1441 */
1442 if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
1443 tx_ring->stat_fail_no_tcb++;
1444 return (NULL);
1445 }
1446
1447 mutex_enter(&tx_ring->tcb_head_lock);
1448
1449 tcb = tx_ring->free_list[tx_ring->tcb_head];
1450 ASSERT(tcb != NULL);
1451 tx_ring->free_list[tx_ring->tcb_head] = NULL;
1452 tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1453 tx_ring->free_list_size);
1454
1455 mutex_exit(&tx_ring->tcb_head_lock);
1456
1457 LIST_PUSH_TAIL(list, &tcb->link);
1458 return (tcb);
1459 }
1460
1461 /*
1462 * ixgbe_put_free_list
1463 *
1464 * Put a list of used tx control blocks back to the free list
1465 *
1466 * A mutex is used here to ensure the serialization. The mutual exclusion
1467 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1468 * the atomic operation on the counter tcb_free.
1469 */
1470 void
1471 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1472 {
1473 uint32_t index;
1474 int tcb_num;
1475 tx_control_block_t *tcb;
1476
1477 for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
1478 tcb != NULL;
1479 tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
1480 /*
1481 * Despite the name, ixgbe_free_tcb() just releases the
1482 * resources in tcb, but does not free tcb itself.
1483 */
1484 ixgbe_free_tcb(tcb);
1485 }
1486
1487 mutex_enter(&tx_ring->tcb_tail_lock);
1488
1489 index = tx_ring->tcb_tail;
1490
1491 tcb_num = 0;
1492 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1493 while (tcb != NULL) {
1494 ASSERT(tx_ring->free_list[index] == NULL);
1495 tx_ring->free_list[index] = tcb;
1496
1497 tcb_num++;
1498
1499 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1500
1501 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1502 }
1503
1504 tx_ring->tcb_tail = index;
1505
1506 /*
 1507 	 * Update the number of free tx control blocks
1508 * in the free list. This operation must be placed
1509 * under the protection of the lock.
1510 */
1511 atomic_add_32(&tx_ring->tcb_free, tcb_num);
1512
1513 mutex_exit(&tx_ring->tcb_tail_lock);
1514 }
1515