xref: /titanic_51/usr/src/uts/common/io/igb/igb_tx.c (revision 45e662eb8429b38c18931ebeed30f2e5287ae51b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  */
29 
30 #include "igb_sw.h"
31 
32 static boolean_t igb_tx(igb_tx_ring_t *, mblk_t *);
33 static int igb_tx_copy(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
34     uint32_t, boolean_t);
35 static int igb_tx_bind(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
36     uint32_t);
37 static int igb_tx_fill_ring(igb_tx_ring_t *, link_list_t *, tx_context_t *,
38     size_t);
39 static void igb_save_desc(tx_control_block_t *, uint64_t, size_t);
40 static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *);
41 static int igb_get_tx_context(mblk_t *, tx_context_t *);
42 static boolean_t igb_check_tx_context(igb_tx_ring_t *, tx_context_t *);
43 static void igb_fill_tx_context(struct e1000_adv_tx_context_desc *,
44     tx_context_t *, uint32_t);
45 
46 #ifndef IGB_DEBUG
47 #pragma inline(igb_save_desc)
48 #pragma inline(igb_get_tx_context)
49 #pragma inline(igb_check_tx_context)
50 #pragma inline(igb_fill_tx_context)
51 #endif
52 
53 mblk_t *
54 igb_tx_ring_send(void *arg, mblk_t *mp)
55 {
56 	igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg;
57 
58 	ASSERT(tx_ring != NULL);
59 
60 	if ((tx_ring->igb->igb_state & IGB_SUSPENDED) ||
61 	    (tx_ring->igb->igb_state & IGB_ERROR) ||
62 	    !(tx_ring->igb->igb_state & IGB_STARTED)) {
63 		freemsg(mp);
64 		return (NULL);
65 	}
66 
67 	return ((igb_tx(tx_ring, mp)) ? NULL : mp);
68 }
69 
70 /*
71  * igb_tx - Main transmit processing
72  *
73  * Called from igb_m_tx with an mblk ready to transmit. this
74  * routine sets up the transmit descriptors and sends data to
75  * the wire.
76  *
77  * One mblk can consist of several fragments, each fragment
78  * will be processed with different methods based on the size.
79  * For the fragments with size less than the bcopy threshold,
80  * they will be processed by using bcopy; otherwise, they will
81  * be processed by using DMA binding.
82  *
83  * To process the mblk, a tx control block is got from the
84  * free list. One tx control block contains one tx buffer, which
85  * is used to copy mblk fragments' data; and one tx DMA handle,
86  * which is used to bind a mblk fragment with DMA resource.
87  *
88  * Several small mblk fragments can be copied into one tx control
89  * block's buffer, and then the buffer will be transmitted with
90  * one tx descriptor.
91  *
92  * A large fragment only binds with one tx control block's DMA
93  * handle, and it can span several tx descriptors for transmitting.
94  *
95  * So to transmit a packet (mblk), several tx control blocks can
96  * be used. After the processing, those tx control blocks will
97  * be put to the work list.
98  */
99 static boolean_t
100 igb_tx(igb_tx_ring_t *tx_ring, mblk_t *mp)
101 {
102 	igb_t *igb = tx_ring->igb;
103 	tx_type_t current_flag, next_flag;
104 	uint32_t current_len, next_len;
105 	uint32_t desc_total;
106 	size_t mbsize;
107 	int desc_num;
108 	boolean_t copy_done, eop;
109 	mblk_t *current_mp, *next_mp, *nmp;
110 	tx_control_block_t *tcb;
111 	tx_context_t tx_context, *ctx;
112 	link_list_t pending_list;
113 	mblk_t *hdr_new_mp = NULL;
114 	mblk_t *hdr_previous_mp = NULL;
115 	mblk_t *hdr_current_mp = NULL;
116 	uint32_t hdr_frag_len;
117 	uint32_t hdr_len, len;
118 	uint32_t copy_thresh;
119 
120 	copy_thresh = igb->tx_copy_thresh;
121 
122 	/* Get the mblk size */
123 	mbsize = 0;
124 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
125 		mbsize += MBLKL(nmp);
126 	}
127 
128 	if (igb->tx_hcksum_enable) {
129 		ctx = &tx_context;
130 		/*
131 		 * Retrieve offloading context information from the mblk
132 		 * that will be used to decide whether/how to fill the
133 		 * context descriptor.
134 		 */
135 		if (igb_get_tx_context(mp, ctx) != TX_CXT_SUCCESS) {
136 			freemsg(mp);
137 			return (B_TRUE);
138 		}
139 
140 		if ((ctx->lso_flag &&
141 		    (mbsize > (ctx->mac_hdr_len + IGB_LSO_MAXLEN))) ||
142 		    (!ctx->lso_flag &&
143 		    (mbsize > (igb->max_frame_size - ETHERFCSL)))) {
144 			freemsg(mp);
145 			IGB_DEBUGLOG_0(igb, "igb_tx: packet oversize");
146 			return (B_TRUE);
147 		}
148 	} else {
149 		ctx = NULL;
150 		if (mbsize > (igb->max_frame_size - ETHERFCSL)) {
151 			freemsg(mp);
152 			IGB_DEBUGLOG_0(igb, "igb_tx: packet oversize");
153 			return (B_TRUE);
154 		}
155 	}
156 
157 	/*
158 	 * Check and recycle tx descriptors.
159 	 * The recycle threshold here should be selected carefully
160 	 */
161 	if (tx_ring->tbd_free < igb->tx_recycle_thresh)
162 		tx_ring->tx_recycle(tx_ring);
163 
164 	/*
165 	 * After the recycling, if the tbd_free is less than the
166 	 * tx_overload_threshold, assert overload, return B_FALSE;
167 	 * and we need to re-schedule the tx again.
168 	 */
169 	if (tx_ring->tbd_free < igb->tx_overload_thresh) {
170 		tx_ring->reschedule = B_TRUE;
171 		IGB_DEBUG_STAT(tx_ring->stat_overload);
172 		return (B_FALSE);
173 	}
174 
175 	/*
176 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
177 	 * to be within one descriptor - this is required by h/w.
178 	 * Here will reallocate and refill the header if
179 	 * the headers(MAC+IP+TCP) is physical memory non-contiguous.
180 	 */
181 	if (ctx && ctx->lso_flag) {
182 		hdr_len = ctx->mac_hdr_len + ctx->ip_hdr_len + ctx->l4_hdr_len;
183 		len = MBLKL(mp);
184 		hdr_current_mp = mp;
185 		while (len < hdr_len) {
186 			hdr_previous_mp = hdr_current_mp;
187 			hdr_current_mp = hdr_current_mp->b_cont;
188 			len += MBLKL(hdr_current_mp);
189 		}
190 		/*
191 		 * If the header and the payload are in different mblks,
192 		 * we simply force the header to be copied into pre-allocated
193 		 * page-aligned buffer.
194 		 */
195 		if (len == hdr_len)
196 			goto adjust_threshold;
197 
198 		hdr_frag_len = hdr_len - (len - MBLKL(hdr_current_mp));
199 		/*
200 		 * There are two cases we will reallocate
201 		 * a mblk for the last header fragment.
202 		 * 1. the header is in multiple mblks and
203 		 *    the last fragment shares the same mblk
204 		 *    with the payload
205 		 * 2. the header is in a single mblk shared
206 		 *    with the payload but the header crosses
207 		 *    a page.
208 		 */
209 		if ((hdr_current_mp != mp) ||
210 		    (P2NPHASE((uintptr_t)hdr_current_mp->b_rptr, igb->page_size)
211 		    < hdr_len)) {
212 			/*
213 			 * reallocate the mblk for the last header fragment,
214 			 * expect it to be copied into pre-allocated
215 			 * page-aligned buffer
216 			 */
217 			hdr_new_mp = allocb(hdr_frag_len, NULL);
218 			if (!hdr_new_mp) {
219 				return (B_FALSE);
220 			}
221 
222 			/* link the new header fragment with the other parts */
223 			bcopy(hdr_current_mp->b_rptr,
224 			    hdr_new_mp->b_rptr, hdr_frag_len);
225 			hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
226 			hdr_new_mp->b_cont = hdr_current_mp;
227 			if (hdr_previous_mp)
228 				hdr_previous_mp->b_cont = hdr_new_mp;
229 			else
230 				mp = hdr_new_mp;
231 			hdr_current_mp->b_rptr += hdr_frag_len;
232 		}
233 adjust_threshold:
234 		/*
235 		 * adjust the bcopy threshhold to guarantee
236 		 * the header to use bcopy way
237 		 */
238 		if (copy_thresh < hdr_len)
239 			copy_thresh = hdr_len;
240 	}
241 
242 	/*
243 	 * The pending_list is a linked list that is used to save
244 	 * the tx control blocks that have packet data processed
245 	 * but have not put the data to the tx descriptor ring.
246 	 * It is used to reduce the lock contention of the tx_lock.
247 	 */
248 	LINK_LIST_INIT(&pending_list);
249 	desc_num = 0;
250 	desc_total = 0;
251 
252 	current_mp = mp;
253 	current_len = MBLKL(current_mp);
254 	/*
255 	 * Decide which method to use for the first fragment
256 	 */
257 	current_flag = (current_len <= copy_thresh) ?
258 	    USE_COPY : USE_DMA;
259 	/*
260 	 * If the mblk includes several contiguous small fragments,
261 	 * they may be copied into one buffer. This flag is used to
262 	 * indicate whether there are pending fragments that need to
263 	 * be copied to the current tx buffer.
264 	 *
265 	 * If this flag is B_TRUE, it indicates that a new tx control
266 	 * block is needed to process the next fragment using either
267 	 * copy or DMA binding.
268 	 *
269 	 * Otherwise, it indicates that the next fragment will be
270 	 * copied to the current tx buffer that is maintained by the
271 	 * current tx control block. No new tx control block is needed.
272 	 */
273 	copy_done = B_TRUE;
274 	while (current_mp) {
275 		next_mp = current_mp->b_cont;
276 		eop = (next_mp == NULL); /* Last fragment of the packet? */
277 		next_len = eop ? 0: MBLKL(next_mp);
278 
279 		/*
280 		 * When the current fragment is an empty fragment, if
281 		 * the next fragment will still be copied to the current
282 		 * tx buffer, we cannot skip this fragment here. Because
283 		 * the copy processing is pending for completion. We have
284 		 * to process this empty fragment in the tx_copy routine.
285 		 *
286 		 * If the copy processing is completed or a DMA binding
287 		 * processing is just completed, we can just skip this
288 		 * empty fragment.
289 		 */
290 		if ((current_len == 0) && (copy_done)) {
291 			current_mp = next_mp;
292 			current_len = next_len;
293 			current_flag = (current_len <= copy_thresh) ?
294 			    USE_COPY : USE_DMA;
295 			continue;
296 		}
297 
298 		if (copy_done) {
299 			/*
300 			 * Get a new tx control block from the free list
301 			 */
302 			tcb = igb_get_free_list(tx_ring);
303 
304 			if (tcb == NULL) {
305 				IGB_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
306 				goto tx_failure;
307 			}
308 
309 			/*
310 			 * Push the tx control block to the pending list
311 			 * to avoid using lock too early
312 			 */
313 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
314 		}
315 
316 		if (current_flag == USE_COPY) {
317 			/*
318 			 * Check whether to use bcopy or DMA binding to process
319 			 * the next fragment, and if using bcopy, whether we
320 			 * need to continue copying the next fragment into the
321 			 * current tx buffer.
322 			 */
323 			ASSERT((tcb->tx_buf.len + current_len) <=
324 			    tcb->tx_buf.size);
325 
326 			if (eop) {
327 				/*
328 				 * This is the last fragment of the packet, so
329 				 * the copy processing will be completed with
330 				 * this fragment.
331 				 */
332 				next_flag = USE_NONE;
333 				copy_done = B_TRUE;
334 			} else if ((tcb->tx_buf.len + current_len + next_len) >
335 			    tcb->tx_buf.size) {
336 				/*
337 				 * If the next fragment is too large to be
338 				 * copied to the current tx buffer, we need
339 				 * to complete the current copy processing.
340 				 */
341 				next_flag = (next_len > copy_thresh) ?
342 				    USE_DMA: USE_COPY;
343 				copy_done = B_TRUE;
344 			} else if (next_len > copy_thresh) {
345 				/*
346 				 * The next fragment needs to be processed with
347 				 * DMA binding. So the copy prcessing will be
348 				 * completed with the current fragment.
349 				 */
350 				next_flag = USE_DMA;
351 				copy_done = B_TRUE;
352 			} else {
353 				/*
354 				 * Continue to copy the next fragment to the
355 				 * current tx buffer.
356 				 */
357 				next_flag = USE_COPY;
358 				copy_done = B_FALSE;
359 			}
360 
361 			desc_num = igb_tx_copy(tx_ring, tcb, current_mp,
362 			    current_len, copy_done);
363 		} else {
364 			/*
365 			 * Check whether to use bcopy or DMA binding to process
366 			 * the next fragment.
367 			 */
368 			next_flag = (next_len > copy_thresh) ?
369 			    USE_DMA: USE_COPY;
370 			ASSERT(copy_done == B_TRUE);
371 
372 			desc_num = igb_tx_bind(tx_ring, tcb, current_mp,
373 			    current_len);
374 		}
375 
376 		if (desc_num > 0)
377 			desc_total += desc_num;
378 		else if (desc_num < 0)
379 			goto tx_failure;
380 
381 		current_mp = next_mp;
382 		current_len = next_len;
383 		current_flag = next_flag;
384 	}
385 
386 	/*
387 	 * Attach the mblk to the last tx control block
388 	 */
389 	ASSERT(tcb);
390 	ASSERT(tcb->mp == NULL);
391 	tcb->mp = mp;
392 
393 	/*
394 	 * Before fill the tx descriptor ring with the data, we need to
395 	 * ensure there are adequate free descriptors for transmit
396 	 * (including one context descriptor).
397 	 * Do not use up all the tx descriptors.
398 	 * Otherwise tx recycle will fail and cause false hang.
399 	 */
400 	if (tx_ring->tbd_free <= (desc_total + 1)) {
401 		tx_ring->tx_recycle(tx_ring);
402 	}
403 
404 	mutex_enter(&tx_ring->tx_lock);
405 
406 	/*
407 	 * If the number of free tx descriptors is not enough for transmit
408 	 * then return failure.
409 	 *
410 	 * Note: we must put this check under the mutex protection to
411 	 * ensure the correctness when multiple threads access it in
412 	 * parallel.
413 	 */
414 	if (tx_ring->tbd_free <= (desc_total + 1)) {
415 		IGB_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
416 		mutex_exit(&tx_ring->tx_lock);
417 		goto tx_failure;
418 	}
419 
420 	desc_num = igb_tx_fill_ring(tx_ring, &pending_list, ctx, mbsize);
421 
422 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
423 
424 	/* Update per-ring tx statistics */
425 	tx_ring->tx_pkts++;
426 	tx_ring->tx_bytes += mbsize;
427 
428 	mutex_exit(&tx_ring->tx_lock);
429 
430 	return (B_TRUE);
431 
432 tx_failure:
433 	/*
434 	 * If new mblk has been allocted for the last header
435 	 * fragment of a LSO packet, we should restore the
436 	 * modified mp.
437 	 */
438 	if (hdr_new_mp) {
439 		hdr_new_mp->b_cont = NULL;
440 		freeb(hdr_new_mp);
441 		hdr_current_mp->b_rptr -= hdr_frag_len;
442 		if (hdr_previous_mp)
443 			hdr_previous_mp->b_cont = hdr_current_mp;
444 		else
445 			mp = hdr_current_mp;
446 	}
447 
448 	/*
449 	 * Discard the mblk and free the used resources
450 	 */
451 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
452 	while (tcb) {
453 		tcb->mp = NULL;
454 
455 		igb_free_tcb(tcb);
456 
457 		tcb = (tx_control_block_t *)
458 		    LIST_GET_NEXT(&pending_list, &tcb->link);
459 	}
460 
461 	/*
462 	 * Return the tx control blocks in the pending list to the free list.
463 	 */
464 	igb_put_free_list(tx_ring, &pending_list);
465 
466 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
467 	tx_ring->reschedule = B_TRUE;
468 
469 	return (B_FALSE);
470 }
471 
472 /*
473  * igb_tx_copy
474  *
475  * Copy the mblk fragment to the pre-allocated tx buffer
476  */
477 static int
478 igb_tx_copy(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
479     uint32_t len, boolean_t copy_done)
480 {
481 	dma_buffer_t *tx_buf;
482 	uint32_t desc_num;
483 	_NOTE(ARGUNUSED(tx_ring));
484 
485 	tx_buf = &tcb->tx_buf;
486 
487 	/*
488 	 * Copy the packet data of the mblk fragment into the
489 	 * pre-allocated tx buffer, which is maintained by the
490 	 * tx control block.
491 	 *
492 	 * Several mblk fragments can be copied into one tx buffer.
493 	 * The destination address of the current copied fragment in
494 	 * the tx buffer is next to the end of the previous copied
495 	 * fragment.
496 	 */
497 	if (len > 0) {
498 		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
499 
500 		tx_buf->len += len;
501 		tcb->frag_num++;
502 	}
503 
504 	desc_num = 0;
505 
506 	/*
507 	 * If it is the last fragment copied to the current tx buffer,
508 	 * in other words, if there's no remaining fragment or the remaining
509 	 * fragment requires a new tx control block to process, we need to
510 	 * complete the current copy processing by syncing up the current
511 	 * DMA buffer and saving the descriptor data.
512 	 */
513 	if (copy_done) {
514 		/*
515 		 * Sync the DMA buffer of the packet data
516 		 */
517 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
518 
519 		tcb->tx_type = USE_COPY;
520 
521 		/*
522 		 * Save the address and length to the private data structure
523 		 * of the tx control block, which will be used to fill the
524 		 * tx descriptor ring after all the fragments are processed.
525 		 */
526 		igb_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
527 		desc_num++;
528 	}
529 
530 	return (desc_num);
531 }
532 
533 /*
534  * igb_tx_bind
535  *
536  * Bind the mblk fragment with DMA
537  */
538 static int
539 igb_tx_bind(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
540     uint32_t len)
541 {
542 	int status, i;
543 	ddi_dma_cookie_t dma_cookie;
544 	uint_t ncookies;
545 	int desc_num;
546 
547 	/*
548 	 * Use DMA binding to process the mblk fragment
549 	 */
550 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
551 	    (caddr_t)mp->b_rptr, len,
552 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
553 	    0, &dma_cookie, &ncookies);
554 
555 	if (status != DDI_DMA_MAPPED) {
556 		IGB_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
557 		return (-1);
558 	}
559 
560 	tcb->frag_num++;
561 	tcb->tx_type = USE_DMA;
562 	/*
563 	 * Each fragment can span several cookies. One cookie will have
564 	 * one tx descriptor to transmit.
565 	 */
566 	desc_num = 0;
567 	for (i = ncookies; i > 0; i--) {
568 		/*
569 		 * Save the address and length to the private data structure
570 		 * of the tx control block, which will be used to fill the
571 		 * tx descriptor ring after all the fragments are processed.
572 		 */
573 		igb_save_desc(tcb,
574 		    dma_cookie.dmac_laddress,
575 		    dma_cookie.dmac_size);
576 
577 		desc_num++;
578 
579 		if (i > 1)
580 			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
581 	}
582 
583 	return (desc_num);
584 }
585 
586 /*
587  * igb_get_tx_context
588  *
589  * Get the tx context information from the mblk
590  */
591 static int
592 igb_get_tx_context(mblk_t *mp, tx_context_t *ctx)
593 {
594 	uint32_t start;
595 	uint32_t flags;
596 	uint32_t lso_flag;
597 	uint32_t mss;
598 	uint32_t len;
599 	uint32_t size;
600 	uint32_t offset;
601 	unsigned char *pos;
602 	ushort_t etype;
603 	uint32_t mac_hdr_len;
604 	uint32_t l4_proto;
605 	uint32_t l4_hdr_len;
606 
607 	ASSERT(mp != NULL);
608 
609 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
610 	bzero(ctx, sizeof (tx_context_t));
611 
612 	ctx->hcksum_flags = flags;
613 
614 	if (flags == 0)
615 		return (TX_CXT_SUCCESS);
616 
617 	mac_lso_get(mp, &mss, &lso_flag);
618 	ctx->mss = mss;
619 	ctx->lso_flag = (lso_flag == HW_LSO);
620 
621 	/*
622 	 * LSO relies on tx h/w checksum, so here the packet will be
623 	 * dropped if the h/w checksum flags are not set.
624 	 */
625 	if (ctx->lso_flag) {
626 		if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
627 		    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
628 			IGB_DEBUGLOG_0(NULL, "igb_tx: h/w "
629 			    "checksum flags are not set for LSO");
630 			return (TX_CXT_E_LSO_CSUM);
631 		}
632 	}
633 
634 	etype = 0;
635 	mac_hdr_len = 0;
636 	l4_proto = 0;
637 
638 	/*
639 	 * Firstly get the position of the ether_type/ether_tpid.
640 	 * Here we don't assume the ether (VLAN) header is fully included
641 	 * in one mblk fragment, so we go thourgh the fragments to parse
642 	 * the ether type.
643 	 */
644 	size = len = MBLKL(mp);
645 	offset = offsetof(struct ether_header, ether_type);
646 	while (size <= offset) {
647 		mp = mp->b_cont;
648 		ASSERT(mp != NULL);
649 		len = MBLKL(mp);
650 		size += len;
651 	}
652 	pos = mp->b_rptr + offset + len - size;
653 
654 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
655 	if (etype == ETHERTYPE_VLAN) {
656 		/*
657 		 * Get the position of the ether_type in VLAN header
658 		 */
659 		offset = offsetof(struct ether_vlan_header, ether_type);
660 		while (size <= offset) {
661 			mp = mp->b_cont;
662 			ASSERT(mp != NULL);
663 			len = MBLKL(mp);
664 			size += len;
665 		}
666 		pos = mp->b_rptr + offset + len - size;
667 
668 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
669 		mac_hdr_len = sizeof (struct ether_vlan_header);
670 	} else {
671 		mac_hdr_len = sizeof (struct ether_header);
672 	}
673 
674 	/*
675 	 * Here we assume the IP(V6) header is fully included in one
676 	 * mblk fragment.
677 	 */
678 	switch (etype) {
679 	case ETHERTYPE_IP:
680 		offset = mac_hdr_len;
681 		while (size <= offset) {
682 			mp = mp->b_cont;
683 			ASSERT(mp != NULL);
684 			len = MBLKL(mp);
685 			size += len;
686 		}
687 		pos = mp->b_rptr + offset + len - size;
688 
689 		if (ctx->lso_flag) {
690 			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
691 			    ipha_length))) = 0;
692 
693 			/*
694 			 * To utilize igb LSO, here need to fill
695 			 * the tcp checksum field of the packet with the
696 			 * following pseudo-header checksum:
697 			 * (ip_source_addr, ip_destination_addr, l4_proto)
698 			 * and also need to fill the ip header checksum
699 			 * with zero. Currently the tcp/ip stack has done
700 			 * these.
701 			 */
702 		}
703 
704 		l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
705 		break;
706 	case ETHERTYPE_IPV6:
707 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
708 		while (size <= offset) {
709 			mp = mp->b_cont;
710 			ASSERT(mp != NULL);
711 			len = MBLKL(mp);
712 			size += len;
713 		}
714 		pos = mp->b_rptr + offset + len - size;
715 
716 		l4_proto = *(uint8_t *)pos;
717 		break;
718 	default:
719 		/* Unrecoverable error */
720 		IGB_DEBUGLOG_0(NULL, "Ethernet type field error with "
721 		    "tx hcksum flag set");
722 		return (TX_CXT_E_ETHER_TYPE);
723 	}
724 
725 	if (ctx->lso_flag) {
726 		offset = mac_hdr_len + start;
727 		while (size <= offset) {
728 			mp = mp->b_cont;
729 			ASSERT(mp != NULL);
730 			len = MBLKL(mp);
731 			size += len;
732 		}
733 		pos = mp->b_rptr + offset + len - size;
734 
735 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
736 	} else {
737 		/*
738 		 * l4 header length is only required for LSO
739 		 */
740 		l4_hdr_len = 0;
741 	}
742 
743 	ctx->mac_hdr_len = mac_hdr_len;
744 	ctx->ip_hdr_len = start;
745 	ctx->l4_proto = l4_proto;
746 	ctx->l4_hdr_len = l4_hdr_len;
747 
748 	return (TX_CXT_SUCCESS);
749 }
750 
751 /*
752  * igb_check_tx_context
753  *
754  * Check if a new context descriptor is needed
755  */
756 static boolean_t
757 igb_check_tx_context(igb_tx_ring_t *tx_ring, tx_context_t *ctx)
758 {
759 	tx_context_t *last;
760 
761 	if (ctx == NULL)
762 		return (B_FALSE);
763 
764 	/*
765 	 * Compare the context data retrieved from the mblk and the
766 	 * stored context data of the last context descriptor. The data
767 	 * need to be checked are:
768 	 *	hcksum_flags
769 	 *	l4_proto
770 	 *	mss (only check for LSO)
771 	 *	l4_hdr_len (only check for LSO)
772 	 *	ip_hdr_len
773 	 *	mac_hdr_len
774 	 * Either one of the above data is changed, a new context descriptor
775 	 * will be needed.
776 	 */
777 	last = &tx_ring->tx_context;
778 
779 	if (ctx->hcksum_flags != 0) {
780 		if ((ctx->hcksum_flags != last->hcksum_flags) ||
781 		    (ctx->l4_proto != last->l4_proto) ||
782 		    (ctx->lso_flag && ((ctx->mss != last->mss) ||
783 		    (ctx->l4_hdr_len != last->l4_hdr_len))) ||
784 		    (ctx->ip_hdr_len != last->ip_hdr_len) ||
785 		    (ctx->mac_hdr_len != last->mac_hdr_len)) {
786 			return (B_TRUE);
787 		}
788 	}
789 
790 	return (B_FALSE);
791 }
792 
793 /*
794  * igb_fill_tx_context
795  *
796  * Fill the context descriptor with hardware checksum informations
797  */
798 static void
799 igb_fill_tx_context(struct e1000_adv_tx_context_desc *ctx_tbd,
800     tx_context_t *ctx, uint32_t ring_index)
801 {
802 	/*
803 	 * Fill the context descriptor with the checksum
804 	 * context information we've got
805 	 */
806 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
807 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
808 	    E1000_ADVTXD_MACLEN_SHIFT;
809 
810 	ctx_tbd->type_tucmd_mlhl =
811 	    E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
812 
813 	if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
814 		ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
815 
816 	if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
817 		switch (ctx->l4_proto) {
818 		case IPPROTO_TCP:
819 			ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
820 			break;
821 		case IPPROTO_UDP:
822 			/*
823 			 * We don't have to explicitly set:
824 			 *	ctx_tbd->type_tucmd_mlhl |=
825 			 *	    E1000_ADVTXD_TUCMD_L4T_UDP;
826 			 * Because E1000_ADVTXD_TUCMD_L4T_UDP == 0b
827 			 */
828 			break;
829 		default:
830 			/* Unrecoverable error */
831 			IGB_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
832 			break;
833 		}
834 	}
835 
836 	ctx_tbd->seqnum_seed = 0;
837 	ctx_tbd->mss_l4len_idx = ring_index << 4;
838 	if (ctx->lso_flag) {
839 		ctx_tbd->mss_l4len_idx |=
840 		    (ctx->l4_hdr_len << E1000_ADVTXD_L4LEN_SHIFT) |
841 		    (ctx->mss << E1000_ADVTXD_MSS_SHIFT);
842 	}
843 }
844 
845 /*
846  * igb_tx_fill_ring
847  *
848  * Fill the tx descriptor ring with the data
849  */
850 static int
851 igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
852     tx_context_t *ctx, size_t mbsize)
853 {
854 	struct e1000_hw *hw = &tx_ring->igb->hw;
855 	boolean_t load_context;
856 	uint32_t index, tcb_index, desc_num;
857 	union e1000_adv_tx_desc *tbd, *first_tbd;
858 	tx_control_block_t *tcb, *first_tcb;
859 	uint32_t hcksum_flags;
860 	int i;
861 	igb_t *igb = tx_ring->igb;
862 
863 	ASSERT(mutex_owned(&tx_ring->tx_lock));
864 
865 	tbd = NULL;
866 	first_tbd = NULL;
867 	first_tcb = NULL;
868 	desc_num = 0;
869 	hcksum_flags = 0;
870 	load_context = B_FALSE;
871 
872 	/*
873 	 * Get the index of the first tx descriptor that will be filled,
874 	 * and the index of the first work list item that will be attached
875 	 * with the first used tx control block in the pending list.
876 	 * Note: the two indexes are the same.
877 	 */
878 	index = tx_ring->tbd_tail;
879 	tcb_index = tx_ring->tbd_tail;
880 
881 	if (ctx != NULL) {
882 		hcksum_flags = ctx->hcksum_flags;
883 
884 		/*
885 		 * Check if a new context descriptor is needed for this packet
886 		 */
887 		load_context = igb_check_tx_context(tx_ring, ctx);
888 		if (load_context) {
889 			tbd = &tx_ring->tbd_ring[index];
890 
891 			/*
892 			 * Fill the context descriptor with the
893 			 * hardware checksum offload informations.
894 			 */
895 			igb_fill_tx_context(
896 			    (struct e1000_adv_tx_context_desc *)tbd,
897 			    ctx, tx_ring->index);
898 
899 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
900 			desc_num++;
901 
902 			/*
903 			 * Store the checksum context data if
904 			 * a new context descriptor is added
905 			 */
906 			tx_ring->tx_context = *ctx;
907 		}
908 	}
909 
910 	first_tbd = &tx_ring->tbd_ring[index];
911 
912 	/*
913 	 * Fill tx data descriptors with the data saved in the pending list.
914 	 * The tx control blocks in the pending list are added to the work list
915 	 * at the same time.
916 	 *
917 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
918 	 * One item of the work list corresponds to one tx descriptor. Because
919 	 * one tx control block can span multiple tx descriptors, the tx
920 	 * control block will be added to the first work list item that
921 	 * corresponds to the first tx descriptor generated from that tx
922 	 * control block.
923 	 */
924 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
925 	first_tcb = tcb;
926 	while (tcb != NULL) {
927 
928 		for (i = 0; i < tcb->desc_num; i++) {
929 			tbd = &tx_ring->tbd_ring[index];
930 
931 			tbd->read.buffer_addr = tcb->desc[i].address;
932 			tbd->read.cmd_type_len = tcb->desc[i].length;
933 
934 			tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_RS |
935 			    E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_DATA |
936 			    E1000_ADVTXD_DCMD_IFCS;
937 
938 			tbd->read.olinfo_status = 0;
939 
940 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
941 			desc_num++;
942 		}
943 
944 		/*
945 		 * Add the tx control block to the work list
946 		 */
947 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
948 		tx_ring->work_list[tcb_index] = tcb;
949 
950 		tcb_index = index;
951 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
952 	}
953 
954 	if (load_context) {
955 		/*
956 		 * Count the checksum context descriptor for
957 		 * the first tx control block.
958 		 */
959 		first_tcb->desc_num++;
960 	}
961 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
962 
963 	/*
964 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
965 	 * valid in the first descriptor of the packet.
966 	 * 82576 also requires the payload length setting even without LSO
967 	 */
968 	ASSERT(first_tbd != NULL);
969 	first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_IFCS;
970 	if (ctx != NULL && ctx->lso_flag) {
971 		first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
972 		first_tbd->read.olinfo_status |=
973 		    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
974 		    - ctx->l4_hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT;
975 	} else {
976 		if (hw->mac.type >= e1000_82576) {
977 			first_tbd->read.olinfo_status |=
978 			    (mbsize << E1000_ADVTXD_PAYLEN_SHIFT);
979 		}
980 	}
981 
982 	/* Set hardware checksum bits */
983 	if (hcksum_flags != 0) {
984 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
985 			first_tbd->read.olinfo_status |=
986 			    E1000_TXD_POPTS_IXSM << 8;
987 		if (hcksum_flags & HCK_PARTIALCKSUM)
988 			first_tbd->read.olinfo_status |=
989 			    E1000_TXD_POPTS_TXSM << 8;
990 		first_tbd->read.olinfo_status |= tx_ring->index << 4;
991 	}
992 
993 	/*
994 	 * The last descriptor of packet needs End Of Packet (EOP),
995 	 * and Report Status (RS) bits set
996 	 */
997 	ASSERT(tbd != NULL);
998 	tbd->read.cmd_type_len |=
999 	    E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS;
1000 
1001 	IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt);
1002 
1003 	/*
1004 	 * Sync the DMA buffer of the tx descriptor ring
1005 	 */
1006 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1007 
1008 	/*
1009 	 * Update the number of the free tx descriptors.
1010 	 * The mutual exclusion between the transmission and the recycling
1011 	 * (for the tx descriptor ring and the work list) is implemented
1012 	 * with the atomic operation on the number of the free tx descriptors.
1013 	 *
1014 	 * Note: we should always decrement the counter tbd_free before
1015 	 * advancing the hardware TDT pointer to avoid the race condition -
1016 	 * before the counter tbd_free is decremented, the transmit of the
1017 	 * tx descriptors has done and the counter tbd_free is increased by
1018 	 * the tx recycling.
1019 	 */
1020 	i = igb_atomic_reserve(&tx_ring->tbd_free, desc_num);
1021 	ASSERT(i >= 0);
1022 
1023 	tx_ring->tbd_tail = index;
1024 
1025 	/*
1026 	 * Advance the hardware TDT pointer of the tx descriptor ring
1027 	 */
1028 	E1000_WRITE_REG(hw, E1000_TDT(tx_ring->index), index);
1029 
1030 	if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) {
1031 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
1032 		atomic_or_32(&igb->igb_state, IGB_ERROR);
1033 	}
1034 
1035 	return (desc_num);
1036 }
1037 
1038 /*
1039  * igb_save_desc
1040  *
1041  * Save the address/length pair to the private array
1042  * of the tx control block. The address/length pairs
1043  * will be filled into the tx descriptor ring later.
1044  */
1045 static void
1046 igb_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1047 {
1048 	sw_desc_t *desc;
1049 
1050 	desc = &tcb->desc[tcb->desc_num];
1051 	desc->address = address;
1052 	desc->length = length;
1053 
1054 	tcb->desc_num++;
1055 }
1056 
1057 /*
1058  * igb_tx_recycle_legacy
1059  *
1060  * Recycle the tx descriptors and tx control blocks.
1061  *
1062  * The work list is traversed to check if the corresponding
1063  * tx descriptors have been transmitted. If so, the resources
1064  * bound to the tx control blocks will be freed, and those
1065  * tx control blocks will be returned to the free list.
1066  */
1067 uint32_t
1068 igb_tx_recycle_legacy(igb_tx_ring_t *tx_ring)
1069 {
1070 	uint32_t index, last_index, next_index;
1071 	int desc_num;
1072 	boolean_t desc_done;
1073 	tx_control_block_t *tcb;
1074 	link_list_t pending_list;
1075 	igb_t *igb = tx_ring->igb;
1076 
1077 	/*
1078 	 * The mutex_tryenter() is used to avoid unnecessary
1079 	 * lock contention.
1080 	 */
1081 	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
1082 		return (0);
1083 
1084 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1085 
1086 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1087 		tx_ring->recycle_fail = 0;
1088 		tx_ring->stall_watchdog = 0;
1089 		mutex_exit(&tx_ring->recycle_lock);
1090 		return (0);
1091 	}
1092 
1093 	/*
1094 	 * Sync the DMA buffer of the tx descriptor ring
1095 	 */
1096 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1097 
1098 	if (igb_check_dma_handle(
1099 	    tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1100 		mutex_exit(&tx_ring->recycle_lock);
1101 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
1102 		atomic_or_32(&igb->igb_state, IGB_ERROR);
1103 		return (0);
1104 	}
1105 
1106 	LINK_LIST_INIT(&pending_list);
1107 	desc_num = 0;
1108 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1109 
1110 	tcb = tx_ring->work_list[index];
1111 	ASSERT(tcb != NULL);
1112 
1113 	while (tcb != NULL) {
1114 
1115 		/*
1116 		 * Get the last tx descriptor of this packet.
1117 		 * If the last tx descriptor is done, then
1118 		 * we can recycle all descriptors of a packet
1119 		 * which usually includes several tx control blocks.
1120 		 * For some chips, LSO descriptors can not be recycled
1121 		 * unless the whole packet's transmission is done.
1122 		 * That's why packet level recycling is used here.
1123 		 */
1124 		last_index = tcb->last_index;
1125 		/*
1126 		 * MAX_TX_RING_SIZE is used to judge whether
1127 		 * the index is a valid value or not.
1128 		 */
1129 		if (last_index == MAX_TX_RING_SIZE)
1130 			break;
1131 
1132 		next_index = NEXT_INDEX(last_index, 1, tx_ring->ring_size);
1133 
1134 		/*
1135 		 * Check if the Descriptor Done bit is set
1136 		 */
1137 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1138 		    E1000_TXD_STAT_DD;
1139 		if (desc_done) {
1140 			while (tcb != NULL) {
1141 				/*
1142 				 * Strip off the tx control block from the work
1143 				 * list, and add it to the pending list.
1144 				 */
1145 				tx_ring->work_list[index] = NULL;
1146 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1147 
1148 				/*
1149 				 * Count the total number of the tx descriptors
1150 				 * recycled.
1151 				 */
1152 				desc_num += tcb->desc_num;
1153 
1154 				/*
1155 				 * Advance the index of the tx descriptor ring
1156 				 */
1157 				index = NEXT_INDEX(index, tcb->desc_num,
1158 				    tx_ring->ring_size);
1159 
1160 				tcb = tx_ring->work_list[index];
1161 				if (index == next_index)
1162 					break;
1163 			}
1164 		} else {
1165 			break;
1166 		}
1167 	}
1168 
1169 	/*
1170 	 * If no tx descriptors are recycled, no need to do more processing
1171 	 */
1172 	if (desc_num == 0) {
1173 		tx_ring->recycle_fail++;
1174 		mutex_exit(&tx_ring->recycle_lock);
1175 		return (0);
1176 	}
1177 
1178 	tx_ring->recycle_fail = 0;
1179 	tx_ring->stall_watchdog = 0;
1180 
1181 	/*
1182 	 * Update the head index of the tx descriptor ring
1183 	 */
1184 	tx_ring->tbd_head = index;
1185 
1186 	/*
1187 	 * Update the number of the free tx descriptors with atomic operations
1188 	 */
1189 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1190 
1191 	mutex_exit(&tx_ring->recycle_lock);
1192 
1193 	/*
1194 	 * Free the resources used by the tx control blocks
1195 	 * in the pending list
1196 	 */
1197 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1198 	while (tcb != NULL) {
1199 		/*
1200 		 * Release the resources occupied by the tx control block
1201 		 */
1202 		igb_free_tcb(tcb);
1203 
1204 		tcb = (tx_control_block_t *)
1205 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1206 	}
1207 
1208 	/*
1209 	 * Add the tx control blocks in the pending list to the free list.
1210 	 */
1211 	igb_put_free_list(tx_ring, &pending_list);
1212 
1213 	return (desc_num);
1214 }
1215 
1216 /*
1217  * igb_tx_recycle_head_wb
1218  *
1219  * Check the head write-back, and recycle all the transmitted
1220  * tx descriptors and tx control blocks.
1221  */
1222 uint32_t
1223 igb_tx_recycle_head_wb(igb_tx_ring_t *tx_ring)
1224 {
1225 	uint32_t index;
1226 	uint32_t head_wb;
1227 	int desc_num;
1228 	tx_control_block_t *tcb;
1229 	link_list_t pending_list;
1230 	igb_t *igb = tx_ring->igb;
1231 
1232 	/*
1233 	 * The mutex_tryenter() is used to avoid unnecessary
1234 	 * lock contention.
1235 	 */
1236 	if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
1237 		return (0);
1238 
1239 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1240 
1241 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1242 		tx_ring->recycle_fail = 0;
1243 		tx_ring->stall_watchdog = 0;
1244 		mutex_exit(&tx_ring->recycle_lock);
1245 		return (0);
1246 	}
1247 
1248 	/*
1249 	 * Sync the DMA buffer of the tx descriptor ring
1250 	 *
1251 	 * Note: For head write-back mode, the tx descriptors will not
1252 	 * be written back, but the head write-back value is stored at
1253 	 * the last extra tbd at the end of the DMA area, we still need
1254 	 * to sync the head write-back value for kernel.
1255 	 *
1256 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1257 	 */
1258 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1259 	    sizeof (union e1000_adv_tx_desc) * tx_ring->ring_size,
1260 	    sizeof (uint32_t),
1261 	    DDI_DMA_SYNC_FORKERNEL);
1262 
1263 	if (igb_check_dma_handle(
1264 	    tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1265 		mutex_exit(&tx_ring->recycle_lock);
1266 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
1267 		atomic_or_32(&igb->igb_state, IGB_ERROR);
1268 		return (0);
1269 	}
1270 
1271 	LINK_LIST_INIT(&pending_list);
1272 	desc_num = 0;
1273 	index = tx_ring->tbd_head;	/* Next index to clean */
1274 
1275 	/*
1276 	 * Get the value of head write-back
1277 	 */
1278 	head_wb = *tx_ring->tbd_head_wb;
1279 	while (index != head_wb) {
1280 		tcb = tx_ring->work_list[index];
1281 		ASSERT(tcb != NULL);
1282 
1283 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1284 		    tcb->desc_num) {
1285 			/*
1286 			 * The current tx control block is not
1287 			 * completely transmitted, stop recycling
1288 			 */
1289 			break;
1290 		}
1291 
1292 		/*
1293 		 * Strip off the tx control block from the work list,
1294 		 * and add it to the pending list.
1295 		 */
1296 		tx_ring->work_list[index] = NULL;
1297 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1298 
1299 		/*
1300 		 * Advance the index of the tx descriptor ring
1301 		 */
1302 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1303 
1304 		/*
1305 		 * Count the total number of the tx descriptors recycled
1306 		 */
1307 		desc_num += tcb->desc_num;
1308 	}
1309 
1310 	/*
1311 	 * If no tx descriptors are recycled, no need to do more processing
1312 	 */
1313 	if (desc_num == 0) {
1314 		tx_ring->recycle_fail++;
1315 		mutex_exit(&tx_ring->recycle_lock);
1316 		return (0);
1317 	}
1318 
1319 	tx_ring->recycle_fail = 0;
1320 	tx_ring->stall_watchdog = 0;
1321 
1322 	/*
1323 	 * Update the head index of the tx descriptor ring
1324 	 */
1325 	tx_ring->tbd_head = index;
1326 
1327 	/*
1328 	 * Update the number of the free tx descriptors with atomic operations
1329 	 */
1330 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1331 
1332 	mutex_exit(&tx_ring->recycle_lock);
1333 
1334 	/*
1335 	 * Free the resources used by the tx control blocks
1336 	 * in the pending list
1337 	 */
1338 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1339 	while (tcb) {
1340 		/*
1341 		 * Release the resources occupied by the tx control block
1342 		 */
1343 		igb_free_tcb(tcb);
1344 
1345 		tcb = (tx_control_block_t *)
1346 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1347 	}
1348 
1349 	/*
1350 	 * Add the tx control blocks in the pending list to the free list.
1351 	 */
1352 	igb_put_free_list(tx_ring, &pending_list);
1353 
1354 	return (desc_num);
1355 }
1356 
1357 /*
1358  * igb_free_tcb - free up the tx control block
1359  *
1360  * Free the resources of the tx control block, including
1361  * unbind the previously bound DMA handle, and reset other
1362  * control fields.
1363  */
1364 void
1365 igb_free_tcb(tx_control_block_t *tcb)
1366 {
1367 	switch (tcb->tx_type) {
1368 	case USE_COPY:
1369 		/*
1370 		 * Reset the buffer length that is used for copy
1371 		 */
1372 		tcb->tx_buf.len = 0;
1373 		break;
1374 	case USE_DMA:
1375 		/*
1376 		 * Release the DMA resource that is used for
1377 		 * DMA binding.
1378 		 */
1379 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1380 		break;
1381 	default:
1382 		break;
1383 	}
1384 
1385 	/*
1386 	 * Free the mblk
1387 	 */
1388 	if (tcb->mp != NULL) {
1389 		freemsg(tcb->mp);
1390 		tcb->mp = NULL;
1391 	}
1392 
1393 	tcb->tx_type = USE_NONE;
1394 	tcb->last_index = MAX_TX_RING_SIZE;
1395 	tcb->frag_num = 0;
1396 	tcb->desc_num = 0;
1397 }
1398 
1399 /*
1400  * igb_get_free_list - Get a free tx control block from the free list
1401  *
1402  * The atomic operation on the number of the available tx control block
1403  * in the free list is used to keep this routine mutual exclusive with
1404  * the routine igb_put_check_list.
1405  */
1406 static tx_control_block_t *
1407 igb_get_free_list(igb_tx_ring_t *tx_ring)
1408 {
1409 	tx_control_block_t *tcb;
1410 
1411 	/*
1412 	 * Check and update the number of the free tx control block
1413 	 * in the free list.
1414 	 */
1415 	if (igb_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1416 		return (NULL);
1417 
1418 	mutex_enter(&tx_ring->tcb_head_lock);
1419 
1420 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1421 	ASSERT(tcb != NULL);
1422 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1423 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1424 	    tx_ring->free_list_size);
1425 
1426 	mutex_exit(&tx_ring->tcb_head_lock);
1427 
1428 	return (tcb);
1429 }
1430 
1431 /*
1432  * igb_put_free_list
1433  *
1434  * Put a list of used tx control blocks back to the free list
1435  *
1436  * A mutex is used here to ensure the serialization. The mutual exclusion
1437  * between igb_get_free_list and igb_put_free_list is implemented with
1438  * the atomic operation on the counter tcb_free.
1439  */
1440 void
1441 igb_put_free_list(igb_tx_ring_t *tx_ring, link_list_t *pending_list)
1442 {
1443 	uint32_t index;
1444 	int tcb_num;
1445 	tx_control_block_t *tcb;
1446 
1447 	mutex_enter(&tx_ring->tcb_tail_lock);
1448 
1449 	index = tx_ring->tcb_tail;
1450 
1451 	tcb_num = 0;
1452 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1453 	while (tcb != NULL) {
1454 		ASSERT(tx_ring->free_list[index] == NULL);
1455 		tx_ring->free_list[index] = tcb;
1456 
1457 		tcb_num++;
1458 
1459 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1460 
1461 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1462 	}
1463 
1464 	tx_ring->tcb_tail = index;
1465 
1466 	/*
1467 	 * Update the number of the free tx control block
1468 	 * in the free list. This operation must be placed
1469 	 * under the protection of the lock.
1470 	 */
1471 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1472 
1473 	mutex_exit(&tx_ring->tcb_tail_lock);
1474 }
1475