xref: /illumos-gate/usr/src/uts/common/io/ixgbe/ixgbe_tx.c (revision fba27d8741c08c38aa9cf5fd383633304ddad810)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
29  * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
30  * Copyright 2017 Joyent, Inc.
31  */
32 
33 #include "ixgbe_sw.h"
34 
35 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
36     uint32_t, boolean_t);
37 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
38     uint32_t);
39 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
40     ixgbe_tx_context_t *, size_t);
41 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
42 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
43 
44 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
45 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
46     ixgbe_tx_context_t *);
47 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
48     ixgbe_tx_context_t *);
49 
50 #ifndef IXGBE_DEBUG
51 #pragma inline(ixgbe_save_desc)
52 #pragma inline(ixgbe_get_context)
53 #pragma inline(ixgbe_check_context)
54 #pragma inline(ixgbe_fill_context)
55 #endif
56 
57 /*
58  * ixgbe_ring_tx
59  *
60  * To transmit one mblk through one specified ring.
61  *
62  * One mblk can consist of several fragments, each fragment
63  * will be processed with different methods based on the size.
64  * For the fragments with size less than the bcopy threshold,
65  * they will be processed by using bcopy; otherwise, they will
66  * be processed by using DMA binding.
67  *
68  * To process the mblk, a tx control block is got from the
69  * free list. One tx control block contains one tx buffer, which
70  * is used to copy mblk fragments' data; and one tx DMA handle,
71  * which is used to bind a mblk fragment with DMA resource.
72  *
73  * Several small mblk fragments can be copied into one tx control
74  * block's buffer, and then the buffer will be transmitted with
75  * one tx descriptor.
76  *
77  * A large fragment only binds with one tx control block's DMA
78  * handle, and it can span several tx descriptors for transmitting.
79  *
80  * So to transmit a packet (mblk), several tx control blocks can
81  * be used. After the processing, those tx control blocks will
82  * be put to the work list.
83  */
84 mblk_t *
85 ixgbe_ring_tx(void *arg, mblk_t *mp)
86 {
87 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
88 	ixgbe_t *ixgbe = tx_ring->ixgbe;
89 	tx_type_t current_flag, next_flag;
90 	uint32_t current_len, next_len;
91 	uint32_t desc_total;
92 	size_t mbsize;
93 	int desc_num;
94 	boolean_t copy_done, eop;
95 	mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
96 	tx_control_block_t *tcb;
97 	ixgbe_tx_context_t tx_context, *ctx;
98 	link_list_t pending_list;
99 	uint32_t len, hdr_frag_len, hdr_len;
100 	uint32_t copy_thresh;
101 	mblk_t *hdr_new_mp = NULL;
102 	mblk_t *hdr_pre_mp = NULL;
103 	mblk_t *hdr_nmp = NULL;
104 
105 	ASSERT(mp->b_next == NULL);
106 
107 	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
108 	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
109 	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
110 	    !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
111 	    ixgbe->link_state != LINK_STATE_UP) {
112 		freemsg(mp);
113 		return (NULL);
114 	}
115 
116 	copy_thresh = ixgbe->tx_copy_thresh;
117 
118 	/* Get the mblk size */
119 	mbsize = 0;
120 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
121 		mbsize += MBLKL(nmp);
122 	}
123 
124 	if (ixgbe->tx_hcksum_enable) {
125 		/*
126 		 * Retrieve checksum context information from the mblk
127 		 * that will be used to decide whether/how to fill the
128 		 * context descriptor.
129 		 */
130 		ctx = &tx_context;
131 		if (ixgbe_get_context(mp, ctx) < 0) {
132 			freemsg(mp);
133 			return (NULL);
134 		}
135 
136 		/*
137 		 * If the mblk size exceeds the max size ixgbe could
138 		 * process, then discard this mblk, and return NULL.
139 		 */
140 		if ((ctx->lso_flag &&
141 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
142 		    (!ctx->lso_flag &&
143 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
144 			freemsg(mp);
145 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
146 			return (NULL);
147 		}
148 	} else {
149 		ctx = NULL;
150 	}
151 
152 	/*
153 	 * Check and recycle tx descriptors.
154 	 * The recycle threshold here should be selected carefully
155 	 */
156 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
157 		tx_ring->tx_recycle(tx_ring);
158 	}
159 
160 	/*
161 	 * After the recycling, if the tbd_free is less than the
162 	 * overload_threshold, assert overload, return mp;
163 	 * and we need to re-schedule the tx again.
164 	 */
165 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
166 		tx_ring->reschedule = B_TRUE;
167 		tx_ring->stat_overload++;
168 		return (mp);
169 	}
170 
171 	/*
172 	 * The pending_list is a linked list that is used to save
173 	 * the tx control blocks that have packet data processed
174 	 * but have not put the data to the tx descriptor ring.
175 	 * It is used to reduce the lock contention of the tx_lock.
176 	 */
177 	LINK_LIST_INIT(&pending_list);
178 	desc_num = 0;
179 	desc_total = 0;
180 
181 	/*
182 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
183 	 * to be within one descriptor. Here we reallocate and refill the
184 	 * the header if it's physical memory non-contiguous.
185 	 */
186 	if ((ctx != NULL) && ctx->lso_flag) {
187 		/* find the last fragment of the header */
188 		len = MBLKL(mp);
189 		ASSERT(len > 0);
190 		hdr_nmp = mp;
191 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
192 		while (len < hdr_len) {
193 			hdr_pre_mp = hdr_nmp;
194 			hdr_nmp = hdr_nmp->b_cont;
195 			len += MBLKL(hdr_nmp);
196 		}
197 		/*
198 		 * If the header and the payload are in different mblks,
199 		 * we simply force the header to be copied into pre-allocated
200 		 * page-aligned buffer.
201 		 */
202 		if (len == hdr_len)
203 			goto adjust_threshold;
204 
205 		hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
206 		/*
207 		 * There are two cases we need to reallocate a mblk for the
208 		 * last header fragment:
209 		 * 1. the header is in multiple mblks and the last fragment
210 		 * share the same mblk with the payload
211 		 * 2. the header is in a single mblk shared with the payload
212 		 * and the header is physical memory non-contiguous
213 		 */
214 		if ((hdr_nmp != mp) ||
215 		    (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
216 		    < hdr_len)) {
217 			tx_ring->stat_lso_header_fail++;
218 			/*
219 			 * reallocate the mblk for the last header fragment,
220 			 * expect to bcopy into pre-allocated page-aligned
221 			 * buffer
222 			 */
223 			hdr_new_mp = allocb(hdr_frag_len, 0);
224 			if (!hdr_new_mp)
225 				return (mp);
226 			bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
227 			    hdr_frag_len);
228 			/* link the new header fragment with the other parts */
229 			hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
230 			hdr_new_mp->b_cont = hdr_nmp;
231 			if (hdr_pre_mp)
232 				hdr_pre_mp->b_cont = hdr_new_mp;
233 			else
234 				mp = hdr_new_mp;
235 			hdr_nmp->b_rptr += hdr_frag_len;
236 		}
237 adjust_threshold:
238 		/*
239 		 * adjust the bcopy threshhold to guarantee
240 		 * the header to use bcopy way
241 		 */
242 		if (copy_thresh < hdr_len)
243 			copy_thresh = hdr_len;
244 	}
245 
246 	current_mp = mp;
247 	current_len = MBLKL(current_mp);
248 	/*
249 	 * Decide which method to use for the first fragment
250 	 */
251 	current_flag = (current_len <= copy_thresh) ?
252 	    USE_COPY : USE_DMA;
253 	/*
254 	 * If the mblk includes several contiguous small fragments,
255 	 * they may be copied into one buffer. This flag is used to
256 	 * indicate whether there are pending fragments that need to
257 	 * be copied to the current tx buffer.
258 	 *
259 	 * If this flag is B_TRUE, it indicates that a new tx control
260 	 * block is needed to process the next fragment using either
261 	 * copy or DMA binding.
262 	 *
263 	 * Otherwise, it indicates that the next fragment will be
264 	 * copied to the current tx buffer that is maintained by the
265 	 * current tx control block. No new tx control block is needed.
266 	 */
267 	copy_done = B_TRUE;
268 	while (current_mp) {
269 		next_mp = current_mp->b_cont;
270 		eop = (next_mp == NULL); /* Last fragment of the packet? */
271 		next_len = eop ? 0: MBLKL(next_mp);
272 
273 		/*
274 		 * When the current fragment is an empty fragment, if
275 		 * the next fragment will still be copied to the current
276 		 * tx buffer, we cannot skip this fragment here. Because
277 		 * the copy processing is pending for completion. We have
278 		 * to process this empty fragment in the tx_copy routine.
279 		 *
280 		 * If the copy processing is completed or a DMA binding
281 		 * processing is just completed, we can just skip this
282 		 * empty fragment.
283 		 */
284 		if ((current_len == 0) && (copy_done)) {
285 			current_mp = next_mp;
286 			current_len = next_len;
287 			current_flag = (current_len <= copy_thresh) ?
288 			    USE_COPY : USE_DMA;
289 			continue;
290 		}
291 
292 		if (copy_done) {
293 			/*
294 			 * Get a new tx control block from the free list
295 			 */
296 			tcb = ixgbe_get_free_list(tx_ring);
297 
298 			if (tcb == NULL) {
299 				tx_ring->stat_fail_no_tcb++;
300 				goto tx_failure;
301 			}
302 
303 			/*
304 			 * Push the tx control block to the pending list
305 			 * to avoid using lock too early
306 			 */
307 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
308 		}
309 
310 		if (current_flag == USE_COPY) {
311 			/*
312 			 * Check whether to use bcopy or DMA binding to process
313 			 * the next fragment, and if using bcopy, whether we
314 			 * need to continue copying the next fragment into the
315 			 * current tx buffer.
316 			 */
317 			ASSERT((tcb->tx_buf.len + current_len) <=
318 			    tcb->tx_buf.size);
319 
320 			if (eop) {
321 				/*
322 				 * This is the last fragment of the packet, so
323 				 * the copy processing will be completed with
324 				 * this fragment.
325 				 */
326 				next_flag = USE_NONE;
327 				copy_done = B_TRUE;
328 			} else if ((tcb->tx_buf.len + current_len + next_len) >
329 			    tcb->tx_buf.size) {
330 				/*
331 				 * If the next fragment is too large to be
332 				 * copied to the current tx buffer, we need
333 				 * to complete the current copy processing.
334 				 */
335 				next_flag = (next_len > copy_thresh) ?
336 				    USE_DMA: USE_COPY;
337 				copy_done = B_TRUE;
338 			} else if (next_len > copy_thresh) {
339 				/*
340 				 * The next fragment needs to be processed with
341 				 * DMA binding. So the copy prcessing will be
342 				 * completed with the current fragment.
343 				 */
344 				next_flag = USE_DMA;
345 				copy_done = B_TRUE;
346 			} else {
347 				/*
348 				 * Continue to copy the next fragment to the
349 				 * current tx buffer.
350 				 */
351 				next_flag = USE_COPY;
352 				copy_done = B_FALSE;
353 			}
354 
355 			desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
356 			    current_len, copy_done);
357 		} else {
358 			/*
359 			 * Check whether to use bcopy or DMA binding to process
360 			 * the next fragment.
361 			 */
362 			next_flag = (next_len > copy_thresh) ?
363 			    USE_DMA: USE_COPY;
364 			ASSERT(copy_done == B_TRUE);
365 
366 			desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
367 			    current_len);
368 		}
369 
370 		if (desc_num > 0)
371 			desc_total += desc_num;
372 		else if (desc_num < 0)
373 			goto tx_failure;
374 
375 		current_mp = next_mp;
376 		current_len = next_len;
377 		current_flag = next_flag;
378 	}
379 
380 	/*
381 	 * Attach the mblk to the last tx control block
382 	 */
383 	ASSERT(tcb);
384 	ASSERT(tcb->mp == NULL);
385 	tcb->mp = mp;
386 
387 	/*
388 	 * 82598/82599 chipset has a limitation that no more than 32 tx
389 	 * descriptors can be transmited out at one time.
390 	 *
391 	 * Here is a workaround for it: pull up the mblk then send it
392 	 * out with bind way. By doing so, no more than MAX_COOKIE (18)
393 	 * descriptors is needed.
394 	 */
395 	if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
396 		tx_ring->stat_break_tbd_limit++;
397 
398 		/*
399 		 * Discard the mblk and free the used resources
400 		 */
401 		tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
402 		while (tcb) {
403 			tcb->mp = NULL;
404 			ixgbe_free_tcb(tcb);
405 			tcb = (tx_control_block_t *)
406 			    LIST_GET_NEXT(&pending_list, &tcb->link);
407 		}
408 
409 		/*
410 		 * Return the tx control blocks in the pending list to
411 		 * the free list.
412 		 */
413 		ixgbe_put_free_list(tx_ring, &pending_list);
414 
415 		/*
416 		 * pull up the mblk and send it out with bind way
417 		 */
418 		if ((pull_mp = msgpullup(mp, -1)) == NULL) {
419 			tx_ring->reschedule = B_TRUE;
420 
421 			/*
422 			 * If new mblk has been allocted for the last header
423 			 * fragment of a LSO packet, we should restore the
424 			 * modified mp.
425 			 */
426 			if (hdr_new_mp) {
427 				hdr_new_mp->b_cont = NULL;
428 				freeb(hdr_new_mp);
429 				hdr_nmp->b_rptr -= hdr_frag_len;
430 				if (hdr_pre_mp)
431 					hdr_pre_mp->b_cont = hdr_nmp;
432 				else
433 					mp = hdr_nmp;
434 			}
435 			return (mp);
436 		}
437 
438 		LINK_LIST_INIT(&pending_list);
439 		desc_total = 0;
440 
441 		/*
442 		 * if the packet is a LSO packet, we simply
443 		 * transmit the header in one descriptor using the copy way
444 		 */
445 		if ((ctx != NULL) && ctx->lso_flag) {
446 			hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
447 			    ctx->l4_hdr_len;
448 
449 			tcb = ixgbe_get_free_list(tx_ring);
450 			if (tcb == NULL) {
451 				tx_ring->stat_fail_no_tcb++;
452 				goto tx_failure;
453 			}
454 			desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
455 			    hdr_len, B_TRUE);
456 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
457 			desc_total  += desc_num;
458 
459 			pull_mp->b_rptr += hdr_len;
460 		}
461 
462 		tcb = ixgbe_get_free_list(tx_ring);
463 		if (tcb == NULL) {
464 			tx_ring->stat_fail_no_tcb++;
465 			goto tx_failure;
466 		}
467 		if ((ctx != NULL) && ctx->lso_flag) {
468 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
469 			    mbsize - hdr_len);
470 		} else {
471 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
472 			    mbsize);
473 		}
474 		if (desc_num < 0) {
475 			goto tx_failure;
476 		}
477 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
478 
479 		desc_total += desc_num;
480 		tcb->mp = pull_mp;
481 	}
482 
483 	/*
484 	 * Before fill the tx descriptor ring with the data, we need to
485 	 * ensure there are adequate free descriptors for transmit
486 	 * (including one context descriptor).
487 	 * Do not use up all the tx descriptors.
488 	 * Otherwise tx recycle will fail and cause false hang.
489 	 */
490 	if (tx_ring->tbd_free <= (desc_total + 1)) {
491 		tx_ring->tx_recycle(tx_ring);
492 	}
493 
494 	mutex_enter(&tx_ring->tx_lock);
495 	/*
496 	 * If the number of free tx descriptors is not enough for transmit
497 	 * then return mp.
498 	 *
499 	 * Note: we must put this check under the mutex protection to
500 	 * ensure the correctness when multiple threads access it in
501 	 * parallel.
502 	 */
503 	if (tx_ring->tbd_free <= (desc_total + 1)) {
504 		tx_ring->stat_fail_no_tbd++;
505 		mutex_exit(&tx_ring->tx_lock);
506 		goto tx_failure;
507 	}
508 
509 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
510 	    mbsize);
511 
512 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
513 
514 	tx_ring->stat_obytes += mbsize;
515 	tx_ring->stat_opackets ++;
516 
517 	mutex_exit(&tx_ring->tx_lock);
518 
519 	/*
520 	 * now that the transmission succeeds, need to free the original
521 	 * mp if we used the pulling up mblk for transmission.
522 	 */
523 	if (pull_mp) {
524 		freemsg(mp);
525 	}
526 
527 	return (NULL);
528 
529 tx_failure:
530 	/*
531 	 * If transmission fails, need to free the pulling up mblk.
532 	 */
533 	if (pull_mp) {
534 		freemsg(pull_mp);
535 	}
536 
537 	/*
538 	 * If new mblk has been allocted for the last header
539 	 * fragment of a LSO packet, we should restore the
540 	 * modified mp.
541 	 */
542 	if (hdr_new_mp) {
543 		hdr_new_mp->b_cont = NULL;
544 		freeb(hdr_new_mp);
545 		hdr_nmp->b_rptr -= hdr_frag_len;
546 		if (hdr_pre_mp)
547 			hdr_pre_mp->b_cont = hdr_nmp;
548 		else
549 			mp = hdr_nmp;
550 	}
551 	/*
552 	 * Discard the mblk and free the used resources
553 	 */
554 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
555 	while (tcb) {
556 		tcb->mp = NULL;
557 
558 		ixgbe_free_tcb(tcb);
559 
560 		tcb = (tx_control_block_t *)
561 		    LIST_GET_NEXT(&pending_list, &tcb->link);
562 	}
563 
564 	/*
565 	 * Return the tx control blocks in the pending list to the free list.
566 	 */
567 	ixgbe_put_free_list(tx_ring, &pending_list);
568 
569 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
570 	tx_ring->reschedule = B_TRUE;
571 
572 	return (mp);
573 }
574 
575 /*
576  * ixgbe_tx_copy
577  *
578  * Copy the mblk fragment to the pre-allocated tx buffer
579  */
580 static int
581 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
582     uint32_t len, boolean_t copy_done)
583 {
584 	dma_buffer_t *tx_buf;
585 	uint32_t desc_num;
586 	_NOTE(ARGUNUSED(tx_ring));
587 
588 	tx_buf = &tcb->tx_buf;
589 
590 	/*
591 	 * Copy the packet data of the mblk fragment into the
592 	 * pre-allocated tx buffer, which is maintained by the
593 	 * tx control block.
594 	 *
595 	 * Several mblk fragments can be copied into one tx buffer.
596 	 * The destination address of the current copied fragment in
597 	 * the tx buffer is next to the end of the previous copied
598 	 * fragment.
599 	 */
600 	if (len > 0) {
601 		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
602 
603 		tx_buf->len += len;
604 		tcb->frag_num++;
605 	}
606 
607 	desc_num = 0;
608 
609 	/*
610 	 * If it is the last fragment copied to the current tx buffer,
611 	 * in other words, if there's no remaining fragment or the remaining
612 	 * fragment requires a new tx control block to process, we need to
613 	 * complete the current copy processing by syncing up the current
614 	 * DMA buffer and saving the descriptor data.
615 	 */
616 	if (copy_done) {
617 		/*
618 		 * Sync the DMA buffer of the packet data
619 		 */
620 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
621 
622 		tcb->tx_type = USE_COPY;
623 
624 		/*
625 		 * Save the address and length to the private data structure
626 		 * of the tx control block, which will be used to fill the
627 		 * tx descriptor ring after all the fragments are processed.
628 		 */
629 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
630 		desc_num++;
631 	}
632 
633 	return (desc_num);
634 }
635 
636 /*
637  * ixgbe_tx_bind
638  *
639  * Bind the mblk fragment with DMA
640  */
641 static int
642 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
643     uint32_t len)
644 {
645 	int status, i;
646 	ddi_dma_cookie_t dma_cookie;
647 	uint_t ncookies;
648 	int desc_num;
649 
650 	/*
651 	 * Use DMA binding to process the mblk fragment
652 	 */
653 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
654 	    (caddr_t)mp->b_rptr, len,
655 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
656 	    0, &dma_cookie, &ncookies);
657 
658 	if (status != DDI_DMA_MAPPED) {
659 		tx_ring->stat_fail_dma_bind++;
660 		return (-1);
661 	}
662 
663 	tcb->frag_num++;
664 	tcb->tx_type = USE_DMA;
665 	/*
666 	 * Each fragment can span several cookies. One cookie will have
667 	 * one tx descriptor to transmit.
668 	 */
669 	desc_num = 0;
670 	for (i = ncookies; i > 0; i--) {
671 		/*
672 		 * Save the address and length to the private data structure
673 		 * of the tx control block, which will be used to fill the
674 		 * tx descriptor ring after all the fragments are processed.
675 		 */
676 		ixgbe_save_desc(tcb,
677 		    dma_cookie.dmac_laddress,
678 		    dma_cookie.dmac_size);
679 
680 		desc_num++;
681 
682 		if (i > 1)
683 			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
684 	}
685 
686 	return (desc_num);
687 }
688 
689 /*
690  * ixgbe_get_context
691  *
692  * Get the context information from the mblk
693  */
694 static int
695 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
696 {
697 	uint32_t start;
698 	uint32_t hckflags;
699 	uint32_t lsoflags;
700 	uint32_t lsocksum;
701 	uint32_t mss;
702 	uint32_t len;
703 	uint32_t size;
704 	uint32_t offset;
705 	unsigned char *pos;
706 	ushort_t etype;
707 	uint32_t mac_hdr_len;
708 	uint32_t l4_proto;
709 	uint32_t l4_hdr_len;
710 
711 	ASSERT(mp != NULL);
712 
713 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
714 	bzero(ctx, sizeof (ixgbe_tx_context_t));
715 
716 	if (hckflags == 0) {
717 		return (0);
718 	}
719 
720 	ctx->hcksum_flags = hckflags;
721 
722 	mac_lso_get(mp, &mss, &lsoflags);
723 	ctx->mss = mss;
724 	ctx->lso_flag = (lsoflags == HW_LSO);
725 
726 	etype = 0;
727 	mac_hdr_len = 0;
728 	l4_proto = 0;
729 
730 	/*
731 	 * Firstly get the position of the ether_type/ether_tpid.
732 	 * Here we don't assume the ether (VLAN) header is fully included
733 	 * in one mblk fragment, so we go thourgh the fragments to parse
734 	 * the ether type.
735 	 */
736 	size = len = MBLKL(mp);
737 	offset = offsetof(struct ether_header, ether_type);
738 	while (size <= offset) {
739 		mp = mp->b_cont;
740 		ASSERT(mp != NULL);
741 		len = MBLKL(mp);
742 		size += len;
743 	}
744 	pos = mp->b_rptr + offset + len - size;
745 
746 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
747 	if (etype == ETHERTYPE_VLAN) {
748 		/*
749 		 * Get the position of the ether_type in VLAN header
750 		 */
751 		offset = offsetof(struct ether_vlan_header, ether_type);
752 		while (size <= offset) {
753 			mp = mp->b_cont;
754 			ASSERT(mp != NULL);
755 			len = MBLKL(mp);
756 			size += len;
757 		}
758 		pos = mp->b_rptr + offset + len - size;
759 
760 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
761 		mac_hdr_len = sizeof (struct ether_vlan_header);
762 	} else {
763 		mac_hdr_len = sizeof (struct ether_header);
764 	}
765 
766 	/*
767 	 * Here we don't assume the IP(V6) header is fully included in
768 	 * one mblk fragment.
769 	 */
770 	lsocksum = HCK_PARTIALCKSUM;
771 	ctx->l3_proto = etype;
772 	switch (etype) {
773 	case ETHERTYPE_IP:
774 		if (ctx->lso_flag) {
775 			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
776 			while (size <= offset) {
777 				mp = mp->b_cont;
778 				ASSERT(mp != NULL);
779 				len = MBLKL(mp);
780 				size += len;
781 			}
782 			pos = mp->b_rptr + offset + len - size;
783 			*((uint16_t *)(uintptr_t)(pos)) = 0;
784 
785 			offset = offsetof(ipha_t, ipha_hdr_checksum) +
786 			    mac_hdr_len;
787 			while (size <= offset) {
788 				mp = mp->b_cont;
789 				ASSERT(mp != NULL);
790 				len = MBLKL(mp);
791 				size += len;
792 			}
793 			pos = mp->b_rptr + offset + len - size;
794 			*((uint16_t *)(uintptr_t)(pos)) = 0;
795 
796 			/*
797 			 * To perform ixgbe LSO, here also need to fill
798 			 * the tcp checksum field of the packet with the
799 			 * following pseudo-header checksum:
800 			 * (ip_source_addr, ip_destination_addr, l4_proto)
801 			 * Currently the tcp/ip stack has done it.
802 			 */
803 			lsocksum |= HCK_IPV4_HDRCKSUM;
804 		}
805 
806 		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
807 		while (size <= offset) {
808 			mp = mp->b_cont;
809 			ASSERT(mp != NULL);
810 			len = MBLKL(mp);
811 			size += len;
812 		}
813 		pos = mp->b_rptr + offset + len - size;
814 
815 		l4_proto = *(uint8_t *)pos;
816 		break;
817 	case ETHERTYPE_IPV6:
818 		/*
819 		 * We need to zero out the length in the header.
820 		 */
821 		if (ctx->lso_flag) {
822 			offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
823 			while (size <= offset) {
824 				mp = mp->b_cont;
825 				ASSERT(mp != NULL);
826 				len = MBLKL(mp);
827 				size += len;
828 			}
829 			pos = mp->b_rptr + offset + len - size;
830 			*((uint16_t *)(uintptr_t)(pos)) = 0;
831 		}
832 
833 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
834 		while (size <= offset) {
835 			mp = mp->b_cont;
836 			ASSERT(mp != NULL);
837 			len = MBLKL(mp);
838 			size += len;
839 		}
840 		pos = mp->b_rptr + offset + len - size;
841 
842 		l4_proto = *(uint8_t *)pos;
843 		break;
844 	default:
845 		/* Unrecoverable error */
846 		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
847 		return (-2);
848 	}
849 
850 	if (ctx->lso_flag) {
851 		/*
852 		 * LSO relies on tx h/w checksum, so here will drop the packet
853 		 * if h/w checksum flag is not declared.
854 		 */
855 		if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
856 			IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
857 			    "are not set for LSO, found 0x%x, needed bits 0x%x",
858 			    ctx->hcksum_flags, lsocksum);
859 			return (-1);
860 		}
861 
862 
863 		offset = mac_hdr_len + start;
864 		while (size <= offset) {
865 			mp = mp->b_cont;
866 			ASSERT(mp != NULL);
867 			len = MBLKL(mp);
868 			size += len;
869 		}
870 		pos = mp->b_rptr + offset + len - size;
871 
872 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
873 	} else {
874 		/*
875 		 * l4 header length is only required for LSO
876 		 */
877 		l4_hdr_len = 0;
878 	}
879 
880 	ctx->mac_hdr_len = mac_hdr_len;
881 	ctx->ip_hdr_len = start;
882 	ctx->l4_proto = l4_proto;
883 	ctx->l4_hdr_len = l4_hdr_len;
884 
885 	return (0);
886 }
887 
888 /*
889  * ixgbe_check_context
890  *
891  * Check if a new context descriptor is needed
892  */
893 static boolean_t
894 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
895 {
896 	ixgbe_tx_context_t *last;
897 
898 	if (ctx == NULL)
899 		return (B_FALSE);
900 
901 	/*
902 	 * Compare the context data retrieved from the mblk and the
903 	 * stored data of the last context descriptor. The data need
904 	 * to be checked are:
905 	 *	hcksum_flags
906 	 *	l4_proto
907 	 *	mac_hdr_len
908 	 *	ip_hdr_len
909 	 *	lso_flag
910 	 *	mss (only checked for LSO)
911 	 *	l4_hr_len (only checked for LSO)
912 	 * Either one of the above data is changed, a new context descriptor
913 	 * will be needed.
914 	 */
915 	last = &tx_ring->tx_context;
916 
917 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
918 	    (ctx->l4_proto != last->l4_proto) ||
919 	    (ctx->l3_proto != last->l3_proto) ||
920 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
921 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
922 	    (ctx->lso_flag != last->lso_flag) ||
923 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
924 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
925 		return (B_TRUE);
926 	}
927 
928 	return (B_FALSE);
929 }
930 
931 /*
932  * ixgbe_fill_context
933  *
934  * Fill the context descriptor with hardware checksum informations
935  */
936 static void
937 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
938     ixgbe_tx_context_t *ctx)
939 {
940 	/*
941 	 * Fill the context descriptor with the checksum
942 	 * context information we've got.
943 	 */
944 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
945 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
946 	    IXGBE_ADVTXD_MACLEN_SHIFT;
947 
948 	ctx_tbd->type_tucmd_mlhl =
949 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
950 	/*
951 	 * When we have a TX context set up, we enforce that the ethertype is
952 	 * either IPv4 or IPv6 in ixgbe_get_tx_context().
953 	 */
954 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
955 		if (ctx->l3_proto == ETHERTYPE_IP) {
956 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
957 		} else {
958 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
959 		}
960 	}
961 
962 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
963 		switch (ctx->l4_proto) {
964 		case IPPROTO_TCP:
965 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
966 			break;
967 		case IPPROTO_UDP:
968 			/*
969 			 * We don't have to explicitly set:
970 			 *	ctx_tbd->type_tucmd_mlhl |=
971 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
972 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
973 			 */
974 			break;
975 		default:
976 			/* Unrecoverable error */
977 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
978 			break;
979 		}
980 	}
981 
982 	ctx_tbd->seqnum_seed = 0;
983 
984 	if (ctx->lso_flag) {
985 		ctx_tbd->mss_l4len_idx =
986 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
987 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
988 	} else {
989 		ctx_tbd->mss_l4len_idx = 0;
990 	}
991 }
992 
993 /*
994  * ixgbe_tx_fill_ring
995  *
996  * Fill the tx descriptor ring with the data
997  */
998 static int
999 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
1000     ixgbe_tx_context_t *ctx, size_t mbsize)
1001 {
1002 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
1003 	boolean_t load_context;
1004 	uint32_t index, tcb_index, desc_num;
1005 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
1006 	tx_control_block_t *tcb, *first_tcb;
1007 	uint32_t hcksum_flags;
1008 	int i;
1009 
1010 	ASSERT(mutex_owned(&tx_ring->tx_lock));
1011 
1012 	tbd = NULL;
1013 	first_tbd = NULL;
1014 	first_tcb = NULL;
1015 	desc_num = 0;
1016 	hcksum_flags = 0;
1017 	load_context = B_FALSE;
1018 
1019 	/*
1020 	 * Get the index of the first tx descriptor that will be filled,
1021 	 * and the index of the first work list item that will be attached
1022 	 * with the first used tx control block in the pending list.
1023 	 * Note: the two indexes are the same.
1024 	 */
1025 	index = tx_ring->tbd_tail;
1026 	tcb_index = tx_ring->tbd_tail;
1027 
1028 	if (ctx != NULL) {
1029 		hcksum_flags = ctx->hcksum_flags;
1030 
1031 		/*
1032 		 * Check if a new context descriptor is needed for this packet
1033 		 */
1034 		load_context = ixgbe_check_context(tx_ring, ctx);
1035 
1036 		if (load_context) {
1037 			tbd = &tx_ring->tbd_ring[index];
1038 
1039 			/*
1040 			 * Fill the context descriptor with the
1041 			 * hardware checksum offload informations.
1042 			 */
1043 			ixgbe_fill_context(
1044 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1045 
1046 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1047 			desc_num++;
1048 
1049 			/*
1050 			 * Store the checksum context data if
1051 			 * a new context descriptor is added
1052 			 */
1053 			tx_ring->tx_context = *ctx;
1054 		}
1055 	}
1056 
1057 	first_tbd = &tx_ring->tbd_ring[index];
1058 
1059 	/*
1060 	 * Fill tx data descriptors with the data saved in the pending list.
1061 	 * The tx control blocks in the pending list are added to the work list
1062 	 * at the same time.
1063 	 *
1064 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
1065 	 * One item of the work list corresponds to one tx descriptor. Because
1066 	 * one tx control block can span multiple tx descriptors, the tx
1067 	 * control block will be added to the first work list item that
1068 	 * corresponds to the first tx descriptor generated from that tx
1069 	 * control block.
1070 	 */
1071 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1072 	first_tcb = tcb;
1073 	while (tcb != NULL) {
1074 
1075 		for (i = 0; i < tcb->desc_num; i++) {
1076 			tbd = &tx_ring->tbd_ring[index];
1077 
1078 			tbd->read.buffer_addr = tcb->desc[i].address;
1079 			tbd->read.cmd_type_len = tcb->desc[i].length;
1080 
1081 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1082 			    | IXGBE_ADVTXD_DTYP_DATA;
1083 
1084 			tbd->read.olinfo_status = 0;
1085 
1086 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1087 			desc_num++;
1088 		}
1089 
1090 		/*
1091 		 * Add the tx control block to the work list
1092 		 */
1093 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
1094 		tx_ring->work_list[tcb_index] = tcb;
1095 
1096 		tcb_index = index;
1097 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1098 	}
1099 
1100 	if (load_context) {
1101 		/*
1102 		 * Count the context descriptor for
1103 		 * the first tx control block.
1104 		 */
1105 		first_tcb->desc_num++;
1106 	}
1107 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1108 
1109 	/*
1110 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1111 	 * valid in the first descriptor of the packet.
1112 	 * Setting paylen in every first_tbd for all parts.
1113 	 * 82599, X540 and X550 require the packet length in paylen field
1114 	 * with or without LSO and 82598 will ignore it in non-LSO mode.
1115 	 */
1116 	ASSERT(first_tbd != NULL);
1117 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1118 
1119 	switch (hw->mac.type) {
1120 	case ixgbe_mac_82598EB:
1121 		if (ctx != NULL && ctx->lso_flag) {
1122 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1123 			first_tbd->read.olinfo_status |=
1124 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1125 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1126 		}
1127 		break;
1128 
1129 	case ixgbe_mac_82599EB:
1130 	case ixgbe_mac_X540:
1131 	case ixgbe_mac_X550:
1132 	case ixgbe_mac_X550EM_x:
1133 	case ixgbe_mac_X550EM_a:
1134 		if (ctx != NULL && ctx->lso_flag) {
1135 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1136 			first_tbd->read.olinfo_status |=
1137 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1138 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1139 		} else {
1140 			first_tbd->read.olinfo_status |=
1141 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1142 		}
1143 		break;
1144 
1145 	default:
1146 		break;
1147 	}
1148 
1149 	/* Set hardware checksum bits */
1150 	if (hcksum_flags != 0) {
1151 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1152 			first_tbd->read.olinfo_status |=
1153 			    IXGBE_ADVTXD_POPTS_IXSM;
1154 		if (hcksum_flags & HCK_PARTIALCKSUM)
1155 			first_tbd->read.olinfo_status |=
1156 			    IXGBE_ADVTXD_POPTS_TXSM;
1157 	}
1158 
1159 	/*
1160 	 * The last descriptor of packet needs End Of Packet (EOP),
1161 	 * and Report Status (RS) bits set
1162 	 */
1163 	ASSERT(tbd != NULL);
1164 	tbd->read.cmd_type_len |=
1165 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1166 
1167 	/*
1168 	 * Sync the DMA buffer of the tx descriptor ring
1169 	 */
1170 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1171 
1172 	/*
1173 	 * Update the number of the free tx descriptors.
1174 	 * The mutual exclusion between the transmission and the recycling
1175 	 * (for the tx descriptor ring and the work list) is implemented
1176 	 * with the atomic operation on the number of the free tx descriptors.
1177 	 *
1178 	 * Note: we should always decrement the counter tbd_free before
1179 	 * advancing the hardware TDT pointer to avoid the race condition -
1180 	 * before the counter tbd_free is decremented, the transmit of the
1181 	 * tx descriptors has done and the counter tbd_free is increased by
1182 	 * the tx recycling.
1183 	 */
1184 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1185 	ASSERT(i >= 0);
1186 
1187 	tx_ring->tbd_tail = index;
1188 
1189 	/*
1190 	 * Advance the hardware TDT pointer of the tx descriptor ring
1191 	 */
1192 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1193 
1194 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1195 	    DDI_FM_OK) {
1196 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1197 		    DDI_SERVICE_DEGRADED);
1198 		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1199 	}
1200 
1201 	return (desc_num);
1202 }
1203 
1204 /*
1205  * ixgbe_save_desc
1206  *
1207  * Save the address/length pair to the private array
1208  * of the tx control block. The address/length pairs
1209  * will be filled into the tx descriptor ring later.
1210  */
1211 static void
1212 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1213 {
1214 	sw_desc_t *desc;
1215 
1216 	desc = &tcb->desc[tcb->desc_num];
1217 	desc->address = address;
1218 	desc->length = length;
1219 
1220 	tcb->desc_num++;
1221 }
1222 
1223 /*
1224  * ixgbe_tx_recycle_legacy
1225  *
1226  * Recycle the tx descriptors and tx control blocks.
1227  *
1228  * The work list is traversed to check if the corresponding
1229  * tx descriptors have been transmitted. If so, the resources
1230  * bound to the tx control blocks will be freed, and those
1231  * tx control blocks will be returned to the free list.
1232  */
1233 uint32_t
1234 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1235 {
1236 	uint32_t index, last_index, prev_index;
1237 	int desc_num;
1238 	boolean_t desc_done;
1239 	tx_control_block_t *tcb;
1240 	link_list_t pending_list;
1241 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1242 
1243 	mutex_enter(&tx_ring->recycle_lock);
1244 
1245 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1246 
1247 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1248 		tx_ring->recycle_fail = 0;
1249 		tx_ring->stall_watchdog = 0;
1250 		if (tx_ring->reschedule) {
1251 			tx_ring->reschedule = B_FALSE;
1252 			mac_tx_ring_update(ixgbe->mac_hdl,
1253 			    tx_ring->ring_handle);
1254 		}
1255 		mutex_exit(&tx_ring->recycle_lock);
1256 		return (0);
1257 	}
1258 
1259 	/*
1260 	 * Sync the DMA buffer of the tx descriptor ring
1261 	 */
1262 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1263 
1264 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1265 		mutex_exit(&tx_ring->recycle_lock);
1266 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1267 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1268 		return (0);
1269 	}
1270 
1271 	LINK_LIST_INIT(&pending_list);
1272 	desc_num = 0;
1273 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1274 
1275 	tcb = tx_ring->work_list[index];
1276 	ASSERT(tcb != NULL);
1277 
1278 	while (tcb != NULL) {
1279 		/*
1280 		 * Get the last tx descriptor of this packet.
1281 		 * If the last tx descriptor is done, then
1282 		 * we can recycle all descriptors of a packet
1283 		 * which usually includes several tx control blocks.
1284 		 * For 82599, LSO descriptors can not be recycled
1285 		 * unless the whole packet's transmission is done.
1286 		 * That's why packet level recycling is used here.
1287 		 * For 82598, there's not such limit.
1288 		 */
1289 		last_index = tcb->last_index;
1290 		/*
1291 		 * MAX_TX_RING_SIZE is used to judge whether
1292 		 * the index is a valid value or not.
1293 		 */
1294 		if (last_index == MAX_TX_RING_SIZE)
1295 			break;
1296 
1297 		/*
1298 		 * Check if the Descriptor Done bit is set
1299 		 */
1300 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1301 		    IXGBE_TXD_STAT_DD;
1302 		if (desc_done) {
1303 			/*
1304 			 * recycle all descriptors of the packet
1305 			 */
1306 			while (tcb != NULL) {
1307 				/*
1308 				 * Strip off the tx control block from
1309 				 * the work list, and add it to the
1310 				 * pending list.
1311 				 */
1312 				tx_ring->work_list[index] = NULL;
1313 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1314 
1315 				/*
1316 				 * Count the total number of the tx
1317 				 * descriptors recycled
1318 				 */
1319 				desc_num += tcb->desc_num;
1320 
1321 				index = NEXT_INDEX(index, tcb->desc_num,
1322 				    tx_ring->ring_size);
1323 
1324 				tcb = tx_ring->work_list[index];
1325 
1326 				prev_index = PREV_INDEX(index, 1,
1327 				    tx_ring->ring_size);
1328 				if (prev_index == last_index)
1329 					break;
1330 			}
1331 		} else {
1332 			break;
1333 		}
1334 	}
1335 
1336 	/*
1337 	 * If no tx descriptors are recycled, no need to do more processing
1338 	 */
1339 	if (desc_num == 0) {
1340 		tx_ring->recycle_fail++;
1341 		mutex_exit(&tx_ring->recycle_lock);
1342 		return (0);
1343 	}
1344 
1345 	tx_ring->recycle_fail = 0;
1346 	tx_ring->stall_watchdog = 0;
1347 
1348 	/*
1349 	 * Update the head index of the tx descriptor ring
1350 	 */
1351 	tx_ring->tbd_head = index;
1352 
1353 	/*
1354 	 * Update the number of the free tx descriptors with atomic operations
1355 	 */
1356 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1357 
1358 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1359 	    (tx_ring->reschedule)) {
1360 		tx_ring->reschedule = B_FALSE;
1361 		mac_tx_ring_update(ixgbe->mac_hdl,
1362 		    tx_ring->ring_handle);
1363 	}
1364 	mutex_exit(&tx_ring->recycle_lock);
1365 
1366 	/*
1367 	 * Free the resources used by the tx control blocks
1368 	 * in the pending list
1369 	 */
1370 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1371 	while (tcb != NULL) {
1372 		/*
1373 		 * Release the resources occupied by the tx control block
1374 		 */
1375 		ixgbe_free_tcb(tcb);
1376 
1377 		tcb = (tx_control_block_t *)
1378 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1379 	}
1380 
1381 	/*
1382 	 * Add the tx control blocks in the pending list to the free list.
1383 	 */
1384 	ixgbe_put_free_list(tx_ring, &pending_list);
1385 
1386 	return (desc_num);
1387 }
1388 
1389 /*
1390  * ixgbe_tx_recycle_head_wb
1391  *
1392  * Check the head write-back, and recycle all the transmitted
1393  * tx descriptors and tx control blocks.
1394  */
1395 uint32_t
1396 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1397 {
1398 	uint32_t index;
1399 	uint32_t head_wb;
1400 	int desc_num;
1401 	tx_control_block_t *tcb;
1402 	link_list_t pending_list;
1403 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1404 
1405 	mutex_enter(&tx_ring->recycle_lock);
1406 
1407 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1408 
1409 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1410 		tx_ring->recycle_fail = 0;
1411 		tx_ring->stall_watchdog = 0;
1412 		if (tx_ring->reschedule) {
1413 			tx_ring->reschedule = B_FALSE;
1414 			mac_tx_ring_update(ixgbe->mac_hdl,
1415 			    tx_ring->ring_handle);
1416 		}
1417 		mutex_exit(&tx_ring->recycle_lock);
1418 		return (0);
1419 	}
1420 
1421 	/*
1422 	 * Sync the DMA buffer of the tx descriptor ring
1423 	 *
1424 	 * Note: For head write-back mode, the tx descriptors will not
1425 	 * be written back, but the head write-back value is stored at
1426 	 * the last extra tbd at the end of the DMA area, we still need
1427 	 * to sync the head write-back value for kernel.
1428 	 *
1429 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1430 	 */
1431 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1432 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1433 	    sizeof (uint32_t),
1434 	    DDI_DMA_SYNC_FORKERNEL);
1435 
1436 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1437 		mutex_exit(&tx_ring->recycle_lock);
1438 		ddi_fm_service_impact(ixgbe->dip,
1439 		    DDI_SERVICE_DEGRADED);
1440 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1441 		return (0);
1442 	}
1443 
1444 	LINK_LIST_INIT(&pending_list);
1445 	desc_num = 0;
1446 	index = tx_ring->tbd_head;	/* Next index to clean */
1447 
1448 	/*
1449 	 * Get the value of head write-back
1450 	 */
1451 	head_wb = *tx_ring->tbd_head_wb;
1452 	while (index != head_wb) {
1453 		tcb = tx_ring->work_list[index];
1454 		ASSERT(tcb != NULL);
1455 
1456 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1457 		    tcb->desc_num) {
1458 			/*
1459 			 * The current tx control block is not
1460 			 * completely transmitted, stop recycling
1461 			 */
1462 			break;
1463 		}
1464 
1465 		/*
1466 		 * Strip off the tx control block from the work list,
1467 		 * and add it to the pending list.
1468 		 */
1469 		tx_ring->work_list[index] = NULL;
1470 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1471 
1472 		/*
1473 		 * Advance the index of the tx descriptor ring
1474 		 */
1475 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1476 
1477 		/*
1478 		 * Count the total number of the tx descriptors recycled
1479 		 */
1480 		desc_num += tcb->desc_num;
1481 	}
1482 
1483 	/*
1484 	 * If no tx descriptors are recycled, no need to do more processing
1485 	 */
1486 	if (desc_num == 0) {
1487 		tx_ring->recycle_fail++;
1488 		mutex_exit(&tx_ring->recycle_lock);
1489 		return (0);
1490 	}
1491 
1492 	tx_ring->recycle_fail = 0;
1493 	tx_ring->stall_watchdog = 0;
1494 
1495 	/*
1496 	 * Update the head index of the tx descriptor ring
1497 	 */
1498 	tx_ring->tbd_head = index;
1499 
1500 	/*
1501 	 * Update the number of the free tx descriptors with atomic operations
1502 	 */
1503 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1504 
1505 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1506 	    (tx_ring->reschedule)) {
1507 		tx_ring->reschedule = B_FALSE;
1508 		mac_tx_ring_update(ixgbe->mac_hdl,
1509 		    tx_ring->ring_handle);
1510 	}
1511 	mutex_exit(&tx_ring->recycle_lock);
1512 
1513 	/*
1514 	 * Free the resources used by the tx control blocks
1515 	 * in the pending list
1516 	 */
1517 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1518 	while (tcb) {
1519 		/*
1520 		 * Release the resources occupied by the tx control block
1521 		 */
1522 		ixgbe_free_tcb(tcb);
1523 
1524 		tcb = (tx_control_block_t *)
1525 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1526 	}
1527 
1528 	/*
1529 	 * Add the tx control blocks in the pending list to the free list.
1530 	 */
1531 	ixgbe_put_free_list(tx_ring, &pending_list);
1532 
1533 	return (desc_num);
1534 }
1535 
1536 /*
1537  * ixgbe_free_tcb - free up the tx control block
1538  *
1539  * Free the resources of the tx control block, including
1540  * unbind the previously bound DMA handle, and reset other
1541  * control fields.
1542  */
1543 void
1544 ixgbe_free_tcb(tx_control_block_t *tcb)
1545 {
1546 	switch (tcb->tx_type) {
1547 	case USE_COPY:
1548 		/*
1549 		 * Reset the buffer length that is used for copy
1550 		 */
1551 		tcb->tx_buf.len = 0;
1552 		break;
1553 	case USE_DMA:
1554 		/*
1555 		 * Release the DMA resource that is used for
1556 		 * DMA binding.
1557 		 */
1558 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1559 		break;
1560 	default:
1561 		break;
1562 	}
1563 
1564 	/*
1565 	 * Free the mblk
1566 	 */
1567 	if (tcb->mp != NULL) {
1568 		freemsg(tcb->mp);
1569 		tcb->mp = NULL;
1570 	}
1571 
1572 	tcb->tx_type = USE_NONE;
1573 	tcb->last_index = MAX_TX_RING_SIZE;
1574 	tcb->frag_num = 0;
1575 	tcb->desc_num = 0;
1576 }
1577 
1578 /*
1579  * ixgbe_get_free_list - Get a free tx control block from the free list
1580  *
1581  * The atomic operation on the number of the available tx control block
1582  * in the free list is used to keep this routine mutual exclusive with
1583  * the routine ixgbe_put_check_list.
1584  */
1585 static tx_control_block_t *
1586 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1587 {
1588 	tx_control_block_t *tcb;
1589 
1590 	/*
1591 	 * Check and update the number of the free tx control block
1592 	 * in the free list.
1593 	 */
1594 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1595 		return (NULL);
1596 
1597 	mutex_enter(&tx_ring->tcb_head_lock);
1598 
1599 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1600 	ASSERT(tcb != NULL);
1601 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1602 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1603 	    tx_ring->free_list_size);
1604 
1605 	mutex_exit(&tx_ring->tcb_head_lock);
1606 
1607 	return (tcb);
1608 }
1609 
1610 /*
1611  * ixgbe_put_free_list
1612  *
1613  * Put a list of used tx control blocks back to the free list
1614  *
1615  * A mutex is used here to ensure the serialization. The mutual exclusion
1616  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1617  * the atomic operation on the counter tcb_free.
1618  */
1619 void
1620 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1621 {
1622 	uint32_t index;
1623 	int tcb_num;
1624 	tx_control_block_t *tcb;
1625 
1626 	mutex_enter(&tx_ring->tcb_tail_lock);
1627 
1628 	index = tx_ring->tcb_tail;
1629 
1630 	tcb_num = 0;
1631 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1632 	while (tcb != NULL) {
1633 		ASSERT(tx_ring->free_list[index] == NULL);
1634 		tx_ring->free_list[index] = tcb;
1635 
1636 		tcb_num++;
1637 
1638 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1639 
1640 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1641 	}
1642 
1643 	tx_ring->tcb_tail = index;
1644 
1645 	/*
1646 	 * Update the number of the free tx control block
1647 	 * in the free list. This operation must be placed
1648 	 * under the protection of the lock.
1649 	 */
1650 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1651 
1652 	mutex_exit(&tx_ring->tcb_tail_lock);
1653 }
1654