xref: /illumos-gate/usr/src/uts/common/io/ixgbe/ixgbe_tx.c (revision 45680bd3312426f0b2a9e53e7b78a09c1fff0959)
1 /*
2  * CDDL HEADER START
3  *
4  * Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #include "ixgbe_sw.h"
29 
/*
 * Forward declarations of the file-local transmit helpers.
 *
 * Data path: used to copy or DMA-bind mblk fragments, to place the
 * resulting descriptors on the tx descriptor ring, and to obtain tx
 * control blocks.
 */
static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
    ixgbe_tx_context_t *, size_t);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);

/*
 * Checksum/LSO offload context handling: extract the context from a
 * message, compare it with the one last stored on the ring, and encode it
 * into a context descriptor.
 */
static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
    ixgbe_tx_context_t *);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
    ixgbe_tx_context_t *);

/*
 * Request inlining of the small helpers unless IXGBE_DEBUG is defined.
 */
#ifndef IXGBE_DEBUG
#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_context)
#pragma inline(ixgbe_check_context)
#pragma inline(ixgbe_fill_context)
#endif
51 
52 /*
53  * ixgbe_ring_tx
54  *
55  * To transmit one mblk through one specified ring.
56  *
57  * One mblk can consist of several fragments, each fragment
58  * will be processed with different methods based on the size.
59  * For the fragments with size less than the bcopy threshold,
60  * they will be processed by using bcopy; otherwise, they will
61  * be processed by using DMA binding.
62  *
63  * To process the mblk, a tx control block is got from the
64  * free list. One tx control block contains one tx buffer, which
65  * is used to copy mblk fragments' data; and one tx DMA handle,
66  * which is used to bind a mblk fragment with DMA resource.
67  *
68  * Several small mblk fragments can be copied into one tx control
69  * block's buffer, and then the buffer will be transmitted with
70  * one tx descriptor.
71  *
72  * A large fragment only binds with one tx control block's DMA
73  * handle, and it can span several tx descriptors for transmitting.
74  *
75  * So to transmit a packet (mblk), several tx control blocks can
76  * be used. After the processing, those tx control blocks will
77  * be put to the work list.
78  */
79 mblk_t *
80 ixgbe_ring_tx(void *arg, mblk_t *mp)
81 {
82 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
83 	ixgbe_t *ixgbe = tx_ring->ixgbe;
84 	tx_type_t current_flag, next_flag;
85 	uint32_t current_len, next_len;
86 	uint32_t desc_total;
87 	size_t mbsize;
88 	int desc_num;
89 	boolean_t copy_done, eop;
90 	mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
91 	tx_control_block_t *tcb;
92 	ixgbe_tx_context_t tx_context, *ctx;
93 	link_list_t pending_list;
94 	uint32_t len, hdr_frag_len, hdr_len;
95 	uint32_t copy_thresh;
96 	mblk_t *new_mp;
97 	mblk_t *pre_mp;
98 
99 	ASSERT(mp->b_next == NULL);
100 
101 	copy_thresh = ixgbe->tx_copy_thresh;
102 
103 	/* Get the mblk size */
104 	mbsize = 0;
105 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
106 		mbsize += MBLKL(nmp);
107 	}
108 
109 	if (ixgbe->tx_hcksum_enable) {
110 		/*
111 		 * Retrieve checksum context information from the mblk
112 		 * that will be used to decide whether/how to fill the
113 		 * context descriptor.
114 		 */
115 		ctx = &tx_context;
116 		if (ixgbe_get_context(mp, ctx) < 0) {
117 			freemsg(mp);
118 			return (NULL);
119 		}
120 
121 		/*
122 		 * If the mblk size exceeds the max size ixgbe could
123 		 * process, then discard this mblk, and return NULL.
124 		 */
125 		if ((ctx->lso_flag &&
126 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
127 		    (!ctx->lso_flag &&
128 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
129 			freemsg(mp);
130 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
131 			return (NULL);
132 		}
133 	} else {
134 		ctx = NULL;
135 	}
136 
137 	/*
138 	 * Check and recycle tx descriptors.
139 	 * The recycle threshold here should be selected carefully
140 	 */
141 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
142 		tx_ring->tx_recycle(tx_ring);
143 	}
144 
145 	/*
146 	 * After the recycling, if the tbd_free is less than the
147 	 * overload_threshold, assert overload, return mp;
148 	 * and we need to re-schedule the tx again.
149 	 */
150 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
151 		tx_ring->reschedule = B_TRUE;
152 		IXGBE_DEBUG_STAT(tx_ring->stat_overload);
153 		return (mp);
154 	}
155 
156 	/*
157 	 * The pending_list is a linked list that is used to save
158 	 * the tx control blocks that have packet data processed
159 	 * but have not put the data to the tx descriptor ring.
160 	 * It is used to reduce the lock contention of the tx_lock.
161 	 */
162 	LINK_LIST_INIT(&pending_list);
163 	desc_num = 0;
164 	desc_total = 0;
165 
166 	/*
167 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
168 	 * to be within one descriptor. Here we reallocate and refill the
169 	 * the header if it's physical memory non-contiguous.
170 	 */
171 	if ((ctx != NULL) && ctx->lso_flag) {
172 		/* find the last fragment of the header */
173 		len = MBLKL(mp);
174 		ASSERT(len > 0);
175 		nmp = mp;
176 		pre_mp = NULL;
177 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
178 		while (len < hdr_len) {
179 			pre_mp = nmp;
180 			nmp = nmp->b_cont;
181 			len += MBLKL(nmp);
182 		}
183 		/*
184 		 * If the header and the payload are in different mblks,
185 		 * we simply force the header to be copied into pre-allocated
186 		 * page-aligned buffer.
187 		 */
188 		if (len == hdr_len)
189 			goto adjust_threshold;
190 
191 		hdr_frag_len = hdr_len - (len - MBLKL(nmp));
192 		/*
193 		 * There are two cases we need to reallocate a mblk for the
194 		 * last header fragment:
195 		 * 1. the header is in multiple mblks and the last fragment
196 		 * share the same mblk with the payload
197 		 * 2. the header is in a single mblk shared with the payload
198 		 * and the header is physical memory non-contiguous
199 		 */
200 		if ((nmp != mp) ||
201 		    (P2NPHASE((uintptr_t)nmp->b_rptr, ixgbe->sys_page_size)
202 		    < hdr_len)) {
203 			IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
204 			/*
205 			 * reallocate the mblk for the last header fragment,
206 			 * expect to bcopy into pre-allocated page-aligned
207 			 * buffer
208 			 */
209 			new_mp = allocb(hdr_frag_len, NULL);
210 			if (!new_mp)
211 				return (mp);
212 			bcopy(nmp->b_rptr, new_mp->b_rptr, hdr_frag_len);
213 			/* link the new header fragment with the other parts */
214 			new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
215 			new_mp->b_cont = nmp;
216 			if (pre_mp)
217 				pre_mp->b_cont = new_mp;
218 			nmp->b_rptr += hdr_frag_len;
219 			if (hdr_frag_len == hdr_len)
220 				mp = new_mp;
221 		}
222 adjust_threshold:
223 		/*
224 		 * adjust the bcopy threshhold to guarantee
225 		 * the header to use bcopy way
226 		 */
227 		if (copy_thresh < hdr_len)
228 			copy_thresh = hdr_len;
229 	}
230 
231 	current_mp = mp;
232 	current_len = MBLKL(current_mp);
233 	/*
234 	 * Decide which method to use for the first fragment
235 	 */
236 	current_flag = (current_len <= copy_thresh) ?
237 	    USE_COPY : USE_DMA;
238 	/*
239 	 * If the mblk includes several contiguous small fragments,
240 	 * they may be copied into one buffer. This flag is used to
241 	 * indicate whether there are pending fragments that need to
242 	 * be copied to the current tx buffer.
243 	 *
244 	 * If this flag is B_TRUE, it indicates that a new tx control
245 	 * block is needed to process the next fragment using either
246 	 * copy or DMA binding.
247 	 *
248 	 * Otherwise, it indicates that the next fragment will be
249 	 * copied to the current tx buffer that is maintained by the
250 	 * current tx control block. No new tx control block is needed.
251 	 */
252 	copy_done = B_TRUE;
253 	while (current_mp) {
254 		next_mp = current_mp->b_cont;
255 		eop = (next_mp == NULL); /* Last fragment of the packet? */
256 		next_len = eop ? 0: MBLKL(next_mp);
257 
258 		/*
259 		 * When the current fragment is an empty fragment, if
260 		 * the next fragment will still be copied to the current
261 		 * tx buffer, we cannot skip this fragment here. Because
262 		 * the copy processing is pending for completion. We have
263 		 * to process this empty fragment in the tx_copy routine.
264 		 *
265 		 * If the copy processing is completed or a DMA binding
266 		 * processing is just completed, we can just skip this
267 		 * empty fragment.
268 		 */
269 		if ((current_len == 0) && (copy_done)) {
270 			current_mp = next_mp;
271 			current_len = next_len;
272 			current_flag = (current_len <= copy_thresh) ?
273 			    USE_COPY : USE_DMA;
274 			continue;
275 		}
276 
277 		if (copy_done) {
278 			/*
279 			 * Get a new tx control block from the free list
280 			 */
281 			tcb = ixgbe_get_free_list(tx_ring);
282 
283 			if (tcb == NULL) {
284 				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
285 				goto tx_failure;
286 			}
287 
288 			/*
289 			 * Push the tx control block to the pending list
290 			 * to avoid using lock too early
291 			 */
292 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
293 		}
294 
295 		if (current_flag == USE_COPY) {
296 			/*
297 			 * Check whether to use bcopy or DMA binding to process
298 			 * the next fragment, and if using bcopy, whether we
299 			 * need to continue copying the next fragment into the
300 			 * current tx buffer.
301 			 */
302 			ASSERT((tcb->tx_buf.len + current_len) <=
303 			    tcb->tx_buf.size);
304 
305 			if (eop) {
306 				/*
307 				 * This is the last fragment of the packet, so
308 				 * the copy processing will be completed with
309 				 * this fragment.
310 				 */
311 				next_flag = USE_NONE;
312 				copy_done = B_TRUE;
313 			} else if ((tcb->tx_buf.len + current_len + next_len) >
314 			    tcb->tx_buf.size) {
315 				/*
316 				 * If the next fragment is too large to be
317 				 * copied to the current tx buffer, we need
318 				 * to complete the current copy processing.
319 				 */
320 				next_flag = (next_len > copy_thresh) ?
321 				    USE_DMA: USE_COPY;
322 				copy_done = B_TRUE;
323 			} else if (next_len > copy_thresh) {
324 				/*
325 				 * The next fragment needs to be processed with
326 				 * DMA binding. So the copy prcessing will be
327 				 * completed with the current fragment.
328 				 */
329 				next_flag = USE_DMA;
330 				copy_done = B_TRUE;
331 			} else {
332 				/*
333 				 * Continue to copy the next fragment to the
334 				 * current tx buffer.
335 				 */
336 				next_flag = USE_COPY;
337 				copy_done = B_FALSE;
338 			}
339 
340 			desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
341 			    current_len, copy_done);
342 		} else {
343 			/*
344 			 * Check whether to use bcopy or DMA binding to process
345 			 * the next fragment.
346 			 */
347 			next_flag = (next_len > copy_thresh) ?
348 			    USE_DMA: USE_COPY;
349 			ASSERT(copy_done == B_TRUE);
350 
351 			desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
352 			    current_len);
353 		}
354 
355 		if (desc_num > 0)
356 			desc_total += desc_num;
357 		else if (desc_num < 0)
358 			goto tx_failure;
359 
360 		current_mp = next_mp;
361 		current_len = next_len;
362 		current_flag = next_flag;
363 	}
364 
365 	/*
366 	 * Attach the mblk to the last tx control block
367 	 */
368 	ASSERT(tcb);
369 	ASSERT(tcb->mp == NULL);
370 	tcb->mp = mp;
371 
372 	/*
373 	 * 82598/82599 chipset has a limitation that no more than 32 tx
374 	 * descriptors can be transmited out at one time.
375 	 *
376 	 * Here is a workaround for it: pull up the mblk then send it
377 	 * out with bind way. By doing so, no more than MAX_COOKIE (18)
378 	 * descriptors is needed.
379 	 */
380 	if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
381 		IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
382 
383 		/*
384 		 * Discard the mblk and free the used resources
385 		 */
386 		tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
387 		while (tcb) {
388 			tcb->mp = NULL;
389 			ixgbe_free_tcb(tcb);
390 			tcb = (tx_control_block_t *)
391 			    LIST_GET_NEXT(&pending_list, &tcb->link);
392 		}
393 
394 		/*
395 		 * Return the tx control blocks in the pending list to
396 		 * the free list.
397 		 */
398 		ixgbe_put_free_list(tx_ring, &pending_list);
399 
400 		/*
401 		 * pull up the mblk and send it out with bind way
402 		 */
403 		if ((pull_mp = msgpullup(mp, -1)) == NULL) {
404 			tx_ring->reschedule = B_TRUE;
405 			return (mp);
406 		}
407 
408 		LINK_LIST_INIT(&pending_list);
409 		desc_total = 0;
410 
411 		/*
412 		 * if the packet is a LSO packet, we simply
413 		 * transmit the header in one descriptor using the copy way
414 		 */
415 		if ((ctx != NULL) && ctx->lso_flag) {
416 			hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
417 			    ctx->l4_hdr_len;
418 
419 			tcb = ixgbe_get_free_list(tx_ring);
420 			if (tcb == NULL) {
421 				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
422 				goto tx_failure;
423 			}
424 			desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
425 			    hdr_len, B_TRUE);
426 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
427 			desc_total  += desc_num;
428 
429 			pull_mp->b_rptr += hdr_len;
430 		}
431 
432 		tcb = ixgbe_get_free_list(tx_ring);
433 		if (tcb == NULL) {
434 			IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
435 			goto tx_failure;
436 		}
437 		if ((ctx != NULL) && ctx->lso_flag) {
438 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
439 			    mbsize - hdr_len);
440 		} else {
441 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
442 			    mbsize);
443 		}
444 		if (desc_num < 0) {
445 			goto tx_failure;
446 		}
447 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
448 
449 		desc_total += desc_num;
450 		tcb->mp = pull_mp;
451 	}
452 
453 	/*
454 	 * Before fill the tx descriptor ring with the data, we need to
455 	 * ensure there are adequate free descriptors for transmit
456 	 * (including one context descriptor).
457 	 */
458 	if (tx_ring->tbd_free < (desc_total + 1)) {
459 		tx_ring->tx_recycle(tx_ring);
460 	}
461 
462 	mutex_enter(&tx_ring->tx_lock);
463 	/*
464 	 * If the number of free tx descriptors is not enough for transmit
465 	 * then return mp.
466 	 *
467 	 * Note: we must put this check under the mutex protection to
468 	 * ensure the correctness when multiple threads access it in
469 	 * parallel.
470 	 */
471 	if (tx_ring->tbd_free < (desc_total + 1)) {
472 		IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
473 		mutex_exit(&tx_ring->tx_lock);
474 		goto tx_failure;
475 	}
476 
477 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
478 	    mbsize);
479 
480 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
481 
482 	mutex_exit(&tx_ring->tx_lock);
483 
484 	/*
485 	 * now that the transmission succeeds, need to free the original
486 	 * mp if we used the pulling up mblk for transmission.
487 	 */
488 	if (pull_mp) {
489 		freemsg(mp);
490 	}
491 
492 	return (NULL);
493 
494 tx_failure:
495 	/*
496 	 * If transmission fails, need to free the pulling up mblk.
497 	 */
498 	if (pull_mp) {
499 		freemsg(pull_mp);
500 	}
501 
502 	/*
503 	 * Discard the mblk and free the used resources
504 	 */
505 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
506 	while (tcb) {
507 		tcb->mp = NULL;
508 
509 		ixgbe_free_tcb(tcb);
510 
511 		tcb = (tx_control_block_t *)
512 		    LIST_GET_NEXT(&pending_list, &tcb->link);
513 	}
514 
515 	/*
516 	 * Return the tx control blocks in the pending list to the free list.
517 	 */
518 	ixgbe_put_free_list(tx_ring, &pending_list);
519 
520 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
521 	tx_ring->reschedule = B_TRUE;
522 
523 	return (mp);
524 }
525 
526 /*
527  * ixgbe_tx_copy
528  *
529  * Copy the mblk fragment to the pre-allocated tx buffer
530  */
531 static int
532 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
533     uint32_t len, boolean_t copy_done)
534 {
535 	dma_buffer_t *tx_buf;
536 	uint32_t desc_num;
537 	_NOTE(ARGUNUSED(tx_ring));
538 
539 	tx_buf = &tcb->tx_buf;
540 
541 	/*
542 	 * Copy the packet data of the mblk fragment into the
543 	 * pre-allocated tx buffer, which is maintained by the
544 	 * tx control block.
545 	 *
546 	 * Several mblk fragments can be copied into one tx buffer.
547 	 * The destination address of the current copied fragment in
548 	 * the tx buffer is next to the end of the previous copied
549 	 * fragment.
550 	 */
551 	if (len > 0) {
552 		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
553 
554 		tx_buf->len += len;
555 		tcb->frag_num++;
556 	}
557 
558 	desc_num = 0;
559 
560 	/*
561 	 * If it is the last fragment copied to the current tx buffer,
562 	 * in other words, if there's no remaining fragment or the remaining
563 	 * fragment requires a new tx control block to process, we need to
564 	 * complete the current copy processing by syncing up the current
565 	 * DMA buffer and saving the descriptor data.
566 	 */
567 	if (copy_done) {
568 		/*
569 		 * Sync the DMA buffer of the packet data
570 		 */
571 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
572 
573 		tcb->tx_type = USE_COPY;
574 
575 		/*
576 		 * Save the address and length to the private data structure
577 		 * of the tx control block, which will be used to fill the
578 		 * tx descriptor ring after all the fragments are processed.
579 		 */
580 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
581 		desc_num++;
582 	}
583 
584 	return (desc_num);
585 }
586 
587 /*
588  * ixgbe_tx_bind
589  *
590  * Bind the mblk fragment with DMA
591  */
592 static int
593 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
594     uint32_t len)
595 {
596 	int status, i;
597 	ddi_dma_cookie_t dma_cookie;
598 	uint_t ncookies;
599 	int desc_num;
600 
601 	/*
602 	 * Use DMA binding to process the mblk fragment
603 	 */
604 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
605 	    (caddr_t)mp->b_rptr, len,
606 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
607 	    0, &dma_cookie, &ncookies);
608 
609 	if (status != DDI_DMA_MAPPED) {
610 		IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
611 		return (-1);
612 	}
613 
614 	tcb->frag_num++;
615 	tcb->tx_type = USE_DMA;
616 	/*
617 	 * Each fragment can span several cookies. One cookie will have
618 	 * one tx descriptor to transmit.
619 	 */
620 	desc_num = 0;
621 	for (i = ncookies; i > 0; i--) {
622 		/*
623 		 * Save the address and length to the private data structure
624 		 * of the tx control block, which will be used to fill the
625 		 * tx descriptor ring after all the fragments are processed.
626 		 */
627 		ixgbe_save_desc(tcb,
628 		    dma_cookie.dmac_laddress,
629 		    dma_cookie.dmac_size);
630 
631 		desc_num++;
632 
633 		if (i > 1)
634 			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
635 	}
636 
637 	return (desc_num);
638 }
639 
640 /*
641  * ixgbe_get_context
642  *
643  * Get the context information from the mblk
644  */
645 static int
646 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
647 {
648 	uint32_t start;
649 	uint32_t hckflags;
650 	uint32_t lsoflags;
651 	uint32_t mss;
652 	uint32_t len;
653 	uint32_t size;
654 	uint32_t offset;
655 	unsigned char *pos;
656 	ushort_t etype;
657 	uint32_t mac_hdr_len;
658 	uint32_t l4_proto;
659 	uint32_t l4_hdr_len;
660 
661 	ASSERT(mp != NULL);
662 
663 	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &hckflags);
664 	bzero(ctx, sizeof (ixgbe_tx_context_t));
665 
666 	if (hckflags == 0) {
667 		return (0);
668 	}
669 
670 	ctx->hcksum_flags = hckflags;
671 
672 	lso_info_get(mp, &mss, &lsoflags);
673 	ctx->mss = mss;
674 	ctx->lso_flag = (lsoflags == HW_LSO);
675 
676 	/*
677 	 * LSO relies on tx h/w checksum, so here will drop the package
678 	 * if h/w checksum flag is not declared.
679 	 */
680 	if (ctx->lso_flag) {
681 		if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
682 		    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
683 			IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
684 			    "checksum flags are not specified when doing LSO");
685 			return (-1);
686 		}
687 	}
688 
689 	etype = 0;
690 	mac_hdr_len = 0;
691 	l4_proto = 0;
692 
693 	/*
694 	 * Firstly get the position of the ether_type/ether_tpid.
695 	 * Here we don't assume the ether (VLAN) header is fully included
696 	 * in one mblk fragment, so we go thourgh the fragments to parse
697 	 * the ether type.
698 	 */
699 	size = len = MBLKL(mp);
700 	offset = offsetof(struct ether_header, ether_type);
701 	while (size <= offset) {
702 		mp = mp->b_cont;
703 		ASSERT(mp != NULL);
704 		len = MBLKL(mp);
705 		size += len;
706 	}
707 	pos = mp->b_rptr + offset + len - size;
708 
709 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
710 	if (etype == ETHERTYPE_VLAN) {
711 		/*
712 		 * Get the position of the ether_type in VLAN header
713 		 */
714 		offset = offsetof(struct ether_vlan_header, ether_type);
715 		while (size <= offset) {
716 			mp = mp->b_cont;
717 			ASSERT(mp != NULL);
718 			len = MBLKL(mp);
719 			size += len;
720 		}
721 		pos = mp->b_rptr + offset + len - size;
722 
723 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
724 		mac_hdr_len = sizeof (struct ether_vlan_header);
725 	} else {
726 		mac_hdr_len = sizeof (struct ether_header);
727 	}
728 
729 	/*
730 	 * Here we don't assume the IP(V6) header is fully included in
731 	 * one mblk fragment.
732 	 */
733 	switch (etype) {
734 	case ETHERTYPE_IP:
735 		if (ctx->lso_flag) {
736 			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
737 			while (size <= offset) {
738 				mp = mp->b_cont;
739 				ASSERT(mp != NULL);
740 				len = MBLKL(mp);
741 				size += len;
742 			}
743 			pos = mp->b_rptr + offset + len - size;
744 			*((uint16_t *)(uintptr_t)(pos)) = 0;
745 
746 			offset = offsetof(ipha_t, ipha_hdr_checksum) +
747 			    mac_hdr_len;
748 			while (size <= offset) {
749 				mp = mp->b_cont;
750 				ASSERT(mp != NULL);
751 				len = MBLKL(mp);
752 				size += len;
753 			}
754 			pos = mp->b_rptr + offset + len - size;
755 			*((uint16_t *)(uintptr_t)(pos)) = 0;
756 
757 			/*
758 			 * To perform ixgbe LSO, here also need to fill
759 			 * the tcp checksum field of the packet with the
760 			 * following pseudo-header checksum:
761 			 * (ip_source_addr, ip_destination_addr, l4_proto)
762 			 * Currently the tcp/ip stack has done it.
763 			 */
764 		}
765 
766 		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
767 		while (size <= offset) {
768 			mp = mp->b_cont;
769 			ASSERT(mp != NULL);
770 			len = MBLKL(mp);
771 			size += len;
772 		}
773 		pos = mp->b_rptr + offset + len - size;
774 
775 		l4_proto = *(uint8_t *)pos;
776 		break;
777 	case ETHERTYPE_IPV6:
778 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
779 		while (size <= offset) {
780 			mp = mp->b_cont;
781 			ASSERT(mp != NULL);
782 			len = MBLKL(mp);
783 			size += len;
784 		}
785 		pos = mp->b_rptr + offset + len - size;
786 
787 		l4_proto = *(uint8_t *)pos;
788 		break;
789 	default:
790 		/* Unrecoverable error */
791 		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
792 		return (-2);
793 	}
794 
795 	if (ctx->lso_flag) {
796 		offset = mac_hdr_len + start;
797 		while (size <= offset) {
798 			mp = mp->b_cont;
799 			ASSERT(mp != NULL);
800 			len = MBLKL(mp);
801 			size += len;
802 		}
803 		pos = mp->b_rptr + offset + len - size;
804 
805 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
806 	} else {
807 		/*
808 		 * l4 header length is only required for LSO
809 		 */
810 		l4_hdr_len = 0;
811 	}
812 
813 	ctx->mac_hdr_len = mac_hdr_len;
814 	ctx->ip_hdr_len = start;
815 	ctx->l4_proto = l4_proto;
816 	ctx->l4_hdr_len = l4_hdr_len;
817 
818 	return (0);
819 }
820 
821 /*
822  * ixgbe_check_context
823  *
824  * Check if a new context descriptor is needed
825  */
826 static boolean_t
827 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
828 {
829 	ixgbe_tx_context_t *last;
830 
831 	if (ctx == NULL)
832 		return (B_FALSE);
833 
834 	/*
835 	 * Compare the context data retrieved from the mblk and the
836 	 * stored data of the last context descriptor. The data need
837 	 * to be checked are:
838 	 *	hcksum_flags
839 	 *	l4_proto
840 	 *	mac_hdr_len
841 	 *	ip_hdr_len
842 	 *	lso_flag
843 	 *	mss (only checked for LSO)
844 	 *	l4_hr_len (only checked for LSO)
845 	 * Either one of the above data is changed, a new context descriptor
846 	 * will be needed.
847 	 */
848 	last = &tx_ring->tx_context;
849 
850 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
851 	    (ctx->l4_proto != last->l4_proto) ||
852 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
853 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
854 	    (ctx->lso_flag != last->lso_flag) ||
855 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
856 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
857 		return (B_TRUE);
858 	}
859 
860 	return (B_FALSE);
861 }
862 
863 /*
864  * ixgbe_fill_context
865  *
866  * Fill the context descriptor with hardware checksum informations
867  */
868 static void
869 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
870     ixgbe_tx_context_t *ctx)
871 {
872 	/*
873 	 * Fill the context descriptor with the checksum
874 	 * context information we've got.
875 	 */
876 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
877 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
878 	    IXGBE_ADVTXD_MACLEN_SHIFT;
879 
880 	ctx_tbd->type_tucmd_mlhl =
881 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
882 
883 	if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
884 		ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
885 
886 	if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
887 		switch (ctx->l4_proto) {
888 		case IPPROTO_TCP:
889 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
890 			break;
891 		case IPPROTO_UDP:
892 			/*
893 			 * We don't have to explicitly set:
894 			 *	ctx_tbd->type_tucmd_mlhl |=
895 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
896 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
897 			 */
898 			break;
899 		default:
900 			/* Unrecoverable error */
901 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
902 			break;
903 		}
904 	}
905 
906 	ctx_tbd->seqnum_seed = 0;
907 
908 	if (ctx->lso_flag) {
909 		ctx_tbd->mss_l4len_idx =
910 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
911 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
912 	} else {
913 		ctx_tbd->mss_l4len_idx = 0;
914 	}
915 }
916 
917 /*
918  * ixgbe_tx_fill_ring
919  *
920  * Fill the tx descriptor ring with the data
921  */
922 static int
923 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
924     ixgbe_tx_context_t *ctx, size_t mbsize)
925 {
926 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
927 	boolean_t load_context;
928 	uint32_t index, tcb_index, desc_num;
929 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
930 	tx_control_block_t *tcb, *first_tcb;
931 	uint32_t hcksum_flags;
932 	int i;
933 
934 	ASSERT(mutex_owned(&tx_ring->tx_lock));
935 
936 	tbd = NULL;
937 	first_tbd = NULL;
938 	first_tcb = NULL;
939 	desc_num = 0;
940 	hcksum_flags = 0;
941 	load_context = B_FALSE;
942 
943 	/*
944 	 * Get the index of the first tx descriptor that will be filled,
945 	 * and the index of the first work list item that will be attached
946 	 * with the first used tx control block in the pending list.
947 	 * Note: the two indexes are the same.
948 	 */
949 	index = tx_ring->tbd_tail;
950 	tcb_index = tx_ring->tbd_tail;
951 
952 	if (ctx != NULL) {
953 		hcksum_flags = ctx->hcksum_flags;
954 
955 		/*
956 		 * Check if a new context descriptor is needed for this packet
957 		 */
958 		load_context = ixgbe_check_context(tx_ring, ctx);
959 
960 		if (load_context) {
961 			tbd = &tx_ring->tbd_ring[index];
962 
963 			/*
964 			 * Fill the context descriptor with the
965 			 * hardware checksum offload informations.
966 			 */
967 			ixgbe_fill_context(
968 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
969 
970 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
971 			desc_num++;
972 
973 			/*
974 			 * Store the checksum context data if
975 			 * a new context descriptor is added
976 			 */
977 			tx_ring->tx_context = *ctx;
978 		}
979 	}
980 
981 	first_tbd = &tx_ring->tbd_ring[index];
982 
983 	/*
984 	 * Fill tx data descriptors with the data saved in the pending list.
985 	 * The tx control blocks in the pending list are added to the work list
986 	 * at the same time.
987 	 *
988 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
989 	 * One item of the work list corresponds to one tx descriptor. Because
990 	 * one tx control block can span multiple tx descriptors, the tx
991 	 * control block will be added to the first work list item that
992 	 * corresponds to the first tx descriptor generated from that tx
993 	 * control block.
994 	 */
995 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
996 	first_tcb = tcb;
997 	while (tcb != NULL) {
998 
999 		for (i = 0; i < tcb->desc_num; i++) {
1000 			tbd = &tx_ring->tbd_ring[index];
1001 
1002 			tbd->read.buffer_addr = tcb->desc[i].address;
1003 			tbd->read.cmd_type_len = tcb->desc[i].length;
1004 
1005 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1006 			    | IXGBE_ADVTXD_DTYP_DATA;
1007 
1008 			tbd->read.olinfo_status = 0;
1009 
1010 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1011 			desc_num++;
1012 		}
1013 
1014 		/*
1015 		 * Add the tx control block to the work list
1016 		 */
1017 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
1018 		tx_ring->work_list[tcb_index] = tcb;
1019 
1020 		tcb_index = index;
1021 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1022 	}
1023 
1024 	if (load_context) {
1025 		/*
1026 		 * Count the context descriptor for
1027 		 * the first tx control block.
1028 		 */
1029 		first_tcb->desc_num++;
1030 	}
1031 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1032 
1033 	/*
1034 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1035 	 * valid in the first descriptor of the packet.
1036 	 * Setting paylen in every first_tbd for all parts.
1037 	 * 82599 requires the packet length in paylen field with or without
1038 	 * LSO and 82598 will ignore it in non-LSO mode.
1039 	 */
1040 	ASSERT(first_tbd != NULL);
1041 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1042 
1043 	switch (hw->mac.type) {
1044 	case ixgbe_mac_82599EB:
1045 		if (ctx != NULL && ctx->lso_flag) {
1046 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1047 			first_tbd->read.olinfo_status |=
1048 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1049 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1050 		} else {
1051 			first_tbd->read.olinfo_status |=
1052 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1053 		}
1054 		break;
1055 	case ixgbe_mac_82598EB:
1056 		if (ctx != NULL && ctx->lso_flag) {
1057 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1058 			first_tbd->read.olinfo_status |=
1059 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1060 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1061 		}
1062 		break;
1063 	default:
1064 		break;
1065 	}
1066 
1067 	/* Set hardware checksum bits */
1068 	if (hcksum_flags != 0) {
1069 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1070 			first_tbd->read.olinfo_status |=
1071 			    IXGBE_ADVTXD_POPTS_IXSM;
1072 		if (hcksum_flags & HCK_PARTIALCKSUM)
1073 			first_tbd->read.olinfo_status |=
1074 			    IXGBE_ADVTXD_POPTS_TXSM;
1075 	}
1076 
1077 	/*
1078 	 * The last descriptor of packet needs End Of Packet (EOP),
1079 	 * and Report Status (RS) bits set
1080 	 */
1081 	ASSERT(tbd != NULL);
1082 	tbd->read.cmd_type_len |=
1083 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1084 
1085 	/*
1086 	 * Sync the DMA buffer of the tx descriptor ring
1087 	 */
1088 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1089 
1090 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1091 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1092 		    DDI_SERVICE_DEGRADED);
1093 	}
1094 
1095 	/*
1096 	 * Update the number of the free tx descriptors.
1097 	 * The mutual exclusion between the transmission and the recycling
1098 	 * (for the tx descriptor ring and the work list) is implemented
1099 	 * with the atomic operation on the number of the free tx descriptors.
1100 	 *
1101 	 * Note: we should always decrement the counter tbd_free before
1102 	 * advancing the hardware TDT pointer to avoid the race condition -
1103 	 * before the counter tbd_free is decremented, the transmit of the
1104 	 * tx descriptors has done and the counter tbd_free is increased by
1105 	 * the tx recycling.
1106 	 */
1107 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1108 	ASSERT(i >= 0);
1109 
1110 	tx_ring->tbd_tail = index;
1111 
1112 	/*
1113 	 * Advance the hardware TDT pointer of the tx descriptor ring
1114 	 */
1115 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1116 
1117 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1118 	    DDI_FM_OK) {
1119 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1120 		    DDI_SERVICE_DEGRADED);
1121 	}
1122 
1123 	return (desc_num);
1124 }
1125 
1126 /*
1127  * ixgbe_save_desc
1128  *
1129  * Save the address/length pair to the private array
1130  * of the tx control block. The address/length pairs
1131  * will be filled into the tx descriptor ring later.
1132  */
1133 static void
1134 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1135 {
1136 	sw_desc_t *desc;
1137 
1138 	desc = &tcb->desc[tcb->desc_num];
1139 	desc->address = address;
1140 	desc->length = length;
1141 
1142 	tcb->desc_num++;
1143 }
1144 
1145 /*
1146  * ixgbe_tx_recycle_legacy
1147  *
1148  * Recycle the tx descriptors and tx control blocks.
1149  *
1150  * The work list is traversed to check if the corresponding
1151  * tx descriptors have been transmitted. If so, the resources
1152  * bound to the tx control blocks will be freed, and those
1153  * tx control blocks will be returned to the free list.
1154  */
1155 uint32_t
1156 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1157 {
1158 	uint32_t index, last_index, prev_index;
1159 	int desc_num;
1160 	boolean_t desc_done;
1161 	tx_control_block_t *tcb;
1162 	link_list_t pending_list;
1163 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1164 
1165 	mutex_enter(&tx_ring->recycle_lock);
1166 
1167 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1168 
1169 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1170 		tx_ring->recycle_fail = 0;
1171 		tx_ring->stall_watchdog = 0;
1172 		if (tx_ring->reschedule) {
1173 			tx_ring->reschedule = B_FALSE;
1174 			mac_tx_ring_update(ixgbe->mac_hdl,
1175 			    tx_ring->ring_handle);
1176 		}
1177 		mutex_exit(&tx_ring->recycle_lock);
1178 		return (0);
1179 	}
1180 
1181 	/*
1182 	 * Sync the DMA buffer of the tx descriptor ring
1183 	 */
1184 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1185 
1186 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1187 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1188 	}
1189 
1190 	LINK_LIST_INIT(&pending_list);
1191 	desc_num = 0;
1192 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1193 
1194 	tcb = tx_ring->work_list[index];
1195 	ASSERT(tcb != NULL);
1196 
1197 	while (tcb != NULL) {
1198 		/*
1199 		 * Get the last tx descriptor of this packet.
1200 		 * If the last tx descriptor is done, then
1201 		 * we can recycle all descriptors of a packet
1202 		 * which usually includes several tx control blocks.
1203 		 * For 82599, LSO descriptors can not be recycled
1204 		 * unless the whole packet's transmission is done.
1205 		 * That's why packet level recycling is used here.
1206 		 * For 82598, there's not such limit.
1207 		 */
1208 		last_index = tcb->last_index;
1209 		/*
1210 		 * MAX_TX_RING_SIZE is used to judge whether
1211 		 * the index is a valid value or not.
1212 		 */
1213 		if (last_index == MAX_TX_RING_SIZE)
1214 			break;
1215 
1216 		/*
1217 		 * Check if the Descriptor Done bit is set
1218 		 */
1219 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1220 		    IXGBE_TXD_STAT_DD;
1221 		if (desc_done) {
1222 			/*
1223 			 * recycle all descriptors of the packet
1224 			 */
1225 			while (tcb != NULL) {
1226 				/*
1227 				 * Strip off the tx control block from
1228 				 * the work list, and add it to the
1229 				 * pending list.
1230 				 */
1231 				tx_ring->work_list[index] = NULL;
1232 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1233 
1234 				/*
1235 				 * Count the total number of the tx
1236 				 * descriptors recycled
1237 				 */
1238 				desc_num += tcb->desc_num;
1239 
1240 				index = NEXT_INDEX(index, tcb->desc_num,
1241 				    tx_ring->ring_size);
1242 
1243 				tcb = tx_ring->work_list[index];
1244 
1245 				prev_index = PREV_INDEX(index, 1,
1246 				    tx_ring->ring_size);
1247 				if (prev_index == last_index)
1248 					break;
1249 			}
1250 		} else {
1251 			break;
1252 		}
1253 	}
1254 
1255 	/*
1256 	 * If no tx descriptors are recycled, no need to do more processing
1257 	 */
1258 	if (desc_num == 0) {
1259 		tx_ring->recycle_fail++;
1260 		mutex_exit(&tx_ring->recycle_lock);
1261 		return (0);
1262 	}
1263 
1264 	tx_ring->recycle_fail = 0;
1265 	tx_ring->stall_watchdog = 0;
1266 
1267 	/*
1268 	 * Update the head index of the tx descriptor ring
1269 	 */
1270 	tx_ring->tbd_head = index;
1271 
1272 	/*
1273 	 * Update the number of the free tx descriptors with atomic operations
1274 	 */
1275 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1276 
1277 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1278 	    (tx_ring->reschedule)) {
1279 		tx_ring->reschedule = B_FALSE;
1280 		mac_tx_ring_update(ixgbe->mac_hdl,
1281 		    tx_ring->ring_handle);
1282 	}
1283 	mutex_exit(&tx_ring->recycle_lock);
1284 
1285 	/*
1286 	 * Free the resources used by the tx control blocks
1287 	 * in the pending list
1288 	 */
1289 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1290 	while (tcb != NULL) {
1291 		/*
1292 		 * Release the resources occupied by the tx control block
1293 		 */
1294 		ixgbe_free_tcb(tcb);
1295 
1296 		tcb = (tx_control_block_t *)
1297 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1298 	}
1299 
1300 	/*
1301 	 * Add the tx control blocks in the pending list to the free list.
1302 	 */
1303 	ixgbe_put_free_list(tx_ring, &pending_list);
1304 
1305 	return (desc_num);
1306 }
1307 
1308 /*
1309  * ixgbe_tx_recycle_head_wb
1310  *
1311  * Check the head write-back, and recycle all the transmitted
1312  * tx descriptors and tx control blocks.
1313  */
1314 uint32_t
1315 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1316 {
1317 	uint32_t index;
1318 	uint32_t head_wb;
1319 	int desc_num;
1320 	tx_control_block_t *tcb;
1321 	link_list_t pending_list;
1322 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1323 
1324 	mutex_enter(&tx_ring->recycle_lock);
1325 
1326 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1327 
1328 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1329 		tx_ring->recycle_fail = 0;
1330 		tx_ring->stall_watchdog = 0;
1331 		if (tx_ring->reschedule) {
1332 			tx_ring->reschedule = B_FALSE;
1333 			mac_tx_ring_update(ixgbe->mac_hdl,
1334 			    tx_ring->ring_handle);
1335 		}
1336 		mutex_exit(&tx_ring->recycle_lock);
1337 		return (0);
1338 	}
1339 
1340 	/*
1341 	 * Sync the DMA buffer of the tx descriptor ring
1342 	 *
1343 	 * Note: For head write-back mode, the tx descriptors will not
1344 	 * be written back, but the head write-back value is stored at
1345 	 * the last extra tbd at the end of the DMA area, we still need
1346 	 * to sync the head write-back value for kernel.
1347 	 *
1348 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1349 	 */
1350 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1351 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1352 	    sizeof (uint32_t),
1353 	    DDI_DMA_SYNC_FORKERNEL);
1354 
1355 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1356 		ddi_fm_service_impact(ixgbe->dip,
1357 		    DDI_SERVICE_DEGRADED);
1358 	}
1359 
1360 	LINK_LIST_INIT(&pending_list);
1361 	desc_num = 0;
1362 	index = tx_ring->tbd_head;	/* Next index to clean */
1363 
1364 	/*
1365 	 * Get the value of head write-back
1366 	 */
1367 	head_wb = *tx_ring->tbd_head_wb;
1368 	while (index != head_wb) {
1369 		tcb = tx_ring->work_list[index];
1370 		ASSERT(tcb != NULL);
1371 
1372 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1373 		    tcb->desc_num) {
1374 			/*
1375 			 * The current tx control block is not
1376 			 * completely transmitted, stop recycling
1377 			 */
1378 			break;
1379 		}
1380 
1381 		/*
1382 		 * Strip off the tx control block from the work list,
1383 		 * and add it to the pending list.
1384 		 */
1385 		tx_ring->work_list[index] = NULL;
1386 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1387 
1388 		/*
1389 		 * Advance the index of the tx descriptor ring
1390 		 */
1391 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1392 
1393 		/*
1394 		 * Count the total number of the tx descriptors recycled
1395 		 */
1396 		desc_num += tcb->desc_num;
1397 	}
1398 
1399 	/*
1400 	 * If no tx descriptors are recycled, no need to do more processing
1401 	 */
1402 	if (desc_num == 0) {
1403 		tx_ring->recycle_fail++;
1404 		mutex_exit(&tx_ring->recycle_lock);
1405 		return (0);
1406 	}
1407 
1408 	tx_ring->recycle_fail = 0;
1409 	tx_ring->stall_watchdog = 0;
1410 
1411 	/*
1412 	 * Update the head index of the tx descriptor ring
1413 	 */
1414 	tx_ring->tbd_head = index;
1415 
1416 	/*
1417 	 * Update the number of the free tx descriptors with atomic operations
1418 	 */
1419 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1420 
1421 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1422 	    (tx_ring->reschedule)) {
1423 		tx_ring->reschedule = B_FALSE;
1424 		mac_tx_ring_update(ixgbe->mac_hdl,
1425 		    tx_ring->ring_handle);
1426 	}
1427 	mutex_exit(&tx_ring->recycle_lock);
1428 
1429 	/*
1430 	 * Free the resources used by the tx control blocks
1431 	 * in the pending list
1432 	 */
1433 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1434 	while (tcb) {
1435 		/*
1436 		 * Release the resources occupied by the tx control block
1437 		 */
1438 		ixgbe_free_tcb(tcb);
1439 
1440 		tcb = (tx_control_block_t *)
1441 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1442 	}
1443 
1444 	/*
1445 	 * Add the tx control blocks in the pending list to the free list.
1446 	 */
1447 	ixgbe_put_free_list(tx_ring, &pending_list);
1448 
1449 	return (desc_num);
1450 }
1451 
1452 /*
1453  * ixgbe_free_tcb - free up the tx control block
1454  *
1455  * Free the resources of the tx control block, including
1456  * unbind the previously bound DMA handle, and reset other
1457  * control fields.
1458  */
1459 void
1460 ixgbe_free_tcb(tx_control_block_t *tcb)
1461 {
1462 	switch (tcb->tx_type) {
1463 	case USE_COPY:
1464 		/*
1465 		 * Reset the buffer length that is used for copy
1466 		 */
1467 		tcb->tx_buf.len = 0;
1468 		break;
1469 	case USE_DMA:
1470 		/*
1471 		 * Release the DMA resource that is used for
1472 		 * DMA binding.
1473 		 */
1474 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1475 		break;
1476 	default:
1477 		break;
1478 	}
1479 
1480 	/*
1481 	 * Free the mblk
1482 	 */
1483 	if (tcb->mp != NULL) {
1484 		freemsg(tcb->mp);
1485 		tcb->mp = NULL;
1486 	}
1487 
1488 	tcb->tx_type = USE_NONE;
1489 	tcb->last_index = MAX_TX_RING_SIZE;
1490 	tcb->frag_num = 0;
1491 	tcb->desc_num = 0;
1492 }
1493 
1494 /*
1495  * ixgbe_get_free_list - Get a free tx control block from the free list
1496  *
1497  * The atomic operation on the number of the available tx control block
1498  * in the free list is used to keep this routine mutual exclusive with
1499  * the routine ixgbe_put_check_list.
1500  */
1501 static tx_control_block_t *
1502 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1503 {
1504 	tx_control_block_t *tcb;
1505 
1506 	/*
1507 	 * Check and update the number of the free tx control block
1508 	 * in the free list.
1509 	 */
1510 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1511 		return (NULL);
1512 
1513 	mutex_enter(&tx_ring->tcb_head_lock);
1514 
1515 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1516 	ASSERT(tcb != NULL);
1517 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1518 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1519 	    tx_ring->free_list_size);
1520 
1521 	mutex_exit(&tx_ring->tcb_head_lock);
1522 
1523 	return (tcb);
1524 }
1525 
1526 /*
1527  * ixgbe_put_free_list
1528  *
1529  * Put a list of used tx control blocks back to the free list
1530  *
1531  * A mutex is used here to ensure the serialization. The mutual exclusion
1532  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1533  * the atomic operation on the counter tcb_free.
1534  */
1535 void
1536 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1537 {
1538 	uint32_t index;
1539 	int tcb_num;
1540 	tx_control_block_t *tcb;
1541 
1542 	mutex_enter(&tx_ring->tcb_tail_lock);
1543 
1544 	index = tx_ring->tcb_tail;
1545 
1546 	tcb_num = 0;
1547 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1548 	while (tcb != NULL) {
1549 		ASSERT(tx_ring->free_list[index] == NULL);
1550 		tx_ring->free_list[index] = tcb;
1551 
1552 		tcb_num++;
1553 
1554 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1555 
1556 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1557 	}
1558 
1559 	tx_ring->tcb_tail = index;
1560 
1561 	/*
1562 	 * Update the number of the free tx control block
1563 	 * in the free list. This operation must be placed
1564 	 * under the protection of the lock.
1565 	 */
1566 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1567 
1568 	mutex_exit(&tx_ring->tcb_tail_lock);
1569 }
1570