1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  */
29 
30 #include "ixgbe_sw.h"
31 
32 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
33     uint32_t, boolean_t);
34 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
35     uint32_t);
36 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
37     ixgbe_tx_context_t *, size_t);
38 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
39 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
40 
41 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
42 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
43     ixgbe_tx_context_t *);
44 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
45     ixgbe_tx_context_t *);
46 
47 #ifndef IXGBE_DEBUG
48 #pragma inline(ixgbe_save_desc)
49 #pragma inline(ixgbe_get_context)
50 #pragma inline(ixgbe_check_context)
51 #pragma inline(ixgbe_fill_context)
52 #endif
53 
54 /*
55  * ixgbe_ring_tx
56  *
57  * Transmit one mblk through the specified tx ring.
58  *
59  * One mblk can consist of several fragments; each fragment
60  * is processed with a different method based on its size.
61  * Fragments with size less than the bcopy threshold are
62  * processed with bcopy; otherwise, they are processed with
63  * DMA binding.
64  *
65  * To process the mblk, a tx control block is taken from the
66  * free list. One tx control block contains one tx buffer, which
67  * is used to copy the data of mblk fragments, and one tx DMA
68  * handle, which is used to bind an mblk fragment to DMA resources.
69  *
70  * Several small mblk fragments can be copied into one tx control
71  * block's buffer, and the buffer is then transmitted with one
72  * tx descriptor.
73  *
74  * A large fragment binds to only one tx control block's DMA
75  * handle, but it can span several tx descriptors for transmission.
76  *
77  * So transmitting one packet (mblk) can use several tx control
78  * blocks. After the processing, those tx control blocks are
79  * put on the work list.
80  */
81 mblk_t *
82 ixgbe_ring_tx(void *arg, mblk_t *mp)
83 {
84 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
85 	ixgbe_t *ixgbe = tx_ring->ixgbe;
86 	tx_type_t current_flag, next_flag;
87 	uint32_t current_len, next_len;
88 	uint32_t desc_total;
89 	size_t mbsize;
90 	int desc_num;
91 	boolean_t copy_done, eop;
92 	mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
93 	tx_control_block_t *tcb;
94 	ixgbe_tx_context_t tx_context, *ctx;
95 	link_list_t pending_list;
96 	uint32_t len, hdr_frag_len, hdr_len;
97 	uint32_t copy_thresh;
98 	mblk_t *hdr_new_mp = NULL;
99 	mblk_t *hdr_pre_mp = NULL;
100 	mblk_t *hdr_nmp = NULL;
101 
102 	ASSERT(mp->b_next == NULL);
103 
104 	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
105 	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
106 	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
107 	    !(ixgbe->ixgbe_state & IXGBE_STARTED)) {
108 		return (mp);
109 	}
110 
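	/*
	 * Local copy of the bcopy threshold; it may be raised below so
	 * that an LSO packet header is always handled with bcopy.
	 */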
111 	copy_thresh = ixgbe->tx_copy_thresh;
112 
113 	/* Get the mblk size */
114 	mbsize = 0;
115 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
116 		mbsize += MBLKL(nmp);
117 	}
118 
119 	if (ixgbe->tx_hcksum_enable) {
120 		/*
121 		 * Retrieve checksum context information from the mblk
122 		 * that will be used to decide whether/how to fill the
123 		 * context descriptor.
124 		 */
125 		ctx = &tx_context;
126 		if (ixgbe_get_context(mp, ctx) < 0) {
127 			freemsg(mp);
128 			return (NULL);
129 		}
130 
131 		/*
132 		 * If the mblk size exceeds the max size ixgbe can
133 		 * process, discard this mblk and return NULL.
134 		 */
135 		if ((ctx->lso_flag &&
136 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
137 		    (!ctx->lso_flag &&
138 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
139 			freemsg(mp);
140 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
141 			return (NULL);
142 		}
143 	} else {
144 		ctx = NULL;
145 	}
146 
147 	/*
148 	 * Check and recycle tx descriptors.
149 	 * The recycle threshold here should be selected carefully
150 	 */
151 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
152 		tx_ring->tx_recycle(tx_ring);
153 	}
154 
155 	/*
156 	 * After the recycling, if tbd_free is still less than the
157 	 * overload threshold, assert overload and return mp; the
158 	 * transmit will need to be re-scheduled.
159 	 */
160 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
161 		tx_ring->reschedule = B_TRUE;
162 		IXGBE_DEBUG_STAT(tx_ring->stat_overload);
163 		return (mp);
164 	}
165 
166 	/*
167 	 * The pending_list is a linked list used to save the tx control
168 	 * blocks whose packet data has been processed but has not yet
169 	 * been put on the tx descriptor ring. It is used to reduce
170 	 * contention on tx_lock.
171 	 */
172 	LINK_LIST_INIT(&pending_list);
173 	desc_num = 0;
174 	desc_total = 0;
175 
176 	/*
177 	 * The software should guarantee that the LSO packet header
178 	 * (MAC+IP+TCP) fits within one descriptor. Here we reallocate and
179 	 * refill the header if it is not physically contiguous.
180 	 */
181 	if ((ctx != NULL) && ctx->lso_flag) {
182 		/* find the last fragment of the header */
183 		len = MBLKL(mp);
184 		ASSERT(len > 0);
185 		hdr_nmp = mp;
186 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
187 		while (len < hdr_len) {
188 			hdr_pre_mp = hdr_nmp;
189 			hdr_nmp = hdr_nmp->b_cont;
190 			len += MBLKL(hdr_nmp);
191 		}
192 		/*
193 		 * If the header ends exactly on an mblk boundary, the header
194 		 * and the payload are already in different mblks; we simply
195 		 * force the header to be copied into the pre-allocated
196 		 * page-aligned buffer.
196 		 */
197 		if (len == hdr_len)
198 			goto adjust_threshold;
199 
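		/*
		 * hdr_frag_len is the number of header bytes that reside in
		 * the last header fragment (hdr_nmp): subtract the header
		 * bytes carried by the preceding mblks from hdr_len.
		 */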
200 		hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
201 		/*
202 		 * There are two cases in which we need to reallocate an mblk
203 		 * for the last header fragment:
204 		 * 1. the header is in multiple mblks and the last fragment
205 		 * shares the same mblk with the payload
206 		 * 2. the header is in a single mblk shared with the payload
207 		 * and the header is not physically contiguous
208 		 */
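		/*
		 * P2NPHASE() gives the number of bytes left before the next
		 * page boundary; if that is smaller than hdr_len, the header
		 * would cross a page and may not be physically contiguous.
		 */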
209 		if ((hdr_nmp != mp) ||
210 		    (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
211 		    < hdr_len)) {
212 			IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
213 			/*
214 			 * Reallocate the mblk for the last header fragment;
215 			 * it is expected to be bcopied into the pre-allocated
216 			 * page-aligned buffer.
217 			 */
218 			hdr_new_mp = allocb(hdr_frag_len, NULL);
219 			if (!hdr_new_mp)
220 				return (mp);
221 			bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
222 			    hdr_frag_len);
223 			/* link the new header fragment with the other parts */
224 			hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
225 			hdr_new_mp->b_cont = hdr_nmp;
226 			if (hdr_pre_mp)
227 				hdr_pre_mp->b_cont = hdr_new_mp;
228 			else
229 				mp = hdr_new_mp;
230 			hdr_nmp->b_rptr += hdr_frag_len;
231 		}
232 adjust_threshold:
233 		/*
234 		 * Adjust the bcopy threshold to guarantee that
235 		 * the header is handled with bcopy.
236 		 */
237 		if (copy_thresh < hdr_len)
238 			copy_thresh = hdr_len;
239 	}
240 
241 	current_mp = mp;
242 	current_len = MBLKL(current_mp);
243 	/*
244 	 * Decide which method to use for the first fragment
245 	 */
246 	current_flag = (current_len <= copy_thresh) ?
247 	    USE_COPY : USE_DMA;
248 	/*
249 	 * If the mblk includes several contiguous small fragments,
250 	 * they may be copied into one buffer. This flag is used to
251 	 * indicate whether there are pending fragments that need to
252 	 * be copied to the current tx buffer.
253 	 *
254 	 * If this flag is B_TRUE, it indicates that a new tx control
255 	 * block is needed to process the next fragment using either
256 	 * copy or DMA binding.
257 	 *
258 	 * Otherwise, it indicates that the next fragment will be
259 	 * copied to the current tx buffer that is maintained by the
260 	 * current tx control block. No new tx control block is needed.
261 	 */
262 	copy_done = B_TRUE;
263 	while (current_mp) {
264 		next_mp = current_mp->b_cont;
265 		eop = (next_mp == NULL); /* Last fragment of the packet? */
266 		next_len = eop ? 0: MBLKL(next_mp);
267 
268 		/*
269 		 * When the current fragment is an empty fragment, if
270 		 * the next fragment will still be copied to the current
271 		 * tx buffer, we cannot skip this fragment here, because
272 		 * the copy processing is still pending completion. We have
273 		 * to process this empty fragment in the tx_copy routine.
274 		 *
275 		 * If the copy processing is completed or a DMA binding
276 		 * processing is just completed, we can just skip this
277 		 * empty fragment.
278 		 */
279 		if ((current_len == 0) && (copy_done)) {
280 			current_mp = next_mp;
281 			current_len = next_len;
282 			current_flag = (current_len <= copy_thresh) ?
283 			    USE_COPY : USE_DMA;
284 			continue;
285 		}
286 
287 		if (copy_done) {
288 			/*
289 			 * Get a new tx control block from the free list
290 			 */
291 			tcb = ixgbe_get_free_list(tx_ring);
292 
293 			if (tcb == NULL) {
294 				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
295 				goto tx_failure;
296 			}
297 
298 			/*
299 			 * Push the tx control block to the pending list
300 			 * to avoid using lock too early
301 			 */
302 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
303 		}
304 
305 		if (current_flag == USE_COPY) {
306 			/*
307 			 * Check whether to use bcopy or DMA binding to process
308 			 * the next fragment, and if using bcopy, whether we
309 			 * need to continue copying the next fragment into the
310 			 * current tx buffer.
311 			 */
312 			ASSERT((tcb->tx_buf.len + current_len) <=
313 			    tcb->tx_buf.size);
314 
315 			if (eop) {
316 				/*
317 				 * This is the last fragment of the packet, so
318 				 * the copy processing will be completed with
319 				 * this fragment.
320 				 */
321 				next_flag = USE_NONE;
322 				copy_done = B_TRUE;
323 			} else if ((tcb->tx_buf.len + current_len + next_len) >
324 			    tcb->tx_buf.size) {
325 				/*
326 				 * If the next fragment is too large to be
327 				 * copied to the current tx buffer, we need
328 				 * to complete the current copy processing.
329 				 */
330 				next_flag = (next_len > copy_thresh) ?
331 				    USE_DMA: USE_COPY;
332 				copy_done = B_TRUE;
333 			} else if (next_len > copy_thresh) {
334 				/*
335 				 * The next fragment needs to be processed with
336 				 * DMA binding. So the copy processing will be
337 				 * completed with the current fragment.
338 				 */
339 				next_flag = USE_DMA;
340 				copy_done = B_TRUE;
341 			} else {
342 				/*
343 				 * Continue to copy the next fragment to the
344 				 * current tx buffer.
345 				 */
346 				next_flag = USE_COPY;
347 				copy_done = B_FALSE;
348 			}
349 
350 			desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
351 			    current_len, copy_done);
352 		} else {
353 			/*
354 			 * Check whether to use bcopy or DMA binding to process
355 			 * the next fragment.
356 			 */
357 			next_flag = (next_len > copy_thresh) ?
358 			    USE_DMA: USE_COPY;
359 			ASSERT(copy_done == B_TRUE);
360 
361 			desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
362 			    current_len);
363 		}
364 
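		/*
		 * A return value of 0 means the copy is still pending (more
		 * fragments will be copied into the same tx buffer); a
		 * negative return value means the DMA binding failed.
		 */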
365 		if (desc_num > 0)
366 			desc_total += desc_num;
367 		else if (desc_num < 0)
368 			goto tx_failure;
369 
370 		current_mp = next_mp;
371 		current_len = next_len;
372 		current_flag = next_flag;
373 	}
374 
375 	/*
376 	 * Attach the mblk to the last tx control block
377 	 */
378 	ASSERT(tcb);
379 	ASSERT(tcb->mp == NULL);
380 	tcb->mp = mp;
381 
382 	/*
383 	 * The 82598/82599 chipsets have a limitation that no more than 32
384 	 * tx descriptors can be transmitted out at one time.
385 	 *
386 	 * Here is a workaround for it: pull up the mblk and then send it
387 	 * out with DMA binding. By doing so, no more than MAX_COOKIE (18)
388 	 * descriptors are needed.
389 	 */
390 	if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
391 		IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
392 
393 		/*
394 		 * Discard the mblk and free the used resources
395 		 */
396 		tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
397 		while (tcb) {
398 			tcb->mp = NULL;
399 			ixgbe_free_tcb(tcb);
400 			tcb = (tx_control_block_t *)
401 			    LIST_GET_NEXT(&pending_list, &tcb->link);
402 		}
403 
404 		/*
405 		 * Return the tx control blocks in the pending list to
406 		 * the free list.
407 		 */
408 		ixgbe_put_free_list(tx_ring, &pending_list);
409 
410 		/*
411 		 * Pull up the mblk and send it out with DMA binding.
412 		 */
413 		if ((pull_mp = msgpullup(mp, -1)) == NULL) {
414 			tx_ring->reschedule = B_TRUE;
415 
416 			/*
417 			 * If a new mblk has been allocated for the last header
418 			 * fragment of an LSO packet, we should restore the
419 			 * modified mp.
420 			 */
421 			if (hdr_new_mp) {
422 				hdr_new_mp->b_cont = NULL;
423 				freeb(hdr_new_mp);
424 				hdr_nmp->b_rptr -= hdr_frag_len;
425 				if (hdr_pre_mp)
426 					hdr_pre_mp->b_cont = hdr_nmp;
427 				else
428 					mp = hdr_nmp;
429 			}
430 			return (mp);
431 		}
432 
433 		LINK_LIST_INIT(&pending_list);
434 		desc_total = 0;
435 
436 		/*
437 		 * If the packet is an LSO packet, we simply transmit
438 		 * the header in one descriptor using bcopy.
439 		 */
440 		if ((ctx != NULL) && ctx->lso_flag) {
441 			hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
442 			    ctx->l4_hdr_len;
443 
444 			tcb = ixgbe_get_free_list(tx_ring);
445 			if (tcb == NULL) {
446 				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
447 				goto tx_failure;
448 			}
449 			desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
450 			    hdr_len, B_TRUE);
451 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
452 			desc_total  += desc_num;
453 
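			/*
			 * The header has been queued with bcopy above;
			 * advance b_rptr so the DMA binding below covers
			 * only the payload.
			 */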
454 			pull_mp->b_rptr += hdr_len;
455 		}
456 
457 		tcb = ixgbe_get_free_list(tx_ring);
458 		if (tcb == NULL) {
459 			IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
460 			goto tx_failure;
461 		}
462 		if ((ctx != NULL) && ctx->lso_flag) {
463 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
464 			    mbsize - hdr_len);
465 		} else {
466 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
467 			    mbsize);
468 		}
469 		if (desc_num < 0) {
470 			goto tx_failure;
471 		}
472 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
473 
474 		desc_total += desc_num;
475 		tcb->mp = pull_mp;
476 	}
477 
478 	/*
479 	 * Before filling the tx descriptor ring with the data, we need to
480 	 * ensure there are adequate free descriptors for the transmit
481 	 * (including one context descriptor).
482 	 * Do not use up all the tx descriptors.
483 	 * Otherwise tx recycle will fail and cause a false hang.
484 	 */
485 	if (tx_ring->tbd_free <= (desc_total + 1)) {
486 		tx_ring->tx_recycle(tx_ring);
487 	}
488 
489 	mutex_enter(&tx_ring->tx_lock);
490 	/*
491 	 * If the number of free tx descriptors is not enough for the
492 	 * transmit, then return mp.
493 	 *
494 	 * Note: we must put this check under the mutex protection to
495 	 * ensure the correctness when multiple threads access it in
496 	 * parallel.
497 	 */
498 	if (tx_ring->tbd_free <= (desc_total + 1)) {
499 		IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
500 		mutex_exit(&tx_ring->tx_lock);
501 		goto tx_failure;
502 	}
503 
504 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
505 	    mbsize);
506 
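	/*
	 * The extra descriptor, if present, is the context descriptor
	 * added by ixgbe_tx_fill_ring().
	 */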
507 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
508 
509 	tx_ring->stat_obytes += mbsize;
510 	tx_ring->stat_opackets ++;
511 
512 	mutex_exit(&tx_ring->tx_lock);
513 
514 	/*
515 	 * Now that the transmission has succeeded, free the original
516 	 * mp if we used the pulled-up mblk for the transmission.
517 	 */
518 	if (pull_mp) {
519 		freemsg(mp);
520 	}
521 
522 	return (NULL);
523 
524 tx_failure:
525 	/*
526 	 * If the transmission fails, free the pulled-up mblk.
527 	 */
528 	if (pull_mp) {
529 		freemsg(pull_mp);
530 	}
531 
532 	/*
533 	 * If a new mblk has been allocated for the last header
534 	 * fragment of an LSO packet, we should restore the
535 	 * modified mp.
536 	 */
537 	if (hdr_new_mp) {
538 		hdr_new_mp->b_cont = NULL;
539 		freeb(hdr_new_mp);
540 		hdr_nmp->b_rptr -= hdr_frag_len;
541 		if (hdr_pre_mp)
542 			hdr_pre_mp->b_cont = hdr_nmp;
543 		else
544 			mp = hdr_nmp;
545 	}
546 	/*
547 	 * Discard the mblk and free the used resources
548 	 */
549 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
550 	while (tcb) {
551 		tcb->mp = NULL;
552 
553 		ixgbe_free_tcb(tcb);
554 
555 		tcb = (tx_control_block_t *)
556 		    LIST_GET_NEXT(&pending_list, &tcb->link);
557 	}
558 
559 	/*
560 	 * Return the tx control blocks in the pending list to the free list.
561 	 */
562 	ixgbe_put_free_list(tx_ring, &pending_list);
563 
564 	/* Transmit failed, do not drop the mblk, reschedule the transmit */
565 	tx_ring->reschedule = B_TRUE;
566 
567 	return (mp);
568 }
569 
570 /*
571  * ixgbe_tx_copy
572  *
573  * Copy the mblk fragment to the pre-allocated tx buffer
574  */
575 static int
576 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
577     uint32_t len, boolean_t copy_done)
578 {
579 	dma_buffer_t *tx_buf;
580 	uint32_t desc_num;
581 	_NOTE(ARGUNUSED(tx_ring));
582 
583 	tx_buf = &tcb->tx_buf;
584 
585 	/*
586 	 * Copy the packet data of the mblk fragment into the
587 	 * pre-allocated tx buffer, which is maintained by the
588 	 * tx control block.
589 	 *
590 	 * Several mblk fragments can be copied into one tx buffer.
591 	 * The destination address of the current copied fragment in
592 	 * the tx buffer is next to the end of the previous copied
593 	 * fragment.
594 	 */
595 	if (len > 0) {
596 		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
597 
598 		tx_buf->len += len;
599 		tcb->frag_num++;
600 	}
601 
602 	desc_num = 0;
603 
604 	/*
605 	 * If it is the last fragment copied to the current tx buffer,
606 	 * in other words, if there's no remaining fragment or the remaining
607 	 * fragment requires a new tx control block to process, we need to
608 	 * complete the current copy processing by syncing up the current
609 	 * DMA buffer and saving the descriptor data.
610 	 */
611 	if (copy_done) {
612 		/*
613 		 * Sync the DMA buffer of the packet data
614 		 */
615 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
616 
617 		tcb->tx_type = USE_COPY;
618 
619 		/*
620 		 * Save the address and length to the private data structure
621 		 * of the tx control block, which will be used to fill the
622 		 * tx descriptor ring after all the fragments are processed.
623 		 */
624 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
625 		desc_num++;
626 	}
627 
628 	return (desc_num);
629 }
630 
631 /*
632  * ixgbe_tx_bind
633  *
634  * Bind the mblk fragment with DMA
635  */
636 static int
637 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
638     uint32_t len)
639 {
640 	int status, i;
641 	ddi_dma_cookie_t dma_cookie;
642 	uint_t ncookies;
643 	int desc_num;
644 
645 	/*
646 	 * Use DMA binding to process the mblk fragment
647 	 */
648 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
649 	    (caddr_t)mp->b_rptr, len,
650 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
651 	    0, &dma_cookie, &ncookies);
652 
653 	if (status != DDI_DMA_MAPPED) {
654 		IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
655 		return (-1);
656 	}
657 
658 	tcb->frag_num++;
659 	tcb->tx_type = USE_DMA;
660 	/*
661 	 * Each fragment can span several cookies. Each cookie is
662 	 * transmitted with one tx descriptor.
663 	 */
664 	desc_num = 0;
665 	for (i = ncookies; i > 0; i--) {
666 		/*
667 		 * Save the address and length to the private data structure
668 		 * of the tx control block, which will be used to fill the
669 		 * tx descriptor ring after all the fragments are processed.
670 		 */
671 		ixgbe_save_desc(tcb,
672 		    dma_cookie.dmac_laddress,
673 		    dma_cookie.dmac_size);
674 
675 		desc_num++;
676 
677 		if (i > 1)
678 			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
679 	}
680 
681 	return (desc_num);
682 }
683 
684 /*
685  * ixgbe_get_context
686  *
687  * Get the context information from the mblk
688  */
689 static int
690 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
691 {
692 	uint32_t start;
693 	uint32_t hckflags;
694 	uint32_t lsoflags;
695 	uint32_t mss;
696 	uint32_t len;
697 	uint32_t size;
698 	uint32_t offset;
699 	unsigned char *pos;
700 	ushort_t etype;
701 	uint32_t mac_hdr_len;
702 	uint32_t l4_proto;
703 	uint32_t l4_hdr_len;
704 
705 	ASSERT(mp != NULL);
706 
707 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
708 	bzero(ctx, sizeof (ixgbe_tx_context_t));
709 
710 	if (hckflags == 0) {
711 		return (0);
712 	}
713 
714 	ctx->hcksum_flags = hckflags;
715 
716 	mac_lso_get(mp, &mss, &lsoflags);
717 	ctx->mss = mss;
718 	ctx->lso_flag = (lsoflags == HW_LSO);
719 
720 	/*
721 	 * LSO relies on tx h/w checksum, so here we will drop the packet
722 	 * if the h/w checksum flags are not declared.
723 	 */
724 	if (ctx->lso_flag) {
725 		if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
726 		    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
727 			IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
728 			    "checksum flags are not specified when doing LSO");
729 			return (-1);
730 		}
731 	}
732 
733 	etype = 0;
734 	mac_hdr_len = 0;
735 	l4_proto = 0;
736 
737 	/*
738 	 * First get the position of the ether_type/ether_tpid.
739 	 * Here we don't assume the ether (VLAN) header is fully included
740 	 * in one mblk fragment, so we go through the fragments to parse
741 	 * the ether type.
742 	 */
743 	size = len = MBLKL(mp);
744 	offset = offsetof(struct ether_header, ether_type);
745 	while (size <= offset) {
746 		mp = mp->b_cont;
747 		ASSERT(mp != NULL);
748 		len = MBLKL(mp);
749 		size += len;
750 	}
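	/*
	 * size is the total length walked through the current mblk and len
	 * is that mblk's length, so (offset + len - size) is the offset of
	 * the target byte within the current mblk.
	 */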
751 	pos = mp->b_rptr + offset + len - size;
752 
753 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
754 	if (etype == ETHERTYPE_VLAN) {
755 		/*
756 		 * Get the position of the ether_type in VLAN header
757 		 */
758 		offset = offsetof(struct ether_vlan_header, ether_type);
759 		while (size <= offset) {
760 			mp = mp->b_cont;
761 			ASSERT(mp != NULL);
762 			len = MBLKL(mp);
763 			size += len;
764 		}
765 		pos = mp->b_rptr + offset + len - size;
766 
767 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
768 		mac_hdr_len = sizeof (struct ether_vlan_header);
769 	} else {
770 		mac_hdr_len = sizeof (struct ether_header);
771 	}
772 
773 	/*
774 	 * Here we don't assume the IP(V6) header is fully included in
775 	 * one mblk fragment.
776 	 */
777 	switch (etype) {
778 	case ETHERTYPE_IP:
779 		if (ctx->lso_flag) {
780 			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
781 			while (size <= offset) {
782 				mp = mp->b_cont;
783 				ASSERT(mp != NULL);
784 				len = MBLKL(mp);
785 				size += len;
786 			}
787 			pos = mp->b_rptr + offset + len - size;
788 			*((uint16_t *)(uintptr_t)(pos)) = 0;
789 
790 			offset = offsetof(ipha_t, ipha_hdr_checksum) +
791 			    mac_hdr_len;
792 			while (size <= offset) {
793 				mp = mp->b_cont;
794 				ASSERT(mp != NULL);
795 				len = MBLKL(mp);
796 				size += len;
797 			}
798 			pos = mp->b_rptr + offset + len - size;
799 			*((uint16_t *)(uintptr_t)(pos)) = 0;
800 
801 			/*
802 			 * To perform ixgbe LSO, we also need to fill
803 			 * the tcp checksum field of the packet with the
804 			 * following pseudo-header checksum:
805 			 * (ip_source_addr, ip_destination_addr, l4_proto)
806 			 * Currently the tcp/ip stack has already done this.
807 			 */
808 		}
809 
810 		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
811 		while (size <= offset) {
812 			mp = mp->b_cont;
813 			ASSERT(mp != NULL);
814 			len = MBLKL(mp);
815 			size += len;
816 		}
817 		pos = mp->b_rptr + offset + len - size;
818 
819 		l4_proto = *(uint8_t *)pos;
820 		break;
821 	case ETHERTYPE_IPV6:
822 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
823 		while (size <= offset) {
824 			mp = mp->b_cont;
825 			ASSERT(mp != NULL);
826 			len = MBLKL(mp);
827 			size += len;
828 		}
829 		pos = mp->b_rptr + offset + len - size;
830 
831 		l4_proto = *(uint8_t *)pos;
832 		break;
833 	default:
834 		/* Unrecoverable error */
835 		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
836 		return (-2);
837 	}
838 
839 	if (ctx->lso_flag) {
840 		offset = mac_hdr_len + start;
841 		while (size <= offset) {
842 			mp = mp->b_cont;
843 			ASSERT(mp != NULL);
844 			len = MBLKL(mp);
845 			size += len;
846 		}
847 		pos = mp->b_rptr + offset + len - size;
848 
849 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
850 	} else {
851 		/*
852 		 * l4 header length is only required for LSO
853 		 */
854 		l4_hdr_len = 0;
855 	}
856 
857 	ctx->mac_hdr_len = mac_hdr_len;
858 	ctx->ip_hdr_len = start;
859 	ctx->l4_proto = l4_proto;
860 	ctx->l4_hdr_len = l4_hdr_len;
861 
862 	return (0);
863 }
864 
865 /*
866  * ixgbe_check_context
867  *
868  * Check if a new context descriptor is needed
869  */
870 static boolean_t
871 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
872 {
873 	ixgbe_tx_context_t *last;
874 
875 	if (ctx == NULL)
876 		return (B_FALSE);
877 
878 	/*
879 	 * Compare the context data retrieved from the mblk and the
880 	 * stored data of the last context descriptor. The data that need
881 	 * to be checked are:
882 	 *	hcksum_flags
883 	 *	l4_proto
884 	 *	mac_hdr_len
885 	 *	ip_hdr_len
886 	 *	lso_flag
887 	 *	mss (only checked for LSO)
888 	 *	l4_hdr_len (only checked for LSO)
889 	 * If any one of the above items has changed, a new context
890 	 * descriptor will be needed.
891 	 */
892 	last = &tx_ring->tx_context;
893 
894 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
895 	    (ctx->l4_proto != last->l4_proto) ||
896 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
897 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
898 	    (ctx->lso_flag != last->lso_flag) ||
899 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
900 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
901 		return (B_TRUE);
902 	}
903 
904 	return (B_FALSE);
905 }
906 
907 /*
908  * ixgbe_fill_context
909  *
910  * Fill the context descriptor with hardware checksum information
911  */
912 static void
913 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
914     ixgbe_tx_context_t *ctx)
915 {
916 	/*
917 	 * Fill the context descriptor with the checksum
918 	 * context information we've got.
919 	 */
920 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
921 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
922 	    IXGBE_ADVTXD_MACLEN_SHIFT;
923 
924 	ctx_tbd->type_tucmd_mlhl =
925 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
926 
927 	if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
928 		ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
929 
930 	if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
931 		switch (ctx->l4_proto) {
932 		case IPPROTO_TCP:
933 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
934 			break;
935 		case IPPROTO_UDP:
936 			/*
937 			 * We don't have to explicitly set:
938 			 *	ctx_tbd->type_tucmd_mlhl |=
939 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
940 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
941 			 */
942 			break;
943 		default:
944 			/* Unrecoverable error */
945 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
946 			break;
947 		}
948 	}
949 
950 	ctx_tbd->seqnum_seed = 0;
951 
952 	if (ctx->lso_flag) {
953 		ctx_tbd->mss_l4len_idx =
954 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
955 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
956 	} else {
957 		ctx_tbd->mss_l4len_idx = 0;
958 	}
959 }
960 
961 /*
962  * ixgbe_tx_fill_ring
963  *
964  * Fill the tx descriptor ring with the data
965  */
966 static int
967 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
968     ixgbe_tx_context_t *ctx, size_t mbsize)
969 {
970 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
971 	boolean_t load_context;
972 	uint32_t index, tcb_index, desc_num;
973 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
974 	tx_control_block_t *tcb, *first_tcb;
975 	uint32_t hcksum_flags;
976 	int i;
977 
978 	ASSERT(mutex_owned(&tx_ring->tx_lock));
979 
980 	tbd = NULL;
981 	first_tbd = NULL;
982 	first_tcb = NULL;
983 	desc_num = 0;
984 	hcksum_flags = 0;
985 	load_context = B_FALSE;
986 
987 	/*
988 	 * Get the index of the first tx descriptor that will be filled,
989 	 * and the index of the first work list item that will be attached
990 	 * with the first used tx control block in the pending list.
991 	 * Note: the two indexes are the same.
992 	 */
993 	index = tx_ring->tbd_tail;
994 	tcb_index = tx_ring->tbd_tail;
995 
996 	if (ctx != NULL) {
997 		hcksum_flags = ctx->hcksum_flags;
998 
999 		/*
1000 		 * Check if a new context descriptor is needed for this packet
1001 		 */
1002 		load_context = ixgbe_check_context(tx_ring, ctx);
1003 
1004 		if (load_context) {
1005 			tbd = &tx_ring->tbd_ring[index];
1006 
1007 			/*
1008 			 * Fill the context descriptor with the
1009 			 * hardware checksum offload information.
1010 			 */
1011 			ixgbe_fill_context(
1012 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1013 
1014 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1015 			desc_num++;
1016 
1017 			/*
1018 			 * Store the checksum context data if
1019 			 * a new context descriptor is added
1020 			 */
1021 			tx_ring->tx_context = *ctx;
1022 		}
1023 	}
1024 
1025 	first_tbd = &tx_ring->tbd_ring[index];
1026 
1027 	/*
1028 	 * Fill tx data descriptors with the data saved in the pending list.
1029 	 * The tx control blocks in the pending list are added to the work list
1030 	 * at the same time.
1031 	 *
1032 	 * The work list corresponds strictly 1:1 to the descriptor ring.
1033 	 * One item of the work list corresponds to one tx descriptor. Because
1034 	 * one tx control block can span multiple tx descriptors, the tx
1035 	 * control block will be added to the first work list item that
1036 	 * corresponds to the first tx descriptor generated from that tx
1037 	 * control block.
1038 	 */
1039 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1040 	first_tcb = tcb;
1041 	while (tcb != NULL) {
1042 
1043 		for (i = 0; i < tcb->desc_num; i++) {
1044 			tbd = &tx_ring->tbd_ring[index];
1045 
1046 			tbd->read.buffer_addr = tcb->desc[i].address;
1047 			tbd->read.cmd_type_len = tcb->desc[i].length;
1048 
1049 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1050 			    | IXGBE_ADVTXD_DTYP_DATA;
1051 
1052 			tbd->read.olinfo_status = 0;
1053 
1054 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1055 			desc_num++;
1056 		}
1057 
1058 		/*
1059 		 * Add the tx control block to the work list
1060 		 */
1061 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
1062 		tx_ring->work_list[tcb_index] = tcb;
1063 
1064 		tcb_index = index;
1065 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1066 	}
1067 
1068 	if (load_context) {
1069 		/*
1070 		 * Count the context descriptor for
1071 		 * the first tx control block.
1072 		 */
1073 		first_tcb->desc_num++;
1074 	}
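	/*
	 * Record the index of the packet's last descriptor; the recycle
	 * routine uses it to check the Descriptor Done status of the
	 * whole packet.
	 */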
1075 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1076 
1077 	/*
1078 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1079 	 * valid in the first descriptor of the packet.
1080 	 * The paylen field is set in every first_tbd.
1081 	 * 82599 requires the packet length in the paylen field with or without
1082 	 * LSO, and 82598 will ignore it in non-LSO mode.
1083 	 */
1084 	ASSERT(first_tbd != NULL);
1085 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1086 
1087 	switch (hw->mac.type) {
1088 	case ixgbe_mac_82598EB:
1089 		if (ctx != NULL && ctx->lso_flag) {
1090 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1091 			first_tbd->read.olinfo_status |=
1092 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1093 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1094 		}
1095 		break;
1096 
1097 	case ixgbe_mac_82599EB:
1098 		if (ctx != NULL && ctx->lso_flag) {
1099 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1100 			first_tbd->read.olinfo_status |=
1101 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1102 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1103 		} else {
1104 			first_tbd->read.olinfo_status |=
1105 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1106 		}
1107 		break;
1108 
1109 	default:
1110 		break;
1111 	}
1112 
1113 	/* Set hardware checksum bits */
1114 	if (hcksum_flags != 0) {
1115 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1116 			first_tbd->read.olinfo_status |=
1117 			    IXGBE_ADVTXD_POPTS_IXSM;
1118 		if (hcksum_flags & HCK_PARTIALCKSUM)
1119 			first_tbd->read.olinfo_status |=
1120 			    IXGBE_ADVTXD_POPTS_TXSM;
1121 	}
1122 
1123 	/*
1124 	 * The last descriptor of the packet needs the End Of Packet (EOP)
1125 	 * and Report Status (RS) bits set.
1126 	 */
1127 	ASSERT(tbd != NULL);
1128 	tbd->read.cmd_type_len |=
1129 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1130 
1131 	/*
1132 	 * Sync the DMA buffer of the tx descriptor ring
1133 	 */
1134 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1135 
1136 	/*
1137 	 * Update the number of the free tx descriptors.
1138 	 * The mutual exclusion between the transmission and the recycling
1139 	 * (for the tx descriptor ring and the work list) is implemented
1140 	 * with the atomic operation on the number of the free tx descriptors.
1141 	 *
1142 	 * Note: we should always decrement the counter tbd_free before
1143 	 * advancing the hardware TDT pointer to avoid the race condition in
1144 	 * which, before the counter tbd_free is decremented, the transmit of
1145 	 * the tx descriptors has completed and the counter tbd_free has
1146 	 * already been increased by the tx recycling.
1147 	 */
1148 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1149 	ASSERT(i >= 0);
1150 
1151 	tx_ring->tbd_tail = index;
1152 
1153 	/*
1154 	 * Advance the hardware TDT pointer of the tx descriptor ring
1155 	 */
1156 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1157 
1158 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1159 	    DDI_FM_OK) {
1160 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1161 		    DDI_SERVICE_DEGRADED);
1162 		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1163 	}
1164 
1165 	return (desc_num);
1166 }
1167 
1168 /*
1169  * ixgbe_save_desc
1170  *
1171  * Save the address/length pair to the private array
1172  * of the tx control block. The address/length pairs
1173  * will be filled into the tx descriptor ring later.
1174  */
1175 static void
1176 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1177 {
1178 	sw_desc_t *desc;
1179 
1180 	desc = &tcb->desc[tcb->desc_num];
1181 	desc->address = address;
1182 	desc->length = length;
1183 
1184 	tcb->desc_num++;
1185 }
1186 
1187 /*
1188  * ixgbe_tx_recycle_legacy
1189  *
1190  * Recycle the tx descriptors and tx control blocks.
1191  *
1192  * The work list is traversed to check if the corresponding
1193  * tx descriptors have been transmitted. If so, the resources
1194  * bound to the tx control blocks will be freed, and those
1195  * tx control blocks will be returned to the free list.
1196  */
1197 uint32_t
1198 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1199 {
1200 	uint32_t index, last_index, prev_index;
1201 	int desc_num;
1202 	boolean_t desc_done;
1203 	tx_control_block_t *tcb;
1204 	link_list_t pending_list;
1205 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1206 
1207 	mutex_enter(&tx_ring->recycle_lock);
1208 
1209 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1210 
1211 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1212 		tx_ring->recycle_fail = 0;
1213 		tx_ring->stall_watchdog = 0;
1214 		if (tx_ring->reschedule) {
1215 			tx_ring->reschedule = B_FALSE;
1216 			mac_tx_ring_update(ixgbe->mac_hdl,
1217 			    tx_ring->ring_handle);
1218 		}
1219 		mutex_exit(&tx_ring->recycle_lock);
1220 		return (0);
1221 	}
1222 
1223 	/*
1224 	 * Sync the DMA buffer of the tx descriptor ring
1225 	 */
1226 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1227 
1228 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1229 		mutex_exit(&tx_ring->recycle_lock);
1230 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1231 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1232 		return (0);
1233 	}
1234 
1235 	LINK_LIST_INIT(&pending_list);
1236 	desc_num = 0;
1237 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1238 
1239 	tcb = tx_ring->work_list[index];
1240 	ASSERT(tcb != NULL);
1241 
1242 	while (tcb != NULL) {
1243 		/*
1244 		 * Get the last tx descriptor of this packet.
1245 		 * If the last tx descriptor is done, then
1246 		 * we can recycle all descriptors of a packet
1247 		 * which usually includes several tx control blocks.
1248 		 * For 82599, LSO descriptors cannot be recycled
1249 		 * unless the whole packet's transmission is done.
1250 		 * That's why packet-level recycling is used here.
1251 		 * For 82598, there is no such limit.
1252 		 */
1253 		last_index = tcb->last_index;
1254 		/*
1255 		 * MAX_TX_RING_SIZE is used to judge whether
1256 		 * the index is a valid value or not.
1257 		 */
1258 		if (last_index == MAX_TX_RING_SIZE)
1259 			break;
1260 
1261 		/*
1262 		 * Check if the Descriptor Done bit is set
1263 		 */
1264 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1265 		    IXGBE_TXD_STAT_DD;
1266 		if (desc_done) {
1267 			/*
1268 			 * recycle all descriptors of the packet
1269 			 */
1270 			while (tcb != NULL) {
1271 				/*
1272 				 * Strip off the tx control block from
1273 				 * the work list, and add it to the
1274 				 * pending list.
1275 				 */
1276 				tx_ring->work_list[index] = NULL;
1277 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1278 
1279 				/*
1280 				 * Count the total number of the tx
1281 				 * descriptors recycled
1282 				 */
1283 				desc_num += tcb->desc_num;
1284 
1285 				index = NEXT_INDEX(index, tcb->desc_num,
1286 				    tx_ring->ring_size);
1287 
1288 				tcb = tx_ring->work_list[index];
1289 
1290 				prev_index = PREV_INDEX(index, 1,
1291 				    tx_ring->ring_size);
1292 				if (prev_index == last_index)
1293 					break;
1294 			}
1295 		} else {
1296 			break;
1297 		}
1298 	}
1299 
1300 	/*
1301 	 * If no tx descriptors are recycled, no need to do more processing
1302 	 */
1303 	if (desc_num == 0) {
1304 		tx_ring->recycle_fail++;
1305 		mutex_exit(&tx_ring->recycle_lock);
1306 		return (0);
1307 	}
1308 
1309 	tx_ring->recycle_fail = 0;
1310 	tx_ring->stall_watchdog = 0;
1311 
1312 	/*
1313 	 * Update the head index of the tx descriptor ring
1314 	 */
1315 	tx_ring->tbd_head = index;
1316 
1317 	/*
1318 	 * Update the number of the free tx descriptors with atomic operations
1319 	 */
1320 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1321 
1322 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1323 	    (tx_ring->reschedule)) {
1324 		tx_ring->reschedule = B_FALSE;
1325 		mac_tx_ring_update(ixgbe->mac_hdl,
1326 		    tx_ring->ring_handle);
1327 	}
1328 	mutex_exit(&tx_ring->recycle_lock);
1329 
1330 	/*
1331 	 * Free the resources used by the tx control blocks
1332 	 * in the pending list
1333 	 */
1334 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1335 	while (tcb != NULL) {
1336 		/*
1337 		 * Release the resources occupied by the tx control block
1338 		 */
1339 		ixgbe_free_tcb(tcb);
1340 
1341 		tcb = (tx_control_block_t *)
1342 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1343 	}
1344 
1345 	/*
1346 	 * Add the tx control blocks in the pending list to the free list.
1347 	 */
1348 	ixgbe_put_free_list(tx_ring, &pending_list);
1349 
1350 	return (desc_num);
1351 }
1352 
1353 /*
1354  * ixgbe_tx_recycle_head_wb
1355  *
1356  * Check the head write-back, and recycle all the transmitted
1357  * tx descriptors and tx control blocks.
1358  */
1359 uint32_t
1360 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1361 {
1362 	uint32_t index;
1363 	uint32_t head_wb;
1364 	int desc_num;
1365 	tx_control_block_t *tcb;
1366 	link_list_t pending_list;
1367 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1368 
1369 	mutex_enter(&tx_ring->recycle_lock);
1370 
1371 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1372 
1373 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1374 		tx_ring->recycle_fail = 0;
1375 		tx_ring->stall_watchdog = 0;
1376 		if (tx_ring->reschedule) {
1377 			tx_ring->reschedule = B_FALSE;
1378 			mac_tx_ring_update(ixgbe->mac_hdl,
1379 			    tx_ring->ring_handle);
1380 		}
1381 		mutex_exit(&tx_ring->recycle_lock);
1382 		return (0);
1383 	}
1384 
1385 	/*
1386 	 * Sync the DMA buffer of the tx descriptor ring
1387 	 *
1388 	 * Note: For head write-back mode, the tx descriptors will not
1389 	 * be written back, but the head write-back value is stored at
1390 	 * the last extra tbd at the end of the DMA area, so we still need
1391 	 * to sync the head write-back value for the kernel.
1392 	 *
1393 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1394 	 */
1395 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1396 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1397 	    sizeof (uint32_t),
1398 	    DDI_DMA_SYNC_FORKERNEL);
1399 
1400 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1401 		mutex_exit(&tx_ring->recycle_lock);
1402 		ddi_fm_service_impact(ixgbe->dip,
1403 		    DDI_SERVICE_DEGRADED);
1404 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1405 		return (0);
1406 	}
1407 
1408 	LINK_LIST_INIT(&pending_list);
1409 	desc_num = 0;
1410 	index = tx_ring->tbd_head;	/* Next index to clean */
1411 
1412 	/*
1413 	 * Get the value of head write-back
1414 	 */
1415 	head_wb = *tx_ring->tbd_head_wb;
1416 	while (index != head_wb) {
1417 		tcb = tx_ring->work_list[index];
1418 		ASSERT(tcb != NULL);
1419 
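		/*
		 * OFFSET() gives the number of descriptors between index and
		 * the head write-back value (accounting for ring wrap); if it
		 * is smaller than tcb->desc_num, this tcb is still in flight.
		 */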
1420 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1421 		    tcb->desc_num) {
1422 			/*
1423 			 * The current tx control block is not
1424 			 * completely transmitted, stop recycling
1425 			 */
1426 			break;
1427 		}
1428 
1429 		/*
1430 		 * Strip off the tx control block from the work list,
1431 		 * and add it to the pending list.
1432 		 */
1433 		tx_ring->work_list[index] = NULL;
1434 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1435 
1436 		/*
1437 		 * Advance the index of the tx descriptor ring
1438 		 */
1439 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1440 
1441 		/*
1442 		 * Count the total number of the tx descriptors recycled
1443 		 */
1444 		desc_num += tcb->desc_num;
1445 	}
1446 
1447 	/*
1448 	 * If no tx descriptors are recycled, no need to do more processing
1449 	 */
1450 	if (desc_num == 0) {
1451 		tx_ring->recycle_fail++;
1452 		mutex_exit(&tx_ring->recycle_lock);
1453 		return (0);
1454 	}
1455 
1456 	tx_ring->recycle_fail = 0;
1457 	tx_ring->stall_watchdog = 0;
1458 
1459 	/*
1460 	 * Update the head index of the tx descriptor ring
1461 	 */
1462 	tx_ring->tbd_head = index;
1463 
1464 	/*
1465 	 * Update the number of the free tx descriptors with atomic operations
1466 	 */
1467 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1468 
1469 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1470 	    (tx_ring->reschedule)) {
1471 		tx_ring->reschedule = B_FALSE;
1472 		mac_tx_ring_update(ixgbe->mac_hdl,
1473 		    tx_ring->ring_handle);
1474 	}
1475 	mutex_exit(&tx_ring->recycle_lock);
1476 
1477 	/*
1478 	 * Free the resources used by the tx control blocks
1479 	 * in the pending list
1480 	 */
1481 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1482 	while (tcb) {
1483 		/*
1484 		 * Release the resources occupied by the tx control block
1485 		 */
1486 		ixgbe_free_tcb(tcb);
1487 
1488 		tcb = (tx_control_block_t *)
1489 		    LIST_GET_NEXT(&pending_list, &tcb->link);
1490 	}
1491 
1492 	/*
1493 	 * Add the tx control blocks in the pending list to the free list.
1494 	 */
1495 	ixgbe_put_free_list(tx_ring, &pending_list);
1496 
1497 	return (desc_num);
1498 }
1499 
1500 /*
1501  * ixgbe_free_tcb - free up the tx control block
1502  *
1503  * Free the resources of the tx control block, including
1504  * unbinding the previously bound DMA handle, and resetting other
1505  * control fields.
1506  */
1507 void
1508 ixgbe_free_tcb(tx_control_block_t *tcb)
1509 {
1510 	switch (tcb->tx_type) {
1511 	case USE_COPY:
1512 		/*
1513 		 * Reset the buffer length that is used for copy
1514 		 */
1515 		tcb->tx_buf.len = 0;
1516 		break;
1517 	case USE_DMA:
1518 		/*
1519 		 * Release the DMA resource that is used for
1520 		 * DMA binding.
1521 		 */
1522 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1523 		break;
1524 	default:
1525 		break;
1526 	}
1527 
1528 	/*
1529 	 * Free the mblk
1530 	 */
1531 	if (tcb->mp != NULL) {
1532 		freemsg(tcb->mp);
1533 		tcb->mp = NULL;
1534 	}
1535 
1536 	tcb->tx_type = USE_NONE;
1537 	tcb->last_index = MAX_TX_RING_SIZE;
1538 	tcb->frag_num = 0;
1539 	tcb->desc_num = 0;
1540 }
1541 
1542 /*
1543  * ixgbe_get_free_list - Get a free tx control block from the free list
1544  *
1545  * The atomic operation on the number of available tx control blocks
1546  * in the free list is used to keep this routine mutually exclusive
1547  * with the routine ixgbe_put_free_list.
1548  */
1549 static tx_control_block_t *
1550 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1551 {
1552 	tx_control_block_t *tcb;
1553 
1554 	/*
1555 	 * Check and update the number of free tx control blocks
1556 	 * in the free list.
1557 	 */
1558 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1559 		return (NULL);
1560 
1561 	mutex_enter(&tx_ring->tcb_head_lock);
1562 
1563 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1564 	ASSERT(tcb != NULL);
1565 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1566 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1567 	    tx_ring->free_list_size);
1568 
1569 	mutex_exit(&tx_ring->tcb_head_lock);
1570 
1571 	return (tcb);
1572 }
1573 
1574 /*
1575  * ixgbe_put_free_list
1576  *
1577  * Put a list of used tx control blocks back to the free list
1578  *
1579  * A mutex is used here to ensure the serialization. The mutual exclusion
1580  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1581  * the atomic operation on the counter tcb_free.
1582  */
1583 void
1584 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1585 {
1586 	uint32_t index;
1587 	int tcb_num;
1588 	tx_control_block_t *tcb;
1589 
1590 	mutex_enter(&tx_ring->tcb_tail_lock);
1591 
1592 	index = tx_ring->tcb_tail;
1593 
1594 	tcb_num = 0;
1595 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1596 	while (tcb != NULL) {
1597 		ASSERT(tx_ring->free_list[index] == NULL);
1598 		tx_ring->free_list[index] = tcb;
1599 
1600 		tcb_num++;
1601 
1602 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1603 
1604 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1605 	}
1606 
1607 	tx_ring->tcb_tail = index;
1608 
1609 	/*
1610 	 * Update the number of free tx control blocks
1611 	 * in the free list. This operation must be placed
1612 	 * under the protection of the lock.
1613 	 */
1614 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1615 
1616 	mutex_exit(&tx_ring->tcb_tail_lock);
1617 }
1618