xref: /illumos-gate/usr/src/uts/common/io/ixgbe/ixgbe_tx.c (revision 9b9d39d2a32ff806d2431dbcc50968ef1e6d46b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
29  * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
30  * Copyright 2021 Joyent, Inc.
31  */
32 
33 #include "ixgbe_sw.h"
34 
35 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
36     link_list_t *, const void *, size_t);
37 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
38     link_list_t *, uint8_t *, size_t);
39 static uint_t ixgbe_tcb_done(tx_control_block_t *);
40 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
41     ixgbe_tx_context_t *, size_t);
42 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
43 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
44     link_list_t *);
45 
46 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
47 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
48     ixgbe_tx_context_t *);
49 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
50     ixgbe_tx_context_t *);
51 
52 #ifndef IXGBE_DEBUG
53 #pragma inline(ixgbe_save_desc)
54 #pragma inline(ixgbe_get_context)
55 #pragma inline(ixgbe_check_context)
56 #pragma inline(ixgbe_fill_context)
57 #endif
58 
59 /*
60  * ixgbe_ring_tx
61  *
62  * To transmit one mblk through one specified ring.
63  *
64  * One mblk can consist of several fragments, each fragment
65  * will be processed with different methods based on the size.
66  * For the fragments with size less than the bcopy threshold,
67  * they will be processed by using bcopy; otherwise, they will
68  * be processed by using DMA binding.
69  *
70  * To process the mblk, for each fragment, we pass a pointer to the location
71  * of the current transmit control block (tcb) (initialized to NULL) to either
72  * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
73  * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
74  * if possible, or close out the current tcb, allocate a new tcb, and update
75  * the passed location (tx_control_block_t **) to reflect the new current tcb.
76  *
77  * Since bound mblk fragments require their own tcb, the close, allocate new,
78  * and update steps occur on every call to ixgbe_tx_bind(), but since
79  * consecutive small mblk fragments can be combined into a single tcb, the
80  * close, allocate new, and update steps may not occur on every call to
81  * ixgbe_tx_copy(). If the current tcb is already being used to copy data and
82  * we call ixgbe_tx_copy(), if there is enough room in the current tcb for
83  * the current mblk fragment, we append the data from the mblk fragment. If
84  * we call ixgbe_tx_copy() and the current tcb isn't being used to copy (i.e.
85  * the previous iteration of the loop called ixgbe_tx_bind()), or doesn't
86  * have enough space for the mblk fragment, we close out the current tcb,
87  * grab a new tcb from the free list, and update the current tcb to the
88  * newly obtained tcb.
89  *
90  * When LSO (large segment offload) is enabled, we first copy the packet
91  * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
92  * processing the remainder of the packet. The remaining bytes of the packet
93  * are then copied or mapped based on the fragment size as described above.
94  *
95  * Through the entire processing of a packet, we keep track of the number of
96  * DMA descriptors being used (either bound or pre-bound buffers used for
97  * copying) by this packet. Each tcb requires at least one DMA descriptor, but
98  * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
99  * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
100  * number of DMA descriptors that are closed (ready for the HW). Since the
101  * hardware limits the number of descriptors that can be used to transmit a
102  * single packet, if the total number DMA descriptors required to transmit
103  * this packet exceeds this limit, we perform a msgpullup() and try again.
104  * Since our DMA attributes limit the number of DMA cookies allowed to
105  * map a single span of memory to a value (MAX_COOKIE) less than the
106  * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
107  * as long as sufficient tcbs are available, we should always be able to
108  * process a packet that's contained in a single mblk_t (no additional
109  * fragments).
110  *
111  * Once all of the tcbs have been setup, ixgbe_tx_fill_ring() is called to
112  * setup the tx ring to transmit the tcbs and then tell the HW to start
113  * transmitting. When transmission is complete, an interrupt is triggered
114  * which calls the appropriate recycle routine to place the tcbs that were
115  * used in transmission back in the free list. We also may also try to
116  * recycle any available tcbs when the size of the tcb free list gets low
117  * or if the watchdog timer triggers.
118  *
119  */
120 mblk_t *
121 ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
122 {
123 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
124 	ixgbe_t *ixgbe = tx_ring->ixgbe;
125 	mblk_t *mp = orig_mp;
126 	mblk_t *pull_mp = NULL;
127 	tx_control_block_t *tcb;
128 	size_t mbsize, offset, len;
129 	uint32_t desc_total;
130 	uint32_t copy_thresh;
131 	int desc_num;
132 	ixgbe_tx_context_t tx_context, *ctx = NULL;
133 	link_list_t pending_list;
134 	boolean_t limit_retry = B_FALSE;
135 
136 	ASSERT(mp->b_next == NULL);
137 
138 	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
139 	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
140 	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
141 	    !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
142 	    ixgbe->link_state != LINK_STATE_UP) {
143 		freemsg(mp);
144 		return (NULL);
145 	}
146 
147 	copy_thresh = ixgbe->tx_copy_thresh;
148 
149 	mbsize = msgsize(mp);
150 
151 	if (ixgbe->tx_hcksum_enable) {
152 		/*
153 		 * Retrieve checksum context information from the mblk
154 		 * that will be used to decide whether/how to fill the
155 		 * context descriptor.
156 		 */
157 		ctx = &tx_context;
158 		if (ixgbe_get_context(mp, ctx) < 0) {
159 			freemsg(mp);
160 			return (NULL);
161 		}
162 
163 		/*
164 		 * If the mblk size exceeds the max size ixgbe could
165 		 * process, then discard this mblk, and return NULL.
166 		 */
167 		if ((ctx->lso_flag &&
168 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
169 		    (!ctx->lso_flag &&
170 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
171 			freemsg(mp);
172 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
173 			return (NULL);
174 		}
175 	}
176 
177 	/*
178 	 * If we use too many descriptors (see comments below), we may do
179 	 * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
180 	 * any time we error return past here, we should check and free
181 	 * pull_mp if != NULL.
182 	 */
183 retry:
184 	/*
185 	 * Check and recycle tx descriptors.
186 	 * The recycle threshold here should be selected carefully
187 	 */
188 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
189 		tx_ring->tx_recycle(tx_ring);
190 	}
191 
192 	/*
193 	 * After the recycling, if the tbd_free is less than the
194 	 * overload_threshold, assert overload, return mp;
195 	 * and we need to re-schedule the tx again.
196 	 */
197 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
198 		tx_ring->reschedule = B_TRUE;
199 		tx_ring->stat_overload++;
200 		if (pull_mp != NULL)
201 			freemsg(pull_mp);
202 		return (orig_mp);
203 	}
204 
205 	/*
206 	 * The pending_list is a linked list that is used to save
207 	 * the tx control blocks that have packet data processed
208 	 * but have not put the data to the tx descriptor ring.
209 	 * It is used to reduce the lock contention of the tx_lock.
210 	 */
211 	LINK_LIST_INIT(&pending_list);
212 
213 	tcb = NULL;
214 	desc_num = 0;
215 	desc_total = 0;
216 	offset = 0;
217 
218 	/*
219 	 * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
220 	 * into a single descriptor separate from the remaining data.
221 	 */
222 	if ((ctx != NULL) && ctx->lso_flag) {
223 		size_t hdr_len;
224 
225 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
226 
227 		/*
228 		 * copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
229 		 * and TCP/UDP headers) into tcb.
230 		 */
231 		for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
232 			size_t mlen = MBLKL(mp);
233 			size_t amt = MIN(mlen, len);
234 			int ret;
235 
236 			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
237 			    mp->b_rptr, amt);
238 			/*
239 			 * Since we're trying to copy all of the headers into
240 			 * a single buffer in a single tcb, if ixgbe_tx_copy()
241 			 * returns anything but 0, it means either no tcbs
242 			 * are available (< 0), or while copying, we spilled
243 			 * over and couldn't fit all the headers into a
244 			 * single tcb.
245 			 */
246 			if (ret != 0) {
247 				if (ret > 0)
248 					tx_ring->stat_lso_header_fail++;
249 				goto tx_failure;
250 			}
251 
252 			len -= amt;
253 
254 			/*
255 			 * If we copy less than the full amount of this
256 			 * mblk_t, we have some amount to copy below.
257 			 */
258 			if (amt < mlen) {
259 				offset = amt;
260 				break;
261 			}
262 		}
263 
264 		ASSERT0(len);
265 
266 		/*
267 		 * Finish off the header tcb, and start anew for the
268 		 * rest of the packet.
269 		 */
270 		desc_total += ixgbe_tcb_done(tcb);
271 		tcb = NULL;
272 	}
273 
274 	/*
275 	 * Process each remaining segment in the packet -- either binding
276 	 * the dblk_t or copying the contents of the dblk_t to an already
277 	 * bound buffer. When we copy, we will accumulate consecutive small
278 	 * (less than copy_thresh bytes) segments into a single tcb buffer
279 	 * until no more can fit (or we encounter a segment larger than
280 	 * copy_thresh and bind the dblk_t).
281 	 *
282 	 * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
283 	 * transmit control blocks (tcb)s as needed (and append them onto
284 	 * 'pending_list'). Both functions also replace 'tcb' with the new
285 	 * tcb when they allocate a new tcb.
286 	 *
287 	 * We stop trying to process the packet once the number of descriptors
288 	 * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
289 	 * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
290 	 * context descriptor (since we're already at the limit), so there's
291 	 * no point in continuing. We'll pull up the mblk_t (see below)
292 	 * and try again.
293 	 */
294 	while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
295 		uint8_t *rptr = mp->b_rptr + offset;
296 		int ret;
297 
298 		len = MBLKL(mp) - offset;
299 		offset = 0;
300 
301 		if (len > copy_thresh) {
302 			ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
303 			    len);
304 		} else {
305 			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
306 			    len);
307 		}
308 
309 		if (ret < 0)
310 			goto tx_failure;
311 
312 		desc_total += ret;
313 		mp = mp->b_cont;
314 	}
315 
316 	/* Finish off the last tcb */
317 	desc_total += ixgbe_tcb_done(tcb);
318 
319 	/*
320 	 * 82598/82599 chipset has a limitation that no more than 32 tx
321 	 * descriptors can be transmited out at one time. As noted above,
322 	 * we need to include space for a context descriptor in case its
323 	 * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
324 	 * as well as when it exceeds the limit.
325 	 *
326 	 * If we exceed this limit, we take the hit, do a msgpullup(), and
327 	 * then try again. Our DMA attributes guarantee we should never use
328 	 * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
329 	 * should only need to retry once.
330 	 */
331 	if (desc_total >= IXGBE_TX_DESC_LIMIT) {
332 		/* We shouldn't hit this path twice */
333 		VERIFY0(limit_retry);
334 
335 		tx_ring->stat_break_tbd_limit++;
336 
337 		/* Release all the tcbs we used previously */
338 		ixgbe_put_free_list(tx_ring, &pending_list);
339 		desc_total = 0;
340 		offset = 0;
341 
342 		pull_mp = msgpullup(orig_mp, -1);
343 		if (pull_mp == NULL) {
344 			tx_ring->reschedule = B_TRUE;
345 			return (orig_mp);
346 		}
347 
348 		mp = pull_mp;
349 		limit_retry = B_TRUE;
350 		goto retry;
351 	}
352 
353 	/*
354 	 * Before filling the tx descriptor ring with the data, we need to
355 	 * ensure there are adequate free descriptors for transmit
356 	 * (including one context descriptor).
357 	 * Do not use up all the tx descriptors.
358 	 * Otherwise tx recycle will fail and cause false hang.
359 	 */
360 	if (tx_ring->tbd_free <= (desc_total + 1)) {
361 		tx_ring->tx_recycle(tx_ring);
362 	}
363 
364 	mutex_enter(&tx_ring->tx_lock);
365 	/*
366 	 * If the number of free tx descriptors is not enough for transmit
367 	 * then return mp.
368 	 *
369 	 * Note: we must put this check under the mutex protection to
370 	 * ensure the correctness when multiple threads access it in
371 	 * parallel.
372 	 */
373 	if (tx_ring->tbd_free <= (desc_total + 1)) {
374 		tx_ring->stat_fail_no_tbd++;
375 		mutex_exit(&tx_ring->tx_lock);
376 		goto tx_failure;
377 	}
378 
379 	/*
380 	 * Attach the mblk_t we've setup to the last control block.
381 	 * This is only done once we know there are enough free descriptors
382 	 * to transmit so that the cleanup in tx_failure doesn't try to
383 	 * call freemsg() on mp (since we will want to return it).
384 	 */
385 	tcb->mp = (pull_mp != NULL) ? pull_mp : orig_mp;
386 
387 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
388 	    mbsize);
389 
390 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
391 
392 	tx_ring->stat_obytes += mbsize;
393 	tx_ring->stat_opackets++;
394 
395 	mutex_exit(&tx_ring->tx_lock);
396 
397 	/*
398 	 * Now that tx is done, if we pulled up the original message, we
399 	 * can free the original message since it is no longer being
400 	 * used.
401 	 */
402 	if (pull_mp != NULL) {
403 		freemsg(orig_mp);
404 	}
405 
406 	return (NULL);
407 
408 tx_failure:
409 	/*
410 	 * If transmission fails, need to free the pulling up mblk.
411 	 */
412 	if (pull_mp) {
413 		freemsg(pull_mp);
414 	}
415 
416 	/*
417 	 * Return the tx control blocks in the pending list to the free list.
418 	 */
419 	ixgbe_put_free_list(tx_ring, &pending_list);
420 
421 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
422 	tx_ring->reschedule = B_TRUE;
423 
424 	return (orig_mp);
425 }
426 
427 /*
428  * ixgbe_tx_copy
429  *
430  * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
431  * otherwise return the number of descriptors we've completed in this call.
432  */
433 static int
434 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
435     link_list_t *pending_list, const void *buf, size_t len)
436 {
437 	tx_control_block_t *tcb = *tcbp;
438 	dma_buffer_t *tx_buf;
439 	uint32_t desc_num = 0;
440 
441 	/*
442 	 * We need a new tcb -- either the current one (tcb) is NULL because
443 	 * we just started, tcb is being used for DMA, or tcb isn't large enough
444 	 * to hold the contents we need to copy.
445 	 */
446 	if (tcb == NULL || tcb->tx_type == USE_DMA ||
447 	    tcb->tx_buf.len + len > tcb->tx_buf.size) {
448 		tx_control_block_t *newtcb;
449 
450 		newtcb = ixgbe_get_free_list(tx_ring, pending_list);
451 		if (newtcb == NULL)
452 			return (-1);
453 
454 		newtcb->tx_type = USE_COPY;
455 
456 		if (tcb != NULL)
457 			desc_num += ixgbe_tcb_done(tcb);
458 		*tcbp = tcb = newtcb;
459 	}
460 
461 	ASSERT3S(tcb->tx_type, ==, USE_COPY);
462 	tx_buf = &tcb->tx_buf;
463 
464 	/*
465 	 * Copy the packet data of the mblk fragment into the
466 	 * pre-allocated tx buffer, which is maintained by the
467 	 * tx control block.
468 	 *
469 	 * Several mblk fragments can be copied into one tx buffer.
470 	 * The destination address of the current copied fragment in
471 	 * the tx buffer is next to the end of the previous copied
472 	 * fragment.
473 	 */
474 	if (len > 0) {
475 		bcopy(buf, tx_buf->address + tx_buf->len, len);
476 
477 		tx_buf->len += len;
478 		tcb->frag_num++;
479 	}
480 
481 	return (desc_num);
482 }
483 
484 /*
485  * ixgbe_tx_bind
486  *
487  * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
488  * returns the number of descriptors completed in this call. This count
489  * can include descriptors that weren't filled in by the current call to
490  * ixgbe_tx_bind() but were being used (but not yet completed) in previous
491  * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
492  */
493 static int
494 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
495     link_list_t *pending_list, uint8_t *buf, size_t len)
496 {
497 	tx_control_block_t *tcb = NULL;
498 	uint_t desc_num = 0;
499 	int status;
500 
501 	tcb = ixgbe_get_free_list(tx_ring, pending_list);
502 	if (tcb == NULL)
503 		return (-1);
504 
505 	/*
506 	 * Use DMA binding to process the mblk fragment
507 	 */
508 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
509 	    (caddr_t)buf, len,
510 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
511 	    0, NULL, NULL);
512 
513 	if (status != DDI_DMA_MAPPED) {
514 		tx_ring->stat_fail_dma_bind++;
515 		return (-1);
516 	}
517 
518 	tcb->frag_num++;
519 	tcb->tx_type = USE_DMA;
520 
521 	/*
522 	 * If there was an old tcb, we're about to replace it. Finish
523 	 * setting up the old tcb so we can replace it with the new one.
524 	 */
525 	if (*tcbp != NULL)
526 		desc_num += ixgbe_tcb_done(*tcbp);
527 
528 	*tcbp = tcb;
529 	return (desc_num);
530 }
531 
532 /*
533  * Once we're done populating a tcb (either by binding or copying into
534  * a buffer in the tcb), get it ready for tx and return the number of
535  * descriptors used.
536  */
537 static uint_t
538 ixgbe_tcb_done(tx_control_block_t *tcb)
539 {
540 	uint_t desc_num = 0;
541 
542 	if (tcb->tx_type == USE_DMA) {
543 		const ddi_dma_cookie_t *c;
544 
545 		for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
546 		    c != NULL;
547 		    c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
548 			/*
549 			 * Save the address and length to the private data
550 			 * structure of the tx control block, which will be
551 			 * used to fill the tx descriptor ring after all the
552 			 * fragments are processed.
553 			 */
554 			ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
555 			desc_num++;
556 		}
557 	} else if (tcb->tx_type == USE_COPY) {
558 		dma_buffer_t *tx_buf = &tcb->tx_buf;
559 
560 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
561 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
562 		desc_num++;
563 	} else {
564 		panic("invalid tcb type");
565 	}
566 
567 	return (desc_num);
568 }
569 
570 /*
571  * ixgbe_get_context
572  *
573  * Get the context information from the mblk
574  */
575 static int
576 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
577 {
578 	uint32_t start;
579 	uint32_t hckflags;
580 	uint32_t lsoflags;
581 	uint32_t lsocksum;
582 	uint32_t mss;
583 	uint32_t len;
584 	uint32_t size;
585 	uint32_t offset;
586 	unsigned char *pos;
587 	ushort_t etype;
588 	uint32_t mac_hdr_len;
589 	uint32_t l4_proto;
590 	uint32_t l4_hdr_len;
591 
592 	ASSERT(mp != NULL);
593 
594 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
595 	bzero(ctx, sizeof (ixgbe_tx_context_t));
596 
597 	if (hckflags == 0) {
598 		return (0);
599 	}
600 
601 	ctx->hcksum_flags = hckflags;
602 
603 	mac_lso_get(mp, &mss, &lsoflags);
604 	ctx->mss = mss;
605 	ctx->lso_flag = (lsoflags == HW_LSO);
606 
607 	etype = 0;
608 	mac_hdr_len = 0;
609 	l4_proto = 0;
610 
611 	/*
612 	 * Firstly get the position of the ether_type/ether_tpid.
613 	 * Here we don't assume the ether (VLAN) header is fully included
614 	 * in one mblk fragment, so we go thourgh the fragments to parse
615 	 * the ether type.
616 	 */
617 	size = len = MBLKL(mp);
618 	offset = offsetof(struct ether_header, ether_type);
619 	while (size <= offset) {
620 		mp = mp->b_cont;
621 		ASSERT(mp != NULL);
622 		len = MBLKL(mp);
623 		size += len;
624 	}
625 	pos = mp->b_rptr + offset + len - size;
626 
627 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
628 	if (etype == ETHERTYPE_VLAN) {
629 		/*
630 		 * Get the position of the ether_type in VLAN header
631 		 */
632 		offset = offsetof(struct ether_vlan_header, ether_type);
633 		while (size <= offset) {
634 			mp = mp->b_cont;
635 			ASSERT(mp != NULL);
636 			len = MBLKL(mp);
637 			size += len;
638 		}
639 		pos = mp->b_rptr + offset + len - size;
640 
641 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
642 		mac_hdr_len = sizeof (struct ether_vlan_header);
643 	} else {
644 		mac_hdr_len = sizeof (struct ether_header);
645 	}
646 
647 	/*
648 	 * Here we don't assume the IP(V6) header is fully included in
649 	 * one mblk fragment.
650 	 */
651 	lsocksum = HCK_PARTIALCKSUM;
652 	ctx->l3_proto = etype;
653 	switch (etype) {
654 	case ETHERTYPE_IP:
655 		if (ctx->lso_flag) {
656 			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
657 			while (size <= offset) {
658 				mp = mp->b_cont;
659 				ASSERT(mp != NULL);
660 				len = MBLKL(mp);
661 				size += len;
662 			}
663 			pos = mp->b_rptr + offset + len - size;
664 			*((uint16_t *)(uintptr_t)(pos)) = 0;
665 
666 			offset = offsetof(ipha_t, ipha_hdr_checksum) +
667 			    mac_hdr_len;
668 			while (size <= offset) {
669 				mp = mp->b_cont;
670 				ASSERT(mp != NULL);
671 				len = MBLKL(mp);
672 				size += len;
673 			}
674 			pos = mp->b_rptr + offset + len - size;
675 			*((uint16_t *)(uintptr_t)(pos)) = 0;
676 
677 			/*
678 			 * To perform ixgbe LSO, here also need to fill
679 			 * the tcp checksum field of the packet with the
680 			 * following pseudo-header checksum:
681 			 * (ip_source_addr, ip_destination_addr, l4_proto)
682 			 * Currently the tcp/ip stack has done it.
683 			 */
684 			lsocksum |= HCK_IPV4_HDRCKSUM;
685 		}
686 
687 		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
688 		while (size <= offset) {
689 			mp = mp->b_cont;
690 			ASSERT(mp != NULL);
691 			len = MBLKL(mp);
692 			size += len;
693 		}
694 		pos = mp->b_rptr + offset + len - size;
695 
696 		l4_proto = *(uint8_t *)pos;
697 		break;
698 	case ETHERTYPE_IPV6:
699 		/*
700 		 * We need to zero out the length in the header.
701 		 */
702 		if (ctx->lso_flag) {
703 			offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
704 			while (size <= offset) {
705 				mp = mp->b_cont;
706 				ASSERT(mp != NULL);
707 				len = MBLKL(mp);
708 				size += len;
709 			}
710 			pos = mp->b_rptr + offset + len - size;
711 			*((uint16_t *)(uintptr_t)(pos)) = 0;
712 		}
713 
714 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
715 		while (size <= offset) {
716 			mp = mp->b_cont;
717 			ASSERT(mp != NULL);
718 			len = MBLKL(mp);
719 			size += len;
720 		}
721 		pos = mp->b_rptr + offset + len - size;
722 
723 		l4_proto = *(uint8_t *)pos;
724 		break;
725 	default:
726 		/* Unrecoverable error */
727 		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
728 		return (-2);
729 	}
730 
731 	if (ctx->lso_flag) {
732 		/*
733 		 * LSO relies on tx h/w checksum, so here will drop the packet
734 		 * if h/w checksum flag is not declared.
735 		 */
736 		if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
737 			IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
738 			    "are not set for LSO, found 0x%x, needed bits 0x%x",
739 			    ctx->hcksum_flags, lsocksum);
740 			return (-1);
741 		}
742 
743 
744 		offset = mac_hdr_len + start;
745 		while (size <= offset) {
746 			mp = mp->b_cont;
747 			ASSERT(mp != NULL);
748 			len = MBLKL(mp);
749 			size += len;
750 		}
751 		pos = mp->b_rptr + offset + len - size;
752 
753 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
754 	} else {
755 		/*
756 		 * l4 header length is only required for LSO
757 		 */
758 		l4_hdr_len = 0;
759 	}
760 
761 	ctx->mac_hdr_len = mac_hdr_len;
762 	ctx->ip_hdr_len = start;
763 	ctx->l4_proto = l4_proto;
764 	ctx->l4_hdr_len = l4_hdr_len;
765 
766 	return (0);
767 }
768 
769 /*
770  * ixgbe_check_context
771  *
772  * Check if a new context descriptor is needed
773  */
774 static boolean_t
775 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
776 {
777 	ixgbe_tx_context_t *last;
778 
779 	if (ctx == NULL)
780 		return (B_FALSE);
781 
782 	/*
783 	 * Compare the context data retrieved from the mblk and the
784 	 * stored data of the last context descriptor. The data need
785 	 * to be checked are:
786 	 *	hcksum_flags
787 	 *	l4_proto
788 	 *	mac_hdr_len
789 	 *	ip_hdr_len
790 	 *	lso_flag
791 	 *	mss (only checked for LSO)
792 	 *	l4_hr_len (only checked for LSO)
793 	 * Either one of the above data is changed, a new context descriptor
794 	 * will be needed.
795 	 */
796 	last = &tx_ring->tx_context;
797 
798 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
799 	    (ctx->l4_proto != last->l4_proto) ||
800 	    (ctx->l3_proto != last->l3_proto) ||
801 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
802 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
803 	    (ctx->lso_flag != last->lso_flag) ||
804 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
805 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
806 		return (B_TRUE);
807 	}
808 
809 	return (B_FALSE);
810 }
811 
812 /*
813  * ixgbe_fill_context
814  *
815  * Fill the context descriptor with hardware checksum informations
816  */
817 static void
818 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
819     ixgbe_tx_context_t *ctx)
820 {
821 	/*
822 	 * Fill the context descriptor with the checksum
823 	 * context information we've got.
824 	 */
825 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
826 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
827 	    IXGBE_ADVTXD_MACLEN_SHIFT;
828 
829 	ctx_tbd->type_tucmd_mlhl =
830 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
831 	/*
832 	 * When we have a TX context set up, we enforce that the ethertype is
833 	 * either IPv4 or IPv6 in ixgbe_get_tx_context().
834 	 */
835 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
836 		if (ctx->l3_proto == ETHERTYPE_IP) {
837 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
838 		} else {
839 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
840 		}
841 	}
842 
843 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
844 		switch (ctx->l4_proto) {
845 		case IPPROTO_TCP:
846 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
847 			break;
848 		case IPPROTO_UDP:
849 			/*
850 			 * We don't have to explicitly set:
851 			 *	ctx_tbd->type_tucmd_mlhl |=
852 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
853 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
854 			 */
855 			break;
856 		default:
857 			/* Unrecoverable error */
858 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
859 			break;
860 		}
861 	}
862 
863 	ctx_tbd->seqnum_seed = 0;
864 
865 	if (ctx->lso_flag) {
866 		ctx_tbd->mss_l4len_idx =
867 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
868 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
869 	} else {
870 		ctx_tbd->mss_l4len_idx = 0;
871 	}
872 }
873 
874 /*
875  * ixgbe_tx_fill_ring
876  *
877  * Fill the tx descriptor ring with the data
878  */
879 static int
880 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
881     ixgbe_tx_context_t *ctx, size_t mbsize)
882 {
883 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
884 	boolean_t load_context;
885 	uint32_t index, tcb_index, desc_num;
886 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
887 	tx_control_block_t *tcb, *first_tcb;
888 	uint32_t hcksum_flags;
889 	int i;
890 
891 	ASSERT(mutex_owned(&tx_ring->tx_lock));
892 
893 	tbd = NULL;
894 	first_tbd = NULL;
895 	first_tcb = NULL;
896 	desc_num = 0;
897 	hcksum_flags = 0;
898 	load_context = B_FALSE;
899 
900 	/*
901 	 * Get the index of the first tx descriptor that will be filled,
902 	 * and the index of the first work list item that will be attached
903 	 * with the first used tx control block in the pending list.
904 	 * Note: the two indexes are the same.
905 	 */
906 	index = tx_ring->tbd_tail;
907 	tcb_index = tx_ring->tbd_tail;
908 
909 	if (ctx != NULL) {
910 		hcksum_flags = ctx->hcksum_flags;
911 
912 		/*
913 		 * Check if a new context descriptor is needed for this packet
914 		 */
915 		load_context = ixgbe_check_context(tx_ring, ctx);
916 
917 		if (load_context) {
918 			tbd = &tx_ring->tbd_ring[index];
919 
920 			/*
921 			 * Fill the context descriptor with the
922 			 * hardware checksum offload informations.
923 			 */
924 			ixgbe_fill_context(
925 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
926 
927 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
928 			desc_num++;
929 
930 			/*
931 			 * Store the checksum context data if
932 			 * a new context descriptor is added
933 			 */
934 			tx_ring->tx_context = *ctx;
935 		}
936 	}
937 
938 	first_tbd = &tx_ring->tbd_ring[index];
939 
940 	/*
941 	 * Fill tx data descriptors with the data saved in the pending list.
942 	 * The tx control blocks in the pending list are added to the work list
943 	 * at the same time.
944 	 *
945 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
946 	 * One item of the work list corresponds to one tx descriptor. Because
947 	 * one tx control block can span multiple tx descriptors, the tx
948 	 * control block will be added to the first work list item that
949 	 * corresponds to the first tx descriptor generated from that tx
950 	 * control block.
951 	 */
952 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
953 	first_tcb = tcb;
954 	while (tcb != NULL) {
955 
956 		for (i = 0; i < tcb->desc_num; i++) {
957 			tbd = &tx_ring->tbd_ring[index];
958 
959 			tbd->read.buffer_addr = tcb->desc[i].address;
960 			tbd->read.cmd_type_len = tcb->desc[i].length;
961 
962 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
963 			    | IXGBE_ADVTXD_DTYP_DATA;
964 
965 			tbd->read.olinfo_status = 0;
966 
967 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
968 			desc_num++;
969 		}
970 
971 		/*
972 		 * Add the tx control block to the work list
973 		 */
974 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
975 		tx_ring->work_list[tcb_index] = tcb;
976 
977 		tcb_index = index;
978 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
979 	}
980 
981 	if (load_context) {
982 		/*
983 		 * Count the context descriptor for
984 		 * the first tx control block.
985 		 */
986 		first_tcb->desc_num++;
987 	}
988 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
989 
990 	/*
991 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
992 	 * valid in the first descriptor of the packet.
993 	 * Setting paylen in every first_tbd for all parts.
994 	 * 82599, X540 and X550 require the packet length in paylen field
995 	 * with or without LSO and 82598 will ignore it in non-LSO mode.
996 	 */
997 	ASSERT(first_tbd != NULL);
998 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
999 
1000 	switch (hw->mac.type) {
1001 	case ixgbe_mac_82598EB:
1002 		if (ctx != NULL && ctx->lso_flag) {
1003 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1004 			first_tbd->read.olinfo_status |=
1005 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1006 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1007 		}
1008 		break;
1009 
1010 	case ixgbe_mac_82599EB:
1011 	case ixgbe_mac_X540:
1012 	case ixgbe_mac_X550:
1013 	case ixgbe_mac_X550EM_x:
1014 	case ixgbe_mac_X550EM_a:
1015 		if (ctx != NULL && ctx->lso_flag) {
1016 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1017 			first_tbd->read.olinfo_status |=
1018 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1019 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1020 		} else {
1021 			first_tbd->read.olinfo_status |=
1022 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1023 		}
1024 		break;
1025 
1026 	default:
1027 		break;
1028 	}
1029 
1030 	/* Set hardware checksum bits */
1031 	if (hcksum_flags != 0) {
1032 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1033 			first_tbd->read.olinfo_status |=
1034 			    IXGBE_ADVTXD_POPTS_IXSM;
1035 		if (hcksum_flags & HCK_PARTIALCKSUM)
1036 			first_tbd->read.olinfo_status |=
1037 			    IXGBE_ADVTXD_POPTS_TXSM;
1038 	}
1039 
1040 	/*
1041 	 * The last descriptor of packet needs End Of Packet (EOP),
1042 	 * and Report Status (RS) bits set
1043 	 */
1044 	ASSERT(tbd != NULL);
1045 	tbd->read.cmd_type_len |=
1046 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1047 
1048 	/*
1049 	 * Sync the DMA buffer of the tx descriptor ring
1050 	 */
1051 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1052 
1053 	/*
1054 	 * Update the number of the free tx descriptors.
1055 	 * The mutual exclusion between the transmission and the recycling
1056 	 * (for the tx descriptor ring and the work list) is implemented
1057 	 * with the atomic operation on the number of the free tx descriptors.
1058 	 *
1059 	 * Note: we should always decrement the counter tbd_free before
1060 	 * advancing the hardware TDT pointer to avoid the race condition -
1061 	 * before the counter tbd_free is decremented, the transmit of the
1062 	 * tx descriptors has done and the counter tbd_free is increased by
1063 	 * the tx recycling.
1064 	 */
1065 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1066 	ASSERT(i >= 0);
1067 
1068 	tx_ring->tbd_tail = index;
1069 
1070 	/*
1071 	 * Advance the hardware TDT pointer of the tx descriptor ring
1072 	 */
1073 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1074 
1075 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1076 	    DDI_FM_OK) {
1077 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1078 		    DDI_SERVICE_DEGRADED);
1079 		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1080 	}
1081 
1082 	return (desc_num);
1083 }
1084 
1085 /*
1086  * ixgbe_save_desc
1087  *
1088  * Save the address/length pair to the private array
1089  * of the tx control block. The address/length pairs
1090  * will be filled into the tx descriptor ring later.
1091  */
1092 static void
1093 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1094 {
1095 	sw_desc_t *desc;
1096 
1097 	desc = &tcb->desc[tcb->desc_num];
1098 	desc->address = address;
1099 	desc->length = length;
1100 
1101 	tcb->desc_num++;
1102 }
1103 
1104 /*
1105  * ixgbe_tx_recycle_legacy
1106  *
1107  * Recycle the tx descriptors and tx control blocks.
1108  *
1109  * The work list is traversed to check if the corresponding
1110  * tx descriptors have been transmitted. If so, the resources
1111  * bound to the tx control blocks will be freed, and those
1112  * tx control blocks will be returned to the free list.
1113  */
1114 uint32_t
1115 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1116 {
1117 	uint32_t index, last_index, prev_index;
1118 	int desc_num;
1119 	boolean_t desc_done;
1120 	tx_control_block_t *tcb;
1121 	link_list_t pending_list;
1122 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1123 
1124 	mutex_enter(&tx_ring->recycle_lock);
1125 
1126 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1127 
1128 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1129 		tx_ring->recycle_fail = 0;
1130 		tx_ring->stall_watchdog = 0;
1131 		if (tx_ring->reschedule) {
1132 			tx_ring->reschedule = B_FALSE;
1133 			mac_tx_ring_update(ixgbe->mac_hdl,
1134 			    tx_ring->ring_handle);
1135 		}
1136 		mutex_exit(&tx_ring->recycle_lock);
1137 		return (0);
1138 	}
1139 
1140 	/*
1141 	 * Sync the DMA buffer of the tx descriptor ring
1142 	 */
1143 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1144 
1145 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1146 		mutex_exit(&tx_ring->recycle_lock);
1147 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1148 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1149 		return (0);
1150 	}
1151 
1152 	LINK_LIST_INIT(&pending_list);
1153 	desc_num = 0;
1154 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1155 
1156 	tcb = tx_ring->work_list[index];
1157 	ASSERT(tcb != NULL);
1158 
1159 	while (tcb != NULL) {
1160 		/*
1161 		 * Get the last tx descriptor of this packet.
1162 		 * If the last tx descriptor is done, then
1163 		 * we can recycle all descriptors of a packet
1164 		 * which usually includes several tx control blocks.
1165 		 * For 82599, LSO descriptors can not be recycled
1166 		 * unless the whole packet's transmission is done.
1167 		 * That's why packet level recycling is used here.
1168 		 * For 82598, there's not such limit.
1169 		 */
1170 		last_index = tcb->last_index;
1171 		/*
1172 		 * MAX_TX_RING_SIZE is used to judge whether
1173 		 * the index is a valid value or not.
1174 		 */
1175 		if (last_index == MAX_TX_RING_SIZE)
1176 			break;
1177 
1178 		/*
1179 		 * Check if the Descriptor Done bit is set
1180 		 */
1181 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1182 		    IXGBE_TXD_STAT_DD;
1183 		if (desc_done) {
1184 			/*
1185 			 * recycle all descriptors of the packet
1186 			 */
1187 			while (tcb != NULL) {
1188 				/*
1189 				 * Strip off the tx control block from
1190 				 * the work list, and add it to the
1191 				 * pending list.
1192 				 */
1193 				tx_ring->work_list[index] = NULL;
1194 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1195 
1196 				/*
1197 				 * Count the total number of the tx
1198 				 * descriptors recycled
1199 				 */
1200 				desc_num += tcb->desc_num;
1201 
1202 				index = NEXT_INDEX(index, tcb->desc_num,
1203 				    tx_ring->ring_size);
1204 
1205 				tcb = tx_ring->work_list[index];
1206 
1207 				prev_index = PREV_INDEX(index, 1,
1208 				    tx_ring->ring_size);
1209 				if (prev_index == last_index)
1210 					break;
1211 			}
1212 		} else {
1213 			break;
1214 		}
1215 	}
1216 
1217 	/*
1218 	 * If no tx descriptors are recycled, no need to do more processing
1219 	 */
1220 	if (desc_num == 0) {
1221 		tx_ring->recycle_fail++;
1222 		mutex_exit(&tx_ring->recycle_lock);
1223 		return (0);
1224 	}
1225 
1226 	tx_ring->recycle_fail = 0;
1227 	tx_ring->stall_watchdog = 0;
1228 
1229 	/*
1230 	 * Update the head index of the tx descriptor ring
1231 	 */
1232 	tx_ring->tbd_head = index;
1233 
1234 	/*
1235 	 * Update the number of the free tx descriptors with atomic operations
1236 	 */
1237 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1238 
1239 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1240 	    (tx_ring->reschedule)) {
1241 		tx_ring->reschedule = B_FALSE;
1242 		mac_tx_ring_update(ixgbe->mac_hdl,
1243 		    tx_ring->ring_handle);
1244 	}
1245 	mutex_exit(&tx_ring->recycle_lock);
1246 
1247 	/*
1248 	 * Add the tx control blocks in the pending list to the free list.
1249 	 */
1250 	ixgbe_put_free_list(tx_ring, &pending_list);
1251 
1252 	return (desc_num);
1253 }
1254 
1255 /*
1256  * ixgbe_tx_recycle_head_wb
1257  *
1258  * Check the head write-back, and recycle all the transmitted
1259  * tx descriptors and tx control blocks.
1260  */
1261 uint32_t
1262 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1263 {
1264 	uint32_t index;
1265 	uint32_t head_wb;
1266 	int desc_num;
1267 	tx_control_block_t *tcb;
1268 	link_list_t pending_list;
1269 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1270 
1271 	mutex_enter(&tx_ring->recycle_lock);
1272 
1273 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1274 
1275 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1276 		tx_ring->recycle_fail = 0;
1277 		tx_ring->stall_watchdog = 0;
1278 		if (tx_ring->reschedule) {
1279 			tx_ring->reschedule = B_FALSE;
1280 			mac_tx_ring_update(ixgbe->mac_hdl,
1281 			    tx_ring->ring_handle);
1282 		}
1283 		mutex_exit(&tx_ring->recycle_lock);
1284 		return (0);
1285 	}
1286 
1287 	/*
1288 	 * Sync the DMA buffer of the tx descriptor ring
1289 	 *
1290 	 * Note: For head write-back mode, the tx descriptors will not
1291 	 * be written back, but the head write-back value is stored at
1292 	 * the last extra tbd at the end of the DMA area, we still need
1293 	 * to sync the head write-back value for kernel.
1294 	 *
1295 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1296 	 */
1297 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1298 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1299 	    sizeof (uint32_t),
1300 	    DDI_DMA_SYNC_FORKERNEL);
1301 
1302 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1303 		mutex_exit(&tx_ring->recycle_lock);
1304 		ddi_fm_service_impact(ixgbe->dip,
1305 		    DDI_SERVICE_DEGRADED);
1306 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1307 		return (0);
1308 	}
1309 
1310 	LINK_LIST_INIT(&pending_list);
1311 	desc_num = 0;
1312 	index = tx_ring->tbd_head;	/* Next index to clean */
1313 
1314 	/*
1315 	 * Get the value of head write-back
1316 	 */
1317 	head_wb = *tx_ring->tbd_head_wb;
1318 	while (index != head_wb) {
1319 		tcb = tx_ring->work_list[index];
1320 		ASSERT(tcb != NULL);
1321 
1322 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1323 		    tcb->desc_num) {
1324 			/*
1325 			 * The current tx control block is not
1326 			 * completely transmitted, stop recycling
1327 			 */
1328 			break;
1329 		}
1330 
1331 		/*
1332 		 * Strip off the tx control block from the work list,
1333 		 * and add it to the pending list.
1334 		 */
1335 		tx_ring->work_list[index] = NULL;
1336 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1337 
1338 		/*
1339 		 * Advance the index of the tx descriptor ring
1340 		 */
1341 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1342 
1343 		/*
1344 		 * Count the total number of the tx descriptors recycled
1345 		 */
1346 		desc_num += tcb->desc_num;
1347 	}
1348 
1349 	/*
1350 	 * If no tx descriptors are recycled, no need to do more processing
1351 	 */
1352 	if (desc_num == 0) {
1353 		tx_ring->recycle_fail++;
1354 		mutex_exit(&tx_ring->recycle_lock);
1355 		return (0);
1356 	}
1357 
1358 	tx_ring->recycle_fail = 0;
1359 	tx_ring->stall_watchdog = 0;
1360 
1361 	/*
1362 	 * Update the head index of the tx descriptor ring
1363 	 */
1364 	tx_ring->tbd_head = index;
1365 
1366 	/*
1367 	 * Update the number of the free tx descriptors with atomic operations
1368 	 */
1369 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1370 
1371 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1372 	    (tx_ring->reschedule)) {
1373 		tx_ring->reschedule = B_FALSE;
1374 		mac_tx_ring_update(ixgbe->mac_hdl,
1375 		    tx_ring->ring_handle);
1376 	}
1377 	mutex_exit(&tx_ring->recycle_lock);
1378 
1379 	/*
1380 	 * Add the tx control blocks in the pending list to the free list.
1381 	 */
1382 	ixgbe_put_free_list(tx_ring, &pending_list);
1383 
1384 	return (desc_num);
1385 }
1386 
1387 /*
1388  * ixgbe_free_tcb - free up the tx control block
1389  *
1390  * Free the resources of the tx control block, including
1391  * unbind the previously bound DMA handle, and reset other
1392  * control fields.
1393  */
1394 void
1395 ixgbe_free_tcb(tx_control_block_t *tcb)
1396 {
1397 	if (tcb == NULL)
1398 		return;
1399 
1400 	switch (tcb->tx_type) {
1401 	case USE_COPY:
1402 		/*
1403 		 * Reset the buffer length that is used for copy
1404 		 */
1405 		tcb->tx_buf.len = 0;
1406 		break;
1407 	case USE_DMA:
1408 		/*
1409 		 * Release the DMA resource that is used for
1410 		 * DMA binding.
1411 		 */
1412 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1413 		break;
1414 	default:
1415 		break;
1416 	}
1417 
1418 	/*
1419 	 * Free the mblk
1420 	 */
1421 	if (tcb->mp != NULL) {
1422 		freemsg(tcb->mp);
1423 		tcb->mp = NULL;
1424 	}
1425 
1426 	tcb->tx_type = USE_NONE;
1427 	tcb->last_index = MAX_TX_RING_SIZE;
1428 	tcb->frag_num = 0;
1429 	tcb->desc_num = 0;
1430 }
1431 
1432 /*
1433  * ixgbe_get_free_list - Get a free tx control block from the free list.
1434  * Returns the tx control block and appends it to list.
1435  *
1436  * The atomic operation on the number of the available tx control block
1437  * in the free list is used to keep this routine mutual exclusive with
1438  * the routine ixgbe_put_check_list.
1439  */
1440 static tx_control_block_t *
1441 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
1442 {
1443 	tx_control_block_t *tcb;
1444 
1445 	/*
1446 	 * Check and update the number of the free tx control block
1447 	 * in the free list.
1448 	 */
1449 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
1450 		tx_ring->stat_fail_no_tcb++;
1451 		return (NULL);
1452 	}
1453 
1454 	mutex_enter(&tx_ring->tcb_head_lock);
1455 
1456 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1457 	ASSERT(tcb != NULL);
1458 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1459 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1460 	    tx_ring->free_list_size);
1461 
1462 	mutex_exit(&tx_ring->tcb_head_lock);
1463 
1464 	LIST_PUSH_TAIL(list, &tcb->link);
1465 	return (tcb);
1466 }
1467 
1468 /*
1469  * ixgbe_put_free_list
1470  *
1471  * Put a list of used tx control blocks back to the free list
1472  *
1473  * A mutex is used here to ensure the serialization. The mutual exclusion
1474  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1475  * the atomic operation on the counter tcb_free.
1476  */
1477 void
1478 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1479 {
1480 	uint32_t index;
1481 	int tcb_num;
1482 	tx_control_block_t *tcb;
1483 
1484 	for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
1485 	    tcb != NULL;
1486 	    tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
1487 		/*
1488 		 * Despite the name, ixgbe_free_tcb() just releases the
1489 		 * resources in tcb, but does not free tcb itself.
1490 		 */
1491 		ixgbe_free_tcb(tcb);
1492 	}
1493 
1494 	mutex_enter(&tx_ring->tcb_tail_lock);
1495 
1496 	index = tx_ring->tcb_tail;
1497 
1498 	tcb_num = 0;
1499 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1500 	while (tcb != NULL) {
1501 		ASSERT(tx_ring->free_list[index] == NULL);
1502 		tx_ring->free_list[index] = tcb;
1503 
1504 		tcb_num++;
1505 
1506 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1507 
1508 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1509 	}
1510 
1511 	tx_ring->tcb_tail = index;
1512 
1513 	/*
1514 	 * Update the number of the free tx control block
1515 	 * in the free list. This operation must be placed
1516 	 * under the protection of the lock.
1517 	 */
1518 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1519 
1520 	mutex_exit(&tx_ring->tcb_tail_lock);
1521 }
1522