xref: /illumos-gate/usr/src/uts/common/io/ixgbe/ixgbe_tx.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
29  * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
30  * Copyright 2021 Joyent, Inc.
31  */
32 
33 #include "ixgbe_sw.h"
34 
35 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
36     link_list_t *, const void *, size_t);
37 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
38     link_list_t *, uint8_t *, size_t);
39 static uint_t ixgbe_tcb_done(tx_control_block_t *);
40 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
41     ixgbe_tx_context_t *, size_t);
42 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
43 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
44     link_list_t *);
45 
46 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
47 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
48     ixgbe_tx_context_t *);
49 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
50     ixgbe_tx_context_t *);
51 
52 /*
53  * ixgbe_ring_tx
54  *
55  * To transmit one mblk through one specified ring.
56  *
57  * One mblk can consist of several fragments, each fragment
58  * will be processed with different methods based on the size.
59  * For the fragments with size less than the bcopy threshold,
60  * they will be processed by using bcopy; otherwise, they will
61  * be processed by using DMA binding.
62  *
63  * To process the mblk, for each fragment, we pass a pointer to the location
64  * of the current transmit control block (tcb) (initialized to NULL) to either
65  * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
66  * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
67  * if possible, or close out the current tcb, allocate a new tcb, and update
68  * the passed location (tx_control_block_t **) to reflect the new current tcb.
69  *
70  * Since bound mblk fragments require their own tcb, the close, allocate new,
71  * and update steps occur on every call to ixgbe_tx_bind(), but since
72  * consecutive small mblk fragments can be combined into a single tcb, the
73  * close, allocate new, and update steps may not occur on every call to
74  * ixgbe_tx_copy(). If the current tcb is already being used to copy data and
75  * we call ixgbe_tx_copy(), if there is enough room in the current tcb for
76  * the current mblk fragment, we append the data from the mblk fragment. If
77  * we call ixgbe_tx_copy() and the current tcb isn't being used to copy (i.e.
78  * the previous iteration of the loop called ixgbe_tx_bind()), or doesn't
79  * have enough space for the mblk fragment, we close out the current tcb,
80  * grab a new tcb from the free list, and update the current tcb to the
81  * newly obtained tcb.
82  *
83  * When LSO (large segment offload) is enabled, we first copy the packet
84  * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
85  * processing the remainder of the packet. The remaining bytes of the packet
86  * are then copied or mapped based on the fragment size as described above.
87  *
88  * Through the entire processing of a packet, we keep track of the number of
89  * DMA descriptors being used (either bound or pre-bound buffers used for
90  * copying) by this packet. Each tcb requires at least one DMA descriptor, but
91  * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
92  * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
93  * number of DMA descriptors that are closed (ready for the HW). Since the
94  * hardware limits the number of descriptors that can be used to transmit a
95  * single packet, if the total number DMA descriptors required to transmit
96  * this packet exceeds this limit, we perform a msgpullup() and try again.
97  * Since our DMA attributes limit the number of DMA cookies allowed to
98  * map a single span of memory to a value (MAX_COOKIE) less than the
99  * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
100  * as long as sufficient tcbs are available, we should always be able to
101  * process a packet that's contained in a single mblk_t (no additional
102  * fragments).
103  *
104  * Once all of the tcbs have been setup, ixgbe_tx_fill_ring() is called to
105  * setup the tx ring to transmit the tcbs and then tell the HW to start
106  * transmitting. When transmission is complete, an interrupt is triggered
107  * which calls the appropriate recycle routine to place the tcbs that were
108  * used in transmission back in the free list. We also may also try to
109  * recycle any available tcbs when the size of the tcb free list gets low
110  * or if the watchdog timer triggers.
111  *
112  */
113 mblk_t *
ixgbe_ring_tx(void * arg,mblk_t * orig_mp)114 ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
115 {
116 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
117 	ixgbe_t *ixgbe = tx_ring->ixgbe;
118 	mblk_t *mp = orig_mp;
119 	mblk_t *pull_mp = NULL;
120 	tx_control_block_t *tcb;
121 	size_t mbsize, offset, len;
122 	uint32_t desc_total;
123 	uint32_t copy_thresh;
124 	int desc_num;
125 	ixgbe_tx_context_t tx_context, *ctx = NULL;
126 	link_list_t pending_list;
127 	boolean_t limit_retry = B_FALSE;
128 
129 	ASSERT(mp->b_next == NULL);
130 
131 	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
132 	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
133 	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
134 	    !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
135 	    ixgbe->link_state != LINK_STATE_UP) {
136 		freemsg(mp);
137 		return (NULL);
138 	}
139 
140 	copy_thresh = ixgbe->tx_copy_thresh;
141 
142 	mbsize = msgsize(mp);
143 
144 	if (ixgbe->tx_hcksum_enable) {
145 		/*
146 		 * Retrieve checksum context information from the mblk
147 		 * that will be used to decide whether/how to fill the
148 		 * context descriptor.
149 		 */
150 		ctx = &tx_context;
151 		if (ixgbe_get_context(mp, ctx) < 0) {
152 			freemsg(mp);
153 			return (NULL);
154 		}
155 
156 		/*
157 		 * If the mblk size exceeds the max size ixgbe could
158 		 * process, then discard this mblk, and return NULL.
159 		 */
160 		if ((ctx->lso_flag &&
161 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
162 		    (!ctx->lso_flag &&
163 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
164 			freemsg(mp);
165 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
166 			return (NULL);
167 		}
168 	}
169 
170 	/*
171 	 * If we use too many descriptors (see comments below), we may do
172 	 * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
173 	 * any time we error return past here, we should check and free
174 	 * pull_mp if != NULL.
175 	 */
176 retry:
177 	/*
178 	 * Check and recycle tx descriptors.
179 	 * The recycle threshold here should be selected carefully
180 	 */
181 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
182 		tx_ring->tx_recycle(tx_ring);
183 	}
184 
185 	/*
186 	 * After the recycling, if the tbd_free is less than the
187 	 * overload_threshold, assert overload, return mp;
188 	 * and we need to re-schedule the tx again.
189 	 */
190 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
191 		tx_ring->reschedule = B_TRUE;
192 		tx_ring->stat_overload++;
193 		if (pull_mp != NULL)
194 			freemsg(pull_mp);
195 		return (orig_mp);
196 	}
197 
198 	/*
199 	 * The pending_list is a linked list that is used to save
200 	 * the tx control blocks that have packet data processed
201 	 * but have not put the data to the tx descriptor ring.
202 	 * It is used to reduce the lock contention of the tx_lock.
203 	 */
204 	LINK_LIST_INIT(&pending_list);
205 
206 	tcb = NULL;
207 	desc_num = 0;
208 	desc_total = 0;
209 	offset = 0;
210 
211 	/*
212 	 * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
213 	 * into a single descriptor separate from the remaining data.
214 	 */
215 	if ((ctx != NULL) && ctx->lso_flag) {
216 		size_t hdr_len;
217 
218 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
219 
220 		/*
221 		 * copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
222 		 * and TCP/UDP headers) into tcb.
223 		 */
224 		for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
225 			size_t mlen = MBLKL(mp);
226 			size_t amt = MIN(mlen, len);
227 			int ret;
228 
229 			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
230 			    mp->b_rptr, amt);
231 			/*
232 			 * Since we're trying to copy all of the headers into
233 			 * a single buffer in a single tcb, if ixgbe_tx_copy()
234 			 * returns anything but 0, it means either no tcbs
235 			 * are available (< 0), or while copying, we spilled
236 			 * over and couldn't fit all the headers into a
237 			 * single tcb.
238 			 */
239 			if (ret != 0) {
240 				if (ret > 0)
241 					tx_ring->stat_lso_header_fail++;
242 				goto tx_failure;
243 			}
244 
245 			len -= amt;
246 
247 			/*
248 			 * If we copy less than the full amount of this
249 			 * mblk_t, we have some amount to copy below.
250 			 */
251 			if (amt < mlen) {
252 				offset = amt;
253 				break;
254 			}
255 		}
256 
257 		ASSERT0(len);
258 
259 		/*
260 		 * Finish off the header tcb, and start anew for the
261 		 * rest of the packet.
262 		 */
263 		desc_total += ixgbe_tcb_done(tcb);
264 		tcb = NULL;
265 	}
266 
267 	/*
268 	 * Process each remaining segment in the packet -- either binding
269 	 * the dblk_t or copying the contents of the dblk_t to an already
270 	 * bound buffer. When we copy, we will accumulate consecutive small
271 	 * (less than copy_thresh bytes) segments into a single tcb buffer
272 	 * until no more can fit (or we encounter a segment larger than
273 	 * copy_thresh and bind the dblk_t).
274 	 *
275 	 * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
276 	 * transmit control blocks (tcb)s as needed (and append them onto
277 	 * 'pending_list'). Both functions also replace 'tcb' with the new
278 	 * tcb when they allocate a new tcb.
279 	 *
280 	 * We stop trying to process the packet once the number of descriptors
281 	 * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
282 	 * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
283 	 * context descriptor (since we're already at the limit), so there's
284 	 * no point in continuing. We'll pull up the mblk_t (see below)
285 	 * and try again.
286 	 */
287 	while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
288 		uint8_t *rptr = mp->b_rptr + offset;
289 		int ret;
290 
291 		len = MBLKL(mp) - offset;
292 		offset = 0;
293 
294 		if (len > copy_thresh) {
295 			ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
296 			    len);
297 		} else {
298 			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
299 			    len);
300 		}
301 
302 		if (ret < 0)
303 			goto tx_failure;
304 
305 		desc_total += ret;
306 		mp = mp->b_cont;
307 	}
308 
309 	/* Finish off the last tcb */
310 	desc_total += ixgbe_tcb_done(tcb);
311 
312 	/*
313 	 * 82598/82599 chipset has a limitation that no more than 32 tx
314 	 * descriptors can be transmited out at one time. As noted above,
315 	 * we need to include space for a context descriptor in case its
316 	 * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
317 	 * as well as when it exceeds the limit.
318 	 *
319 	 * If we exceed this limit, we take the hit, do a msgpullup(), and
320 	 * then try again. Our DMA attributes guarantee we should never use
321 	 * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
322 	 * should only need to retry once.
323 	 */
324 	if (desc_total >= IXGBE_TX_DESC_LIMIT) {
325 		/* We shouldn't hit this path twice */
326 		VERIFY0(limit_retry);
327 
328 		tx_ring->stat_break_tbd_limit++;
329 
330 		/* Release all the tcbs we used previously */
331 		ixgbe_put_free_list(tx_ring, &pending_list);
332 		desc_total = 0;
333 		offset = 0;
334 
335 		pull_mp = msgpullup(orig_mp, -1);
336 		if (pull_mp == NULL) {
337 			tx_ring->reschedule = B_TRUE;
338 			return (orig_mp);
339 		}
340 
341 		mp = pull_mp;
342 		limit_retry = B_TRUE;
343 		goto retry;
344 	}
345 
346 	/*
347 	 * Before filling the tx descriptor ring with the data, we need to
348 	 * ensure there are adequate free descriptors for transmit
349 	 * (including one context descriptor).
350 	 * Do not use up all the tx descriptors.
351 	 * Otherwise tx recycle will fail and cause false hang.
352 	 */
353 	if (tx_ring->tbd_free <= (desc_total + 1)) {
354 		tx_ring->tx_recycle(tx_ring);
355 	}
356 
357 	mutex_enter(&tx_ring->tx_lock);
358 	/*
359 	 * If the number of free tx descriptors is not enough for transmit
360 	 * then return mp.
361 	 *
362 	 * Note: we must put this check under the mutex protection to
363 	 * ensure the correctness when multiple threads access it in
364 	 * parallel.
365 	 */
366 	if (tx_ring->tbd_free <= (desc_total + 1)) {
367 		tx_ring->stat_fail_no_tbd++;
368 		mutex_exit(&tx_ring->tx_lock);
369 		goto tx_failure;
370 	}
371 
372 	/*
373 	 * Attach the mblk_t we've setup to the last control block.
374 	 * This is only done once we know there are enough free descriptors
375 	 * to transmit so that the cleanup in tx_failure doesn't try to
376 	 * call freemsg() on mp (since we will want to return it).
377 	 */
378 	tcb->mp = (pull_mp != NULL) ? pull_mp : orig_mp;
379 
380 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
381 	    mbsize);
382 
383 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
384 
385 	tx_ring->stat_obytes += mbsize;
386 	tx_ring->stat_opackets++;
387 
388 	mutex_exit(&tx_ring->tx_lock);
389 
390 	/*
391 	 * Now that tx is done, if we pulled up the original message, we
392 	 * can free the original message since it is no longer being
393 	 * used.
394 	 */
395 	if (pull_mp != NULL) {
396 		freemsg(orig_mp);
397 	}
398 
399 	return (NULL);
400 
401 tx_failure:
402 	/*
403 	 * If transmission fails, need to free the pulling up mblk.
404 	 */
405 	if (pull_mp) {
406 		freemsg(pull_mp);
407 	}
408 
409 	/*
410 	 * Return the tx control blocks in the pending list to the free list.
411 	 */
412 	ixgbe_put_free_list(tx_ring, &pending_list);
413 
414 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
415 	tx_ring->reschedule = B_TRUE;
416 
417 	return (orig_mp);
418 }
419 
420 /*
421  * ixgbe_tx_copy
422  *
423  * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
424  * otherwise return the number of descriptors we've completed in this call.
425  */
426 static int
ixgbe_tx_copy(ixgbe_tx_ring_t * tx_ring,tx_control_block_t ** tcbp,link_list_t * pending_list,const void * buf,size_t len)427 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
428     link_list_t *pending_list, const void *buf, size_t len)
429 {
430 	tx_control_block_t *tcb = *tcbp;
431 	dma_buffer_t *tx_buf;
432 	uint32_t desc_num = 0;
433 
434 	/*
435 	 * We need a new tcb -- either the current one (tcb) is NULL because
436 	 * we just started, tcb is being used for DMA, or tcb isn't large enough
437 	 * to hold the contents we need to copy.
438 	 */
439 	if (tcb == NULL || tcb->tx_type == USE_DMA ||
440 	    tcb->tx_buf.len + len > tcb->tx_buf.size) {
441 		tx_control_block_t *newtcb;
442 
443 		newtcb = ixgbe_get_free_list(tx_ring, pending_list);
444 		if (newtcb == NULL)
445 			return (-1);
446 
447 		newtcb->tx_type = USE_COPY;
448 
449 		if (tcb != NULL)
450 			desc_num += ixgbe_tcb_done(tcb);
451 		*tcbp = tcb = newtcb;
452 	}
453 
454 	ASSERT3S(tcb->tx_type, ==, USE_COPY);
455 	tx_buf = &tcb->tx_buf;
456 
457 	/*
458 	 * Copy the packet data of the mblk fragment into the
459 	 * pre-allocated tx buffer, which is maintained by the
460 	 * tx control block.
461 	 *
462 	 * Several mblk fragments can be copied into one tx buffer.
463 	 * The destination address of the current copied fragment in
464 	 * the tx buffer is next to the end of the previous copied
465 	 * fragment.
466 	 */
467 	if (len > 0) {
468 		bcopy(buf, tx_buf->address + tx_buf->len, len);
469 
470 		tx_buf->len += len;
471 		tcb->frag_num++;
472 	}
473 
474 	return (desc_num);
475 }
476 
477 /*
478  * ixgbe_tx_bind
479  *
480  * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
481  * returns the number of descriptors completed in this call. This count
482  * can include descriptors that weren't filled in by the current call to
483  * ixgbe_tx_bind() but were being used (but not yet completed) in previous
484  * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
485  */
486 static int
ixgbe_tx_bind(ixgbe_tx_ring_t * tx_ring,tx_control_block_t ** tcbp,link_list_t * pending_list,uint8_t * buf,size_t len)487 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
488     link_list_t *pending_list, uint8_t *buf, size_t len)
489 {
490 	tx_control_block_t *tcb = NULL;
491 	uint_t desc_num = 0;
492 	int status;
493 
494 	tcb = ixgbe_get_free_list(tx_ring, pending_list);
495 	if (tcb == NULL)
496 		return (-1);
497 
498 	/*
499 	 * Use DMA binding to process the mblk fragment
500 	 */
501 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
502 	    (caddr_t)buf, len,
503 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
504 	    0, NULL, NULL);
505 
506 	if (status != DDI_DMA_MAPPED) {
507 		tx_ring->stat_fail_dma_bind++;
508 		return (-1);
509 	}
510 
511 	tcb->frag_num++;
512 	tcb->tx_type = USE_DMA;
513 
514 	/*
515 	 * If there was an old tcb, we're about to replace it. Finish
516 	 * setting up the old tcb so we can replace it with the new one.
517 	 */
518 	if (*tcbp != NULL)
519 		desc_num += ixgbe_tcb_done(*tcbp);
520 
521 	*tcbp = tcb;
522 	return (desc_num);
523 }
524 
525 /*
526  * Once we're done populating a tcb (either by binding or copying into
527  * a buffer in the tcb), get it ready for tx and return the number of
528  * descriptors used.
529  */
530 static uint_t
ixgbe_tcb_done(tx_control_block_t * tcb)531 ixgbe_tcb_done(tx_control_block_t *tcb)
532 {
533 	uint_t desc_num = 0;
534 
535 	if (tcb->tx_type == USE_DMA) {
536 		const ddi_dma_cookie_t *c;
537 
538 		for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
539 		    c != NULL;
540 		    c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
541 			/*
542 			 * Save the address and length to the private data
543 			 * structure of the tx control block, which will be
544 			 * used to fill the tx descriptor ring after all the
545 			 * fragments are processed.
546 			 */
547 			ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
548 			desc_num++;
549 		}
550 	} else if (tcb->tx_type == USE_COPY) {
551 		dma_buffer_t *tx_buf = &tcb->tx_buf;
552 
553 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
554 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
555 		desc_num++;
556 	} else {
557 		panic("invalid tcb type");
558 	}
559 
560 	return (desc_num);
561 }
562 
563 /*
564  * ixgbe_get_context
565  *
566  * Get the context information from the mblk
567  */
568 static int
ixgbe_get_context(mblk_t * mp,ixgbe_tx_context_t * ctx)569 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
570 {
571 	uint32_t start;
572 	uint32_t hckflags;
573 	uint32_t lsoflags;
574 	uint32_t lsocksum;
575 	uint32_t mss;
576 	uint32_t len;
577 	uint32_t size;
578 	uint32_t offset;
579 	unsigned char *pos;
580 	ushort_t etype;
581 	uint32_t mac_hdr_len;
582 	uint32_t l4_proto;
583 	uint32_t l4_hdr_len;
584 
585 	ASSERT(mp != NULL);
586 
587 	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
588 	bzero(ctx, sizeof (ixgbe_tx_context_t));
589 
590 	if (hckflags == 0) {
591 		return (0);
592 	}
593 
594 	ctx->hcksum_flags = hckflags;
595 
596 	mac_lso_get(mp, &mss, &lsoflags);
597 	ctx->mss = mss;
598 	ctx->lso_flag = (lsoflags == HW_LSO);
599 
600 	etype = 0;
601 	mac_hdr_len = 0;
602 	l4_proto = 0;
603 
604 	/*
605 	 * Firstly get the position of the ether_type/ether_tpid.
606 	 * Here we don't assume the ether (VLAN) header is fully included
607 	 * in one mblk fragment, so we go thourgh the fragments to parse
608 	 * the ether type.
609 	 */
610 	size = len = MBLKL(mp);
611 	offset = offsetof(struct ether_header, ether_type);
612 	while (size <= offset) {
613 		mp = mp->b_cont;
614 		ASSERT(mp != NULL);
615 		len = MBLKL(mp);
616 		size += len;
617 	}
618 	pos = mp->b_rptr + offset + len - size;
619 
620 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
621 	if (etype == ETHERTYPE_VLAN) {
622 		/*
623 		 * Get the position of the ether_type in VLAN header
624 		 */
625 		offset = offsetof(struct ether_vlan_header, ether_type);
626 		while (size <= offset) {
627 			mp = mp->b_cont;
628 			ASSERT(mp != NULL);
629 			len = MBLKL(mp);
630 			size += len;
631 		}
632 		pos = mp->b_rptr + offset + len - size;
633 
634 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
635 		mac_hdr_len = sizeof (struct ether_vlan_header);
636 	} else {
637 		mac_hdr_len = sizeof (struct ether_header);
638 	}
639 
640 	/*
641 	 * Here we don't assume the IP(V6) header is fully included in
642 	 * one mblk fragment.
643 	 */
644 	lsocksum = HCK_PARTIALCKSUM;
645 	ctx->l3_proto = etype;
646 	switch (etype) {
647 	case ETHERTYPE_IP:
648 		if (ctx->lso_flag) {
649 			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
650 			while (size <= offset) {
651 				mp = mp->b_cont;
652 				ASSERT(mp != NULL);
653 				len = MBLKL(mp);
654 				size += len;
655 			}
656 			pos = mp->b_rptr + offset + len - size;
657 			*((uint16_t *)(uintptr_t)(pos)) = 0;
658 
659 			offset = offsetof(ipha_t, ipha_hdr_checksum) +
660 			    mac_hdr_len;
661 			while (size <= offset) {
662 				mp = mp->b_cont;
663 				ASSERT(mp != NULL);
664 				len = MBLKL(mp);
665 				size += len;
666 			}
667 			pos = mp->b_rptr + offset + len - size;
668 			*((uint16_t *)(uintptr_t)(pos)) = 0;
669 
670 			/*
671 			 * To perform ixgbe LSO, here also need to fill
672 			 * the tcp checksum field of the packet with the
673 			 * following pseudo-header checksum:
674 			 * (ip_source_addr, ip_destination_addr, l4_proto)
675 			 * Currently the tcp/ip stack has done it.
676 			 */
677 			lsocksum |= HCK_IPV4_HDRCKSUM;
678 		}
679 
680 		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
681 		while (size <= offset) {
682 			mp = mp->b_cont;
683 			ASSERT(mp != NULL);
684 			len = MBLKL(mp);
685 			size += len;
686 		}
687 		pos = mp->b_rptr + offset + len - size;
688 
689 		l4_proto = *(uint8_t *)pos;
690 		break;
691 	case ETHERTYPE_IPV6:
692 		/*
693 		 * We need to zero out the length in the header.
694 		 */
695 		if (ctx->lso_flag) {
696 			offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
697 			while (size <= offset) {
698 				mp = mp->b_cont;
699 				ASSERT(mp != NULL);
700 				len = MBLKL(mp);
701 				size += len;
702 			}
703 			pos = mp->b_rptr + offset + len - size;
704 			*((uint16_t *)(uintptr_t)(pos)) = 0;
705 		}
706 
707 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
708 		while (size <= offset) {
709 			mp = mp->b_cont;
710 			ASSERT(mp != NULL);
711 			len = MBLKL(mp);
712 			size += len;
713 		}
714 		pos = mp->b_rptr + offset + len - size;
715 
716 		l4_proto = *(uint8_t *)pos;
717 		break;
718 	default:
719 		/* Unrecoverable error */
720 		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
721 		return (-2);
722 	}
723 
724 	if (ctx->lso_flag) {
725 		/*
726 		 * LSO relies on tx h/w checksum, so here will drop the packet
727 		 * if h/w checksum flag is not declared.
728 		 */
729 		if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
730 			IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
731 			    "are not set for LSO, found 0x%x, needed bits 0x%x",
732 			    ctx->hcksum_flags, lsocksum);
733 			return (-1);
734 		}
735 
736 
737 		offset = mac_hdr_len + start;
738 		while (size <= offset) {
739 			mp = mp->b_cont;
740 			ASSERT(mp != NULL);
741 			len = MBLKL(mp);
742 			size += len;
743 		}
744 		pos = mp->b_rptr + offset + len - size;
745 
746 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
747 	} else {
748 		/*
749 		 * l4 header length is only required for LSO
750 		 */
751 		l4_hdr_len = 0;
752 	}
753 
754 	ctx->mac_hdr_len = mac_hdr_len;
755 	ctx->ip_hdr_len = start;
756 	ctx->l4_proto = l4_proto;
757 	ctx->l4_hdr_len = l4_hdr_len;
758 
759 	return (0);
760 }
761 
762 /*
763  * ixgbe_check_context
764  *
765  * Check if a new context descriptor is needed
766  */
767 static boolean_t
ixgbe_check_context(ixgbe_tx_ring_t * tx_ring,ixgbe_tx_context_t * ctx)768 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
769 {
770 	ixgbe_tx_context_t *last;
771 
772 	if (ctx == NULL)
773 		return (B_FALSE);
774 
775 	/*
776 	 * Compare the context data retrieved from the mblk and the
777 	 * stored data of the last context descriptor. The data need
778 	 * to be checked are:
779 	 *	hcksum_flags
780 	 *	l4_proto
781 	 *	mac_hdr_len
782 	 *	ip_hdr_len
783 	 *	lso_flag
784 	 *	mss (only checked for LSO)
785 	 *	l4_hr_len (only checked for LSO)
786 	 * Either one of the above data is changed, a new context descriptor
787 	 * will be needed.
788 	 */
789 	last = &tx_ring->tx_context;
790 
791 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
792 	    (ctx->l4_proto != last->l4_proto) ||
793 	    (ctx->l3_proto != last->l3_proto) ||
794 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
795 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
796 	    (ctx->lso_flag != last->lso_flag) ||
797 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
798 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
799 		return (B_TRUE);
800 	}
801 
802 	return (B_FALSE);
803 }
804 
805 /*
806  * ixgbe_fill_context
807  *
808  * Fill the context descriptor with hardware checksum informations
809  */
810 static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc * ctx_tbd,ixgbe_tx_context_t * ctx)811 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
812     ixgbe_tx_context_t *ctx)
813 {
814 	/*
815 	 * Fill the context descriptor with the checksum
816 	 * context information we've got.
817 	 */
818 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
819 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
820 	    IXGBE_ADVTXD_MACLEN_SHIFT;
821 
822 	ctx_tbd->type_tucmd_mlhl =
823 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
824 	/*
825 	 * When we have a TX context set up, we enforce that the ethertype is
826 	 * either IPv4 or IPv6 in ixgbe_get_tx_context().
827 	 */
828 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
829 		if (ctx->l3_proto == ETHERTYPE_IP) {
830 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
831 		} else {
832 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
833 		}
834 	}
835 
836 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
837 		switch (ctx->l4_proto) {
838 		case IPPROTO_TCP:
839 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
840 			break;
841 		case IPPROTO_UDP:
842 			/*
843 			 * We don't have to explicitly set:
844 			 *	ctx_tbd->type_tucmd_mlhl |=
845 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
846 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
847 			 */
848 			break;
849 		default:
850 			/* Unrecoverable error */
851 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
852 			break;
853 		}
854 	}
855 
856 	ctx_tbd->seqnum_seed = 0;
857 
858 	if (ctx->lso_flag) {
859 		ctx_tbd->mss_l4len_idx =
860 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
861 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
862 	} else {
863 		ctx_tbd->mss_l4len_idx = 0;
864 	}
865 }
866 
867 /*
868  * ixgbe_tx_fill_ring
869  *
870  * Fill the tx descriptor ring with the data
871  */
872 static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t * tx_ring,link_list_t * pending_list,ixgbe_tx_context_t * ctx,size_t mbsize)873 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
874     ixgbe_tx_context_t *ctx, size_t mbsize)
875 {
876 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
877 	boolean_t load_context;
878 	uint32_t index, tcb_index, desc_num;
879 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
880 	tx_control_block_t *tcb, *first_tcb;
881 	uint32_t hcksum_flags;
882 	int i;
883 
884 	ASSERT(mutex_owned(&tx_ring->tx_lock));
885 
886 	tbd = NULL;
887 	first_tbd = NULL;
888 	first_tcb = NULL;
889 	desc_num = 0;
890 	hcksum_flags = 0;
891 	load_context = B_FALSE;
892 
893 	/*
894 	 * Get the index of the first tx descriptor that will be filled,
895 	 * and the index of the first work list item that will be attached
896 	 * with the first used tx control block in the pending list.
897 	 * Note: the two indexes are the same.
898 	 */
899 	index = tx_ring->tbd_tail;
900 	tcb_index = tx_ring->tbd_tail;
901 
902 	if (ctx != NULL) {
903 		hcksum_flags = ctx->hcksum_flags;
904 
905 		/*
906 		 * Check if a new context descriptor is needed for this packet
907 		 */
908 		load_context = ixgbe_check_context(tx_ring, ctx);
909 
910 		if (load_context) {
911 			tbd = &tx_ring->tbd_ring[index];
912 
913 			/*
914 			 * Fill the context descriptor with the
915 			 * hardware checksum offload informations.
916 			 */
917 			ixgbe_fill_context(
918 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
919 
920 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
921 			desc_num++;
922 
923 			/*
924 			 * Store the checksum context data if
925 			 * a new context descriptor is added
926 			 */
927 			tx_ring->tx_context = *ctx;
928 		}
929 	}
930 
931 	first_tbd = &tx_ring->tbd_ring[index];
932 
933 	/*
934 	 * Fill tx data descriptors with the data saved in the pending list.
935 	 * The tx control blocks in the pending list are added to the work list
936 	 * at the same time.
937 	 *
938 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
939 	 * One item of the work list corresponds to one tx descriptor. Because
940 	 * one tx control block can span multiple tx descriptors, the tx
941 	 * control block will be added to the first work list item that
942 	 * corresponds to the first tx descriptor generated from that tx
943 	 * control block.
944 	 */
945 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
946 	first_tcb = tcb;
947 	while (tcb != NULL) {
948 
949 		for (i = 0; i < tcb->desc_num; i++) {
950 			tbd = &tx_ring->tbd_ring[index];
951 
952 			tbd->read.buffer_addr = tcb->desc[i].address;
953 			tbd->read.cmd_type_len = tcb->desc[i].length;
954 
955 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
956 			    | IXGBE_ADVTXD_DTYP_DATA;
957 
958 			tbd->read.olinfo_status = 0;
959 
960 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
961 			desc_num++;
962 		}
963 
964 		/*
965 		 * Add the tx control block to the work list
966 		 */
967 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
968 		tx_ring->work_list[tcb_index] = tcb;
969 
970 		tcb_index = index;
971 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
972 	}
973 
974 	if (load_context) {
975 		/*
976 		 * Count the context descriptor for
977 		 * the first tx control block.
978 		 */
979 		first_tcb->desc_num++;
980 	}
981 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
982 
983 	/*
984 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
985 	 * valid in the first descriptor of the packet.
986 	 * Setting paylen in every first_tbd for all parts.
987 	 * 82599, X540 and X550 require the packet length in paylen field
988 	 * with or without LSO and 82598 will ignore it in non-LSO mode.
989 	 */
990 	ASSERT(first_tbd != NULL);
991 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
992 
993 	switch (hw->mac.type) {
994 	case ixgbe_mac_82598EB:
995 		if (ctx != NULL && ctx->lso_flag) {
996 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
997 			first_tbd->read.olinfo_status |=
998 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
999 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1000 		}
1001 		break;
1002 
1003 	case ixgbe_mac_82599EB:
1004 	case ixgbe_mac_X540:
1005 	case ixgbe_mac_X550:
1006 	case ixgbe_mac_X550EM_x:
1007 	case ixgbe_mac_X550EM_a:
1008 		if (ctx != NULL && ctx->lso_flag) {
1009 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1010 			first_tbd->read.olinfo_status |=
1011 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1012 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1013 		} else {
1014 			first_tbd->read.olinfo_status |=
1015 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1016 		}
1017 		break;
1018 
1019 	default:
1020 		break;
1021 	}
1022 
1023 	/* Set hardware checksum bits */
1024 	if (hcksum_flags != 0) {
1025 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1026 			first_tbd->read.olinfo_status |=
1027 			    IXGBE_ADVTXD_POPTS_IXSM;
1028 		if (hcksum_flags & HCK_PARTIALCKSUM)
1029 			first_tbd->read.olinfo_status |=
1030 			    IXGBE_ADVTXD_POPTS_TXSM;
1031 	}
1032 
1033 	/*
1034 	 * The last descriptor of packet needs End Of Packet (EOP),
1035 	 * and Report Status (RS) bits set
1036 	 */
1037 	ASSERT(tbd != NULL);
1038 	tbd->read.cmd_type_len |=
1039 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1040 
1041 	/*
1042 	 * Sync the DMA buffer of the tx descriptor ring
1043 	 */
1044 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1045 
1046 	/*
1047 	 * Update the number of the free tx descriptors.
1048 	 * The mutual exclusion between the transmission and the recycling
1049 	 * (for the tx descriptor ring and the work list) is implemented
1050 	 * with the atomic operation on the number of the free tx descriptors.
1051 	 *
1052 	 * Note: we should always decrement the counter tbd_free before
1053 	 * advancing the hardware TDT pointer to avoid the race condition -
1054 	 * before the counter tbd_free is decremented, the transmit of the
1055 	 * tx descriptors has done and the counter tbd_free is increased by
1056 	 * the tx recycling.
1057 	 */
1058 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1059 	ASSERT(i >= 0);
1060 
1061 	tx_ring->tbd_tail = index;
1062 
1063 	/*
1064 	 * Advance the hardware TDT pointer of the tx descriptor ring
1065 	 */
1066 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1067 
1068 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1069 	    DDI_FM_OK) {
1070 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1071 		    DDI_SERVICE_DEGRADED);
1072 		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1073 	}
1074 
1075 	return (desc_num);
1076 }
1077 
1078 /*
1079  * ixgbe_save_desc
1080  *
1081  * Save the address/length pair to the private array
1082  * of the tx control block. The address/length pairs
1083  * will be filled into the tx descriptor ring later.
1084  */
1085 static void
ixgbe_save_desc(tx_control_block_t * tcb,uint64_t address,size_t length)1086 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1087 {
1088 	sw_desc_t *desc;
1089 
1090 	desc = &tcb->desc[tcb->desc_num];
1091 	desc->address = address;
1092 	desc->length = length;
1093 
1094 	tcb->desc_num++;
1095 }
1096 
1097 /*
1098  * ixgbe_tx_recycle_legacy
1099  *
1100  * Recycle the tx descriptors and tx control blocks.
1101  *
1102  * The work list is traversed to check if the corresponding
1103  * tx descriptors have been transmitted. If so, the resources
1104  * bound to the tx control blocks will be freed, and those
1105  * tx control blocks will be returned to the free list.
1106  */
1107 uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t * tx_ring)1108 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1109 {
1110 	uint32_t index, last_index, prev_index;
1111 	int desc_num;
1112 	boolean_t desc_done;
1113 	tx_control_block_t *tcb;
1114 	link_list_t pending_list;
1115 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1116 
1117 	mutex_enter(&tx_ring->recycle_lock);
1118 
1119 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1120 
1121 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1122 		tx_ring->recycle_fail = 0;
1123 		tx_ring->stall_watchdog = 0;
1124 		if (tx_ring->reschedule) {
1125 			tx_ring->reschedule = B_FALSE;
1126 			mac_tx_ring_update(ixgbe->mac_hdl,
1127 			    tx_ring->ring_handle);
1128 		}
1129 		mutex_exit(&tx_ring->recycle_lock);
1130 		return (0);
1131 	}
1132 
1133 	/*
1134 	 * Sync the DMA buffer of the tx descriptor ring
1135 	 */
1136 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1137 
1138 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1139 		mutex_exit(&tx_ring->recycle_lock);
1140 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1141 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1142 		return (0);
1143 	}
1144 
1145 	LINK_LIST_INIT(&pending_list);
1146 	desc_num = 0;
1147 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1148 
1149 	tcb = tx_ring->work_list[index];
1150 	ASSERT(tcb != NULL);
1151 
1152 	while (tcb != NULL) {
1153 		/*
1154 		 * Get the last tx descriptor of this packet.
1155 		 * If the last tx descriptor is done, then
1156 		 * we can recycle all descriptors of a packet
1157 		 * which usually includes several tx control blocks.
1158 		 * For 82599, LSO descriptors can not be recycled
1159 		 * unless the whole packet's transmission is done.
1160 		 * That's why packet level recycling is used here.
1161 		 * For 82598, there's not such limit.
1162 		 */
1163 		last_index = tcb->last_index;
1164 		/*
1165 		 * MAX_TX_RING_SIZE is used to judge whether
1166 		 * the index is a valid value or not.
1167 		 */
1168 		if (last_index == MAX_TX_RING_SIZE)
1169 			break;
1170 
1171 		/*
1172 		 * Check if the Descriptor Done bit is set
1173 		 */
1174 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1175 		    IXGBE_TXD_STAT_DD;
1176 		if (desc_done) {
1177 			/*
1178 			 * recycle all descriptors of the packet
1179 			 */
1180 			while (tcb != NULL) {
1181 				/*
1182 				 * Strip off the tx control block from
1183 				 * the work list, and add it to the
1184 				 * pending list.
1185 				 */
1186 				tx_ring->work_list[index] = NULL;
1187 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1188 
1189 				/*
1190 				 * Count the total number of the tx
1191 				 * descriptors recycled
1192 				 */
1193 				desc_num += tcb->desc_num;
1194 
1195 				index = NEXT_INDEX(index, tcb->desc_num,
1196 				    tx_ring->ring_size);
1197 
1198 				tcb = tx_ring->work_list[index];
1199 
1200 				prev_index = PREV_INDEX(index, 1,
1201 				    tx_ring->ring_size);
1202 				if (prev_index == last_index)
1203 					break;
1204 			}
1205 		} else {
1206 			break;
1207 		}
1208 	}
1209 
1210 	/*
1211 	 * If no tx descriptors are recycled, no need to do more processing
1212 	 */
1213 	if (desc_num == 0) {
1214 		tx_ring->recycle_fail++;
1215 		mutex_exit(&tx_ring->recycle_lock);
1216 		return (0);
1217 	}
1218 
1219 	tx_ring->recycle_fail = 0;
1220 	tx_ring->stall_watchdog = 0;
1221 
1222 	/*
1223 	 * Update the head index of the tx descriptor ring
1224 	 */
1225 	tx_ring->tbd_head = index;
1226 
1227 	/*
1228 	 * Update the number of the free tx descriptors with atomic operations
1229 	 */
1230 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1231 
1232 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1233 	    (tx_ring->reschedule)) {
1234 		tx_ring->reschedule = B_FALSE;
1235 		mac_tx_ring_update(ixgbe->mac_hdl,
1236 		    tx_ring->ring_handle);
1237 	}
1238 	mutex_exit(&tx_ring->recycle_lock);
1239 
1240 	/*
1241 	 * Add the tx control blocks in the pending list to the free list.
1242 	 */
1243 	ixgbe_put_free_list(tx_ring, &pending_list);
1244 
1245 	return (desc_num);
1246 }
1247 
1248 /*
1249  * ixgbe_tx_recycle_head_wb
1250  *
1251  * Check the head write-back, and recycle all the transmitted
1252  * tx descriptors and tx control blocks.
1253  */
1254 uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t * tx_ring)1255 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1256 {
1257 	uint32_t index;
1258 	uint32_t head_wb;
1259 	int desc_num;
1260 	tx_control_block_t *tcb;
1261 	link_list_t pending_list;
1262 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1263 
1264 	mutex_enter(&tx_ring->recycle_lock);
1265 
1266 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1267 
1268 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1269 		tx_ring->recycle_fail = 0;
1270 		tx_ring->stall_watchdog = 0;
1271 		if (tx_ring->reschedule) {
1272 			tx_ring->reschedule = B_FALSE;
1273 			mac_tx_ring_update(ixgbe->mac_hdl,
1274 			    tx_ring->ring_handle);
1275 		}
1276 		mutex_exit(&tx_ring->recycle_lock);
1277 		return (0);
1278 	}
1279 
1280 	/*
1281 	 * Sync the DMA buffer of the tx descriptor ring
1282 	 *
1283 	 * Note: For head write-back mode, the tx descriptors will not
1284 	 * be written back, but the head write-back value is stored at
1285 	 * the last extra tbd at the end of the DMA area, we still need
1286 	 * to sync the head write-back value for kernel.
1287 	 *
1288 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1289 	 */
1290 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1291 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1292 	    sizeof (uint32_t),
1293 	    DDI_DMA_SYNC_FORKERNEL);
1294 
1295 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1296 		mutex_exit(&tx_ring->recycle_lock);
1297 		ddi_fm_service_impact(ixgbe->dip,
1298 		    DDI_SERVICE_DEGRADED);
1299 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1300 		return (0);
1301 	}
1302 
1303 	LINK_LIST_INIT(&pending_list);
1304 	desc_num = 0;
1305 	index = tx_ring->tbd_head;	/* Next index to clean */
1306 
1307 	/*
1308 	 * Get the value of head write-back
1309 	 */
1310 	head_wb = *tx_ring->tbd_head_wb;
1311 	while (index != head_wb) {
1312 		tcb = tx_ring->work_list[index];
1313 		ASSERT(tcb != NULL);
1314 
1315 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1316 		    tcb->desc_num) {
1317 			/*
1318 			 * The current tx control block is not
1319 			 * completely transmitted, stop recycling
1320 			 */
1321 			break;
1322 		}
1323 
1324 		/*
1325 		 * Strip off the tx control block from the work list,
1326 		 * and add it to the pending list.
1327 		 */
1328 		tx_ring->work_list[index] = NULL;
1329 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1330 
1331 		/*
1332 		 * Advance the index of the tx descriptor ring
1333 		 */
1334 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1335 
1336 		/*
1337 		 * Count the total number of the tx descriptors recycled
1338 		 */
1339 		desc_num += tcb->desc_num;
1340 	}
1341 
1342 	/*
1343 	 * If no tx descriptors are recycled, no need to do more processing
1344 	 */
1345 	if (desc_num == 0) {
1346 		tx_ring->recycle_fail++;
1347 		mutex_exit(&tx_ring->recycle_lock);
1348 		return (0);
1349 	}
1350 
1351 	tx_ring->recycle_fail = 0;
1352 	tx_ring->stall_watchdog = 0;
1353 
1354 	/*
1355 	 * Update the head index of the tx descriptor ring
1356 	 */
1357 	tx_ring->tbd_head = index;
1358 
1359 	/*
1360 	 * Update the number of the free tx descriptors with atomic operations
1361 	 */
1362 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1363 
1364 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1365 	    (tx_ring->reschedule)) {
1366 		tx_ring->reschedule = B_FALSE;
1367 		mac_tx_ring_update(ixgbe->mac_hdl,
1368 		    tx_ring->ring_handle);
1369 	}
1370 	mutex_exit(&tx_ring->recycle_lock);
1371 
1372 	/*
1373 	 * Add the tx control blocks in the pending list to the free list.
1374 	 */
1375 	ixgbe_put_free_list(tx_ring, &pending_list);
1376 
1377 	return (desc_num);
1378 }
1379 
1380 /*
1381  * ixgbe_free_tcb - free up the tx control block
1382  *
1383  * Free the resources of the tx control block, including
1384  * unbind the previously bound DMA handle, and reset other
1385  * control fields.
1386  */
1387 void
ixgbe_free_tcb(tx_control_block_t * tcb)1388 ixgbe_free_tcb(tx_control_block_t *tcb)
1389 {
1390 	if (tcb == NULL)
1391 		return;
1392 
1393 	switch (tcb->tx_type) {
1394 	case USE_COPY:
1395 		/*
1396 		 * Reset the buffer length that is used for copy
1397 		 */
1398 		tcb->tx_buf.len = 0;
1399 		break;
1400 	case USE_DMA:
1401 		/*
1402 		 * Release the DMA resource that is used for
1403 		 * DMA binding.
1404 		 */
1405 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1406 		break;
1407 	default:
1408 		break;
1409 	}
1410 
1411 	/*
1412 	 * Free the mblk
1413 	 */
1414 	if (tcb->mp != NULL) {
1415 		freemsg(tcb->mp);
1416 		tcb->mp = NULL;
1417 	}
1418 
1419 	tcb->tx_type = USE_NONE;
1420 	tcb->last_index = MAX_TX_RING_SIZE;
1421 	tcb->frag_num = 0;
1422 	tcb->desc_num = 0;
1423 }
1424 
1425 /*
1426  * ixgbe_get_free_list - Get a free tx control block from the free list.
1427  * Returns the tx control block and appends it to list.
1428  *
1429  * The atomic operation on the number of the available tx control block
1430  * in the free list is used to keep this routine mutual exclusive with
1431  * the routine ixgbe_put_check_list.
1432  */
1433 static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t * tx_ring,link_list_t * list)1434 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
1435 {
1436 	tx_control_block_t *tcb;
1437 
1438 	/*
1439 	 * Check and update the number of the free tx control block
1440 	 * in the free list.
1441 	 */
1442 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
1443 		tx_ring->stat_fail_no_tcb++;
1444 		return (NULL);
1445 	}
1446 
1447 	mutex_enter(&tx_ring->tcb_head_lock);
1448 
1449 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1450 	ASSERT(tcb != NULL);
1451 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1452 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1453 	    tx_ring->free_list_size);
1454 
1455 	mutex_exit(&tx_ring->tcb_head_lock);
1456 
1457 	LIST_PUSH_TAIL(list, &tcb->link);
1458 	return (tcb);
1459 }
1460 
1461 /*
1462  * ixgbe_put_free_list
1463  *
1464  * Put a list of used tx control blocks back to the free list
1465  *
1466  * A mutex is used here to ensure the serialization. The mutual exclusion
1467  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1468  * the atomic operation on the counter tcb_free.
1469  */
1470 void
ixgbe_put_free_list(ixgbe_tx_ring_t * tx_ring,link_list_t * pending_list)1471 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1472 {
1473 	uint32_t index;
1474 	int tcb_num;
1475 	tx_control_block_t *tcb;
1476 
1477 	for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
1478 	    tcb != NULL;
1479 	    tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
1480 		/*
1481 		 * Despite the name, ixgbe_free_tcb() just releases the
1482 		 * resources in tcb, but does not free tcb itself.
1483 		 */
1484 		ixgbe_free_tcb(tcb);
1485 	}
1486 
1487 	mutex_enter(&tx_ring->tcb_tail_lock);
1488 
1489 	index = tx_ring->tcb_tail;
1490 
1491 	tcb_num = 0;
1492 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1493 	while (tcb != NULL) {
1494 		ASSERT(tx_ring->free_list[index] == NULL);
1495 		tx_ring->free_list[index] = tcb;
1496 
1497 		tcb_num++;
1498 
1499 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1500 
1501 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1502 	}
1503 
1504 	tx_ring->tcb_tail = index;
1505 
1506 	/*
1507 	 * Update the number of the free tx control block
1508 	 * in the free list. This operation must be placed
1509 	 * under the protection of the lock.
1510 	 */
1511 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1512 
1513 	mutex_exit(&tx_ring->tcb_tail_lock);
1514 }
1515