1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
29 * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
30 * Copyright 2021 Joyent, Inc.
31 */
32
33 #include "ixgbe_sw.h"
34
35 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
36 link_list_t *, const void *, size_t);
37 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
38 link_list_t *, uint8_t *, size_t);
39 static uint_t ixgbe_tcb_done(tx_control_block_t *);
40 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
41 ixgbe_tx_context_t *, size_t);
42 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
43 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
44 link_list_t *);
45
46 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
47 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
48 ixgbe_tx_context_t *);
49 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
50 ixgbe_tx_context_t *);
51
52 #ifndef IXGBE_DEBUG
53 #pragma inline(ixgbe_save_desc)
54 #pragma inline(ixgbe_get_context)
55 #pragma inline(ixgbe_check_context)
56 #pragma inline(ixgbe_fill_context)
57 #endif
58
59 /*
60 * ixgbe_ring_tx
61 *
62 * To transmit one mblk through one specified ring.
63 *
64 * One mblk can consist of several fragments, each fragment
65 * will be processed with different methods based on the size.
66 * For the fragments with size less than the bcopy threshold,
67 * they will be processed by using bcopy; otherwise, they will
68 * be processed by using DMA binding.
69 *
70 * To process the mblk, for each fragment, we pass a pointer to the location
71 * of the current transmit control block (tcb) (initialized to NULL) to either
72 * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
73 * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
74 * if possible, or close out the current tcb, allocate a new tcb, and update
75 * the passed location (tx_control_block_t **) to reflect the new current tcb.
76 *
77 * Since bound mblk fragments require their own tcb, the close, allocate new,
78 * and update steps occur on every call to ixgbe_tx_bind(), but since
79 * consecutive small mblk fragments can be combined into a single tcb, the
80 * close, allocate new, and update steps may not occur on every call to
81 * ixgbe_tx_copy(). If the current tcb is already being used to copy data and
82 * we call ixgbe_tx_copy(), if there is enough room in the current tcb for
83 * the current mblk fragment, we append the data from the mblk fragment. If
84 * we call ixgbe_tx_copy() and the current tcb isn't being used to copy (i.e.
85 * the previous iteration of the loop called ixgbe_tx_bind()), or doesn't
86 * have enough space for the mblk fragment, we close out the current tcb,
87 * grab a new tcb from the free list, and update the current tcb to the
88 * newly obtained tcb.
89 *
90 * When LSO (large segment offload) is enabled, we first copy the packet
91 * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
92 * processing the remainder of the packet. The remaining bytes of the packet
93 * are then copied or mapped based on the fragment size as described above.
94 *
95 * Through the entire processing of a packet, we keep track of the number of
96 * DMA descriptors being used (either bound or pre-bound buffers used for
97 * copying) by this packet. Each tcb requires at least one DMA descriptor, but
98 * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
99 * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
100 * number of DMA descriptors that are closed (ready for the HW). Since the
101 * hardware limits the number of descriptors that can be used to transmit a
102 * single packet, if the total number DMA descriptors required to transmit
103 * this packet exceeds this limit, we perform a msgpullup() and try again.
104 * Since our DMA attributes limit the number of DMA cookies allowed to
105 * map a single span of memory to a value (MAX_COOKIE) less than the
106 * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
107 * as long as sufficient tcbs are available, we should always be able to
108 * process a packet that's contained in a single mblk_t (no additional
109 * fragments).
110 *
111 * Once all of the tcbs have been setup, ixgbe_tx_fill_ring() is called to
112 * setup the tx ring to transmit the tcbs and then tell the HW to start
113 * transmitting. When transmission is complete, an interrupt is triggered
114 * which calls the appropriate recycle routine to place the tcbs that were
 * used in transmission back in the free list. We may also try to
116 * recycle any available tcbs when the size of the tcb free list gets low
117 * or if the watchdog timer triggers.
118 *
119 */
mblk_t *
ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
{
	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
	ixgbe_t *ixgbe = tx_ring->ixgbe;
	mblk_t *mp = orig_mp;
	mblk_t *pull_mp = NULL;
	tx_control_block_t *tcb;
	size_t mbsize, offset, len;
	uint32_t desc_total;
	uint32_t copy_thresh;
	int desc_num;
	ixgbe_tx_context_t tx_context, *ctx = NULL;
	link_list_t pending_list;
	boolean_t limit_retry = B_FALSE;

	ASSERT(mp->b_next == NULL);

	/*
	 * Silently drop the packet if the device cannot transmit right
	 * now: suspended, in error, over temperature, not started, or
	 * link down.
	 */
	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
	    !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
	    ixgbe->link_state != LINK_STATE_UP) {
		freemsg(mp);
		return (NULL);
	}

	copy_thresh = ixgbe->tx_copy_thresh;

	mbsize = msgsize(mp);

	if (ixgbe->tx_hcksum_enable) {
		/*
		 * Retrieve checksum context information from the mblk
		 * that will be used to decide whether/how to fill the
		 * context descriptor.
		 */
		ctx = &tx_context;
		if (ixgbe_get_context(mp, ctx) < 0) {
			freemsg(mp);
			return (NULL);
		}

		/*
		 * If the mblk size exceeds the max size ixgbe could
		 * process, then discard this mblk, and return NULL.
		 */
		if ((ctx->lso_flag &&
		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
		    (!ctx->lso_flag &&
		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
			freemsg(mp);
			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
			return (NULL);
		}
	}

	/*
	 * If we use too many descriptors (see comments below), we may do
	 * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
	 * any time we error return past here, we should check and free
	 * pull_mp if != NULL.
	 */
retry:
	/*
	 * Check and recycle tx descriptors.
	 * The recycle threshold here should be selected carefully
	 */
	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
		tx_ring->tx_recycle(tx_ring);
	}

	/*
	 * After the recycling, if the tbd_free is less than the
	 * overload_threshold, assert overload, return mp;
	 * and we need to re-schedule the tx again.
	 */
	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
		tx_ring->reschedule = B_TRUE;
		tx_ring->stat_overload++;
		if (pull_mp != NULL)
			freemsg(pull_mp);
		return (orig_mp);
	}

	/*
	 * The pending_list is a linked list that is used to save
	 * the tx control blocks that have packet data processed
	 * but have not put the data to the tx descriptor ring.
	 * It is used to reduce the lock contention of the tx_lock.
	 */
	LINK_LIST_INIT(&pending_list);

	tcb = NULL;
	desc_num = 0;
	desc_total = 0;
	offset = 0;

	/*
	 * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
	 * into a single descriptor separate from the remaining data.
	 */
	if ((ctx != NULL) && ctx->lso_flag) {
		size_t hdr_len;

		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;

		/*
		 * copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
		 * and TCP/UDP headers) into tcb.
		 */
		for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
			size_t mlen = MBLKL(mp);
			size_t amt = MIN(mlen, len);
			int ret;

			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
			    mp->b_rptr, amt);
			/*
			 * Since we're trying to copy all of the headers into
			 * a single buffer in a single tcb, if ixgbe_tx_copy()
			 * returns anything but 0, it means either no tcbs
			 * are available (< 0), or while copying, we spilled
			 * over and couldn't fit all the headers into a
			 * single tcb.
			 */
			if (ret != 0) {
				if (ret > 0)
					tx_ring->stat_lso_header_fail++;
				goto tx_failure;
			}

			len -= amt;

			/*
			 * If we copy less than the full amount of this
			 * mblk_t, we have some amount to copy below.
			 */
			if (amt < mlen) {
				offset = amt;
				break;
			}
		}

		ASSERT0(len);

		/*
		 * Finish off the header tcb, and start anew for the
		 * rest of the packet.
		 */
		desc_total += ixgbe_tcb_done(tcb);
		tcb = NULL;
	}

	/*
	 * Process each remaining segment in the packet -- either binding
	 * the dblk_t or copying the contents of the dblk_t to an already
	 * bound buffer. When we copy, we will accumulate consecutive small
	 * (less than copy_thresh bytes) segments into a single tcb buffer
	 * until no more can fit (or we encounter a segment larger than
	 * copy_thresh and bind the dblk_t).
	 *
	 * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
	 * transmit control blocks (tcb)s as needed (and append them onto
	 * 'pending_list'). Both functions also replace 'tcb' with the new
	 * tcb when they allocate a new tcb.
	 *
	 * We stop trying to process the packet once the number of descriptors
	 * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
	 * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
	 * context descriptor (since we're already at the limit), so there's
	 * no point in continuing. We'll pull up the mblk_t (see below)
	 * and try again.
	 */
	while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
		uint8_t *rptr = mp->b_rptr + offset;
		int ret;

		len = MBLKL(mp) - offset;
		offset = 0;

		if (len > copy_thresh) {
			ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
			    len);
		} else {
			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
			    len);
		}

		if (ret < 0)
			goto tx_failure;

		desc_total += ret;
		mp = mp->b_cont;
	}

	/* Finish off the last tcb */
	desc_total += ixgbe_tcb_done(tcb);

	/*
	 * 82598/82599 chipset has a limitation that no more than 32 tx
	 * descriptors can be transmitted out at one time. As noted above,
	 * we need to include space for a context descriptor in case its
	 * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
	 * as well as when it exceeds the limit.
	 *
	 * If we exceed this limit, we take the hit, do a msgpullup(), and
	 * then try again. Our DMA attributes guarantee we should never use
	 * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
	 * should only need to retry once.
	 */
	if (desc_total >= IXGBE_TX_DESC_LIMIT) {
		/* We shouldn't hit this path twice */
		VERIFY0(limit_retry);

		tx_ring->stat_break_tbd_limit++;

		/* Release all the tcbs we used previously */
		ixgbe_put_free_list(tx_ring, &pending_list);
		desc_total = 0;
		offset = 0;

		pull_mp = msgpullup(orig_mp, -1);
		if (pull_mp == NULL) {
			tx_ring->reschedule = B_TRUE;
			return (orig_mp);
		}

		mp = pull_mp;
		limit_retry = B_TRUE;
		goto retry;
	}

	/*
	 * Before filling the tx descriptor ring with the data, we need to
	 * ensure there are adequate free descriptors for transmit
	 * (including one context descriptor).
	 * Do not use up all the tx descriptors.
	 * Otherwise tx recycle will fail and cause false hang.
	 */
	if (tx_ring->tbd_free <= (desc_total + 1)) {
		tx_ring->tx_recycle(tx_ring);
	}

	mutex_enter(&tx_ring->tx_lock);
	/*
	 * If the number of free tx descriptors is not enough for transmit
	 * then return mp.
	 *
	 * Note: we must put this check under the mutex protection to
	 * ensure the correctness when multiple threads access it in
	 * parallel.
	 */
	if (tx_ring->tbd_free <= (desc_total + 1)) {
		tx_ring->stat_fail_no_tbd++;
		mutex_exit(&tx_ring->tx_lock);
		goto tx_failure;
	}

	/*
	 * Attach the mblk_t we've setup to the last control block.
	 * This is only done once we know there are enough free descriptors
	 * to transmit so that the cleanup in tx_failure doesn't try to
	 * call freemsg() on mp (since we will want to return it).
	 */
	tcb->mp = (pull_mp != NULL) ? pull_mp : orig_mp;

	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
	    mbsize);

	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

	tx_ring->stat_obytes += mbsize;
	tx_ring->stat_opackets++;

	mutex_exit(&tx_ring->tx_lock);

	/*
	 * Now that tx is done, if we pulled up the original message, we
	 * can free the original message since it is no longer being
	 * used.
	 */
	if (pull_mp != NULL) {
		freemsg(orig_mp);
	}

	return (NULL);

tx_failure:
	/*
	 * If transmission fails, need to free the pulled-up mblk.
	 */
	if (pull_mp) {
		freemsg(pull_mp);
	}

	/*
	 * Return the tx control blocks in the pending list to the free list.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	/* Transmit failed, do not drop the mblk, reschedule the transmit */
	tx_ring->reschedule = B_TRUE;

	return (orig_mp);
}
426
427 /*
428 * ixgbe_tx_copy
429 *
430 * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
431 * otherwise return the number of descriptors we've completed in this call.
432 */
433 static int
ixgbe_tx_copy(ixgbe_tx_ring_t * tx_ring,tx_control_block_t ** tcbp,link_list_t * pending_list,const void * buf,size_t len)434 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
435 link_list_t *pending_list, const void *buf, size_t len)
436 {
437 tx_control_block_t *tcb = *tcbp;
438 dma_buffer_t *tx_buf;
439 uint32_t desc_num = 0;
440
441 /*
442 * We need a new tcb -- either the current one (tcb) is NULL because
443 * we just started, tcb is being used for DMA, or tcb isn't large enough
444 * to hold the contents we need to copy.
445 */
446 if (tcb == NULL || tcb->tx_type == USE_DMA ||
447 tcb->tx_buf.len + len > tcb->tx_buf.size) {
448 tx_control_block_t *newtcb;
449
450 newtcb = ixgbe_get_free_list(tx_ring, pending_list);
451 if (newtcb == NULL)
452 return (-1);
453
454 newtcb->tx_type = USE_COPY;
455
456 if (tcb != NULL)
457 desc_num += ixgbe_tcb_done(tcb);
458 *tcbp = tcb = newtcb;
459 }
460
461 ASSERT3S(tcb->tx_type, ==, USE_COPY);
462 tx_buf = &tcb->tx_buf;
463
464 /*
465 * Copy the packet data of the mblk fragment into the
466 * pre-allocated tx buffer, which is maintained by the
467 * tx control block.
468 *
469 * Several mblk fragments can be copied into one tx buffer.
470 * The destination address of the current copied fragment in
471 * the tx buffer is next to the end of the previous copied
472 * fragment.
473 */
474 if (len > 0) {
475 bcopy(buf, tx_buf->address + tx_buf->len, len);
476
477 tx_buf->len += len;
478 tcb->frag_num++;
479 }
480
481 return (desc_num);
482 }
483
484 /*
485 * ixgbe_tx_bind
486 *
487 * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
488 * returns the number of descriptors completed in this call. This count
489 * can include descriptors that weren't filled in by the current call to
490 * ixgbe_tx_bind() but were being used (but not yet completed) in previous
491 * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
492 */
493 static int
ixgbe_tx_bind(ixgbe_tx_ring_t * tx_ring,tx_control_block_t ** tcbp,link_list_t * pending_list,uint8_t * buf,size_t len)494 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
495 link_list_t *pending_list, uint8_t *buf, size_t len)
496 {
497 tx_control_block_t *tcb = NULL;
498 uint_t desc_num = 0;
499 int status;
500
501 tcb = ixgbe_get_free_list(tx_ring, pending_list);
502 if (tcb == NULL)
503 return (-1);
504
505 /*
506 * Use DMA binding to process the mblk fragment
507 */
508 status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
509 (caddr_t)buf, len,
510 DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
511 0, NULL, NULL);
512
513 if (status != DDI_DMA_MAPPED) {
514 tx_ring->stat_fail_dma_bind++;
515 return (-1);
516 }
517
518 tcb->frag_num++;
519 tcb->tx_type = USE_DMA;
520
521 /*
522 * If there was an old tcb, we're about to replace it. Finish
523 * setting up the old tcb so we can replace it with the new one.
524 */
525 if (*tcbp != NULL)
526 desc_num += ixgbe_tcb_done(*tcbp);
527
528 *tcbp = tcb;
529 return (desc_num);
530 }
531
532 /*
533 * Once we're done populating a tcb (either by binding or copying into
534 * a buffer in the tcb), get it ready for tx and return the number of
535 * descriptors used.
536 */
537 static uint_t
ixgbe_tcb_done(tx_control_block_t * tcb)538 ixgbe_tcb_done(tx_control_block_t *tcb)
539 {
540 uint_t desc_num = 0;
541
542 if (tcb->tx_type == USE_DMA) {
543 const ddi_dma_cookie_t *c;
544
545 for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
546 c != NULL;
547 c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
548 /*
549 * Save the address and length to the private data
550 * structure of the tx control block, which will be
551 * used to fill the tx descriptor ring after all the
552 * fragments are processed.
553 */
554 ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
555 desc_num++;
556 }
557 } else if (tcb->tx_type == USE_COPY) {
558 dma_buffer_t *tx_buf = &tcb->tx_buf;
559
560 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
561 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
562 desc_num++;
563 } else {
564 panic("invalid tcb type");
565 }
566
567 return (desc_num);
568 }
569
570 /*
571 * ixgbe_get_context
572 *
573 * Get the context information from the mblk
574 */
/*
 * ixgbe_get_context
 *
 * Get the context information from the mblk
 *
 * Parses the Ethernet/VLAN, IP(v4/v6), and (for LSO) TCP headers out of
 * the mblk chain and fills in *ctx. Returns 0 on success, -1 when LSO is
 * requested without the required h/w checksum flags, and -2 when the
 * ethertype is neither IPv4 nor IPv6. For LSO packets, the IP length and
 * IPv4 header checksum fields are zeroed in place as required by the
 * hardware.
 */
static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
	uint32_t start;
	uint32_t hckflags;
	uint32_t lsoflags;
	uint32_t lsocksum;
	uint32_t mss;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;
	uint32_t l4_hdr_len;

	ASSERT(mp != NULL);

	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
	bzero(ctx, sizeof (ixgbe_tx_context_t));

	/* No offload requested: leave the context zeroed. */
	if (hckflags == 0) {
		return (0);
	}

	ctx->hcksum_flags = hckflags;

	mac_lso_get(mp, &mss, &lsoflags);
	ctx->mss = mss;
	ctx->lso_flag = (lsoflags == HW_LSO);

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * Firstly get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 *
	 * Throughout this function, 'size' is the running total of bytes
	 * seen up to and including the current mblk, 'len' is the current
	 * mblk's length, and 'pos' addresses byte 'offset' of the packet
	 * within the current mblk.
	 */
	size = len = MBLKL(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment.
	 */
	lsocksum = HCK_PARTIALCKSUM;
	ctx->l3_proto = etype;
	switch (etype) {
	case ETHERTYPE_IP:
		if (ctx->lso_flag) {
			/*
			 * Zero the IPv4 total-length field; the hardware
			 * rewrites it per segment during LSO.
			 */
			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/*
			 * Zero the IPv4 header checksum; the hardware
			 * recomputes it for each segment.
			 */
			offset = offsetof(ipha_t, ipha_hdr_checksum) +
			    mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/*
			 * To perform ixgbe LSO, here also need to fill
			 * the tcp checksum field of the packet with the
			 * following pseudo-header checksum:
			 * (ip_source_addr, ip_destination_addr, l4_proto)
			 * Currently the tcp/ip stack has done it.
			 */
			lsocksum |= HCK_IPV4_HDRCKSUM;
		}

		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		/*
		 * We need to zero out the length in the header.
		 */
		if (ctx->lso_flag) {
			offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;
		}

		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
		return (-2);
	}

	if (ctx->lso_flag) {
		/*
		 * LSO relies on tx h/w checksum, so here will drop the packet
		 * if h/w checksum flag is not declared.
		 */
		if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
			IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
			    "are not set for LSO, found 0x%x, needed bits 0x%x",
			    ctx->hcksum_flags, lsocksum);
			return (-1);
		}


		/*
		 * 'start' (from mac_hcksum_get()) is the IP header length,
		 * so mac_hdr_len + start addresses the TCP header.
		 */
		offset = mac_hdr_len + start;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
	} else {
		/*
		 * l4 header length is only required for LSO
		 */
		l4_hdr_len = 0;
	}

	ctx->mac_hdr_len = mac_hdr_len;
	ctx->ip_hdr_len = start;
	ctx->l4_proto = l4_proto;
	ctx->l4_hdr_len = l4_hdr_len;

	return (0);
}
768
769 /*
770 * ixgbe_check_context
771 *
772 * Check if a new context descriptor is needed
773 */
774 static boolean_t
ixgbe_check_context(ixgbe_tx_ring_t * tx_ring,ixgbe_tx_context_t * ctx)775 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
776 {
777 ixgbe_tx_context_t *last;
778
779 if (ctx == NULL)
780 return (B_FALSE);
781
782 /*
783 * Compare the context data retrieved from the mblk and the
784 * stored data of the last context descriptor. The data need
785 * to be checked are:
786 * hcksum_flags
787 * l4_proto
788 * mac_hdr_len
789 * ip_hdr_len
790 * lso_flag
791 * mss (only checked for LSO)
792 * l4_hr_len (only checked for LSO)
793 * Either one of the above data is changed, a new context descriptor
794 * will be needed.
795 */
796 last = &tx_ring->tx_context;
797
798 if ((ctx->hcksum_flags != last->hcksum_flags) ||
799 (ctx->l4_proto != last->l4_proto) ||
800 (ctx->l3_proto != last->l3_proto) ||
801 (ctx->mac_hdr_len != last->mac_hdr_len) ||
802 (ctx->ip_hdr_len != last->ip_hdr_len) ||
803 (ctx->lso_flag != last->lso_flag) ||
804 (ctx->lso_flag && ((ctx->mss != last->mss) ||
805 (ctx->l4_hdr_len != last->l4_hdr_len)))) {
806 return (B_TRUE);
807 }
808
809 return (B_FALSE);
810 }
811
812 /*
813 * ixgbe_fill_context
814 *
815 * Fill the context descriptor with hardware checksum informations
816 */
817 static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc * ctx_tbd,ixgbe_tx_context_t * ctx)818 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
819 ixgbe_tx_context_t *ctx)
820 {
821 /*
822 * Fill the context descriptor with the checksum
823 * context information we've got.
824 */
825 ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
826 ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
827 IXGBE_ADVTXD_MACLEN_SHIFT;
828
829 ctx_tbd->type_tucmd_mlhl =
830 IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
831 /*
832 * When we have a TX context set up, we enforce that the ethertype is
833 * either IPv4 or IPv6 in ixgbe_get_tx_context().
834 */
835 if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
836 if (ctx->l3_proto == ETHERTYPE_IP) {
837 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
838 } else {
839 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
840 }
841 }
842
843 if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
844 switch (ctx->l4_proto) {
845 case IPPROTO_TCP:
846 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
847 break;
848 case IPPROTO_UDP:
849 /*
850 * We don't have to explicitly set:
851 * ctx_tbd->type_tucmd_mlhl |=
852 * IXGBE_ADVTXD_TUCMD_L4T_UDP;
853 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
854 */
855 break;
856 default:
857 /* Unrecoverable error */
858 IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
859 break;
860 }
861 }
862
863 ctx_tbd->seqnum_seed = 0;
864
865 if (ctx->lso_flag) {
866 ctx_tbd->mss_l4len_idx =
867 (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
868 (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
869 } else {
870 ctx_tbd->mss_l4len_idx = 0;
871 }
872 }
873
874 /*
875 * ixgbe_tx_fill_ring
876 *
877 * Fill the tx descriptor ring with the data
878 */
/*
 * ixgbe_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 *
 * Caller must hold tx_ring->tx_lock. Consumes the tcbs on 'pending_list',
 * writes an optional context descriptor plus the data descriptors, and
 * advances the hardware tail pointer. Returns the total number of
 * descriptors used (including any context descriptor).
 */
static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    ixgbe_tx_context_t *ctx, size_t mbsize)
{
	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
	boolean_t load_context;
	uint32_t index, tcb_index, desc_num;
	union ixgbe_adv_tx_desc *tbd, *first_tbd;
	tx_control_block_t *tcb, *first_tcb;
	uint32_t hcksum_flags;
	int i;

	ASSERT(mutex_owned(&tx_ring->tx_lock));

	tbd = NULL;
	first_tbd = NULL;
	first_tcb = NULL;
	desc_num = 0;
	hcksum_flags = 0;
	load_context = B_FALSE;

	/*
	 * Get the index of the first tx descriptor that will be filled,
	 * and the index of the first work list item that will be attached
	 * with the first used tx control block in the pending list.
	 * Note: the two indexes are the same.
	 */
	index = tx_ring->tbd_tail;
	tcb_index = tx_ring->tbd_tail;

	if (ctx != NULL) {
		hcksum_flags = ctx->hcksum_flags;

		/*
		 * Check if a new context descriptor is needed for this packet
		 */
		load_context = ixgbe_check_context(tx_ring, ctx);

		if (load_context) {
			tbd = &tx_ring->tbd_ring[index];

			/*
			 * Fill the context descriptor with the
			 * hardware checksum offload informations.
			 */
			ixgbe_fill_context(
			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;

			/*
			 * Store the checksum context data if
			 * a new context descriptor is added
			 */
			tx_ring->tx_context = *ctx;
		}
	}

	first_tbd = &tx_ring->tbd_ring[index];

	/*
	 * Fill tx data descriptors with the data saved in the pending list.
	 * The tx control blocks in the pending list are added to the work list
	 * at the same time.
	 *
	 * The work list is strictly 1:1 corresponding to the descriptor ring.
	 * One item of the work list corresponds to one tx descriptor. Because
	 * one tx control block can span multiple tx descriptors, the tx
	 * control block will be added to the first work list item that
	 * corresponds to the first tx descriptor generated from that tx
	 * control block.
	 */
	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	first_tcb = tcb;
	while (tcb != NULL) {

		for (i = 0; i < tcb->desc_num; i++) {
			tbd = &tx_ring->tbd_ring[index];

			tbd->read.buffer_addr = tcb->desc[i].address;
			tbd->read.cmd_type_len = tcb->desc[i].length;

			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
			    | IXGBE_ADVTXD_DTYP_DATA;

			tbd->read.olinfo_status = 0;

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
		}

		/*
		 * Add the tx control block to the work list
		 */
		ASSERT(tx_ring->work_list[tcb_index] == NULL);
		tx_ring->work_list[tcb_index] = tcb;

		tcb_index = index;
		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
	}

	if (load_context) {
		/*
		 * Count the context descriptor for
		 * the first tx control block.
		 */
		first_tcb->desc_num++;
	}
	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);

	/*
	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
	 * valid in the first descriptor of the packet.
	 * Setting paylen in every first_tbd for all parts.
	 * 82599, X540 and X550 require the packet length in paylen field
	 * with or without LSO and 82598 will ignore it in non-LSO mode.
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;

	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		if (ctx != NULL && ctx->lso_flag) {
			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
			first_tbd->read.olinfo_status |=
			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
		}
		break;

	case ixgbe_mac_82599EB:
	case ixgbe_mac_X540:
	case ixgbe_mac_X550:
	case ixgbe_mac_X550EM_x:
	case ixgbe_mac_X550EM_a:
		if (ctx != NULL && ctx->lso_flag) {
			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
			first_tbd->read.olinfo_status |=
			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
		} else {
			first_tbd->read.olinfo_status |=
			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
		}
		break;

	default:
		break;
	}

	/* Set hardware checksum bits */
	if (hcksum_flags != 0) {
		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_ADVTXD_POPTS_IXSM;
		if (hcksum_flags & HCK_PARTIALCKSUM)
			first_tbd->read.olinfo_status |=
			    IXGBE_ADVTXD_POPTS_TXSM;
	}

	/*
	 * The last descriptor of packet needs End Of Packet (EOP),
	 * and Report Status (RS) bits set
	 */
	ASSERT(tbd != NULL);
	tbd->read.cmd_type_len |=
	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

	/*
	 * Update the number of the free tx descriptors.
	 * The mutual exclusion between the transmission and the recycling
	 * (for the tx descriptor ring and the work list) is implemented
	 * with the atomic operation on the number of the free tx descriptors.
	 *
	 * Note: we should always decrement the counter tbd_free before
	 * advancing the hardware TDT pointer to avoid the race condition -
	 * before the counter tbd_free is decremented, the transmit of the
	 * tx descriptors has done and the counter tbd_free is increased by
	 * the tx recycling.
	 */
	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
	ASSERT(i >= 0);

	tx_ring->tbd_tail = index;

	/*
	 * Advance the hardware TDT pointer of the tx descriptor ring
	 */
	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);

	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
	    DDI_FM_OK) {
		ddi_fm_service_impact(tx_ring->ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
	}

	return (desc_num);
}
1084
1085 /*
1086 * ixgbe_save_desc
1087 *
1088 * Save the address/length pair to the private array
1089 * of the tx control block. The address/length pairs
1090 * will be filled into the tx descriptor ring later.
1091 */
1092 static void
ixgbe_save_desc(tx_control_block_t * tcb,uint64_t address,size_t length)1093 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1094 {
1095 sw_desc_t *desc;
1096
1097 desc = &tcb->desc[tcb->desc_num];
1098 desc->address = address;
1099 desc->length = length;
1100
1101 tcb->desc_num++;
1102 }
1103
/*
 * ixgbe_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 *
 * Returns the number of tx descriptors recycled; 0 when nothing was
 * done (ring already fully free, no completed descriptors, or an FM
 * fault was detected on the descriptor ring's DMA handle).
 */
uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index, last_index, prev_index;
	int desc_num;
	boolean_t desc_done;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	ixgbe_t *ixgbe = tx_ring->ixgbe;

	mutex_enter(&tx_ring->recycle_lock);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	/*
	 * Fast path: the whole ring is already free, so there is nothing
	 * to recycle. Clear the stall bookkeeping and, if a send was
	 * previously blocked, tell MAC it may resume transmitting.
	 */
	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		if (tx_ring->reschedule) {
			tx_ring->reschedule = B_FALSE;
			mac_tx_ring_update(ixgbe->mac_hdl,
			    tx_ring->ring_handle);
		}
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring so we read the
	 * hardware's latest write-back status.
	 */
	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		mutex_exit(&tx_ring->recycle_lock);
		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
		return (0);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */

	tcb = tx_ring->work_list[index];
	ASSERT(tcb != NULL);

	while (tcb != NULL) {
		/*
		 * Get the last tx descriptor of this packet.
		 * If the last tx descriptor is done, then
		 * we can recycle all descriptors of a packet
		 * which usually includes several tx control blocks.
		 * For 82599, LSO descriptors can not be recycled
		 * unless the whole packet's transmission is done.
		 * That's why packet level recycling is used here.
		 * For 82598, there's not such limit.
		 */
		last_index = tcb->last_index;
		/*
		 * MAX_TX_RING_SIZE is used to judge whether
		 * the index is a valid value or not.
		 * (ixgbe_free_tcb() resets last_index to this sentinel.)
		 */
		if (last_index == MAX_TX_RING_SIZE)
			break;

		/*
		 * Check if the Descriptor Done bit is set
		 */
		desc_done = tx_ring->tbd_ring[last_index].wb.status &
		    IXGBE_TXD_STAT_DD;
		if (desc_done) {
			/*
			 * recycle all descriptors of the packet
			 */
			while (tcb != NULL) {
				/*
				 * Strip off the tx control block from
				 * the work list, and add it to the
				 * pending list.
				 */
				tx_ring->work_list[index] = NULL;
				LIST_PUSH_TAIL(&pending_list, &tcb->link);

				/*
				 * Count the total number of the tx
				 * descriptors recycled
				 */
				desc_num += tcb->desc_num;

				index = NEXT_INDEX(index, tcb->desc_num,
				    tx_ring->ring_size);

				tcb = tx_ring->work_list[index];

				/*
				 * Stop once we have walked past the last
				 * descriptor of this packet.
				 */
				prev_index = PREV_INDEX(index, 1,
				    tx_ring->ring_size);
				if (prev_index == last_index)
					break;
			}
		} else {
			break;
		}
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
	    (tx_ring->reschedule)) {
		tx_ring->reschedule = B_FALSE;
		mac_tx_ring_update(ixgbe->mac_hdl,
		    tx_ring->ring_handle);
	}
	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 * Done outside recycle_lock; the free list has its own locking.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}
1254
/*
 * ixgbe_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 *
 * Returns the number of tx descriptors recycled; 0 when nothing was
 * done (ring already fully free, hardware head has not advanced, or
 * an FM fault was detected on the descriptor ring's DMA handle).
 */
uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
{
	uint32_t index;
	uint32_t head_wb;
	int desc_num;
	tx_control_block_t *tcb;
	link_list_t pending_list;
	ixgbe_t *ixgbe = tx_ring->ixgbe;

	mutex_enter(&tx_ring->recycle_lock);

	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

	/*
	 * Fast path: nothing is outstanding. Clear the stall
	 * bookkeeping and resume a blocked transmit path if needed.
	 */
	if (tx_ring->tbd_free == tx_ring->ring_size) {
		tx_ring->recycle_fail = 0;
		tx_ring->stall_watchdog = 0;
		if (tx_ring->reschedule) {
			tx_ring->reschedule = B_FALSE;
			mac_tx_ring_update(ixgbe->mac_hdl,
			    tx_ring->ring_handle);
		}
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	/*
	 * Sync the DMA buffer of the tx descriptor ring
	 *
	 * Note: For head write-back mode, the tx descriptors will not
	 * be written back, but the head write-back value is stored at
	 * the last extra tbd at the end of the DMA area, we still need
	 * to sync the head write-back value for kernel.
	 *
	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
	 */
	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
	    sizeof (uint32_t),
	    DDI_DMA_SYNC_FORKERNEL);

	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
		mutex_exit(&tx_ring->recycle_lock);
		ddi_fm_service_impact(ixgbe->dip,
		    DDI_SERVICE_DEGRADED);
		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
		return (0);
	}

	LINK_LIST_INIT(&pending_list);
	desc_num = 0;
	index = tx_ring->tbd_head;	/* Next index to clean */

	/*
	 * Get the value of head write-back
	 */
	head_wb = *tx_ring->tbd_head_wb;
	while (index != head_wb) {
		tcb = tx_ring->work_list[index];
		ASSERT(tcb != NULL);

		if (OFFSET(index, head_wb, tx_ring->ring_size) <
		    tcb->desc_num) {
			/*
			 * The current tx control block is not
			 * completely transmitted, stop recycling
			 */
			break;
		}

		/*
		 * Strip off the tx control block from the work list,
		 * and add it to the pending list.
		 */
		tx_ring->work_list[index] = NULL;
		LIST_PUSH_TAIL(&pending_list, &tcb->link);

		/*
		 * Advance the index of the tx descriptor ring
		 */
		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

		/*
		 * Count the total number of the tx descriptors recycled
		 */
		desc_num += tcb->desc_num;
	}

	/*
	 * If no tx descriptors are recycled, no need to do more processing
	 */
	if (desc_num == 0) {
		tx_ring->recycle_fail++;
		mutex_exit(&tx_ring->recycle_lock);
		return (0);
	}

	tx_ring->recycle_fail = 0;
	tx_ring->stall_watchdog = 0;

	/*
	 * Update the head index of the tx descriptor ring
	 */
	tx_ring->tbd_head = index;

	/*
	 * Update the number of the free tx descriptors with atomic operations
	 */
	atomic_add_32(&tx_ring->tbd_free, desc_num);

	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
	    (tx_ring->reschedule)) {
		tx_ring->reschedule = B_FALSE;
		mac_tx_ring_update(ixgbe->mac_hdl,
		    tx_ring->ring_handle);
	}
	mutex_exit(&tx_ring->recycle_lock);

	/*
	 * Add the tx control blocks in the pending list to the free list.
	 * Done outside recycle_lock; the free list has its own locking.
	 */
	ixgbe_put_free_list(tx_ring, &pending_list);

	return (desc_num);
}
1386
1387 /*
1388 * ixgbe_free_tcb - free up the tx control block
1389 *
1390 * Free the resources of the tx control block, including
1391 * unbind the previously bound DMA handle, and reset other
1392 * control fields.
1393 */
1394 void
ixgbe_free_tcb(tx_control_block_t * tcb)1395 ixgbe_free_tcb(tx_control_block_t *tcb)
1396 {
1397 if (tcb == NULL)
1398 return;
1399
1400 switch (tcb->tx_type) {
1401 case USE_COPY:
1402 /*
1403 * Reset the buffer length that is used for copy
1404 */
1405 tcb->tx_buf.len = 0;
1406 break;
1407 case USE_DMA:
1408 /*
1409 * Release the DMA resource that is used for
1410 * DMA binding.
1411 */
1412 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1413 break;
1414 default:
1415 break;
1416 }
1417
1418 /*
1419 * Free the mblk
1420 */
1421 if (tcb->mp != NULL) {
1422 freemsg(tcb->mp);
1423 tcb->mp = NULL;
1424 }
1425
1426 tcb->tx_type = USE_NONE;
1427 tcb->last_index = MAX_TX_RING_SIZE;
1428 tcb->frag_num = 0;
1429 tcb->desc_num = 0;
1430 }
1431
1432 /*
1433 * ixgbe_get_free_list - Get a free tx control block from the free list.
1434 * Returns the tx control block and appends it to list.
1435 *
1436 * The atomic operation on the number of the available tx control block
1437 * in the free list is used to keep this routine mutual exclusive with
1438 * the routine ixgbe_put_check_list.
1439 */
1440 static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t * tx_ring,link_list_t * list)1441 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
1442 {
1443 tx_control_block_t *tcb;
1444
1445 /*
1446 * Check and update the number of the free tx control block
1447 * in the free list.
1448 */
1449 if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
1450 tx_ring->stat_fail_no_tcb++;
1451 return (NULL);
1452 }
1453
1454 mutex_enter(&tx_ring->tcb_head_lock);
1455
1456 tcb = tx_ring->free_list[tx_ring->tcb_head];
1457 ASSERT(tcb != NULL);
1458 tx_ring->free_list[tx_ring->tcb_head] = NULL;
1459 tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1460 tx_ring->free_list_size);
1461
1462 mutex_exit(&tx_ring->tcb_head_lock);
1463
1464 LIST_PUSH_TAIL(list, &tcb->link);
1465 return (tcb);
1466 }
1467
1468 /*
1469 * ixgbe_put_free_list
1470 *
1471 * Put a list of used tx control blocks back to the free list
1472 *
1473 * A mutex is used here to ensure the serialization. The mutual exclusion
1474 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1475 * the atomic operation on the counter tcb_free.
1476 */
1477 void
ixgbe_put_free_list(ixgbe_tx_ring_t * tx_ring,link_list_t * pending_list)1478 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1479 {
1480 uint32_t index;
1481 int tcb_num;
1482 tx_control_block_t *tcb;
1483
1484 for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
1485 tcb != NULL;
1486 tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
1487 /*
1488 * Despite the name, ixgbe_free_tcb() just releases the
1489 * resources in tcb, but does not free tcb itself.
1490 */
1491 ixgbe_free_tcb(tcb);
1492 }
1493
1494 mutex_enter(&tx_ring->tcb_tail_lock);
1495
1496 index = tx_ring->tcb_tail;
1497
1498 tcb_num = 0;
1499 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1500 while (tcb != NULL) {
1501 ASSERT(tx_ring->free_list[index] == NULL);
1502 tx_ring->free_list[index] = tcb;
1503
1504 tcb_num++;
1505
1506 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1507
1508 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1509 }
1510
1511 tx_ring->tcb_tail = index;
1512
1513 /*
1514 * Update the number of the free tx control block
1515 * in the free list. This operation must be placed
1516 * under the protection of the lock.
1517 */
1518 atomic_add_32(&tx_ring->tcb_free, tcb_num);
1519
1520 mutex_exit(&tx_ring->tcb_tail_lock);
1521 }
1522