xref: /freebsd/sys/dev/sfxge/sfxge_tx.c (revision ecaeac805b044f715c98960a8fbf19fe2b76ae6b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
5  * All rights reserved.
6  *
7  * This software was developed in part by Philip Paeps under contract for
8  * Solarflare Communications, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice,
14  *    this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  *    this list of conditions and the following disclaimer in the documentation
17  *    and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are
32  * those of the authors and should not be interpreted as representing official
33  * policies, either expressed or implied, of the FreeBSD Project.
34  */
35 
36 /* Theory of operation:
37  *
38  * Tx queues allocation and mapping on Siena
39  *
40  * One Tx queue with enabled checksum offload is allocated per Rx channel
41  * (event queue).  Also 2 Tx queues (one without checksum offload and one
42  * with IP checksum offload only) are allocated and bound to event queue 0.
43  * sfxge_txq_type is used as Tx queue label.
44  *
 * So, the mapping from event queue plus label to Tx queue index is:
 *	if event queue index is 0, TxQ-index = TxQ-label,
 *	    where TxQ-label is in [0..SFXGE_TXQ_NTYPES)
 *	else TxQ-index = SFXGE_TXQ_NTYPES + EvQ-index - 1
 * See sfxge_get_txq_by_label() in sfxge_ev.c
49  *
50  * Tx queue allocation and mapping on EF10
51  *
52  * One Tx queue with enabled checksum offload is allocated per Rx
53  * channel (event queue). Checksum offload on all Tx queues is enabled or
54  * disabled dynamically by inserting option descriptors, so the additional
55  * queues used on Siena are not required.
56  *
57  * TxQ label is always set to zero on EF10 hardware.
58  * So, event queue to Tx queue mapping is simple:
59  * TxQ-index = EvQ-index
60  */
61 
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64 
65 #include "opt_rss.h"
66 
67 #include <sys/param.h>
68 #include <sys/malloc.h>
69 #include <sys/mbuf.h>
70 #include <sys/smp.h>
71 #include <sys/socket.h>
72 #include <sys/sysctl.h>
73 #include <sys/syslog.h>
74 #include <sys/limits.h>
75 
76 #include <net/bpf.h>
77 #include <net/ethernet.h>
78 #include <net/if.h>
79 #include <net/if_vlan_var.h>
80 
81 #include <netinet/in.h>
82 #include <netinet/ip.h>
83 #include <netinet/ip6.h>
84 #include <netinet/tcp.h>
85 
86 #ifdef RSS
87 #include <net/rss_config.h>
88 #endif
89 
90 #include "common/efx.h"
91 
92 #include "sfxge.h"
93 #include "sfxge_tx.h"
94 
/*
 * Driver-wide TX knobs.  Each one is a file-scope int that is registered
 * both as a loader tunable (TUNABLE_INT) and as a sysctl under _hw_sfxge
 * with CTLFLAG_RDTUN.
 */

/* Limit on the number of packets of any kind in a deferred get-list. */
#define	SFXGE_PARAM_TX_DPL_GET_MAX	SFXGE_PARAM(tx_dpl_get_max)
static int sfxge_tx_dpl_get_max = SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT;
TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_MAX, &sfxge_tx_dpl_get_max);
SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_max, CTLFLAG_RDTUN,
	   &sfxge_tx_dpl_get_max, 0,
	   "Maximum number of any packets in deferred packet get-list");

/* Limit on the number of non-TCP packets in a deferred get-list. */
#define	SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX \
	SFXGE_PARAM(tx_dpl_get_non_tcp_max)
static int sfxge_tx_dpl_get_non_tcp_max =
	SFXGE_TX_DPL_GET_NON_TCP_PKT_LIMIT_DEFAULT;
TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX, &sfxge_tx_dpl_get_non_tcp_max);
SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_non_tcp_max, CTLFLAG_RDTUN,
	   &sfxge_tx_dpl_get_non_tcp_max, 0,
	   "Maximum number of non-TCP packets in deferred packet get-list");

/* Limit on the number of packets of any kind in a deferred put-list. */
#define	SFXGE_PARAM_TX_DPL_PUT_MAX	SFXGE_PARAM(tx_dpl_put_max)
static int sfxge_tx_dpl_put_max = SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT;
TUNABLE_INT(SFXGE_PARAM_TX_DPL_PUT_MAX, &sfxge_tx_dpl_put_max);
SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_put_max, CTLFLAG_RDTUN,
	   &sfxge_tx_dpl_put_max, 0,
	   "Maximum number of any packets in deferred packet put-list");

/*
 * Bitmask of the FW-assisted TSO variants the driver may use; both
 * SFXGE_FATSOV1 and SFXGE_FATSOV2 are allowed by default.
 */
#define	SFXGE_PARAM_TSO_FW_ASSISTED	SFXGE_PARAM(tso_fw_assisted)
static int sfxge_tso_fw_assisted = (SFXGE_FATSOV1 | SFXGE_FATSOV2);
TUNABLE_INT(SFXGE_PARAM_TSO_FW_ASSISTED, &sfxge_tso_fw_assisted);
SYSCTL_INT(_hw_sfxge, OID_AUTO, tso_fw_assisted, CTLFLAG_RDTUN,
	   &sfxge_tso_fw_assisted, 0,
	   "Bitmask of FW-assisted TSO allowed to use if supported by NIC firmware");
124 
/*
 * Table of per-TX-queue statistics.  Each entry pairs a printable name
 * with the byte offset of the matching counter inside struct sfxge_txq,
 * so a counter can be located generically from a txq pointer.
 * NOTE(review): the code that walks this table is not in view here;
 * presumably it is used for sysctl export -- confirm.
 */
static const struct {
	const char *name;
	size_t offset;
} sfxge_tx_stats[] = {
/* Build one entry: stringified stat name + offset of the txq member. */
#define	SFXGE_TX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_txq, member) }
	SFXGE_TX_STAT(tso_bursts, tso_bursts),
	SFXGE_TX_STAT(tso_packets, tso_packets),
	SFXGE_TX_STAT(tso_long_headers, tso_long_headers),
	SFXGE_TX_STAT(tso_pdrop_too_many, tso_pdrop_too_many),
	SFXGE_TX_STAT(tso_pdrop_no_rsrc, tso_pdrop_no_rsrc),
	SFXGE_TX_STAT(tx_collapses, collapses),
	SFXGE_TX_STAT(tx_drops, drops),
	SFXGE_TX_STAT(tx_get_overflow, get_overflow),
	SFXGE_TX_STAT(tx_get_non_tcp_overflow, get_non_tcp_overflow),
	SFXGE_TX_STAT(tx_put_overflow, put_overflow),
	SFXGE_TX_STAT(tx_netdown_drops, netdown_drops),
};
143 
/*
 * Forward declarations of static functions that are declared here ahead
 * of their definitions.
 */
static void sfxge_tx_qdpl_service(struct sfxge_txq *txq);
static void sfxge_tx_qlist_post(struct sfxge_txq *txq);
static void sfxge_tx_qunblock(struct sfxge_txq *txq);
static int sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
			      const bus_dma_segment_t *dma_seg, int n_dma_seg,
			      int n_extra_descs);
151 
152 static inline void
153 sfxge_next_stmp(struct sfxge_txq *txq, struct sfxge_tx_mapping **pstmp)
154 {
155 	KASSERT((*pstmp)->flags == 0, ("stmp flags are not 0"));
156 	if (__predict_false(*pstmp ==
157 			    &txq->stmp[txq->ptr_mask]))
158 		*pstmp = &txq->stmp[0];
159 	else
160 		(*pstmp)++;
161 }
162 
163 static int
164 sfxge_tx_maybe_toggle_cksum_offload(struct sfxge_txq *txq, struct mbuf *mbuf,
165 				    struct sfxge_tx_mapping **pstmp)
166 {
167 	uint16_t new_hw_cksum_flags;
168 	efx_desc_t *desc;
169 
170 	if (mbuf->m_pkthdr.csum_flags &
171 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6 | CSUM_TSO)) {
172 		/*
173 		 * We always set EFX_TXQ_CKSUM_IPV4 here because this
174 		 * configuration is the most useful, and this won't
175 		 * cause any trouble in case of IPv6 traffic anyway.
176 		 */
177 		new_hw_cksum_flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
178 	} else if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_IP) {
179 		new_hw_cksum_flags = EFX_TXQ_CKSUM_IPV4;
180 	} else {
181 		new_hw_cksum_flags = 0;
182 	}
183 
184 	if (new_hw_cksum_flags == txq->hw_cksum_flags)
185 		return (0);
186 
187 	desc = &txq->pend_desc[txq->n_pend_desc];
188 	efx_tx_qdesc_checksum_create(txq->common, new_hw_cksum_flags, desc);
189 	txq->hw_cksum_flags = new_hw_cksum_flags;
190 	txq->n_pend_desc++;
191 
192 	sfxge_next_stmp(txq, pstmp);
193 
194 	return (1);
195 }
196 
197 static int
198 sfxge_tx_maybe_insert_tag(struct sfxge_txq *txq, struct mbuf *mbuf,
199 			  struct sfxge_tx_mapping **pstmp)
200 {
201 	uint16_t this_tag = ((mbuf->m_flags & M_VLANTAG) ?
202 			     mbuf->m_pkthdr.ether_vtag :
203 			     0);
204 	efx_desc_t *desc;
205 
206 	if (this_tag == txq->hw_vlan_tci)
207 		return (0);
208 
209 	desc = &txq->pend_desc[txq->n_pend_desc];
210 	efx_tx_qdesc_vlantci_create(txq->common, bswap16(this_tag), desc);
211 	txq->hw_vlan_tci = this_tag;
212 	txq->n_pend_desc++;
213 
214 	sfxge_next_stmp(txq, pstmp);
215 
216 	return (1);
217 }
218 
219 void
220 sfxge_tx_qcomplete(struct sfxge_txq *txq, struct sfxge_evq *evq)
221 {
222 	unsigned int completed;
223 
224 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
225 
226 	completed = txq->completed;
227 	while (completed != txq->pending) {
228 		struct sfxge_tx_mapping *stmp;
229 		unsigned int id;
230 
231 		id = completed++ & txq->ptr_mask;
232 
233 		stmp = &txq->stmp[id];
234 		if (stmp->flags & TX_BUF_UNMAP) {
235 			bus_dmamap_unload(txq->packet_dma_tag, stmp->map);
236 			if (stmp->flags & TX_BUF_MBUF) {
237 				struct mbuf *m = stmp->u.mbuf;
238 				do
239 					m = m_free(m);
240 				while (m != NULL);
241 			} else {
242 				free(stmp->u.heap_buf, M_SFXGE);
243 			}
244 			stmp->flags = 0;
245 		}
246 	}
247 	txq->completed = completed;
248 
249 	/* Check whether we need to unblock the queue. */
250 	mb();
251 	if (txq->blocked) {
252 		unsigned int level;
253 
254 		level = txq->added - txq->completed;
255 		if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries))
256 			sfxge_tx_qunblock(txq);
257 	}
258 }
259 
260 static unsigned int
261 sfxge_is_mbuf_non_tcp(struct mbuf *mbuf)
262 {
263 	/* Absence of TCP checksum flags does not mean that it is non-TCP
264 	 * but it should be true if user wants to achieve high throughput.
265 	 */
266 	return (!(mbuf->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)));
267 }
268 
269 /*
270  * Reorder the put list and append it to the get list.
271  */
272 static void
273 sfxge_tx_qdpl_swizzle(struct sfxge_txq *txq)
274 {
275 	struct sfxge_tx_dpl *stdp;
276 	struct mbuf *mbuf, *get_next, **get_tailp;
277 	volatile uintptr_t *putp;
278 	uintptr_t put;
279 	unsigned int count;
280 	unsigned int non_tcp_count;
281 
282 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
283 
284 	stdp = &txq->dpl;
285 
286 	/* Acquire the put list. */
287 	putp = &stdp->std_put;
288 	put = atomic_readandclear_ptr(putp);
289 	mbuf = (void *)put;
290 
291 	if (mbuf == NULL)
292 		return;
293 
294 	/* Reverse the put list. */
295 	get_tailp = &mbuf->m_nextpkt;
296 	get_next = NULL;
297 
298 	count = 0;
299 	non_tcp_count = 0;
300 	do {
301 		struct mbuf *put_next;
302 
303 		non_tcp_count += sfxge_is_mbuf_non_tcp(mbuf);
304 		put_next = mbuf->m_nextpkt;
305 		mbuf->m_nextpkt = get_next;
306 		get_next = mbuf;
307 		mbuf = put_next;
308 
309 		count++;
310 	} while (mbuf != NULL);
311 
312 	if (count > stdp->std_put_hiwat)
313 		stdp->std_put_hiwat = count;
314 
315 	/* Append the reversed put list to the get list. */
316 	KASSERT(*get_tailp == NULL, ("*get_tailp != NULL"));
317 	*stdp->std_getp = get_next;
318 	stdp->std_getp = get_tailp;
319 	stdp->std_get_count += count;
320 	stdp->std_get_non_tcp_count += non_tcp_count;
321 }
322 
/*
 * Bring txq->reaped up to date with txq->completed.  The fill level used
 * when posting is measured as txq->added - txq->reaped, so this is what
 * makes completed descriptors count as free space again.
 *
 * Must be called with the TXQ lock held.
 */
static void
sfxge_tx_qreap(struct sfxge_txq *txq)
{
	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);

	txq->reaped = txq->completed;
}
330 
/*
 * Hand the pending descriptor list (txq->pend_desc, txq->n_pend_desc
 * entries) to efx_tx_qdesc_post(), clear the list, and mark the queue
 * blocked if its fill level has reached the block level.
 *
 * Preconditions (asserted): TXQ lock held, at least one and at most
 * txq->max_pkt_desc pending descriptors, queue not currently blocked.
 */
static void
sfxge_tx_qlist_post(struct sfxge_txq *txq)
{
	unsigned int old_added __diagused;
	unsigned int block_level;
	unsigned int level;
	int rc __diagused;

	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);

	KASSERT(txq->n_pend_desc != 0, ("txq->n_pend_desc == 0"));
	KASSERT(txq->n_pend_desc <= txq->max_pkt_desc,
		("txq->n_pend_desc too large"));
	KASSERT(!txq->blocked, ("txq->blocked"));

	/* Remembered only to cross-check how far txq->added advances. */
	old_added = txq->added;

	/* Post the fragment list. */
	rc = efx_tx_qdesc_post(txq->common, txq->pend_desc, txq->n_pend_desc,
			  txq->reaped, &txq->added);
	KASSERT(rc == 0, ("efx_tx_qdesc_post() failed"));

	/* If efx_tx_qdesc_post() had to refragment, our information about
	 * buffers to free may be associated with the wrong
	 * descriptors.
	 */
	KASSERT(txq->added - old_added == txq->n_pend_desc,
		("efx_tx_qdesc_post() refragmented descriptors"));

	/* Fill level as seen by the producer: added but not yet reaped. */
	level = txq->added - txq->reaped;
	KASSERT(level <= txq->entries, ("overfilled TX queue"));

	/* Clear the fragment list. */
	txq->n_pend_desc = 0;

	/*
	 * Set the block level to ensure there is space to generate a
	 * large number of descriptors for TSO.
	 */
	block_level = EFX_TXQ_LIMIT(txq->entries) - txq->max_pkt_desc;

	/* Have we reached the block level? */
	if (level < block_level)
		return;

	/* Reap, and check again */
	sfxge_tx_qreap(txq);
	level = txq->added - txq->reaped;
	if (level < block_level)
		return;

	txq->blocked = 1;

	/*
	 * Avoid a race with completion interrupt handling that could leave
	 * the queue blocked.
	 */
	mb();
	/*
	 * Re-check after publishing blocked = 1: completions that landed
	 * in between may already have freed enough space, in which case
	 * the flag is withdrawn again.
	 */
	sfxge_tx_qreap(txq);
	level = txq->added - txq->reaped;
	if (level < block_level) {
		mb();
		txq->blocked = 0;
	}
}
396 
/*
 * Map one packet for DMA, build its descriptors (optional checksum/VLAN
 * option descriptors followed by either TSO output or one descriptor per
 * DMA segment) and post them via sfxge_tx_qlist_post().
 *
 * Returns 0 on success; the mbuf is then recorded in the stmp entry of the
 * packet's last descriptor and is released by sfxge_tx_qcomplete().
 * On failure the mbuf is freed here, txq->drops is incremented and a
 * non-zero value is returned: EINTR if the queue is not started, the
 * bus_dmamap_load_mbuf_sg() error, or the negative value returned by
 * sfxge_tx_queue_tso().
 *
 * The queue must not be blocked (asserted).
 */
static int sfxge_tx_queue_mbuf(struct sfxge_txq *txq, struct mbuf *mbuf)
{
	bus_dmamap_t *used_map;
	bus_dmamap_t map;
	bus_dma_segment_t dma_seg[SFXGE_TX_MAPPING_MAX_SEG];
	unsigned int id;
	struct sfxge_tx_mapping *stmp;
	efx_desc_t *desc;
	int n_dma_seg;
	int rc;
	int i;
	int eop;
	uint16_t hw_cksum_flags_prev;
	uint16_t hw_vlan_tci_prev;
	int n_extra_descs;

	KASSERT(!txq->blocked, ("txq->blocked"));

#if SFXGE_TX_PARSE_EARLY
	/*
	 * If software TSO is used, we still need to copy packet header,
	 * even if we have already parsed it early before enqueue.
	 */
	if ((mbuf->m_pkthdr.csum_flags & CSUM_TSO) &&
	    (txq->tso_fw_assisted == 0))
		prefetch_read_many(mbuf->m_data);
#else
	/*
	 * Prefetch packet header since we need to parse it and extract
	 * IP ID, TCP sequence number and flags.
	 */
	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO)
		prefetch_read_many(mbuf->m_data);
#endif

	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED)) {
		rc = EINTR;
		goto reject;
	}

	/* Load the packet for DMA. */
	id = txq->added & txq->ptr_mask;
	stmp = &txq->stmp[id];
	rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag, stmp->map,
				     mbuf, dma_seg, &n_dma_seg, 0);
	if (rc == EFBIG) {
		/* Try again. */
		struct mbuf *new_mbuf = m_collapse(mbuf, M_NOWAIT,
						   SFXGE_TX_MAPPING_MAX_SEG);
		/* rc is still EFBIG here, so that is what gets returned. */
		if (new_mbuf == NULL)
			goto reject;
		++txq->collapses;
		mbuf = new_mbuf;
		rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag,
					     stmp->map, mbuf,
					     dma_seg, &n_dma_seg, 0);
	}
	if (rc != 0)
		goto reject;

	/* Make the packet visible to the hardware. */
	bus_dmamap_sync(txq->packet_dma_tag, stmp->map, BUS_DMASYNC_PREWRITE);

	/* Remember which ring entry's map was actually loaded. */
	used_map = &stmp->map;

	/* Saved so the reject_mapped path can undo option-descriptor state. */
	hw_cksum_flags_prev = txq->hw_cksum_flags;
	hw_vlan_tci_prev = txq->hw_vlan_tci;

	/*
	 * The order of option descriptors, which are used to leverage VLAN tag
	 * and checksum offloads, might be important. Changing checksum offload
	 * between VLAN option and packet descriptors probably does not work.
	 */
	n_extra_descs = sfxge_tx_maybe_toggle_cksum_offload(txq, mbuf, &stmp);
	n_extra_descs += sfxge_tx_maybe_insert_tag(txq, mbuf, &stmp);

	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) {
		rc = sfxge_tx_queue_tso(txq, mbuf, dma_seg, n_dma_seg,
					n_extra_descs);
		if (rc < 0)
			goto reject_mapped;
		/*
		 * A non-negative rc is used as a ring index: the entry just
		 * before it is taken as the packet's last stmp entry.
		 */
		stmp = &txq->stmp[(rc - 1) & txq->ptr_mask];
	} else {
		/* Add the mapping to the fragment list, and set flags
		 * for the buffer.
		 */

		i = 0;
		for (;;) {
			/* DMA descriptors follow the option descriptors. */
			desc = &txq->pend_desc[i + n_extra_descs];
			eop = (i == n_dma_seg - 1);
			efx_tx_qdesc_dma_create(txq->common,
						dma_seg[i].ds_addr,
						dma_seg[i].ds_len,
						eop,
						desc);
			if (eop)
				break;
			i++;
			sfxge_next_stmp(txq, &stmp);
		}
		txq->n_pend_desc = n_dma_seg + n_extra_descs;
	}

	/*
	 * If the mapping required more than one descriptor
	 * then we need to associate the DMA map with the last
	 * descriptor, not the first.
	 */
	if (used_map != &stmp->map) {
		/* Swap the maps so every ring entry still owns exactly one. */
		map = stmp->map;
		stmp->map = *used_map;
		*used_map = map;
	}

	stmp->u.mbuf = mbuf;
	stmp->flags = TX_BUF_UNMAP | TX_BUF_MBUF;

	/* Post the fragment list. */
	sfxge_tx_qlist_post(txq);

	return (0);

reject_mapped:
	/*
	 * NOTE(review): the option-descriptor helpers above may have
	 * incremented txq->n_pend_desc, and it is not reset on this path.
	 * Confirm that sfxge_tx_queue_tso() (not visible here) leaves it
	 * consistent when it fails.
	 */
	txq->hw_vlan_tci = hw_vlan_tci_prev;
	txq->hw_cksum_flags = hw_cksum_flags_prev;
	bus_dmamap_unload(txq->packet_dma_tag, *used_map);
reject:
	/* Drop the packet on the floor. */
	m_freem(mbuf);
	++txq->drops;

	return (rc);
}
531 
532 /*
533  * Drain the deferred packet list into the transmit queue.
534  */
535 static void
536 sfxge_tx_qdpl_drain(struct sfxge_txq *txq)
537 {
538 	struct sfxge_softc *sc;
539 	struct sfxge_tx_dpl *stdp;
540 	struct mbuf *mbuf, *next;
541 	unsigned int count;
542 	unsigned int non_tcp_count;
543 	unsigned int pushed;
544 	int rc;
545 
546 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
547 
548 	sc = txq->sc;
549 	stdp = &txq->dpl;
550 	pushed = txq->added;
551 
552 	if (__predict_true(txq->init_state == SFXGE_TXQ_STARTED)) {
553 		prefetch_read_many(sc->enp);
554 		prefetch_read_many(txq->common);
555 	}
556 
557 	mbuf = stdp->std_get;
558 	count = stdp->std_get_count;
559 	non_tcp_count = stdp->std_get_non_tcp_count;
560 
561 	if (count > stdp->std_get_hiwat)
562 		stdp->std_get_hiwat = count;
563 
564 	while (count != 0) {
565 		KASSERT(mbuf != NULL, ("mbuf == NULL"));
566 
567 		next = mbuf->m_nextpkt;
568 		mbuf->m_nextpkt = NULL;
569 
570 		ETHER_BPF_MTAP(sc->ifnet, mbuf); /* packet capture */
571 
572 		if (next != NULL)
573 			prefetch_read_many(next);
574 
575 		rc = sfxge_tx_queue_mbuf(txq, mbuf);
576 		--count;
577 		non_tcp_count -= sfxge_is_mbuf_non_tcp(mbuf);
578 		mbuf = next;
579 		if (rc != 0)
580 			continue;
581 
582 		if (txq->blocked)
583 			break;
584 
585 		/* Push the fragments to the hardware in batches. */
586 		if (txq->added - pushed >= SFXGE_TX_BATCH) {
587 			efx_tx_qpush(txq->common, txq->added, pushed);
588 			pushed = txq->added;
589 		}
590 	}
591 
592 	if (count == 0) {
593 		KASSERT(mbuf == NULL, ("mbuf != NULL"));
594 		KASSERT(non_tcp_count == 0,
595 			("inconsistent TCP/non-TCP detection"));
596 		stdp->std_get = NULL;
597 		stdp->std_get_count = 0;
598 		stdp->std_get_non_tcp_count = 0;
599 		stdp->std_getp = &stdp->std_get;
600 	} else {
601 		stdp->std_get = mbuf;
602 		stdp->std_get_count = count;
603 		stdp->std_get_non_tcp_count = non_tcp_count;
604 	}
605 
606 	if (txq->added != pushed)
607 		efx_tx_qpush(txq->common, txq->added, pushed);
608 
609 	KASSERT(txq->blocked || stdp->std_get_count == 0,
610 		("queue unblocked but count is non-zero"));
611 }
612 
613 #define	SFXGE_TX_QDPL_PENDING(_txq)	((_txq)->dpl.std_put != 0)
614 
615 /*
616  * Service the deferred packet list.
617  *
618  * NOTE: drops the txq mutex!
619  */
620 static void
621 sfxge_tx_qdpl_service(struct sfxge_txq *txq)
622 {
623 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
624 
625 	do {
626 		if (SFXGE_TX_QDPL_PENDING(txq))
627 			sfxge_tx_qdpl_swizzle(txq);
628 
629 		if (!txq->blocked)
630 			sfxge_tx_qdpl_drain(txq);
631 
632 		SFXGE_TXQ_UNLOCK(txq);
633 	} while (SFXGE_TX_QDPL_PENDING(txq) &&
634 		 SFXGE_TXQ_TRYLOCK(txq));
635 }
636 
637 /*
638  * Put a packet on the deferred packet get-list.
639  */
640 static int
641 sfxge_tx_qdpl_put_locked(struct sfxge_txq *txq, struct mbuf *mbuf)
642 {
643 	struct sfxge_tx_dpl *stdp;
644 
645 	stdp = &txq->dpl;
646 
647 	KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
648 
649 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
650 
651 	if (stdp->std_get_count >= stdp->std_get_max) {
652 		txq->get_overflow++;
653 		return (ENOBUFS);
654 	}
655 	if (sfxge_is_mbuf_non_tcp(mbuf)) {
656 		if (stdp->std_get_non_tcp_count >=
657 		    stdp->std_get_non_tcp_max) {
658 			txq->get_non_tcp_overflow++;
659 			return (ENOBUFS);
660 		}
661 		stdp->std_get_non_tcp_count++;
662 	}
663 
664 	*(stdp->std_getp) = mbuf;
665 	stdp->std_getp = &mbuf->m_nextpkt;
666 	stdp->std_get_count++;
667 
668 	return (0);
669 }
670 
671 /*
672  * Put a packet on the deferred packet put-list.
673  *
674  * We overload the csum_data field in the mbuf to keep track of this length
675  * because there is no cheap alternative to avoid races.
676  */
677 static int
678 sfxge_tx_qdpl_put_unlocked(struct sfxge_txq *txq, struct mbuf *mbuf)
679 {
680 	struct sfxge_tx_dpl *stdp;
681 	volatile uintptr_t *putp;
682 	uintptr_t old;
683 	uintptr_t new;
684 	unsigned int put_count;
685 
686 	KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
687 
688 	SFXGE_TXQ_LOCK_ASSERT_NOTOWNED(txq);
689 
690 	stdp = &txq->dpl;
691 	putp = &stdp->std_put;
692 	new = (uintptr_t)mbuf;
693 
694 	do {
695 		old = *putp;
696 		if (old != 0) {
697 			struct mbuf *mp = (struct mbuf *)old;
698 			put_count = mp->m_pkthdr.csum_data;
699 		} else
700 			put_count = 0;
701 		if (put_count >= stdp->std_put_max) {
702 			atomic_add_long(&txq->put_overflow, 1);
703 			return (ENOBUFS);
704 		}
705 		mbuf->m_pkthdr.csum_data = put_count + 1;
706 		mbuf->m_nextpkt = (void *)old;
707 	} while (atomic_cmpset_ptr(putp, old, new) == 0);
708 
709 	return (0);
710 }
711 
712 /*
713  * Called from if_transmit - will try to grab the txq lock and enqueue to the
714  * put list if it succeeds, otherwise try to push onto the defer list if space.
715  */
716 static int
717 sfxge_tx_packet_add(struct sfxge_txq *txq, struct mbuf *m)
718 {
719 	int rc;
720 
721 	if (!SFXGE_LINK_UP(txq->sc)) {
722 		atomic_add_long(&txq->netdown_drops, 1);
723 		return (ENETDOWN);
724 	}
725 
726 	/*
727 	 * Try to grab the txq lock.  If we are able to get the lock,
728 	 * the packet will be appended to the "get list" of the deferred
729 	 * packet list.  Otherwise, it will be pushed on the "put list".
730 	 */
731 	if (SFXGE_TXQ_TRYLOCK(txq)) {
732 		/* First swizzle put-list to get-list to keep order */
733 		sfxge_tx_qdpl_swizzle(txq);
734 
735 		rc = sfxge_tx_qdpl_put_locked(txq, m);
736 
737 		/* Try to service the list. */
738 		sfxge_tx_qdpl_service(txq);
739 		/* Lock has been dropped. */
740 	} else {
741 		rc = sfxge_tx_qdpl_put_unlocked(txq, m);
742 
743 		/*
744 		 * Try to grab the lock again.
745 		 *
746 		 * If we are able to get the lock, we need to process
747 		 * the deferred packet list.  If we are not able to get
748 		 * the lock, another thread is processing the list.
749 		 */
750 		if ((rc == 0) && SFXGE_TXQ_TRYLOCK(txq)) {
751 			sfxge_tx_qdpl_service(txq);
752 			/* Lock has been dropped. */
753 		}
754 	}
755 
756 	SFXGE_TXQ_LOCK_ASSERT_NOTOWNED(txq);
757 
758 	return (rc);
759 }
760 
761 static void
762 sfxge_tx_qdpl_flush(struct sfxge_txq *txq)
763 {
764 	struct sfxge_tx_dpl *stdp = &txq->dpl;
765 	struct mbuf *mbuf, *next;
766 
767 	SFXGE_TXQ_LOCK(txq);
768 
769 	sfxge_tx_qdpl_swizzle(txq);
770 	for (mbuf = stdp->std_get; mbuf != NULL; mbuf = next) {
771 		next = mbuf->m_nextpkt;
772 		m_freem(mbuf);
773 	}
774 	stdp->std_get = NULL;
775 	stdp->std_get_count = 0;
776 	stdp->std_get_non_tcp_count = 0;
777 	stdp->std_getp = &stdp->std_get;
778 
779 	SFXGE_TXQ_UNLOCK(txq);
780 }
781 
782 void
783 sfxge_if_qflush(if_t ifp)
784 {
785 	struct sfxge_softc *sc;
786 	unsigned int i;
787 
788 	sc = if_getsoftc(ifp);
789 
790 	for (i = 0; i < sc->txq_count; i++)
791 		sfxge_tx_qdpl_flush(sc->txq[i]);
792 }
793 
794 #if SFXGE_TX_PARSE_EARLY
795 
/* There is little space for user data in mbuf pkthdr, so we
 * use l*hlen fields which are not used by the driver otherwise
 * to store header offsets.
 * The fields are 8-bit, but it's ok, no header may be longer than 255 bytes.
 */

/* Network protocol (EtherType after any VLAN header), network byte order. */
#define TSO_MBUF_PROTO(_mbuf)    ((_mbuf)->m_pkthdr.PH_loc.sixteen[0])
/* We abuse l5hlen here because PH_loc can hold only 64 bits of data */
#define TSO_MBUF_FLAGS(_mbuf)    ((_mbuf)->m_pkthdr.l5hlen)
/* IPv4 ID as found in the header (0 for IPv6). */
#define TSO_MBUF_PACKETID(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[1])
/* TCP sequence number, host byte order. */
#define TSO_MBUF_SEQNUM(_mbuf)   ((_mbuf)->m_pkthdr.PH_loc.thirtytwo[1])

/*
 * Parse the headers of a TSO packet once and cache the results in the
 * mbuf packet header:
 *   l2hlen - offset of the network header
 *   l3hlen - offset of the TCP header
 *   l4hlen - offset of the end of the TCP header
 *   TSO_MBUF_PROTO / PACKETID / SEQNUM / FLAGS - see the macros above
 *
 * The packet is expected to be TCP over IPv4 or IPv6 (asserted only).
 */
static void sfxge_parse_tx_packet(struct mbuf *mbuf)
{
	struct ether_header *eh = mtod(mbuf, struct ether_header *);
	const struct tcphdr *th;
	struct tcphdr th_copy;

	/* Find network protocol and header */
	TSO_MBUF_PROTO(mbuf) = eh->ether_type;
	if (TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_VLAN)) {
		/* VLAN-tagged frame: the real EtherType follows the tag. */
		struct ether_vlan_header *veh =
			mtod(mbuf, struct ether_vlan_header *);
		TSO_MBUF_PROTO(mbuf) = veh->evl_proto;
		mbuf->m_pkthdr.l2hlen = sizeof(*veh);
	} else {
		mbuf->m_pkthdr.l2hlen = sizeof(*eh);
	}

	/* Find TCP header */
	if (TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_IP)) {
		const struct ip *iph = (const struct ip *)mtodo(mbuf, mbuf->m_pkthdr.l2hlen);

		KASSERT(iph->ip_p == IPPROTO_TCP,
			("TSO required on non-TCP packet"));
		/* ip_hl counts 32-bit words. */
		mbuf->m_pkthdr.l3hlen = mbuf->m_pkthdr.l2hlen + 4 * iph->ip_hl;
		TSO_MBUF_PACKETID(mbuf) = iph->ip_id;
	} else {
		KASSERT(TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_IPV6),
			("TSO required on non-IP packet"));
		KASSERT(((const struct ip6_hdr *)mtodo(mbuf, mbuf->m_pkthdr.l2hlen))->ip6_nxt ==
			IPPROTO_TCP,
			("TSO required on non-TCP packet"));
		/* Only the fixed IPv6 header is skipped. */
		mbuf->m_pkthdr.l3hlen = mbuf->m_pkthdr.l2hlen + sizeof(struct ip6_hdr);
		TSO_MBUF_PACKETID(mbuf) = 0;
	}

	KASSERT(mbuf->m_len >= mbuf->m_pkthdr.l3hlen,
		("network header is fragmented in mbuf"));

	/* We need TCP header including flags (window is the next) */
	if (mbuf->m_len < mbuf->m_pkthdr.l3hlen + offsetof(struct tcphdr, th_win)) {
		/* Needed fields are not all in the first mbuf: copy out. */
		m_copydata(mbuf, mbuf->m_pkthdr.l3hlen, sizeof(th_copy),
			   (caddr_t)&th_copy);
		th = &th_copy;
	} else {
		th = (const struct tcphdr *)mtodo(mbuf, mbuf->m_pkthdr.l3hlen);
	}

	/* th_off counts 32-bit words. */
	mbuf->m_pkthdr.l4hlen = mbuf->m_pkthdr.l3hlen + 4 * th->th_off;
	TSO_MBUF_SEQNUM(mbuf) = ntohl(th->th_seq);

	/* These flags must not be duplicated */
	/*
	 * RST should not be duplicated as well, but FreeBSD kernel
	 * generates TSO packets with RST flag. So, do not assert
	 * its absence.
	 */
	KASSERT(!(th->th_flags & (TH_URG | TH_SYN)),
		("incompatible TCP flag 0x%x on TSO packet",
		 th->th_flags & (TH_URG | TH_SYN)));
	TSO_MBUF_FLAGS(mbuf) = th->th_flags;
}
869 #endif
870 
/*
 * TX start -- called by the stack.
 *
 * Selects a TX queue for the packet and hands it to
 * sfxge_tx_packet_add().  Returns 0 on success; on failure the packet is
 * freed here and the error from sfxge_tx_packet_add() is returned.
 */
int
sfxge_if_transmit(if_t ifp, struct mbuf *m)
{
	struct sfxge_softc *sc;
	struct sfxge_txq *txq;
	int rc;

	sc = (struct sfxge_softc *)if_getsoftc(ifp);

	/*
	 * Transmit may be called when interface is up from the kernel
	 * point of view, but not yet up (in progress) from the driver
	 * point of view. I.e. link aggregation bring up.
	 * Transmit may be called when interface is up from the driver
	 * point of view, but already down from the kernel point of
	 * view. I.e. Rx when interface shutdown is in progress.
	 */
	KASSERT((if_getflags(ifp) & IFF_UP) || (sc->if_flags & IFF_UP),
		("interface not up"));

	/* Pick the desired transmit queue. */
	/*
	 * The first branch is taken when checksum offload can be toggled
	 * dynamically, or when the packet requests TCP/UDP checksum
	 * offload or TSO.
	 */
	if (sc->txq_dynamic_cksum_toggle_supported |
	    (m->m_pkthdr.csum_flags &
	     (CSUM_DELAY_DATA | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_TSO))) {
		int index = 0;

#ifdef RSS
		uint32_t bucket_id;

		/*
		 * Select a TX queue which matches the corresponding
		 * RX queue for the hash in order to assign both
		 * TX and RX parts of the flow to the same CPU
		 */
		/*
		 * NOTE(review): the divisor below must be non-zero;
		 * confirm that txq_count always exceeds
		 * SFXGE_TXQ_NTYPES - 1, including when dynamic checksum
		 * toggling is supported.
		 */
		if (rss_m2bucket(m, &bucket_id) == 0)
			index = bucket_id % (sc->txq_count - (SFXGE_TXQ_NTYPES - 1));
#else
		/* check if flowid is set */
		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
			uint32_t hash = m->m_pkthdr.flowid;
			uint32_t idx = hash % nitems(sc->rx_indir_table);

			index = sc->rx_indir_table[idx];
		}
#endif
#if SFXGE_TX_PARSE_EARLY
		/* Parse TSO headers now, before the packet is enqueued. */
		if (m->m_pkthdr.csum_flags & CSUM_TSO)
			sfxge_parse_tx_packet(m);
#endif
		/*
		 * Without dynamic toggling, skip past the lower-numbered
		 * queue types to reach the full-checksum queues.
		 */
		index += (sc->txq_dynamic_cksum_toggle_supported == B_FALSE) ?
			 SFXGE_TXQ_IP_TCP_UDP_CKSUM : 0;
		txq = sc->txq[index];
	} else if (m->m_pkthdr.csum_flags & CSUM_DELAY_IP) {
		/* IP header checksum offload only. */
		txq = sc->txq[SFXGE_TXQ_IP_CKSUM];
	} else {
		/* No checksum offload requested. */
		txq = sc->txq[SFXGE_TXQ_NON_CKSUM];
	}

	rc = sfxge_tx_packet_add(txq, m);
	/* The packet was not queued, so it has to be released here. */
	if (rc != 0)
		m_freem(m);

	return (rc);
}
938 
939 /*
940  * Software "TSO".  Not quite as good as doing it in hardware, but
941  * still faster than segmenting in the stack.
942  */
943 
/*
 * Working state for segmenting one TSO packet: the current output and
 * input positions, followed by values parsed from the packet headers
 * that stay fixed while the packet is processed.
 */
struct sfxge_tso_state {
	/* Output position */
	unsigned out_len;	/* Remaining length in current segment */
	unsigned seqnum;	/* Current sequence number */
	unsigned packet_space;	/* Remaining space in current packet */
	unsigned segs_space;	/* Remaining number of DMA segments
				   for the packet (FATSOv2 only) */

	/* Input position */
	uint64_t dma_addr;	/* DMA address of current position */
	unsigned in_len;	/* Remaining length in current mbuf */

	/* Per-packet values */
	const struct mbuf *mbuf; /* Input mbuf (head of chain) */
	u_short protocol;	/* Network protocol (after VLAN decap) */
	ssize_t nh_off;		/* Offset of network header */
	ssize_t tcph_off;	/* Offset of TCP header */
	unsigned header_len;	/* Number of bytes of header */
	unsigned seg_size;	/* TCP segment size */
	int fw_assisted;	/* Use FW-assisted TSO */
	u_short packet_id;	/* IPv4 packet ID from the original packet */
	uint8_t tcp_flags;	/* TCP flags */
	efx_desc_t header_desc; /* Precomputed header descriptor for
				 * FW-assisted TSO */
};
968 
969 #if !SFXGE_TX_PARSE_EARLY
970 static const struct ip *tso_iph(const struct sfxge_tso_state *tso)
971 {
972 	KASSERT(tso->protocol == htons(ETHERTYPE_IP),
973 		("tso_iph() in non-IPv4 state"));
974 	return (const struct ip *)(tso->mbuf->m_data + tso->nh_off);
975 }
976 
977 static __unused const struct ip6_hdr *tso_ip6h(const struct sfxge_tso_state *tso)
978 {
979 	KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
980 		("tso_ip6h() in non-IPv6 state"));
981 	return (const struct ip6_hdr *)(tso->mbuf->m_data + tso->nh_off);
982 }
983 
984 static const struct tcphdr *tso_tcph(const struct sfxge_tso_state *tso)
985 {
986 	return (const struct tcphdr *)(tso->mbuf->m_data + tso->tcph_off);
987 }
988 #endif
989 
/* Size of preallocated TSO header buffers.  Larger blocks must be
 * allocated from the heap.
 */
#define	TSOH_STD_SIZE	128

/* At most half the descriptors in the queue at any time will refer to
 * a TSO header buffer, since they must always be followed by a
 * payload descriptor referring to an mbuf.
 */
#define	TSOH_COUNT(_txq_entries)	((_txq_entries) / 2u)
/* Number of standard-size header buffers that fit in one page. */
#define	TSOH_PER_PAGE	(PAGE_SIZE / TSOH_STD_SIZE)
/* Number of pages needed to hold TSOH_COUNT() header buffers (rounded up). */
#define	TSOH_PAGE_COUNT(_txq_entries)	\
	howmany(TSOH_COUNT(_txq_entries), TSOH_PER_PAGE)
1003 
1004 static int tso_init(struct sfxge_txq *txq)
1005 {
1006 	struct sfxge_softc *sc = txq->sc;
1007 	unsigned int tsoh_page_count = TSOH_PAGE_COUNT(sc->txq_entries);
1008 	int i, rc;
1009 
1010 	/* Allocate TSO header buffers */
1011 	txq->tsoh_buffer = malloc(tsoh_page_count * sizeof(txq->tsoh_buffer[0]),
1012 				  M_SFXGE, M_WAITOK);
1013 
1014 	for (i = 0; i < tsoh_page_count; i++) {
1015 		rc = sfxge_dma_alloc(sc, PAGE_SIZE, &txq->tsoh_buffer[i]);
1016 		if (rc != 0)
1017 			goto fail;
1018 	}
1019 
1020 	return (0);
1021 
1022 fail:
1023 	while (i-- > 0)
1024 		sfxge_dma_free(&txq->tsoh_buffer[i]);
1025 	free(txq->tsoh_buffer, M_SFXGE);
1026 	txq->tsoh_buffer = NULL;
1027 	return (rc);
1028 }
1029 
1030 static void tso_fini(struct sfxge_txq *txq)
1031 {
1032 	int i;
1033 
1034 	if (txq->tsoh_buffer != NULL) {
1035 		for (i = 0; i < TSOH_PAGE_COUNT(txq->sc->txq_entries); i++)
1036 			sfxge_dma_free(&txq->tsoh_buffer[i]);
1037 		free(txq->tsoh_buffer, M_SFXGE);
1038 	}
1039 }
1040 
1041 static void tso_start(struct sfxge_txq *txq, struct sfxge_tso_state *tso,
1042 		      const bus_dma_segment_t *hdr_dma_seg,
1043 		      struct mbuf *mbuf)
1044 {
1045 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(txq->sc->enp);
1046 #if !SFXGE_TX_PARSE_EARLY
1047 	struct ether_header *eh = mtod(mbuf, struct ether_header *);
1048 	const struct tcphdr *th;
1049 	struct tcphdr th_copy;
1050 #endif
1051 
1052 	tso->fw_assisted = txq->tso_fw_assisted;
1053 	tso->mbuf = mbuf;
1054 
1055 	/* Find network protocol and header */
1056 #if !SFXGE_TX_PARSE_EARLY
1057 	tso->protocol = eh->ether_type;
1058 	if (tso->protocol == htons(ETHERTYPE_VLAN)) {
1059 		struct ether_vlan_header *veh =
1060 			mtod(mbuf, struct ether_vlan_header *);
1061 		tso->protocol = veh->evl_proto;
1062 		tso->nh_off = sizeof(*veh);
1063 	} else {
1064 		tso->nh_off = sizeof(*eh);
1065 	}
1066 #else
1067 	tso->protocol = TSO_MBUF_PROTO(mbuf);
1068 	tso->nh_off = mbuf->m_pkthdr.l2hlen;
1069 	tso->tcph_off = mbuf->m_pkthdr.l3hlen;
1070 	tso->packet_id = ntohs(TSO_MBUF_PACKETID(mbuf));
1071 #endif
1072 
1073 #if !SFXGE_TX_PARSE_EARLY
1074 	/* Find TCP header */
1075 	if (tso->protocol == htons(ETHERTYPE_IP)) {
1076 		KASSERT(tso_iph(tso)->ip_p == IPPROTO_TCP,
1077 			("TSO required on non-TCP packet"));
1078 		tso->tcph_off = tso->nh_off + 4 * tso_iph(tso)->ip_hl;
1079 		tso->packet_id = ntohs(tso_iph(tso)->ip_id);
1080 	} else {
1081 		KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
1082 			("TSO required on non-IP packet"));
1083 		KASSERT(tso_ip6h(tso)->ip6_nxt == IPPROTO_TCP,
1084 			("TSO required on non-TCP packet"));
1085 		tso->tcph_off = tso->nh_off + sizeof(struct ip6_hdr);
1086 		tso->packet_id = 0;
1087 	}
1088 #endif
1089 
1090 	if (tso->fw_assisted &&
1091 	    __predict_false(tso->tcph_off >
1092 			    encp->enc_tx_tso_tcp_header_offset_limit)) {
1093 		tso->fw_assisted = 0;
1094 	}
1095 
1096 #if !SFXGE_TX_PARSE_EARLY
1097 	KASSERT(mbuf->m_len >= tso->tcph_off,
1098 		("network header is fragmented in mbuf"));
1099 	/* We need TCP header including flags (window is the next) */
1100 	if (mbuf->m_len < tso->tcph_off + offsetof(struct tcphdr, th_win)) {
1101 		m_copydata(tso->mbuf, tso->tcph_off, sizeof(th_copy),
1102 			   (caddr_t)&th_copy);
1103 		th = &th_copy;
1104 	} else {
1105 		th = tso_tcph(tso);
1106 	}
1107 	tso->header_len = tso->tcph_off + 4 * th->th_off;
1108 #else
1109 	tso->header_len = mbuf->m_pkthdr.l4hlen;
1110 #endif
1111 	tso->seg_size = mbuf->m_pkthdr.tso_segsz;
1112 
1113 #if !SFXGE_TX_PARSE_EARLY
1114 	tso->seqnum = ntohl(th->th_seq);
1115 
1116 	/* These flags must not be duplicated */
1117 	/*
1118 	 * RST should not be duplicated as well, but FreeBSD kernel
1119 	 * generates TSO packets with RST flag. So, do not assert
1120 	 * its absence.
1121 	 */
1122 	KASSERT(!(th->th_flags & (TH_URG | TH_SYN)),
1123 		("incompatible TCP flag 0x%x on TSO packet",
1124 		 th->th_flags & (TH_URG | TH_SYN)));
1125 	tso->tcp_flags = th->th_flags;
1126 #else
1127 	tso->seqnum = TSO_MBUF_SEQNUM(mbuf);
1128 	tso->tcp_flags = TSO_MBUF_FLAGS(mbuf);
1129 #endif
1130 
1131 	tso->out_len = mbuf->m_pkthdr.len - tso->header_len;
1132 
1133 	if (tso->fw_assisted) {
1134 		if (hdr_dma_seg->ds_len >= tso->header_len)
1135 			efx_tx_qdesc_dma_create(txq->common,
1136 						hdr_dma_seg->ds_addr,
1137 						tso->header_len,
1138 						B_FALSE,
1139 						&tso->header_desc);
1140 		else
1141 			tso->fw_assisted = 0;
1142 	}
1143 }
1144 
1145 /*
1146  * tso_fill_packet_with_fragment - form descriptors for the current fragment
1147  *
1148  * Form descriptors for the current fragment, until we reach the end
1149  * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
1150  * space.
1151  */
1152 static void tso_fill_packet_with_fragment(struct sfxge_txq *txq,
1153 					  struct sfxge_tso_state *tso)
1154 {
1155 	efx_desc_t *desc;
1156 	int n;
1157 	uint64_t dma_addr = tso->dma_addr;
1158 	boolean_t eop;
1159 
1160 	if (tso->in_len == 0 || tso->packet_space == 0)
1161 		return;
1162 
1163 	KASSERT(tso->in_len > 0, ("TSO input length went negative"));
1164 	KASSERT(tso->packet_space > 0, ("TSO packet space went negative"));
1165 
1166 	if (tso->fw_assisted & SFXGE_FATSOV2) {
1167 		n = tso->in_len;
1168 		tso->out_len -= n;
1169 		tso->seqnum += n;
1170 		tso->in_len = 0;
1171 		if (n < tso->packet_space) {
1172 			tso->packet_space -= n;
1173 			tso->segs_space--;
1174 		} else {
1175 			tso->packet_space = tso->seg_size -
1176 			    (n - tso->packet_space) % tso->seg_size;
1177 			tso->segs_space =
1178 			    EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1 -
1179 			    (tso->packet_space != tso->seg_size);
1180 		}
1181 	} else {
1182 		n = min(tso->in_len, tso->packet_space);
1183 		tso->packet_space -= n;
1184 		tso->out_len -= n;
1185 		tso->dma_addr += n;
1186 		tso->in_len -= n;
1187 	}
1188 
1189 	/*
1190 	 * It is OK to use binary OR below to avoid extra branching
1191 	 * since all conditions may always be checked.
1192 	 */
1193 	eop = (tso->out_len == 0) | (tso->packet_space == 0) |
1194 	    (tso->segs_space == 0);
1195 
1196 	desc = &txq->pend_desc[txq->n_pend_desc++];
1197 	efx_tx_qdesc_dma_create(txq->common, dma_addr, n, eop, desc);
1198 }
1199 
1200 /* Callback from bus_dmamap_load() for long TSO headers. */
1201 static void tso_map_long_header(void *dma_addr_ret,
1202 				bus_dma_segment_t *segs, int nseg,
1203 				int error)
1204 {
1205 	*(uint64_t *)dma_addr_ret = ((__predict_true(error == 0) &&
1206 				      __predict_true(nseg == 1)) ?
1207 				     segs->ds_addr : 0);
1208 }
1209 
1210 /*
1211  * tso_start_new_packet - generate a new header and prepare for the new packet
1212  *
1213  * Generate a new header and prepare for the new packet.  Return 0 on
1214  * success, or an error code if failed to alloc header.
1215  */
1216 static int tso_start_new_packet(struct sfxge_txq *txq,
1217 				struct sfxge_tso_state *tso,
1218 				unsigned int *idp)
1219 {
1220 	unsigned int id = *idp;
1221 	struct tcphdr *tsoh_th;
1222 	unsigned ip_length;
1223 	caddr_t header;
1224 	uint64_t dma_addr;
1225 	bus_dmamap_t map;
1226 	efx_desc_t *desc;
1227 	int rc;
1228 
1229 	if (tso->fw_assisted) {
1230 		if (tso->fw_assisted & SFXGE_FATSOV2) {
1231 			/* Add 2 FATSOv2 option descriptors */
1232 			desc = &txq->pend_desc[txq->n_pend_desc];
1233 			efx_tx_qdesc_tso2_create(txq->common,
1234 						 tso->packet_id,
1235 						 0,
1236 						 tso->seqnum,
1237 						 tso->seg_size,
1238 						 desc,
1239 						 EFX_TX_FATSOV2_OPT_NDESCS);
1240 			desc += EFX_TX_FATSOV2_OPT_NDESCS;
1241 			txq->n_pend_desc += EFX_TX_FATSOV2_OPT_NDESCS;
1242 			KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1243 			id = (id + EFX_TX_FATSOV2_OPT_NDESCS) & txq->ptr_mask;
1244 
1245 			tso->segs_space =
1246 			    EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1;
1247 		} else {
1248 			uint8_t tcp_flags = tso->tcp_flags;
1249 
1250 			if (tso->out_len > tso->seg_size)
1251 				tcp_flags &= ~(TH_FIN | TH_PUSH);
1252 
1253 			/* Add FATSOv1 option descriptor */
1254 			desc = &txq->pend_desc[txq->n_pend_desc++];
1255 			efx_tx_qdesc_tso_create(txq->common,
1256 						tso->packet_id,
1257 						tso->seqnum,
1258 						tcp_flags,
1259 						desc++);
1260 			KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1261 			id = (id + 1) & txq->ptr_mask;
1262 
1263 			tso->seqnum += tso->seg_size;
1264 			tso->segs_space = UINT_MAX;
1265 		}
1266 
1267 		/* Header DMA descriptor */
1268 		*desc = tso->header_desc;
1269 		txq->n_pend_desc++;
1270 		KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1271 		id = (id + 1) & txq->ptr_mask;
1272 	} else {
1273 		/* Allocate a DMA-mapped header buffer. */
1274 		if (__predict_true(tso->header_len <= TSOH_STD_SIZE)) {
1275 			unsigned int page_index = (id / 2) / TSOH_PER_PAGE;
1276 			unsigned int buf_index = (id / 2) % TSOH_PER_PAGE;
1277 
1278 			header = (txq->tsoh_buffer[page_index].esm_base +
1279 				  buf_index * TSOH_STD_SIZE);
1280 			dma_addr = (txq->tsoh_buffer[page_index].esm_addr +
1281 				    buf_index * TSOH_STD_SIZE);
1282 			map = txq->tsoh_buffer[page_index].esm_map;
1283 
1284 			KASSERT(txq->stmp[id].flags == 0,
1285 				("stmp flags are not 0"));
1286 		} else {
1287 			struct sfxge_tx_mapping *stmp = &txq->stmp[id];
1288 
1289 			/* We cannot use bus_dmamem_alloc() as that may sleep */
1290 			header = malloc(tso->header_len, M_SFXGE, M_NOWAIT);
1291 			if (__predict_false(!header))
1292 				return (ENOMEM);
1293 			rc = bus_dmamap_load(txq->packet_dma_tag, stmp->map,
1294 					     header, tso->header_len,
1295 					     tso_map_long_header, &dma_addr,
1296 					     BUS_DMA_NOWAIT);
1297 			if (__predict_false(dma_addr == 0)) {
1298 				if (rc == 0) {
1299 					/* Succeeded but got >1 segment */
1300 					bus_dmamap_unload(txq->packet_dma_tag,
1301 							  stmp->map);
1302 					rc = EINVAL;
1303 				}
1304 				free(header, M_SFXGE);
1305 				return (rc);
1306 			}
1307 			map = stmp->map;
1308 
1309 			txq->tso_long_headers++;
1310 			stmp->u.heap_buf = header;
1311 			stmp->flags = TX_BUF_UNMAP;
1312 		}
1313 
1314 		tsoh_th = (struct tcphdr *)(header + tso->tcph_off);
1315 
1316 		/* Copy and update the headers. */
1317 		m_copydata(tso->mbuf, 0, tso->header_len, header);
1318 
1319 		tsoh_th->th_seq = htonl(tso->seqnum);
1320 		tso->seqnum += tso->seg_size;
1321 		if (tso->out_len > tso->seg_size) {
1322 			/* This packet will not finish the TSO burst. */
1323 			ip_length = tso->header_len - tso->nh_off + tso->seg_size;
1324 			tsoh_th->th_flags &= ~(TH_FIN | TH_PUSH);
1325 		} else {
1326 			/* This packet will be the last in the TSO burst. */
1327 			ip_length = tso->header_len - tso->nh_off + tso->out_len;
1328 		}
1329 
1330 		if (tso->protocol == htons(ETHERTYPE_IP)) {
1331 			struct ip *tsoh_iph = (struct ip *)(header + tso->nh_off);
1332 			tsoh_iph->ip_len = htons(ip_length);
1333 			/* XXX We should increment ip_id, but FreeBSD doesn't
1334 			 * currently allocate extra IDs for multiple segments.
1335 			 */
1336 		} else {
1337 			struct ip6_hdr *tsoh_iph =
1338 				(struct ip6_hdr *)(header + tso->nh_off);
1339 			tsoh_iph->ip6_plen = htons(ip_length - sizeof(*tsoh_iph));
1340 		}
1341 
1342 		/* Make the header visible to the hardware. */
1343 		bus_dmamap_sync(txq->packet_dma_tag, map, BUS_DMASYNC_PREWRITE);
1344 
1345 		/* Form a descriptor for this header. */
1346 		desc = &txq->pend_desc[txq->n_pend_desc++];
1347 		efx_tx_qdesc_dma_create(txq->common,
1348 					dma_addr,
1349 					tso->header_len,
1350 					0,
1351 					desc);
1352 		id = (id + 1) & txq->ptr_mask;
1353 
1354 		tso->segs_space = UINT_MAX;
1355 	}
1356 	tso->packet_space = tso->seg_size;
1357 	txq->tso_packets++;
1358 	*idp = id;
1359 
1360 	return (0);
1361 }
1362 
1363 static int
1364 sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
1365 		   const bus_dma_segment_t *dma_seg, int n_dma_seg,
1366 		   int n_extra_descs)
1367 {
1368 	struct sfxge_tso_state tso;
1369 	unsigned int id;
1370 	unsigned skipped = 0;
1371 
1372 	tso_start(txq, &tso, dma_seg, mbuf);
1373 
1374 	while (dma_seg->ds_len + skipped <= tso.header_len) {
1375 		skipped += dma_seg->ds_len;
1376 		--n_dma_seg;
1377 		KASSERT(n_dma_seg, ("no payload found in TSO packet"));
1378 		++dma_seg;
1379 	}
1380 	tso.in_len = dma_seg->ds_len - (tso.header_len - skipped);
1381 	tso.dma_addr = dma_seg->ds_addr + (tso.header_len - skipped);
1382 
1383 	id = (txq->added + n_extra_descs) & txq->ptr_mask;
1384 	if (__predict_false(tso_start_new_packet(txq, &tso, &id)))
1385 		return (-1);
1386 
1387 	while (1) {
1388 		tso_fill_packet_with_fragment(txq, &tso);
1389 		/* Exactly one DMA descriptor is added */
1390 		KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1391 		id = (id + 1) & txq->ptr_mask;
1392 
1393 		/* Move onto the next fragment? */
1394 		if (tso.in_len == 0) {
1395 			--n_dma_seg;
1396 			if (n_dma_seg == 0)
1397 				break;
1398 			++dma_seg;
1399 			tso.in_len = dma_seg->ds_len;
1400 			tso.dma_addr = dma_seg->ds_addr;
1401 		}
1402 
1403 		/* End of packet? */
1404 		if ((tso.packet_space == 0) | (tso.segs_space == 0)) {
1405 			unsigned int n_fatso_opt_desc =
1406 			    (tso.fw_assisted & SFXGE_FATSOV2) ?
1407 			    EFX_TX_FATSOV2_OPT_NDESCS :
1408 			    (tso.fw_assisted & SFXGE_FATSOV1) ? 1 : 0;
1409 
1410 			/* If the queue is now full due to tiny MSS,
1411 			 * or we can't create another header, discard
1412 			 * the remainder of the input mbuf but do not
1413 			 * roll back the work we have done.
1414 			 */
1415 			if (txq->n_pend_desc + n_fatso_opt_desc +
1416 			    1 /* header */ + n_dma_seg > txq->max_pkt_desc) {
1417 				txq->tso_pdrop_too_many++;
1418 				break;
1419 			}
1420 			if (__predict_false(tso_start_new_packet(txq, &tso,
1421 								 &id))) {
1422 				txq->tso_pdrop_no_rsrc++;
1423 				break;
1424 			}
1425 		}
1426 	}
1427 
1428 	txq->tso_bursts++;
1429 	return (id);
1430 }
1431 
1432 static void
1433 sfxge_tx_qunblock(struct sfxge_txq *txq)
1434 {
1435 	struct sfxge_softc *sc;
1436 	struct sfxge_evq *evq __diagused;
1437 
1438 	sc = txq->sc;
1439 	evq = sc->evq[txq->evq_index];
1440 
1441 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
1442 
1443 	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED))
1444 		return;
1445 
1446 	SFXGE_TXQ_LOCK(txq);
1447 
1448 	if (txq->blocked) {
1449 		unsigned int level;
1450 
1451 		level = txq->added - txq->completed;
1452 		if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries)) {
1453 			/* reaped must be in sync with blocked */
1454 			sfxge_tx_qreap(txq);
1455 			txq->blocked = 0;
1456 		}
1457 	}
1458 
1459 	sfxge_tx_qdpl_service(txq);
1460 	/* note: lock has been dropped */
1461 }
1462 
1463 void
1464 sfxge_tx_qflush_done(struct sfxge_txq *txq)
1465 {
1466 
1467 	txq->flush_state = SFXGE_FLUSH_DONE;
1468 }
1469 
1470 static void
1471 sfxge_tx_qstop(struct sfxge_softc *sc, unsigned int index)
1472 {
1473 	struct sfxge_txq *txq;
1474 	struct sfxge_evq *evq;
1475 	unsigned int count;
1476 
1477 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1478 
1479 	txq = sc->txq[index];
1480 	evq = sc->evq[txq->evq_index];
1481 
1482 	SFXGE_EVQ_LOCK(evq);
1483 	SFXGE_TXQ_LOCK(txq);
1484 
1485 	KASSERT(txq->init_state == SFXGE_TXQ_STARTED,
1486 	    ("txq->init_state != SFXGE_TXQ_STARTED"));
1487 
1488 	txq->init_state = SFXGE_TXQ_INITIALIZED;
1489 
1490 	if (txq->flush_state != SFXGE_FLUSH_DONE) {
1491 		txq->flush_state = SFXGE_FLUSH_PENDING;
1492 
1493 		SFXGE_EVQ_UNLOCK(evq);
1494 		SFXGE_TXQ_UNLOCK(txq);
1495 
1496 		/* Flush the transmit queue. */
1497 		if (efx_tx_qflush(txq->common) != 0) {
1498 			log(LOG_ERR, "%s: Flushing Tx queue %u failed\n",
1499 			    device_get_nameunit(sc->dev), index);
1500 			txq->flush_state = SFXGE_FLUSH_DONE;
1501 		} else {
1502 			count = 0;
1503 			do {
1504 				/* Spin for 100ms. */
1505 				DELAY(100000);
1506 				if (txq->flush_state != SFXGE_FLUSH_PENDING)
1507 					break;
1508 			} while (++count < 20);
1509 		}
1510 		SFXGE_EVQ_LOCK(evq);
1511 		SFXGE_TXQ_LOCK(txq);
1512 
1513 		KASSERT(txq->flush_state != SFXGE_FLUSH_FAILED,
1514 		    ("txq->flush_state == SFXGE_FLUSH_FAILED"));
1515 
1516 		if (txq->flush_state != SFXGE_FLUSH_DONE) {
1517 			/* Flush timeout */
1518 			log(LOG_ERR, "%s: Cannot flush Tx queue %u\n",
1519 			    device_get_nameunit(sc->dev), index);
1520 			txq->flush_state = SFXGE_FLUSH_DONE;
1521 		}
1522 	}
1523 
1524 	txq->blocked = 0;
1525 	txq->pending = txq->added;
1526 
1527 	sfxge_tx_qcomplete(txq, evq);
1528 	KASSERT(txq->completed == txq->added,
1529 	    ("txq->completed != txq->added"));
1530 
1531 	sfxge_tx_qreap(txq);
1532 	KASSERT(txq->reaped == txq->completed,
1533 	    ("txq->reaped != txq->completed"));
1534 
1535 	txq->added = 0;
1536 	txq->pending = 0;
1537 	txq->completed = 0;
1538 	txq->reaped = 0;
1539 
1540 	/* Destroy the common code transmit queue. */
1541 	efx_tx_qdestroy(txq->common);
1542 	txq->common = NULL;
1543 
1544 	efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
1545 	    EFX_TXQ_NBUFS(sc->txq_entries));
1546 
1547 	txq->hw_cksum_flags = 0;
1548 
1549 	SFXGE_EVQ_UNLOCK(evq);
1550 	SFXGE_TXQ_UNLOCK(txq);
1551 }
1552 
1553 /*
1554  * Estimate maximum number of Tx descriptors required for TSO packet.
1555  * With minimum MSS and maximum mbuf length we might need more (even
1556  * than a ring-ful of descriptors), but this should not happen in
1557  * practice except due to deliberate attack.  In that case we will
1558  * truncate the output at a packet boundary.
1559  */
1560 static unsigned int
1561 sfxge_tx_max_pkt_desc(const struct sfxge_softc *sc, enum sfxge_txq_type type,
1562 		      unsigned int tso_fw_assisted)
1563 {
1564 	/* One descriptor for every input fragment */
1565 	unsigned int max_descs = SFXGE_TX_MAPPING_MAX_SEG;
1566 	unsigned int sw_tso_max_descs;
1567 	unsigned int fa_tso_v1_max_descs = 0;
1568 	unsigned int fa_tso_v2_max_descs = 0;
1569 
1570 	/* Checksum offload Tx option descriptor may be required */
1571 	if (sc->txq_dynamic_cksum_toggle_supported)
1572 		max_descs++;
1573 
1574 	/* VLAN tagging Tx option descriptor may be required */
1575 	if (efx_nic_cfg_get(sc->enp)->enc_hw_tx_insert_vlan_enabled)
1576 		max_descs++;
1577 
1578 	if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM) {
1579 		/*
1580 		 * Plus header and payload descriptor for each output segment.
1581 		 * Minus one since header fragment is already counted.
1582 		 * Even if FATSO is used, we should be ready to fallback
1583 		 * to do it in the driver.
1584 		 */
1585 		sw_tso_max_descs = SFXGE_TSO_MAX_SEGS * 2 - 1;
1586 
1587 		/* FW assisted TSOv1 requires one more descriptor per segment
1588 		 * in comparison to SW TSO */
1589 		if (tso_fw_assisted & SFXGE_FATSOV1)
1590 			fa_tso_v1_max_descs =
1591 			    sw_tso_max_descs + SFXGE_TSO_MAX_SEGS;
1592 
1593 		/* FW assisted TSOv2 requires 3 (2 FATSO plus header) extra
1594 		 * descriptors per superframe limited by number of DMA fetches
1595 		 * per packet. The first packet header is already counted.
1596 		 */
1597 		if (tso_fw_assisted & SFXGE_FATSOV2) {
1598 			fa_tso_v2_max_descs =
1599 			    howmany(SFXGE_TX_MAPPING_MAX_SEG,
1600 				    EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1) *
1601 			    (EFX_TX_FATSOV2_OPT_NDESCS + 1) - 1;
1602 		}
1603 
1604 		max_descs += MAX(sw_tso_max_descs,
1605 				 MAX(fa_tso_v1_max_descs, fa_tso_v2_max_descs));
1606 	}
1607 
1608 	return (max_descs);
1609 }
1610 
1611 static int
1612 sfxge_tx_qstart(struct sfxge_softc *sc, unsigned int index)
1613 {
1614 	struct sfxge_txq *txq;
1615 	efsys_mem_t *esmp;
1616 	uint16_t flags;
1617 	unsigned int tso_fw_assisted;
1618 	unsigned int label;
1619 	struct sfxge_evq *evq;
1620 	unsigned int desc_index;
1621 	int rc;
1622 
1623 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1624 
1625 	txq = sc->txq[index];
1626 	esmp = &txq->mem;
1627 	evq = sc->evq[txq->evq_index];
1628 
1629 	KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
1630 	    ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
1631 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1632 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1633 
1634 	/* Program the buffer table. */
1635 	if ((rc = efx_sram_buf_tbl_set(sc->enp, txq->buf_base_id, esmp,
1636 	    EFX_TXQ_NBUFS(sc->txq_entries))) != 0)
1637 		return (rc);
1638 
1639 	/* Determine the kind of queue we are creating. */
1640 	tso_fw_assisted = 0;
1641 	switch (txq->type) {
1642 	case SFXGE_TXQ_NON_CKSUM:
1643 		flags = 0;
1644 		break;
1645 	case SFXGE_TXQ_IP_CKSUM:
1646 		flags = EFX_TXQ_CKSUM_IPV4;
1647 		break;
1648 	case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
1649 		flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
1650 		tso_fw_assisted = sc->tso_fw_assisted;
1651 		if (tso_fw_assisted & SFXGE_FATSOV2)
1652 			flags |= EFX_TXQ_FATSOV2;
1653 		break;
1654 	default:
1655 		KASSERT(0, ("Impossible TX queue"));
1656 		flags = 0;
1657 		break;
1658 	}
1659 
1660 	label = (sc->txq_dynamic_cksum_toggle_supported) ? 0 : txq->type;
1661 
1662 	/* Create the common code transmit queue. */
1663 	if ((rc = efx_tx_qcreate(sc->enp, index, label, esmp,
1664 	    sc->txq_entries, txq->buf_base_id, flags, evq->common,
1665 	    &txq->common, &desc_index)) != 0) {
1666 		/* Retry if no FATSOv2 resources, otherwise fail */
1667 		if ((rc != ENOSPC) || (~flags & EFX_TXQ_FATSOV2))
1668 			goto fail;
1669 
1670 		/* Looks like all FATSOv2 contexts are used */
1671 		flags &= ~EFX_TXQ_FATSOV2;
1672 		tso_fw_assisted &= ~SFXGE_FATSOV2;
1673 		if ((rc = efx_tx_qcreate(sc->enp, index, label, esmp,
1674 		    sc->txq_entries, txq->buf_base_id, flags, evq->common,
1675 		    &txq->common, &desc_index)) != 0)
1676 			goto fail;
1677 	}
1678 
1679 	/* Initialise queue descriptor indexes */
1680 	txq->added = txq->pending = txq->completed = txq->reaped = desc_index;
1681 
1682 	SFXGE_TXQ_LOCK(txq);
1683 
1684 	/* Enable the transmit queue. */
1685 	efx_tx_qenable(txq->common);
1686 
1687 	txq->init_state = SFXGE_TXQ_STARTED;
1688 	txq->flush_state = SFXGE_FLUSH_REQUIRED;
1689 	txq->tso_fw_assisted = tso_fw_assisted;
1690 
1691 	txq->max_pkt_desc = sfxge_tx_max_pkt_desc(sc, txq->type,
1692 						  tso_fw_assisted);
1693 
1694 	txq->hw_vlan_tci = 0;
1695 
1696 	txq->hw_cksum_flags = flags &
1697 			      (EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP);
1698 
1699 	SFXGE_TXQ_UNLOCK(txq);
1700 
1701 	return (0);
1702 
1703 fail:
1704 	efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
1705 	    EFX_TXQ_NBUFS(sc->txq_entries));
1706 	return (rc);
1707 }
1708 
1709 void
1710 sfxge_tx_stop(struct sfxge_softc *sc)
1711 {
1712 	int index;
1713 
1714 	index = sc->txq_count;
1715 	while (--index >= 0)
1716 		sfxge_tx_qstop(sc, index);
1717 
1718 	/* Tear down the transmit module */
1719 	efx_tx_fini(sc->enp);
1720 }
1721 
1722 int
1723 sfxge_tx_start(struct sfxge_softc *sc)
1724 {
1725 	int index;
1726 	int rc;
1727 
1728 	/* Initialize the common code transmit module. */
1729 	if ((rc = efx_tx_init(sc->enp)) != 0)
1730 		return (rc);
1731 
1732 	for (index = 0; index < sc->txq_count; index++) {
1733 		if ((rc = sfxge_tx_qstart(sc, index)) != 0)
1734 			goto fail;
1735 	}
1736 
1737 	return (0);
1738 
1739 fail:
1740 	while (--index >= 0)
1741 		sfxge_tx_qstop(sc, index);
1742 
1743 	efx_tx_fini(sc->enp);
1744 
1745 	return (rc);
1746 }
1747 
1748 static int
1749 sfxge_txq_stat_init(struct sfxge_txq *txq, struct sysctl_oid *txq_node)
1750 {
1751 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(txq->sc->dev);
1752 	struct sysctl_oid *stat_node;
1753 	unsigned int id;
1754 
1755 	stat_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
1756 	    "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Tx queue statistics");
1757 	if (stat_node == NULL)
1758 		return (ENOMEM);
1759 
1760 	for (id = 0; id < nitems(sfxge_tx_stats); id++) {
1761 		SYSCTL_ADD_ULONG(
1762 		    ctx, SYSCTL_CHILDREN(stat_node), OID_AUTO,
1763 		    sfxge_tx_stats[id].name, CTLFLAG_RD | CTLFLAG_STATS,
1764 		    (unsigned long *)((caddr_t)txq + sfxge_tx_stats[id].offset),
1765 		    "");
1766 	}
1767 
1768 	return (0);
1769 }
1770 
1771 /**
1772  * Destroy a transmit queue.
1773  */
1774 static void
1775 sfxge_tx_qfini(struct sfxge_softc *sc, unsigned int index)
1776 {
1777 	struct sfxge_txq *txq;
1778 	unsigned int nmaps;
1779 
1780 	txq = sc->txq[index];
1781 
1782 	KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
1783 	    ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
1784 
1785 	if (txq->type == SFXGE_TXQ_IP_TCP_UDP_CKSUM)
1786 		tso_fini(txq);
1787 
1788 	/* Free the context arrays. */
1789 	free(txq->pend_desc, M_SFXGE);
1790 	nmaps = sc->txq_entries;
1791 	while (nmaps-- != 0)
1792 		bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
1793 	free(txq->stmp, M_SFXGE);
1794 
1795 	/* Release DMA memory mapping. */
1796 	sfxge_dma_free(&txq->mem);
1797 
1798 	sc->txq[index] = NULL;
1799 
1800 	SFXGE_TXQ_LOCK_DESTROY(txq);
1801 
1802 	free(txq, M_SFXGE);
1803 }
1804 
1805 static int
1806 sfxge_tx_qinit(struct sfxge_softc *sc, unsigned int txq_index,
1807 	       enum sfxge_txq_type type, unsigned int evq_index)
1808 {
1809 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sc->enp);
1810 	char name[16];
1811 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1812 	struct sysctl_oid *txq_node;
1813 	struct sfxge_txq *txq;
1814 	struct sfxge_tx_dpl *stdp;
1815 	struct sysctl_oid *dpl_node;
1816 	efsys_mem_t *esmp;
1817 	unsigned int nmaps;
1818 	int rc;
1819 
1820 	txq = malloc(sizeof(struct sfxge_txq), M_SFXGE, M_ZERO | M_WAITOK);
1821 	txq->sc = sc;
1822 	txq->entries = sc->txq_entries;
1823 	txq->ptr_mask = txq->entries - 1;
1824 
1825 	sc->txq[txq_index] = txq;
1826 	esmp = &txq->mem;
1827 
1828 	/* Allocate and zero DMA space for the descriptor ring. */
1829 	if ((rc = sfxge_dma_alloc(sc, EFX_TXQ_SIZE(sc->txq_entries), esmp)) != 0)
1830 		return (rc);
1831 
1832 	/* Allocate buffer table entries. */
1833 	sfxge_sram_buf_tbl_alloc(sc, EFX_TXQ_NBUFS(sc->txq_entries),
1834 				 &txq->buf_base_id);
1835 
1836 	/* Create a DMA tag for packet mappings. */
1837 	if (bus_dma_tag_create(sc->parent_dma_tag, 1,
1838 	    encp->enc_tx_dma_desc_boundary,
1839 	    MIN(0x3FFFFFFFFFFFUL, BUS_SPACE_MAXADDR), BUS_SPACE_MAXADDR, NULL,
1840 	    NULL, 0x11000, SFXGE_TX_MAPPING_MAX_SEG,
1841 	    encp->enc_tx_dma_desc_size_max, 0, NULL, NULL,
1842 	    &txq->packet_dma_tag) != 0) {
1843 		device_printf(sc->dev, "Couldn't allocate txq DMA tag\n");
1844 		rc = ENOMEM;
1845 		goto fail;
1846 	}
1847 
1848 	/* Allocate pending descriptor array for batching writes. */
1849 	txq->pend_desc = malloc(sizeof(efx_desc_t) * sc->txq_entries,
1850 				M_SFXGE, M_ZERO | M_WAITOK);
1851 
1852 	/* Allocate and initialise mbuf DMA mapping array. */
1853 	txq->stmp = malloc(sizeof(struct sfxge_tx_mapping) * sc->txq_entries,
1854 	    M_SFXGE, M_ZERO | M_WAITOK);
1855 	for (nmaps = 0; nmaps < sc->txq_entries; nmaps++) {
1856 		rc = bus_dmamap_create(txq->packet_dma_tag, 0,
1857 				       &txq->stmp[nmaps].map);
1858 		if (rc != 0)
1859 			goto fail2;
1860 	}
1861 
1862 	snprintf(name, sizeof(name), "%u", txq_index);
1863 	txq_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->txqs_node),
1864 	    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
1865 	if (txq_node == NULL) {
1866 		rc = ENOMEM;
1867 		goto fail_txq_node;
1868 	}
1869 
1870 	if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM &&
1871 	    (rc = tso_init(txq)) != 0)
1872 		goto fail3;
1873 
1874 	/* Initialize the deferred packet list. */
1875 	stdp = &txq->dpl;
1876 	stdp->std_put_max = sfxge_tx_dpl_put_max;
1877 	stdp->std_get_max = sfxge_tx_dpl_get_max;
1878 	stdp->std_get_non_tcp_max = sfxge_tx_dpl_get_non_tcp_max;
1879 	stdp->std_getp = &stdp->std_get;
1880 
1881 	SFXGE_TXQ_LOCK_INIT(txq, device_get_nameunit(sc->dev), txq_index);
1882 
1883 	dpl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
1884 	    "dpl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
1885 	    "Deferred packet list statistics");
1886 	if (dpl_node == NULL) {
1887 		rc = ENOMEM;
1888 		goto fail_dpl_node;
1889 	}
1890 
1891 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1892 			"get_count", CTLFLAG_RD | CTLFLAG_STATS,
1893 			&stdp->std_get_count, 0, "");
1894 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1895 			"get_non_tcp_count", CTLFLAG_RD | CTLFLAG_STATS,
1896 			&stdp->std_get_non_tcp_count, 0, "");
1897 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1898 			"get_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
1899 			&stdp->std_get_hiwat, 0, "");
1900 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1901 			"put_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
1902 			&stdp->std_put_hiwat, 0, "");
1903 
1904 	rc = sfxge_txq_stat_init(txq, txq_node);
1905 	if (rc != 0)
1906 		goto fail_txq_stat_init;
1907 
1908 	txq->type = type;
1909 	txq->evq_index = evq_index;
1910 	txq->init_state = SFXGE_TXQ_INITIALIZED;
1911 
1912 	return (0);
1913 
1914 fail_txq_stat_init:
1915 fail_dpl_node:
1916 fail3:
1917 fail_txq_node:
1918 	free(txq->pend_desc, M_SFXGE);
1919 fail2:
1920 	while (nmaps-- != 0)
1921 		bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
1922 	free(txq->stmp, M_SFXGE);
1923 	bus_dma_tag_destroy(txq->packet_dma_tag);
1924 
1925 fail:
1926 	sfxge_dma_free(esmp);
1927 
1928 	return (rc);
1929 }
1930 
1931 static int
1932 sfxge_tx_stat_handler(SYSCTL_HANDLER_ARGS)
1933 {
1934 	struct sfxge_softc *sc = arg1;
1935 	unsigned int id = arg2;
1936 	unsigned long sum;
1937 	unsigned int index;
1938 
1939 	/* Sum across all TX queues */
1940 	sum = 0;
1941 	for (index = 0; index < sc->txq_count; index++)
1942 		sum += *(unsigned long *)((caddr_t)sc->txq[index] +
1943 					  sfxge_tx_stats[id].offset);
1944 
1945 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1946 }
1947 
1948 static void
1949 sfxge_tx_stat_init(struct sfxge_softc *sc)
1950 {
1951 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1952 	struct sysctl_oid_list *stat_list;
1953 	unsigned int id;
1954 
1955 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1956 
1957 	for (id = 0; id < nitems(sfxge_tx_stats); id++) {
1958 		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1959 		    sfxge_tx_stats[id].name,
1960 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1961 		    sc, id, sfxge_tx_stat_handler, "LU", "");
1962 	}
1963 }
1964 
1965 uint64_t
1966 sfxge_tx_get_drops(struct sfxge_softc *sc)
1967 {
1968 	unsigned int index;
1969 	uint64_t drops = 0;
1970 	struct sfxge_txq *txq;
1971 
1972 	/* Sum across all TX queues */
1973 	for (index = 0; index < sc->txq_count; index++) {
1974 		txq = sc->txq[index];
1975 		/*
1976 		 * In theory, txq->put_overflow and txq->netdown_drops
1977 		 * should use atomic operation and other should be
1978 		 * obtained under txq lock, but it is just statistics.
1979 		 */
1980 		drops += txq->drops + txq->get_overflow +
1981 			 txq->get_non_tcp_overflow +
1982 			 txq->put_overflow + txq->netdown_drops +
1983 			 txq->tso_pdrop_too_many + txq->tso_pdrop_no_rsrc;
1984 	}
1985 	return (drops);
1986 }
1987 
1988 void
1989 sfxge_tx_fini(struct sfxge_softc *sc)
1990 {
1991 	int index;
1992 
1993 	index = sc->txq_count;
1994 	while (--index >= 0)
1995 		sfxge_tx_qfini(sc, index);
1996 
1997 	sc->txq_count = 0;
1998 }
1999 
2000 int
2001 sfxge_tx_init(struct sfxge_softc *sc)
2002 {
2003 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sc->enp);
2004 	struct sfxge_intr *intr __diagused;
2005 	int index;
2006 	int rc;
2007 
2008 	intr = &sc->intr;
2009 
2010 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
2011 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
2012 
2013 	if (sfxge_tx_dpl_get_max <= 0) {
2014 		log(LOG_ERR, "%s=%d must be greater than 0",
2015 		    SFXGE_PARAM_TX_DPL_GET_MAX, sfxge_tx_dpl_get_max);
2016 		rc = EINVAL;
2017 		goto fail_tx_dpl_get_max;
2018 	}
2019 	if (sfxge_tx_dpl_get_non_tcp_max <= 0) {
2020 		log(LOG_ERR, "%s=%d must be greater than 0",
2021 		    SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX,
2022 		    sfxge_tx_dpl_get_non_tcp_max);
2023 		rc = EINVAL;
2024 		goto fail_tx_dpl_get_non_tcp_max;
2025 	}
2026 	if (sfxge_tx_dpl_put_max < 0) {
2027 		log(LOG_ERR, "%s=%d must be greater or equal to 0",
2028 		    SFXGE_PARAM_TX_DPL_PUT_MAX, sfxge_tx_dpl_put_max);
2029 		rc = EINVAL;
2030 		goto fail_tx_dpl_put_max;
2031 	}
2032 
2033 	sc->txq_count = SFXGE_EVQ0_N_TXQ(sc) - 1 + sc->intr.n_alloc;
2034 
2035 	sc->tso_fw_assisted = sfxge_tso_fw_assisted;
2036 	if ((~encp->enc_features & EFX_FEATURE_FW_ASSISTED_TSO) ||
2037 	    (!encp->enc_fw_assisted_tso_enabled))
2038 		sc->tso_fw_assisted &= ~SFXGE_FATSOV1;
2039 	if ((~encp->enc_features & EFX_FEATURE_FW_ASSISTED_TSO_V2) ||
2040 	    (!encp->enc_fw_assisted_tso_v2_enabled))
2041 		sc->tso_fw_assisted &= ~SFXGE_FATSOV2;
2042 
2043 	sc->txqs_node = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev),
2044 	    SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO,
2045 	    "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Tx queues");
2046 	if (sc->txqs_node == NULL) {
2047 		rc = ENOMEM;
2048 		goto fail_txq_node;
2049 	}
2050 
2051 	/* Initialize the transmit queues */
2052 	if (sc->txq_dynamic_cksum_toggle_supported == B_FALSE) {
2053 		if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NON_CKSUM,
2054 		    SFXGE_TXQ_NON_CKSUM, 0)) != 0)
2055 			goto fail;
2056 
2057 		if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_IP_CKSUM,
2058 		    SFXGE_TXQ_IP_CKSUM, 0)) != 0)
2059 			goto fail2;
2060 	}
2061 
2062 	for (index = 0;
2063 	     index < sc->txq_count - SFXGE_EVQ0_N_TXQ(sc) + 1;
2064 	     index++) {
2065 		if ((rc = sfxge_tx_qinit(sc, SFXGE_EVQ0_N_TXQ(sc) - 1 + index,
2066 		    SFXGE_TXQ_IP_TCP_UDP_CKSUM, index)) != 0)
2067 			goto fail3;
2068 	}
2069 
2070 	sfxge_tx_stat_init(sc);
2071 
2072 	return (0);
2073 
2074 fail3:
2075 	while (--index >= 0)
2076 		sfxge_tx_qfini(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index);
2077 
2078 	sfxge_tx_qfini(sc, SFXGE_TXQ_IP_CKSUM);
2079 
2080 fail2:
2081 	sfxge_tx_qfini(sc, SFXGE_TXQ_NON_CKSUM);
2082 
2083 fail:
2084 fail_txq_node:
2085 	sc->txq_count = 0;
2086 fail_tx_dpl_put_max:
2087 fail_tx_dpl_get_non_tcp_max:
2088 fail_tx_dpl_get_max:
2089 	return (rc);
2090 }
2091