xref: /freebsd/sys/dev/sfxge/sfxge_tx.c (revision 361e428888e630eb708c72cf31579a25ba5d4f03)
1 /*-
2  * Copyright (c) 2010-2015 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33 
34 /* Theory of operation:
35  *
36  * Tx queues allocation and mapping
37  *
38  * One Tx queue with checksum offload enabled is allocated per Rx channel
39  * (event queue).  In addition, two Tx queues (one without checksum offload
40  * and one with IP checksum offload only) are allocated and bound to event
41  * queue 0.  sfxge_txq_type is used as the Tx queue label.
42  *
43  * So, the mapping from event queue plus label to Tx queue index is:
44  *	if event queue index is 0, TxQ-index = TxQ-label (in [0..SFXGE_TXQ_NTYPES))
45  *	else TxQ-index = SFXGE_TXQ_NTYPES + EvQ-index - 1
46  * See sfxge_get_txq_by_label() in sfxge_ev.c.
47  */
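/*
 * Worked example (editor's illustration, not from the original source; it
 * assumes SFXGE_TXQ_NTYPES == 3 and four allocated event queues):
 *	EvQ 0: TxQ 0 = SFXGE_TXQ_NON_CKSUM, TxQ 1 = SFXGE_TXQ_IP_CKSUM,
 *	       TxQ 2 = SFXGE_TXQ_IP_TCP_UDP_CKSUM
 *	EvQ 1: TxQ 3 (checksum offload enabled)
 *	EvQ 2: TxQ 4 (checksum offload enabled)
 *	EvQ 3: TxQ 5 (checksum offload enabled)
 * which matches sc->txq_count = SFXGE_TXQ_NTYPES - 1 + sc->intr.n_alloc
 * computed in sfxge_tx_init() below.
 */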
48 
49 #include <sys/cdefs.h>
50 __FBSDID("$FreeBSD$");
51 
52 #include <sys/types.h>
53 #include <sys/mbuf.h>
54 #include <sys/smp.h>
55 #include <sys/socket.h>
56 #include <sys/sysctl.h>
57 #include <sys/syslog.h>
58 #include <sys/limits.h>
59 
60 #include <net/bpf.h>
61 #include <net/ethernet.h>
62 #include <net/if.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip6.h>
68 #include <netinet/tcp.h>
69 
70 #include "common/efx.h"
71 
72 #include "sfxge.h"
73 #include "sfxge_tx.h"
74 
75 
76 #define	SFXGE_PARAM_TX_DPL_GET_MAX	SFXGE_PARAM(tx_dpl_get_max)
77 static int sfxge_tx_dpl_get_max = SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT;
78 TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_MAX, &sfxge_tx_dpl_get_max);
79 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_max, CTLFLAG_RDTUN,
80 	   &sfxge_tx_dpl_get_max, 0,
81 	   "Maximum number of packets (of any type) in deferred packet get-list");
82 
83 #define	SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX \
84 	SFXGE_PARAM(tx_dpl_get_non_tcp_max)
85 static int sfxge_tx_dpl_get_non_tcp_max =
86 	SFXGE_TX_DPL_GET_NON_TCP_PKT_LIMIT_DEFAULT;
87 TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX, &sfxge_tx_dpl_get_non_tcp_max);
88 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_non_tcp_max, CTLFLAG_RDTUN,
89 	   &sfxge_tx_dpl_get_non_tcp_max, 0,
90 	   "Maximum number of non-TCP packets in deferred packet get-list");
91 
92 #define	SFXGE_PARAM_TX_DPL_PUT_MAX	SFXGE_PARAM(tx_dpl_put_max)
93 static int sfxge_tx_dpl_put_max = SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT;
94 TUNABLE_INT(SFXGE_PARAM_TX_DPL_PUT_MAX, &sfxge_tx_dpl_put_max);
95 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_put_max, CTLFLAG_RDTUN,
96 	   &sfxge_tx_dpl_put_max, 0,
97 	   "Maximum number of packets (of any type) in deferred packet put-list");
98 
99 #define	SFXGE_PARAM_TSO_FW_ASSISTED	SFXGE_PARAM(tso_fw_assisted)
100 static int sfxge_tso_fw_assisted = (SFXGE_FATSOV1 | SFXGE_FATSOV2);
101 TUNABLE_INT(SFXGE_PARAM_TSO_FW_ASSISTED, &sfxge_tso_fw_assisted);
102 SYSCTL_INT(_hw_sfxge, OID_AUTO, tso_fw_assisted, CTLFLAG_RDTUN,
103 	   &sfxge_tso_fw_assisted, 0,
104 	   "Bitmask of FW-assisted TSO versions allowed to be used if supported by NIC firmware");
105 
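/*
 * Usage sketch (editor's illustration): the tunables above are read-only at
 * run time (CTLFLAG_RDTUN), so they are normally set from loader.conf, e.g.
 * assuming SFXGE_PARAM() expands to the "hw.sfxge." prefix:
 *
 *	hw.sfxge.tx_dpl_get_max="4096"
 *	hw.sfxge.tso_fw_assisted="0"	# force software TSO
 */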
106 
107 static const struct {
108 	const char *name;
109 	size_t offset;
110 } sfxge_tx_stats[] = {
111 #define	SFXGE_TX_STAT(name, member) \
112 	{ #name, offsetof(struct sfxge_txq, member) }
113 	SFXGE_TX_STAT(tso_bursts, tso_bursts),
114 	SFXGE_TX_STAT(tso_packets, tso_packets),
115 	SFXGE_TX_STAT(tso_long_headers, tso_long_headers),
116 	SFXGE_TX_STAT(tso_pdrop_too_many, tso_pdrop_too_many),
117 	SFXGE_TX_STAT(tso_pdrop_no_rsrc, tso_pdrop_no_rsrc),
118 	SFXGE_TX_STAT(tx_collapses, collapses),
119 	SFXGE_TX_STAT(tx_drops, drops),
120 	SFXGE_TX_STAT(tx_get_overflow, get_overflow),
121 	SFXGE_TX_STAT(tx_get_non_tcp_overflow, get_non_tcp_overflow),
122 	SFXGE_TX_STAT(tx_put_overflow, put_overflow),
123 	SFXGE_TX_STAT(tx_netdown_drops, netdown_drops),
124 };
125 
126 
127 /* Forward declarations. */
128 static void sfxge_tx_qdpl_service(struct sfxge_txq *txq);
129 static void sfxge_tx_qlist_post(struct sfxge_txq *txq);
130 static void sfxge_tx_qunblock(struct sfxge_txq *txq);
131 static int sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
132 			      const bus_dma_segment_t *dma_seg, int n_dma_seg,
133 			      int vlan_tagged);
134 
135 static int
136 sfxge_tx_maybe_insert_tag(struct sfxge_txq *txq, struct mbuf *mbuf)
137 {
138 	uint16_t this_tag = ((mbuf->m_flags & M_VLANTAG) ?
139 			     mbuf->m_pkthdr.ether_vtag :
140 			     0);
141 
142 	if (this_tag == txq->hw_vlan_tci)
143 		return (0);
144 
145 	efx_tx_qdesc_vlantci_create(txq->common,
146 				    bswap16(this_tag),
147 				    &txq->pend_desc[0]);
148 	txq->n_pend_desc = 1;
149 	txq->hw_vlan_tci = this_tag;
150 	return (1);
151 }
152 
153 static inline void
154 sfxge_next_stmp(struct sfxge_txq *txq, struct sfxge_tx_mapping **pstmp)
155 {
156 	KASSERT((*pstmp)->flags == 0, ("stmp flags are not 0"));
157 	if (__predict_false(*pstmp ==
158 			    &txq->stmp[txq->ptr_mask]))
159 		*pstmp = &txq->stmp[0];
160 	else
161 		(*pstmp)++;
162 }
163 
164 
165 void
166 sfxge_tx_qcomplete(struct sfxge_txq *txq, struct sfxge_evq *evq)
167 {
168 	unsigned int completed;
169 
170 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
171 
172 	completed = txq->completed;
173 	while (completed != txq->pending) {
174 		struct sfxge_tx_mapping *stmp;
175 		unsigned int id;
176 
177 		id = completed++ & txq->ptr_mask;
178 
179 		stmp = &txq->stmp[id];
180 		if (stmp->flags & TX_BUF_UNMAP) {
181 			bus_dmamap_unload(txq->packet_dma_tag, stmp->map);
182 			if (stmp->flags & TX_BUF_MBUF) {
183 				struct mbuf *m = stmp->u.mbuf;
184 				do
185 					m = m_free(m);
186 				while (m != NULL);
187 			} else {
188 				free(stmp->u.heap_buf, M_SFXGE);
189 			}
190 			stmp->flags = 0;
191 		}
192 	}
193 	txq->completed = completed;
194 
195 	/* Check whether we need to unblock the queue. */
196 	mb();
197 	if (txq->blocked) {
198 		unsigned int level;
199 
200 		level = txq->added - txq->completed;
201 		if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries))
202 			sfxge_tx_qunblock(txq);
203 	}
204 }
205 
206 static unsigned int
207 sfxge_is_mbuf_non_tcp(struct mbuf *mbuf)
208 {
209 	/* Absence of TCP checksum flags does not mean that the packet is non-TCP,
210 	 * but it should be true if the user wants to achieve high throughput.
211 	 */
212 	return (!(mbuf->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)));
213 }
214 
215 /*
216  * Reorder the put list and append it to the get list.
217  */
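/*
 * Illustrative example (editor's note): producers prepend to the put list
 * with an atomic swap (see sfxge_tx_qdpl_put_unlocked()), so packets queued
 * in the order A, B, C sit on the put list as C -> B -> A.  The loop below
 * reverses this back to A -> B -> C before appending to the get-list tail,
 * preserving the original submission order.
 */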
218 static void
219 sfxge_tx_qdpl_swizzle(struct sfxge_txq *txq)
220 {
221 	struct sfxge_tx_dpl *stdp;
222 	struct mbuf *mbuf, *get_next, **get_tailp;
223 	volatile uintptr_t *putp;
224 	uintptr_t put;
225 	unsigned int count;
226 	unsigned int non_tcp_count;
227 
228 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
229 
230 	stdp = &txq->dpl;
231 
232 	/* Acquire the put list. */
233 	putp = &stdp->std_put;
234 	put = atomic_readandclear_ptr(putp);
235 	mbuf = (void *)put;
236 
237 	if (mbuf == NULL)
238 		return;
239 
240 	/* Reverse the put list. */
241 	get_tailp = &mbuf->m_nextpkt;
242 	get_next = NULL;
243 
244 	count = 0;
245 	non_tcp_count = 0;
246 	do {
247 		struct mbuf *put_next;
248 
249 		non_tcp_count += sfxge_is_mbuf_non_tcp(mbuf);
250 		put_next = mbuf->m_nextpkt;
251 		mbuf->m_nextpkt = get_next;
252 		get_next = mbuf;
253 		mbuf = put_next;
254 
255 		count++;
256 	} while (mbuf != NULL);
257 
258 	if (count > stdp->std_put_hiwat)
259 		stdp->std_put_hiwat = count;
260 
261 	/* Append the reversed put list to the get list. */
262 	KASSERT(*get_tailp == NULL, ("*get_tailp != NULL"));
263 	*stdp->std_getp = get_next;
264 	stdp->std_getp = get_tailp;
265 	stdp->std_get_count += count;
266 	stdp->std_get_non_tcp_count += non_tcp_count;
267 }
268 
269 static void
270 sfxge_tx_qreap(struct sfxge_txq *txq)
271 {
272 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
273 
274 	txq->reaped = txq->completed;
275 }
276 
277 static void
278 sfxge_tx_qlist_post(struct sfxge_txq *txq)
279 {
280 	unsigned int old_added;
281 	unsigned int block_level;
282 	unsigned int level;
283 	int rc;
284 
285 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
286 
287 	KASSERT(txq->n_pend_desc != 0, ("txq->n_pend_desc == 0"));
288 	KASSERT(txq->n_pend_desc <= txq->max_pkt_desc,
289 		("txq->n_pend_desc too large"));
290 	KASSERT(!txq->blocked, ("txq->blocked"));
291 
292 	old_added = txq->added;
293 
294 	/* Post the fragment list. */
295 	rc = efx_tx_qdesc_post(txq->common, txq->pend_desc, txq->n_pend_desc,
296 			  txq->reaped, &txq->added);
297 	KASSERT(rc == 0, ("efx_tx_qdesc_post() failed"));
298 
299 	/* If efx_tx_qdesc_post() had to refragment, our information about
300 	 * buffers to free may be associated with the wrong
301 	 * descriptors.
302 	 */
303 	KASSERT(txq->added - old_added == txq->n_pend_desc,
304 		("efx_tx_qdesc_post() refragmented descriptors"));
305 
306 	level = txq->added - txq->reaped;
307 	KASSERT(level <= txq->entries, ("overfilled TX queue"));
308 
309 	/* Clear the fragment list. */
310 	txq->n_pend_desc = 0;
311 
312 	/*
313 	 * Set the block level to ensure there is space to generate a
314 	 * large number of descriptors for TSO.
315 	 */
316 	block_level = EFX_TXQ_LIMIT(txq->entries) - txq->max_pkt_desc;
317 
318 	/* Have we reached the block level? */
319 	if (level < block_level)
320 		return;
321 
322 	/* Reap, and check again */
323 	sfxge_tx_qreap(txq);
324 	level = txq->added - txq->reaped;
325 	if (level < block_level)
326 		return;
327 
328 	txq->blocked = 1;
329 
330 	/*
331 	 * Avoid a race with completion interrupt handling that could leave
332 	 * the queue blocked.
333 	 */
334 	mb();
335 	sfxge_tx_qreap(txq);
336 	level = txq->added - txq->reaped;
337 	if (level < block_level) {
338 		mb();
339 		txq->blocked = 0;
340 	}
341 }
342 
343 static int sfxge_tx_queue_mbuf(struct sfxge_txq *txq, struct mbuf *mbuf)
344 {
345 	bus_dmamap_t *used_map;
346 	bus_dmamap_t map;
347 	bus_dma_segment_t dma_seg[SFXGE_TX_MAPPING_MAX_SEG];
348 	unsigned int id;
349 	struct sfxge_tx_mapping *stmp;
350 	efx_desc_t *desc;
351 	int n_dma_seg;
352 	int rc;
353 	int i;
354 	int eop;
355 	int vlan_tagged;
356 
357 	KASSERT(!txq->blocked, ("txq->blocked"));
358 
359 	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO)
360 		prefetch_read_many(mbuf->m_data);
361 
362 	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED)) {
363 		rc = EINTR;
364 		goto reject;
365 	}
366 
367 	/* Load the packet for DMA. */
368 	id = txq->added & txq->ptr_mask;
369 	stmp = &txq->stmp[id];
370 	rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag, stmp->map,
371 				     mbuf, dma_seg, &n_dma_seg, 0);
372 	if (rc == EFBIG) {
373 		/* Try again. */
374 		struct mbuf *new_mbuf = m_collapse(mbuf, M_NOWAIT,
375 						   SFXGE_TX_MAPPING_MAX_SEG);
376 		if (new_mbuf == NULL)
377 			goto reject;
378 		++txq->collapses;
379 		mbuf = new_mbuf;
380 		rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag,
381 					     stmp->map, mbuf,
382 					     dma_seg, &n_dma_seg, 0);
383 	}
384 	if (rc != 0)
385 		goto reject;
386 
387 	/* Make the packet visible to the hardware. */
388 	bus_dmamap_sync(txq->packet_dma_tag, stmp->map, BUS_DMASYNC_PREWRITE);
389 
390 	used_map = &stmp->map;
391 
392 	vlan_tagged = sfxge_tx_maybe_insert_tag(txq, mbuf);
393 	if (vlan_tagged) {
394 		sfxge_next_stmp(txq, &stmp);
395 	}
396 	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) {
397 		rc = sfxge_tx_queue_tso(txq, mbuf, dma_seg, n_dma_seg, vlan_tagged);
398 		if (rc < 0)
399 			goto reject_mapped;
400 		stmp = &txq->stmp[(rc - 1) & txq->ptr_mask];
401 	} else {
402 		/* Add the mapping to the fragment list, and set flags
403 		 * for the buffer.
404 		 */
405 
406 		i = 0;
407 		for (;;) {
408 			desc = &txq->pend_desc[i + vlan_tagged];
409 			eop = (i == n_dma_seg - 1);
410 			efx_tx_qdesc_dma_create(txq->common,
411 						dma_seg[i].ds_addr,
412 						dma_seg[i].ds_len,
413 						eop,
414 						desc);
415 			if (eop)
416 				break;
417 			i++;
418 			sfxge_next_stmp(txq, &stmp);
419 		}
420 		txq->n_pend_desc = n_dma_seg + vlan_tagged;
421 	}
422 
423 	/*
424 	 * If the mapping required more than one descriptor
425 	 * then we need to associate the DMA map with the last
426 	 * descriptor, not the first.
427 	 */
428 	if (used_map != &stmp->map) {
429 		map = stmp->map;
430 		stmp->map = *used_map;
431 		*used_map = map;
432 	}
433 
434 	stmp->u.mbuf = mbuf;
435 	stmp->flags = TX_BUF_UNMAP | TX_BUF_MBUF;
436 
437 	/* Post the fragment list. */
438 	sfxge_tx_qlist_post(txq);
439 
440 	return (0);
441 
442 reject_mapped:
443 	bus_dmamap_unload(txq->packet_dma_tag, *used_map);
444 reject:
445 	/* Drop the packet on the floor. */
446 	m_freem(mbuf);
447 	++txq->drops;
448 
449 	return (rc);
450 }
451 
452 /*
453  * Drain the deferred packet list into the transmit queue.
454  */
455 static void
456 sfxge_tx_qdpl_drain(struct sfxge_txq *txq)
457 {
458 	struct sfxge_softc *sc;
459 	struct sfxge_tx_dpl *stdp;
460 	struct mbuf *mbuf, *next;
461 	unsigned int count;
462 	unsigned int non_tcp_count;
463 	unsigned int pushed;
464 	int rc;
465 
466 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
467 
468 	sc = txq->sc;
469 	stdp = &txq->dpl;
470 	pushed = txq->added;
471 
472 	if (__predict_true(txq->init_state == SFXGE_TXQ_STARTED)) {
473 		prefetch_read_many(sc->enp);
474 		prefetch_read_many(txq->common);
475 	}
476 
477 	mbuf = stdp->std_get;
478 	count = stdp->std_get_count;
479 	non_tcp_count = stdp->std_get_non_tcp_count;
480 
481 	if (count > stdp->std_get_hiwat)
482 		stdp->std_get_hiwat = count;
483 
484 	while (count != 0) {
485 		KASSERT(mbuf != NULL, ("mbuf == NULL"));
486 
487 		next = mbuf->m_nextpkt;
488 		mbuf->m_nextpkt = NULL;
489 
490 		ETHER_BPF_MTAP(sc->ifnet, mbuf); /* packet capture */
491 
492 		if (next != NULL)
493 			prefetch_read_many(next);
494 
495 		rc = sfxge_tx_queue_mbuf(txq, mbuf);
496 		--count;
497 		non_tcp_count -= sfxge_is_mbuf_non_tcp(mbuf);
498 		mbuf = next;
499 		if (rc != 0)
500 			continue;
501 
502 		if (txq->blocked)
503 			break;
504 
505 		/* Push the fragments to the hardware in batches. */
506 		if (txq->added - pushed >= SFXGE_TX_BATCH) {
507 			efx_tx_qpush(txq->common, txq->added, pushed);
508 			pushed = txq->added;
509 		}
510 	}
511 
512 	if (count == 0) {
513 		KASSERT(mbuf == NULL, ("mbuf != NULL"));
514 		KASSERT(non_tcp_count == 0,
515 			("inconsistent TCP/non-TCP detection"));
516 		stdp->std_get = NULL;
517 		stdp->std_get_count = 0;
518 		stdp->std_get_non_tcp_count = 0;
519 		stdp->std_getp = &stdp->std_get;
520 	} else {
521 		stdp->std_get = mbuf;
522 		stdp->std_get_count = count;
523 		stdp->std_get_non_tcp_count = non_tcp_count;
524 	}
525 
526 	if (txq->added != pushed)
527 		efx_tx_qpush(txq->common, txq->added, pushed);
528 
529 	KASSERT(txq->blocked || stdp->std_get_count == 0,
530 		("queue unblocked but count is non-zero"));
531 }
532 
533 #define	SFXGE_TX_QDPL_PENDING(_txq)	((_txq)->dpl.std_put != 0)
534 
535 /*
536  * Service the deferred packet list.
537  *
538  * NOTE: drops the txq mutex!
539  */
540 static void
541 sfxge_tx_qdpl_service(struct sfxge_txq *txq)
542 {
543 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
544 
545 	do {
546 		if (SFXGE_TX_QDPL_PENDING(txq))
547 			sfxge_tx_qdpl_swizzle(txq);
548 
549 		if (!txq->blocked)
550 			sfxge_tx_qdpl_drain(txq);
551 
552 		SFXGE_TXQ_UNLOCK(txq);
553 	} while (SFXGE_TX_QDPL_PENDING(txq) &&
554 		 SFXGE_TXQ_TRYLOCK(txq));
555 }
556 
557 /*
558  * Put a packet on the deferred packet get-list.
559  */
560 static int
561 sfxge_tx_qdpl_put_locked(struct sfxge_txq *txq, struct mbuf *mbuf)
562 {
563 	struct sfxge_tx_dpl *stdp;
564 
565 	stdp = &txq->dpl;
566 
567 	KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
568 
569 	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
570 
571 	if (stdp->std_get_count >= stdp->std_get_max) {
572 		txq->get_overflow++;
573 		return (ENOBUFS);
574 	}
575 	if (sfxge_is_mbuf_non_tcp(mbuf)) {
576 		if (stdp->std_get_non_tcp_count >=
577 		    stdp->std_get_non_tcp_max) {
578 			txq->get_non_tcp_overflow++;
579 			return (ENOBUFS);
580 		}
581 		stdp->std_get_non_tcp_count++;
582 	}
583 
584 	*(stdp->std_getp) = mbuf;
585 	stdp->std_getp = &mbuf->m_nextpkt;
586 	stdp->std_get_count++;
587 
588 	return (0);
589 }
590 
591 /*
592  * Put a packet on the deferred packet put-list.
593  *
594  * We overload the csum_data field in the mbuf to keep track of the put-list
595  * length because there is no cheap alternative that avoids races.
596  */
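/*
 * Example (editor's illustration): after three consecutive unlocked puts
 * onto an empty put list, the head mbuf's csum_data is 3; the next put
 * would read that value, compare it against std_put_max, and store 4 in
 * its own csum_data if the limit has not been reached.
 */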
597 static int
598 sfxge_tx_qdpl_put_unlocked(struct sfxge_txq *txq, struct mbuf *mbuf)
599 {
600 	struct sfxge_tx_dpl *stdp;
601 	volatile uintptr_t *putp;
602 	uintptr_t old;
603 	uintptr_t new;
604 	unsigned old_len;
605 
606 	KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
607 
608 	SFXGE_TXQ_LOCK_ASSERT_NOTOWNED(txq);
609 
610 	stdp = &txq->dpl;
611 	putp = &stdp->std_put;
612 	new = (uintptr_t)mbuf;
613 
614 	do {
615 		old = *putp;
616 		if (old != 0) {
617 			struct mbuf *mp = (struct mbuf *)old;
618 			old_len = mp->m_pkthdr.csum_data;
619 		} else
620 			old_len = 0;
621 		if (old_len >= stdp->std_put_max) {
622 			atomic_add_long(&txq->put_overflow, 1);
623 			return (ENOBUFS);
624 		}
625 		mbuf->m_pkthdr.csum_data = old_len + 1;
626 		mbuf->m_nextpkt = (void *)old;
627 	} while (atomic_cmpset_ptr(putp, old, new) == 0);
628 
629 	return (0);
630 }
631 
632 /*
633  * Called from if_transmit - tries to grab the txq lock; on success the packet
634  * is appended to the get list, otherwise it is pushed onto the put list if space.
635  */
636 static int
637 sfxge_tx_packet_add(struct sfxge_txq *txq, struct mbuf *m)
638 {
639 	int rc;
640 
641 	if (!SFXGE_LINK_UP(txq->sc)) {
642 		atomic_add_long(&txq->netdown_drops, 1);
643 		return (ENETDOWN);
644 	}
645 
646 	/*
647 	 * Try to grab the txq lock.  If we are able to get the lock,
648 	 * the packet will be appended to the "get list" of the deferred
649 	 * packet list.  Otherwise, it will be pushed on the "put list".
650 	 */
651 	if (SFXGE_TXQ_TRYLOCK(txq)) {
652 		/* First swizzle put-list to get-list to keep order */
653 		sfxge_tx_qdpl_swizzle(txq);
654 
655 		rc = sfxge_tx_qdpl_put_locked(txq, m);
656 
657 		/* Try to service the list. */
658 		sfxge_tx_qdpl_service(txq);
659 		/* Lock has been dropped. */
660 	} else {
661 		rc = sfxge_tx_qdpl_put_unlocked(txq, m);
662 
663 		/*
664 		 * Try to grab the lock again.
665 		 *
666 		 * If we are able to get the lock, we need to process
667 		 * the deferred packet list.  If we are not able to get
668 		 * the lock, another thread is processing the list.
669 		 */
670 		if ((rc == 0) && SFXGE_TXQ_TRYLOCK(txq)) {
671 			sfxge_tx_qdpl_service(txq);
672 			/* Lock has been dropped. */
673 		}
674 	}
675 
676 	SFXGE_TXQ_LOCK_ASSERT_NOTOWNED(txq);
677 
678 	return (rc);
679 }
680 
681 static void
682 sfxge_tx_qdpl_flush(struct sfxge_txq *txq)
683 {
684 	struct sfxge_tx_dpl *stdp = &txq->dpl;
685 	struct mbuf *mbuf, *next;
686 
687 	SFXGE_TXQ_LOCK(txq);
688 
689 	sfxge_tx_qdpl_swizzle(txq);
690 	for (mbuf = stdp->std_get; mbuf != NULL; mbuf = next) {
691 		next = mbuf->m_nextpkt;
692 		m_freem(mbuf);
693 	}
694 	stdp->std_get = NULL;
695 	stdp->std_get_count = 0;
696 	stdp->std_get_non_tcp_count = 0;
697 	stdp->std_getp = &stdp->std_get;
698 
699 	SFXGE_TXQ_UNLOCK(txq);
700 }
701 
702 void
703 sfxge_if_qflush(struct ifnet *ifp)
704 {
705 	struct sfxge_softc *sc;
706 	unsigned int i;
707 
708 	sc = ifp->if_softc;
709 
710 	for (i = 0; i < sc->txq_count; i++)
711 		sfxge_tx_qdpl_flush(sc->txq[i]);
712 }
713 
714 #if SFXGE_TX_PARSE_EARLY
715 
716 /* There is little space for user data in the mbuf pkthdr, so we use the
717  * l*hlen fields, which are not otherwise used by the driver, to store
718  * header offsets.
719  * The fields are 8-bit, but that is OK: no header may be longer than 255 bytes.
720  */
721 
722 
723 #define TSO_MBUF_PROTO(_mbuf)    ((_mbuf)->m_pkthdr.PH_loc.sixteen[0])
724 /* We abuse l5hlen here because PH_loc can hold only 64 bits of data */
725 #define TSO_MBUF_FLAGS(_mbuf)    ((_mbuf)->m_pkthdr.l5hlen)
726 #define TSO_MBUF_PACKETID(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[1])
727 #define TSO_MBUF_SEQNUM(_mbuf)   ((_mbuf)->m_pkthdr.PH_loc.thirtytwo[1])
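/*
 * Example layout (editor's illustration) after sfxge_parse_tx_packet() for
 * an untagged IPv4/TCP frame with 20-byte IP and TCP headers:
 *	l2hlen = 14 (Ethernet header),
 *	l3hlen = 34 (offset of the TCP header = l2hlen + IP header),
 *	l4hlen = 54 (total header length = l3hlen + TCP header),
 *	TSO_MBUF_PROTO = htons(ETHERTYPE_IP).
 */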
728 
729 static void sfxge_parse_tx_packet(struct mbuf *mbuf)
730 {
731 	struct ether_header *eh = mtod(mbuf, struct ether_header *);
732 	const struct tcphdr *th;
733 	struct tcphdr th_copy;
734 
735 	/* Find network protocol and header */
736 	TSO_MBUF_PROTO(mbuf) = eh->ether_type;
737 	if (TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_VLAN)) {
738 		struct ether_vlan_header *veh =
739 			mtod(mbuf, struct ether_vlan_header *);
740 		TSO_MBUF_PROTO(mbuf) = veh->evl_proto;
741 		mbuf->m_pkthdr.l2hlen = sizeof(*veh);
742 	} else {
743 		mbuf->m_pkthdr.l2hlen = sizeof(*eh);
744 	}
745 
746 	/* Find TCP header */
747 	if (TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_IP)) {
748 		const struct ip *iph = (const struct ip *)mtodo(mbuf, mbuf->m_pkthdr.l2hlen);
749 
750 		KASSERT(iph->ip_p == IPPROTO_TCP,
751 			("TSO required on non-TCP packet"));
752 		mbuf->m_pkthdr.l3hlen = mbuf->m_pkthdr.l2hlen + 4 * iph->ip_hl;
753 		TSO_MBUF_PACKETID(mbuf) = iph->ip_id;
754 	} else {
755 		KASSERT(TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_IPV6),
756 			("TSO required on non-IP packet"));
757 		KASSERT(((const struct ip6_hdr *)mtodo(mbuf, mbuf->m_pkthdr.l2hlen))->ip6_nxt ==
758 			IPPROTO_TCP,
759 			("TSO required on non-TCP packet"));
760 		mbuf->m_pkthdr.l3hlen = mbuf->m_pkthdr.l2hlen + sizeof(struct ip6_hdr);
761 		TSO_MBUF_PACKETID(mbuf) = 0;
762 	}
763 
764 	KASSERT(mbuf->m_len >= mbuf->m_pkthdr.l3hlen,
765 		("network header is fragmented in mbuf"));
766 
767 	/* We need the TCP header including flags (th_win is the next field) */
768 	if (mbuf->m_len < mbuf->m_pkthdr.l3hlen + offsetof(struct tcphdr, th_win)) {
769 		m_copydata(mbuf, mbuf->m_pkthdr.l3hlen, sizeof(th_copy),
770 			   (caddr_t)&th_copy);
771 		th = &th_copy;
772 	} else {
773 		th = (const struct tcphdr *)mtodo(mbuf, mbuf->m_pkthdr.l3hlen);
774 	}
775 
776 	mbuf->m_pkthdr.l4hlen = mbuf->m_pkthdr.l3hlen + 4 * th->th_off;
777 	TSO_MBUF_SEQNUM(mbuf) = ntohl(th->th_seq);
778 
779 	/* These flags must not be duplicated */
780 	/*
781 	 * RST should not be duplicated either, but the FreeBSD kernel
782 	 * generates TSO packets with the RST flag set, so do not assert
783 	 * its absence.
784 	 */
785 	KASSERT(!(th->th_flags & (TH_URG | TH_SYN)),
786 		("incompatible TCP flag 0x%x on TSO packet",
787 		 th->th_flags & (TH_URG | TH_SYN)));
788 	TSO_MBUF_FLAGS(mbuf) = th->th_flags;
789 }
790 #endif
791 
792 /*
793  * TX start -- called by the stack.
794  */
795 int
796 sfxge_if_transmit(struct ifnet *ifp, struct mbuf *m)
797 {
798 	struct sfxge_softc *sc;
799 	struct sfxge_txq *txq;
800 	int rc;
801 
802 	sc = (struct sfxge_softc *)ifp->if_softc;
803 
804 	/*
805 	 * Transmit may be called when the interface is up from the kernel
806 	 * point of view, but not yet up (bring-up still in progress) from
807 	 * the driver point of view, e.g. during link aggregation bring-up.
808 	 * Transmit may also be called when the interface is up from the
809 	 * driver point of view, but already down from the kernel point of
810 	 * view, e.g. while interface shutdown is in progress.
811 	 */
812 	KASSERT((ifp->if_flags & IFF_UP) || (sc->if_flags & IFF_UP),
813 		("interface not up"));
814 
815 	/* Pick the desired transmit queue. */
816 	if (m->m_pkthdr.csum_flags &
817 	    (CSUM_DELAY_DATA | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_TSO)) {
818 		int index = 0;
819 
820 		/* check if flowid is set */
821 		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
822 			uint32_t hash = m->m_pkthdr.flowid;
823 
824 			index = sc->rx_indir_table[hash % SFXGE_RX_SCALE_MAX];
825 		}
826 #if SFXGE_TX_PARSE_EARLY
827 		if (m->m_pkthdr.csum_flags & CSUM_TSO)
828 			sfxge_parse_tx_packet(m);
829 #endif
830 		txq = sc->txq[SFXGE_TXQ_IP_TCP_UDP_CKSUM + index];
831 	} else if (m->m_pkthdr.csum_flags & CSUM_DELAY_IP) {
832 		txq = sc->txq[SFXGE_TXQ_IP_CKSUM];
833 	} else {
834 		txq = sc->txq[SFXGE_TXQ_NON_CKSUM];
835 	}
836 
837 	rc = sfxge_tx_packet_add(txq, m);
838 	if (rc != 0)
839 		m_freem(m);
840 
841 	return (rc);
842 }
843 
844 /*
845  * Software "TSO".  Not quite as good as doing it in hardware, but
846  * still faster than segmenting in the stack.
847  */
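/*
 * Worked example (editor's illustration): a TSO mbuf with 54 bytes of
 * headers, 4380 bytes of payload and tso_segsz = 1460 is emitted as three
 * packets.  For each packet, tso_start_new_packet() produces a header
 * (either a copied-and-updated software header or FATSO option descriptors
 * plus the original header), and tso_fill_packet_with_fragment() adds the
 * payload DMA descriptors until packet_space (or, for FATSOv2, the DMA
 * segment budget) is exhausted.
 */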
848 
849 struct sfxge_tso_state {
850 	/* Output position */
851 	unsigned out_len;	/* Remaining length in current segment */
852 	unsigned seqnum;	/* Current sequence number */
853 	unsigned packet_space;	/* Remaining space in current packet */
854 	unsigned segs_space;	/* Remaining number of DMA segments
855 				   for the packet (FATSOv2 only) */
856 
857 	/* Input position */
858 	uint64_t dma_addr;	/* DMA address of current position */
859 	unsigned in_len;	/* Remaining length in current mbuf */
860 
861 	const struct mbuf *mbuf; /* Input mbuf (head of chain) */
862 	u_short protocol;	/* Network protocol (after VLAN decap) */
863 	ssize_t nh_off;		/* Offset of network header */
864 	ssize_t tcph_off;	/* Offset of TCP header */
865 	unsigned header_len;	/* Number of bytes of header */
866 	unsigned seg_size;	/* TCP segment size */
867 	int fw_assisted;	/* Use FW-assisted TSO */
868 	u_short packet_id;	/* IPv4 packet ID from the original packet */
869 	uint8_t tcp_flags;	/* TCP flags */
870 	efx_desc_t header_desc; /* Precomputed header descriptor for
871 				 * FW-assisted TSO */
872 };
873 
874 #if !SFXGE_TX_PARSE_EARLY
875 static const struct ip *tso_iph(const struct sfxge_tso_state *tso)
876 {
877 	KASSERT(tso->protocol == htons(ETHERTYPE_IP),
878 		("tso_iph() in non-IPv4 state"));
879 	return (const struct ip *)(tso->mbuf->m_data + tso->nh_off);
880 }
881 
882 static __unused const struct ip6_hdr *tso_ip6h(const struct sfxge_tso_state *tso)
883 {
884 	KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
885 		("tso_ip6h() in non-IPv6 state"));
886 	return (const struct ip6_hdr *)(tso->mbuf->m_data + tso->nh_off);
887 }
888 
889 static const struct tcphdr *tso_tcph(const struct sfxge_tso_state *tso)
890 {
891 	return (const struct tcphdr *)(tso->mbuf->m_data + tso->tcph_off);
892 }
893 #endif
894 
895 
896 /* Size of preallocated TSO header buffers.  Larger blocks must be
897  * allocated from the heap.
898  */
899 #define	TSOH_STD_SIZE	128
900 
901 /* At most half the descriptors in the queue at any time will refer to
902  * a TSO header buffer, since they must always be followed by a
903  * payload descriptor referring to an mbuf.
904  */
905 #define	TSOH_COUNT(_txq_entries)	((_txq_entries) / 2u)
906 #define	TSOH_PER_PAGE	(PAGE_SIZE / TSOH_STD_SIZE)
907 #define	TSOH_PAGE_COUNT(_txq_entries)	\
908 	((TSOH_COUNT(_txq_entries) + TSOH_PER_PAGE - 1) / TSOH_PER_PAGE)
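/*
 * Worked example (editor's illustration, assuming 4 KiB pages): with a
 * 1024-entry Tx queue, TSOH_COUNT = 512 headers, TSOH_PER_PAGE = 32
 * (4096 / 128), so TSOH_PAGE_COUNT = 16 preallocated pages of headers.
 */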
909 
910 static int tso_init(struct sfxge_txq *txq)
911 {
912 	struct sfxge_softc *sc = txq->sc;
913 	unsigned int tsoh_page_count = TSOH_PAGE_COUNT(sc->txq_entries);
914 	int i, rc;
915 
916 	/* Allocate TSO header buffers */
917 	txq->tsoh_buffer = malloc(tsoh_page_count * sizeof(txq->tsoh_buffer[0]),
918 				  M_SFXGE, M_WAITOK);
919 
920 	for (i = 0; i < tsoh_page_count; i++) {
921 		rc = sfxge_dma_alloc(sc, PAGE_SIZE, &txq->tsoh_buffer[i]);
922 		if (rc != 0)
923 			goto fail;
924 	}
925 
926 	return (0);
927 
928 fail:
929 	while (i-- > 0)
930 		sfxge_dma_free(&txq->tsoh_buffer[i]);
931 	free(txq->tsoh_buffer, M_SFXGE);
932 	txq->tsoh_buffer = NULL;
933 	return (rc);
934 }
935 
936 static void tso_fini(struct sfxge_txq *txq)
937 {
938 	int i;
939 
940 	if (txq->tsoh_buffer != NULL) {
941 		for (i = 0; i < TSOH_PAGE_COUNT(txq->sc->txq_entries); i++)
942 			sfxge_dma_free(&txq->tsoh_buffer[i]);
943 		free(txq->tsoh_buffer, M_SFXGE);
944 	}
945 }
946 
947 static void tso_start(struct sfxge_txq *txq, struct sfxge_tso_state *tso,
948 		      const bus_dma_segment_t *hdr_dma_seg,
949 		      struct mbuf *mbuf)
950 {
951 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(txq->sc->enp);
952 #if !SFXGE_TX_PARSE_EARLY
953 	struct ether_header *eh = mtod(mbuf, struct ether_header *);
954 	const struct tcphdr *th;
955 	struct tcphdr th_copy;
956 #endif
957 
958 	tso->fw_assisted = txq->tso_fw_assisted;
959 	tso->mbuf = mbuf;
960 
961 	/* Find network protocol and header */
962 #if !SFXGE_TX_PARSE_EARLY
963 	tso->protocol = eh->ether_type;
964 	if (tso->protocol == htons(ETHERTYPE_VLAN)) {
965 		struct ether_vlan_header *veh =
966 			mtod(mbuf, struct ether_vlan_header *);
967 		tso->protocol = veh->evl_proto;
968 		tso->nh_off = sizeof(*veh);
969 	} else {
970 		tso->nh_off = sizeof(*eh);
971 	}
972 #else
973 	tso->protocol = TSO_MBUF_PROTO(mbuf);
974 	tso->nh_off = mbuf->m_pkthdr.l2hlen;
975 	tso->tcph_off = mbuf->m_pkthdr.l3hlen;
976 	tso->packet_id = TSO_MBUF_PACKETID(mbuf);
977 #endif
978 
979 #if !SFXGE_TX_PARSE_EARLY
980 	/* Find TCP header */
981 	if (tso->protocol == htons(ETHERTYPE_IP)) {
982 		KASSERT(tso_iph(tso)->ip_p == IPPROTO_TCP,
983 			("TSO required on non-TCP packet"));
984 		tso->tcph_off = tso->nh_off + 4 * tso_iph(tso)->ip_hl;
985 		tso->packet_id = tso_iph(tso)->ip_id;
986 	} else {
987 		KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
988 			("TSO required on non-IP packet"));
989 		KASSERT(tso_ip6h(tso)->ip6_nxt == IPPROTO_TCP,
990 			("TSO required on non-TCP packet"));
991 		tso->tcph_off = tso->nh_off + sizeof(struct ip6_hdr);
992 		tso->packet_id = 0;
993 	}
994 #endif
995 
996 
997 	if (tso->fw_assisted &&
998 	    __predict_false(tso->tcph_off >
999 			    encp->enc_tx_tso_tcp_header_offset_limit)) {
1000 		tso->fw_assisted = 0;
1001 	}
1002 
1003 
1004 #if !SFXGE_TX_PARSE_EARLY
1005 	KASSERT(mbuf->m_len >= tso->tcph_off,
1006 		("network header is fragmented in mbuf"));
1007 	/* We need the TCP header including flags (th_win is the next field) */
1008 	if (mbuf->m_len < tso->tcph_off + offsetof(struct tcphdr, th_win)) {
1009 		m_copydata(tso->mbuf, tso->tcph_off, sizeof(th_copy),
1010 			   (caddr_t)&th_copy);
1011 		th = &th_copy;
1012 	} else {
1013 		th = tso_tcph(tso);
1014 	}
1015 	tso->header_len = tso->tcph_off + 4 * th->th_off;
1016 #else
1017 	tso->header_len = mbuf->m_pkthdr.l4hlen;
1018 #endif
1019 	tso->seg_size = mbuf->m_pkthdr.tso_segsz;
1020 
1021 #if !SFXGE_TX_PARSE_EARLY
1022 	tso->seqnum = ntohl(th->th_seq);
1023 
1024 	/* These flags must not be duplicated */
1025 	/*
1026 	 * RST should not be duplicated either, but the FreeBSD kernel
1027 	 * generates TSO packets with the RST flag set, so do not assert
1028 	 * its absence.
1029 	 */
1030 	KASSERT(!(th->th_flags & (TH_URG | TH_SYN)),
1031 		("incompatible TCP flag 0x%x on TSO packet",
1032 		 th->th_flags & (TH_URG | TH_SYN)));
1033 	tso->tcp_flags = th->th_flags;
1034 #else
1035 	tso->seqnum = TSO_MBUF_SEQNUM(mbuf);
1036 	tso->tcp_flags = TSO_MBUF_FLAGS(mbuf);
1037 #endif
1038 
1039 	tso->out_len = mbuf->m_pkthdr.len - tso->header_len;
1040 
1041 	if (tso->fw_assisted) {
1042 		if (hdr_dma_seg->ds_len >= tso->header_len)
1043 			efx_tx_qdesc_dma_create(txq->common,
1044 						hdr_dma_seg->ds_addr,
1045 						tso->header_len,
1046 						B_FALSE,
1047 						&tso->header_desc);
1048 		else
1049 			tso->fw_assisted = 0;
1050 	}
1051 }
1052 
1053 /*
1054  * tso_fill_packet_with_fragment - form descriptors for the current fragment
1055  *
1056  * Form descriptors for the current fragment, until we reach the end
1057  * of the fragment or the end of the packet.
1059  */
1060 static void tso_fill_packet_with_fragment(struct sfxge_txq *txq,
1061 					  struct sfxge_tso_state *tso)
1062 {
1063 	efx_desc_t *desc;
1064 	int n;
1065 	uint64_t dma_addr = tso->dma_addr;
1066 	boolean_t eop;
1067 
1068 	if (tso->in_len == 0 || tso->packet_space == 0)
1069 		return;
1070 
1071 	KASSERT(tso->in_len > 0, ("TSO input length went negative"));
1072 	KASSERT(tso->packet_space > 0, ("TSO packet space went negative"));
1073 
1074 	if (tso->fw_assisted & SFXGE_FATSOV2) {
1075 		n = tso->in_len;
1076 		tso->out_len -= n;
1077 		tso->seqnum += n;
1078 		tso->in_len = 0;
1079 		if (n < tso->packet_space) {
1080 			tso->packet_space -= n;
1081 			tso->segs_space--;
1082 		} else {
1083 			tso->packet_space = tso->seg_size -
1084 			    (n - tso->packet_space) % tso->seg_size;
1085 			tso->segs_space =
1086 			    EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1 -
1087 			    (tso->packet_space != tso->seg_size);
1088 		}
1089 	} else {
1090 		n = min(tso->in_len, tso->packet_space);
1091 		tso->packet_space -= n;
1092 		tso->out_len -= n;
1093 		tso->dma_addr += n;
1094 		tso->in_len -= n;
1095 	}
1096 
1097 	/*
1098 	 * It is OK to use binary OR below to avoid extra branching
1099 	 * since all conditions may always be checked.
1100 	 */
1101 	eop = (tso->out_len == 0) | (tso->packet_space == 0) |
1102 	    (tso->segs_space == 0);
1103 
1104 	desc = &txq->pend_desc[txq->n_pend_desc++];
1105 	efx_tx_qdesc_dma_create(txq->common, dma_addr, n, eop, desc);
1106 }
1107 
1108 /* Callback from bus_dmamap_load() for long TSO headers. */
1109 static void tso_map_long_header(void *dma_addr_ret,
1110 				bus_dma_segment_t *segs, int nseg,
1111 				int error)
1112 {
1113 	*(uint64_t *)dma_addr_ret = ((__predict_true(error == 0) &&
1114 				      __predict_true(nseg == 1)) ?
1115 				     segs->ds_addr : 0);
1116 }
1117 
1118 /*
1119  * tso_start_new_packet - generate a new header and prepare for the new packet
1120  *
1121  * Generate a new header and prepare for the new packet.  Return 0 on
1122  * success, or an error code if we failed to allocate a header.
1123  */
1124 static int tso_start_new_packet(struct sfxge_txq *txq,
1125 				struct sfxge_tso_state *tso,
1126 				unsigned int *idp)
1127 {
1128 	unsigned int id = *idp;
1129 	struct tcphdr *tsoh_th;
1130 	unsigned ip_length;
1131 	caddr_t header;
1132 	uint64_t dma_addr;
1133 	bus_dmamap_t map;
1134 	efx_desc_t *desc;
1135 	int rc;
1136 
1137 	if (tso->fw_assisted) {
1138 		if (tso->fw_assisted & SFXGE_FATSOV2) {
1139 			/* Add 2 FATSOv2 option descriptors */
1140 			desc = &txq->pend_desc[txq->n_pend_desc];
1141 			efx_tx_qdesc_tso2_create(txq->common,
1142 						 tso->packet_id,
1143 						 tso->seqnum,
1144 						 tso->seg_size,
1145 						 desc,
1146 						 EFX_TX_FATSOV2_OPT_NDESCS);
1147 			desc += EFX_TX_FATSOV2_OPT_NDESCS;
1148 			txq->n_pend_desc += EFX_TX_FATSOV2_OPT_NDESCS;
1149 			KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1150 			id = (id + EFX_TX_FATSOV2_OPT_NDESCS) & txq->ptr_mask;
1151 
1152 			tso->segs_space =
1153 			    EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1;
1154 		} else {
1155 			uint8_t tcp_flags = tso->tcp_flags;
1156 
1157 			if (tso->out_len > tso->seg_size)
1158 				tcp_flags &= ~(TH_FIN | TH_PUSH);
1159 
1160 			/* Add FATSOv1 option descriptor */
1161 			desc = &txq->pend_desc[txq->n_pend_desc++];
1162 			efx_tx_qdesc_tso_create(txq->common,
1163 						tso->packet_id,
1164 						tso->seqnum,
1165 						tcp_flags,
1166 						desc++);
1167 			KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1168 			id = (id + 1) & txq->ptr_mask;
1169 
1170 			tso->seqnum += tso->seg_size;
1171 			tso->segs_space = UINT_MAX;
1172 		}
1173 
1174 		/* Header DMA descriptor */
1175 		*desc = tso->header_desc;
1176 		txq->n_pend_desc++;
1177 		KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1178 		id = (id + 1) & txq->ptr_mask;
1179 	} else {
1180 		/* Allocate a DMA-mapped header buffer. */
1181 		if (__predict_true(tso->header_len <= TSOH_STD_SIZE)) {
1182 			unsigned int page_index = (id / 2) / TSOH_PER_PAGE;
1183 			unsigned int buf_index = (id / 2) % TSOH_PER_PAGE;
1184 
1185 			header = (txq->tsoh_buffer[page_index].esm_base +
1186 				  buf_index * TSOH_STD_SIZE);
1187 			dma_addr = (txq->tsoh_buffer[page_index].esm_addr +
1188 				    buf_index * TSOH_STD_SIZE);
1189 			map = txq->tsoh_buffer[page_index].esm_map;
1190 
1191 			KASSERT(txq->stmp[id].flags == 0,
1192 				("stmp flags are not 0"));
1193 		} else {
1194 			struct sfxge_tx_mapping *stmp = &txq->stmp[id];
1195 
1196 			/* We cannot use bus_dmamem_alloc() as that may sleep */
1197 			header = malloc(tso->header_len, M_SFXGE, M_NOWAIT);
1198 			if (__predict_false(!header))
1199 				return (ENOMEM);
1200 			rc = bus_dmamap_load(txq->packet_dma_tag, stmp->map,
1201 					     header, tso->header_len,
1202 					     tso_map_long_header, &dma_addr,
1203 					     BUS_DMA_NOWAIT);
1204 			if (__predict_false(dma_addr == 0)) {
1205 				if (rc == 0) {
1206 					/* Succeeded but got >1 segment */
1207 					bus_dmamap_unload(txq->packet_dma_tag,
1208 							  stmp->map);
1209 					rc = EINVAL;
1210 				}
1211 				free(header, M_SFXGE);
1212 				return (rc);
1213 			}
1214 			map = stmp->map;
1215 
1216 			txq->tso_long_headers++;
1217 			stmp->u.heap_buf = header;
1218 			stmp->flags = TX_BUF_UNMAP;
1219 		}
1220 
1221 		tsoh_th = (struct tcphdr *)(header + tso->tcph_off);
1222 
1223 		/* Copy and update the headers. */
1224 		m_copydata(tso->mbuf, 0, tso->header_len, header);
1225 
1226 		tsoh_th->th_seq = htonl(tso->seqnum);
1227 		tso->seqnum += tso->seg_size;
1228 		if (tso->out_len > tso->seg_size) {
1229 			/* This packet will not finish the TSO burst. */
1230 			ip_length = tso->header_len - tso->nh_off + tso->seg_size;
1231 			tsoh_th->th_flags &= ~(TH_FIN | TH_PUSH);
1232 		} else {
1233 			/* This packet will be the last in the TSO burst. */
1234 			ip_length = tso->header_len - tso->nh_off + tso->out_len;
1235 		}
1236 
1237 		if (tso->protocol == htons(ETHERTYPE_IP)) {
1238 			struct ip *tsoh_iph = (struct ip *)(header + tso->nh_off);
1239 			tsoh_iph->ip_len = htons(ip_length);
1240 			/* XXX We should increment ip_id, but FreeBSD doesn't
1241 			 * currently allocate extra IDs for multiple segments.
1242 			 */
1243 		} else {
1244 			struct ip6_hdr *tsoh_iph =
1245 				(struct ip6_hdr *)(header + tso->nh_off);
1246 			tsoh_iph->ip6_plen = htons(ip_length - sizeof(*tsoh_iph));
1247 		}
1248 
1249 		/* Make the header visible to the hardware. */
1250 		bus_dmamap_sync(txq->packet_dma_tag, map, BUS_DMASYNC_PREWRITE);
1251 
1252 		/* Form a descriptor for this header. */
1253 		desc = &txq->pend_desc[txq->n_pend_desc++];
1254 		efx_tx_qdesc_dma_create(txq->common,
1255 					dma_addr,
1256 					tso->header_len,
1257 					0,
1258 					desc);
1259 		id = (id + 1) & txq->ptr_mask;
1260 
1261 		tso->segs_space = UINT_MAX;
1262 	}
1263 	tso->packet_space = tso->seg_size;
1264 	txq->tso_packets++;
1265 	*idp = id;
1266 
1267 	return (0);
1268 }
1269 
1270 static int
1271 sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
1272 		   const bus_dma_segment_t *dma_seg, int n_dma_seg,
1273 		   int vlan_tagged)
1274 {
1275 	struct sfxge_tso_state tso;
1276 	unsigned int id;
1277 	unsigned skipped = 0;
1278 
1279 	tso_start(txq, &tso, dma_seg, mbuf);
1280 
1281 	while (dma_seg->ds_len + skipped <= tso.header_len) {
1282 		skipped += dma_seg->ds_len;
1283 		--n_dma_seg;
1284 		KASSERT(n_dma_seg, ("no payload found in TSO packet"));
1285 		++dma_seg;
1286 	}
1287 	tso.in_len = dma_seg->ds_len - (tso.header_len - skipped);
1288 	tso.dma_addr = dma_seg->ds_addr + (tso.header_len - skipped);
1289 
1290 	id = (txq->added + vlan_tagged) & txq->ptr_mask;
1291 	if (__predict_false(tso_start_new_packet(txq, &tso, &id)))
1292 		return (-1);
1293 
1294 	while (1) {
1295 		tso_fill_packet_with_fragment(txq, &tso);
1296 		/* Exactly one DMA descriptor is added */
1297 		KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
1298 		id = (id + 1) & txq->ptr_mask;
1299 
1300 		/* Move onto the next fragment? */
1301 		if (tso.in_len == 0) {
1302 			--n_dma_seg;
1303 			if (n_dma_seg == 0)
1304 				break;
1305 			++dma_seg;
1306 			tso.in_len = dma_seg->ds_len;
1307 			tso.dma_addr = dma_seg->ds_addr;
1308 		}
1309 
1310 		/* End of packet? */
1311 		if ((tso.packet_space == 0) | (tso.segs_space == 0)) {
1312 			unsigned int n_fatso_opt_desc =
1313 			    (tso.fw_assisted & SFXGE_FATSOV2) ?
1314 			    EFX_TX_FATSOV2_OPT_NDESCS :
1315 			    (tso.fw_assisted & SFXGE_FATSOV1) ? 1 : 0;
1316 
1317 			/* If the queue is now full due to tiny MSS,
1318 			 * or we can't create another header, discard
1319 			 * the remainder of the input mbuf but do not
1320 			 * roll back the work we have done.
1321 			 */
1322 			if (txq->n_pend_desc + n_fatso_opt_desc +
1323 			    1 /* header */ + n_dma_seg > txq->max_pkt_desc) {
1324 				txq->tso_pdrop_too_many++;
1325 				break;
1326 			}
1327 			if (__predict_false(tso_start_new_packet(txq, &tso,
1328 								 &id))) {
1329 				txq->tso_pdrop_no_rsrc++;
1330 				break;
1331 			}
1332 		}
1333 	}
1334 
1335 	txq->tso_bursts++;
1336 	return (id);
1337 }
1338 
1339 static void
1340 sfxge_tx_qunblock(struct sfxge_txq *txq)
1341 {
1342 	struct sfxge_softc *sc;
1343 	struct sfxge_evq *evq;
1344 
1345 	sc = txq->sc;
1346 	evq = sc->evq[txq->evq_index];
1347 
1348 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
1349 
1350 	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED))
1351 		return;
1352 
1353 	SFXGE_TXQ_LOCK(txq);
1354 
1355 	if (txq->blocked) {
1356 		unsigned int level;
1357 
1358 		level = txq->added - txq->completed;
1359 		if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries)) {
1360 			/* reaped must be in sync with blocked */
1361 			sfxge_tx_qreap(txq);
1362 			txq->blocked = 0;
1363 		}
1364 	}
1365 
1366 	sfxge_tx_qdpl_service(txq);
1367 	/* note: lock has been dropped */
1368 }
1369 
1370 void
1371 sfxge_tx_qflush_done(struct sfxge_txq *txq)
1372 {
1373 
1374 	txq->flush_state = SFXGE_FLUSH_DONE;
1375 }
1376 
1377 static void
1378 sfxge_tx_qstop(struct sfxge_softc *sc, unsigned int index)
1379 {
1380 	struct sfxge_txq *txq;
1381 	struct sfxge_evq *evq;
1382 	unsigned int count;
1383 
1384 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1385 
1386 	txq = sc->txq[index];
1387 	evq = sc->evq[txq->evq_index];
1388 
1389 	SFXGE_EVQ_LOCK(evq);
1390 	SFXGE_TXQ_LOCK(txq);
1391 
1392 	KASSERT(txq->init_state == SFXGE_TXQ_STARTED,
1393 	    ("txq->init_state != SFXGE_TXQ_STARTED"));
1394 
1395 	txq->init_state = SFXGE_TXQ_INITIALIZED;
1396 
1397 	if (txq->flush_state != SFXGE_FLUSH_DONE) {
1398 		txq->flush_state = SFXGE_FLUSH_PENDING;
1399 
1400 		SFXGE_EVQ_UNLOCK(evq);
1401 		SFXGE_TXQ_UNLOCK(txq);
1402 
1403 		/* Flush the transmit queue. */
1404 		if (efx_tx_qflush(txq->common) != 0) {
1405 			log(LOG_ERR, "%s: Flushing Tx queue %u failed\n",
1406 			    device_get_nameunit(sc->dev), index);
1407 			txq->flush_state = SFXGE_FLUSH_DONE;
1408 		} else {
1409 			count = 0;
1410 			do {
1411 				/* Spin for 100ms. */
1412 				DELAY(100000);
1413 				if (txq->flush_state != SFXGE_FLUSH_PENDING)
1414 					break;
1415 			} while (++count < 20);
1416 		}
1417 		SFXGE_EVQ_LOCK(evq);
1418 		SFXGE_TXQ_LOCK(txq);
1419 
1420 		KASSERT(txq->flush_state != SFXGE_FLUSH_FAILED,
1421 		    ("txq->flush_state == SFXGE_FLUSH_FAILED"));
1422 
1423 		if (txq->flush_state != SFXGE_FLUSH_DONE) {
1424 			/* Flush timeout */
1425 			log(LOG_ERR, "%s: Cannot flush Tx queue %u\n",
1426 			    device_get_nameunit(sc->dev), index);
1427 			txq->flush_state = SFXGE_FLUSH_DONE;
1428 		}
1429 	}
1430 
1431 	txq->blocked = 0;
1432 	txq->pending = txq->added;
1433 
1434 	sfxge_tx_qcomplete(txq, evq);
1435 	KASSERT(txq->completed == txq->added,
1436 	    ("txq->completed != txq->added"));
1437 
1438 	sfxge_tx_qreap(txq);
1439 	KASSERT(txq->reaped == txq->completed,
1440 	    ("txq->reaped != txq->completed"));
1441 
1442 	txq->added = 0;
1443 	txq->pending = 0;
1444 	txq->completed = 0;
1445 	txq->reaped = 0;
1446 
1447 	/* Destroy the common code transmit queue. */
1448 	efx_tx_qdestroy(txq->common);
1449 	txq->common = NULL;
1450 
1451 	efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
1452 	    EFX_TXQ_NBUFS(sc->txq_entries));
1453 
1454 	SFXGE_EVQ_UNLOCK(evq);
1455 	SFXGE_TXQ_UNLOCK(txq);
1456 }
1457 
1458 /*
1459  * Estimate the maximum number of Tx descriptors required for a TSO packet.
1460  * With a minimum MSS and maximum mbuf length we might need more (even
1461  * more than a ring-ful of descriptors), but this should not happen in
1462  * practice except due to a deliberate attack.  In that case we will
1463  * truncate the output at a packet boundary.
1464  */
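/*
 * Worked example (editor's illustration with hypothetical values; the real
 * limits come from SFXGE_TX_MAPPING_MAX_SEG and SFXGE_TSO_MAX_SEGS in
 * sfxge_tx.h): if SFXGE_TX_MAPPING_MAX_SEG were 256 and SFXGE_TSO_MAX_SEGS
 * were 64, the software TSO bound would be 64 * 2 - 1 = 127 descriptors,
 * FATSOv1 would need 127 + 64 = 191, and (ignoring the FATSOv2 term) with
 * hardware VLAN insertion the estimate would be 256 + 1 + 191 = 448.
 */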
1465 static unsigned int
1466 sfxge_tx_max_pkt_desc(const struct sfxge_softc *sc, enum sfxge_txq_type type,
1467 		      unsigned int tso_fw_assisted)
1468 {
1469 	/* One descriptor for every input fragment */
1470 	unsigned int max_descs = SFXGE_TX_MAPPING_MAX_SEG;
1471 	unsigned int sw_tso_max_descs;
1472 	unsigned int fa_tso_v1_max_descs = 0;
1473 	unsigned int fa_tso_v2_max_descs = 0;
1474 
1475 	/* VLAN tagging Tx option descriptor may be required */
1476 	if (efx_nic_cfg_get(sc->enp)->enc_hw_tx_insert_vlan_enabled)
1477 		max_descs++;
1478 
1479 	if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM) {
1480 		/*
1481 		 * Plus header and payload descriptor for each output segment.
1482 		 * Minus one since header fragment is already counted.
1483 		 * Even if FATSO is used, we should be ready to fall back
1484 		 * to doing it in the driver.
1485 		 */
1486 		sw_tso_max_descs = SFXGE_TSO_MAX_SEGS * 2 - 1;
1487 
1488 		/* FW assisted TSOv1 requires one more descriptor per segment
1489 		 * in comparison to SW TSO */
1490 		if (tso_fw_assisted & SFXGE_FATSOV1)
1491 			fa_tso_v1_max_descs =
1492 			    sw_tso_max_descs + SFXGE_TSO_MAX_SEGS;
1493 
1494 		/* FW-assisted TSOv2 requires 3 extra descriptors (2 FATSO option
1495 		 * descriptors plus a header) per superframe, limited by the number
1496 		 * of DMA fetches per packet.  The first packet header is already counted.
1497 		 */
1498 		if (tso_fw_assisted & SFXGE_FATSOV2) {
1499 			fa_tso_v2_max_descs =
1500 			    howmany(SFXGE_TX_MAPPING_MAX_SEG,
1501 				    EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1) *
1502 			    (EFX_TX_FATSOV2_OPT_NDESCS + 1) - 1;
1503 		}
1504 
1505 		max_descs += MAX(sw_tso_max_descs,
1506 				 MAX(fa_tso_v1_max_descs, fa_tso_v2_max_descs));
1507 	}
1508 
1509 	return (max_descs);
1510 }
1511 
1512 static int
1513 sfxge_tx_qstart(struct sfxge_softc *sc, unsigned int index)
1514 {
1515 	struct sfxge_txq *txq;
1516 	efsys_mem_t *esmp;
1517 	uint16_t flags;
1518 	unsigned int tso_fw_assisted;
1519 	struct sfxge_evq *evq;
1520 	unsigned int desc_index;
1521 	int rc;
1522 
1523 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1524 
1525 	txq = sc->txq[index];
1526 	esmp = &txq->mem;
1527 	evq = sc->evq[txq->evq_index];
1528 
1529 	KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
1530 	    ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
1531 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1532 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1533 
1534 	/* Program the buffer table. */
1535 	if ((rc = efx_sram_buf_tbl_set(sc->enp, txq->buf_base_id, esmp,
1536 	    EFX_TXQ_NBUFS(sc->txq_entries))) != 0)
1537 		return (rc);
1538 
1539 	/* Determine the kind of queue we are creating. */
1540 	tso_fw_assisted = 0;
1541 	switch (txq->type) {
1542 	case SFXGE_TXQ_NON_CKSUM:
1543 		flags = 0;
1544 		break;
1545 	case SFXGE_TXQ_IP_CKSUM:
1546 		flags = EFX_TXQ_CKSUM_IPV4;
1547 		break;
1548 	case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
1549 		flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
1550 		tso_fw_assisted = sc->tso_fw_assisted;
1551 		if (tso_fw_assisted & SFXGE_FATSOV2)
1552 			flags |= EFX_TXQ_FATSOV2;
1553 		break;
1554 	default:
1555 		KASSERT(0, ("Impossible TX queue"));
1556 		flags = 0;
1557 		break;
1558 	}
1559 
1560 	/* Create the common code transmit queue. */
1561 	if ((rc = efx_tx_qcreate(sc->enp, index, txq->type, esmp,
1562 	    sc->txq_entries, txq->buf_base_id, flags, evq->common,
1563 	    &txq->common, &desc_index)) != 0) {
1564 		/* Retry if no FATSOv2 resources, otherwise fail */
1565 		if ((rc != ENOSPC) || (~flags & EFX_TXQ_FATSOV2))
1566 			goto fail;
1567 
1568 		/* Looks like all FATSOv2 contexts are used */
1569 		flags &= ~EFX_TXQ_FATSOV2;
1570 		tso_fw_assisted &= ~SFXGE_FATSOV2;
1571 		if ((rc = efx_tx_qcreate(sc->enp, index, txq->type, esmp,
1572 		    sc->txq_entries, txq->buf_base_id, flags, evq->common,
1573 		    &txq->common, &desc_index)) != 0)
1574 			goto fail;
1575 	}
1576 
1577 	/* Initialise queue descriptor indexes */
1578 	txq->added = txq->pending = txq->completed = txq->reaped = desc_index;
1579 
1580 	SFXGE_TXQ_LOCK(txq);
1581 
1582 	/* Enable the transmit queue. */
1583 	efx_tx_qenable(txq->common);
1584 
1585 	txq->init_state = SFXGE_TXQ_STARTED;
1586 	txq->flush_state = SFXGE_FLUSH_REQUIRED;
1587 	txq->tso_fw_assisted = tso_fw_assisted;
1588 
1589 	txq->max_pkt_desc = sfxge_tx_max_pkt_desc(sc, txq->type,
1590 						  tso_fw_assisted);
1591 
1592 	SFXGE_TXQ_UNLOCK(txq);
1593 
1594 	return (0);
1595 
1596 fail:
1597 	efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
1598 	    EFX_TXQ_NBUFS(sc->txq_entries));
1599 	return (rc);
1600 }
1601 
1602 void
1603 sfxge_tx_stop(struct sfxge_softc *sc)
1604 {
1605 	int index;
1606 
1607 	index = sc->txq_count;
1608 	while (--index >= 0)
1609 		sfxge_tx_qstop(sc, index);
1610 
1611 	/* Tear down the transmit module */
1612 	efx_tx_fini(sc->enp);
1613 }
1614 
1615 int
1616 sfxge_tx_start(struct sfxge_softc *sc)
1617 {
1618 	int index;
1619 	int rc;
1620 
1621 	/* Initialize the common code transmit module. */
1622 	if ((rc = efx_tx_init(sc->enp)) != 0)
1623 		return (rc);
1624 
1625 	for (index = 0; index < sc->txq_count; index++) {
1626 		if ((rc = sfxge_tx_qstart(sc, index)) != 0)
1627 			goto fail;
1628 	}
1629 
1630 	return (0);
1631 
1632 fail:
1633 	while (--index >= 0)
1634 		sfxge_tx_qstop(sc, index);
1635 
1636 	efx_tx_fini(sc->enp);
1637 
1638 	return (rc);
1639 }
1640 
1641 static int
1642 sfxge_txq_stat_init(struct sfxge_txq *txq, struct sysctl_oid *txq_node)
1643 {
1644 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(txq->sc->dev);
1645 	struct sysctl_oid *stat_node;
1646 	unsigned int id;
1647 
1648 	stat_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
1649 				    "stats", CTLFLAG_RD, NULL,
1650 				    "Tx queue statistics");
1651 	if (stat_node == NULL)
1652 		return (ENOMEM);
1653 
1654 	for (id = 0; id < nitems(sfxge_tx_stats); id++) {
1655 		SYSCTL_ADD_ULONG(
1656 		    ctx, SYSCTL_CHILDREN(stat_node), OID_AUTO,
1657 		    sfxge_tx_stats[id].name, CTLFLAG_RD | CTLFLAG_STATS,
1658 		    (unsigned long *)((caddr_t)txq + sfxge_tx_stats[id].offset),
1659 		    "");
1660 	}
1661 
1662 	return (0);
1663 }
1664 
1665 /**
1666  * Destroy a transmit queue.
1667  */
1668 static void
1669 sfxge_tx_qfini(struct sfxge_softc *sc, unsigned int index)
1670 {
1671 	struct sfxge_txq *txq;
1672 	unsigned int nmaps;
1673 
1674 	txq = sc->txq[index];
1675 
1676 	KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
1677 	    ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
1678 
1679 	if (txq->type == SFXGE_TXQ_IP_TCP_UDP_CKSUM)
1680 		tso_fini(txq);
1681 
1682 	/* Free the context arrays. */
1683 	free(txq->pend_desc, M_SFXGE);
1684 	nmaps = sc->txq_entries;
1685 	while (nmaps-- != 0)
1686 		bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
1687 	free(txq->stmp, M_SFXGE);
1688 
1689 	/* Release DMA memory mapping. */
1690 	sfxge_dma_free(&txq->mem);
1691 
1692 	sc->txq[index] = NULL;
1693 
1694 	SFXGE_TXQ_LOCK_DESTROY(txq);
1695 
1696 	free(txq, M_SFXGE);
1697 }
1698 
1699 static int
1700 sfxge_tx_qinit(struct sfxge_softc *sc, unsigned int txq_index,
1701 	       enum sfxge_txq_type type, unsigned int evq_index)
1702 {
1703 	char name[16];
1704 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1705 	struct sysctl_oid *txq_node;
1706 	struct sfxge_txq *txq;
1707 	struct sfxge_evq *evq;
1708 	struct sfxge_tx_dpl *stdp;
1709 	struct sysctl_oid *dpl_node;
1710 	efsys_mem_t *esmp;
1711 	unsigned int nmaps;
1712 	int rc;
1713 
1714 	txq = malloc(sizeof(struct sfxge_txq), M_SFXGE, M_ZERO | M_WAITOK);
1715 	txq->sc = sc;
1716 	txq->entries = sc->txq_entries;
1717 	txq->ptr_mask = txq->entries - 1;
1718 
1719 	sc->txq[txq_index] = txq;
1720 	esmp = &txq->mem;
1721 
1722 	evq = sc->evq[evq_index];
1723 
1724 	/* Allocate and zero DMA space for the descriptor ring. */
1725 	if ((rc = sfxge_dma_alloc(sc, EFX_TXQ_SIZE(sc->txq_entries), esmp)) != 0)
1726 		return (rc);
1727 
1728 	/* Allocate buffer table entries. */
1729 	sfxge_sram_buf_tbl_alloc(sc, EFX_TXQ_NBUFS(sc->txq_entries),
1730 				 &txq->buf_base_id);
1731 
1732 	/* Create a DMA tag for packet mappings. */
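	/*
	 * Editor's note on the values below (argument order per
	 * bus_dma_tag_create(9)): alignment 1, a 0x1000 boundary and a
	 * 0x1000 maximum segment size keep every DMA segment within a
	 * single 4 KiB page, 0x11000 caps the total mapping size, and the
	 * lowaddr/highaddr pair limits the address range the NIC is asked
	 * to DMA from.
	 */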
1733 	if (bus_dma_tag_create(sc->parent_dma_tag, 1, 0x1000,
1734 	    MIN(0x3FFFFFFFFFFFUL, BUS_SPACE_MAXADDR), BUS_SPACE_MAXADDR, NULL,
1735 	    NULL, 0x11000, SFXGE_TX_MAPPING_MAX_SEG, 0x1000, 0, NULL, NULL,
1736 	    &txq->packet_dma_tag) != 0) {
1737 		device_printf(sc->dev, "Couldn't allocate txq DMA tag\n");
1738 		rc = ENOMEM;
1739 		goto fail;
1740 	}
1741 
1742 	/* Allocate pending descriptor array for batching writes. */
1743 	txq->pend_desc = malloc(sizeof(efx_desc_t) * sc->txq_entries,
1744 				M_SFXGE, M_ZERO | M_WAITOK);
1745 
1746 	/* Allocate and initialise mbuf DMA mapping array. */
1747 	txq->stmp = malloc(sizeof(struct sfxge_tx_mapping) * sc->txq_entries,
1748 	    M_SFXGE, M_ZERO | M_WAITOK);
1749 	for (nmaps = 0; nmaps < sc->txq_entries; nmaps++) {
1750 		rc = bus_dmamap_create(txq->packet_dma_tag, 0,
1751 				       &txq->stmp[nmaps].map);
1752 		if (rc != 0)
1753 			goto fail2;
1754 	}
1755 
1756 	snprintf(name, sizeof(name), "%u", txq_index);
1757 	txq_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->txqs_node),
1758 				   OID_AUTO, name, CTLFLAG_RD, NULL, "");
1759 	if (txq_node == NULL) {
1760 		rc = ENOMEM;
1761 		goto fail_txq_node;
1762 	}
1763 
1764 	if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM &&
1765 	    (rc = tso_init(txq)) != 0)
1766 		goto fail3;
1767 
1768 	if (sfxge_tx_dpl_get_max <= 0) {
1769 		log(LOG_ERR, "%s=%d must be greater than 0",
1770 		    SFXGE_PARAM_TX_DPL_GET_MAX, sfxge_tx_dpl_get_max);
1771 		rc = EINVAL;
1772 		goto fail_tx_dpl_get_max;
1773 	}
1774 	if (sfxge_tx_dpl_get_non_tcp_max <= 0) {
1775 		log(LOG_ERR, "%s=%d must be greater than 0",
1776 		    SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX,
1777 		    sfxge_tx_dpl_get_non_tcp_max);
1778 		rc = EINVAL;
1779 		goto fail_tx_dpl_get_max;
1780 	}
1781 	if (sfxge_tx_dpl_put_max < 0) {
1782 		log(LOG_ERR, "%s=%d must be greater than or equal to 0",
1783 		    SFXGE_PARAM_TX_DPL_PUT_MAX, sfxge_tx_dpl_put_max);
1784 		rc = EINVAL;
1785 		goto fail_tx_dpl_put_max;
1786 	}
1787 
1788 	/* Initialize the deferred packet list. */
1789 	stdp = &txq->dpl;
1790 	stdp->std_put_max = sfxge_tx_dpl_put_max;
1791 	stdp->std_get_max = sfxge_tx_dpl_get_max;
1792 	stdp->std_get_non_tcp_max = sfxge_tx_dpl_get_non_tcp_max;
1793 	stdp->std_getp = &stdp->std_get;
1794 
1795 	SFXGE_TXQ_LOCK_INIT(txq, device_get_nameunit(sc->dev), txq_index);
1796 
1797 	dpl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
1798 				   "dpl", CTLFLAG_RD, NULL,
1799 				   "Deferred packet list statistics");
1800 	if (dpl_node == NULL) {
1801 		rc = ENOMEM;
1802 		goto fail_dpl_node;
1803 	}
1804 
1805 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1806 			"get_count", CTLFLAG_RD | CTLFLAG_STATS,
1807 			&stdp->std_get_count, 0, "");
1808 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1809 			"get_non_tcp_count", CTLFLAG_RD | CTLFLAG_STATS,
1810 			&stdp->std_get_non_tcp_count, 0, "");
1811 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1812 			"get_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
1813 			&stdp->std_get_hiwat, 0, "");
1814 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1815 			"put_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
1816 			&stdp->std_put_hiwat, 0, "");
1817 
1818 	rc = sfxge_txq_stat_init(txq, txq_node);
1819 	if (rc != 0)
1820 		goto fail_txq_stat_init;
1821 
1822 	txq->type = type;
1823 	txq->evq_index = evq_index;
1824 	txq->txq_index = txq_index;
1825 	txq->init_state = SFXGE_TXQ_INITIALIZED;
1826 	txq->hw_vlan_tci = 0;
1827 
1828 	return (0);
1829 
1830 fail_txq_stat_init:
1831 fail_dpl_node:
1832 fail_tx_dpl_put_max:
1833 fail_tx_dpl_get_max:
1834 fail3:
1835 fail_txq_node:
1836 	free(txq->pend_desc, M_SFXGE);
1837 fail2:
1838 	while (nmaps-- != 0)
1839 		bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
1840 	free(txq->stmp, M_SFXGE);
1841 	bus_dma_tag_destroy(txq->packet_dma_tag);
1842 
1843 fail:
1844 	sfxge_dma_free(esmp);
1845 
1846 	return (rc);
1847 }
1848 
1849 static int
1850 sfxge_tx_stat_handler(SYSCTL_HANDLER_ARGS)
1851 {
1852 	struct sfxge_softc *sc = arg1;
1853 	unsigned int id = arg2;
1854 	unsigned long sum;
1855 	unsigned int index;
1856 
1857 	/* Sum across all TX queues */
1858 	sum = 0;
1859 	for (index = 0; index < sc->txq_count; index++)
1860 		sum += *(unsigned long *)((caddr_t)sc->txq[index] +
1861 					  sfxge_tx_stats[id].offset);
1862 
1863 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1864 }
1865 
1866 static void
1867 sfxge_tx_stat_init(struct sfxge_softc *sc)
1868 {
1869 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1870 	struct sysctl_oid_list *stat_list;
1871 	unsigned int id;
1872 
1873 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1874 
1875 	for (id = 0; id < nitems(sfxge_tx_stats); id++) {
1876 		SYSCTL_ADD_PROC(
1877 			ctx, stat_list,
1878 			OID_AUTO, sfxge_tx_stats[id].name,
1879 			CTLTYPE_ULONG|CTLFLAG_RD,
1880 			sc, id, sfxge_tx_stat_handler, "LU",
1881 			"");
1882 	}
1883 }
1884 
1885 uint64_t
1886 sfxge_tx_get_drops(struct sfxge_softc *sc)
1887 {
1888 	unsigned int index;
1889 	uint64_t drops = 0;
1890 	struct sfxge_txq *txq;
1891 
1892 	/* Sum across all TX queues */
1893 	for (index = 0; index < sc->txq_count; index++) {
1894 		txq = sc->txq[index];
1895 		/*
1896 		 * In theory, txq->put_overflow and txq->netdown_drops
1897 		 * should be read with atomic operations and the other counters
1898 		 * should be read under the txq lock, but these are just statistics.
1899 		 */
1900 		drops += txq->drops + txq->get_overflow +
1901 			 txq->get_non_tcp_overflow +
1902 			 txq->put_overflow + txq->netdown_drops +
1903 			 txq->tso_pdrop_too_many + txq->tso_pdrop_no_rsrc;
1904 	}
1905 	return (drops);
1906 }
1907 
1908 void
1909 sfxge_tx_fini(struct sfxge_softc *sc)
1910 {
1911 	int index;
1912 
1913 	index = sc->txq_count;
1914 	while (--index >= 0)
1915 		sfxge_tx_qfini(sc, index);
1916 
1917 	sc->txq_count = 0;
1918 }
1919 
1920 
1921 int
1922 sfxge_tx_init(struct sfxge_softc *sc)
1923 {
1924 	const efx_nic_cfg_t *encp = efx_nic_cfg_get(sc->enp);
1925 	struct sfxge_intr *intr;
1926 	int index;
1927 	int rc;
1928 
1929 	intr = &sc->intr;
1930 
1931 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1932 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1933 
1934 	sc->txq_count = SFXGE_TXQ_NTYPES - 1 + sc->intr.n_alloc;
1935 
1936 	sc->tso_fw_assisted = sfxge_tso_fw_assisted;
1937 	if ((~encp->enc_features & EFX_FEATURE_FW_ASSISTED_TSO) ||
1938 	    (!encp->enc_fw_assisted_tso_enabled))
1939 		sc->tso_fw_assisted &= ~SFXGE_FATSOV1;
1940 	if ((~encp->enc_features & EFX_FEATURE_FW_ASSISTED_TSO_V2) ||
1941 	    (!encp->enc_fw_assisted_tso_v2_enabled))
1942 		sc->tso_fw_assisted &= ~SFXGE_FATSOV2;
1943 
1944 	sc->txqs_node = SYSCTL_ADD_NODE(
1945 		device_get_sysctl_ctx(sc->dev),
1946 		SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)),
1947 		OID_AUTO, "txq", CTLFLAG_RD, NULL, "Tx queues");
1948 	if (sc->txqs_node == NULL) {
1949 		rc = ENOMEM;
1950 		goto fail_txq_node;
1951 	}
1952 
1953 	/* Initialize the transmit queues */
1954 	if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NON_CKSUM,
1955 	    SFXGE_TXQ_NON_CKSUM, 0)) != 0)
1956 		goto fail;
1957 
1958 	if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_IP_CKSUM,
1959 	    SFXGE_TXQ_IP_CKSUM, 0)) != 0)
1960 		goto fail2;
1961 
1962 	for (index = 0;
1963 	     index < sc->txq_count - SFXGE_TXQ_NTYPES + 1;
1964 	     index++) {
1965 		if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NTYPES - 1 + index,
1966 		    SFXGE_TXQ_IP_TCP_UDP_CKSUM, index)) != 0)
1967 			goto fail3;
1968 	}
1969 
1970 	sfxge_tx_stat_init(sc);
1971 
1972 	return (0);
1973 
1974 fail3:
1975 	while (--index >= 0)
1976 		sfxge_tx_qfini(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index);
1977 
1978 	sfxge_tx_qfini(sc, SFXGE_TXQ_IP_CKSUM);
1979 
1980 fail2:
1981 	sfxge_tx_qfini(sc, SFXGE_TXQ_NON_CKSUM);
1982 
1983 fail:
1984 fail_txq_node:
1985 	sc->txq_count = 0;
1986 	return (rc);
1987 }
1988