/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"

#include <sys/types.h>
#include <sys/eventhandler.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/time.h>
#include <sys/sglist.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <machine/in_cksum.h>
#include <machine/md_var.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#ifdef DEV_NETMAP
#include <machine/bus.h>
#include <sys/selinfo.h>
#include <net/if_var.h>
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#endif

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_msg.h"
#include "t4_l2t.h"
#include "t4_mp_ring.h"

#ifdef T4_PKT_TIMESTAMP
#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
#else
#define RX_COPY_THRESHOLD MINCLSIZE
#endif

/*
 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
 * 0-7 are valid values.
 */
static int fl_pktshift = 2;
TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);

/*
 * Pad ethernet payload up to this boundary.
 * -1: driver should figure out a good value.
 *  0: disable padding.
 *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
 */
int fl_pad = -1;
TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);

/*
 * Status page length.
 * -1: driver should figure out a good value.
 *  64 or 128 are the only other valid values.
 */
static int spg_len = -1;
TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);

/*
 * Congestion drops.
 * -1: no congestion feedback (not recommended).
 *  0: backpressure the channel instead of dropping packets right away.
 *  1: no backpressure, drop packets for the congested queue immediately.
 */
static int cong_drop = 0;
TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);

/*
 * Deliver multiple frames in the same free list buffer if they fit.
 * -1: let the driver decide whether to enable buffer packing or not.
 *  0: disable buffer packing.
 *  1: enable buffer packing.
 */
static int buffer_packing = -1;
TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);

/*
 * Start next frame in a packed buffer at this boundary.
 * -1: driver should figure out a good value.
 * T4: driver will ignore this and use the same value as fl_pad above.
 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
 */
static int fl_pack = -1;
TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);

/*
 * Allow the driver to create mbuf(s) in a cluster allocated for rx.
 * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
 * 1: ok to create mbuf(s) within a cluster if there is room.
 */
static int allow_mbufs_in_cluster = 1;
TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);

/*
 * Largest rx cluster size that the driver is allowed to allocate.
 */
static int largest_rx_cluster = MJUM16BYTES;
TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);

/*
 * Size of cluster allocation that's most likely to succeed.  The driver will
 * fall back to this size if it fails to allocate clusters larger than this.
 */
static int safest_rx_cluster = PAGE_SIZE;
TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);

#ifdef RATELIMIT
/*
 * Knob to control TCP timestamp rewriting, and the granularity of the tick
 * used for rewriting.  -1 and 0-3 are all valid values.
 * -1: hardware should leave the TCP timestamps alone.
 * 0: 1ms
 * 1: 100us
 * 2: 10us
 * 3: 1us
 */
static int tsclk = -1;
TUNABLE_INT("hw.cxgbe.tsclk", &tsclk);

static int eo_max_backlog = 1024 * 1024;
TUNABLE_INT("hw.cxgbe.eo_max_backlog", &eo_max_backlog);
#endif

/*
 * The interrupt holdoff timers are multiplied by this value on T6+.
 * 1 and 3-17 (both inclusive) are legal values.
 */
static int tscale = 1;
TUNABLE_INT("hw.cxgbe.tscale", &tscale);

/*
 * Number of LRO entries in the lro_ctrl structure per rx queue.
 */
static int lro_entries = TCP_LRO_ENTRIES;
TUNABLE_INT("hw.cxgbe.lro_entries", &lro_entries);

/*
 * This enables presorting of frames before they're fed into tcp_lro_rx.
 */
static int lro_mbufs = 0;
TUNABLE_INT("hw.cxgbe.lro_mbufs", &lro_mbufs);
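
/*
 * All of the knobs above are boot-time loader tunables.  As an illustrative
 * example (the values shown are arbitrary, not recommendations), they can be
 * set in /boot/loader.conf before the driver attaches:
 *
 *	hw.cxgbe.fl_pktshift="2"
 *	hw.cxgbe.buffer_packing="1"
 *	hw.cxgbe.largest_rx_cluster="16384"
 */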

struct txpkts {
	u_int wr_type;		/* type 0 or type 1 */
	u_int npkt;		/* # of packets in this work request */
	u_int plen;		/* total payload (sum of all packets) */
	u_int len16;		/* # of 16B pieces used by this work request */
};

/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
struct sgl {
	struct sglist sg;
	struct sglist_seg seg[TX_SGL_SEGS];
};

static int service_iq(struct sge_iq *, int);
static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
    uint16_t, char *);
static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
    bus_addr_t *, void **);
static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
    void *);
static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
    int, int);
static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
    struct sge_iq *);
static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
    struct sysctl_oid *, struct sge_fl *);
static int alloc_fwq(struct adapter *);
static int free_fwq(struct adapter *);
static int alloc_mgmtq(struct adapter *);
static int free_mgmtq(struct adapter *);
static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
    struct sysctl_oid *);
static int free_rxq(struct vi_info *, struct sge_rxq *);
#ifdef TCP_OFFLOAD
static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
    struct sysctl_oid *);
static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
#endif
#ifdef DEV_NETMAP
static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
    struct sysctl_oid *);
static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
    struct sysctl_oid *);
static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
#endif
static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
#endif
static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
static int free_eq(struct adapter *, struct sge_eq *);
static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
    struct sysctl_oid *);
static int free_wrq(struct adapter *, struct sge_wrq *);
static int alloc_txq(struct vi_info *, struct sge_txq *, int,
    struct sysctl_oid *);
static int free_txq(struct vi_info *, struct sge_txq *);
static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
static inline void ring_fl_db(struct adapter *, struct sge_fl *);
static int refill_fl(struct adapter *, struct sge_fl *, int);
static void refill_sfl(void *);
static int alloc_fl_sdesc(struct sge_fl *);
static void free_fl_sdesc(struct adapter *, struct sge_fl *);
static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
static void find_safe_refill_source(struct adapter *, struct sge_fl *);
static void add_fl_to_sfl(struct adapter *, struct sge_fl *);

static inline void get_pkt_gl(struct mbuf *, struct sglist *);
static inline u_int txpkt_len16(u_int, u_int);
static inline u_int txpkt_vm_len16(u_int, u_int);
static inline u_int txpkts0_len16(u_int);
static inline u_int txpkts1_len16(void);
static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
    struct mbuf *, u_int);
static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
    struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
    struct mbuf *, const struct txpkts *, u_int);
static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
static inline uint16_t read_hw_cidx(struct sge_eq *);
static inline u_int reclaimable_tx_desc(struct sge_eq *);
static inline u_int total_available_tx_desc(struct sge_eq *);
static u_int reclaim_tx_descs(struct sge_txq *, u_int);
static void tx_reclaim(void *, int);
static __be64 get_flit(struct sglist_seg *, int, int);
static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
static void wrq_tx_drain(void *, int);
static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);

static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
#ifdef RATELIMIT
static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
#endif

static counter_u64_t extfree_refs;
static counter_u64_t extfree_rels;

an_handler_t t4_an_handler;
fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];

void
t4_register_an_handler(an_handler_t h)
{
	uintptr_t *loc;

	MPASS(h == NULL || t4_an_handler == NULL);

	loc = (uintptr_t *)&t4_an_handler;
	atomic_store_rel_ptr(loc, (uintptr_t)h);
}

void
t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
{
	uintptr_t *loc;

	MPASS(type < nitems(t4_fw_msg_handler));
	MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
	/*
	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
	 * handler dispatch table.  Reject any attempt to install a handler for
	 * this subtype.
	 */
	MPASS(type != FW_TYPE_RSSCPL);
	MPASS(type != FW6_TYPE_RSSCPL);

	loc = (uintptr_t *)&t4_fw_msg_handler[type];
	atomic_store_rel_ptr(loc, (uintptr_t)h);
}

void
t4_register_cpl_handler(int opcode, cpl_handler_t h)
{
	uintptr_t *loc;

	MPASS(opcode < nitems(t4_cpl_handler));
	MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);

	loc = (uintptr_t *)&t4_cpl_handler[opcode];
	atomic_store_rel_ptr(loc, (uintptr_t)h);
}

static int
set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
	u_int tid;
	int cookie;

	MPASS(m == NULL);

	tid = GET_TID(cpl);
	if (is_ftid(iq->adapter, tid)) {
		/*
		 * The return code for filter-write is put in the CPL cookie so
		 * we have to rely on the hardware tid (is_ftid) to determine
		 * that this is a response to a filter.
		 */
		cookie = CPL_COOKIE_FILTER;
	} else {
		cookie = G_COOKIE(cpl->cookie);
	}
	MPASS(cookie > CPL_COOKIE_RESERVED);
	MPASS(cookie < nitems(set_tcb_rpl_handlers));

	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
}

static int
l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
	unsigned int cookie;

	MPASS(m == NULL);

	cookie = GET_TID(rpl) & F_SYNC_WR ? CPL_COOKIE_TOM : CPL_COOKIE_FILTER;
	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
}

static int
act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
	u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status)));

	MPASS(m == NULL);
	MPASS(cookie != CPL_COOKIE_RESERVED);

	return (act_open_rpl_handlers[cookie](iq, rss, m));
}

static int
abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	u_int cookie;

	MPASS(m == NULL);
	if (is_hashfilter(sc))
		cookie = CPL_COOKIE_HASHFILTER;
	else
		cookie = CPL_COOKIE_TOM;

	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
}

static int
fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	u_int cookie;

	MPASS(m == NULL);
	if (is_etid(sc, tid))
		cookie = CPL_COOKIE_ETHOFLD;
	else
		cookie = CPL_COOKIE_TOM;

	return (fw4_ack_handlers[cookie](iq, rss, m));
}

static void
t4_init_shared_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
}

void
t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
{
	uintptr_t *loc;

	MPASS(opcode < nitems(t4_cpl_handler));
	MPASS(cookie > CPL_COOKIE_RESERVED);
	MPASS(cookie < NUM_CPL_COOKIES);
	MPASS(t4_cpl_handler[opcode] != NULL);

	switch (opcode) {
	case CPL_SET_TCB_RPL:
		loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie];
		break;
	case CPL_L2T_WRITE_RPL:
		loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie];
		break;
	case CPL_ACT_OPEN_RPL:
		loc = (uintptr_t *)&act_open_rpl_handlers[cookie];
		break;
	case CPL_ABORT_RPL_RSS:
		loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie];
		break;
	case CPL_FW4_ACK:
		loc = (uintptr_t *)&fw4_ack_handlers[cookie];
		break;
	default:
		MPASS(0);
		return;
	}
	MPASS(h == NULL || *loc == (uintptr_t)NULL);
	atomic_store_rel_ptr(loc, (uintptr_t)h);
}
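
/*
 * Illustrative sketch: a module that shares CPL_SET_TCB_RPL dispatch installs
 * its handler under its own cookie and uninstalls it by passing NULL.
 * "my_set_tcb_rpl" is a hypothetical handler, not part of this file.
 *
 *	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, my_set_tcb_rpl,
 *	    CPL_COOKIE_TOM);
 *	...
 *	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_TOM);
 */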

/*
 * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
 */
void
t4_sge_modload(void)
{

	if (fl_pktshift < 0 || fl_pktshift > 7) {
		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
		    " using 2 instead.\n", fl_pktshift);
		fl_pktshift = 2;
	}

	if (spg_len != 64 && spg_len != 128) {
		int len;

#if defined(__i386__) || defined(__amd64__)
		len = cpu_clflush_line_size > 64 ? 128 : 64;
#else
		len = 64;
#endif
		if (spg_len != -1) {
			printf("Invalid hw.cxgbe.spg_len value (%d),"
			    " using %d instead.\n", spg_len, len);
		}
		spg_len = len;
	}

	if (cong_drop < -1 || cong_drop > 1) {
		printf("Invalid hw.cxgbe.cong_drop value (%d),"
		    " using 0 instead.\n", cong_drop);
		cong_drop = 0;
	}

	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
		printf("Invalid hw.cxgbe.tscale value (%d),"
		    " using 1 instead.\n", tscale);
		tscale = 1;
	}

	extfree_refs = counter_u64_alloc(M_WAITOK);
	extfree_rels = counter_u64_alloc(M_WAITOK);
	counter_u64_zero(extfree_refs);
	counter_u64_zero(extfree_rels);

	t4_init_shared_cpl_handlers();
	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
#ifdef RATELIMIT
	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
	    CPL_COOKIE_ETHOFLD);
#endif
	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
}

void
t4_sge_modunload(void)
{

	counter_u64_free(extfree_refs);
	counter_u64_free(extfree_rels);
}

uint64_t
t4_sge_extfree_refs(void)
{
	uint64_t refs, rels;

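	/*
	 * rels is read first: a release that slips in between the two reads
	 * can only make refs - rels an overestimate, never negative.
	 */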
	rels = counter_u64_fetch(extfree_rels);
	refs = counter_u64_fetch(extfree_refs);

	return (refs - rels);
}

static inline void
setup_pad_and_pack_boundaries(struct adapter *sc)
{
	uint32_t v, m;
	int pad, pack, pad_shift;

	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
	    X_INGPADBOUNDARY_SHIFT;
	pad = fl_pad;
	if (fl_pad < (1 << pad_shift) ||
	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
	    !powerof2(fl_pad)) {
		/*
		 * If there is any chance that we might use buffer packing and
		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
		 * it to the minimum allowed in all other cases.
		 */
		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;

		/*
		 * For fl_pad = 0 we'll still write a reasonable value to the
		 * register but all the freelists will opt out of padding.
		 * We'll complain here only if the user tried to set it to a
		 * value greater than 0 that was invalid.
		 */
		if (fl_pad > 0) {
			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
			    " (%d), using %d instead.\n", fl_pad, pad);
		}
	}
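	/* The hardware field encodes the boundary as log2(pad) - pad_shift. */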
	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);

	if (is_t4(sc)) {
		if (fl_pack != -1 && fl_pack != pad) {
			/* Complain but carry on. */
			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
			    " using %d instead.\n", fl_pack, pad);
		}
		return;
	}

	pack = fl_pack;
	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
	    !powerof2(fl_pack)) {
		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
		MPASS(powerof2(pack));
		if (pack < 16)
			pack = 16;
		if (pack == 32)
			pack = 64;
		if (pack > 4096)
			pack = 4096;
		if (fl_pack != -1) {
			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
			    " (%d), using %d instead.\n", fl_pack, pack);
		}
	}
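	/*
	 * In the INGPACKBOUNDARY field, 0 selects a 16B boundary; any other
	 * value n selects a boundary of 1 << (n + 5), i.e. 64B and up.
	 */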
	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
	if (pack == 16)
		v = V_INGPACKBOUNDARY(0);
	else
		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);

	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
}

/*
 * adap->params.vpd.cclk must be set up before this is called.
 */
void
t4_tweak_chip_settings(struct adapter *sc)
{
	int i;
	uint32_t v, m;
	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
	static int sge_flbuf_sizes[] = {
		MCLBYTES,
#if MJUMPAGESIZE != MCLBYTES
		MJUMPAGESIZE,
		MJUMPAGESIZE - CL_METADATA_SIZE,
		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
#endif
		MJUM9BYTES,
		MJUM16BYTES,
		MCLBYTES - MSIZE - CL_METADATA_SIZE,
		MJUM9BYTES - CL_METADATA_SIZE,
		MJUM16BYTES - CL_METADATA_SIZE,
	};

	KASSERT(sc->flags & MASTER_PF,
	    ("%s: trying to change chip settings when not master.", __func__));

	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
	    V_EGRSTATUSPAGESIZE(spg_len == 128);
	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);

	setup_pad_and_pack_boundaries(sc);

	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);

	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
	    ("%s: hw buffer size table too big", __func__));
	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
		    sge_flbuf_sizes[i]);
	}

	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);

	KASSERT(intr_timer[0] <= timer_max,
	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
	    timer_max));
	for (i = 1; i < nitems(intr_timer); i++) {
		KASSERT(intr_timer[i] >= intr_timer[i - 1],
		    ("%s: timers not listed in increasing order (%d)",
		    __func__, i));

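		/*
		 * Pull an oversized timer back under timer_max by repeatedly
		 * averaging it with its (already valid) predecessor; the last
		 * timer is simply clamped to timer_max.
		 */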
		while (intr_timer[i] > timer_max) {
			if (i == nitems(intr_timer) - 1) {
				intr_timer[i] = timer_max;
				break;
			}
			intr_timer[i] += intr_timer[i - 1];
			intr_timer[i] /= 2;
		}
	}

	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);

	if (chip_id(sc) >= CHELSIO_T6) {
		m = V_TSCALE(M_TSCALE);
		if (tscale == 1)
			v = 0;
		else
			v = V_TSCALE(tscale - 2);
		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);

		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
			    V_WRTHRTHRESH(M_WRTHRTHRESH);
			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
			v &= ~m;
			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
			    V_WRTHRTHRESH(16);
			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
		}
	}

	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);

	/*
	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
	 * may have to deal with is MAXPHYS + 1 page.
	 */
	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);

	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);

	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
	    F_RESETDDPOFFSET;
	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
}

/*
 * The SGE wants the buffer to be at least 64B long and its size to be a
 * multiple of 16.  If padding is in use, the buffer's start and end need to be
 * aligned to the pad boundary as well.  We'll just make sure that the size is
 * a multiple of the boundary here; it is up to the buffer allocation code to
 * make sure the start of the buffer is aligned as well.
 */
static inline int
hwsz_ok(struct adapter *sc, int hwsz)
{
	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;

	return (hwsz >= 64 && (hwsz & mask) == 0);
}

/*
 * XXX: driver really should be able to deal with unexpected settings.
 */
int
t4_read_chip_settings(struct adapter *sc)
{
	struct sge *s = &sc->sge;
	struct sge_params *sp = &sc->params.sge;
	int i, j, n, rc = 0;
	uint32_t m, v, r;
	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
	static int sw_buf_sizes[] = {	/* Sorted by size */
		MCLBYTES,
#if MJUMPAGESIZE != MCLBYTES
		MJUMPAGESIZE,
#endif
		MJUM9BYTES,
		MJUM16BYTES
	};
	struct sw_zone_info *swz, *safe_swz;
	struct hw_buf_info *hwb;

	m = F_RXPKTCPLMODE;
	v = F_RXPKTCPLMODE;
	r = sc->params.sge.sge_control;
	if ((r & m) != v) {
		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
		rc = EINVAL;
	}

	/*
	 * If this changes then every single use of PAGE_SHIFT in the driver
	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
	 */
	if (sp->page_shift != PAGE_SHIFT) {
		device_printf(sc->dev,
		    "invalid SGE_HOST_PAGE_SIZE (page_shift %d)\n",
		    sp->page_shift);
		rc = EINVAL;
	}

	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
	hwb = &s->hw_buf_info[0];
	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
		r = sc->params.sge.sge_fl_buffer_size[i];
		hwb->size = r;
		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
		hwb->next = -1;
	}

	/*
	 * Create a sorted list in decreasing order of hw buffer sizes (and so
	 * increasing order of spare area) for each software zone.
	 *
	 * If padding is enabled then the start and end of the buffer must align
	 * to the pad boundary; if packing is enabled then they must align with
	 * the pack boundary as well.  Allocations from the cluster zones are
	 * aligned to min(size, 4K), so the buffer starts at that alignment and
	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
	 * starting alignment will be reduced to MSIZE and the driver will
	 * exercise appropriate caution when deciding on the best buffer layout
	 * to use.
	 */
	n = 0;	/* no usable buffer size to begin with */
	swz = &s->sw_zone_info[0];
	safe_swz = NULL;
	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
		int8_t head = -1, tail = -1;

		swz->size = sw_buf_sizes[i];
		swz->zone = m_getzone(swz->size);
		swz->type = m_gettype(swz->size);

		if (swz->size < PAGE_SIZE) {
			MPASS(powerof2(swz->size));
			if (fl_pad && (swz->size % sp->pad_boundary != 0))
				continue;
		}

		if (swz->size == safest_rx_cluster)
			safe_swz = swz;

		hwb = &s->hw_buf_info[0];
		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
			if (hwb->zidx != -1 || hwb->size > swz->size)
				continue;
#ifdef INVARIANTS
			if (fl_pad)
				MPASS(hwb->size % sp->pad_boundary == 0);
#endif
			hwb->zidx = i;
			if (head == -1)
				head = tail = j;
			else if (hwb->size < s->hw_buf_info[tail].size) {
				s->hw_buf_info[tail].next = j;
				tail = j;
			} else {
				int8_t *cur;
				struct hw_buf_info *t;

				for (cur = &head; *cur != -1; cur = &t->next) {
					t = &s->hw_buf_info[*cur];
					if (hwb->size == t->size) {
						hwb->zidx = -2;
						break;
					}
					if (hwb->size > t->size) {
						hwb->next = *cur;
						*cur = j;
						break;
					}
				}
			}
		}
		swz->head_hwidx = head;
		swz->tail_hwidx = tail;

		if (tail != -1) {
			n++;
			if (swz->size - s->hw_buf_info[tail].size >=
			    CL_METADATA_SIZE)
				sc->flags |= BUF_PACKING_OK;
		}
	}
	if (n == 0) {
		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
		rc = EINVAL;
	}

	s->safe_hwidx1 = -1;
	s->safe_hwidx2 = -1;
	if (safe_swz != NULL) {
		s->safe_hwidx1 = safe_swz->head_hwidx;
		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
			int spare;

			hwb = &s->hw_buf_info[i];
#ifdef INVARIANTS
			if (fl_pad)
				MPASS(hwb->size % sp->pad_boundary == 0);
#endif
			spare = safe_swz->size - hwb->size;
			if (spare >= CL_METADATA_SIZE) {
				s->safe_hwidx2 = i;
				break;
			}
		}
	}

	if (sc->flags & IS_VF)
		return (0);

	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
	if (r != v) {
		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
		rc = EINVAL;
	}

	m = v = F_TDDPTAGTCB;
	r = t4_read_reg(sc, A_ULP_RX_CTL);
	if ((r & m) != v) {
		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
		rc = EINVAL;
	}

	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
	    F_RESETDDPOFFSET;
	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
	r = t4_read_reg(sc, A_TP_PARA_REG5);
	if ((r & m) != v) {
		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
		rc = EINVAL;
	}

	t4_init_tp_params(sc, 1);

	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);

	return (rc);
}

int
t4_create_dma_tag(struct adapter *sc)
{
	int rc;

	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
	    NULL, &sc->dmat);
	if (rc != 0) {
		device_printf(sc->dev,
		    "failed to create main DMA tag: %d\n", rc);
	}

	return (rc);
}

void
t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
    struct sysctl_oid_list *children)
{
	struct sge_params *sp = &sc->params.sge;

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
	    "freelist buffer sizes");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
	    NULL, sp->spg_len, "status page size (bytes)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
	    NULL, cong_drop, "congestion drop setting");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
}

int
t4_destroy_dma_tag(struct adapter *sc)
{
	if (sc->dmat)
		bus_dma_tag_destroy(sc->dmat);

	return (0);
}

/*
 * Allocate and initialize the firmware event queue and the management queue.
 *
 * Returns errno on failure.  Resources allocated up to that point may still be
 * allocated.  Caller is responsible for cleanup in case this function fails.
 */
int
t4_setup_adapter_queues(struct adapter *sc)
{
	int rc;

	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);

	sysctl_ctx_init(&sc->ctx);
	sc->flags |= ADAP_SYSCTL_CTX;

	/*
	 * Firmware event queue
	 */
	rc = alloc_fwq(sc);
	if (rc != 0)
		return (rc);

	/*
	 * Management queue.  This is just a control queue that uses the fwq as
	 * its associated iq.
	 */
	if (!(sc->flags & IS_VF))
		rc = alloc_mgmtq(sc);

	return (rc);
}

/*
 * Idempotent
 */
int
t4_teardown_adapter_queues(struct adapter *sc)
{

	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);

	/* Do this before freeing the queue */
	if (sc->flags & ADAP_SYSCTL_CTX) {
		sysctl_ctx_free(&sc->ctx);
		sc->flags &= ~ADAP_SYSCTL_CTX;
	}

	free_mgmtq(sc);
	free_fwq(sc);

	return (0);
}

/* Maximum payload that can be delivered with a single iq descriptor */
static inline int
mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
{
	int payload;

#ifdef TCP_OFFLOAD
	if (toe) {
		int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2));

		/* Note that COP can set rx_coalesce on/off per connection. */
		payload = max(mtu, rxcs);
	} else {
#endif
		/* large enough even when hw VLAN extraction is disabled */
		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
		    ETHER_VLAN_ENCAP_LEN + mtu;
#ifdef TCP_OFFLOAD
	}
#endif

	return (payload);
}

int
t4_setup_vi_queues(struct vi_info *vi)
{
	int rc = 0, i, intr_idx, iqidx;
	struct sge_rxq *rxq;
	struct sge_txq *txq;
	struct sge_wrq *ctrlq;
#ifdef TCP_OFFLOAD
	struct sge_ofld_rxq *ofld_rxq;
#endif
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	struct sge_wrq *ofld_txq;
#endif
#ifdef DEV_NETMAP
	int saved_idx;
	struct sge_nm_rxq *nm_rxq;
	struct sge_nm_txq *nm_txq;
#endif
	char name[16];
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct ifnet *ifp = vi->ifp;
	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
	int maxp, mtu = ifp->if_mtu;

	/* Interrupt vector to start from (when using multiple vectors) */
	intr_idx = vi->first_intr;

#ifdef DEV_NETMAP
	saved_idx = intr_idx;
	if (ifp->if_capabilities & IFCAP_NETMAP) {

		/* netmap is supported with direct interrupts only. */
		MPASS(!forwarding_intr_to_fwq(sc));

		/*
		 * We don't have buffers to back the netmap rx queues
		 * right now so we create the queues in a way that
		 * doesn't set off any congestion signal in the chip.
		 */
		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
		    CTLFLAG_RD, NULL, "rx queues");
		for_each_nm_rxq(vi, i, nm_rxq) {
			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
			if (rc != 0)
				goto done;
			intr_idx++;
		}

		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
		    CTLFLAG_RD, NULL, "tx queues");
		for_each_nm_txq(vi, i, nm_txq) {
			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
			rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid);
			if (rc != 0)
				goto done;
		}
	}

	/* Normal rx queues and netmap rx queues share the same interrupts. */
	intr_idx = saved_idx;
#endif

	/*
	 * Allocate rx queues first because a default iqid is required when
	 * creating a tx queue.
	 */
	maxp = mtu_to_max_payload(sc, mtu, 0);
	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
	    CTLFLAG_RD, NULL, "rx queues");
	for_each_rxq(vi, i, rxq) {

		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);

		snprintf(name, sizeof(name), "%s rxq%d-fl",
		    device_get_nameunit(vi->dev), i);
		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);

		rc = alloc_rxq(vi, rxq,
		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
		if (rc != 0)
			goto done;
		intr_idx++;
	}
#ifdef DEV_NETMAP
	if (ifp->if_capabilities & IFCAP_NETMAP)
		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
#endif
#ifdef TCP_OFFLOAD
	maxp = mtu_to_max_payload(sc, mtu, 1);
	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
	    CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections");
	for_each_ofld_rxq(vi, i, ofld_rxq) {

		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
		    vi->qsize_rxq);

		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
		    device_get_nameunit(vi->dev), i);
		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);

		rc = alloc_ofld_rxq(vi, ofld_rxq,
		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
		if (rc != 0)
			goto done;
		intr_idx++;
	}
#endif

	/*
	 * Now the tx queues.
	 */
	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
	    NULL, "tx queues");
	for_each_txq(vi, i, txq) {
		iqidx = vi->first_rxq + (i % vi->nrxq);
		snprintf(name, sizeof(name), "%s txq%d",
		    device_get_nameunit(vi->dev), i);
		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
		    sc->sge.rxq[iqidx].iq.cntxt_id, name);

		rc = alloc_txq(vi, txq, i, oid);
		if (rc != 0)
			goto done;
	}
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
	    CTLFLAG_RD, NULL, "tx queues for TOE/ETHOFLD");
	for_each_ofld_txq(vi, i, ofld_txq) {
		struct sysctl_oid *oid2;

		snprintf(name, sizeof(name), "%s ofld_txq%d",
		    device_get_nameunit(vi->dev), i);
#ifdef TCP_OFFLOAD
		iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq);
		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
		    sc->sge.ofld_rxq[iqidx].iq.cntxt_id, name);
#else
		iqidx = vi->first_rxq + (i % vi->nrxq);
		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
		    sc->sge.rxq[iqidx].iq.cntxt_id, name);
#endif

		snprintf(name, sizeof(name), "%d", i);
		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		    name, CTLFLAG_RD, NULL, "offload tx queue");

		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
		if (rc != 0)
			goto done;
	}
#endif

	/*
	 * Finally, the control queue.
	 */
	if (!IS_MAIN_VI(vi) || sc->flags & IS_VF)
		goto done;
	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
	    NULL, "ctrl queue");
	ctrlq = &sc->sge.ctrlq[pi->port_id];
	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev));
	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan,
	    sc->sge.rxq[vi->first_rxq].iq.cntxt_id, name);
	rc = alloc_wrq(sc, vi, ctrlq, oid);

done:
	if (rc)
		t4_teardown_vi_queues(vi);

	return (rc);
}

/*
 * Idempotent
 */
int
t4_teardown_vi_queues(struct vi_info *vi)
{
	int i;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct sge_rxq *rxq;
	struct sge_txq *txq;
#ifdef TCP_OFFLOAD
	struct sge_ofld_rxq *ofld_rxq;
#endif
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	struct sge_wrq *ofld_txq;
#endif
#ifdef DEV_NETMAP
	struct sge_nm_rxq *nm_rxq;
	struct sge_nm_txq *nm_txq;
#endif

	/* Do this before freeing the queues */
	if (vi->flags & VI_SYSCTL_CTX) {
		sysctl_ctx_free(&vi->ctx);
		vi->flags &= ~VI_SYSCTL_CTX;
	}

#ifdef DEV_NETMAP
	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
		for_each_nm_txq(vi, i, nm_txq) {
			free_nm_txq(vi, nm_txq);
		}

		for_each_nm_rxq(vi, i, nm_rxq) {
			free_nm_rxq(vi, nm_rxq);
		}
	}
#endif

	/*
	 * Take down all the tx queues first, as they reference the rx queues
	 * (for egress updates, etc.).
	 */

	if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
		free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);

	for_each_txq(vi, i, txq) {
		free_txq(vi, txq);
	}
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	for_each_ofld_txq(vi, i, ofld_txq) {
		free_wrq(sc, ofld_txq);
	}
#endif

	/*
	 * Then take down the rx queues.
	 */

	for_each_rxq(vi, i, rxq) {
		free_rxq(vi, rxq);
	}
#ifdef TCP_OFFLOAD
	for_each_ofld_rxq(vi, i, ofld_rxq) {
		free_ofld_rxq(vi, ofld_rxq);
	}
#endif

	return (0);
}

/*
 * Deals with errors and the firmware event queue.  All data rx queues forward
 * their interrupt to the firmware event queue.
 */
void
t4_intr_all(void *arg)
{
	struct adapter *sc = arg;
	struct sge_iq *fwq = &sc->sge.fwq;

	t4_intr_err(arg);
	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
		service_iq(fwq, 0);
		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
	}
}

/* Deals with error interrupts */
void
t4_intr_err(void *arg)
{
	struct adapter *sc = arg;

	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
	t4_slow_intr_handler(sc);
}

void
t4_intr_evt(void *arg)
{
	struct sge_iq *iq = arg;

	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
		service_iq(iq, 0);
		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
	}
}

void
t4_intr(void *arg)
{
	struct sge_iq *iq = arg;

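	/* The IDLE -> BUSY transition is this thread's exclusive claim. */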
	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
		service_iq(iq, 0);
		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
	}
}

void
t4_vi_intr(void *arg)
{
	struct irq *irq = arg;

#ifdef DEV_NETMAP
	if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) {
		t4_nm_intr(irq->nm_rxq);
		atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON);
	}
#endif
	if (irq->rxq != NULL)
		t4_intr(irq->rxq);
}

static inline int
sort_before_lro(struct lro_ctrl *lro)
{

	return (lro->lro_mbuf_max != 0);
}

/*
 * Deals with anything and everything on the given ingress queue.
 */
static int
service_iq(struct sge_iq *iq, int budget)
{
	struct sge_iq *q;
	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
	struct adapter *sc = iq->adapter;
	struct iq_desc *d = &iq->desc[iq->cidx];
	int ndescs = 0, limit;
	int rsp_type, refill;
	uint32_t lq;
	uint16_t fl_hw_cidx;
	struct mbuf *m0;
	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
#if defined(INET) || defined(INET6)
	const struct timeval lro_timeout = {0, sc->lro_timeout};
	struct lro_ctrl *lro = &rxq->lro;
#endif

	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));

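	/*
	 * limit caps how many descriptors are processed before the consumer
	 * index is pushed to the hardware and, when running with a budget,
	 * before the iq is handed back to the caller.
	 */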
	limit = budget ? budget : iq->qsize / 16;

	if (iq->flags & IQ_HAS_FL) {
		fl = &rxq->fl;
		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
	} else {
		fl = NULL;
		fl_hw_cidx = 0;			/* to silence gcc warning */
	}

#if defined(INET) || defined(INET6)
	if (iq->flags & IQ_ADJ_CREDIT) {
		MPASS(sort_before_lro(lro));
		iq->flags &= ~IQ_ADJ_CREDIT;
		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
			tcp_lro_flush_all(lro);
			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
			    V_INGRESSQID((u32)iq->cntxt_id) |
			    V_SEINTARM(iq->intr_params));
			return (0);
		}
		ndescs = 1;
	}
#else
	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
#endif

	/*
	 * We always come back and check the descriptor ring for new indirect
	 * interrupts and other responses after running a single handler.
	 */
	for (;;) {
		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {

			rmb();

			refill = 0;
			m0 = NULL;
			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
			lq = be32toh(d->rsp.pldbuflen_qid);

			switch (rsp_type) {
			case X_RSPD_TYPE_FLBUF:

				KASSERT(iq->flags & IQ_HAS_FL,
				    ("%s: data for an iq (%p) with no freelist",
				    __func__, iq));

				m0 = get_fl_payload(sc, fl, lq);
				if (__predict_false(m0 == NULL))
					goto process_iql;
				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx,
				    fl->sidx) > 2;
#ifdef T4_PKT_TIMESTAMP
				/*
				 * 60 bit timestamp for the payload is
				 * *(uint64_t *)m0->m_pktdat.  Note that it is
				 * in the leading free-space in the mbuf.  The
				 * kernel can clobber it during a pullup,
				 * m_copymdata, etc.  You need to make sure that
				 * the mbuf reaches you unmolested if you care
				 * about the timestamp.
				 */
				*(uint64_t *)m0->m_pktdat =
				    be64toh(d->rsp.u.last_flit) &
				    0xfffffffffffffff;
#endif

				/* fall through */

			case X_RSPD_TYPE_CPL:
				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
				    ("%s: bad opcode %02x.", __func__,
				    d->rss.opcode));
				t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
				break;

			case X_RSPD_TYPE_INTR:

				/*
				 * Interrupts should be forwarded only to queues
				 * that are not forwarding their interrupts.
				 * This means service_iq can recurse but only 1
				 * level deep.
				 */
				KASSERT(budget == 0,
				    ("%s: budget %u, rsp_type %u", __func__,
				    budget, rsp_type));

				/*
				 * There are 1K interrupt-capable queues (qids 0
				 * through 1023).  A response type indicating a
				 * forwarded interrupt with a qid >= 1K is an
				 * iWARP async notification.
				 */
				if (lq >= 1024) {
					t4_an_handler(iq, &d->rsp);
					break;
				}

				q = sc->sge.iqmap[lq - sc->sge.iq_start -
				    sc->sge.iq_base];
				if (atomic_cmpset_int(&q->state, IQS_IDLE,
				    IQS_BUSY)) {
					if (service_iq(q, q->qsize / 16) == 0) {
						atomic_cmpset_int(&q->state,
						    IQS_BUSY, IQS_IDLE);
					} else {
						STAILQ_INSERT_TAIL(&iql, q,
						    link);
					}
				}
				break;

			default:
				KASSERT(0,
				    ("%s: illegal response type %d on iq %p",
				    __func__, rsp_type, iq));
				log(LOG_ERR,
				    "%s: illegal response type %d on iq %p",
				    device_get_nameunit(sc->dev), rsp_type, iq);
				break;
			}

			d++;
			if (__predict_false(++iq->cidx == iq->sidx)) {
				iq->cidx = 0;
				iq->gen ^= F_RSPD_GEN;
				d = &iq->desc[0];
			}
			if (__predict_false(++ndescs == limit)) {
				t4_write_reg(sc, sc->sge_gts_reg,
				    V_CIDXINC(ndescs) |
				    V_INGRESSQID(iq->cntxt_id) |
				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
				ndescs = 0;

#if defined(INET) || defined(INET6)
				if (iq->flags & IQ_LRO_ENABLED &&
				    !sort_before_lro(lro) &&
				    sc->lro_timeout != 0) {
					tcp_lro_flush_inactive(lro,
					    &lro_timeout);
				}
#endif

				if (budget) {
					if (iq->flags & IQ_HAS_FL) {
						FL_LOCK(fl);
						refill_fl(sc, fl, 32);
						FL_UNLOCK(fl);
					}
					return (EINPROGRESS);
				}
			}
			if (refill) {
				FL_LOCK(fl);
				refill_fl(sc, fl, 32);
				FL_UNLOCK(fl);
				fl_hw_cidx = fl->hw_cidx;
			}
		}

process_iql:
		if (STAILQ_EMPTY(&iql))
			break;

		/*
		 * Process the head only, and send it to the back of the list if
		 * it's still not done.
		 */
		q = STAILQ_FIRST(&iql);
		STAILQ_REMOVE_HEAD(&iql, link);
		if (service_iq(q, q->qsize / 8) == 0)
			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
		else
			STAILQ_INSERT_TAIL(&iql, q, link);
	}

#if defined(INET) || defined(INET6)
	if (iq->flags & IQ_LRO_ENABLED) {
		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
			MPASS(sort_before_lro(lro));
			/* hold back one credit and don't flush LRO state */
			iq->flags |= IQ_ADJ_CREDIT;
			ndescs--;
		} else {
			tcp_lro_flush_all(lro);
		}
	}
#endif

	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));

	if (iq->flags & IQ_HAS_FL) {
		int starved;

		FL_LOCK(fl);
		starved = refill_fl(sc, fl, 64);
		FL_UNLOCK(fl);
		if (__predict_false(starved != 0))
			add_fl_to_sfl(sc, fl);
	}

	return (0);
}

static inline int
cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
{
	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;

	if (rc)
		MPASS(cll->region3 >= CL_METADATA_SIZE);

	return (rc);
}

static inline struct cluster_metadata *
cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
    caddr_t cl)
{

	if (cl_has_metadata(fl, cll)) {
		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];

		return ((struct cluster_metadata *)(cl + swz->size) - 1);
	}
	return (NULL);
}

static void
rxb_free(struct mbuf *m)
{
	uma_zone_t zone = m->m_ext.ext_arg1;
	void *cl = m->m_ext.ext_arg2;

	uma_zfree(zone, cl);
	counter_u64_add(extfree_rels, 1);
}

/*
 * The mbuf returned by this function could be allocated from zone_mbuf or
 * constructed in spare room in the cluster.
 *
 * The mbuf carries the payload in one of these ways:
 * a) frame inside the mbuf (mbuf allocated from zone_mbuf)
 * b) m_cljset (cluster without metadata, mbuf from zone_mbuf)
 * c) m_extaddref (cluster with metadata, mbuf inlined in the cluster)
 * d) m_extaddref (cluster with metadata, mbuf from zone_mbuf)
 */
static struct mbuf *
get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
    int remaining)
{
	struct mbuf *m;
	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
	struct cluster_layout *cll = &sd->cll;
	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
	int len, blen;
	caddr_t payload;

	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
	len = min(remaining, blen);
	payload = sd->cl + cll->region1 + fl->rx_offset;
	if (fl->flags & FL_BUF_PACKING) {
		const u_int l = fr_offset + len;
		const u_int pad = roundup2(l, fl->buf_boundary) - l;

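		/*
		 * Consume the pad up to the next pack boundary along with this
		 * segment, but only if the padded length still fits inside the
		 * current hw buffer.
		 */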
1701 		if (fl->rx_offset + len + pad < hwb->size)
1702 			blen = len + pad;
1703 		MPASS(fl->rx_offset + blen <= hwb->size);
1704 	} else {
1705 		MPASS(fl->rx_offset == 0);	/* not packing */
1706 	}
1707 
1708 
1709 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1710 
1711 		/*
1712 		 * Copy payload into a freshly allocated mbuf.
1713 		 */
1714 
1715 		m = fr_offset == 0 ?
1716 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1717 		if (m == NULL)
1718 			return (NULL);
1719 		fl->mbuf_allocated++;
1720 #ifdef T4_PKT_TIMESTAMP
1721 		/* Leave room for a timestamp */
1722 		m->m_data += 8;
1723 #endif
1724 		/* copy data to mbuf */
1725 		bcopy(payload, mtod(m, caddr_t), len);
1726 
1727 	} else if (sd->nmbuf * MSIZE < cll->region1) {
1728 
1729 		/*
1730 		 * There's spare room in the cluster for an mbuf.  Create one
1731 		 * and associate it with the payload that's in the cluster.
1732 		 */
1733 
1734 		MPASS(clm != NULL);
1735 		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
1736 		/* No bzero required */
1737 		if (m_init(m, M_NOWAIT, MT_DATA,
1738 		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
1739 			return (NULL);
1740 		fl->mbuf_inlined++;
1741 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
1742 		    swz->zone, sd->cl);
1743 		if (sd->nmbuf++ == 0)
1744 			counter_u64_add(extfree_refs, 1);
1745 
1746 	} else {
1747 
1748 		/*
1749 		 * Grab an mbuf from zone_mbuf and associate it with the
1750 		 * payload in the cluster.
1751 		 */
1752 
1753 		m = fr_offset == 0 ?
1754 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1755 		if (m == NULL)
1756 			return (NULL);
1757 		fl->mbuf_allocated++;
1758 		if (clm != NULL) {
1759 			m_extaddref(m, payload, blen, &clm->refcount,
1760 			    rxb_free, swz->zone, sd->cl);
1761 			if (sd->nmbuf++ == 0)
1762 				counter_u64_add(extfree_refs, 1);
1763 		} else {
1764 			m_cljset(m, sd->cl, swz->type);
1765 			sd->cl = NULL;	/* consumed, not a recycle candidate */
1766 		}
1767 	}
1768 	if (fr_offset == 0)
1769 		m->m_pkthdr.len = remaining;
1770 	m->m_len = len;
1771 
1772 	if (fl->flags & FL_BUF_PACKING) {
1773 		fl->rx_offset += blen;
1774 		MPASS(fl->rx_offset <= hwb->size);
1775 		if (fl->rx_offset < hwb->size)
1776 			return (m);	/* without advancing the cidx */
1777 	}
1778 
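	/*
	 * fl->cidx counts individual buffers while the hardware tracks the
	 * freelist in units of 8 (one 64B descriptor holds 8 buffer
	 * addresses), so hw_cidx advances once per 8 buffers consumed.
	 */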
1779 	if (__predict_false(++fl->cidx % 8 == 0)) {
1780 		uint16_t cidx = fl->cidx / 8;
1781 
1782 		if (__predict_false(cidx == fl->sidx))
1783 			fl->cidx = cidx = 0;
1784 		fl->hw_cidx = cidx;
1785 	}
1786 	fl->rx_offset = 0;
1787 
1788 	return (m);
1789 }
1790 
1791 static struct mbuf *
1792 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
1793 {
1794 	struct mbuf *m0, *m, **pnext;
1795 	u_int remaining;
1796 	const u_int total = G_RSPD_LEN(len_newbuf);
1797 
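	/*
	 * A previous call ran out of mbufs partway through this frame and
	 * parked its progress in the fl; resume where it left off.
	 */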
1798 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1799 		M_ASSERTPKTHDR(fl->m0);
1800 		MPASS(fl->m0->m_pkthdr.len == total);
1801 		MPASS(fl->remaining < total);
1802 
1803 		m0 = fl->m0;
1804 		pnext = fl->pnext;
1805 		remaining = fl->remaining;
1806 		fl->flags &= ~FL_BUF_RESUME;
1807 		goto get_segment;
1808 	}
1809 
1810 	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
1811 		fl->rx_offset = 0;
1812 		if (__predict_false(++fl->cidx % 8 == 0)) {
1813 			uint16_t cidx = fl->cidx / 8;
1814 
1815 			if (__predict_false(cidx == fl->sidx))
1816 				fl->cidx = cidx = 0;
1817 			fl->hw_cidx = cidx;
1818 		}
1819 	}
1820 
1821 	/*
1822 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
1823 	 * 'total' and it may span multiple hw buffers.
1824 	 */
1825 
1826 	m0 = get_scatter_segment(sc, fl, 0, total);
1827 	if (m0 == NULL)
1828 		return (NULL);
1829 	remaining = total - m0->m_len;
1830 	pnext = &m0->m_next;
1831 	while (remaining > 0) {
1832 get_segment:
1833 		MPASS(fl->rx_offset == 0);
1834 		m = get_scatter_segment(sc, fl, total - remaining, remaining);
1835 		if (__predict_false(m == NULL)) {
1836 			fl->m0 = m0;
1837 			fl->pnext = pnext;
1838 			fl->remaining = remaining;
1839 			fl->flags |= FL_BUF_RESUME;
1840 			return (NULL);
1841 		}
1842 		*pnext = m;
1843 		pnext = &m->m_next;
1844 		remaining -= m->m_len;
1845 	}
1846 	*pnext = NULL;
1847 
1848 	M_ASSERTPKTHDR(m0);
1849 	return (m0);
1850 }
1851 
1852 static int
1853 t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1854 {
1855 	struct sge_rxq *rxq = iq_to_rxq(iq);
1856 	struct ifnet *ifp = rxq->ifp;
1857 	struct adapter *sc = iq->adapter;
1858 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1859 #if defined(INET) || defined(INET6)
1860 	struct lro_ctrl *lro = &rxq->lro;
1861 #endif
1862 	static const int sw_hashtype[4][2] = {
1863 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
1864 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
1865 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
1866 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
1867 	};
1868 
1869 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1870 	    rss->opcode));
1871 
1872 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
1873 	m0->m_len -= sc->params.sge.fl_pktshift;
1874 	m0->m_data += sc->params.sge.fl_pktshift;
1875 
1876 	m0->m_pkthdr.rcvif = ifp;
1877 	M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]);
1878 	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
1879 
1880 	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
1881 		if (ifp->if_capenable & IFCAP_RXCSUM &&
1882 		    cpl->l2info & htobe32(F_RXF_IP)) {
1883 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1884 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1885 			rxq->rxcsum++;
1886 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1887 		    cpl->l2info & htobe32(F_RXF_IP6)) {
1888 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1889 			    CSUM_PSEUDO_HDR);
1890 			rxq->rxcsum++;
1891 		}
1892 
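		/*
		 * For IP fragments the hardware cannot verify the full L4
		 * checksum, so hand its partial sum to the stack; 0xffff
		 * means the checksum was verified in full.
		 */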
1893 		if (__predict_false(cpl->ip_frag))
1894 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1895 		else
1896 			m0->m_pkthdr.csum_data = 0xffff;
1897 	}
1898 
1899 	if (cpl->vlan_ex) {
1900 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1901 		m0->m_flags |= M_VLANTAG;
1902 		rxq->vlan_extraction++;
1903 	}
1904 
1905 #if defined(INET) || defined(INET6)
1906 	if (iq->flags & IQ_LRO_ENABLED) {
1907 		if (sort_before_lro(lro)) {
1908 			tcp_lro_queue_mbuf(lro, m0);
1909 			return (0); /* queued for sort, then LRO */
1910 		}
1911 		if (tcp_lro_rx(lro, m0, 0) == 0)
1912 			return (0); /* queued for LRO */
1913 	}
1914 #endif
1915 	ifp->if_input(ifp, m0);
1916 
1917 	return (0);
1918 }
1919 
1920 /*
1921  * Must drain the wrq or make sure that someone else will.
1922  */
1923 static void
1924 wrq_tx_drain(void *arg, int n)
1925 {
1926 	struct sge_wrq *wrq = arg;
1927 	struct sge_eq *eq = &wrq->eq;
1928 
1929 	EQ_LOCK(eq);
1930 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
1931 		drain_wrq_wr_list(wrq->adapter, wrq);
1932 	EQ_UNLOCK(eq);
1933 }
1934 
1935 static void
1936 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
1937 {
1938 	struct sge_eq *eq = &wrq->eq;
1939 	u_int available, dbdiff;	/* # of hardware descriptors */
1940 	u_int n;
1941 	struct wrqe *wr;
1942 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
1943 
1944 	EQ_LOCK_ASSERT_OWNED(eq);
1945 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
1946 	wr = STAILQ_FIRST(&wrq->wr_list);
1947 	MPASS(wr != NULL);	/* Must be called with something useful to do */
1948 	MPASS(eq->pidx == eq->dbidx);
1949 	dbdiff = 0;
1950 
1951 	do {
1952 		eq->cidx = read_hw_cidx(eq);
1953 		if (eq->pidx == eq->cidx)
1954 			available = eq->sidx - 1;
1955 		else
1956 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
1957 
1958 		MPASS(wr->wrq == wrq);
1959 		n = howmany(wr->wr_len, EQ_ESIZE);
1960 		if (available < n)
1961 			break;
1962 
1963 		dst = (void *)&eq->desc[eq->pidx];
1964 		if (__predict_true(eq->sidx - eq->pidx > n)) {
1965 			/* Won't wrap, won't end exactly at the status page. */
1966 			bcopy(&wr->wr[0], dst, wr->wr_len);
1967 			eq->pidx += n;
1968 		} else {
1969 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
1970 
1971 			bcopy(&wr->wr[0], dst, first_portion);
1972 			if (wr->wr_len > first_portion) {
1973 				bcopy(&wr->wr[first_portion], &eq->desc[0],
1974 				    wr->wr_len - first_portion);
1975 			}
1976 			eq->pidx = n - (eq->sidx - eq->pidx);
1977 		}
1978 		wrq->tx_wrs_copied++;
1979 
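		/*
		 * Request an egress update: unconditionally once the EQ is
		 * more than 3/4 full, and otherwise at least every 32
		 * descriptors, so reclaimable space is reported promptly.
		 */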
1980 		if (available < eq->sidx / 4 &&
1981 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
1982 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
1983 			    F_FW_WR_EQUEQ);
1984 			eq->equeqidx = eq->pidx;
1985 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
1986 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
1987 			eq->equeqidx = eq->pidx;
1988 		}
1989 
1990 		dbdiff += n;
1991 		if (dbdiff >= 16) {
1992 			ring_eq_db(sc, eq, dbdiff);
1993 			dbdiff = 0;
1994 		}
1995 
1996 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1997 		free_wrqe(wr);
1998 		MPASS(wrq->nwr_pending > 0);
1999 		wrq->nwr_pending--;
2000 		MPASS(wrq->ndesc_needed >= n);
2001 		wrq->ndesc_needed -= n;
2002 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
2003 
2004 	if (dbdiff)
2005 		ring_eq_db(sc, eq, dbdiff);
2006 }
2007 
2008 /*
2009  * Doesn't fail.  Holds on to work requests it can't send right away.
2010  */
2011 void
2012 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
2013 {
2014 #ifdef INVARIANTS
2015 	struct sge_eq *eq = &wrq->eq;
2016 #endif
2017 
2018 	EQ_LOCK_ASSERT_OWNED(eq);
2019 	MPASS(wr != NULL);
2020 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
2021 	MPASS((wr->wr_len & 0x7) == 0);
2022 
2023 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
2024 	wrq->nwr_pending++;
2025 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
2026 
2027 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
2028 		return;	/* commit_wrq_wr will drain wr_list as well. */
2029 
2030 	drain_wrq_wr_list(sc, wrq);
2031 
2032 	/* Doorbell must have caught up to the pidx. */
2033 	MPASS(eq->pidx == eq->dbidx);
2034 }
2035 
2036 void
2037 t4_update_fl_bufsize(struct ifnet *ifp)
2038 {
2039 	struct vi_info *vi = ifp->if_softc;
2040 	struct adapter *sc = vi->pi->adapter;
2041 	struct sge_rxq *rxq;
2042 #ifdef TCP_OFFLOAD
2043 	struct sge_ofld_rxq *ofld_rxq;
2044 #endif
2045 	struct sge_fl *fl;
2046 	int i, maxp, mtu = ifp->if_mtu;
2047 
2048 	maxp = mtu_to_max_payload(sc, mtu, 0);
2049 	for_each_rxq(vi, i, rxq) {
2050 		fl = &rxq->fl;
2051 
2052 		FL_LOCK(fl);
2053 		find_best_refill_source(sc, fl, maxp);
2054 		FL_UNLOCK(fl);
2055 	}
2056 #ifdef TCP_OFFLOAD
2057 	maxp = mtu_to_max_payload(sc, mtu, 1);
2058 	for_each_ofld_rxq(vi, i, ofld_rxq) {
2059 		fl = &ofld_rxq->fl;
2060 
2061 		FL_LOCK(fl);
2062 		find_best_refill_source(sc, fl, maxp);
2063 		FL_UNLOCK(fl);
2064 	}
2065 #endif
2066 }
2067 
2068 static inline int
2069 mbuf_nsegs(struct mbuf *m)
2070 {
2071 
2072 	M_ASSERTPKTHDR(m);
2073 	KASSERT(m->m_pkthdr.l5hlen > 0,
2074 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
2075 
2076 	return (m->m_pkthdr.l5hlen);
2077 }
2078 
2079 static inline void
2080 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
2081 {
2082 
2083 	M_ASSERTPKTHDR(m);
2084 	m->m_pkthdr.l5hlen = nsegs;
2085 }
2086 
2087 static inline int
2088 mbuf_len16(struct mbuf *m)
2089 {
2090 	int n;
2091 
2092 	M_ASSERTPKTHDR(m);
2093 	n = m->m_pkthdr.PH_loc.eight[0];
2094 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2095 
2096 	return (n);
2097 }
2098 
2099 static inline void
2100 set_mbuf_len16(struct mbuf *m, uint8_t len16)
2101 {
2102 
2103 	M_ASSERTPKTHDR(m);
2104 	m->m_pkthdr.PH_loc.eight[0] = len16;
2105 }
2106 
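/*
 * While an mbuf waits in the driver its tx metadata lives in otherwise
 * unused pkthdr fields: l5hlen holds the # of gather list segments and
 * PH_loc.eight[0] the WR length in units of 16B.  eight[1..3] carry the
 * equivalent Ethofld state when RATELIMIT is enabled.
 */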
2107 #ifdef RATELIMIT
2108 static inline int
2109 mbuf_eo_nsegs(struct mbuf *m)
2110 {
2111 
2112 	M_ASSERTPKTHDR(m);
2113 	return (m->m_pkthdr.PH_loc.eight[1]);
2114 }
2115 
2116 static inline void
2117 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
2118 {
2119 
2120 	M_ASSERTPKTHDR(m);
2121 	m->m_pkthdr.PH_loc.eight[1] = nsegs;
2122 }
2123 
2124 static inline int
2125 mbuf_eo_len16(struct mbuf *m)
2126 {
2127 	int n;
2128 
2129 	M_ASSERTPKTHDR(m);
2130 	n = m->m_pkthdr.PH_loc.eight[2];
2131 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2132 
2133 	return (n);
2134 }
2135 
2136 static inline void
2137 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
2138 {
2139 
2140 	M_ASSERTPKTHDR(m);
2141 	m->m_pkthdr.PH_loc.eight[2] = len16;
2142 }
2143 
2144 static inline int
2145 mbuf_eo_tsclk_tsoff(struct mbuf *m)
2146 {
2147 
2148 	M_ASSERTPKTHDR(m);
2149 	return (m->m_pkthdr.PH_loc.eight[3]);
2150 }
2151 
2152 static inline void
2153 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
2154 {
2155 
2156 	M_ASSERTPKTHDR(m);
2157 	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
2158 }
2159 
2160 static inline int
2161 needs_eo(struct mbuf *m)
2162 {
2163 
2164 	return (m->m_pkthdr.snd_tag != NULL);
2165 }
2166 #endif
2167 
2168 static inline int
2169 needs_tso(struct mbuf *m)
2170 {
2171 
2172 	M_ASSERTPKTHDR(m);
2173 
2174 	return (m->m_pkthdr.csum_flags & CSUM_TSO);
2175 }
2176 
2177 static inline int
2178 needs_l3_csum(struct mbuf *m)
2179 {
2180 
2181 	M_ASSERTPKTHDR(m);
2182 
2183 	return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
2184 }
2185 
2186 static inline int
2187 needs_l4_csum(struct mbuf *m)
2188 {
2189 
2190 	M_ASSERTPKTHDR(m);
2191 
2192 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
2193 	    CSUM_TCP_IPV6 | CSUM_TSO));
2194 }
2195 
2196 static inline int
2197 needs_tcp_csum(struct mbuf *m)
2198 {
2199 
2200 	M_ASSERTPKTHDR(m);
2201 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
2202 }
2203 
2204 #ifdef RATELIMIT
2205 static inline int
2206 needs_udp_csum(struct mbuf *m)
2207 {
2208 
2209 	M_ASSERTPKTHDR(m);
2210 	return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
2211 }
2212 #endif
2213 
2214 static inline int
2215 needs_vlan_insertion(struct mbuf *m)
2216 {
2217 
2218 	M_ASSERTPKTHDR(m);
2219 
2220 	return (m->m_flags & M_VLANTAG);
2221 }
2222 
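/*
 * Advance 'len' bytes from the position (*pm, *poffset) in the mbuf chain
 * and return a pointer to the new position, updating *pm and *poffset.
 * The target byte must exist within the chain.
 */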
2223 static void *
2224 m_advance(struct mbuf **pm, int *poffset, int len)
2225 {
2226 	struct mbuf *m = *pm;
2227 	int offset = *poffset;
2228 	uintptr_t p = 0;
2229 
2230 	MPASS(len > 0);
2231 
2232 	for (;;) {
2233 		if (offset + len < m->m_len) {
2234 			offset += len;
2235 			p = mtod(m, uintptr_t) + offset;
2236 			break;
2237 		}
2238 		len -= m->m_len - offset;
2239 		m = m->m_next;
2240 		offset = 0;
2241 		MPASS(m != NULL);
2242 	}
2243 	*poffset = offset;
2244 	*pm = m;
2245 	return ((void *)p);
2246 }
2247 
2248 /*
2249  * Can deal with zero-length mbufs in the chain, but the chain must have at
2250  * least one mbuf that's not empty.  It is possible for this
2251  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
2252  */
2253 static inline int
2254 count_mbuf_nsegs(struct mbuf *m, int skip)
2255 {
2256 	vm_paddr_t lastb, next;
2257 	vm_offset_t va;
2258 	int len, nsegs;
2259 
2260 	M_ASSERTPKTHDR(m);
2261 	MPASS(m->m_pkthdr.len > 0);
2262 	MPASS(m->m_pkthdr.len >= skip);
2263 
2264 	nsegs = 0;
2265 	lastb = 0;
2266 	for (; m; m = m->m_next) {
2267 
2268 		len = m->m_len;
2269 		if (__predict_false(len == 0))
2270 			continue;
2271 		if (skip >= len) {
2272 			skip -= len;
2273 			continue;
2274 		}
2275 		va = mtod(m, vm_offset_t) + skip;
2276 		len -= skip;
2277 		skip = 0;
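		/*
		 * If this mbuf's data starts at the physical address right
		 * after the previous mbuf's last byte, the two share a DMA
		 * segment; take back the segment that sglist_count() counted
		 * separately.
		 */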
2278 		next = pmap_kextract(va);
2279 		nsegs += sglist_count((void *)(uintptr_t)va, len);
2280 		if (lastb + 1 == next)
2281 			nsegs--;
2282 		lastb = pmap_kextract(va + len - 1);
2283 	}
2284 
2285 	return (nsegs);
2286 }
2287 
2288 /*
2289  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
2290  * a) caller can assume it's been freed if this function returns with an error.
2291  * b) it may get defragged if the gather list is too long for the hardware.
2292  */
2293 int
2294 parse_pkt(struct adapter *sc, struct mbuf **mp)
2295 {
2296 	struct mbuf *m0 = *mp, *m;
2297 	int rc, nsegs, defragged = 0, offset;
2298 	struct ether_header *eh;
2299 	void *l3hdr;
2300 #if defined(INET) || defined(INET6)
2301 	struct tcphdr *tcp;
2302 #endif
2303 	uint16_t eh_type;
2304 
2305 	M_ASSERTPKTHDR(m0);
2306 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2307 		rc = EINVAL;
2308 fail:
2309 		m_freem(m0);
2310 		*mp = NULL;
2311 		return (rc);
2312 	}
2313 restart:
2314 	/*
2315 	 * First count the number of gather list segments in the payload.
2316 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
2317 	 */
2318 	M_ASSERTPKTHDR(m0);
2319 	MPASS(m0->m_pkthdr.len > 0);
2320 	nsegs = count_mbuf_nsegs(m0, 0);
2321 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
2322 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
2323 			rc = EFBIG;
2324 			goto fail;
2325 		}
2326 		*mp = m0 = m;	/* update caller's copy after defrag */
2327 		goto restart;
2328 	}
2329 
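	/*
	 * A short frame scattered across 3+ segments is cheaper to send from
	 * a single mbuf; pull it up if the whole frame fits in one.
	 */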
2330 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
2331 		m0 = m_pullup(m0, m0->m_pkthdr.len);
2332 		if (m0 == NULL) {
2333 			/* Should have left well enough alone. */
2334 			rc = EFBIG;
2335 			goto fail;
2336 		}
2337 		*mp = m0;	/* update caller's copy after pullup */
2338 		goto restart;
2339 	}
2340 	set_mbuf_nsegs(m0, nsegs);
2341 	if (sc->flags & IS_VF)
2342 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
2343 	else
2344 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
2345 
2346 #ifdef RATELIMIT
2347 	/*
2348 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
2349 	 * checksumming is enabled.  needs_l4_csum happens to check for all the
2350 	 * right things.
2351 	 */
2352 	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0)))
2353 		m0->m_pkthdr.snd_tag = NULL;
2354 #endif
2355 
2356 	if (!needs_tso(m0) &&
2357 #ifdef RATELIMIT
2358 	    !needs_eo(m0) &&
2359 #endif
2360 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
2361 		return (0);
2362 
2363 	m = m0;
2364 	eh = mtod(m, struct ether_header *);
2365 	eh_type = ntohs(eh->ether_type);
2366 	if (eh_type == ETHERTYPE_VLAN) {
2367 		struct ether_vlan_header *evh = (void *)eh;
2368 
2369 		eh_type = ntohs(evh->evl_proto);
2370 		m0->m_pkthdr.l2hlen = sizeof(*evh);
2371 	} else
2372 		m0->m_pkthdr.l2hlen = sizeof(*eh);
2373 
2374 	offset = 0;
2375 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2376 
2377 	switch (eh_type) {
2378 #ifdef INET6
2379 	case ETHERTYPE_IPV6:
2380 	{
2381 		struct ip6_hdr *ip6 = l3hdr;
2382 
2383 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
2384 
2385 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
2386 		break;
2387 	}
2388 #endif
2389 #ifdef INET
2390 	case ETHERTYPE_IP:
2391 	{
2392 		struct ip *ip = l3hdr;
2393 
2394 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
2395 		break;
2396 	}
2397 #endif
2398 	default:
2399 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
2400 		    " with the same INET/INET6 options as the kernel.",
2401 		    __func__, eh_type);
2402 	}
2403 
2404 #if defined(INET) || defined(INET6)
2405 	if (needs_tcp_csum(m0)) {
2406 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2407 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2408 #ifdef RATELIMIT
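		/*
		 * 0x0101080a is NOP, NOP, TIMESTAMP, len 10: the common
		 * layout of the TCP timestamp option.
		 */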
2409 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
2410 			set_mbuf_eo_tsclk_tsoff(m0,
2411 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
2412 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
2413 		} else
2414 			set_mbuf_eo_tsclk_tsoff(m0, 0);
2415 	} else if (needs_udp_csum(m0)) {
2416 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
2417 #endif
2418 	}
2419 #ifdef RATELIMIT
2420 	if (needs_eo(m0)) {
2421 		u_int immhdrs;
2422 
2423 		/* EO WRs have the headers in the WR and not the GL. */
2424 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
2425 		    m0->m_pkthdr.l4hlen;
2426 		nsegs = count_mbuf_nsegs(m0, immhdrs);
2427 		set_mbuf_eo_nsegs(m0, nsegs);
2428 		set_mbuf_eo_len16(m0,
2429 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
2430 	}
2431 #endif
2432 #endif
2433 	MPASS(m0 == *mp);
2434 	return (0);
2435 }
2436 
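/*
 * start_wrq_wr and commit_wrq_wr bracket the construction of a work request,
 * building it directly in the descriptor ring when room is available.  A
 * minimal usage sketch (fw_foo_wr stands in for any firmware WR struct):
 *
 *	struct wrq_cookie cookie;
 *	struct fw_foo_wr *wr;
 *
 *	wr = start_wrq_wr(wrq, howmany(sizeof(*wr), 16), &cookie);
 *	if (wr == NULL)
 *		return (ENOMEM);
 *	... fill in *wr ...
 *	commit_wrq_wr(wrq, wr, &cookie);
 */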
2437 void *
2438 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2439 {
2440 	struct sge_eq *eq = &wrq->eq;
2441 	struct adapter *sc = wrq->adapter;
2442 	int ndesc, available;
2443 	struct wrqe *wr;
2444 	void *w;
2445 
2446 	MPASS(len16 > 0);
2447 	ndesc = howmany(len16, EQ_ESIZE / 16);
2448 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2449 
2450 	EQ_LOCK(eq);
2451 
2452 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2453 		drain_wrq_wr_list(sc, wrq);
2454 
2455 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
2456 slowpath:
2457 		EQ_UNLOCK(eq);
2458 		wr = alloc_wrqe(len16 * 16, wrq);
2459 		if (__predict_false(wr == NULL))
2460 			return (NULL);
2461 		cookie->pidx = -1;
2462 		cookie->ndesc = ndesc;
2463 		return (&wr->wr);
2464 	}
2465 
2466 	eq->cidx = read_hw_cidx(eq);
2467 	if (eq->pidx == eq->cidx)
2468 		available = eq->sidx - 1;
2469 	else
2470 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2471 	if (available < ndesc)
2472 		goto slowpath;
2473 
2474 	cookie->pidx = eq->pidx;
2475 	cookie->ndesc = ndesc;
2476 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
2477 
2478 	w = &eq->desc[eq->pidx];
2479 	IDXINCR(eq->pidx, ndesc, eq->sidx);
2480 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
2481 		w = &wrq->ss[0];
2482 		wrq->ss_pidx = cookie->pidx;
2483 		wrq->ss_len = len16 * 16;
2484 	}
2485 
2486 	EQ_UNLOCK(eq);
2487 
2488 	return (w);
2489 }
2490 
2491 void
2492 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
2493 {
2494 	struct sge_eq *eq = &wrq->eq;
2495 	struct adapter *sc = wrq->adapter;
2496 	int ndesc, pidx;
2497 	struct wrq_cookie *prev, *next;
2498 
2499 	if (cookie->pidx == -1) {
2500 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
2501 
2502 		t4_wrq_tx(sc, wr);
2503 		return;
2504 	}
2505 
2506 	if (__predict_false(w == &wrq->ss[0])) {
2507 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
2508 
2509 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
2510 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
2511 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
2512 		wrq->tx_wrs_ss++;
2513 	} else
2514 		wrq->tx_wrs_direct++;
2515 
2516 	EQ_LOCK(eq);
2517 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
2518 	pidx = cookie->pidx;
2519 	MPASS(pidx >= 0 && pidx < eq->sidx);
2520 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
2521 	next = TAILQ_NEXT(cookie, link);
2522 	if (prev == NULL) {
2523 		MPASS(pidx == eq->dbidx);
2524 		if (next == NULL || ndesc >= 16) {
2525 			int available;
2526 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
2527 
2528 			/*
2529 			 * Note that the WR via which we'll request tx updates
2530 			 * is at pidx and not eq->pidx, which has moved on
2531 			 * already.
2532 			 */
2533 			dst = (void *)&eq->desc[pidx];
2534 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2535 			if (available < eq->sidx / 4 &&
2536 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2537 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2538 				    F_FW_WR_EQUEQ);
2539 				eq->equeqidx = pidx;
2540 			} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
2541 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2542 				eq->equeqidx = pidx;
2543 			}
2544 
2545 			ring_eq_db(wrq->adapter, eq, ndesc);
2546 		} else {
2547 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
2548 			next->pidx = pidx;
2549 			next->ndesc += ndesc;
2550 		}
2551 	} else {
2552 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
2553 		prev->ndesc += ndesc;
2554 	}
2555 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
2556 
2557 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2558 		drain_wrq_wr_list(sc, wrq);
2559 
2560 #ifdef INVARIANTS
2561 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
2562 		/* Doorbell must have caught up to the pidx. */
2563 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
2564 	}
2565 #endif
2566 	EQ_UNLOCK(eq);
2567 }
2568 
2569 static u_int
2570 can_resume_eth_tx(struct mp_ring *r)
2571 {
2572 	struct sge_eq *eq = r->cookie;
2573 
2574 	return (total_available_tx_desc(eq) > eq->sidx / 8);
2575 }
2576 
2577 static inline int
2578 cannot_use_txpkts(struct mbuf *m)
2579 {
2580 	/* maybe put a GL limit too, to avoid silliness? */
2581 
2582 	return (needs_tso(m));
2583 }
2584 
2585 static inline int
2586 discard_tx(struct sge_eq *eq)
2587 {
2588 
2589 	return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED);
2590 }
2591 
2592 /*
2593  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
2594  * be consumed.  Return the actual number consumed.  0 indicates a stall.
2595  */
2596 static u_int
2597 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
2598 {
2599 	struct sge_txq *txq = r->cookie;
2600 	struct sge_eq *eq = &txq->eq;
2601 	struct ifnet *ifp = txq->ifp;
2602 	struct vi_info *vi = ifp->if_softc;
2603 	struct port_info *pi = vi->pi;
2604 	struct adapter *sc = pi->adapter;
2605 	u_int total, remaining;		/* # of packets */
2606 	u_int available, dbdiff;	/* # of hardware descriptors */
2607 	u_int n, next_cidx;
2608 	struct mbuf *m0, *tail;
2609 	struct txpkts txp;
2610 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
2611 
2612 	remaining = IDXDIFF(pidx, cidx, r->size);
2613 	MPASS(remaining > 0);	/* Must not be called without work to do. */
2614 	total = 0;
2615 
2616 	TXQ_LOCK(txq);
2617 	if (__predict_false(discard_tx(eq))) {
2618 		while (cidx != pidx) {
2619 			m0 = r->items[cidx];
2620 			m_freem(m0);
2621 			if (++cidx == r->size)
2622 				cidx = 0;
2623 		}
2624 		reclaim_tx_descs(txq, 2048);
2625 		total = remaining;
2626 		goto done;
2627 	}
2628 
2629 	/* How many hardware descriptors are readily available? */
2630 	if (eq->pidx == eq->cidx)
2631 		available = eq->sidx - 1;
2632 	else
2633 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2634 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
2635 
2636 	while (remaining > 0) {
2637 
2638 		m0 = r->items[cidx];
2639 		M_ASSERTPKTHDR(m0);
2640 		MPASS(m0->m_nextpkt == NULL);
2641 
2642 		if (available < SGE_MAX_WR_NDESC) {
2643 			available += reclaim_tx_descs(txq, 64);
2644 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
2645 				break;	/* out of descriptors */
2646 		}
2647 
2648 		next_cidx = cidx + 1;
2649 		if (__predict_false(next_cidx == r->size))
2650 			next_cidx = 0;
2651 
2652 		wr = (void *)&eq->desc[eq->pidx];
2653 		if (sc->flags & IS_VF) {
2654 			total++;
2655 			remaining--;
2656 			ETHER_BPF_MTAP(ifp, m0);
2657 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
2658 			    available);
2659 		} else if (remaining > 1 &&
2660 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
2661 
2662 			/* pkts at cidx, next_cidx should both be in txp. */
2663 			MPASS(txp.npkt == 2);
2664 			tail = r->items[next_cidx];
2665 			MPASS(tail->m_nextpkt == NULL);
2666 			ETHER_BPF_MTAP(ifp, m0);
2667 			ETHER_BPF_MTAP(ifp, tail);
2668 			m0->m_nextpkt = tail;
2669 
2670 			if (__predict_false(++next_cidx == r->size))
2671 				next_cidx = 0;
2672 
2673 			while (next_cidx != pidx) {
2674 				if (add_to_txpkts(r->items[next_cidx], &txp,
2675 				    available) != 0)
2676 					break;
2677 				tail->m_nextpkt = r->items[next_cidx];
2678 				tail = tail->m_nextpkt;
2679 				ETHER_BPF_MTAP(ifp, tail);
2680 				if (__predict_false(++next_cidx == r->size))
2681 					next_cidx = 0;
2682 			}
2683 
2684 			n = write_txpkts_wr(txq, wr, m0, &txp, available);
2685 			total += txp.npkt;
2686 			remaining -= txp.npkt;
2687 		} else {
2688 			total++;
2689 			remaining--;
2690 			ETHER_BPF_MTAP(ifp, m0);
2691 			n = write_txpkt_wr(txq, (void *)wr, m0, available);
2692 		}
2693 		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
2694 
2695 		available -= n;
2696 		dbdiff += n;
2697 		IDXINCR(eq->pidx, n, eq->sidx);
2698 
2699 		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
2700 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2701 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2702 			    F_FW_WR_EQUEQ);
2703 			eq->equeqidx = eq->pidx;
2704 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
2705 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2706 			eq->equeqidx = eq->pidx;
2707 		}
2708 
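		/*
		 * Ring the doorbell in batches of at least 16 descriptors
		 * while more work remains, reclaiming completed descriptors
		 * along the way.
		 */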
2709 		if (dbdiff >= 16 && remaining >= 4) {
2710 			ring_eq_db(sc, eq, dbdiff);
2711 			available += reclaim_tx_descs(txq, 4 * dbdiff);
2712 			dbdiff = 0;
2713 		}
2714 
2715 		cidx = next_cidx;
2716 	}
2717 	if (dbdiff != 0) {
2718 		ring_eq_db(sc, eq, dbdiff);
2719 		reclaim_tx_descs(txq, 32);
2720 	}
2721 done:
2722 	TXQ_UNLOCK(txq);
2723 
2724 	return (total);
2725 }
2726 
2727 static inline void
2728 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
2729     int qsize)
2730 {
2731 
2732 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
2733 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
2734 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
2735 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
2736 
2737 	iq->flags = 0;
2738 	iq->adapter = sc;
2739 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
2740 	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
2741 	if (pktc_idx >= 0) {
2742 		iq->intr_params |= F_QINTR_CNT_EN;
2743 		iq->intr_pktc_idx = pktc_idx;
2744 	}
2745 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
2746 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
2747 }
2748 
2749 static inline void
2750 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
2751 {
2752 
2753 	fl->qsize = qsize;
2754 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2755 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
2756 	if (sc->flags & BUF_PACKING_OK &&
2757 	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
2758 	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
2759 		fl->flags |= FL_BUF_PACKING;
2760 	find_best_refill_source(sc, fl, maxp);
2761 	find_safe_refill_source(sc, fl);
2762 }
2763 
2764 static inline void
2765 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
2766     uint8_t tx_chan, uint16_t iqid, char *name)
2767 {
2768 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
2769 
2770 	eq->flags = eqtype & EQ_TYPEMASK;
2771 	eq->tx_chan = tx_chan;
2772 	eq->iqid = iqid;
2773 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2774 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
2775 }
2776 
2777 static int
2778 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
2779     bus_dmamap_t *map, bus_addr_t *pa, void **va)
2780 {
2781 	int rc;
2782 
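	/* Rings are a single physically contiguous, 512B-aligned segment. */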
2783 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
2784 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
2785 	if (rc != 0) {
2786 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
2787 		goto done;
2788 	}
2789 
2790 	rc = bus_dmamem_alloc(*tag, va,
2791 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
2792 	if (rc != 0) {
2793 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
2794 		goto done;
2795 	}
2796 
2797 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
2798 	if (rc != 0) {
2799 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
2800 		goto done;
2801 	}
2802 done:
2803 	if (rc)
2804 		free_ring(sc, *tag, *map, *pa, *va);
2805 
2806 	return (rc);
2807 }
2808 
2809 static int
2810 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
2811     bus_addr_t pa, void *va)
2812 {
2813 	if (pa)
2814 		bus_dmamap_unload(tag, map);
2815 	if (va)
2816 		bus_dmamem_free(tag, va, map);
2817 	if (tag)
2818 		bus_dma_tag_destroy(tag);
2819 
2820 	return (0);
2821 }
2822 
2823 /*
2824  * Allocates the ring for an ingress queue and an optional freelist.  If the
2825  * freelist is specified it will be allocated and then associated with the
2826  * ingress queue.
2827  *
2828  * Returns errno on failure.  Resources allocated up to that point may still be
2829  * allocated.  Caller is responsible for cleanup in case this function fails.
2830  *
2831  * If the ingress queue will take interrupts directly then the intr_idx
2832  * specifies the vector, starting from 0.  -1 means the interrupts for this
2833  * queue should be forwarded to the fwq.
2834  */
2835 static int
2836 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
2837     int intr_idx, int cong)
2838 {
2839 	int rc, i, cntxt_id;
2840 	size_t len;
2841 	struct fw_iq_cmd c;
2842 	struct port_info *pi = vi->pi;
2843 	struct adapter *sc = iq->adapter;
2844 	struct sge_params *sp = &sc->params.sge;
2845 	__be32 v = 0;
2846 
2847 	len = iq->qsize * IQ_ESIZE;
2848 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
2849 	    (void **)&iq->desc);
2850 	if (rc != 0)
2851 		return (rc);
2852 
2853 	bzero(&c, sizeof(c));
2854 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
2855 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
2856 	    V_FW_IQ_CMD_VFN(0));
2857 
2858 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
2859 	    FW_LEN16(c));
2860 
2861 	/* Special handling for firmware event queue */
2862 	if (iq == &sc->sge.fwq)
2863 		v |= F_FW_IQ_CMD_IQASYNCH;
2864 
2865 	if (intr_idx < 0) {
2866 		/* Forwarded interrupts, all headed to fwq */
2867 		v |= F_FW_IQ_CMD_IQANDST;
2868 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
2869 	} else {
2870 		KASSERT(intr_idx < sc->intr_count,
2871 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
2872 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
2873 	}
2874 
2875 	c.type_to_iqandstindex = htobe32(v |
2876 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
2877 	    V_FW_IQ_CMD_VIID(vi->viid) |
2878 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
2879 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
2880 	    F_FW_IQ_CMD_IQGTSMODE |
2881 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
2882 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
2883 	c.iqsize = htobe16(iq->qsize);
2884 	c.iqaddr = htobe64(iq->ba);
2885 	if (cong >= 0)
2886 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
2887 
2888 	if (fl) {
2889 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
2890 
2891 		len = fl->qsize * EQ_ESIZE;
2892 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
2893 		    &fl->ba, (void **)&fl->desc);
2894 		if (rc)
2895 			return (rc);
2896 
2897 		/* Allocate space for one software descriptor per buffer. */
2898 		rc = alloc_fl_sdesc(fl);
2899 		if (rc != 0) {
2900 			device_printf(sc->dev,
2901 			    "failed to setup fl software descriptors: %d\n",
2902 			    rc);
2903 			return (rc);
2904 		}
2905 
2906 		if (fl->flags & FL_BUF_PACKING) {
2907 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
2908 			fl->buf_boundary = sp->pack_boundary;
2909 		} else {
2910 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
2911 			fl->buf_boundary = 16;
2912 		}
2913 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
2914 			fl->buf_boundary = sp->pad_boundary;
2915 
2916 		c.iqns_to_fl0congen |=
2917 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
2918 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
2919 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
2920 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
2921 			    0));
2922 		if (cong >= 0) {
2923 			c.iqns_to_fl0congen |=
2924 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
2925 				    F_FW_IQ_CMD_FL0CONGCIF |
2926 				    F_FW_IQ_CMD_FL0CONGEN);
2927 		}
2928 		c.fl0dcaen_to_fl0cidxfthresh =
2929 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
2930 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
2931 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
2932 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
2933 		c.fl0size = htobe16(fl->qsize);
2934 		c.fl0addr = htobe64(fl->ba);
2935 	}
2936 
2937 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2938 	if (rc != 0) {
2939 		device_printf(sc->dev,
2940 		    "failed to create ingress queue: %d\n", rc);
2941 		return (rc);
2942 	}
2943 
2944 	iq->cidx = 0;
2945 	iq->gen = F_RSPD_GEN;
2946 	iq->intr_next = iq->intr_params;
2947 	iq->cntxt_id = be16toh(c.iqid);
2948 	iq->abs_id = be16toh(c.physiqid);
2949 	iq->flags |= IQ_ALLOCATED;
2950 
2951 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
2952 	if (cntxt_id >= sc->sge.niq) {
2953 		panic("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
2954 		    cntxt_id, sc->sge.niq - 1);
2955 	}
2956 	sc->sge.iqmap[cntxt_id] = iq;
2957 
2958 	if (fl) {
2959 		u_int qid;
2960 
2961 		iq->flags |= IQ_HAS_FL;
2962 		fl->cntxt_id = be16toh(c.fl0id);
2963 		fl->pidx = fl->cidx = 0;
2964 
2965 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
2966 		if (cntxt_id >= sc->sge.neq) {
2967 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
2968 			    __func__, cntxt_id, sc->sge.neq - 1);
2969 		}
2970 		sc->sge.eqmap[cntxt_id] = (void *)fl;
2971 
2972 		qid = fl->cntxt_id;
2973 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
2974 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
2975 			uint32_t mask = (1 << s_qpp) - 1;
2976 			volatile uint8_t *udb;
2977 
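			/*
			 * Locate this queue's user doorbell.  s_qpp is
			 * log2(egress queues per doorbell page); e.g. with
			 * s_qpp = 3 and qid = 21 (values illustrative), the
			 * doorbell is in page 21 >> 3 = 2, slot 21 & 7 = 5.
			 */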
2978 			udb = sc->udbs_base + UDBS_DB_OFFSET;
2979 			udb += (qid >> s_qpp) << PAGE_SHIFT;
2980 			qid &= mask;
2981 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
2982 				udb += qid << UDBS_SEG_SHIFT;
2983 				qid = 0;
2984 			}
2985 			fl->udb = (volatile void *)udb;
2986 		}
2987 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
2988 
2989 		FL_LOCK(fl);
2990 		/* Enough to make sure the SGE doesn't think it's starved */
2991 		refill_fl(sc, fl, fl->lowat);
2992 		FL_UNLOCK(fl);
2993 	}
2994 
2995 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
2996 		uint32_t param, val;
2997 
2998 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
2999 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
3000 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
3001 		if (cong == 0)
3002 			val = 1 << 19;
3003 		else {
3004 			val = 2 << 19;
3005 			for (i = 0; i < 4; i++) {
3006 				if (cong & (1 << i))
3007 					val |= 1 << (i << 2);
3008 			}
3009 		}
3010 
3011 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
3012 		if (rc != 0) {
3013 			/* report error but carry on */
3014 			device_printf(sc->dev,
3015 			    "failed to set congestion manager context for "
3016 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
3017 		}
3018 	}
3019 
3020 	/* Enable IQ interrupts */
3021 	atomic_store_rel_int(&iq->state, IQS_IDLE);
3022 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
3023 	    V_INGRESSQID(iq->cntxt_id));
3024 
3025 	return (0);
3026 }
3027 
3028 static int
3029 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
3030 {
3031 	int rc;
3032 	struct adapter *sc = iq->adapter;
3033 	device_t dev;
3034 
3035 	if (sc == NULL)
3036 		return (0);	/* nothing to do */
3037 
3038 	dev = vi ? vi->dev : sc->dev;
3039 
3040 	if (iq->flags & IQ_ALLOCATED) {
3041 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
3042 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
3043 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
3044 		if (rc != 0) {
3045 			device_printf(dev,
3046 			    "failed to free queue %p: %d\n", iq, rc);
3047 			return (rc);
3048 		}
3049 		iq->flags &= ~IQ_ALLOCATED;
3050 	}
3051 
3052 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
3053 
3054 	bzero(iq, sizeof(*iq));
3055 
3056 	if (fl) {
3057 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
3058 		    fl->desc);
3059 
3060 		if (fl->sdesc)
3061 			free_fl_sdesc(sc, fl);
3062 
3063 		if (mtx_initialized(&fl->fl_lock))
3064 			mtx_destroy(&fl->fl_lock);
3065 
3066 		bzero(fl, sizeof(*fl));
3067 	}
3068 
3069 	return (0);
3070 }
3071 
3072 static void
3073 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
3074     struct sge_iq *iq)
3075 {
3076 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3077 
3078 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
3079 	    "bus address of descriptor ring");
3080 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3081 	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
3082 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
3083 	    CTLTYPE_INT | CTLFLAG_RD, &iq->abs_id, 0, sysctl_uint16, "I",
3084 	    "absolute id of the queue");
3085 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3086 	    CTLTYPE_INT | CTLFLAG_RD, &iq->cntxt_id, 0, sysctl_uint16, "I",
3087 	    "SGE context id of the queue");
3088 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3089 	    CTLTYPE_INT | CTLFLAG_RD, &iq->cidx, 0, sysctl_uint16, "I",
3090 	    "consumer index");
3091 }
3092 
3093 static void
3094 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
3095     struct sysctl_oid *oid, struct sge_fl *fl)
3096 {
3097 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3098 
3099 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
3100 	    "freelist");
3101 	children = SYSCTL_CHILDREN(oid);
3102 
3103 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3104 	    &fl->ba, "bus address of descriptor ring");
3105 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3106 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
3107 	    "desc ring size in bytes");
3108 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3109 	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
3110 	    "SGE context id of the freelist");
3111 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
3112 	    fl_pad ? 1 : 0, "padding enabled");
3113 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
3114 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
3115 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
3116 	    0, "consumer index");
3117 	if (fl->flags & FL_BUF_PACKING) {
3118 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
3119 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
3120 	}
3121 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
3122 	    0, "producer index");
3123 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
3124 	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbufs allocated");
3125 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
3126 	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbufs inlined in clusters");
3127 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
3128 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
3129 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
3130 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
3131 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
3132 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
3133 }
3134 
3135 static int
3136 alloc_fwq(struct adapter *sc)
3137 {
3138 	int rc, intr_idx;
3139 	struct sge_iq *fwq = &sc->sge.fwq;
3140 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
3141 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3142 
3143 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
3144 	if (sc->flags & IS_VF)
3145 		intr_idx = 0;
3146 	else
3147 		intr_idx = sc->intr_count > 1 ? 1 : 0;
3148 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
3149 	if (rc != 0) {
3150 		device_printf(sc->dev,
3151 		    "failed to create firmware event queue: %d\n", rc);
3152 		return (rc);
3153 	}
3154 
3155 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
3156 	    NULL, "firmware event queue");
3157 	add_iq_sysctls(&sc->ctx, oid, fwq);
3158 
3159 	return (0);
3160 }
3161 
3162 static int
3163 free_fwq(struct adapter *sc)
3164 {
3165 	return (free_iq_fl(NULL, &sc->sge.fwq, NULL));
3166 }
3167 
3168 static int
3169 alloc_mgmtq(struct adapter *sc)
3170 {
3171 	int rc;
3172 	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
3173 	char name[16];
3174 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
3175 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3176 
3177 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
3178 	    NULL, "management queue");
3179 
3180 	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
3181 	init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
3182 	    sc->sge.fwq.cntxt_id, name);
3183 	rc = alloc_wrq(sc, NULL, mgmtq, oid);
3184 	if (rc != 0) {
3185 		device_printf(sc->dev,
3186 		    "failed to create management queue: %d\n", rc);
3187 		return (rc);
3188 	}
3189 
3190 	return (0);
3191 }
3192 
3193 static int
3194 free_mgmtq(struct adapter *sc)
3195 {
3196 
3197 	return (free_wrq(sc, &sc->sge.mgmtq));
3198 }
3199 
3200 int
3201 tnl_cong(struct port_info *pi, int drop)
3202 {
3203 
3204 	if (drop == -1)
3205 		return (-1);
3206 	else if (drop == 1)
3207 		return (0);
3208 	else
3209 		return (pi->rx_e_chan_map);
3210 }
3211 
3212 static int
3213 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
3214     struct sysctl_oid *oid)
3215 {
3216 	int rc;
3217 	struct adapter *sc = vi->pi->adapter;
3218 	struct sysctl_oid_list *children;
3219 	char name[16];
3220 
3221 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
3222 	    tnl_cong(vi->pi, cong_drop));
3223 	if (rc != 0)
3224 		return (rc);
3225 
3226 	if (idx == 0)
3227 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
3228 	else
3229 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
3230 		    ("iq_base mismatch"));
3231 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
3232 	    ("PF with non-zero iq_base"));
3233 
3234 	/*
3235 	 * The freelist is just barely above the starvation threshold right
3236 	 * now; fill it up a bit more.
3237 	 */
3238 	FL_LOCK(&rxq->fl);
3239 	refill_fl(sc, &rxq->fl, 128);
3240 	FL_UNLOCK(&rxq->fl);
3241 
3242 #if defined(INET) || defined(INET6)
3243 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
3244 	if (rc != 0)
3245 		return (rc);
3246 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
3247 
3248 	if (vi->ifp->if_capenable & IFCAP_LRO)
3249 		rxq->iq.flags |= IQ_LRO_ENABLED;
3250 #endif
3251 	rxq->ifp = vi->ifp;
3252 
3253 	children = SYSCTL_CHILDREN(oid);
3254 
3255 	snprintf(name, sizeof(name), "%d", idx);
3256 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3257 	    NULL, "rx queue");
3258 	children = SYSCTL_CHILDREN(oid);
3259 
3260 	add_iq_sysctls(&vi->ctx, oid, &rxq->iq);
3261 #if defined(INET) || defined(INET6)
3262 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
3263 	    &rxq->lro.lro_queued, 0, NULL);
3264 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
3265 	    &rxq->lro.lro_flushed, 0, NULL);
3266 #endif
3267 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
3268 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
3269 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
3270 	    CTLFLAG_RD, &rxq->vlan_extraction,
3271 	    "# of times hardware extracted 802.1Q tag");
3272 
3273 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
3274 
3275 	return (rc);
3276 }
3277 
3278 static int
3279 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
3280 {
3281 	int rc;
3282 
3283 #if defined(INET) || defined(INET6)
3284 	if (rxq->lro.ifp) {
3285 		tcp_lro_free(&rxq->lro);
3286 		rxq->lro.ifp = NULL;
3287 	}
3288 #endif
3289 
3290 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
3291 	if (rc == 0)
3292 		bzero(rxq, sizeof(*rxq));
3293 
3294 	return (rc);
3295 }
3296 
3297 #ifdef TCP_OFFLOAD
3298 static int
3299 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
3300     int intr_idx, int idx, struct sysctl_oid *oid)
3301 {
3302 	struct port_info *pi = vi->pi;
3303 	int rc;
3304 	struct sysctl_oid_list *children;
3305 	char name[16];
3306 
3307 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
3308 	if (rc != 0)
3309 		return (rc);
3310 
3311 	children = SYSCTL_CHILDREN(oid);
3312 
3313 	snprintf(name, sizeof(name), "%d", idx);
3314 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3315 	    NULL, "rx queue");
3316 	add_iq_sysctls(&vi->ctx, oid, &ofld_rxq->iq);
3317 	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
3318 
3319 	return (rc);
3320 }
3321 
3322 static int
3323 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
3324 {
3325 	int rc;
3326 
3327 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
3328 	if (rc == 0)
3329 		bzero(ofld_rxq, sizeof(*ofld_rxq));
3330 
3331 	return (rc);
3332 }
3333 #endif
3334 
3335 #ifdef DEV_NETMAP
3336 static int
3337 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
3338     int idx, struct sysctl_oid *oid)
3339 {
3340 	int rc;
3341 	struct sysctl_oid_list *children;
3342 	struct sysctl_ctx_list *ctx;
3343 	char name[16];
3344 	size_t len;
3345 	struct adapter *sc = vi->pi->adapter;
3346 	struct netmap_adapter *na = NA(vi->ifp);
3347 
3348 	MPASS(na != NULL);
3349 
3350 	len = vi->qsize_rxq * IQ_ESIZE;
3351 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
3352 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
3353 	if (rc != 0)
3354 		return (rc);
3355 
3356 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3357 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
3358 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
3359 	if (rc != 0)
3360 		return (rc);
3361 
3362 	nm_rxq->vi = vi;
3363 	nm_rxq->nid = idx;
3364 	nm_rxq->iq_cidx = 0;
3365 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
3366 	nm_rxq->iq_gen = F_RSPD_GEN;
3367 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
3368 	nm_rxq->fl_sidx = na->num_rx_desc;
3369 	nm_rxq->intr_idx = intr_idx;
3370 	nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID;
3371 
3372 	ctx = &vi->ctx;
3373 	children = SYSCTL_CHILDREN(oid);
3374 
3375 	snprintf(name, sizeof(name), "%d", idx);
3376 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
3377 	    "rx queue");
3378 	children = SYSCTL_CHILDREN(oid);
3379 
3380 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
3381 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
3382 	    "I", "absolute id of the queue");
3383 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3384 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
3385 	    "I", "SGE context id of the queue");
3386 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3387 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
3388 	    "consumer index");
3389 
3390 	children = SYSCTL_CHILDREN(oid);
3391 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
3392 	    "freelist");
3393 	children = SYSCTL_CHILDREN(oid);
3394 
3395 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3396 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
3397 	    "I", "SGE context id of the freelist");
3398 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
3399 	    &nm_rxq->fl_cidx, 0, "consumer index");
3400 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
3401 	    &nm_rxq->fl_pidx, 0, "producer index");
3402 
3403 	return (rc);
3404 }
3405 
3407 static int
3408 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
3409 {
3410 	struct adapter *sc = vi->pi->adapter;
3411 
3412 	if (vi->flags & VI_INIT_DONE)
3413 		MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID);
3414 	else
3415 		MPASS(nm_rxq->iq_cntxt_id == 0);
3416 
3417 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
3418 	    nm_rxq->iq_desc);
3419 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
3420 	    nm_rxq->fl_desc);
3421 
3422 	return (0);
3423 }
3424 
3425 static int
3426 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
3427     struct sysctl_oid *oid)
3428 {
3429 	int rc;
3430 	size_t len;
3431 	struct port_info *pi = vi->pi;
3432 	struct adapter *sc = pi->adapter;
3433 	struct netmap_adapter *na = NA(vi->ifp);
3434 	char name[16];
3435 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3436 
3437 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3438 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
3439 	    &nm_txq->ba, (void **)&nm_txq->desc);
3440 	if (rc)
3441 		return (rc);
3442 
3443 	nm_txq->pidx = nm_txq->cidx = 0;
3444 	nm_txq->sidx = na->num_tx_desc;
3445 	nm_txq->nid = idx;
3446 	nm_txq->iqidx = iqidx;
3447 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3448 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
3449 	    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
3450 	    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
3451 	nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID;
3452 
3453 	snprintf(name, sizeof(name), "%d", idx);
3454 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3455 	    NULL, "netmap tx queue");
3456 	children = SYSCTL_CHILDREN(oid);
3457 
3458 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3459 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
3460 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3461 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
3462 	    "consumer index");
3463 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3464 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
3465 	    "producer index");
3466 
3467 	return (rc);
3468 }
3469 
3470 static int
3471 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
3472 {
3473 	struct adapter *sc = vi->pi->adapter;
3474 
3475 	if (vi->flags & VI_INIT_DONE)
3476 		MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID);
3477 	else
3478 		MPASS(nm_txq->cntxt_id == 0);
3479 
3480 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
3481 	    nm_txq->desc);
3482 
3483 	return (0);
3484 }
3485 #endif
3486 
3487 static int
3488 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
3489 {
3490 	int rc, cntxt_id;
3491 	struct fw_eq_ctrl_cmd c;
3492 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3493 
3494 	bzero(&c, sizeof(c));
3495 
3496 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
3497 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
3498 	    V_FW_EQ_CTRL_CMD_VFN(0));
3499 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
3500 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
3501 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
3502 	c.physeqid_pkd = htobe32(0);
3503 	c.fetchszm_to_iqid =
3504 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
3505 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
3506 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
3507 	c.dcaen_to_eqsize =
3508 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3509 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3510 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
3511 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
3512 	c.eqaddr = htobe64(eq->ba);
3513 
3514 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3515 	if (rc != 0) {
3516 		device_printf(sc->dev,
3517 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
3518 		return (rc);
3519 	}
3520 	eq->flags |= EQ_ALLOCATED;
3521 
3522 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
3523 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3524 	if (cntxt_id >= sc->sge.neq)
3525 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3526 		cntxt_id, sc->sge.neq - 1);
3527 	sc->sge.eqmap[cntxt_id] = eq;
3528 
3529 	return (rc);
3530 }
3531 
3532 static int
3533 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3534 {
3535 	int rc, cntxt_id;
3536 	struct fw_eq_eth_cmd c;
3537 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3538 
3539 	bzero(&c, sizeof(c));
3540 
3541 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
3542 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
3543 	    V_FW_EQ_ETH_CMD_VFN(0));
3544 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
3545 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
3546 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
3547 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
3548 	c.fetchszm_to_iqid =
3549 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3550 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
3551 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
3552 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3553 	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3554 	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
3555 	c.eqaddr = htobe64(eq->ba);
3556 
3557 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3558 	if (rc != 0) {
3559 		device_printf(vi->dev,
3560 		    "failed to create Ethernet egress queue: %d\n", rc);
3561 		return (rc);
3562 	}
3563 	eq->flags |= EQ_ALLOCATED;
3564 
3565 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
3566 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
3567 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3568 	if (cntxt_id >= sc->sge.neq)
3569 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3570 		cntxt_id, sc->sge.neq - 1);
3571 	sc->sge.eqmap[cntxt_id] = eq;
3572 
3573 	return (rc);
3574 }
3575 
3576 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
3577 static int
3578 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3579 {
3580 	int rc, cntxt_id;
3581 	struct fw_eq_ofld_cmd c;
3582 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3583 
3584 	bzero(&c, sizeof(c));
3585 
3586 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
3587 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
3588 	    V_FW_EQ_OFLD_CMD_VFN(0));
3589 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
3590 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
3591 	c.fetchszm_to_iqid =
3592 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3593 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
3594 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
3595 	c.dcaen_to_eqsize =
3596 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3597 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3598 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
3599 	c.eqaddr = htobe64(eq->ba);
3600 
3601 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3602 	if (rc != 0) {
3603 		device_printf(vi->dev,
3604 		    "failed to create egress queue for TCP offload: %d\n", rc);
3605 		return (rc);
3606 	}
3607 	eq->flags |= EQ_ALLOCATED;
3608 
3609 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
3610 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3611 	if (cntxt_id >= sc->sge.neq)
3612 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3613 		cntxt_id, sc->sge.neq - 1);
3614 	sc->sge.eqmap[cntxt_id] = eq;
3615 
3616 	return (rc);
3617 }
3618 #endif
3619 
3620 static int
3621 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3622 {
3623 	int rc, qsize;
3624 	size_t len;
3625 
3626 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
3627 
3628 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3629 	len = qsize * EQ_ESIZE;
3630 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
3631 	    &eq->ba, (void **)&eq->desc);
3632 	if (rc)
3633 		return (rc);
3634 
3635 	eq->pidx = eq->cidx = 0;
3636 	eq->equeqidx = eq->dbidx = 0;
3637 	eq->doorbells = sc->doorbells;
3638 
3639 	switch (eq->flags & EQ_TYPEMASK) {
3640 	case EQ_CTRL:
3641 		rc = ctrl_eq_alloc(sc, eq);
3642 		break;
3643 
3644 	case EQ_ETH:
3645 		rc = eth_eq_alloc(sc, vi, eq);
3646 		break;
3647 
3648 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
3649 	case EQ_OFLD:
3650 		rc = ofld_eq_alloc(sc, vi, eq);
3651 		break;
3652 #endif
3653 
3654 	default:
3655 		panic("%s: invalid eq type %d.", __func__,
3656 		    eq->flags & EQ_TYPEMASK);
3657 	}
3658 	if (rc != 0) {
3659 		device_printf(sc->dev,
3660 		    "failed to allocate egress queue (%d): %d\n",
3661 		    eq->flags & EQ_TYPEMASK, rc);
3662 	}
3663 
3664 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
3665 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
3666 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
3667 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
3668 		uint32_t mask = (1 << s_qpp) - 1;
3669 		volatile uint8_t *udb;
3670 
3671 		udb = sc->udbs_base + UDBS_DB_OFFSET;
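		/*
		 * Each user-doorbell page backs 2^eq_s_qpp egress queues,
		 * each with a 128B doorbell segment.  If this queue's
		 * segment lies within its page, point udb straight at the
		 * segment and use qid 0 in doorbell writes; otherwise leave
		 * udb at the page and keep the relative qid, which also
		 * rules out WCWR (cleared just below).
		 */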
3672 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
3673 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
3674 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
3675 			clrbit(&eq->doorbells, DOORBELL_WCWR);
3676 		else {
3677 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
3678 			eq->udb_qid = 0;
3679 		}
3680 		eq->udb = (volatile void *)udb;
3681 	}
3682 
3683 	return (rc);
3684 }
3685 
3686 static int
3687 free_eq(struct adapter *sc, struct sge_eq *eq)
3688 {
3689 	int rc;
3690 
3691 	if (eq->flags & EQ_ALLOCATED) {
3692 		switch (eq->flags & EQ_TYPEMASK) {
3693 		case EQ_CTRL:
3694 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
3695 			    eq->cntxt_id);
3696 			break;
3697 
3698 		case EQ_ETH:
3699 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
3700 			    eq->cntxt_id);
3701 			break;
3702 
3703 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
3704 		case EQ_OFLD:
3705 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
3706 			    eq->cntxt_id);
3707 			break;
3708 #endif
3709 
3710 		default:
3711 			panic("%s: invalid eq type %d.", __func__,
3712 			    eq->flags & EQ_TYPEMASK);
3713 		}
3714 		if (rc != 0) {
3715 			device_printf(sc->dev,
3716 			    "failed to free egress queue (%d): %d\n",
3717 			    eq->flags & EQ_TYPEMASK, rc);
3718 			return (rc);
3719 		}
3720 		eq->flags &= ~EQ_ALLOCATED;
3721 	}
3722 
3723 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
3724 
3725 	if (mtx_initialized(&eq->eq_lock))
3726 		mtx_destroy(&eq->eq_lock);
3727 
3728 	bzero(eq, sizeof(*eq));
3729 	return (0);
3730 }
3731 
3732 static int
3733 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
3734     struct sysctl_oid *oid)
3735 {
3736 	int rc;
3737 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
3738 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3739 
3740 	rc = alloc_eq(sc, vi, &wrq->eq);
3741 	if (rc)
3742 		return (rc);
3743 
3744 	wrq->adapter = sc;
3745 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
3746 	TAILQ_INIT(&wrq->incomplete_wrs);
3747 	STAILQ_INIT(&wrq->wr_list);
3748 	wrq->nwr_pending = 0;
3749 	wrq->ndesc_needed = 0;
3750 
3751 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3752 	    &wrq->eq.ba, "bus address of descriptor ring");
3753 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3754 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
3755 	    "desc ring size in bytes");
3756 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3757 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
3758 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3759 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
3760 	    "consumer index");
3761 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
3762 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
3763 	    "producer index");
3764 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
3765 	    wrq->eq.sidx, "status page index");
3766 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
3767 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
3768 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
3769 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
3770 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
3771 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
3772 
3773 	return (rc);
3774 }
3775 
3776 static int
3777 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
3778 {
3779 	int rc;
3780 
3781 	rc = free_eq(sc, &wrq->eq);
3782 	if (rc)
3783 		return (rc);
3784 
3785 	bzero(wrq, sizeof(*wrq));
3786 	return (0);
3787 }
3788 
3789 static int
3790 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
3791     struct sysctl_oid *oid)
3792 {
3793 	int rc;
3794 	struct port_info *pi = vi->pi;
3795 	struct adapter *sc = pi->adapter;
3796 	struct sge_eq *eq = &txq->eq;
3797 	char name[16];
3798 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3799 
3800 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
3801 	    M_CXGBE, M_WAITOK);
3802 	if (rc != 0) {
3803 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
3804 		return (rc);
3805 	}
3806 
3807 	rc = alloc_eq(sc, vi, eq);
3808 	if (rc != 0) {
3809 		mp_ring_free(txq->r);
3810 		txq->r = NULL;
3811 		return (rc);
3812 	}
3813 
3814 	/* Can't fail after this point. */
3815 
3816 	if (idx == 0)
3817 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
3818 	else
3819 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
3820 		    ("eq_base mismatch"));
3821 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
3822 	    ("PF with non-zero eq_base"));
3823 
3824 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
3825 	txq->ifp = vi->ifp;
3826 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
3827 	if (sc->flags & IS_VF)
3828 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
3829 		    V_TXPKT_INTF(pi->tx_chan));
3830 	else
3831 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3832 		    V_TXPKT_INTF(pi->tx_chan) |
3833 		    V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
3834 		    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
3835 		    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
3836 	txq->tc_idx = -1;
3837 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
3838 	    M_ZERO | M_WAITOK);
3839 
3840 	snprintf(name, sizeof(name), "%d", idx);
3841 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3842 	    NULL, "tx queue");
3843 	children = SYSCTL_CHILDREN(oid);
3844 
3845 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3846 	    &eq->ba, "bus address of descriptor ring");
3847 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3848 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
3849 	    "desc ring size in bytes");
3850 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
3851 	    &eq->abs_id, 0, "absolute id of the queue");
3852 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3853 	    &eq->cntxt_id, 0, "SGE context id of the queue");
3854 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3855 	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
3856 	    "consumer index");
3857 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3858 	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
3859 	    "producer index");
3860 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
3861 	    eq->sidx, "status page index");
3862 
3863 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
3864 	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
3865 	    "traffic class (-1 means none)");
3866 
3867 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
3868 	    &txq->txcsum, "# of times hardware assisted with checksum");
3869 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
3870 	    CTLFLAG_RD, &txq->vlan_insertion,
3871 	    "# of times hardware inserted 802.1Q tag");
3872 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
3873 	    &txq->tso_wrs, "# of TSO work requests");
3874 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
3875 	    &txq->imm_wrs, "# of work requests with immediate data");
3876 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
3877 	    &txq->sgl_wrs, "# of work requests with direct SGL");
3878 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
3879 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
3880 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
3881 	    CTLFLAG_RD, &txq->txpkts0_wrs,
3882 	    "# of txpkts (type 0) work requests");
3883 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
3884 	    CTLFLAG_RD, &txq->txpkts1_wrs,
3885 	    "# of txpkts (type 1) work requests");
3886 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
3887 	    CTLFLAG_RD, &txq->txpkts0_pkts,
3888 	    "# of frames tx'd using type0 txpkts work requests");
3889 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
3890 	    CTLFLAG_RD, &txq->txpkts1_pkts,
3891 	    "# of frames tx'd using type1 txpkts work requests");
3892 
3893 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
3894 	    CTLFLAG_RD, &txq->r->enqueues,
3895 	    "# of enqueues to the mp_ring for this queue");
3896 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
3897 	    CTLFLAG_RD, &txq->r->drops,
3898 	    "# of drops in the mp_ring for this queue");
3899 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
3900 	    CTLFLAG_RD, &txq->r->starts,
3901 	    "# of normal consumer starts in the mp_ring for this queue");
3902 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
3903 	    CTLFLAG_RD, &txq->r->stalls,
3904 	    "# of consumer stalls in the mp_ring for this queue");
3905 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
3906 	    CTLFLAG_RD, &txq->r->restarts,
3907 	    "# of consumer restarts in the mp_ring for this queue");
3908 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
3909 	    CTLFLAG_RD, &txq->r->abdications,
3910 	    "# of consumer abdications in the mp_ring for this queue");
3911 
3912 	return (0);
3913 }
3914 
3915 static int
3916 free_txq(struct vi_info *vi, struct sge_txq *txq)
3917 {
3918 	int rc;
3919 	struct adapter *sc = vi->pi->adapter;
3920 	struct sge_eq *eq = &txq->eq;
3921 
3922 	rc = free_eq(sc, eq);
3923 	if (rc)
3924 		return (rc);
3925 
3926 	sglist_free(txq->gl);
3927 	free(txq->sdesc, M_CXGBE);
3928 	mp_ring_free(txq->r);
3929 
3930 	bzero(txq, sizeof(*txq));
3931 	return (0);
3932 }
3933 
3934 static void
3935 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
3936 {
3937 	bus_addr_t *ba = arg;
3938 
3939 	KASSERT(nseg == 1,
3940 	    ("%s meant for single segment mappings only.", __func__));
3941 
3942 	*ba = error ? 0 : segs->ds_addr;
3943 }
3944 
3945 static inline void
3946 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
3947 {
3948 	uint32_t n, v;
3949 
3950 	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
3951 	MPASS(n > 0);
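	/*
	 * fl->pidx counts individual buffers while fl->dbidx and fl->sidx
	 * count 8-buffer hardware descriptors, so pidx / 8 converts to the
	 * doorbell's units.  e.g. pidx = 40, dbidx = 2, sidx = 512 gives
	 * n = IDXDIFF(5, 2, 512) = 3 new descriptors to advertise.
	 */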
3952 
3953 	wmb();
3954 	v = fl->dbval | V_PIDX(n);
3955 	if (fl->udb)
3956 		*fl->udb = htole32(v);
3957 	else
3958 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
3959 	IDXINCR(fl->dbidx, n, fl->sidx);
3960 }
3961 
3962 /*
3963  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
3964  * recycled do not count towards this allocation budget.
3965  *
3966  * Returns non-zero to indicate that this freelist should be added to the list
3967  * of starving freelists.
3968  */
3969 static int
3970 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
3971 {
3972 	__be64 *d;
3973 	struct fl_sdesc *sd;
3974 	uintptr_t pa;
3975 	caddr_t cl;
3976 	struct cluster_layout *cll;
3977 	struct sw_zone_info *swz;
3978 	struct cluster_metadata *clm;
3979 	uint16_t max_pidx;
3980 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
3981 
3982 	FL_LOCK_ASSERT_OWNED(fl);
3983 
3984 	/*
3985 	 * We always stop at the beginning of the hardware descriptor that's just
3986 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
3987 	 * which would mean an empty freelist to the chip.
3988 	 */
3989 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
3990 	if (fl->pidx == max_pidx * 8)
3991 		return (0);
3992 
3993 	d = &fl->desc[fl->pidx];
3994 	sd = &fl->sdesc[fl->pidx];
3995 	cll = &fl->cll_def;	/* default layout */
3996 	swz = &sc->sge.sw_zone_info[cll->zidx];
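	/*
	 * The doorbell is rung every 4 complete hardware descriptors (32
	 * buffers) inside the loop below so the chip can start using
	 * buffers before the entire budget has been posted; a final
	 * catch-up ring follows the loop if pidx has moved past dbidx.
	 */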
3997 
3998 	while (n > 0) {
3999 
4000 		if (sd->cl != NULL) {
4001 
4002 			if (sd->nmbuf == 0) {
4003 				/*
4004 				 * Fast recycle without involving any atomics on
4005 				 * the cluster's metadata (if the cluster has
4006 				 * metadata).  This happens when all frames
4007 				 * received in the cluster were small enough to
4008 				 * fit within a single mbuf each.
4009 				 */
4010 				fl->cl_fast_recycled++;
4011 #ifdef INVARIANTS
4012 				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
4013 				if (clm != NULL)
4014 					MPASS(clm->refcount == 1);
4015 #endif
4016 				goto recycled_fast;
4017 			}
4018 
4019 			/*
4020 			 * Cluster is guaranteed to have metadata.  Clusters
4021 			 * without metadata always take the fast recycle path
4022 			 * when they're recycled.
4023 			 */
4024 			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
4025 			MPASS(clm != NULL);
4026 
4027 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
4028 				fl->cl_recycled++;
4029 				counter_u64_add(extfree_rels, 1);
4030 				goto recycled;
4031 			}
4032 			sd->cl = NULL;	/* gave up my reference */
4033 		}
4034 		MPASS(sd->cl == NULL);
4035 alloc:
4036 		cl = uma_zalloc(swz->zone, M_NOWAIT);
4037 		if (__predict_false(cl == NULL)) {
4038 			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
4039 			    fl->cll_def.zidx == fl->cll_alt.zidx)
4040 				break;
4041 
4042 			/* fall back to the safe zone */
4043 			cll = &fl->cll_alt;
4044 			swz = &sc->sge.sw_zone_info[cll->zidx];
4045 			goto alloc;
4046 		}
4047 		fl->cl_allocated++;
4048 		n--;
4049 
4050 		pa = pmap_kextract((vm_offset_t)cl);
4051 		pa += cll->region1;
4052 		sd->cl = cl;
4053 		sd->cll = *cll;
4054 		*d = htobe64(pa | cll->hwidx);
4055 		clm = cl_metadata(sc, fl, cll, cl);
4056 		if (clm != NULL) {
4057 recycled:
4058 #ifdef INVARIANTS
4059 			clm->sd = sd;
4060 #endif
4061 			clm->refcount = 1;
4062 		}
4063 		sd->nmbuf = 0;
4064 recycled_fast:
4065 		d++;
4066 		sd++;
4067 		if (__predict_false(++fl->pidx % 8 == 0)) {
4068 			uint16_t pidx = fl->pidx / 8;
4069 
4070 			if (__predict_false(pidx == fl->sidx)) {
4071 				fl->pidx = 0;
4072 				pidx = 0;
4073 				sd = fl->sdesc;
4074 				d = fl->desc;
4075 			}
4076 			if (pidx == max_pidx)
4077 				break;
4078 
4079 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
4080 				ring_fl_db(sc, fl);
4081 		}
4082 	}
4083 
4084 	if (fl->pidx / 8 != fl->dbidx)
4085 		ring_fl_db(sc, fl);
4086 
4087 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
4088 }
4089 
4090 /*
4091  * Attempt to refill all starving freelists.
4092  */
4093 static void
4094 refill_sfl(void *arg)
4095 {
4096 	struct adapter *sc = arg;
4097 	struct sge_fl *fl, *fl_temp;
4098 
4099 	mtx_assert(&sc->sfl_lock, MA_OWNED);
4100 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
4101 		FL_LOCK(fl);
4102 		refill_fl(sc, fl, 64);
4103 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
4104 			TAILQ_REMOVE(&sc->sfl, fl, link);
4105 			fl->flags &= ~FL_STARVING;
4106 		}
4107 		FL_UNLOCK(fl);
4108 	}
4109 
4110 	if (!TAILQ_EMPTY(&sc->sfl))
4111 		callout_schedule(&sc->sfl_callout, hz / 5);
4112 }
4113 
4114 static int
4115 alloc_fl_sdesc(struct sge_fl *fl)
4116 {
4117 
4118 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
4119 	    M_ZERO | M_WAITOK);
4120 
4121 	return (0);
4122 }
4123 
4124 static void
4125 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
4126 {
4127 	struct fl_sdesc *sd;
4128 	struct cluster_metadata *clm;
4129 	struct cluster_layout *cll;
4130 	int i;
4131 
4132 	sd = fl->sdesc;
4133 	for (i = 0; i < fl->sidx * 8; i++, sd++) {
4134 		if (sd->cl == NULL)
4135 			continue;
4136 
4137 		cll = &sd->cll;
4138 		clm = cl_metadata(sc, fl, cll, sd->cl);
4139 		if (sd->nmbuf == 0)
4140 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
4141 		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
4142 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
4143 			counter_u64_add(extfree_rels, 1);
4144 		}
4145 		sd->cl = NULL;
4146 	}
4147 
4148 	free(fl->sdesc, M_CXGBE);
4149 	fl->sdesc = NULL;
4150 }
4151 
4152 static inline void
4153 get_pkt_gl(struct mbuf *m, struct sglist *gl)
4154 {
4155 	int rc;
4156 
4157 	M_ASSERTPKTHDR(m);
4158 
4159 	sglist_reset(gl);
4160 	rc = sglist_append_mbuf(gl, m);
4161 	if (__predict_false(rc != 0)) {
4162 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
4163 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
4164 	}
4165 
4166 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
4167 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
4168 	    mbuf_nsegs(m), gl->sg_nseg));
4169 	KASSERT(gl->sg_nseg > 0 &&
4170 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
4171 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
4172 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
4173 }
4174 
4175 /*
4176  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
4177  */
4178 static inline u_int
4179 txpkt_len16(u_int nsegs, u_int tso)
4180 {
4181 	u_int n;
4182 
4183 	MPASS(nsegs > 0);
4184 
4185 	nsegs--; /* first segment is part of ulptx_sgl */
4186 	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
4187 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
4188 	if (tso)
4189 		n += sizeof(struct cpl_tx_pkt_lso_core);
4190 
4191 	return (howmany(n, 16));
4192 }
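
/*
 * Worked example, assuming the usual 16-byte fw_eth_tx_pkt_wr,
 * cpl_tx_pkt_core, and ulptx_sgl structures: nsegs = 4 without TSO leaves
 * 3 segments for the SGL tail, 8 * ((3 * 3) / 2 + (3 & 1)) = 40 bytes, so
 * n = 16 + 16 + 16 + 40 = 88 and len16 = howmany(88, 16) = 6.
 */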
4193 
4194 /*
4195  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
4196  * request header.
4197  */
4198 static inline u_int
4199 txpkt_vm_len16(u_int nsegs, u_int tso)
4200 {
4201 	u_int n;
4202 
4203 	MPASS(nsegs > 0);
4204 
4205 	nsegs--; /* first segment is part of ulptx_sgl */
4206 	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
4207 	    sizeof(struct cpl_tx_pkt_core) +
4208 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
4209 	if (tso)
4210 		n += sizeof(struct cpl_tx_pkt_lso_core);
4211 
4212 	return (howmany(n, 16));
4213 }
4214 
4215 /*
4216  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
4217  * request header.
4218  */
4219 static inline u_int
4220 txpkts0_len16(u_int nsegs)
4221 {
4222 	u_int n;
4223 
4224 	MPASS(nsegs > 0);
4225 
4226 	nsegs--; /* first segment is part of ulptx_sgl */
4227 	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
4228 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
4229 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
4230 
4231 	return (howmany(n, 16));
4232 }
4233 
4234 /*
4235  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
4236  * request header.
4237  */
4238 static inline u_int
4239 txpkts1_len16(void)
4240 {
4241 	u_int n;
4242 
4243 	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
4244 
4245 	return (howmany(n, 16));
4246 }
4247 
4248 static inline u_int
4249 imm_payload(u_int ndesc)
4250 {
4251 	u_int n;
4252 
4253 	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
4254 	    sizeof(struct cpl_tx_pkt_core);
4255 
4256 	return (n);
4257 }
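
/*
 * With EQ_ESIZE = 64 and the usual 16-byte WR and CPL headers this works
 * out to imm_payload(2) = 2 * 64 - 16 - 16 = 96 bytes, the threshold that
 * write_txpkt_wr uses below to decide whether a frame travels as
 * immediate data instead of an SGL.
 */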
4258 
4259 /*
4260  * Write a VM txpkt WR for this packet to the hardware descriptors, update the
4261  * software descriptor, and advance the pidx.  It is guaranteed that enough
4262  * descriptors are available.
4263  *
4264  * The return value is the # of hardware descriptors used.
4265  */
4266 static u_int
4267 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
4268     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
4269 {
4270 	struct sge_eq *eq = &txq->eq;
4271 	struct tx_sdesc *txsd;
4272 	struct cpl_tx_pkt_core *cpl;
4273 	uint32_t ctrl;	/* used in many unrelated places */
4274 	uint64_t ctrl1;
4275 	int csum_type, len16, ndesc, pktlen, nsegs;
4276 	caddr_t dst;
4277 
4278 	TXQ_LOCK_ASSERT_OWNED(txq);
4279 	M_ASSERTPKTHDR(m0);
4280 	MPASS(available > 0 && available < eq->sidx);
4281 
4282 	len16 = mbuf_len16(m0);
4283 	nsegs = mbuf_nsegs(m0);
4284 	pktlen = m0->m_pkthdr.len;
4285 	ctrl = sizeof(struct cpl_tx_pkt_core);
4286 	if (needs_tso(m0))
4287 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
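	/* len16 counts 16-byte units; each 64-byte descriptor holds 4. */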
4288 	ndesc = howmany(len16, EQ_ESIZE / 16);
4289 	MPASS(ndesc <= available);
4290 
4291 	/* Firmware work request header */
4292 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4293 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
4294 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
4295 
4296 	ctrl = V_FW_WR_LEN16(len16);
4297 	wr->equiq_to_len16 = htobe32(ctrl);
4298 	wr->r3[0] = 0;
4299 	wr->r3[1] = 0;
4300 
4301 	/*
4302 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
4303 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
4304 	 * simpler to always copy it rather than making it
4305 	 * conditional.  Also, it seems that we do not have to set
4306 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
4307 	 */
4308 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
4309 
4310 	csum_type = -1;
4311 	if (needs_tso(m0)) {
4312 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
4313 
4314 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4315 		    m0->m_pkthdr.l4hlen > 0,
4316 		    ("%s: mbuf %p needs TSO but missing header lengths",
4317 			__func__, m0));
4318 
4319 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
4320 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
4321 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4322 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
4323 			ctrl |= V_LSO_ETHHDR_LEN(1);
4324 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4325 			ctrl |= F_LSO_IPV6;
4326 
4327 		lso->lso_ctrl = htobe32(ctrl);
4328 		lso->ipid_ofst = htobe16(0);
4329 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4330 		lso->seqno_offset = htobe32(0);
4331 		lso->len = htobe32(pktlen);
4332 
4333 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4334 			csum_type = TX_CSUM_TCPIP6;
4335 		else
4336 			csum_type = TX_CSUM_TCPIP;
4337 
4338 		cpl = (void *)(lso + 1);
4339 
4340 		txq->tso_wrs++;
4341 	} else {
4342 		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
4343 			csum_type = TX_CSUM_TCPIP;
4344 		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
4345 			csum_type = TX_CSUM_UDPIP;
4346 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
4347 			csum_type = TX_CSUM_TCPIP6;
4348 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
4349 			csum_type = TX_CSUM_UDPIP6;
4350 #if defined(INET)
4351 		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
4352 			/*
4353 			 * XXX: The firmware appears to stomp on the
4354 			 * fragment/flags field of the IP header when
4355 			 * using TX_CSUM_IP.  Fall back to doing
4356 			 * software checksums.
4357 			 */
4358 			u_short *sump;
4359 			struct mbuf *m;
4360 			int offset;
4361 
4362 			m = m0;
4363 			offset = 0;
4364 			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
4365 			    offsetof(struct ip, ip_sum));
4366 			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
4367 			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
4368 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
4369 		}
4370 #endif
4371 
4372 		cpl = (void *)(wr + 1);
4373 	}
4374 
4375 	/* Checksum offload */
4376 	ctrl1 = 0;
4377 	if (needs_l3_csum(m0) == 0)
4378 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
4379 	if (csum_type >= 0) {
4380 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
4381 	    ("%s: mbuf %p needs checksum offload but missing header lengths",
4382 			__func__, m0));
4383 
4384 		if (chip_id(sc) <= CHELSIO_T5) {
4385 			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
4386 			    ETHER_HDR_LEN);
4387 		} else {
4388 			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
4389 			    ETHER_HDR_LEN);
4390 		}
4391 		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
4392 		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
4393 	} else
4394 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
4395 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4396 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4397 		txq->txcsum++;	/* some hardware assistance provided */
4398 
4399 	/* VLAN tag insertion */
4400 	if (needs_vlan_insertion(m0)) {
4401 		ctrl1 |= F_TXPKT_VLAN_VLD |
4402 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
4403 		txq->vlan_insertion++;
4404 	}
4405 
4406 	/* CPL header */
4407 	cpl->ctrl0 = txq->cpl_ctrl0;
4408 	cpl->pack = 0;
4409 	cpl->len = htobe16(pktlen);
4410 	cpl->ctrl1 = htobe64(ctrl1);
4411 
4412 	/* SGL */
4413 	dst = (void *)(cpl + 1);
4414 
4415 	/*
4416 	 * A packet using TSO will use up an entire descriptor for the
4417 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
4418 	 * If this descriptor is the last descriptor in the ring, wrap
4419 	 * around to the front of the ring explicitly for the start of
4420 	 * the sgl.
4421 	 */
4422 	if (dst == (void *)&eq->desc[eq->sidx]) {
4423 		dst = (void *)&eq->desc[0];
4424 		write_gl_to_txd(txq, m0, &dst, 0);
4425 	} else
4426 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4427 	txq->sgl_wrs++;
4428 
4429 	txq->txpkt_wrs++;
4430 
4431 	txsd = &txq->sdesc[eq->pidx];
4432 	txsd->m = m0;
4433 	txsd->desc_used = ndesc;
4434 
4435 	return (ndesc);
4436 }
4437 
4438 /*
4439  * Write a txpkt WR for this packet to the hardware descriptors, update the
4440  * software descriptor, and advance the pidx.  It is guaranteed that enough
4441  * descriptors are available.
4442  *
4443  * The return value is the # of hardware descriptors used.
4444  */
4445 static u_int
4446 write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
4447     struct mbuf *m0, u_int available)
4448 {
4449 	struct sge_eq *eq = &txq->eq;
4450 	struct tx_sdesc *txsd;
4451 	struct cpl_tx_pkt_core *cpl;
4452 	uint32_t ctrl;	/* used in many unrelated places */
4453 	uint64_t ctrl1;
4454 	int len16, ndesc, pktlen, nsegs;
4455 	caddr_t dst;
4456 
4457 	TXQ_LOCK_ASSERT_OWNED(txq);
4458 	M_ASSERTPKTHDR(m0);
4459 	MPASS(available > 0 && available < eq->sidx);
4460 
4461 	len16 = mbuf_len16(m0);
4462 	nsegs = mbuf_nsegs(m0);
4463 	pktlen = m0->m_pkthdr.len;
4464 	ctrl = sizeof(struct cpl_tx_pkt_core);
4465 	if (needs_tso(m0))
4466 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
4467 	else if (pktlen <= imm_payload(2) && available >= 2) {
4468 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
4469 		ctrl += pktlen;
4470 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
4471 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
4472 		nsegs = 0;
4473 	}
4474 	ndesc = howmany(len16, EQ_ESIZE / 16);
4475 	MPASS(ndesc <= available);
4476 
4477 	/* Firmware work request header */
4478 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4479 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
4480 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
4481 
4482 	ctrl = V_FW_WR_LEN16(len16);
4483 	wr->equiq_to_len16 = htobe32(ctrl);
4484 	wr->r3 = 0;
4485 
4486 	if (needs_tso(m0)) {
4487 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
4488 
4489 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4490 		    m0->m_pkthdr.l4hlen > 0,
4491 		    ("%s: mbuf %p needs TSO but missing header lengths",
4492 			__func__, m0));
4493 
4494 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
4495 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
4496 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4497 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
4498 			ctrl |= V_LSO_ETHHDR_LEN(1);
4499 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4500 			ctrl |= F_LSO_IPV6;
4501 
4502 		lso->lso_ctrl = htobe32(ctrl);
4503 		lso->ipid_ofst = htobe16(0);
4504 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4505 		lso->seqno_offset = htobe32(0);
4506 		lso->len = htobe32(pktlen);
4507 
4508 		cpl = (void *)(lso + 1);
4509 
4510 		txq->tso_wrs++;
4511 	} else
4512 		cpl = (void *)(wr + 1);
4513 
4514 	/* Checksum offload */
4515 	ctrl1 = 0;
4516 	if (needs_l3_csum(m0) == 0)
4517 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
4518 	if (needs_l4_csum(m0) == 0)
4519 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
4520 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4521 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4522 		txq->txcsum++;	/* some hardware assistance provided */
4523 
4524 	/* VLAN tag insertion */
4525 	if (needs_vlan_insertion(m0)) {
4526 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
4527 		txq->vlan_insertion++;
4528 	}
4529 
4530 	/* CPL header */
4531 	cpl->ctrl0 = txq->cpl_ctrl0;
4532 	cpl->pack = 0;
4533 	cpl->len = htobe16(pktlen);
4534 	cpl->ctrl1 = htobe64(ctrl1);
4535 
4536 	/* SGL */
4537 	dst = (void *)(cpl + 1);
4538 	if (nsegs > 0) {
4539 
4540 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4541 		txq->sgl_wrs++;
4542 	} else {
4543 		struct mbuf *m;
4544 
4545 		for (m = m0; m != NULL; m = m->m_next) {
4546 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
4547 #ifdef INVARIANTS
4548 			pktlen -= m->m_len;
4549 #endif
4550 		}
4551 #ifdef INVARIANTS
4552 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
4553 #endif
4554 		txq->imm_wrs++;
4555 	}
4556 
4557 	txq->txpkt_wrs++;
4558 
4559 	txsd = &txq->sdesc[eq->pidx];
4560 	txsd->m = m0;
4561 	txsd->desc_used = ndesc;
4562 
4563 	return (ndesc);
4564 }
4565 
4566 static int
4567 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
4568 {
4569 	u_int needed, nsegs1, nsegs2, l1, l2;
4570 
4571 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
4572 		return (1);
4573 
4574 	nsegs1 = mbuf_nsegs(m);
4575 	nsegs2 = mbuf_nsegs(n);
4576 	if (nsegs1 + nsegs2 == 2) {
4577 		txp->wr_type = 1;
4578 		l1 = l2 = txpkts1_len16();
4579 	} else {
4580 		txp->wr_type = 0;
4581 		l1 = txpkts0_len16(nsegs1);
4582 		l2 = txpkts0_len16(nsegs2);
4583 	}
4584 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
4585 	needed = howmany(txp->len16, EQ_ESIZE / 16);
4586 	if (needed > SGE_MAX_WR_NDESC || needed > available)
4587 		return (1);
4588 
4589 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
4590 	if (txp->plen > 65535)
4591 		return (1);
4592 
4593 	txp->npkt = 2;
4594 	set_mbuf_len16(m, l1);
4595 	set_mbuf_len16(n, l2);
4596 
4597 	return (0);
4598 }
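
/*
 * Coalescing rules enforced by try_txpkts above and add_to_txpkts below:
 * a type 1 txpkts WR carries only single-segment packets (each CPL is
 * followed directly by its ulptx_sgl); anything else uses type 0, where
 * every packet gets a ulp_txpkt + ulptx_idata wrapper.  Either way the WR
 * must fit in SGE_MAX_WR_NDESC descriptors and carry at most 65535 bytes
 * of payload.
 */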
4599 
4600 static int
4601 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
4602 {
4603 	u_int plen, len16, needed, nsegs;
4604 
4605 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
4606 
4607 	nsegs = mbuf_nsegs(m);
4608 	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
4609 		return (1);
4610 
4611 	plen = txp->plen + m->m_pkthdr.len;
4612 	if (plen > 65535)
4613 		return (1);
4614 
4615 	if (txp->wr_type == 0)
4616 		len16 = txpkts0_len16(nsegs);
4617 	else
4618 		len16 = txpkts1_len16();
4619 	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
4620 	if (needed > SGE_MAX_WR_NDESC || needed > available)
4621 		return (1);
4622 
4623 	txp->npkt++;
4624 	txp->plen = plen;
4625 	txp->len16 += len16;
4626 	set_mbuf_len16(m, len16);
4627 
4628 	return (0);
4629 }
4630 
4631 /*
4632  * Write a txpkts WR for the packets in txp to the hardware descriptors, update
4633  * the software descriptor, and advance the pidx.  It is guaranteed that enough
4634  * descriptors are available.
4635  *
4636  * The return value is the # of hardware descriptors used.
4637  */
4638 static u_int
4639 write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
4640     struct mbuf *m0, const struct txpkts *txp, u_int available)
4641 {
4642 	struct sge_eq *eq = &txq->eq;
4643 	struct tx_sdesc *txsd;
4644 	struct cpl_tx_pkt_core *cpl;
4645 	uint32_t ctrl;
4646 	uint64_t ctrl1;
4647 	int ndesc, checkwrap;
4648 	struct mbuf *m;
4649 	void *flitp;
4650 
4651 	TXQ_LOCK_ASSERT_OWNED(txq);
4652 	MPASS(txp->npkt > 0);
4653 	MPASS(txp->plen < 65536);
4654 	MPASS(m0 != NULL);
4655 	MPASS(m0->m_nextpkt != NULL);
4656 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
4657 	MPASS(available > 0 && available < eq->sidx);
4658 
4659 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
4660 	MPASS(ndesc <= available);
4661 
4662 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4663 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
4664 	ctrl = V_FW_WR_LEN16(txp->len16);
4665 	wr->equiq_to_len16 = htobe32(ctrl);
4666 	wr->plen = htobe16(txp->plen);
4667 	wr->npkt = txp->npkt;
4668 	wr->r3 = 0;
4669 	wr->type = txp->wr_type;
4670 	flitp = wr + 1;
4671 
4672 	/*
4673 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
4674 	 * set then we know the WR is going to wrap around somewhere.  We'll
4675 	 * check for that at appropriate points.
4676 	 */
4677 	checkwrap = eq->sidx - ndesc < eq->pidx;
4678 	for (m = m0; m != NULL; m = m->m_nextpkt) {
4679 		if (txp->wr_type == 0) {
4680 			struct ulp_txpkt *ulpmc;
4681 			struct ulptx_idata *ulpsc;
4682 
4683 			/* ULP master command */
4684 			ulpmc = flitp;
4685 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
4686 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
4687 			ulpmc->len = htobe32(mbuf_len16(m));
4688 
4689 			/* ULP subcommand */
4690 			ulpsc = (void *)(ulpmc + 1);
4691 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
4692 			    F_ULP_TX_SC_MORE);
4693 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
4694 
4695 			cpl = (void *)(ulpsc + 1);
4696 			if (checkwrap &&
4697 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
4698 				cpl = (void *)&eq->desc[0];
4699 		} else {
4700 			cpl = flitp;
4701 		}
4702 
4703 		/* Checksum offload */
4704 		ctrl1 = 0;
4705 		if (needs_l3_csum(m) == 0)
4706 			ctrl1 |= F_TXPKT_IPCSUM_DIS;
4707 		if (needs_l4_csum(m) == 0)
4708 			ctrl1 |= F_TXPKT_L4CSUM_DIS;
4709 		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4710 		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4711 			txq->txcsum++;	/* some hardware assistance provided */
4712 
4713 		/* VLAN tag insertion */
4714 		if (needs_vlan_insertion(m)) {
4715 			ctrl1 |= F_TXPKT_VLAN_VLD |
4716 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
4717 			txq->vlan_insertion++;
4718 		}
4719 
4720 		/* CPL header */
4721 		cpl->ctrl0 = txq->cpl_ctrl0;
4722 		cpl->pack = 0;
4723 		cpl->len = htobe16(m->m_pkthdr.len);
4724 		cpl->ctrl1 = htobe64(ctrl1);
4725 
4726 		flitp = cpl + 1;
4727 		if (checkwrap &&
4728 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
4729 			flitp = (void *)&eq->desc[0];
4730 
4731 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
4732 
4733 	}
4734 
4735 	if (txp->wr_type == 0) {
4736 		txq->txpkts0_pkts += txp->npkt;
4737 		txq->txpkts0_wrs++;
4738 	} else {
4739 		txq->txpkts1_pkts += txp->npkt;
4740 		txq->txpkts1_wrs++;
4741 	}
4742 
4743 	txsd = &txq->sdesc[eq->pidx];
4744 	txsd->m = m0;
4745 	txsd->desc_used = ndesc;
4746 
4747 	return (ndesc);
4748 }
4749 
4750 /*
4751  * If the SGL ends at an address that is not 16 byte aligned, this function will
4752  * add a 0 filled flit at the end.
4753  */
4754 static void
4755 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
4756 {
4757 	struct sge_eq *eq = &txq->eq;
4758 	struct sglist *gl = txq->gl;
4759 	struct sglist_seg *seg;
4760 	__be64 *flitp, *wrap;
4761 	struct ulptx_sgl *usgl;
4762 	int i, nflits, nsegs;
4763 
4764 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
4765 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
4766 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4767 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4768 
4769 	get_pkt_gl(m, gl);
4770 	nsegs = gl->sg_nseg;
4771 	MPASS(nsegs > 0);
4772 
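	/*
	 * 2 flits for the ulptx_sgl header (which embeds the first segment),
	 * plus 1.5 flits per remaining segment: one shared length flit per
	 * pair of segments and an address flit apiece.  e.g. nsegs = 3
	 * gives 3 + 0 + 2 = 5 flits.
	 */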
4773 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
4774 	flitp = (__be64 *)(*to);
4775 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
4776 	seg = &gl->sg_segs[0];
4777 	usgl = (void *)flitp;
4778 
4779 	/*
4780 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
4781 	 * ring, so we're at least 16 bytes away from the status page.  There is
4782 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
4783 	 */
4784 
4785 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
4786 	    V_ULPTX_NSGE(nsegs));
4787 	usgl->len0 = htobe32(seg->ss_len);
4788 	usgl->addr0 = htobe64(seg->ss_paddr);
4789 	seg++;
4790 
4791 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
4792 
4793 		/* Won't wrap around at all */
4794 
4795 		for (i = 0; i < nsegs - 1; i++, seg++) {
4796 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
4797 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
4798 		}
4799 		if (i & 1)
4800 			usgl->sge[i / 2].len[1] = htobe32(0);
4801 		flitp += nflits;
4802 	} else {
4803 
4804 		/* Will wrap somewhere in the rest of the SGL */
4805 
4806 		/* 2 flits already written, write the rest flit by flit */
4807 		flitp = (void *)(usgl + 1);
4808 		for (i = 0; i < nflits - 2; i++) {
4809 			if (flitp == wrap)
4810 				flitp = (void *)eq->desc;
4811 			*flitp++ = get_flit(seg, nsegs - 1, i);
4812 		}
4813 	}
4814 
4815 	if (nflits & 1) {
4816 		MPASS(((uintptr_t)flitp) & 0xf);
4817 		*flitp++ = 0;
4818 	}
4819 
4820 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
4821 	if (__predict_false(flitp == wrap))
4822 		*to = (void *)eq->desc;
4823 	else
4824 		*to = (void *)flitp;
4825 }
4826 
4827 static inline void
4828 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
4829 {
4830 
4831 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4832 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4833 
4834 	if (__predict_true((uintptr_t)(*to) + len <=
4835 	    (uintptr_t)&eq->desc[eq->sidx])) {
4836 		bcopy(from, *to, len);
4837 		(*to) += len;
4838 	} else {
4839 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
4840 
4841 		bcopy(from, *to, portion);
4842 		from += portion;
4843 		portion = len - portion;	/* remaining */
4844 		bcopy(from, (void *)eq->desc, portion);
4845 		(*to) = (caddr_t)eq->desc + portion;
4846 	}
4847 }
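
/*
 * e.g. copying 96 bytes with only 40 left before &eq->desc[eq->sidx]: the
 * first bcopy takes 40, the remaining 56 land at the start of the ring,
 * and *to ends up at (caddr_t)eq->desc + 56.
 */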
4848 
4849 static inline void
4850 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
4851 {
4852 	u_int db;
4853 
4854 	MPASS(n > 0);
4855 
4856 	db = eq->doorbells;
4857 	if (n > 1)
4858 		clrbit(&db, DOORBELL_WCWR);
4859 	wmb();
4860 
4861 	switch (ffs(db) - 1) {
4862 	case DOORBELL_UDB:
4863 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4864 		break;
4865 
4866 	case DOORBELL_WCWR: {
4867 		volatile uint64_t *dst, *src;
4868 		int i;
4869 
4870 		/*
4871 		 * Queues whose 128B doorbell segment fits in the page do not
4872 		 * use relative qid (udb_qid is always 0).  Only such queues
4873 		 * can do WCWR.
4874 		 */
4875 		KASSERT(eq->udb_qid == 0 && n == 1,
4876 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
4877 		    __func__, eq->doorbells, n, eq->dbidx, eq));
4878 
4879 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
4880 		    UDBS_DB_OFFSET);
4881 		i = eq->dbidx;
4882 		src = (void *)&eq->desc[i];
4883 		while (src != (void *)&eq->desc[i + 1])
4884 			*dst++ = *src++;
4885 		wmb();
4886 		break;
4887 	}
4888 
4889 	case DOORBELL_UDBWC:
4890 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4891 		wmb();
4892 		break;
4893 
4894 	case DOORBELL_KDB:
4895 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
4896 		    V_QID(eq->cntxt_id) | V_PIDX(n));
4897 		break;
4898 	}
4899 
4900 	IDXINCR(eq->dbidx, n, eq->sidx);
4901 }
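
/*
 * ffs(db) - 1 selects the lowest-numbered doorbell mechanism left in the
 * mask; WCWR is stripped first for multi-descriptor updates because a
 * write-combined doorbell write carries exactly one descriptor.
 */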
4902 
4903 static inline u_int
4904 reclaimable_tx_desc(struct sge_eq *eq)
4905 {
4906 	uint16_t hw_cidx;
4907 
4908 	hw_cidx = read_hw_cidx(eq);
4909 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
4910 }
4911 
4912 static inline u_int
4913 total_available_tx_desc(struct sge_eq *eq)
4914 {
4915 	uint16_t hw_cidx, pidx;
4916 
4917 	hw_cidx = read_hw_cidx(eq);
4918 	pidx = eq->pidx;
4919 
4920 	if (pidx == hw_cidx)
4921 		return (eq->sidx - 1);
4922 	else
4923 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
4924 }
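
/*
 * One descriptor is always held back (note the -1s above) so that
 * pidx == hw cidx unambiguously means an empty ring rather than a full
 * one.
 */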
4925 
4926 static inline uint16_t
4927 read_hw_cidx(struct sge_eq *eq)
4928 {
4929 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
4930 	uint16_t cidx = spg->cidx;	/* stable snapshot */
4931 
4932 	return (be16toh(cidx));
4933 }
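
/*
 * The status page lives just past the last ring entry (alloc_eq sized the
 * ring as sidx + spg_len / EQ_ESIZE entries), and the hardware updates
 * the consumer index there as it processes descriptors.
 */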
4934 
4935 /*
4936  * Reclaim 'n' descriptors approximately.
4937  */
4938 static u_int
4939 reclaim_tx_descs(struct sge_txq *txq, u_int n)
4940 {
4941 	struct tx_sdesc *txsd;
4942 	struct sge_eq *eq = &txq->eq;
4943 	u_int can_reclaim, reclaimed;
4944 
4945 	TXQ_LOCK_ASSERT_OWNED(txq);
4946 	MPASS(n > 0);
4947 
4948 	reclaimed = 0;
4949 	can_reclaim = reclaimable_tx_desc(eq);
4950 	while (can_reclaim && reclaimed < n) {
4951 		int ndesc;
4952 		struct mbuf *m, *nextpkt;
4953 
4954 		txsd = &txq->sdesc[eq->cidx];
4955 		ndesc = txsd->desc_used;
4956 
4957 		/* Firmware doesn't return "partial" credits. */
4958 		KASSERT(can_reclaim >= ndesc,
4959 		    ("%s: unexpected number of credits: %d, %d",
4960 		    __func__, can_reclaim, ndesc));
4961 
4962 		for (m = txsd->m; m != NULL; m = nextpkt) {
4963 			nextpkt = m->m_nextpkt;
4964 			m->m_nextpkt = NULL;
4965 			m_freem(m);
4966 		}
4967 		reclaimed += ndesc;
4968 		can_reclaim -= ndesc;
4969 		IDXINCR(eq->cidx, ndesc, eq->sidx);
4970 	}
4971 
4972 	return (reclaimed);
4973 }
4974 
4975 static void
4976 tx_reclaim(void *arg, int n)
4977 {
4978 	struct sge_txq *txq = arg;
4979 	struct sge_eq *eq = &txq->eq;
4980 
4981 	do {
4982 		if (TXQ_TRYLOCK(txq) == 0)
4983 			break;
4984 		n = reclaim_tx_descs(txq, 32);
4985 		if (eq->cidx == eq->pidx)
4986 			eq->equeqidx = eq->pidx;
4987 		TXQ_UNLOCK(txq);
4988 	} while (n > 0);
4989 }
4990 
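/*
 * Returns the idx'th flit of the SGL tail that follows a ulptx_sgl header.
 * Segments after the first are laid out in repeating three-flit groups:
 * {len[i], len[i + 1]}, addr[i], addr[i + 1], hence the idx % 3 dispatch
 * below.
 */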
4991 static __be64
4992 get_flit(struct sglist_seg *segs, int nsegs, int idx)
4993 {
4994 	int i = (idx / 3) * 2;
4995 
4996 	switch (idx % 3) {
4997 	case 0: {
4998 		uint64_t rc;
4999 
5000 		rc = (uint64_t)segs[i].ss_len << 32;
5001 		if (i + 1 < nsegs)
5002 			rc |= (uint64_t)(segs[i + 1].ss_len);
5003 
5004 		return (htobe64(rc));
5005 	}
5006 	case 1:
5007 		return (htobe64(segs[i].ss_paddr));
5008 	case 2:
5009 		return (htobe64(segs[i + 1].ss_paddr));
5010 	}
5011 
5012 	return (0);
5013 }
5014 
5015 static void
5016 find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
5017 {
5018 	int8_t zidx, hwidx, idx;
5019 	uint16_t region1, region3;
5020 	int spare, spare_needed, n;
5021 	struct sw_zone_info *swz;
5022 	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
5023 
5024 	/*
5025 	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
5026 	 * large enough for the max payload and cluster metadata.  Otherwise
5027 	 * settle for the largest bufsize that leaves enough room in the cluster
5028 	 * for metadata.
5029 	 *
5030 	 * Without buffer packing: Look for the smallest zone which has a
5031 	 * bufsize large enough for the max payload.  Settle for the largest
5032 	 * bufsize available if there's nothing big enough for max payload.
5033 	 */
5034 	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
5035 	swz = &sc->sge.sw_zone_info[0];
5036 	hwidx = -1;
5037 	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
5038 		if (swz->size > largest_rx_cluster) {
5039 			if (__predict_true(hwidx != -1))
5040 				break;
5041 
5042 			/*
5043 			 * This is a misconfiguration.  largest_rx_cluster is
5044 			 * preventing us from finding a refill source.  See
5045 			 * dev.t5nex.<n>.buffer_sizes to figure out why.
5046 			 */
5047 			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
5048 			    " refill source for fl %p (dma %u).  Ignored.\n",
5049 			    largest_rx_cluster, fl, maxp);
5050 		}
5051 		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
5052 			hwb = &hwb_list[idx];
5053 			spare = swz->size - hwb->size;
5054 			if (spare < spare_needed)
5055 				continue;
5056 
5057 			hwidx = idx;		/* best option so far */
5058 			if (hwb->size >= maxp) {
5059 
5060 				if ((fl->flags & FL_BUF_PACKING) == 0)
5061 					goto done; /* stop looking (not packing) */
5062 
5063 				if (swz->size >= safest_rx_cluster)
5064 					goto done; /* stop looking (packing) */
5065 			}
5066 			break;		/* keep looking, next zone */
5067 		}
5068 	}
5069 done:
5070 	/* A usable hwidx has been located. */
5071 	MPASS(hwidx != -1);
5072 	hwb = &hwb_list[hwidx];
5073 	zidx = hwb->zidx;
5074 	swz = &sc->sge.sw_zone_info[zidx];
5075 	region1 = 0;
5076 	region3 = swz->size - hwb->size;
5077 
5078 	/*
5079 	 * Stay within this zone and see if there is a better match when mbuf
5080 	 * inlining is allowed.  Remember that the hwidx's are sorted in
5081 	 * decreasing order of size (so in increasing order of spare area).
5082 	 */
5083 	for (idx = hwidx; idx != -1; idx = hwb->next) {
5084 		hwb = &hwb_list[idx];
5085 		spare = swz->size - hwb->size;
5086 
5087 		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
5088 			break;
5089 
5090 		/*
5091 		 * Do not inline mbufs if doing so would violate the pad/pack
5092 		 * boundary alignment requirement.
5093 		 */
5094 		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
5095 			continue;
5096 		if (fl->flags & FL_BUF_PACKING &&
5097 		    (MSIZE % sc->params.sge.pack_boundary) != 0)
5098 			continue;
5099 
5100 		if (spare < CL_METADATA_SIZE + MSIZE)
5101 			continue;
5102 		n = (spare - CL_METADATA_SIZE) / MSIZE;
5103 		if (n > howmany(hwb->size, maxp))
5104 			break;
5105 
5106 		hwidx = idx;
5107 		if (fl->flags & FL_BUF_PACKING) {
5108 			region1 = n * MSIZE;
5109 			region3 = spare - region1;
5110 		} else {
5111 			region1 = MSIZE;
5112 			region3 = spare - region1;
5113 			break;
5114 		}
5115 	}
5116 
5117 	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
5118 	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
5119 	KASSERT(hwidx >= 0 && hwidx < SGE_FLBUF_SIZES,
5120 	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
5121 	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
5122 	    sc->sge.sw_zone_info[zidx].size,
5123 	    ("%s: bad buffer layout for fl %p, maxp %d. "
5124 		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
5125 		sc->sge.sw_zone_info[zidx].size, region1,
5126 		sc->sge.hw_buf_info[hwidx].size, region3));
5127 	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
5128 		KASSERT(region3 >= CL_METADATA_SIZE,
5129 		    ("%s: no room for metadata.  fl %p, maxp %d; "
5130 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
5131 		    sc->sge.sw_zone_info[zidx].size, region1,
5132 		    sc->sge.hw_buf_info[hwidx].size, region3));
5133 		KASSERT(region1 % MSIZE == 0,
5134 		    ("%s: bad mbuf region for fl %p, maxp %d. "
5135 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
5136 		    sc->sge.sw_zone_info[zidx].size, region1,
5137 		    sc->sge.hw_buf_info[hwidx].size, region3));
5138 	}
5139 
5140 	fl->cll_def.zidx = zidx;
5141 	fl->cll_def.hwidx = hwidx;
5142 	fl->cll_def.region1 = region1;
5143 	fl->cll_def.region3 = region3;
5144 }
5145 
5146 static void
5147 find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
5148 {
5149 	struct sge *s = &sc->sge;
5150 	struct hw_buf_info *hwb;
5151 	struct sw_zone_info *swz;
5152 	int spare;
5153 	int8_t hwidx;
5154 
5155 	if (fl->flags & FL_BUF_PACKING)
5156 		hwidx = s->safe_hwidx2;	/* with room for metadata */
5157 	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
5158 		hwidx = s->safe_hwidx2;
5159 		hwb = &s->hw_buf_info[hwidx];
5160 		swz = &s->sw_zone_info[hwb->zidx];
5161 		spare = swz->size - hwb->size;
5162 
5163 		/* no good if there isn't room for an mbuf as well */
5164 		if (spare < CL_METADATA_SIZE + MSIZE)
5165 			hwidx = s->safe_hwidx1;
5166 	} else
5167 		hwidx = s->safe_hwidx1;
5168 
5169 	if (hwidx == -1) {
5170 		/* No fallback source */
5171 		fl->cll_alt.hwidx = -1;
5172 		fl->cll_alt.zidx = -1;
5173 
5174 		return;
5175 	}
5176 
5177 	hwb = &s->hw_buf_info[hwidx];
5178 	swz = &s->sw_zone_info[hwb->zidx];
5179 	spare = swz->size - hwb->size;
5180 	fl->cll_alt.hwidx = hwidx;
5181 	fl->cll_alt.zidx = hwb->zidx;
5182 	if (allow_mbufs_in_cluster &&
5183 	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
5184 		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
5185 	else
5186 		fl->cll_alt.region1 = 0;
5187 	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
5188 }
5189 
5190 static void
5191 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
5192 {
5193 	mtx_lock(&sc->sfl_lock);
5194 	FL_LOCK(fl);
5195 	if ((fl->flags & FL_DOOMED) == 0) {
5196 		fl->flags |= FL_STARVING;
5197 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
5198 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
5199 	}
5200 	FL_UNLOCK(fl);
5201 	mtx_unlock(&sc->sfl_lock);
5202 }
5203 
5204 static void
5205 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
5206 {
5207 	struct sge_wrq *wrq = (void *)eq;
5208 
5209 	atomic_readandclear_int(&eq->equiq);
5210 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
5211 }
5212 
5213 static void
5214 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
5215 {
5216 	struct sge_txq *txq = (void *)eq;
5217 
5218 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
5219 
5220 	atomic_readandclear_int(&eq->equiq);
5221 	mp_ring_check_drainage(txq->r, 0);
5222 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
5223 }
5224 
5225 static int
5226 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
5227     struct mbuf *m)
5228 {
5229 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
5230 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
5231 	struct adapter *sc = iq->adapter;
5232 	struct sge *s = &sc->sge;
5233 	struct sge_eq *eq;
5234 	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
5235 		&handle_wrq_egr_update, &handle_eth_egr_update,
5236 		&handle_wrq_egr_update};
5237 
5238 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5239 	    rss->opcode));
5240 
5241 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
5242 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
5243 
5244 	return (0);
5245 }
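
/*
 * The handler table in handle_sge_egr_update above is indexed by EQ type:
 * control and offload queues are both sge_wrq's and share
 * handle_wrq_egr_update, while Ethernet tx queues also get their mp_ring
 * checked for drainage.
 */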
5246 
5247 /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
5248 CTASSERT(offsetof(struct cpl_fw4_msg, data) ==
5249     offsetof(struct cpl_fw6_msg, data));
5250 
5251 static int
5252 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
5253 {
5254 	struct adapter *sc = iq->adapter;
5255 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
5256 
5257 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5258 	    rss->opcode));
5259 
5260 	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
5261 		const struct rss_header *rss2;
5262 
5263 		rss2 = (const struct rss_header *)&cpl->data[0];
5264 		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
5265 	}
5266 
5267 	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
5268 }
5269 
5270 /**
5271  *	t4_handle_wrerr_rpl - process a FW work request error message
5272  *	@adap: the adapter
5273  *	@rpl: start of the FW message
5274  */
5275 static int
5276 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
5277 {
5278 	u8 opcode = *(const u8 *)rpl;
5279 	const struct fw_error_cmd *e = (const void *)rpl;
5280 	unsigned int i;
5281 
5282 	if (opcode != FW_ERROR_CMD) {
5283 		log(LOG_ERR,
5284 		    "%s: Received WRERR_RPL message with opcode %#x\n",
5285 		    device_get_nameunit(adap->dev), opcode);
5286 		return (EINVAL);
5287 	}
5288 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
5289 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
5290 	    "non-fatal");
5291 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
5292 	case FW_ERROR_TYPE_EXCEPTION:
5293 		log(LOG_ERR, "exception info:\n");
5294 		for (i = 0; i < nitems(e->u.exception.info); i++)
5295 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
5296 			    be32toh(e->u.exception.info[i]));
5297 		log(LOG_ERR, "\n");
5298 		break;
5299 	case FW_ERROR_TYPE_HWMODULE:
5300 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
5301 		    be32toh(e->u.hwmodule.regaddr),
5302 		    be32toh(e->u.hwmodule.regval));
5303 		break;
5304 	case FW_ERROR_TYPE_WR:
5305 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
5306 		    be16toh(e->u.wr.cidx),
5307 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
5308 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
5309 		    be32toh(e->u.wr.eqid));
5310 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
5311 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
5312 			    e->u.wr.wrhdr[i]);
5313 		log(LOG_ERR, "\n");
5314 		break;
5315 	case FW_ERROR_TYPE_ACL:
5316 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
5317 		    be16toh(e->u.acl.cidx),
5318 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
5319 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
5320 		    be32toh(e->u.acl.eqid),
5321 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
5322 		    "MAC");
5323 		for (i = 0; i < nitems(e->u.acl.val); i++)
5324 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
5325 		log(LOG_ERR, "\n");
5326 		break;
5327 	default:
5328 		log(LOG_ERR, "type %#x\n",
5329 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
5330 		return (EINVAL);
5331 	}
5332 	return (0);
5333 }
5334 
5335 static int
5336 sysctl_uint16(SYSCTL_HANDLER_ARGS)
5337 {
5338 	uint16_t *id = arg1;
5339 	int i = *id;
5340 
5341 	return (sysctl_handle_int(oidp, &i, 0, req));
5342 }
5343 
5344 static int
5345 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
5346 {
5347 	struct sge *s = arg1;
5348 	struct hw_buf_info *hwb = &s->hw_buf_info[0];
5349 	struct sw_zone_info *swz = &s->sw_zone_info[0];
5350 	int i, rc;
5351 	struct sbuf sb;
5352 	char c;
5353 
5354 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
5355 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
5356 		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
5357 			c = '*';
5358 		else
5359 			c = '\0';
5360 
5361 		sbuf_printf(&sb, "%u%c ", hwb->size, c);
5362 	}
5363 	sbuf_trim(&sb);
5364 	sbuf_finish(&sb);
5365 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
5366 	sbuf_delete(&sb);
5367 	return (rc);
5368 }
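
/*
 * In the buffer_sizes sysctl output above, a trailing '*' marks hardware
 * buffer sizes whose backing software zone is within largest_rx_cluster
 * and therefore eligible as a refill source.
 */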
5369 
5370 #ifdef RATELIMIT
5371 /*
5372  * len16 for a txpkt_eo WR with a GL.  Includes the firmware work request header.
5373  */
5374 static inline u_int
5375 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
5376 {
5377 	u_int n;
5378 
5379 	MPASS(immhdrs > 0);
5380 
5381 	n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
5382 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
5383 	if (__predict_false(nsegs == 0))
5384 		goto done;
5385 
5386 	nsegs--; /* first segment is part of ulptx_sgl */
5387 	n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
5388 	if (tso)
5389 		n += sizeof(struct cpl_tx_pkt_lso_core);
5390 
5391 done:
5392 	return (howmany(n, 16));
5393 }
5394 
5395 #define ETID_FLOWC_NPARAMS 6
5396 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
5397     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
5398 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
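
/*
 * e.g. assuming an 8-byte fw_flowc_wr header and 8-byte fw_flowc_mnemval
 * entries (check the firmware interface header), ETID_FLOWC_LEN is
 * roundup2(8 + 6 * 8, 16) = 64 bytes, i.e. ETID_FLOWC_LEN16 = 4.
 */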
5399 
5400 static int
5401 send_etid_flowc_wr(struct cxgbe_snd_tag *cst, struct port_info *pi,
5402     struct vi_info *vi)
5403 {
5404 	struct wrq_cookie cookie;
5405 	u_int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
5406 	struct fw_flowc_wr *flowc;
5407 
5408 	mtx_assert(&cst->lock, MA_OWNED);
5409 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
5410 	    EO_FLOWC_PENDING);
5411 
5412 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie);
5413 	if (__predict_false(flowc == NULL))
5414 		return (ENOMEM);
5415 
5416 	bzero(flowc, ETID_FLOWC_LEN);
5417 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
5418 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
5419 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
5420 	    V_FW_WR_FLOWID(cst->etid));
5421 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
5422 	flowc->mnemval[0].val = htobe32(pfvf);
5423 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
5424 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
5425 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
5426 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
5427 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
5428 	flowc->mnemval[3].val = htobe32(cst->iqid);
5429 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
5430 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
5431 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
5432 	flowc->mnemval[5].val = htobe32(cst->schedcl);
5433 
5434 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
5435 
5436 	cst->flags &= ~EO_FLOWC_PENDING;
5437 	cst->flags |= EO_FLOWC_RPL_PENDING;
5438 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
5439 	cst->tx_credits -= ETID_FLOWC_LEN16;
5440 
5441 	return (0);
5442 }
5443 
5444 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
5445 
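/*
 * A zero-parameter flowc WR with a completion requested, sent while the
 * tag is being torn down; the matching fw4_ack returns the etid's last
 * outstanding credits so the tag can finally be freed.
 */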
5446 void
5447 send_etid_flush_wr(struct cxgbe_snd_tag *cst)
5448 {
5449 	struct fw_flowc_wr *flowc;
5450 	struct wrq_cookie cookie;
5451 
5452 	mtx_assert(&cst->lock, MA_OWNED);
5453 
5454 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie);
5455 	if (__predict_false(flowc == NULL))
5456 		CXGBE_UNIMPLEMENTED(__func__);
5457 
5458 	bzero(flowc, ETID_FLUSH_LEN16 * 16);
5459 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
5460 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
5461 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
5462 	    V_FW_WR_FLOWID(cst->etid));
5463 
5464 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
5465 
5466 	cst->flags |= EO_FLUSH_RPL_PENDING;
5467 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
5468 	cst->tx_credits -= ETID_FLUSH_LEN16;
5469 	cst->ncompl++;
5470 }
5471 
5472 static void
5473 write_ethofld_wr(struct cxgbe_snd_tag *cst, struct fw_eth_tx_eo_wr *wr,
5474     struct mbuf *m0, int compl)
5475 {
5476 	struct cpl_tx_pkt_core *cpl;
5477 	uint64_t ctrl1;
5478 	uint32_t ctrl;	/* immediate data length, then LSO flags */
5479 	int len16, pktlen, nsegs, immhdrs;
5481 	uintptr_t p;
5482 	struct ulptx_sgl *usgl;
5483 	struct sglist sg;
5484 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
5485 
5486 	mtx_assert(&cst->lock, MA_OWNED);
5487 	M_ASSERTPKTHDR(m0);
5488 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
5489 	    m0->m_pkthdr.l4hlen > 0,
5490 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
5491 
5492 	if (needs_udp_csum(m0)) {
5493 		CXGBE_UNIMPLEMENTED("UDP ethofld");
5494 	}
5495 
5496 	len16 = mbuf_eo_len16(m0);
5497 	nsegs = mbuf_eo_nsegs(m0);
5498 	pktlen = m0->m_pkthdr.len;
5499 	ctrl = sizeof(struct cpl_tx_pkt_core);
5500 	if (needs_tso(m0))
5501 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
5502 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
5503 	ctrl += immhdrs;
5504 
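	/*
	 * WR layout: fw_eth_tx_eo_wr header, optional cpl_tx_pkt_lso_core
	 * (TSO only), cpl_tx_pkt_core, the L2/L3/L4 headers as immediate
	 * data, zero padding to a 16B boundary, then the payload DSGL.
	 */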
5505 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
5506 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
5507 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
5508 	    V_FW_WR_FLOWID(cst->etid));
5509 	wr->r3 = 0;
5510 	wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
5511 	wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
5512 	wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
5513 	wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
5514 	wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
5515 	wr->u.tcpseg.r4 = 0;
5516 	wr->u.tcpseg.r5 = 0;
5517 	wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
5518 
5519 	if (needs_tso(m0)) {
5520 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
5521 
5522 		wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
5523 
5524 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
5525 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
5526 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
5527 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
5528 			ctrl |= V_LSO_ETHHDR_LEN(1);
5529 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
5530 			ctrl |= F_LSO_IPV6;
5531 		lso->lso_ctrl = htobe32(ctrl);
5532 		lso->ipid_ofst = htobe16(0);
5533 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
5534 		lso->seqno_offset = htobe32(0);
5535 		lso->len = htobe32(pktlen);
5536 
5537 		cpl = (void *)(lso + 1);
5538 	} else {
5539 		wr->u.tcpseg.mss = htobe16(0xffff);
5540 		cpl = (void *)(wr + 1);
5541 	}
5542 
5543 	/* Checksum offload must be requested for ethofld. */
5544 	ctrl1 = 0;
5545 	MPASS(needs_l4_csum(m0));
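	/*
	 * Leaving the TXPKT_IPCSUM_DIS/TXPKT_L4CSUM_DIS bits clear in ctrl1
	 * asks the hardware to insert both the IP and L4 checksums.
	 */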
5546 
5547 	/* VLAN tag insertion */
5548 	if (needs_vlan_insertion(m0)) {
5549 		ctrl1 |= F_TXPKT_VLAN_VLD |
5550 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
5551 	}
5552 
5553 	/* CPL header */
5554 	cpl->ctrl0 = cst->ctrl0;
5555 	cpl->pack = 0;
5556 	cpl->len = htobe16(pktlen);
5557 	cpl->ctrl1 = htobe64(ctrl1);
5558 
5559 	/* Copy Ethernet, IP & TCP hdrs as immediate data */
5560 	p = (uintptr_t)(cpl + 1);
5561 	m_copydata(m0, 0, immhdrs, (void *)p);
5562 
5563 	/* SGL */
5565 	if (nsegs > 0) {
5566 		int i, pad;
5567 
5568 		/* zero-pad up to the next 16B boundary, if not already aligned */
5569 		p += immhdrs;
5570 		pad = roundup2(immhdrs, 16) - immhdrs;
5571 		bzero((void *)p, pad);
5572 
5573 		usgl = (void *)(p + pad);
5574 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
5575 		    V_ULPTX_NSGE(nsegs));
5576 
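		/*
		 * Gather only the payload: the first immhdrs bytes were
		 * already copied into the WR as immediate data above.
		 */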
5577 		sglist_init(&sg, nitems(segs), segs);
5578 		for (; m0 != NULL; m0 = m0->m_next) {
5579 			if (__predict_false(m0->m_len == 0))
5580 				continue;
5581 			if (immhdrs >= m0->m_len) {
5582 				immhdrs -= m0->m_len;
5583 				continue;
5584 			}
5585 
5586 			sglist_append(&sg, mtod(m0, char *) + immhdrs,
5587 			    m0->m_len - immhdrs);
5588 			immhdrs = 0;
5589 		}
5590 		MPASS(sg.sg_nseg == nsegs);
5591 
5592 		/*
5593 		 * Zero pad last 8B in case the WR doesn't end on a 16B
5594 		 * boundary.
5595 		 */
5596 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
5597 
5598 		usgl->len0 = htobe32(segs[0].ss_len);
5599 		usgl->addr0 = htobe64(segs[0].ss_paddr);
5600 		for (i = 0; i < nsegs - 1; i++) {
5601 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
5602 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
5603 		}
5604 		if (i & 1)
5605 			usgl->sge[i / 2].len[1] = htobe32(0);
5606 	}
5608 }
5609 
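/*
 * Drains the tag's pending_tx queue for as long as tx credits last.
 * Transmitted mbufs move to pending_fwack and are freed only when the
 * firmware returns their credits in a fw4_ack.
 */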
5610 static void
5611 ethofld_tx(struct cxgbe_snd_tag *cst)
5612 {
5613 	struct mbuf *m;
5614 	struct wrq_cookie cookie;
5615 	int next_credits, compl;
5616 	struct fw_eth_tx_eo_wr *wr;
5617 
5618 	mtx_assert(&cst->lock, MA_OWNED);
5619 
5620 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
5621 		M_ASSERTPKTHDR(m);
5622 
5623 		/* How many len16 credits do we need to send this mbuf? */
5624 		next_credits = mbuf_eo_len16(m);
5625 		MPASS(next_credits > 0);
5626 		if (next_credits > cst->tx_credits) {
5627 			/*
5628 			 * Tx will make progress eventually because there is at
5629 			 * least one outstanding fw4_ack that will return
5630 			 * credits and kick the tx.
5631 			 */
5632 			MPASS(cst->ncompl > 0);
5633 			return;
5634 		}
5635 		wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie);
5636 		if (__predict_false(wr == NULL)) {
5637 			/* XXX: wishful thinking, not a real assertion. */
5638 			MPASS(cst->ncompl > 0);
5639 			return;
5640 		}
5641 		cst->tx_credits -= next_credits;
5642 		cst->tx_nocompl += next_credits;
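		/*
		 * Request a fw completion if none is outstanding or if half
		 * of the total credits have been used without one; the
		 * resulting fw4_ack is the only way credits come back.
		 */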
5643 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
5644 		ETHER_BPF_MTAP(cst->com.ifp, m);
5645 		write_ethofld_wr(cst, wr, m, compl);
5646 		commit_wrq_wr(cst->eo_txq, wr, &cookie);
5647 		if (compl) {
5648 			cst->ncompl++;
5649 			cst->tx_nocompl = 0;
5650 		}
5651 		(void) mbufq_dequeue(&cst->pending_tx);
5652 		mbufq_enqueue(&cst->pending_fwack, m);
5653 	}
5654 }
5655 
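/*
 * Transmit path for mbufs that carry an ethofld (rate limit) send tag.
 * The tag's first packet also selects the txq/iqid and sends the flowc.
 */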
5656 int
5657 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
5658 {
5659 	struct cxgbe_snd_tag *cst;
5660 	int rc;
5661 
5662 	MPASS(m0->m_nextpkt == NULL);
5663 	MPASS(m0->m_pkthdr.snd_tag != NULL);
5664 	cst = mst_to_cst(m0->m_pkthdr.snd_tag);
5665 
5666 	mtx_lock(&cst->lock);
5667 	MPASS(cst->flags & EO_SND_TAG_REF);
5668 
5669 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
5670 		struct vi_info *vi = ifp->if_softc;
5671 		struct port_info *pi = vi->pi;
5672 		struct adapter *sc = pi->adapter;
5673 		const uint32_t rss_mask = vi->rss_size - 1;
5674 		uint32_t rss_hash;
5675 
5676 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
5677 		if (M_HASHTYPE_ISHASH(m0))
5678 			rss_hash = m0->m_pkthdr.flowid;
5679 		else
5680 			rss_hash = arc4random();
5681 		/* We assume RSS hashing */
5682 		cst->iqid = vi->rss[rss_hash & rss_mask];
5683 		cst->eo_txq += rss_hash % vi->nofldtxq;
5684 		rc = send_etid_flowc_wr(cst, pi, vi);
5685 		if (rc != 0)
5686 			goto done;
5687 	}
5688 
5689 	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
5690 		rc = ENOBUFS;
5691 		goto done;
5692 	}
5693 
5694 	mbufq_enqueue(&cst->pending_tx, m0);
5695 	cst->plen += m0->m_pkthdr.len;
5696 
5697 	ethofld_tx(cst);
5698 	rc = 0;
5699 done:
5700 	mtx_unlock(&cst->lock);
5701 	if (__predict_false(rc != 0))
5702 		m_freem(m0);
5703 	return (rc);
5704 }
5705 
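/*
 * Handles the CPL_FW4_ACK messages generated by ethofld WRs.  Each ack
 * returns len16 credits; fully acked mbufs are freed here and the tag
 * itself is destroyed once the kernel reference is gone and all credits
 * are back.
 */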
5706 static int
5707 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
5708 {
5709 	struct adapter *sc = iq->adapter;
5710 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
5711 	struct mbuf *m;
5712 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
5713 	struct cxgbe_snd_tag *cst;
5714 	uint8_t credits = cpl->credits;
5715 
5716 	cst = lookup_etid(sc, etid);
5717 	mtx_lock(&cst->lock);
5718 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
5719 		MPASS(credits >= ETID_FLOWC_LEN16);
5720 		credits -= ETID_FLOWC_LEN16;
5721 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
5722 	}
5723 
5724 	KASSERT(cst->ncompl > 0,
5725 	    ("%s: etid %u (%p) wasn't expecting completion.",
5726 	    __func__, etid, cst));
5727 	cst->ncompl--;
5728 
5729 	while (credits > 0) {
5730 		m = mbufq_dequeue(&cst->pending_fwack);
5731 		if (__predict_false(m == NULL)) {
5732 			/*
5733 			 * The remaining credits are for the final flush that
5734 			 * was issued when the tag was freed by the kernel.
5735 			 */
5736 			MPASS((cst->flags &
5737 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
5738 			    EO_FLUSH_RPL_PENDING);
5739 			MPASS(credits == ETID_FLUSH_LEN16);
5740 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
5741 			MPASS(cst->ncompl == 0);
5742 
5743 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
5744 			cst->tx_credits += cpl->credits;
5745 freetag:
5746 			cxgbe_snd_tag_free_locked(cst);
5747 			return (0);	/* cst is gone. */
5748 		}
5749 		KASSERT(m != NULL,
5750 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
5751 		    credits));
5752 		KASSERT(credits >= mbuf_eo_len16(m),
5753 		    ("%s: too few credits (%u, %u, %u)", __func__,
5754 		    cpl->credits, credits, mbuf_eo_len16(m)));
5755 		credits -= mbuf_eo_len16(m);
5756 		cst->plen -= m->m_pkthdr.len;
5757 		m_freem(m);
5758 	}
5759 
5760 	cst->tx_credits += cpl->credits;
5761 	MPASS(cst->tx_credits <= cst->tx_total);
5762 
5763 	m = mbufq_first(&cst->pending_tx);
5764 	if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
5765 		ethofld_tx(cst);
5766 
5767 	if (__predict_false((cst->flags & EO_SND_TAG_REF) == 0) &&
5768 	    cst->ncompl == 0) {
5769 		if (cst->tx_credits == cst->tx_total)
5770 			goto freetag;
5772 		MPASS((cst->flags & EO_FLUSH_RPL_PENDING) == 0);
5773 		send_etid_flush_wr(cst);
5775 	}
5776 
5777 	mtx_unlock(&cst->lock);
5778 
5779 	return (0);
5780 }
5781 #endif
5782