xref: /freebsd/sys/dev/cxgbe/t4_sge.c (revision 1de7b4b805ddbf2429da511c053686ac4591ed89)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 
36 #include <sys/types.h>
37 #include <sys/eventhandler.h>
38 #include <sys/mbuf.h>
39 #include <sys/socket.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/queue.h>
43 #include <sys/sbuf.h>
44 #include <sys/taskqueue.h>
45 #include <sys/time.h>
46 #include <sys/sglist.h>
47 #include <sys/sysctl.h>
48 #include <sys/smp.h>
49 #include <sys/counter.h>
50 #include <net/bpf.h>
51 #include <net/ethernet.h>
52 #include <net/if.h>
53 #include <net/if_vlan_var.h>
54 #include <netinet/in.h>
55 #include <netinet/ip.h>
56 #include <netinet/ip6.h>
57 #include <netinet/tcp.h>
58 #include <machine/in_cksum.h>
59 #include <machine/md_var.h>
60 #include <vm/vm.h>
61 #include <vm/pmap.h>
62 #ifdef DEV_NETMAP
63 #include <machine/bus.h>
64 #include <sys/selinfo.h>
65 #include <net/if_var.h>
66 #include <net/netmap.h>
67 #include <dev/netmap/netmap_kern.h>
68 #endif
69 
70 #include "common/common.h"
71 #include "common/t4_regs.h"
72 #include "common/t4_regs_values.h"
73 #include "common/t4_msg.h"
74 #include "t4_l2t.h"
75 #include "t4_mp_ring.h"
76 
/*
 * RX_COPY_THRESHOLD is MINCLSIZE, less 8 bytes when T4_PKT_TIMESTAMP is
 * defined.  In this part of the file it is used to derive the TP "indicate
 * size" (see indsz in t4_tweak_chip_settings and t4_read_chip_settings).
 * NOTE(review): the 8 bytes are presumably room for a timestamp; confirm
 * against the rx path.
 */
#ifdef T4_PKT_TIMESTAMP
#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
#else
#define RX_COPY_THRESHOLD MINCLSIZE
#endif
82 
/*
 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
 * 0-7 are valid values.  t4_sge_modload() replaces anything else with 2.
 */
static int fl_pktshift = 2;
TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);

/*
 * Pad ethernet payload up to this boundary.
 * -1: driver should figure out a good value.
 *  0: disable padding.
 *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
 *
 * The value is validated per chip in setup_pad_and_pack_boundaries(), which
 * substitutes a default (and complains if the value was > 0) when the chip
 * cannot express it.  This variable is not static.
 * NOTE(review): presumably it is referenced from other files of the driver.
 */
int fl_pad = -1;
TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);

/*
 * Status page length.
 * -1: driver should figure out a good value.
 *  64 or 128 are the only other valid values.
 *
 * t4_sge_modload() picks 64 (or 128 on i386/amd64 when the cache line flush
 * size is larger than 64) for -1 and for any invalid value.
 */
static int spg_len = -1;
TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);

/*
 * Congestion drops.
 * -1: no congestion feedback (not recommended).
 *  0: backpressure the channel instead of dropping packets right away.
 *  1: no backpressure, drop packets for the congested queue immediately.
 *
 * t4_sge_modload() resets out-of-range values to 0.
 */
static int cong_drop = 0;
TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);

/*
 * Deliver multiple frames in the same free list buffer if they fit.
 * -1: let the driver decide whether to enable buffer packing or not.
 *  0: disable buffer packing.
 *  1: enable buffer packing.
 */
static int buffer_packing = -1;
TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);

/*
 * Start next frame in a packed buffer at this boundary.
 * -1: driver should figure out a good value.
 * T4: driver will ignore this and use the same value as fl_pad above.
 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
 *
 * Validated in setup_pad_and_pack_boundaries().
 */
static int fl_pack = -1;
TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);

/*
 * Allow the driver to create mbuf(s) in a cluster allocated for rx.
 * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
 * 1: ok to create mbuf(s) within a cluster if there is room.
 */
static int allow_mbufs_in_cluster = 1;
TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);

/*
 * Largest rx cluster size that the driver is allowed to allocate.
 */
static int largest_rx_cluster = MJUM16BYTES;
TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);

/*
 * Size of cluster allocation that's most likely to succeed.  The driver will
 * fall back to this size if it fails to allocate clusters larger than this.
 * t4_read_chip_settings() looks for the software zone of exactly this size
 * when it sets up the "safe" hw buffer indices.
 */
static int safest_rx_cluster = PAGE_SIZE;
TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);

/*
 * The interrupt holdoff timers are multiplied by this value on T6+.
 * 1 and 3-17 (both inclusive) are legal values.  t4_sge_modload() replaces
 * anything else with 1.
 */
static int tscale = 1;
TUNABLE_INT("hw.cxgbe.tscale", &tscale);

/*
 * Number of LRO entries in the lro_ctrl structure per rx queue.
 */
static int lro_entries = TCP_LRO_ENTRIES;
TUNABLE_INT("hw.cxgbe.lro_entries", &lro_entries);

/*
 * This enables presorting of frames before they're fed into tcp_lro_rx.
 */
static int lro_mbufs = 0;
TUNABLE_INT("hw.cxgbe.lro_mbufs", &lro_mbufs);
173 
/*
 * Running totals for a single tx work request that carries one or more
 * packets.  Passed to try_txpkts(), add_to_txpkts() and write_txpkts_wr().
 */
struct txpkts {
	u_int wr_type;		/* type 0 or type 1 */
	u_int npkt;		/* # of packets in this work request */
	u_int plen;		/* total payload (sum of all packets) */
	u_int len16;		/* # of 16B pieces used by this work request */
};
180 
/*
 * A packet's SGL.  This + m_pkthdr has all info needed for tx.
 *
 * The sglist header is immediately followed by room for TX_SGL_SEGS segments.
 * NOTE(review): presumably sg is initialized to use seg[] as its segment
 * array; confirm at the site that sets it up.
 */
struct sgl {
	struct sglist sg;
	struct sglist_seg seg[TX_SGL_SEGS];
};
186 
187 static int service_iq(struct sge_iq *, int);
188 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
189 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
190 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
191 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
192 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
193     uint16_t, char *);
194 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
195     bus_addr_t *, void **);
196 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
197     void *);
198 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
199     int, int);
200 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
201 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
202     struct sysctl_oid *, struct sge_fl *);
203 static int alloc_fwq(struct adapter *);
204 static int free_fwq(struct adapter *);
205 static int alloc_mgmtq(struct adapter *);
206 static int free_mgmtq(struct adapter *);
207 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
208     struct sysctl_oid *);
209 static int free_rxq(struct vi_info *, struct sge_rxq *);
210 #ifdef TCP_OFFLOAD
211 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
212     struct sysctl_oid *);
213 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
214 #endif
215 #ifdef DEV_NETMAP
216 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
217     struct sysctl_oid *);
218 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
219 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
220     struct sysctl_oid *);
221 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
222 #endif
223 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
224 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
225 #ifdef TCP_OFFLOAD
226 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
227 #endif
228 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
229 static int free_eq(struct adapter *, struct sge_eq *);
230 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
231     struct sysctl_oid *);
232 static int free_wrq(struct adapter *, struct sge_wrq *);
233 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
234     struct sysctl_oid *);
235 static int free_txq(struct vi_info *, struct sge_txq *);
236 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
237 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
238 static int refill_fl(struct adapter *, struct sge_fl *, int);
239 static void refill_sfl(void *);
240 static int alloc_fl_sdesc(struct sge_fl *);
241 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
242 static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
243 static void find_safe_refill_source(struct adapter *, struct sge_fl *);
244 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
245 
246 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
247 static inline u_int txpkt_len16(u_int, u_int);
248 static inline u_int txpkt_vm_len16(u_int, u_int);
249 static inline u_int txpkts0_len16(u_int);
250 static inline u_int txpkts1_len16(void);
251 static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
252     struct mbuf *, u_int);
253 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
254     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
255 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
256 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
257 static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
258     struct mbuf *, const struct txpkts *, u_int);
259 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
260 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
261 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
262 static inline uint16_t read_hw_cidx(struct sge_eq *);
263 static inline u_int reclaimable_tx_desc(struct sge_eq *);
264 static inline u_int total_available_tx_desc(struct sge_eq *);
265 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
266 static void tx_reclaim(void *, int);
267 static __be64 get_flit(struct sglist_seg *, int, int);
268 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
269     struct mbuf *);
270 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
271     struct mbuf *);
272 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
273 static void wrq_tx_drain(void *, int);
274 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
275 
276 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
277 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
278 static int sysctl_tc(SYSCTL_HANDLER_ARGS);
279 
/*
 * Driver-wide counters.  Both are allocated and zeroed in t4_sge_modload()
 * and freed in t4_sge_modunload(); t4_sge_extfree_refs() reports
 * extfree_refs - extfree_rels.
 */
static counter_u64_t extfree_refs;
static counter_u64_t extfree_rels;

/*
 * Handler dispatch state.  t4_sge_modload() points every slot at the matching
 * *_not_handled() routine before installing the handlers implemented in this
 * file; the t4_register_*_handler() functions update individual slots.
 */
an_handler_t t4_an_handler;
fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
286 
287 
288 static int
289 an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
290 {
291 
292 #ifdef INVARIANTS
293 	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
294 #else
295 	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n",
296 	    __func__, iq, ctrl);
297 #endif
298 	return (EDOOFUS);
299 }
300 
301 int
302 t4_register_an_handler(an_handler_t h)
303 {
304 	uintptr_t *loc, new;
305 
306 	new = h ? (uintptr_t)h : (uintptr_t)an_not_handled;
307 	loc = (uintptr_t *) &t4_an_handler;
308 	atomic_store_rel_ptr(loc, new);
309 
310 	return (0);
311 }
312 
313 static int
314 fw_msg_not_handled(struct adapter *sc, const __be64 *rpl)
315 {
316 	const struct cpl_fw6_msg *cpl =
317 	    __containerof(rpl, struct cpl_fw6_msg, data[0]);
318 
319 #ifdef INVARIANTS
320 	panic("%s: fw_msg type %d", __func__, cpl->type);
321 #else
322 	log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type);
323 #endif
324 	return (EDOOFUS);
325 }
326 
327 int
328 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
329 {
330 	uintptr_t *loc, new;
331 
332 	if (type >= nitems(t4_fw_msg_handler))
333 		return (EINVAL);
334 
335 	/*
336 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
337 	 * handler dispatch table.  Reject any attempt to install a handler for
338 	 * this subtype.
339 	 */
340 	if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL)
341 		return (EINVAL);
342 
343 	new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled;
344 	loc = (uintptr_t *) &t4_fw_msg_handler[type];
345 	atomic_store_rel_ptr(loc, new);
346 
347 	return (0);
348 }
349 
350 static int
351 cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
352 {
353 
354 #ifdef INVARIANTS
355 	panic("%s: opcode 0x%02x on iq %p with payload %p",
356 	    __func__, rss->opcode, iq, m);
357 #else
358 	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n",
359 	    __func__, rss->opcode, iq, m);
360 	m_freem(m);
361 #endif
362 	return (EDOOFUS);
363 }
364 
365 int
366 t4_register_cpl_handler(int opcode, cpl_handler_t h)
367 {
368 	uintptr_t *loc, new;
369 
370 	if (opcode >= nitems(t4_cpl_handler))
371 		return (EINVAL);
372 
373 	new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled;
374 	loc = (uintptr_t *) &t4_cpl_handler[opcode];
375 	atomic_store_rel_ptr(loc, new);
376 
377 	return (0);
378 }
379 
380 /*
381  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
382  */
383 void
384 t4_sge_modload(void)
385 {
386 	int i;
387 
388 	if (fl_pktshift < 0 || fl_pktshift > 7) {
389 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
390 		    " using 2 instead.\n", fl_pktshift);
391 		fl_pktshift = 2;
392 	}
393 
394 	if (spg_len != 64 && spg_len != 128) {
395 		int len;
396 
397 #if defined(__i386__) || defined(__amd64__)
398 		len = cpu_clflush_line_size > 64 ? 128 : 64;
399 #else
400 		len = 64;
401 #endif
402 		if (spg_len != -1) {
403 			printf("Invalid hw.cxgbe.spg_len value (%d),"
404 			    " using %d instead.\n", spg_len, len);
405 		}
406 		spg_len = len;
407 	}
408 
409 	if (cong_drop < -1 || cong_drop > 1) {
410 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
411 		    " using 0 instead.\n", cong_drop);
412 		cong_drop = 0;
413 	}
414 
415 	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
416 		printf("Invalid hw.cxgbe.tscale value (%d),"
417 		    " using 1 instead.\n", tscale);
418 		tscale = 1;
419 	}
420 
421 	extfree_refs = counter_u64_alloc(M_WAITOK);
422 	extfree_rels = counter_u64_alloc(M_WAITOK);
423 	counter_u64_zero(extfree_refs);
424 	counter_u64_zero(extfree_rels);
425 
426 	t4_an_handler = an_not_handled;
427 	for (i = 0; i < nitems(t4_fw_msg_handler); i++)
428 		t4_fw_msg_handler[i] = fw_msg_not_handled;
429 	for (i = 0; i < nitems(t4_cpl_handler); i++)
430 		t4_cpl_handler[i] = cpl_not_handled;
431 
432 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
433 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
434 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
435 	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
436 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
437 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
438 }
439 
440 void
441 t4_sge_modunload(void)
442 {
443 
444 	counter_u64_free(extfree_refs);
445 	counter_u64_free(extfree_rels);
446 }
447 
448 uint64_t
449 t4_sge_extfree_refs(void)
450 {
451 	uint64_t refs, rels;
452 
453 	rels = counter_u64_fetch(extfree_rels);
454 	refs = counter_u64_fetch(extfree_refs);
455 
456 	return (refs - rels);
457 }
458 
459 static inline void
460 setup_pad_and_pack_boundaries(struct adapter *sc)
461 {
462 	uint32_t v, m;
463 	int pad, pack, pad_shift;
464 
465 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
466 	    X_INGPADBOUNDARY_SHIFT;
467 	pad = fl_pad;
468 	if (fl_pad < (1 << pad_shift) ||
469 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
470 	    !powerof2(fl_pad)) {
471 		/*
472 		 * If there is any chance that we might use buffer packing and
473 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
474 		 * it to the minimum allowed in all other cases.
475 		 */
476 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
477 
478 		/*
479 		 * For fl_pad = 0 we'll still write a reasonable value to the
480 		 * register but all the freelists will opt out of padding.
481 		 * We'll complain here only if the user tried to set it to a
482 		 * value greater than 0 that was invalid.
483 		 */
484 		if (fl_pad > 0) {
485 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
486 			    " (%d), using %d instead.\n", fl_pad, pad);
487 		}
488 	}
489 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
490 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
491 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
492 
493 	if (is_t4(sc)) {
494 		if (fl_pack != -1 && fl_pack != pad) {
495 			/* Complain but carry on. */
496 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
497 			    " using %d instead.\n", fl_pack, pad);
498 		}
499 		return;
500 	}
501 
502 	pack = fl_pack;
503 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
504 	    !powerof2(fl_pack)) {
505 		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
506 		MPASS(powerof2(pack));
507 		if (pack < 16)
508 			pack = 16;
509 		if (pack == 32)
510 			pack = 64;
511 		if (pack > 4096)
512 			pack = 4096;
513 		if (fl_pack != -1) {
514 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
515 			    " (%d), using %d instead.\n", fl_pack, pack);
516 		}
517 	}
518 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
519 	if (pack == 16)
520 		v = V_INGPACKBOUNDARY(0);
521 	else
522 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
523 
524 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
525 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
526 }
527 
/*
 * Write the driver's preferred SGE, ULP_RX and TP settings to the chip.  The
 * caller must be the master PF (asserted below).
 *
 * adap->params.vpd.cclk must be set up before this is called; it is used to
 * work out the largest holdoff timer value that can be programmed.
 */
void
t4_tweak_chip_settings(struct adapter *sc)
{
	int i;
	uint32_t v, m;
	/* Desired interrupt holdoff timers; must be in increasing order. */
	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
	/* Written to consecutive SGE_FL_BUFFER_SIZEn registers, in this order. */
	static int sge_flbuf_sizes[] = {
		MCLBYTES,
#if MJUMPAGESIZE != MCLBYTES
		MJUMPAGESIZE,
		MJUMPAGESIZE - CL_METADATA_SIZE,
		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
#endif
		MJUM9BYTES,
		MJUM16BYTES,
		MCLBYTES - MSIZE - CL_METADATA_SIZE,
		MJUM9BYTES - CL_METADATA_SIZE,
		MJUM16BYTES - CL_METADATA_SIZE,
	};

	KASSERT(sc->flags & MASTER_PF,
	    ("%s: trying to change chip settings when not master.", __func__));

	/* Packet shift, rx CPL mode and status page size from the tunables. */
	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
	    V_EGRSTATUSPAGESIZE(spg_len == 128);
	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);

	setup_pad_and_pack_boundaries(sc);

	/* Same host page size for all eight PFs. */
	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);

	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
	    ("%s: hw buffer size table too big", __func__));
	/* The min() keeps non-INVARIANTS kernels within the register bank. */
	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
		    sge_flbuf_sizes[i]);
	}

	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);

	KASSERT(intr_timer[0] <= timer_max,
	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
	    timer_max));
	/*
	 * Pull any timer that exceeds timer_max back into range: the last
	 * timer is clamped to timer_max, every other one is repeatedly
	 * averaged with its predecessor until it fits.
	 */
	for (i = 1; i < nitems(intr_timer); i++) {
		KASSERT(intr_timer[i] >= intr_timer[i - 1],
		    ("%s: timers not listed in increasing order (%d)",
		    __func__, i));

		while (intr_timer[i] > timer_max) {
			if (i == nitems(intr_timer) - 1) {
				intr_timer[i] = timer_max;
				break;
			}
			intr_timer[i] += intr_timer[i - 1];
			intr_timer[i] /= 2;
		}
	}

	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);

	if (chip_id(sc) >= CHELSIO_T6) {
		/* A tscale of 1 is written as 0, anything else as tscale - 2. */
		m = V_TSCALE(M_TSCALE);
		if (tscale == 1)
			v = 0;
		else
			v = V_TSCALE(tscale - 2);
		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);

		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
			/* Read-modify-write of TP_CMM_CONFIG via TP PIO. */
			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
			    V_WRTHRTHRESH(M_WRTHRTHRESH);
			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
			v &= ~m;
			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
			    V_WRTHRTHRESH(16);
			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
		}
	}

	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);

	/*
	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
	 * may have to deal with is MAXPHYS + 1 page.
	 */
	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);

	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);

	/* t4_read_chip_settings() verifies these same TP_PARA_REG5 bits. */
	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
	    F_RESETDDPOFFSET;
	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
}
653 
654 /*
655  * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
656  * padding is in use, the buffer's start and end need to be aligned to the pad
657  * boundary as well.  We'll just make sure that the size is a multiple of the
658  * boundary here, it is up to the buffer allocation code to make sure the start
659  * of the buffer is aligned as well.
660  */
661 static inline int
662 hwsz_ok(struct adapter *sc, int hwsz)
663 {
664 	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
665 
666 	return (hwsz >= 64 && (hwsz & mask) == 0);
667 }
668 
669 /*
670  * XXX: driver really should be able to deal with unexpected settings.
671  */
672 int
673 t4_read_chip_settings(struct adapter *sc)
674 {
675 	struct sge *s = &sc->sge;
676 	struct sge_params *sp = &sc->params.sge;
677 	int i, j, n, rc = 0;
678 	uint32_t m, v, r;
679 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
680 	static int sw_buf_sizes[] = {	/* Sorted by size */
681 		MCLBYTES,
682 #if MJUMPAGESIZE != MCLBYTES
683 		MJUMPAGESIZE,
684 #endif
685 		MJUM9BYTES,
686 		MJUM16BYTES
687 	};
688 	struct sw_zone_info *swz, *safe_swz;
689 	struct hw_buf_info *hwb;
690 
691 	m = F_RXPKTCPLMODE;
692 	v = F_RXPKTCPLMODE;
693 	r = sc->params.sge.sge_control;
694 	if ((r & m) != v) {
695 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
696 		rc = EINVAL;
697 	}
698 
699 	/*
700 	 * If this changes then every single use of PAGE_SHIFT in the driver
701 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
702 	 */
703 	if (sp->page_shift != PAGE_SHIFT) {
704 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
705 		rc = EINVAL;
706 	}
707 
708 	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
709 	hwb = &s->hw_buf_info[0];
710 	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
711 		r = sc->params.sge.sge_fl_buffer_size[i];
712 		hwb->size = r;
713 		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
714 		hwb->next = -1;
715 	}
716 
717 	/*
718 	 * Create a sorted list in decreasing order of hw buffer sizes (and so
719 	 * increasing order of spare area) for each software zone.
720 	 *
721 	 * If padding is enabled then the start and end of the buffer must align
722 	 * to the pad boundary; if packing is enabled then they must align with
723 	 * the pack boundary as well.  Allocations from the cluster zones are
724 	 * aligned to min(size, 4K), so the buffer starts at that alignment and
725 	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
726 	 * starting alignment will be reduced to MSIZE and the driver will
727 	 * exercise appropriate caution when deciding on the best buffer layout
728 	 * to use.
729 	 */
730 	n = 0;	/* no usable buffer size to begin with */
731 	swz = &s->sw_zone_info[0];
732 	safe_swz = NULL;
733 	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
734 		int8_t head = -1, tail = -1;
735 
736 		swz->size = sw_buf_sizes[i];
737 		swz->zone = m_getzone(swz->size);
738 		swz->type = m_gettype(swz->size);
739 
740 		if (swz->size < PAGE_SIZE) {
741 			MPASS(powerof2(swz->size));
742 			if (fl_pad && (swz->size % sp->pad_boundary != 0))
743 				continue;
744 		}
745 
746 		if (swz->size == safest_rx_cluster)
747 			safe_swz = swz;
748 
749 		hwb = &s->hw_buf_info[0];
750 		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
751 			if (hwb->zidx != -1 || hwb->size > swz->size)
752 				continue;
753 #ifdef INVARIANTS
754 			if (fl_pad)
755 				MPASS(hwb->size % sp->pad_boundary == 0);
756 #endif
757 			hwb->zidx = i;
758 			if (head == -1)
759 				head = tail = j;
760 			else if (hwb->size < s->hw_buf_info[tail].size) {
761 				s->hw_buf_info[tail].next = j;
762 				tail = j;
763 			} else {
764 				int8_t *cur;
765 				struct hw_buf_info *t;
766 
767 				for (cur = &head; *cur != -1; cur = &t->next) {
768 					t = &s->hw_buf_info[*cur];
769 					if (hwb->size == t->size) {
770 						hwb->zidx = -2;
771 						break;
772 					}
773 					if (hwb->size > t->size) {
774 						hwb->next = *cur;
775 						*cur = j;
776 						break;
777 					}
778 				}
779 			}
780 		}
781 		swz->head_hwidx = head;
782 		swz->tail_hwidx = tail;
783 
784 		if (tail != -1) {
785 			n++;
786 			if (swz->size - s->hw_buf_info[tail].size >=
787 			    CL_METADATA_SIZE)
788 				sc->flags |= BUF_PACKING_OK;
789 		}
790 	}
791 	if (n == 0) {
792 		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
793 		rc = EINVAL;
794 	}
795 
796 	s->safe_hwidx1 = -1;
797 	s->safe_hwidx2 = -1;
798 	if (safe_swz != NULL) {
799 		s->safe_hwidx1 = safe_swz->head_hwidx;
800 		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
801 			int spare;
802 
803 			hwb = &s->hw_buf_info[i];
804 #ifdef INVARIANTS
805 			if (fl_pad)
806 				MPASS(hwb->size % sp->pad_boundary == 0);
807 #endif
808 			spare = safe_swz->size - hwb->size;
809 			if (spare >= CL_METADATA_SIZE) {
810 				s->safe_hwidx2 = i;
811 				break;
812 			}
813 		}
814 	}
815 
816 	if (sc->flags & IS_VF)
817 		return (0);
818 
819 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
820 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
821 	if (r != v) {
822 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
823 		rc = EINVAL;
824 	}
825 
826 	m = v = F_TDDPTAGTCB;
827 	r = t4_read_reg(sc, A_ULP_RX_CTL);
828 	if ((r & m) != v) {
829 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
830 		rc = EINVAL;
831 	}
832 
833 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
834 	    F_RESETDDPOFFSET;
835 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
836 	r = t4_read_reg(sc, A_TP_PARA_REG5);
837 	if ((r & m) != v) {
838 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
839 		rc = EINVAL;
840 	}
841 
842 	t4_init_tp_params(sc, 1);
843 
844 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
845 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
846 
847 	return (rc);
848 }
849 
850 int
851 t4_create_dma_tag(struct adapter *sc)
852 {
853 	int rc;
854 
855 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
856 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
857 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
858 	    NULL, &sc->dmat);
859 	if (rc != 0) {
860 		device_printf(sc->dev,
861 		    "failed to create main DMA tag: %d\n", rc);
862 	}
863 
864 	return (rc);
865 }
866 
867 void
868 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
869     struct sysctl_oid_list *children)
870 {
871 	struct sge_params *sp = &sc->params.sge;
872 
873 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
874 	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
875 	    "freelist buffer sizes");
876 
877 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
878 	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
879 
880 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
881 	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
882 
883 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
884 	    NULL, sp->spg_len, "status page size (bytes)");
885 
886 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
887 	    NULL, cong_drop, "congestion drop setting");
888 
889 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
890 	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
891 }
892 
893 int
894 t4_destroy_dma_tag(struct adapter *sc)
895 {
896 	if (sc->dmat)
897 		bus_dma_tag_destroy(sc->dmat);
898 
899 	return (0);
900 }
901 
902 /*
903  * Allocate and initialize the firmware event queue and the management queue.
904  *
905  * Returns errno on failure.  Resources allocated up to that point may still be
906  * allocated.  Caller is responsible for cleanup in case this function fails.
907  */
908 int
909 t4_setup_adapter_queues(struct adapter *sc)
910 {
911 	int rc;
912 
913 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
914 
915 	sysctl_ctx_init(&sc->ctx);
916 	sc->flags |= ADAP_SYSCTL_CTX;
917 
918 	/*
919 	 * Firmware event queue
920 	 */
921 	rc = alloc_fwq(sc);
922 	if (rc != 0)
923 		return (rc);
924 
925 	/*
926 	 * Management queue.  This is just a control queue that uses the fwq as
927 	 * its associated iq.
928 	 */
929 	if (!(sc->flags & IS_VF))
930 		rc = alloc_mgmtq(sc);
931 
932 	return (rc);
933 }
934 
935 /*
936  * Idempotent
937  */
938 int
939 t4_teardown_adapter_queues(struct adapter *sc)
940 {
941 
942 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
943 
944 	/* Do this before freeing the queue */
945 	if (sc->flags & ADAP_SYSCTL_CTX) {
946 		sysctl_ctx_free(&sc->ctx);
947 		sc->flags &= ~ADAP_SYSCTL_CTX;
948 	}
949 
950 	free_mgmtq(sc);
951 	free_fwq(sc);
952 
953 	return (0);
954 }
955 
956 static inline int
957 first_vector(struct vi_info *vi)
958 {
959 	struct adapter *sc = vi->pi->adapter;
960 
961 	if (sc->intr_count == 1)
962 		return (0);
963 
964 	return (vi->first_intr);
965 }
966 
967 /*
968  * Given an arbitrary "index," come up with an iq that can be used by other
969  * queues (of this VI) for interrupt forwarding, SGE egress updates, etc.
970  * The iq returned is guaranteed to be something that takes direct interrupts.
971  */
972 static struct sge_iq *
973 vi_intr_iq(struct vi_info *vi, int idx)
974 {
975 	struct adapter *sc = vi->pi->adapter;
976 	struct sge *s = &sc->sge;
977 	struct sge_iq *iq = NULL;
978 	int nintr, i;
979 
980 	if (sc->intr_count == 1)
981 		return (&sc->sge.fwq);
982 
983 	nintr = vi->nintr;
984 #ifdef DEV_NETMAP
985 	/* Do not consider any netmap-only interrupts */
986 	if (vi->flags & INTR_RXQ && vi->nnmrxq > vi->nrxq)
987 		nintr -= vi->nnmrxq - vi->nrxq;
988 #endif
989 	KASSERT(nintr != 0,
990 	    ("%s: vi %p has no exclusive interrupts, total interrupts = %d",
991 	    __func__, vi, sc->intr_count));
992 	i = idx % nintr;
993 
994 	if (vi->flags & INTR_RXQ) {
995 	       	if (i < vi->nrxq) {
996 			iq = &s->rxq[vi->first_rxq + i].iq;
997 			goto done;
998 		}
999 		i -= vi->nrxq;
1000 	}
1001 #ifdef TCP_OFFLOAD
1002 	if (vi->flags & INTR_OFLD_RXQ) {
1003 	       	if (i < vi->nofldrxq) {
1004 			iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq;
1005 			goto done;
1006 		}
1007 		i -= vi->nofldrxq;
1008 	}
1009 #endif
1010 	panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__,
1011 	    vi, vi->flags & INTR_ALL, idx, nintr);
1012 done:
1013 	MPASS(iq != NULL);
1014 	KASSERT(iq->flags & IQ_INTR,
1015 	    ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi,
1016 	    vi->flags & INTR_ALL, idx));
1017 	return (iq);
1018 }
1019 
1020 /* Maximum payload that can be delivered with a single iq descriptor */
1021 static inline int
1022 mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
1023 {
1024 	int payload;
1025 
1026 #ifdef TCP_OFFLOAD
1027 	if (toe) {
1028 		payload = sc->tt.rx_coalesce ?
1029 		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
1030 	} else {
1031 #endif
1032 		/* large enough even when hw VLAN extraction is disabled */
1033 		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
1034 		    ETHER_VLAN_ENCAP_LEN + mtu;
1035 #ifdef TCP_OFFLOAD
1036 	}
1037 #endif
1038 
1039 	return (payload);
1040 }
1041 
/*
 * Allocate every queue that belongs to a VI, in this order: netmap rx/tx
 * queues (only if the ifnet is netmap capable), NIC and TOE rx queues that
 * take direct interrupts, NIC and TOE rx queues that forward their
 * interrupts, NIC and TOE tx queues, and finally the port's control queue
 * (main VI only, and not on a VF).  The sysctl nodes for the queues are added
 * under the VI's device sysctl tree along the way.
 *
 * Returns 0 on success.  On failure the errno of the allocation that failed is
 * returned and t4_teardown_vi_queues has already been run on the VI.
 */
1042 int
1043 t4_setup_vi_queues(struct vi_info *vi)
1044 {
1045 	int rc = 0, i, j, intr_idx, iqid;
1046 	struct sge_rxq *rxq;
1047 	struct sge_txq *txq;
1048 	struct sge_wrq *ctrlq;
1049 #ifdef TCP_OFFLOAD
1050 	struct sge_ofld_rxq *ofld_rxq;
1051 	struct sge_wrq *ofld_txq;
1052 #endif
1053 #ifdef DEV_NETMAP
1054 	int saved_idx;
1055 	struct sge_nm_rxq *nm_rxq;
1056 	struct sge_nm_txq *nm_txq;
1057 #endif
1058 	char name[16];	/* scratch for queue names; snprintf truncates to fit */
1059 	struct port_info *pi = vi->pi;
1060 	struct adapter *sc = pi->adapter;
1061 	struct ifnet *ifp = vi->ifp;
1062 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
1063 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
1064 	int maxp, mtu = ifp->if_mtu;
1065 
1066 	/* Interrupt vector to start from (when using multiple vectors) */
1067 	intr_idx = first_vector(vi);
1068 
1069 #ifdef DEV_NETMAP
1070 	saved_idx = intr_idx;
1071 	if (ifp->if_capabilities & IFCAP_NETMAP) {
1072 
1073 		/* netmap is supported with direct interrupts only. */
1074 		MPASS(vi->flags & INTR_RXQ);
1075 
1076 		/*
1077 		 * We don't have buffers to back the netmap rx queues
1078 		 * right now so we create the queues in a way that
1079 		 * doesn't set off any congestion signal in the chip.
1080 		 */
1081 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
1082 		    CTLFLAG_RD, NULL, "rx queues");
1083 		for_each_nm_rxq(vi, i, nm_rxq) {
1084 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
1085 			if (rc != 0)
1086 				goto done;
1087 			intr_idx++;
1088 		}
1089 
1090 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
1091 		    CTLFLAG_RD, NULL, "tx queues");
1092 		for_each_nm_txq(vi, i, nm_txq) {
			/* netmap tx queues are spread round-robin over the netmap rx queues. */
1093 			iqid = vi->first_nm_rxq + (i % vi->nnmrxq);
1094 			rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid);
1095 			if (rc != 0)
1096 				goto done;
1097 		}
1098 	}
1099 
1100 	/* Normal rx queues and netmap rx queues share the same interrupts. */
1101 	intr_idx = saved_idx;
1102 #endif
1103 
1104 	/*
1105 	 * First pass over all NIC and TOE rx queues:
1106 	 * a) initialize iq and fl
1107 	 * b) allocate queue iff it will take direct interrupts.
1108 	 */
1109 	maxp = mtu_to_max_payload(sc, mtu, 0);
1110 	if (vi->flags & INTR_RXQ) {
1111 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
1112 		    CTLFLAG_RD, NULL, "rx queues");
1113 	}
1114 	for_each_rxq(vi, i, rxq) {
1115 
1116 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
1117 
1118 		snprintf(name, sizeof(name), "%s rxq%d-fl",
1119 		    device_get_nameunit(vi->dev), i);
1120 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
1121 
1122 		if (vi->flags & INTR_RXQ) {
1123 			rxq->iq.flags |= IQ_INTR;
1124 			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
1125 			if (rc != 0)
1126 				goto done;
1127 			intr_idx++;
1128 		}
1129 	}
1130 #ifdef DEV_NETMAP
	/* Skip past the vectors shared by the NIC and netmap rx queues. */
1131 	if (ifp->if_capabilities & IFCAP_NETMAP)
1132 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
1133 #endif
1134 #ifdef TCP_OFFLOAD
1135 	maxp = mtu_to_max_payload(sc, mtu, 1);
1136 	if (vi->flags & INTR_OFLD_RXQ) {
1137 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
1138 		    CTLFLAG_RD, NULL,
1139 		    "rx queues for offloaded TCP connections");
1140 	}
1141 	for_each_ofld_rxq(vi, i, ofld_rxq) {
1142 
1143 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
1144 		    vi->qsize_rxq);
1145 
1146 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
1147 		    device_get_nameunit(vi->dev), i);
1148 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
1149 
1150 		if (vi->flags & INTR_OFLD_RXQ) {
1151 			ofld_rxq->iq.flags |= IQ_INTR;
1152 			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
1153 			if (rc != 0)
1154 				goto done;
1155 			intr_idx++;
1156 		}
1157 	}
1158 #endif
1159 
1160 	/*
1161 	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
1162 	 * their interrupts are allocated now.
1163 	 */
	/* j is the running index handed to vi_intr_iq to pick the target iq. */
1164 	j = 0;
1165 	if (!(vi->flags & INTR_RXQ)) {
1166 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
1167 		    CTLFLAG_RD, NULL, "rx queues");
1168 		for_each_rxq(vi, i, rxq) {
1169 			MPASS(!(rxq->iq.flags & IQ_INTR));
1170 
			/* Here intr_idx is the abs_id of the target iq, not a vector. */
1171 			intr_idx = vi_intr_iq(vi, j)->abs_id;
1172 
1173 			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
1174 			if (rc != 0)
1175 				goto done;
1176 			j++;
1177 		}
1178 	}
1179 #ifdef TCP_OFFLOAD
1180 	if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) {
1181 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
1182 		    CTLFLAG_RD, NULL,
1183 		    "rx queues for offloaded TCP connections");
1184 		for_each_ofld_rxq(vi, i, ofld_rxq) {
1185 			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
1186 
1187 			intr_idx = vi_intr_iq(vi, j)->abs_id;
1188 
1189 			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
1190 			if (rc != 0)
1191 				goto done;
1192 			j++;
1193 		}
1194 	}
1195 #endif
1196 
1197 	/*
1198 	 * Now the tx queues.  Only one pass needed.
1199 	 */
1200 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
1201 	    NULL, "tx queues");
	/* j keeps counting across the NIC and TOE tx queues below. */
1202 	j = 0;
1203 	for_each_txq(vi, i, txq) {
1204 		iqid = vi_intr_iq(vi, j)->cntxt_id;
1205 		snprintf(name, sizeof(name), "%s txq%d",
1206 		    device_get_nameunit(vi->dev), i);
1207 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid,
1208 		    name);
1209 
1210 		rc = alloc_txq(vi, txq, i, oid);
1211 		if (rc != 0)
1212 			goto done;
1213 		j++;
1214 	}
1215 #ifdef TCP_OFFLOAD
1216 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
1217 	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
1218 	for_each_ofld_txq(vi, i, ofld_txq) {
1219 		struct sysctl_oid *oid2;
1220 
1221 		iqid = vi_intr_iq(vi, j)->cntxt_id;
1222 		snprintf(name, sizeof(name), "%s ofld_txq%d",
1223 		    device_get_nameunit(vi->dev), i);
1224 		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
1225 		    iqid, name);
1226 
		/* name is reused: this time it is the per-queue sysctl node name. */
1227 		snprintf(name, sizeof(name), "%d", i);
1228 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
1229 		    name, CTLFLAG_RD, NULL, "offload tx queue");
1230 
1231 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
1232 		if (rc != 0)
1233 			goto done;
1234 		j++;
1235 	}
1236 #endif
1237 
1238 	/*
1239 	 * Finally, the control queue.
1240 	 */
	/* Only the main VI of a port gets one, and never on a VF. */
1241 	if (!IS_MAIN_VI(vi) || sc->flags & IS_VF)
1242 		goto done;
1243 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
1244 	    NULL, "ctrl queue");
1245 	ctrlq = &sc->sge.ctrlq[pi->port_id];
1246 	iqid = vi_intr_iq(vi, 0)->cntxt_id;
1247 	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev));
1248 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid,
1249 	    name);
1250 	rc = alloc_wrq(sc, vi, ctrlq, oid);
1251 
1252 done:
	/* Any failure above tears down everything, including what did get set up. */
1253 	if (rc)
1254 		t4_teardown_vi_queues(vi);
1255 
1256 	return (rc);
1257 }
1258 
1259 /*
1260  * Idempotent
1261  */
1262 int
1263 t4_teardown_vi_queues(struct vi_info *vi)
1264 {
1265 	int i;
1266 	struct port_info *pi = vi->pi;
1267 	struct adapter *sc = pi->adapter;
1268 	struct sge_rxq *rxq;
1269 	struct sge_txq *txq;
1270 #ifdef TCP_OFFLOAD
1271 	struct sge_ofld_rxq *ofld_rxq;
1272 	struct sge_wrq *ofld_txq;
1273 #endif
1274 #ifdef DEV_NETMAP
1275 	struct sge_nm_rxq *nm_rxq;
1276 	struct sge_nm_txq *nm_txq;
1277 #endif
1278 
1279 	/* Do this before freeing the queues */
1280 	if (vi->flags & VI_SYSCTL_CTX) {
1281 		sysctl_ctx_free(&vi->ctx);
1282 		vi->flags &= ~VI_SYSCTL_CTX;
1283 	}
1284 
1285 #ifdef DEV_NETMAP
1286 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
1287 		for_each_nm_txq(vi, i, nm_txq) {
1288 			free_nm_txq(vi, nm_txq);
1289 		}
1290 
1291 		for_each_nm_rxq(vi, i, nm_rxq) {
1292 			free_nm_rxq(vi, nm_rxq);
1293 		}
1294 	}
1295 #endif
1296 
1297 	/*
1298 	 * Take down all the tx queues first, as they reference the rx queues
1299 	 * (for egress updates, etc.).
1300 	 */
1301 
1302 	if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
1303 		free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
1304 
1305 	for_each_txq(vi, i, txq) {
1306 		free_txq(vi, txq);
1307 	}
1308 #ifdef TCP_OFFLOAD
1309 	for_each_ofld_txq(vi, i, ofld_txq) {
1310 		free_wrq(sc, ofld_txq);
1311 	}
1312 #endif
1313 
1314 	/*
1315 	 * Then take down the rx queues that forward their interrupts, as they
1316 	 * reference other rx queues.
1317 	 */
1318 
1319 	for_each_rxq(vi, i, rxq) {
1320 		if ((rxq->iq.flags & IQ_INTR) == 0)
1321 			free_rxq(vi, rxq);
1322 	}
1323 #ifdef TCP_OFFLOAD
1324 	for_each_ofld_rxq(vi, i, ofld_rxq) {
1325 		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
1326 			free_ofld_rxq(vi, ofld_rxq);
1327 	}
1328 #endif
1329 
1330 	/*
1331 	 * Then take down the rx queues that take direct interrupts.
1332 	 */
1333 
1334 	for_each_rxq(vi, i, rxq) {
1335 		if (rxq->iq.flags & IQ_INTR)
1336 			free_rxq(vi, rxq);
1337 	}
1338 #ifdef TCP_OFFLOAD
1339 	for_each_ofld_rxq(vi, i, ofld_rxq) {
1340 		if (ofld_rxq->iq.flags & IQ_INTR)
1341 			free_ofld_rxq(vi, ofld_rxq);
1342 	}
1343 #endif
1344 
1345 	return (0);
1346 }
1347 
1348 /*
1349  * Deals with errors and the firmware event queue.  All data rx queues forward
1350  * their interrupt to the firmware event queue.
1351  */
1352 void
1353 t4_intr_all(void *arg)
1354 {
1355 	struct adapter *sc = arg;
1356 	struct sge_iq *fwq = &sc->sge.fwq;
1357 
1358 	t4_intr_err(arg);
1359 	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
1360 		service_iq(fwq, 0);
1361 		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
1362 	}
1363 }
1364 
1365 /* Deals with error interrupts */
1366 void
1367 t4_intr_err(void *arg)
1368 {
1369 	struct adapter *sc = arg;
1370 
1371 	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1372 	t4_slow_intr_handler(sc);
1373 }
1374 
1375 void
1376 t4_intr_evt(void *arg)
1377 {
1378 	struct sge_iq *iq = arg;
1379 
1380 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1381 		service_iq(iq, 0);
1382 		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1383 	}
1384 }
1385 
1386 void
1387 t4_intr(void *arg)
1388 {
1389 	struct sge_iq *iq = arg;
1390 
1391 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1392 		service_iq(iq, 0);
1393 		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1394 	}
1395 }
1396 
1397 void
1398 t4_vi_intr(void *arg)
1399 {
1400 	struct irq *irq = arg;
1401 
1402 #ifdef DEV_NETMAP
1403 	if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) {
1404 		t4_nm_intr(irq->nm_rxq);
1405 		atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON);
1406 	}
1407 #endif
1408 	if (irq->rxq != NULL)
1409 		t4_intr(irq->rxq);
1410 }
1411 
1412 static inline int
1413 sort_before_lro(struct lro_ctrl *lro)
1414 {
1415 
1416 	return (lro->lro_mbuf_max != 0);
1417 }
1418 
1419 /*
1420  * Deals with anything and everything on the given ingress queue.
1421  */
1422 static int
1423 service_iq(struct sge_iq *iq, int budget)
1424 {
1425 	struct sge_iq *q;
1426 	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
1427 	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
1428 	struct adapter *sc = iq->adapter;
1429 	struct iq_desc *d = &iq->desc[iq->cidx];
1430 	int ndescs = 0, limit;
1431 	int rsp_type, refill;
1432 	uint32_t lq;
1433 	uint16_t fl_hw_cidx;
1434 	struct mbuf *m0;
1435 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1436 #if defined(INET) || defined(INET6)
1437 	const struct timeval lro_timeout = {0, sc->lro_timeout};
1438 	struct lro_ctrl *lro = &rxq->lro;
1439 #endif
1440 
1441 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1442 
1443 	limit = budget ? budget : iq->qsize / 16;
1444 
1445 	if (iq->flags & IQ_HAS_FL) {
1446 		fl = &rxq->fl;
1447 		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
1448 	} else {
1449 		fl = NULL;
1450 		fl_hw_cidx = 0;			/* to silence gcc warning */
1451 	}
1452 
1453 #if defined(INET) || defined(INET6)
1454 	if (iq->flags & IQ_ADJ_CREDIT) {
1455 		MPASS(sort_before_lro(lro));
1456 		iq->flags &= ~IQ_ADJ_CREDIT;
1457 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
1458 			tcp_lro_flush_all(lro);
1459 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
1460 			    V_INGRESSQID((u32)iq->cntxt_id) |
1461 			    V_SEINTARM(iq->intr_params));
1462 			return (0);
1463 		}
1464 		ndescs = 1;
1465 	}
1466 #else
1467 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
1468 #endif
1469 
1470 	/*
1471 	 * We always come back and check the descriptor ring for new indirect
1472 	 * interrupts and other responses after running a single handler.
1473 	 */
1474 	for (;;) {
1475 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1476 
1477 			rmb();
1478 
1479 			refill = 0;
1480 			m0 = NULL;
1481 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1482 			lq = be32toh(d->rsp.pldbuflen_qid);
1483 
1484 			switch (rsp_type) {
1485 			case X_RSPD_TYPE_FLBUF:
1486 
1487 				KASSERT(iq->flags & IQ_HAS_FL,
1488 				    ("%s: data for an iq (%p) with no freelist",
1489 				    __func__, iq));
1490 
1491 				m0 = get_fl_payload(sc, fl, lq);
1492 				if (__predict_false(m0 == NULL))
1493 					goto process_iql;
1494 				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
1495 #ifdef T4_PKT_TIMESTAMP
1496 				/*
1497 				 * 60 bit timestamp for the payload is
1498 				 * *(uint64_t *)m0->m_pktdat.  Note that it is
1499 				 * in the leading free-space in the mbuf.  The
1500 				 * kernel can clobber it during a pullup,
1501 				 * m_copymdata, etc.  You need to make sure that
1502 				 * the mbuf reaches you unmolested if you care
1503 				 * about the timestamp.
1504 				 */
1505 				*(uint64_t *)m0->m_pktdat =
1506 				    be64toh(ctrl->u.last_flit) &
1507 				    0xfffffffffffffff;
1508 #endif
1509 
1510 				/* fall through */
1511 
1512 			case X_RSPD_TYPE_CPL:
1513 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1514 				    ("%s: bad opcode %02x.", __func__,
1515 				    d->rss.opcode));
1516 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1517 				break;
1518 
1519 			case X_RSPD_TYPE_INTR:
1520 
1521 				/*
1522 				 * Interrupts should be forwarded only to queues
1523 				 * that are not forwarding their interrupts.
1524 				 * This means service_iq can recurse but only 1
1525 				 * level deep.
1526 				 */
1527 				KASSERT(budget == 0,
1528 				    ("%s: budget %u, rsp_type %u", __func__,
1529 				    budget, rsp_type));
1530 
1531 				/*
1532 				 * There are 1K interrupt-capable queues (qids 0
1533 				 * through 1023).  A response type indicating a
1534 				 * forwarded interrupt with a qid >= 1K is an
1535 				 * iWARP async notification.
1536 				 */
1537 				if (lq >= 1024) {
1538                                         t4_an_handler(iq, &d->rsp);
1539                                         break;
1540                                 }
1541 
1542 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
1543 				    sc->sge.iq_base];
1544 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1545 				    IQS_BUSY)) {
1546 					if (service_iq(q, q->qsize / 16) == 0) {
1547 						atomic_cmpset_int(&q->state,
1548 						    IQS_BUSY, IQS_IDLE);
1549 					} else {
1550 						STAILQ_INSERT_TAIL(&iql, q,
1551 						    link);
1552 					}
1553 				}
1554 				break;
1555 
1556 			default:
1557 				KASSERT(0,
1558 				    ("%s: illegal response type %d on iq %p",
1559 				    __func__, rsp_type, iq));
1560 				log(LOG_ERR,
1561 				    "%s: illegal response type %d on iq %p",
1562 				    device_get_nameunit(sc->dev), rsp_type, iq);
1563 				break;
1564 			}
1565 
1566 			d++;
1567 			if (__predict_false(++iq->cidx == iq->sidx)) {
1568 				iq->cidx = 0;
1569 				iq->gen ^= F_RSPD_GEN;
1570 				d = &iq->desc[0];
1571 			}
1572 			if (__predict_false(++ndescs == limit)) {
1573 				t4_write_reg(sc, sc->sge_gts_reg,
1574 				    V_CIDXINC(ndescs) |
1575 				    V_INGRESSQID(iq->cntxt_id) |
1576 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1577 				ndescs = 0;
1578 
1579 #if defined(INET) || defined(INET6)
1580 				if (iq->flags & IQ_LRO_ENABLED &&
1581 				    !sort_before_lro(lro) &&
1582 				    sc->lro_timeout != 0) {
1583 					tcp_lro_flush_inactive(lro,
1584 					    &lro_timeout);
1585 				}
1586 #endif
1587 
1588 				if (budget) {
1589 					if (iq->flags & IQ_HAS_FL) {
1590 						FL_LOCK(fl);
1591 						refill_fl(sc, fl, 32);
1592 						FL_UNLOCK(fl);
1593 					}
1594 					return (EINPROGRESS);
1595 				}
1596 			}
1597 			if (refill) {
1598 				FL_LOCK(fl);
1599 				refill_fl(sc, fl, 32);
1600 				FL_UNLOCK(fl);
1601 				fl_hw_cidx = fl->hw_cidx;
1602 			}
1603 		}
1604 
1605 process_iql:
1606 		if (STAILQ_EMPTY(&iql))
1607 			break;
1608 
1609 		/*
1610 		 * Process the head only, and send it to the back of the list if
1611 		 * it's still not done.
1612 		 */
1613 		q = STAILQ_FIRST(&iql);
1614 		STAILQ_REMOVE_HEAD(&iql, link);
1615 		if (service_iq(q, q->qsize / 8) == 0)
1616 			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1617 		else
1618 			STAILQ_INSERT_TAIL(&iql, q, link);
1619 	}
1620 
1621 #if defined(INET) || defined(INET6)
1622 	if (iq->flags & IQ_LRO_ENABLED) {
1623 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
1624 			MPASS(sort_before_lro(lro));
1625 			/* hold back one credit and don't flush LRO state */
1626 			iq->flags |= IQ_ADJ_CREDIT;
1627 			ndescs--;
1628 		} else {
1629 			tcp_lro_flush_all(lro);
1630 		}
1631 	}
1632 #endif
1633 
1634 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1635 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1636 
1637 	if (iq->flags & IQ_HAS_FL) {
1638 		int starved;
1639 
1640 		FL_LOCK(fl);
1641 		starved = refill_fl(sc, fl, 64);
1642 		FL_UNLOCK(fl);
1643 		if (__predict_false(starved != 0))
1644 			add_fl_to_sfl(sc, fl);
1645 	}
1646 
1647 	return (0);
1648 }
1649 
1650 static inline int
1651 cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
1652 {
1653 	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
1654 
1655 	if (rc)
1656 		MPASS(cll->region3 >= CL_METADATA_SIZE);
1657 
1658 	return (rc);
1659 }
1660 
1661 static inline struct cluster_metadata *
1662 cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
1663     caddr_t cl)
1664 {
1665 
1666 	if (cl_has_metadata(fl, cll)) {
1667 		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1668 
1669 		return ((struct cluster_metadata *)(cl + swz->size) - 1);
1670 	}
1671 	return (NULL);
1672 }
1673 
1674 static void
1675 rxb_free(struct mbuf *m)
1676 {
1677 	uma_zone_t zone = m->m_ext.ext_arg1;
1678 	void *cl = m->m_ext.ext_arg2;
1679 
1680 	uma_zfree(zone, cl);
1681 	counter_u64_add(extfree_rels, 1);
1682 }
1683 
1684 /*
1685  * The mbuf returned by this function could be allocated from zone_mbuf or
1686  * constructed in spare room in the cluster.
1687  *
1688  * The mbuf carries the payload in one of these ways
1689  * a) frame inside the mbuf (mbuf from zone_mbuf)
1690  * b) m_cljset (for clusters without metadata) zone_mbuf
1691  * c) m_extaddref (cluster with metadata) inline mbuf
1692  * d) m_extaddref (cluster with metadata) zone_mbuf
 *
 * fr_offset is the offset of this segment within the frame; the segment at
 * offset 0 gets a packet header with m_pkthdr.len set to 'remaining'.
 * 'remaining' is how much of the frame is still to be picked up; the segment
 * returned covers min(remaining, what is left of the current hw buffer).
 *
 * Returns NULL if an mbuf could not be allocated or initialized.  The
 * freelist's rx_offset and cidx are not modified in that case, so the call
 * can be repeated later.  On success the freelist moves on to the next
 * buffer unless it is packing and the current buffer still has room.
1693  */
1694 static struct mbuf *
1695 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1696     int remaining)
1697 {
1698 	struct mbuf *m;
1699 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1700 	struct cluster_layout *cll = &sd->cll;
1701 	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1702 	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
1703 	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
1704 	int len, blen;
1705 	caddr_t payload;
1706 
1707 	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
1708 	len = min(remaining, blen);
1709 	payload = sd->cl + cll->region1 + fl->rx_offset;
1710 	if (fl->flags & FL_BUF_PACKING) {
		/*
		 * Account for the padding that takes the end of the frame up to
		 * the next buf_boundary.  If that consumes the rest of the
		 * buffer then blen is left as is.
		 */
1711 		const u_int l = fr_offset + len;
1712 		const u_int pad = roundup2(l, fl->buf_boundary) - l;
1713 
1714 		if (fl->rx_offset + len + pad < hwb->size)
1715 			blen = len + pad;
1716 		MPASS(fl->rx_offset + blen <= hwb->size);
1717 	} else {
1718 		MPASS(fl->rx_offset == 0);	/* not packing */
1719 	}
1720 
1721 
1722 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1723 
1724 		/*
1725 		 * Copy payload into a freshly allocated mbuf.
1726 		 */
1727 
1728 		m = fr_offset == 0 ?
1729 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1730 		if (m == NULL)
1731 			return (NULL);
1732 		fl->mbuf_allocated++;
1733 #ifdef T4_PKT_TIMESTAMP
1734 		/* Leave room for a timestamp */
1735 		m->m_data += 8;
1736 #endif
1737 		/* copy data to mbuf */
1738 		bcopy(payload, mtod(m, caddr_t), len);
1739 
1740 	} else if (sd->nmbuf * MSIZE < cll->region1) {
1741 
1742 		/*
1743 		 * There's spare room in the cluster for an mbuf.  Create one
1744 		 * and associate it with the payload that's in the cluster.
1745 		 */
1746 
1747 		MPASS(clm != NULL);
		/* Inline mbufs are carved out of the start of the cluster. */
1748 		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
1749 		/* No bzero required */
1750 		if (m_init(m, M_NOWAIT, MT_DATA,
1751 		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
1752 			return (NULL);
1753 		fl->mbuf_inlined++;
1754 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
1755 		    swz->zone, sd->cl);
		/* extfree_refs is bumped once per cluster, on its first mbuf. */
1756 		if (sd->nmbuf++ == 0)
1757 			counter_u64_add(extfree_refs, 1);
1758 
1759 	} else {
1760 
1761 		/*
1762 		 * Grab an mbuf from zone_mbuf and associate it with the
1763 		 * payload in the cluster.
1764 		 */
1765 
1766 		m = fr_offset == 0 ?
1767 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1768 		if (m == NULL)
1769 			return (NULL);
1770 		fl->mbuf_allocated++;
1771 		if (clm != NULL) {
1772 			m_extaddref(m, payload, blen, &clm->refcount,
1773 			    rxb_free, swz->zone, sd->cl);
1774 			if (sd->nmbuf++ == 0)
1775 				counter_u64_add(extfree_refs, 1);
1776 		} else {
1777 			m_cljset(m, sd->cl, swz->type);
1778 			sd->cl = NULL;	/* consumed, not a recycle candidate */
1779 		}
1780 	}
1781 	if (fr_offset == 0)
1782 		m->m_pkthdr.len = remaining;
1783 	m->m_len = len;
1784 
1785 	if (fl->flags & FL_BUF_PACKING) {
1786 		fl->rx_offset += blen;
1787 		MPASS(fl->rx_offset <= hwb->size);
1788 		if (fl->rx_offset < hwb->size)
1789 			return (m);	/* without advancing the cidx */
1790 	}
1791 
	/*
	 * Done with this buffer.  fl->cidx counts buffers while hw_cidx and
	 * sidx are in units of 8 buffers, so hw_cidx is updated (and the wrap
	 * is checked for) only on every 8th buffer.
	 */
1792 	if (__predict_false(++fl->cidx % 8 == 0)) {
1793 		uint16_t cidx = fl->cidx / 8;
1794 
1795 		if (__predict_false(cidx == fl->sidx))
1796 			fl->cidx = cidx = 0;
1797 		fl->hw_cidx = cidx;
1798 	}
1799 	fl->rx_offset = 0;
1800 
1801 	return (m);
1802 }
1803 
1804 static struct mbuf *
1805 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
1806 {
1807 	struct mbuf *m0, *m, **pnext;
1808 	u_int remaining;
1809 	const u_int total = G_RSPD_LEN(len_newbuf);
1810 
1811 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1812 		M_ASSERTPKTHDR(fl->m0);
1813 		MPASS(fl->m0->m_pkthdr.len == total);
1814 		MPASS(fl->remaining < total);
1815 
1816 		m0 = fl->m0;
1817 		pnext = fl->pnext;
1818 		remaining = fl->remaining;
1819 		fl->flags &= ~FL_BUF_RESUME;
1820 		goto get_segment;
1821 	}
1822 
1823 	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
1824 		fl->rx_offset = 0;
1825 		if (__predict_false(++fl->cidx % 8 == 0)) {
1826 			uint16_t cidx = fl->cidx / 8;
1827 
1828 			if (__predict_false(cidx == fl->sidx))
1829 				fl->cidx = cidx = 0;
1830 			fl->hw_cidx = cidx;
1831 		}
1832 	}
1833 
1834 	/*
1835 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
1836 	 * 'len' and it may span multiple hw buffers.
1837 	 */
1838 
1839 	m0 = get_scatter_segment(sc, fl, 0, total);
1840 	if (m0 == NULL)
1841 		return (NULL);
1842 	remaining = total - m0->m_len;
1843 	pnext = &m0->m_next;
1844 	while (remaining > 0) {
1845 get_segment:
1846 		MPASS(fl->rx_offset == 0);
1847 		m = get_scatter_segment(sc, fl, total - remaining, remaining);
1848 		if (__predict_false(m == NULL)) {
1849 			fl->m0 = m0;
1850 			fl->pnext = pnext;
1851 			fl->remaining = remaining;
1852 			fl->flags |= FL_BUF_RESUME;
1853 			return (NULL);
1854 		}
1855 		*pnext = m;
1856 		pnext = &m->m_next;
1857 		remaining -= m->m_len;
1858 	}
1859 	*pnext = NULL;
1860 
1861 	M_ASSERTPKTHDR(m0);
1862 	return (m0);
1863 }
1864 
1865 static int
1866 t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1867 {
1868 	struct sge_rxq *rxq = iq_to_rxq(iq);
1869 	struct ifnet *ifp = rxq->ifp;
1870 	struct adapter *sc = iq->adapter;
1871 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1872 #if defined(INET) || defined(INET6)
1873 	struct lro_ctrl *lro = &rxq->lro;
1874 #endif
1875 	static const int sw_hashtype[4][2] = {
1876 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
1877 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
1878 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
1879 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
1880 	};
1881 
1882 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1883 	    rss->opcode));
1884 
1885 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
1886 	m0->m_len -= sc->params.sge.fl_pktshift;
1887 	m0->m_data += sc->params.sge.fl_pktshift;
1888 
1889 	m0->m_pkthdr.rcvif = ifp;
1890 	M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]);
1891 	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
1892 
1893 	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
1894 		if (ifp->if_capenable & IFCAP_RXCSUM &&
1895 		    cpl->l2info & htobe32(F_RXF_IP)) {
1896 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1897 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1898 			rxq->rxcsum++;
1899 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1900 		    cpl->l2info & htobe32(F_RXF_IP6)) {
1901 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1902 			    CSUM_PSEUDO_HDR);
1903 			rxq->rxcsum++;
1904 		}
1905 
1906 		if (__predict_false(cpl->ip_frag))
1907 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1908 		else
1909 			m0->m_pkthdr.csum_data = 0xffff;
1910 	}
1911 
1912 	if (cpl->vlan_ex) {
1913 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1914 		m0->m_flags |= M_VLANTAG;
1915 		rxq->vlan_extraction++;
1916 	}
1917 
1918 #if defined(INET) || defined(INET6)
1919 	if (iq->flags & IQ_LRO_ENABLED) {
1920 		if (sort_before_lro(lro)) {
1921 			tcp_lro_queue_mbuf(lro, m0);
1922 			return (0); /* queued for sort, then LRO */
1923 		}
1924 		if (tcp_lro_rx(lro, m0, 0) == 0)
1925 			return (0); /* queued for LRO */
1926 	}
1927 #endif
1928 	ifp->if_input(ifp, m0);
1929 
1930 	return (0);
1931 }
1932 
1933 /*
1934  * Must drain the wrq or make sure that someone else will.
1935  */
1936 static void
1937 wrq_tx_drain(void *arg, int n)
1938 {
1939 	struct sge_wrq *wrq = arg;
1940 	struct sge_eq *eq = &wrq->eq;
1941 
1942 	EQ_LOCK(eq);
1943 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
1944 		drain_wrq_wr_list(wrq->adapter, wrq);
1945 	EQ_UNLOCK(eq);
1946 }
1947 
/*
 * Move work requests from the head of wrq->wr_list into the descriptor ring
 * of the wrq's egress queue for as long as there is room for them, ringing
 * the doorbell along the way.  Every work request that is copied is removed
 * from the list and freed.
 *
 * Must be called with the eq lock held, with at least one work request on
 * the list, with no incomplete work requests on the wrq, and with the
 * doorbell caught up to the pidx.
 */
1948 static void
1949 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
1950 {
1951 	struct sge_eq *eq = &wrq->eq;
1952 	u_int available, dbdiff;	/* # of hardware descriptors */
1953 	u_int n;
1954 	struct wrqe *wr;
1955 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
1956 
1957 	EQ_LOCK_ASSERT_OWNED(eq);
1958 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
1959 	wr = STAILQ_FIRST(&wrq->wr_list);
1960 	MPASS(wr != NULL);	/* Must be called with something useful to do */
1961 	MPASS(eq->pidx == eq->dbidx);
1962 	dbdiff = 0;
1963 
1964 	do {
		/* The usable space is always one descriptor short of the gap. */
1965 		eq->cidx = read_hw_cidx(eq);
1966 		if (eq->pidx == eq->cidx)
1967 			available = eq->sidx - 1;
1968 		else
1969 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
1970 
1971 		MPASS(wr->wrq == wrq);
1972 		n = howmany(wr->wr_len, EQ_ESIZE);
		/* Not enough room; the WR stays at the head of the list. */
1973 		if (available < n)
1974 			break;
1975 
1976 		dst = (void *)&eq->desc[eq->pidx];
1977 		if (__predict_true(eq->sidx - eq->pidx > n)) {
1978 			/* Won't wrap, won't end exactly at the status page. */
1979 			bcopy(&wr->wr[0], dst, wr->wr_len);
1980 			eq->pidx += n;
1981 		} else {
			/* Copy up to the end of the ring, and the rest to its start. */
1982 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
1983 
1984 			bcopy(&wr->wr[0], dst, first_portion);
1985 			if (wr->wr_len > first_portion) {
1986 				bcopy(&wr->wr[first_portion], &eq->desc[0],
1987 				    wr->wr_len - first_portion);
1988 			}
1989 			eq->pidx = n - (eq->sidx - eq->pidx);
1990 		}
1991 		wrq->tx_wrs_copied++;
1992 
		/*
		 * Less than a quarter of the ring was free: set both EQUIQ and
		 * EQUEQ in this WR, but only if eq->equiq could be switched
		 * from 0 to 1 here.  Otherwise set EQUEQ alone once pidx is 32
		 * or more descriptors past the last place it was set.
		 */
1993 		if (available < eq->sidx / 4 &&
1994 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
1995 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
1996 			    F_FW_WR_EQUEQ);
1997 			eq->equeqidx = eq->pidx;
1998 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
1999 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2000 			eq->equeqidx = eq->pidx;
2001 		}
2002 
		/* The doorbell is rung once 16 or more descriptors have piled up. */
2003 		dbdiff += n;
2004 		if (dbdiff >= 16) {
2005 			ring_eq_db(sc, eq, dbdiff);
2006 			dbdiff = 0;
2007 		}
2008 
2009 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
2010 		free_wrqe(wr);
2011 		MPASS(wrq->nwr_pending > 0);
2012 		wrq->nwr_pending--;
2013 		MPASS(wrq->ndesc_needed >= n);
2014 		wrq->ndesc_needed -= n;
2015 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
2016 
	/* Ring the doorbell for whatever hasn't been reported yet. */
2017 	if (dbdiff)
2018 		ring_eq_db(sc, eq, dbdiff);
2019 }
2020 
2021 /*
2022  * Doesn't fail.  Holds on to work requests it can't send right away.
2023  */
2024 void
2025 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
2026 {
2027 #ifdef INVARIANTS
2028 	struct sge_eq *eq = &wrq->eq;
2029 #endif
2030 
2031 	EQ_LOCK_ASSERT_OWNED(eq);
2032 	MPASS(wr != NULL);
2033 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
2034 	MPASS((wr->wr_len & 0x7) == 0);
2035 
2036 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
2037 	wrq->nwr_pending++;
2038 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
2039 
2040 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
2041 		return;	/* commit_wrq_wr will drain wr_list as well. */
2042 
2043 	drain_wrq_wr_list(sc, wrq);
2044 
2045 	/* Doorbell must have caught up to the pidx. */
2046 	MPASS(eq->pidx == eq->dbidx);
2047 }
2048 
2049 void
2050 t4_update_fl_bufsize(struct ifnet *ifp)
2051 {
2052 	struct vi_info *vi = ifp->if_softc;
2053 	struct adapter *sc = vi->pi->adapter;
2054 	struct sge_rxq *rxq;
2055 #ifdef TCP_OFFLOAD
2056 	struct sge_ofld_rxq *ofld_rxq;
2057 #endif
2058 	struct sge_fl *fl;
2059 	int i, maxp, mtu = ifp->if_mtu;
2060 
2061 	maxp = mtu_to_max_payload(sc, mtu, 0);
2062 	for_each_rxq(vi, i, rxq) {
2063 		fl = &rxq->fl;
2064 
2065 		FL_LOCK(fl);
2066 		find_best_refill_source(sc, fl, maxp);
2067 		FL_UNLOCK(fl);
2068 	}
2069 #ifdef TCP_OFFLOAD
2070 	maxp = mtu_to_max_payload(sc, mtu, 1);
2071 	for_each_ofld_rxq(vi, i, ofld_rxq) {
2072 		fl = &ofld_rxq->fl;
2073 
2074 		FL_LOCK(fl);
2075 		find_best_refill_source(sc, fl, maxp);
2076 		FL_UNLOCK(fl);
2077 	}
2078 #endif
2079 }
2080 
2081 static inline int
2082 mbuf_nsegs(struct mbuf *m)
2083 {
2084 
2085 	M_ASSERTPKTHDR(m);
2086 	KASSERT(m->m_pkthdr.l5hlen > 0,
2087 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
2088 
2089 	return (m->m_pkthdr.l5hlen);
2090 }
2091 
2092 static inline void
2093 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
2094 {
2095 
2096 	M_ASSERTPKTHDR(m);
2097 	m->m_pkthdr.l5hlen = nsegs;
2098 }
2099 
2100 static inline int
2101 mbuf_len16(struct mbuf *m)
2102 {
2103 	int n;
2104 
2105 	M_ASSERTPKTHDR(m);
2106 	n = m->m_pkthdr.PH_loc.eight[0];
2107 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2108 
2109 	return (n);
2110 }
2111 
2112 static inline void
2113 set_mbuf_len16(struct mbuf *m, uint8_t len16)
2114 {
2115 
2116 	M_ASSERTPKTHDR(m);
2117 	m->m_pkthdr.PH_loc.eight[0] = len16;
2118 }
2119 
2120 static inline int
2121 needs_tso(struct mbuf *m)
2122 {
2123 
2124 	M_ASSERTPKTHDR(m);
2125 
2126 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
2127 		KASSERT(m->m_pkthdr.tso_segsz > 0,
2128 		    ("%s: TSO requested in mbuf %p but MSS not provided",
2129 		    __func__, m));
2130 		return (1);
2131 	}
2132 
2133 	return (0);
2134 }
2135 
2136 static inline int
2137 needs_l3_csum(struct mbuf *m)
2138 {
2139 
2140 	M_ASSERTPKTHDR(m);
2141 
2142 	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
2143 		return (1);
2144 	return (0);
2145 }
2146 
2147 static inline int
2148 needs_l4_csum(struct mbuf *m)
2149 {
2150 
2151 	M_ASSERTPKTHDR(m);
2152 
2153 	if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
2154 	    CSUM_TCP_IPV6 | CSUM_TSO))
2155 		return (1);
2156 	return (0);
2157 }
2158 
2159 static inline int
2160 needs_vlan_insertion(struct mbuf *m)
2161 {
2162 
2163 	M_ASSERTPKTHDR(m);
2164 
2165 	if (m->m_flags & M_VLANTAG) {
2166 		KASSERT(m->m_pkthdr.ether_vtag != 0,
2167 		    ("%s: HWVLAN requested in mbuf %p but tag not provided",
2168 		    __func__, m));
2169 		return (1);
2170 	}
2171 	return (0);
2172 }
2173 
2174 static void *
2175 m_advance(struct mbuf **pm, int *poffset, int len)
2176 {
2177 	struct mbuf *m = *pm;
2178 	int offset = *poffset;
2179 	uintptr_t p = 0;
2180 
2181 	MPASS(len > 0);
2182 
2183 	for (;;) {
2184 		if (offset + len < m->m_len) {
2185 			offset += len;
2186 			p = mtod(m, uintptr_t) + offset;
2187 			break;
2188 		}
2189 		len -= m->m_len - offset;
2190 		m = m->m_next;
2191 		offset = 0;
2192 		MPASS(m != NULL);
2193 	}
2194 	*poffset = offset;
2195 	*pm = m;
2196 	return ((void *)p);
2197 }
2198 
2199 /*
2200  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
2201  * must have at least one mbuf that's not empty.
2202  */
2203 static inline int
2204 count_mbuf_nsegs(struct mbuf *m)
2205 {
2206 	vm_paddr_t lastb, next;
2207 	vm_offset_t va;
2208 	int len, nsegs;
2209 
2210 	MPASS(m != NULL);
2211 
2212 	nsegs = 0;
2213 	lastb = 0;
2214 	for (; m; m = m->m_next) {
2215 
2216 		len = m->m_len;
2217 		if (__predict_false(len == 0))
2218 			continue;
2219 		va = mtod(m, vm_offset_t);
2220 		next = pmap_kextract(va);
2221 		nsegs += sglist_count(m->m_data, len);
2222 		if (lastb + 1 == next)
2223 			nsegs--;
2224 		lastb = pmap_kextract(va + len - 1);
2225 	}
2226 
2227 	MPASS(nsegs > 0);
2228 	return (nsegs);
2229 }
2230 
2231 /*
2232  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
2233  * a) caller can assume it's been freed if this function returns with an error.
2234  * b) it may get defragged up if the gather list is too long for the hardware.
2235  */
2236 int
2237 parse_pkt(struct adapter *sc, struct mbuf **mp)
2238 {
2239 	struct mbuf *m0 = *mp, *m;
2240 	int rc, nsegs, defragged = 0, offset;
2241 	struct ether_header *eh;
2242 	void *l3hdr;
2243 #if defined(INET) || defined(INET6)
2244 	struct tcphdr *tcp;
2245 #endif
2246 	uint16_t eh_type;
2247 
2248 	M_ASSERTPKTHDR(m0);
2249 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2250 		rc = EINVAL;
2251 fail:
2252 		m_freem(m0);
2253 		*mp = NULL;
2254 		return (rc);
2255 	}
2256 restart:
2257 	/*
2258 	 * First count the number of gather list segments in the payload.
2259 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
2260 	 */
2261 	M_ASSERTPKTHDR(m0);
2262 	MPASS(m0->m_pkthdr.len > 0);
2263 	nsegs = count_mbuf_nsegs(m0);
2264 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
2265 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
2266 			rc = EFBIG;
2267 			goto fail;
2268 		}
2269 		*mp = m0 = m;	/* update caller's copy after defrag */
2270 		goto restart;
2271 	}
2272 
2273 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
2274 		m0 = m_pullup(m0, m0->m_pkthdr.len);
2275 		if (m0 == NULL) {
2276 			/* Should have left well enough alone. */
2277 			rc = EFBIG;
2278 			goto fail;
2279 		}
2280 		*mp = m0;	/* update caller's copy after pullup */
2281 		goto restart;
2282 	}
2283 	set_mbuf_nsegs(m0, nsegs);
2284 	if (sc->flags & IS_VF)
2285 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
2286 	else
2287 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
2288 
2289 	if (!needs_tso(m0) &&
2290 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
2291 		return (0);
2292 
2293 	m = m0;
2294 	eh = mtod(m, struct ether_header *);
2295 	eh_type = ntohs(eh->ether_type);
2296 	if (eh_type == ETHERTYPE_VLAN) {
2297 		struct ether_vlan_header *evh = (void *)eh;
2298 
2299 		eh_type = ntohs(evh->evl_proto);
2300 		m0->m_pkthdr.l2hlen = sizeof(*evh);
2301 	} else
2302 		m0->m_pkthdr.l2hlen = sizeof(*eh);
2303 
2304 	offset = 0;
2305 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2306 
2307 	switch (eh_type) {
2308 #ifdef INET6
2309 	case ETHERTYPE_IPV6:
2310 	{
2311 		struct ip6_hdr *ip6 = l3hdr;
2312 
2313 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
2314 
2315 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
2316 		break;
2317 	}
2318 #endif
2319 #ifdef INET
2320 	case ETHERTYPE_IP:
2321 	{
2322 		struct ip *ip = l3hdr;
2323 
2324 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
2325 		break;
2326 	}
2327 #endif
2328 	default:
2329 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
2330 		    " with the same INET/INET6 options as the kernel.",
2331 		    __func__, eh_type);
2332 	}
2333 
2334 #if defined(INET) || defined(INET6)
2335 	if (needs_tso(m0)) {
2336 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2337 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2338 	}
2339 #endif
2340 	MPASS(m0 == *mp);
2341 	return (0);
2342 }
2343 
2344 void *
2345 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2346 {
2347 	struct sge_eq *eq = &wrq->eq;
2348 	struct adapter *sc = wrq->adapter;
2349 	int ndesc, available;
2350 	struct wrqe *wr;
2351 	void *w;
2352 
2353 	MPASS(len16 > 0);
2354 	ndesc = howmany(len16, EQ_ESIZE / 16);
2355 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2356 
2357 	EQ_LOCK(eq);
2358 
2359 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2360 		drain_wrq_wr_list(sc, wrq);
2361 
2362 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
2363 slowpath:
2364 		EQ_UNLOCK(eq);
2365 		wr = alloc_wrqe(len16 * 16, wrq);
2366 		if (__predict_false(wr == NULL))
2367 			return (NULL);
2368 		cookie->pidx = -1;
2369 		cookie->ndesc = ndesc;
2370 		return (&wr->wr);
2371 	}
2372 
2373 	eq->cidx = read_hw_cidx(eq);
2374 	if (eq->pidx == eq->cidx)
2375 		available = eq->sidx - 1;
2376 	else
2377 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2378 	if (available < ndesc)
2379 		goto slowpath;
2380 
2381 	cookie->pidx = eq->pidx;
2382 	cookie->ndesc = ndesc;
2383 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
2384 
2385 	w = &eq->desc[eq->pidx];
2386 	IDXINCR(eq->pidx, ndesc, eq->sidx);
2387 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
2388 		w = &wrq->ss[0];
2389 		wrq->ss_pidx = cookie->pidx;
2390 		wrq->ss_len = len16 * 16;
2391 	}
2392 
2393 	EQ_UNLOCK(eq);
2394 
2395 	return (w);
2396 }
2397 
2398 void
2399 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
2400 {
2401 	struct sge_eq *eq = &wrq->eq;
2402 	struct adapter *sc = wrq->adapter;
2403 	int ndesc, pidx;
2404 	struct wrq_cookie *prev, *next;
2405 
2406 	if (cookie->pidx == -1) {
2407 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
2408 
2409 		t4_wrq_tx(sc, wr);
2410 		return;
2411 	}
2412 
2413 	if (__predict_false(w == &wrq->ss[0])) {
2414 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
2415 
2416 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
2417 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
2418 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
2419 		wrq->tx_wrs_ss++;
2420 	} else
2421 		wrq->tx_wrs_direct++;
2422 
2423 	EQ_LOCK(eq);
2424 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
2425 	pidx = cookie->pidx;
2426 	MPASS(pidx >= 0 && pidx < eq->sidx);
2427 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
2428 	next = TAILQ_NEXT(cookie, link);
2429 	if (prev == NULL) {
2430 		MPASS(pidx == eq->dbidx);
2431 		if (next == NULL || ndesc >= 16)
2432 			ring_eq_db(wrq->adapter, eq, ndesc);
2433 		else {
2434 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
2435 			next->pidx = pidx;
2436 			next->ndesc += ndesc;
2437 		}
2438 	} else {
2439 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
2440 		prev->ndesc += ndesc;
2441 	}
2442 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
2443 
2444 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2445 		drain_wrq_wr_list(sc, wrq);
2446 
2447 #ifdef INVARIANTS
2448 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
2449 		/* Doorbell must have caught up to the pidx. */
2450 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
2451 	}
2452 #endif
2453 	EQ_UNLOCK(eq);
2454 }
2455 
2456 static u_int
2457 can_resume_eth_tx(struct mp_ring *r)
2458 {
2459 	struct sge_eq *eq = r->cookie;
2460 
2461 	return (total_available_tx_desc(eq) > eq->sidx / 8);
2462 }
2463 
/*
 * Returns non-zero if the packet must not be coalesced with others in a
 * txpkts work request.  Right now that is any packet that needs TSO.
 */
static inline int
cannot_use_txpkts(struct mbuf *m)
{
	int tso;

	/* maybe put a GL limit too, to avoid silliness? */
	tso = needs_tso(m);

	return (tso);
}
2471 
2472 static inline int
2473 discard_tx(struct sge_eq *eq)
2474 {
2475 
2476 	return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED);
2477 }
2478 
2479 /*
2480  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
2481  * be consumed.  Return the actual number consumed.  0 indicates a stall.
2482  */
2483 static u_int
2484 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
2485 {
2486 	struct sge_txq *txq = r->cookie;
2487 	struct sge_eq *eq = &txq->eq;
2488 	struct ifnet *ifp = txq->ifp;
2489 	struct vi_info *vi = ifp->if_softc;
2490 	struct port_info *pi = vi->pi;
2491 	struct adapter *sc = pi->adapter;
2492 	u_int total, remaining;		/* # of packets */
2493 	u_int available, dbdiff;	/* # of hardware descriptors */
2494 	u_int n, next_cidx;
2495 	struct mbuf *m0, *tail;
2496 	struct txpkts txp;
2497 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
2498 
2499 	remaining = IDXDIFF(pidx, cidx, r->size);
2500 	MPASS(remaining > 0);	/* Must not be called without work to do. */
2501 	total = 0;
2502 
2503 	TXQ_LOCK(txq);
2504 	if (__predict_false(discard_tx(eq))) {
2505 		while (cidx != pidx) {
2506 			m0 = r->items[cidx];
2507 			m_freem(m0);
2508 			if (++cidx == r->size)
2509 				cidx = 0;
2510 		}
2511 		reclaim_tx_descs(txq, 2048);
2512 		total = remaining;
2513 		goto done;
2514 	}
2515 
2516 	/* How many hardware descriptors do we have readily available. */
2517 	if (eq->pidx == eq->cidx)
2518 		available = eq->sidx - 1;
2519 	else
2520 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2521 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
2522 
2523 	while (remaining > 0) {
2524 
2525 		m0 = r->items[cidx];
2526 		M_ASSERTPKTHDR(m0);
2527 		MPASS(m0->m_nextpkt == NULL);
2528 
2529 		if (available < SGE_MAX_WR_NDESC) {
2530 			available += reclaim_tx_descs(txq, 64);
2531 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
2532 				break;	/* out of descriptors */
2533 		}
2534 
2535 		next_cidx = cidx + 1;
2536 		if (__predict_false(next_cidx == r->size))
2537 			next_cidx = 0;
2538 
2539 		wr = (void *)&eq->desc[eq->pidx];
2540 		if (sc->flags & IS_VF) {
2541 			total++;
2542 			remaining--;
2543 			ETHER_BPF_MTAP(ifp, m0);
2544 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
2545 			    available);
2546 		} else if (remaining > 1 &&
2547 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
2548 
2549 			/* pkts at cidx, next_cidx should both be in txp. */
2550 			MPASS(txp.npkt == 2);
2551 			tail = r->items[next_cidx];
2552 			MPASS(tail->m_nextpkt == NULL);
2553 			ETHER_BPF_MTAP(ifp, m0);
2554 			ETHER_BPF_MTAP(ifp, tail);
2555 			m0->m_nextpkt = tail;
2556 
2557 			if (__predict_false(++next_cidx == r->size))
2558 				next_cidx = 0;
2559 
2560 			while (next_cidx != pidx) {
2561 				if (add_to_txpkts(r->items[next_cidx], &txp,
2562 				    available) != 0)
2563 					break;
2564 				tail->m_nextpkt = r->items[next_cidx];
2565 				tail = tail->m_nextpkt;
2566 				ETHER_BPF_MTAP(ifp, tail);
2567 				if (__predict_false(++next_cidx == r->size))
2568 					next_cidx = 0;
2569 			}
2570 
2571 			n = write_txpkts_wr(txq, wr, m0, &txp, available);
2572 			total += txp.npkt;
2573 			remaining -= txp.npkt;
2574 		} else {
2575 			total++;
2576 			remaining--;
2577 			ETHER_BPF_MTAP(ifp, m0);
2578 			n = write_txpkt_wr(txq, (void *)wr, m0, available);
2579 		}
2580 		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
2581 
2582 		available -= n;
2583 		dbdiff += n;
2584 		IDXINCR(eq->pidx, n, eq->sidx);
2585 
2586 		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
2587 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2588 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2589 			    F_FW_WR_EQUEQ);
2590 			eq->equeqidx = eq->pidx;
2591 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
2592 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2593 			eq->equeqidx = eq->pidx;
2594 		}
2595 
2596 		if (dbdiff >= 16 && remaining >= 4) {
2597 			ring_eq_db(sc, eq, dbdiff);
2598 			available += reclaim_tx_descs(txq, 4 * dbdiff);
2599 			dbdiff = 0;
2600 		}
2601 
2602 		cidx = next_cidx;
2603 	}
2604 	if (dbdiff != 0) {
2605 		ring_eq_db(sc, eq, dbdiff);
2606 		reclaim_tx_descs(txq, 32);
2607 	}
2608 done:
2609 	TXQ_UNLOCK(txq);
2610 
2611 	return (total);
2612 }
2613 
2614 static inline void
2615 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
2616     int qsize)
2617 {
2618 
2619 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
2620 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
2621 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
2622 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
2623 
2624 	iq->flags = 0;
2625 	iq->adapter = sc;
2626 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
2627 	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
2628 	if (pktc_idx >= 0) {
2629 		iq->intr_params |= F_QINTR_CNT_EN;
2630 		iq->intr_pktc_idx = pktc_idx;
2631 	}
2632 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
2633 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
2634 }
2635 
2636 static inline void
2637 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
2638 {
2639 
2640 	fl->qsize = qsize;
2641 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2642 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
2643 	if (sc->flags & BUF_PACKING_OK &&
2644 	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
2645 	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
2646 		fl->flags |= FL_BUF_PACKING;
2647 	find_best_refill_source(sc, fl, maxp);
2648 	find_safe_refill_source(sc, fl);
2649 }
2650 
2651 static inline void
2652 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
2653     uint8_t tx_chan, uint16_t iqid, char *name)
2654 {
2655 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
2656 
2657 	eq->flags = eqtype & EQ_TYPEMASK;
2658 	eq->tx_chan = tx_chan;
2659 	eq->iqid = iqid;
2660 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2661 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
2662 }
2663 
2664 static int
2665 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
2666     bus_dmamap_t *map, bus_addr_t *pa, void **va)
2667 {
2668 	int rc;
2669 
2670 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
2671 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
2672 	if (rc != 0) {
2673 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
2674 		goto done;
2675 	}
2676 
2677 	rc = bus_dmamem_alloc(*tag, va,
2678 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
2679 	if (rc != 0) {
2680 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
2681 		goto done;
2682 	}
2683 
2684 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
2685 	if (rc != 0) {
2686 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
2687 		goto done;
2688 	}
2689 done:
2690 	if (rc)
2691 		free_ring(sc, *tag, *map, *pa, *va);
2692 
2693 	return (rc);
2694 }
2695 
2696 static int
2697 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
2698     bus_addr_t pa, void *va)
2699 {
2700 	if (pa)
2701 		bus_dmamap_unload(tag, map);
2702 	if (va)
2703 		bus_dmamem_free(tag, va, map);
2704 	if (tag)
2705 		bus_dma_tag_destroy(tag);
2706 
2707 	return (0);
2708 }
2709 
2710 /*
2711  * Allocates the ring for an ingress queue and an optional freelist.  If the
2712  * freelist is specified it will be allocated and then associated with the
2713  * ingress queue.
2714  *
2715  * Returns errno on failure.  Resources allocated up to that point may still be
2716  * allocated.  Caller is responsible for cleanup in case this function fails.
2717  *
2718  * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
2719  * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
2720  * the abs_id of the ingress queue to which its interrupts should be forwarded.
2721  */
2722 static int
2723 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
2724     int intr_idx, int cong)
2725 {
2726 	int rc, i, cntxt_id;
2727 	size_t len;
2728 	struct fw_iq_cmd c;
2729 	struct port_info *pi = vi->pi;
2730 	struct adapter *sc = iq->adapter;
2731 	struct sge_params *sp = &sc->params.sge;
2732 	__be32 v = 0;
2733 
2734 	len = iq->qsize * IQ_ESIZE;
2735 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
2736 	    (void **)&iq->desc);
2737 	if (rc != 0)
2738 		return (rc);
2739 
2740 	bzero(&c, sizeof(c));
2741 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
2742 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
2743 	    V_FW_IQ_CMD_VFN(0));
2744 
2745 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
2746 	    FW_LEN16(c));
2747 
2748 	/* Special handling for firmware event queue */
2749 	if (iq == &sc->sge.fwq)
2750 		v |= F_FW_IQ_CMD_IQASYNCH;
2751 
2752 	if (iq->flags & IQ_INTR) {
2753 		KASSERT(intr_idx < sc->intr_count,
2754 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
2755 	} else
2756 		v |= F_FW_IQ_CMD_IQANDST;
2757 	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
2758 
2759 	c.type_to_iqandstindex = htobe32(v |
2760 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
2761 	    V_FW_IQ_CMD_VIID(vi->viid) |
2762 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
2763 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
2764 	    F_FW_IQ_CMD_IQGTSMODE |
2765 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
2766 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
2767 	c.iqsize = htobe16(iq->qsize);
2768 	c.iqaddr = htobe64(iq->ba);
2769 	if (cong >= 0)
2770 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
2771 
2772 	if (fl) {
2773 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
2774 
2775 		len = fl->qsize * EQ_ESIZE;
2776 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
2777 		    &fl->ba, (void **)&fl->desc);
2778 		if (rc)
2779 			return (rc);
2780 
2781 		/* Allocate space for one software descriptor per buffer. */
2782 		rc = alloc_fl_sdesc(fl);
2783 		if (rc != 0) {
2784 			device_printf(sc->dev,
2785 			    "failed to setup fl software descriptors: %d\n",
2786 			    rc);
2787 			return (rc);
2788 		}
2789 
2790 		if (fl->flags & FL_BUF_PACKING) {
2791 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
2792 			fl->buf_boundary = sp->pack_boundary;
2793 		} else {
2794 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
2795 			fl->buf_boundary = 16;
2796 		}
2797 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
2798 			fl->buf_boundary = sp->pad_boundary;
2799 
2800 		c.iqns_to_fl0congen |=
2801 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
2802 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
2803 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
2804 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
2805 			    0));
2806 		if (cong >= 0) {
2807 			c.iqns_to_fl0congen |=
2808 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
2809 				    F_FW_IQ_CMD_FL0CONGCIF |
2810 				    F_FW_IQ_CMD_FL0CONGEN);
2811 		}
2812 		c.fl0dcaen_to_fl0cidxfthresh =
2813 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
2814 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
2815 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
2816 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
2817 		c.fl0size = htobe16(fl->qsize);
2818 		c.fl0addr = htobe64(fl->ba);
2819 	}
2820 
2821 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2822 	if (rc != 0) {
2823 		device_printf(sc->dev,
2824 		    "failed to create ingress queue: %d\n", rc);
2825 		return (rc);
2826 	}
2827 
2828 	iq->cidx = 0;
2829 	iq->gen = F_RSPD_GEN;
2830 	iq->intr_next = iq->intr_params;
2831 	iq->cntxt_id = be16toh(c.iqid);
2832 	iq->abs_id = be16toh(c.physiqid);
2833 	iq->flags |= IQ_ALLOCATED;
2834 
2835 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
2836 	if (cntxt_id >= sc->sge.niq) {
2837 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
2838 		    cntxt_id, sc->sge.niq - 1);
2839 	}
2840 	sc->sge.iqmap[cntxt_id] = iq;
2841 
2842 	if (fl) {
2843 		u_int qid;
2844 
2845 		iq->flags |= IQ_HAS_FL;
2846 		fl->cntxt_id = be16toh(c.fl0id);
2847 		fl->pidx = fl->cidx = 0;
2848 
2849 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
2850 		if (cntxt_id >= sc->sge.neq) {
2851 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
2852 			    __func__, cntxt_id, sc->sge.neq - 1);
2853 		}
2854 		sc->sge.eqmap[cntxt_id] = (void *)fl;
2855 
2856 		qid = fl->cntxt_id;
2857 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
2858 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
2859 			uint32_t mask = (1 << s_qpp) - 1;
2860 			volatile uint8_t *udb;
2861 
2862 			udb = sc->udbs_base + UDBS_DB_OFFSET;
2863 			udb += (qid >> s_qpp) << PAGE_SHIFT;
2864 			qid &= mask;
2865 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
2866 				udb += qid << UDBS_SEG_SHIFT;
2867 				qid = 0;
2868 			}
2869 			fl->udb = (volatile void *)udb;
2870 		}
2871 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
2872 
2873 		FL_LOCK(fl);
2874 		/* Enough to make sure the SGE doesn't think it's starved */
2875 		refill_fl(sc, fl, fl->lowat);
2876 		FL_UNLOCK(fl);
2877 	}
2878 
2879 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
2880 		uint32_t param, val;
2881 
2882 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
2883 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
2884 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
2885 		if (cong == 0)
2886 			val = 1 << 19;
2887 		else {
2888 			val = 2 << 19;
2889 			for (i = 0; i < 4; i++) {
2890 				if (cong & (1 << i))
2891 					val |= 1 << (i << 2);
2892 			}
2893 		}
2894 
2895 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
2896 		if (rc != 0) {
2897 			/* report error but carry on */
2898 			device_printf(sc->dev,
2899 			    "failed to set congestion manager context for "
2900 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
2901 		}
2902 	}
2903 
2904 	/* Enable IQ interrupts */
2905 	atomic_store_rel_int(&iq->state, IQS_IDLE);
2906 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
2907 	    V_INGRESSQID(iq->cntxt_id));
2908 
2909 	return (0);
2910 }
2911 
2912 static int
2913 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
2914 {
2915 	int rc;
2916 	struct adapter *sc = iq->adapter;
2917 	device_t dev;
2918 
2919 	if (sc == NULL)
2920 		return (0);	/* nothing to do */
2921 
2922 	dev = vi ? vi->dev : sc->dev;
2923 
2924 	if (iq->flags & IQ_ALLOCATED) {
2925 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
2926 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
2927 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
2928 		if (rc != 0) {
2929 			device_printf(dev,
2930 			    "failed to free queue %p: %d\n", iq, rc);
2931 			return (rc);
2932 		}
2933 		iq->flags &= ~IQ_ALLOCATED;
2934 	}
2935 
2936 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
2937 
2938 	bzero(iq, sizeof(*iq));
2939 
2940 	if (fl) {
2941 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
2942 		    fl->desc);
2943 
2944 		if (fl->sdesc)
2945 			free_fl_sdesc(sc, fl);
2946 
2947 		if (mtx_initialized(&fl->fl_lock))
2948 			mtx_destroy(&fl->fl_lock);
2949 
2950 		bzero(fl, sizeof(*fl));
2951 	}
2952 
2953 	return (0);
2954 }
2955 
2956 static void
2957 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
2958     struct sysctl_oid *oid, struct sge_fl *fl)
2959 {
2960 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2961 
2962 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
2963 	    "freelist");
2964 	children = SYSCTL_CHILDREN(oid);
2965 
2966 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
2967 	    &fl->ba, "bus address of descriptor ring");
2968 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
2969 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
2970 	    "desc ring size in bytes");
2971 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2972 	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
2973 	    "SGE context id of the freelist");
2974 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
2975 	    fl_pad ? 1 : 0, "padding enabled");
2976 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
2977 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
2978 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
2979 	    0, "consumer index");
2980 	if (fl->flags & FL_BUF_PACKING) {
2981 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
2982 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
2983 	}
2984 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
2985 	    0, "producer index");
2986 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
2987 	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
2988 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
2989 	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
2990 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
2991 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
2992 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
2993 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
2994 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
2995 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
2996 }
2997 
2998 static int
2999 alloc_fwq(struct adapter *sc)
3000 {
3001 	int rc, intr_idx;
3002 	struct sge_iq *fwq = &sc->sge.fwq;
3003 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
3004 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3005 
3006 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
3007 	fwq->flags |= IQ_INTR;	/* always */
3008 	if (sc->flags & IS_VF)
3009 		intr_idx = 0;
3010 	else {
3011 		intr_idx = sc->intr_count > 1 ? 1 : 0;
3012 		fwq->set_tcb_rpl = t4_filter_rpl;
3013 		fwq->l2t_write_rpl = do_l2t_write_rpl;
3014 	}
3015 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
3016 	if (rc != 0) {
3017 		device_printf(sc->dev,
3018 		    "failed to create firmware event queue: %d\n", rc);
3019 		return (rc);
3020 	}
3021 
3022 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
3023 	    NULL, "firmware event queue");
3024 	children = SYSCTL_CHILDREN(oid);
3025 
3026 	SYSCTL_ADD_UAUTO(&sc->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3027 	    &fwq->ba, "bus address of descriptor ring");
3028 	SYSCTL_ADD_INT(&sc->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3029 	    fwq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
3030 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
3031 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
3032 	    "absolute id of the queue");
3033 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
3034 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
3035 	    "SGE context id of the queue");
3036 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
3037 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
3038 	    "consumer index");
3039 
3040 	return (0);
3041 }
3042 
3043 static int
3044 free_fwq(struct adapter *sc)
3045 {
3046 	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
3047 }
3048 
3049 static int
3050 alloc_mgmtq(struct adapter *sc)
3051 {
3052 	int rc;
3053 	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
3054 	char name[16];
3055 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
3056 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3057 
3058 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
3059 	    NULL, "management queue");
3060 
3061 	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
3062 	init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
3063 	    sc->sge.fwq.cntxt_id, name);
3064 	rc = alloc_wrq(sc, NULL, mgmtq, oid);
3065 	if (rc != 0) {
3066 		device_printf(sc->dev,
3067 		    "failed to create management queue: %d\n", rc);
3068 		return (rc);
3069 	}
3070 
3071 	return (0);
3072 }
3073 
3074 static int
3075 free_mgmtq(struct adapter *sc)
3076 {
3077 
3078 	return free_wrq(sc, &sc->sge.mgmtq);
3079 }
3080 
3081 int
3082 tnl_cong(struct port_info *pi, int drop)
3083 {
3084 
3085 	if (drop == -1)
3086 		return (-1);
3087 	else if (drop == 1)
3088 		return (0);
3089 	else
3090 		return (pi->rx_e_chan_map);
3091 }
3092 
3093 static int
3094 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
3095     struct sysctl_oid *oid)
3096 {
3097 	int rc;
3098 	struct adapter *sc = vi->pi->adapter;
3099 	struct sysctl_oid_list *children;
3100 	char name[16];
3101 
3102 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
3103 	    tnl_cong(vi->pi, cong_drop));
3104 	if (rc != 0)
3105 		return (rc);
3106 
3107 	if (idx == 0)
3108 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
3109 	else
3110 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
3111 		    ("iq_base mismatch"));
3112 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
3113 	    ("PF with non-zero iq_base"));
3114 
3115 	/*
3116 	 * The freelist is just barely above the starvation threshold right now,
3117 	 * fill it up a bit more.
3118 	 */
3119 	FL_LOCK(&rxq->fl);
3120 	refill_fl(sc, &rxq->fl, 128);
3121 	FL_UNLOCK(&rxq->fl);
3122 
3123 #if defined(INET) || defined(INET6)
3124 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
3125 	if (rc != 0)
3126 		return (rc);
3127 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
3128 
3129 	if (vi->ifp->if_capenable & IFCAP_LRO)
3130 		rxq->iq.flags |= IQ_LRO_ENABLED;
3131 #endif
3132 	rxq->ifp = vi->ifp;
3133 
3134 	children = SYSCTL_CHILDREN(oid);
3135 
3136 	snprintf(name, sizeof(name), "%d", idx);
3137 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3138 	    NULL, "rx queue");
3139 	children = SYSCTL_CHILDREN(oid);
3140 
3141 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3142 	    &rxq->iq.ba, "bus address of descriptor ring");
3143 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3144 	    rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes");
3145 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
3146 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
3147 	    "absolute id of the queue");
3148 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
3149 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
3150 	    "SGE context id of the queue");
3151 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3152 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
3153 	    "consumer index");
3154 #if defined(INET) || defined(INET6)
3155 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
3156 	    &rxq->lro.lro_queued, 0, NULL);
3157 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
3158 	    &rxq->lro.lro_flushed, 0, NULL);
3159 #endif
3160 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
3161 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
3162 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
3163 	    CTLFLAG_RD, &rxq->vlan_extraction,
3164 	    "# of times hardware extracted 802.1Q tag");
3165 
3166 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
3167 
3168 	return (rc);
3169 }
3170 
3171 static int
3172 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
3173 {
3174 	int rc;
3175 
3176 #if defined(INET) || defined(INET6)
3177 	if (rxq->lro.ifp) {
3178 		tcp_lro_free(&rxq->lro);
3179 		rxq->lro.ifp = NULL;
3180 	}
3181 #endif
3182 
3183 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
3184 	if (rc == 0)
3185 		bzero(rxq, sizeof(*rxq));
3186 
3187 	return (rc);
3188 }
3189 
3190 #ifdef TCP_OFFLOAD
3191 static int
3192 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
3193     int intr_idx, int idx, struct sysctl_oid *oid)
3194 {
3195 	struct port_info *pi = vi->pi;
3196 	int rc;
3197 	struct sysctl_oid_list *children;
3198 	char name[16];
3199 
3200 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
3201 	if (rc != 0)
3202 		return (rc);
3203 
3204 	children = SYSCTL_CHILDREN(oid);
3205 
3206 	snprintf(name, sizeof(name), "%d", idx);
3207 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3208 	    NULL, "rx queue");
3209 	children = SYSCTL_CHILDREN(oid);
3210 
3211 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3212 	    &ofld_rxq->iq.ba, "bus address of descriptor ring");
3213 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3214 	    ofld_rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes");
3215 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
3216 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
3217 	    "I", "absolute id of the queue");
3218 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
3219 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
3220 	    "I", "SGE context id of the queue");
3221 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3222 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
3223 	    "consumer index");
3224 
3225 	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
3226 
3227 	return (rc);
3228 }
3229 
3230 static int
3231 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
3232 {
3233 	int rc;
3234 
3235 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
3236 	if (rc == 0)
3237 		bzero(ofld_rxq, sizeof(*ofld_rxq));
3238 
3239 	return (rc);
3240 }
3241 #endif
3242 
3243 #ifdef DEV_NETMAP
3244 static int
3245 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
3246     int idx, struct sysctl_oid *oid)
3247 {
3248 	int rc;
3249 	struct sysctl_oid_list *children;
3250 	struct sysctl_ctx_list *ctx;
3251 	char name[16];
3252 	size_t len;
3253 	struct adapter *sc = vi->pi->adapter;
3254 	struct netmap_adapter *na = NA(vi->ifp);
3255 
3256 	MPASS(na != NULL);
3257 
3258 	len = vi->qsize_rxq * IQ_ESIZE;
3259 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
3260 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
3261 	if (rc != 0)
3262 		return (rc);
3263 
3264 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3265 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
3266 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
3267 	if (rc != 0)
3268 		return (rc);
3269 
3270 	nm_rxq->vi = vi;
3271 	nm_rxq->nid = idx;
3272 	nm_rxq->iq_cidx = 0;
3273 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
3274 	nm_rxq->iq_gen = F_RSPD_GEN;
3275 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
3276 	nm_rxq->fl_sidx = na->num_rx_desc;
3277 	nm_rxq->intr_idx = intr_idx;
3278 	nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID;
3279 
3280 	ctx = &vi->ctx;
3281 	children = SYSCTL_CHILDREN(oid);
3282 
3283 	snprintf(name, sizeof(name), "%d", idx);
3284 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
3285 	    "rx queue");
3286 	children = SYSCTL_CHILDREN(oid);
3287 
3288 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
3289 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
3290 	    "I", "absolute id of the queue");
3291 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3292 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
3293 	    "I", "SGE context id of the queue");
3294 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3295 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
3296 	    "consumer index");
3297 
3298 	children = SYSCTL_CHILDREN(oid);
3299 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
3300 	    "freelist");
3301 	children = SYSCTL_CHILDREN(oid);
3302 
3303 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3304 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
3305 	    "I", "SGE context id of the freelist");
3306 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
3307 	    &nm_rxq->fl_cidx, 0, "consumer index");
3308 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
3309 	    &nm_rxq->fl_pidx, 0, "producer index");
3310 
3311 	return (rc);
3312 }
3313 
3314 
3315 static int
3316 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
3317 {
3318 	struct adapter *sc = vi->pi->adapter;
3319 
3320 	if (vi->flags & VI_INIT_DONE)
3321 		MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID);
3322 	else
3323 		MPASS(nm_rxq->iq_cntxt_id == 0);
3324 
3325 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
3326 	    nm_rxq->iq_desc);
3327 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
3328 	    nm_rxq->fl_desc);
3329 
3330 	return (0);
3331 }
3332 
3333 static int
3334 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
3335     struct sysctl_oid *oid)
3336 {
3337 	int rc;
3338 	size_t len;
3339 	struct port_info *pi = vi->pi;
3340 	struct adapter *sc = pi->adapter;
3341 	struct netmap_adapter *na = NA(vi->ifp);
3342 	char name[16];
3343 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3344 
3345 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3346 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
3347 	    &nm_txq->ba, (void **)&nm_txq->desc);
3348 	if (rc)
3349 		return (rc);
3350 
3351 	nm_txq->pidx = nm_txq->cidx = 0;
3352 	nm_txq->sidx = na->num_tx_desc;
3353 	nm_txq->nid = idx;
3354 	nm_txq->iqidx = iqidx;
3355 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3356 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
3357 	    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
3358 	    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
3359 	nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID;
3360 
3361 	snprintf(name, sizeof(name), "%d", idx);
3362 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3363 	    NULL, "netmap tx queue");
3364 	children = SYSCTL_CHILDREN(oid);
3365 
3366 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3367 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
3368 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3369 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
3370 	    "consumer index");
3371 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3372 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
3373 	    "producer index");
3374 
3375 	return (rc);
3376 }
3377 
3378 static int
3379 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
3380 {
3381 	struct adapter *sc = vi->pi->adapter;
3382 
3383 	if (vi->flags & VI_INIT_DONE)
3384 		MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID);
3385 	else
3386 		MPASS(nm_txq->cntxt_id == 0);
3387 
3388 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
3389 	    nm_txq->desc);
3390 
3391 	return (0);
3392 }
3393 #endif
3394 
3395 static int
3396 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
3397 {
3398 	int rc, cntxt_id;
3399 	struct fw_eq_ctrl_cmd c;
3400 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3401 
3402 	bzero(&c, sizeof(c));
3403 
3404 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
3405 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
3406 	    V_FW_EQ_CTRL_CMD_VFN(0));
3407 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
3408 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
3409 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
3410 	c.physeqid_pkd = htobe32(0);
3411 	c.fetchszm_to_iqid =
3412 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
3413 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
3414 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
3415 	c.dcaen_to_eqsize =
3416 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3417 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3418 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
3419 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
3420 	c.eqaddr = htobe64(eq->ba);
3421 
3422 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3423 	if (rc != 0) {
3424 		device_printf(sc->dev,
3425 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
3426 		return (rc);
3427 	}
3428 	eq->flags |= EQ_ALLOCATED;
3429 
3430 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
3431 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3432 	if (cntxt_id >= sc->sge.neq)
3433 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3434 		cntxt_id, sc->sge.neq - 1);
3435 	sc->sge.eqmap[cntxt_id] = eq;
3436 
3437 	return (rc);
3438 }
3439 
3440 static int
3441 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3442 {
3443 	int rc, cntxt_id;
3444 	struct fw_eq_eth_cmd c;
3445 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3446 
3447 	bzero(&c, sizeof(c));
3448 
3449 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
3450 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
3451 	    V_FW_EQ_ETH_CMD_VFN(0));
3452 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
3453 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
3454 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
3455 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
3456 	c.fetchszm_to_iqid =
3457 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3458 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
3459 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
3460 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3461 	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3462 	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
3463 	c.eqaddr = htobe64(eq->ba);
3464 
3465 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3466 	if (rc != 0) {
3467 		device_printf(vi->dev,
3468 		    "failed to create Ethernet egress queue: %d\n", rc);
3469 		return (rc);
3470 	}
3471 	eq->flags |= EQ_ALLOCATED;
3472 
3473 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
3474 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
3475 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3476 	if (cntxt_id >= sc->sge.neq)
3477 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3478 		cntxt_id, sc->sge.neq - 1);
3479 	sc->sge.eqmap[cntxt_id] = eq;
3480 
3481 	return (rc);
3482 }
3483 
3484 #ifdef TCP_OFFLOAD
3485 static int
3486 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3487 {
3488 	int rc, cntxt_id;
3489 	struct fw_eq_ofld_cmd c;
3490 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3491 
3492 	bzero(&c, sizeof(c));
3493 
3494 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
3495 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
3496 	    V_FW_EQ_OFLD_CMD_VFN(0));
3497 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
3498 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
3499 	c.fetchszm_to_iqid =
3500 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3501 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
3502 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
3503 	c.dcaen_to_eqsize =
3504 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3505 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3506 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
3507 	c.eqaddr = htobe64(eq->ba);
3508 
3509 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3510 	if (rc != 0) {
3511 		device_printf(vi->dev,
3512 		    "failed to create egress queue for TCP offload: %d\n", rc);
3513 		return (rc);
3514 	}
3515 	eq->flags |= EQ_ALLOCATED;
3516 
3517 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
3518 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3519 	if (cntxt_id >= sc->sge.neq)
3520 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3521 		cntxt_id, sc->sge.neq - 1);
3522 	sc->sge.eqmap[cntxt_id] = eq;
3523 
3524 	return (rc);
3525 }
3526 #endif
3527 
3528 static int
3529 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3530 {
3531 	int rc, qsize;
3532 	size_t len;
3533 
3534 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
3535 
3536 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3537 	len = qsize * EQ_ESIZE;
3538 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
3539 	    &eq->ba, (void **)&eq->desc);
3540 	if (rc)
3541 		return (rc);
3542 
3543 	eq->pidx = eq->cidx = 0;
3544 	eq->equeqidx = eq->dbidx = 0;
3545 	eq->doorbells = sc->doorbells;
3546 
3547 	switch (eq->flags & EQ_TYPEMASK) {
3548 	case EQ_CTRL:
3549 		rc = ctrl_eq_alloc(sc, eq);
3550 		break;
3551 
3552 	case EQ_ETH:
3553 		rc = eth_eq_alloc(sc, vi, eq);
3554 		break;
3555 
3556 #ifdef TCP_OFFLOAD
3557 	case EQ_OFLD:
3558 		rc = ofld_eq_alloc(sc, vi, eq);
3559 		break;
3560 #endif
3561 
3562 	default:
3563 		panic("%s: invalid eq type %d.", __func__,
3564 		    eq->flags & EQ_TYPEMASK);
3565 	}
3566 	if (rc != 0) {
3567 		device_printf(sc->dev,
3568 		    "failed to allocate egress queue(%d): %d\n",
3569 		    eq->flags & EQ_TYPEMASK, rc);
3570 	}
3571 
3572 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
3573 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
3574 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
3575 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
3576 		uint32_t mask = (1 << s_qpp) - 1;
3577 		volatile uint8_t *udb;
3578 
3579 		udb = sc->udbs_base + UDBS_DB_OFFSET;
3580 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
3581 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
3582 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
3583 	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
3584 		else {
3585 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
3586 			eq->udb_qid = 0;
3587 		}
3588 		eq->udb = (volatile void *)udb;
3589 	}
3590 
3591 	return (rc);
3592 }
3593 
3594 static int
3595 free_eq(struct adapter *sc, struct sge_eq *eq)
3596 {
3597 	int rc;
3598 
3599 	if (eq->flags & EQ_ALLOCATED) {
3600 		switch (eq->flags & EQ_TYPEMASK) {
3601 		case EQ_CTRL:
3602 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
3603 			    eq->cntxt_id);
3604 			break;
3605 
3606 		case EQ_ETH:
3607 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
3608 			    eq->cntxt_id);
3609 			break;
3610 
3611 #ifdef TCP_OFFLOAD
3612 		case EQ_OFLD:
3613 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
3614 			    eq->cntxt_id);
3615 			break;
3616 #endif
3617 
3618 		default:
3619 			panic("%s: invalid eq type %d.", __func__,
3620 			    eq->flags & EQ_TYPEMASK);
3621 		}
3622 		if (rc != 0) {
3623 			device_printf(sc->dev,
3624 			    "failed to free egress queue (%d): %d\n",
3625 			    eq->flags & EQ_TYPEMASK, rc);
3626 			return (rc);
3627 		}
3628 		eq->flags &= ~EQ_ALLOCATED;
3629 	}
3630 
3631 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
3632 
3633 	if (mtx_initialized(&eq->eq_lock))
3634 		mtx_destroy(&eq->eq_lock);
3635 
3636 	bzero(eq, sizeof(*eq));
3637 	return (0);
3638 }
3639 
3640 static int
3641 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
3642     struct sysctl_oid *oid)
3643 {
3644 	int rc;
3645 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
3646 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3647 
3648 	rc = alloc_eq(sc, vi, &wrq->eq);
3649 	if (rc)
3650 		return (rc);
3651 
3652 	wrq->adapter = sc;
3653 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
3654 	TAILQ_INIT(&wrq->incomplete_wrs);
3655 	STAILQ_INIT(&wrq->wr_list);
3656 	wrq->nwr_pending = 0;
3657 	wrq->ndesc_needed = 0;
3658 
3659 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3660 	    &wrq->eq.ba, "bus address of descriptor ring");
3661 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3662 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
3663 	    "desc ring size in bytes");
3664 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3665 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
3666 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3667 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
3668 	    "consumer index");
3669 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
3670 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
3671 	    "producer index");
3672 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
3673 	    wrq->eq.sidx, "status page index");
3674 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
3675 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
3676 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
3677 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
3678 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
3679 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
3680 
3681 	return (rc);
3682 }
3683 
3684 static int
3685 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
3686 {
3687 	int rc;
3688 
3689 	rc = free_eq(sc, &wrq->eq);
3690 	if (rc)
3691 		return (rc);
3692 
3693 	bzero(wrq, sizeof(*wrq));
3694 	return (0);
3695 }
3696 
3697 static int
3698 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
3699     struct sysctl_oid *oid)
3700 {
3701 	int rc;
3702 	struct port_info *pi = vi->pi;
3703 	struct adapter *sc = pi->adapter;
3704 	struct sge_eq *eq = &txq->eq;
3705 	char name[16];
3706 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3707 
3708 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
3709 	    M_CXGBE, M_WAITOK);
3710 	if (rc != 0) {
3711 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
3712 		return (rc);
3713 	}
3714 
3715 	rc = alloc_eq(sc, vi, eq);
3716 	if (rc != 0) {
3717 		mp_ring_free(txq->r);
3718 		txq->r = NULL;
3719 		return (rc);
3720 	}
3721 
3722 	/* Can't fail after this point. */
3723 
3724 	if (idx == 0)
3725 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
3726 	else
3727 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
3728 		    ("eq_base mismatch"));
3729 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
3730 	    ("PF with non-zero eq_base"));
3731 
3732 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
3733 	txq->ifp = vi->ifp;
3734 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
3735 	if (sc->flags & IS_VF)
3736 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
3737 		    V_TXPKT_INTF(pi->tx_chan));
3738 	else
3739 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3740 		    V_TXPKT_INTF(pi->tx_chan) |
3741 		    V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
3742 		    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
3743 		    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
3744 	txq->tc_idx = -1;
3745 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
3746 	    M_ZERO | M_WAITOK);
3747 
3748 	snprintf(name, sizeof(name), "%d", idx);
3749 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3750 	    NULL, "tx queue");
3751 	children = SYSCTL_CHILDREN(oid);
3752 
3753 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3754 	    &eq->ba, "bus address of descriptor ring");
3755 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3756 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
3757 	    "desc ring size in bytes");
3758 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
3759 	    &eq->abs_id, 0, "absolute id of the queue");
3760 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3761 	    &eq->cntxt_id, 0, "SGE context id of the queue");
3762 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3763 	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
3764 	    "consumer index");
3765 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3766 	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
3767 	    "producer index");
3768 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
3769 	    eq->sidx, "status page index");
3770 
3771 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
3772 	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
3773 	    "traffic class (-1 means none)");
3774 
3775 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
3776 	    &txq->txcsum, "# of times hardware assisted with checksum");
3777 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
3778 	    CTLFLAG_RD, &txq->vlan_insertion,
3779 	    "# of times hardware inserted 802.1Q tag");
3780 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
3781 	    &txq->tso_wrs, "# of TSO work requests");
3782 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
3783 	    &txq->imm_wrs, "# of work requests with immediate data");
3784 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
3785 	    &txq->sgl_wrs, "# of work requests with direct SGL");
3786 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
3787 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
3788 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
3789 	    CTLFLAG_RD, &txq->txpkts0_wrs,
3790 	    "# of txpkts (type 0) work requests");
3791 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
3792 	    CTLFLAG_RD, &txq->txpkts1_wrs,
3793 	    "# of txpkts (type 1) work requests");
3794 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
3795 	    CTLFLAG_RD, &txq->txpkts0_pkts,
3796 	    "# of frames tx'd using type0 txpkts work requests");
3797 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
3798 	    CTLFLAG_RD, &txq->txpkts1_pkts,
3799 	    "# of frames tx'd using type1 txpkts work requests");
3800 
3801 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
3802 	    CTLFLAG_RD, &txq->r->enqueues,
3803 	    "# of enqueues to the mp_ring for this queue");
3804 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
3805 	    CTLFLAG_RD, &txq->r->drops,
3806 	    "# of drops in the mp_ring for this queue");
3807 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
3808 	    CTLFLAG_RD, &txq->r->starts,
3809 	    "# of normal consumer starts in the mp_ring for this queue");
3810 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
3811 	    CTLFLAG_RD, &txq->r->stalls,
3812 	    "# of consumer stalls in the mp_ring for this queue");
3813 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
3814 	    CTLFLAG_RD, &txq->r->restarts,
3815 	    "# of consumer restarts in the mp_ring for this queue");
3816 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
3817 	    CTLFLAG_RD, &txq->r->abdications,
3818 	    "# of consumer abdications in the mp_ring for this queue");
3819 
3820 	return (0);
3821 }
3822 
3823 static int
3824 free_txq(struct vi_info *vi, struct sge_txq *txq)
3825 {
3826 	int rc;
3827 	struct adapter *sc = vi->pi->adapter;
3828 	struct sge_eq *eq = &txq->eq;
3829 
3830 	rc = free_eq(sc, eq);
3831 	if (rc)
3832 		return (rc);
3833 
3834 	sglist_free(txq->gl);
3835 	free(txq->sdesc, M_CXGBE);
3836 	mp_ring_free(txq->r);
3837 
3838 	bzero(txq, sizeof(*txq));
3839 	return (0);
3840 }
3841 
3842 static void
3843 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
3844 {
3845 	bus_addr_t *ba = arg;
3846 
3847 	KASSERT(nseg == 1,
3848 	    ("%s meant for single segment mappings only.", __func__));
3849 
3850 	*ba = error ? 0 : segs->ds_addr;
3851 }
3852 
3853 static inline void
3854 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
3855 {
3856 	uint32_t n, v;
3857 
3858 	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
3859 	MPASS(n > 0);
3860 
3861 	wmb();
3862 	v = fl->dbval | V_PIDX(n);
3863 	if (fl->udb)
3864 		*fl->udb = htole32(v);
3865 	else
3866 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
3867 	IDXINCR(fl->dbidx, n, fl->sidx);
3868 }
3869 
3870 /*
3871  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
3872  * recycled do not count towards this allocation budget.
3873  *
 * fl->pidx counts individual buffers.  It is compared against the descriptor
 * indices (hw_cidx, dbidx, sidx) in units of 8 buffers, and the doorbell is
 * only considered each time pidx reaches a multiple of 8.
 *
 * The caller must hold the freelist lock (asserted below).
 *
3874  * Returns non-zero to indicate that this freelist should be added to the list
3875  * of starving freelists.
3876  */
3877 static int
3878 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
3879 {
3880 	__be64 *d;
3881 	struct fl_sdesc *sd;
3882 	uintptr_t pa;
3883 	caddr_t cl;
3884 	struct cluster_layout *cll;
3885 	struct sw_zone_info *swz;
3886 	struct cluster_metadata *clm;
3887 	uint16_t max_pidx;
3888 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
3889 
3890 	FL_LOCK_ASSERT_OWNED(fl);
3891 
3892 	/*
3893 	 * We always stop at the beginning of the hardware descriptor that's just
3894 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
3895 	 * which would mean an empty freelist to the chip.
3896 	 */
3897 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
	/* Already at the stopping point: nothing to add. */
3898 	if (fl->pidx == max_pidx * 8)
3899 		return (0);
3900 
	/* d and sd walk the hardware and software descriptors in step. */
3901 	d = &fl->desc[fl->pidx];
3902 	sd = &fl->sdesc[fl->pidx];
3903 	cll = &fl->cll_def;	/* default layout */
3904 	swz = &sc->sge.sw_zone_info[cll->zidx];
3905 
3906 	while (n > 0) {
3907 
		/* This slot still holds a cluster from an earlier fill. */
3908 		if (sd->cl != NULL) {
3909 
3910 			if (sd->nmbuf == 0) {
3911 				/*
3912 				 * Fast recycle without involving any atomics on
3913 				 * the cluster's metadata (if the cluster has
3914 				 * metadata).  This happens when all frames
3915 				 * received in the cluster were small enough to
3916 				 * fit within a single mbuf each.
3917 				 */
3918 				fl->cl_fast_recycled++;
3919 #ifdef INVARIANTS
3920 				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3921 				if (clm != NULL)
3922 					MPASS(clm->refcount == 1);
3923 #endif
3924 				goto recycled_fast;
3925 			}
3926 
3927 			/*
3928 			 * Cluster is guaranteed to have metadata.  Clusters
3929 			 * without metadata always take the fast recycle path
3930 			 * when they're recycled.
3931 			 */
3932 			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3933 			MPASS(clm != NULL);
3934 
			/*
			 * Drop this slot's reference.  If it was the last one
			 * the cluster is reused in place; otherwise the slot
			 * lets go of it and a new cluster is allocated below.
			 */
3935 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3936 				fl->cl_recycled++;
3937 				counter_u64_add(extfree_rels, 1);
3938 				goto recycled;
3939 			}
3940 			sd->cl = NULL;	/* gave up my reference */
3941 		}
3942 		MPASS(sd->cl == NULL);
3943 alloc:
		/* M_NOWAIT: the allocation may fail and is handled below. */
3944 		cl = uma_zalloc(swz->zone, M_NOWAIT);
3945 		if (__predict_false(cl == NULL)) {
			/*
			 * Stop if the alternate layout is already in use, if
			 * there is no alternate (zidx == -1), or if it uses
			 * the same zone as the default.
			 */
3946 			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
3947 			    fl->cll_def.zidx == fl->cll_alt.zidx)
3948 				break;
3949 
3950 			/* fall back to the safe zone */
3951 			cll = &fl->cll_alt;
3952 			swz = &sc->sge.sw_zone_info[cll->zidx];
3953 			goto alloc;
3954 		}
3955 		fl->cl_allocated++;
		/* Only fresh allocations are charged against the budget. */
3956 		n--;
3957 
		/*
		 * The hardware descriptor is the big-endian physical address
		 * of the cluster, offset by the layout's region1, OR'ed with
		 * the layout's hwidx.
		 */
3958 		pa = pmap_kextract((vm_offset_t)cl);
3959 		pa += cll->region1;
3960 		sd->cl = cl;
3961 		sd->cll = *cll;
3962 		*d = htobe64(pa | cll->hwidx);
3963 		clm = cl_metadata(sc, fl, cll, cl);
3964 		if (clm != NULL) {
			/*
			 * The slow recycle path jumps in here with clm already
			 * known to be non-NULL.
			 */
3965 recycled:
3966 #ifdef INVARIANTS
3967 			clm->sd = sd;
3968 #endif
3969 			clm->refcount = 1;
3970 		}
3971 		sd->nmbuf = 0;
3972 recycled_fast:
3973 		d++;
3974 		sd++;
		/* Reached a multiple of 8 buffers. */
3975 		if (__predict_false(++fl->pidx % 8 == 0)) {
3976 			uint16_t pidx = fl->pidx / 8;
3977 
			/* Wrap around at the end of the ring. */
3978 			if (__predict_false(pidx == fl->sidx)) {
3979 				fl->pidx = 0;
3980 				pidx = 0;
3981 				sd = fl->sdesc;
3982 				d = fl->desc;
3983 			}
3984 			if (pidx == max_pidx)
3985 				break;
3986 
			/* Ring once 4 or more descriptors are pending. */
3987 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
3988 				ring_fl_db(sc, fl);
3989 		}
3990 	}
3991 
	/* Tell the hardware about anything not yet announced. */
3992 	if (fl->pidx / 8 != fl->dbidx)
3993 		ring_fl_db(sc, fl);
3994 
3995 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
3996 }
3997 
3998 /*
3999  * Attempt to refill all starving freelists.
4000  */
4001 static void
4002 refill_sfl(void *arg)
4003 {
4004 	struct adapter *sc = arg;
4005 	struct sge_fl *fl, *fl_temp;
4006 
4007 	mtx_assert(&sc->sfl_lock, MA_OWNED);
4008 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
4009 		FL_LOCK(fl);
4010 		refill_fl(sc, fl, 64);
4011 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
4012 			TAILQ_REMOVE(&sc->sfl, fl, link);
4013 			fl->flags &= ~FL_STARVING;
4014 		}
4015 		FL_UNLOCK(fl);
4016 	}
4017 
4018 	if (!TAILQ_EMPTY(&sc->sfl))
4019 		callout_schedule(&sc->sfl_callout, hz / 5);
4020 }
4021 
4022 static int
4023 alloc_fl_sdesc(struct sge_fl *fl)
4024 {
4025 
4026 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
4027 	    M_ZERO | M_WAITOK);
4028 
4029 	return (0);
4030 }
4031 
4032 static void
4033 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
4034 {
4035 	struct fl_sdesc *sd;
4036 	struct cluster_metadata *clm;
4037 	struct cluster_layout *cll;
4038 	int i;
4039 
4040 	sd = fl->sdesc;
4041 	for (i = 0; i < fl->sidx * 8; i++, sd++) {
4042 		if (sd->cl == NULL)
4043 			continue;
4044 
4045 		cll = &sd->cll;
4046 		clm = cl_metadata(sc, fl, cll, sd->cl);
4047 		if (sd->nmbuf == 0)
4048 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
4049 		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
4050 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
4051 			counter_u64_add(extfree_rels, 1);
4052 		}
4053 		sd->cl = NULL;
4054 	}
4055 
4056 	free(fl->sdesc, M_CXGBE);
4057 	fl->sdesc = NULL;
4058 }
4059 
4060 static inline void
4061 get_pkt_gl(struct mbuf *m, struct sglist *gl)
4062 {
4063 	int rc;
4064 
4065 	M_ASSERTPKTHDR(m);
4066 
4067 	sglist_reset(gl);
4068 	rc = sglist_append_mbuf(gl, m);
4069 	if (__predict_false(rc != 0)) {
4070 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
4071 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
4072 	}
4073 
4074 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
4075 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
4076 	    mbuf_nsegs(m), gl->sg_nseg));
4077 	KASSERT(gl->sg_nseg > 0 &&
4078 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
4079 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
4080 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
4081 }
4082 
4083 /*
4084  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
4085  */
4086 static inline u_int
4087 txpkt_len16(u_int nsegs, u_int tso)
4088 {
4089 	u_int n;
4090 
4091 	MPASS(nsegs > 0);
4092 
4093 	nsegs--; /* first segment is part of ulptx_sgl */
4094 	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
4095 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
4096 	if (tso)
4097 		n += sizeof(struct cpl_tx_pkt_lso_core);
4098 
4099 	return (howmany(n, 16));
4100 }
4101 
4102 /*
4103  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
4104  * request header.
4105  */
4106 static inline u_int
4107 txpkt_vm_len16(u_int nsegs, u_int tso)
4108 {
4109 	u_int n;
4110 
4111 	MPASS(nsegs > 0);
4112 
4113 	nsegs--; /* first segment is part of ulptx_sgl */
4114 	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
4115 	    sizeof(struct cpl_tx_pkt_core) +
4116 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
4117 	if (tso)
4118 		n += sizeof(struct cpl_tx_pkt_lso_core);
4119 
4120 	return (howmany(n, 16));
4121 }
4122 
4123 /*
4124  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
4125  * request header.
4126  */
4127 static inline u_int
4128 txpkts0_len16(u_int nsegs)
4129 {
4130 	u_int n;
4131 
4132 	MPASS(nsegs > 0);
4133 
4134 	nsegs--; /* first segment is part of ulptx_sgl */
4135 	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
4136 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
4137 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
4138 
4139 	return (howmany(n, 16));
4140 }
4141 
4142 /*
4143  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
4144  * request header.
4145  */
4146 static inline u_int
4147 txpkts1_len16(void)
4148 {
4149 	u_int n;
4150 
4151 	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
4152 
4153 	return (howmany(n, 16));
4154 }
4155 
4156 static inline u_int
4157 imm_payload(u_int ndesc)
4158 {
4159 	u_int n;
4160 
4161 	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
4162 	    sizeof(struct cpl_tx_pkt_core);
4163 
4164 	return (n);
4165 }
4166 
4167 /*
 * Write a VM txpkt WR (FW_ETH_TX_PKT_VM_WR) for this packet to the hardware
 * descriptors and record the packet in the software descriptor.  It is
 * guaranteed that enough descriptors are available.
 *
 * sc:        adapter; consulted here only for chip_id() when encoding the
 *            Ethernet header length.
 * txq:       tx queue being written to; its lock must be held (asserted).
 * wr:        must point at eq->desc[eq->pidx] (asserted).
 * m0:        packet to transmit; must have a packet header (asserted).
 * available: # of free hardware descriptors, 0 < available < eq->sidx.
 *
 * The return value is the # of hardware descriptors used.
 *
 * NOTE(review): nothing in this function writes eq->pidx; presumably the
 * caller advances it by the return value -- confirm against the caller.
 */
4174 static u_int
4175 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
4176     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
4177 {
4178 	struct sge_eq *eq = &txq->eq;
4179 	struct tx_sdesc *txsd;
4180 	struct cpl_tx_pkt_core *cpl;
4181 	uint32_t ctrl;	/* used in many unrelated places */
4182 	uint64_t ctrl1;
4183 	int csum_type, len16, ndesc, pktlen, nsegs;
4184 	caddr_t dst;
4185 
4186 	TXQ_LOCK_ASSERT_OWNED(txq);
4187 	M_ASSERTPKTHDR(m0);
4188 	MPASS(available > 0 && available < eq->sidx);
4189 
4190 	len16 = mbuf_len16(m0);
	/*
	 * NOTE(review): nsegs is assigned here but is not referenced again
	 * in this function.
	 */
4191 	nsegs = mbuf_nsegs(m0);
4192 	pktlen = m0->m_pkthdr.len;
	/*
	 * ctrl first holds the immediate data length for the WR header: the
	 * size of the CPL header(s) that follow it.
	 */
4193 	ctrl = sizeof(struct cpl_tx_pkt_core);
4194 	if (needs_tso(m0))
4195 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
	/* len16 is in 16 byte units; round up to whole descriptors. */
4196 	ndesc = howmany(len16, EQ_ESIZE / 16);
4197 	MPASS(ndesc <= available);
4198 
4199 	/* Firmware work request header */
4200 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4201 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
4202 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
4203 
4204 	ctrl = V_FW_WR_LEN16(len16);
4205 	wr->equiq_to_len16 = htobe32(ctrl);
4206 	wr->r3[0] = 0;
4207 	wr->r3[1] = 0;
4208 
4209 	/*
4210 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
4211 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
4212 	 * simpler to always copy it rather than making it
4213 	 * conditional.  Also, it seems that we do not have to set
4214 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
4215 	 */
4216 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
4217 
	/*
	 * csum_type stays -1 when no L4 checksum type is selected below; in
	 * that case L4 checksumming is disabled in ctrl1.
	 */
4218 	csum_type = -1;
4219 	if (needs_tso(m0)) {
		/* The LSO CPL sits between the WR header and the pkt CPL. */
4220 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
4221 
4222 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4223 		    m0->m_pkthdr.l4hlen > 0,
4224 		    ("%s: mbuf %p needs TSO but missing header lengths",
4225 			__func__, m0));
4226 
		/* Header lengths are passed in units of 4 bytes (>> 2). */
4227 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
4228 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
4229 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4230 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
4231 			ctrl |= V_LSO_ETHHDR_LEN(1);
4232 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4233 			ctrl |= F_LSO_IPV6;
4234 
4235 		lso->lso_ctrl = htobe32(ctrl);
4236 		lso->ipid_ofst = htobe16(0);
4237 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4238 		lso->seqno_offset = htobe32(0);
4239 		lso->len = htobe32(pktlen);
4240 
		/* An L3 header the size of ip6_hdr is treated as IPv6. */
4241 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4242 			csum_type = TX_CSUM_TCPIP6;
4243 		else
4244 			csum_type = TX_CSUM_TCPIP;
4245 
4246 		cpl = (void *)(lso + 1);
4247 
4248 		txq->tso_wrs++;
4249 	} else {
4250 		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
4251 			csum_type = TX_CSUM_TCPIP;
4252 		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
4253 			csum_type = TX_CSUM_UDPIP;
4254 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
4255 			csum_type = TX_CSUM_TCPIP6;
4256 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
4257 			csum_type = TX_CSUM_UDPIP6;
4258 #if defined(INET)
4259 		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
4260 			/*
4261 			 * XXX: The firmware appears to stomp on the
4262 			 * fragment/flags field of the IP header when
4263 			 * using TX_CSUM_IP.  Fall back to doing
4264 			 * software checksums.
4265 			 */
4266 			u_short *sump;
4267 			struct mbuf *m;
4268 			int offset;
4269 
4270 			m = m0;
4271 			offset = 0;
4272 			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
4273 			    offsetof(struct ip, ip_sum));
4274 			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
4275 			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
			/* Checksum done in software; clear the request. */
4276 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
4277 		}
4278 #endif
4279 
4280 		cpl = (void *)(wr + 1);
4281 	}
4282 
4283 	/* Checksum offload */
4284 	ctrl1 = 0;
4285 	if (needs_l3_csum(m0) == 0)
4286 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
4287 	if (csum_type >= 0) {
4288 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
4289 	    ("%s: mbuf %p needs checksum offload but missing header lengths",
4290 			__func__, m0));
4291 
		/* T6 and later use a different encoding for this field. */
4292 		if (chip_id(sc) <= CHELSIO_T5) {
4293 			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
4294 			    ETHER_HDR_LEN);
4295 		} else {
4296 			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
4297 			    ETHER_HDR_LEN);
4298 		}
4299 		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
4300 		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
4301 	} else
4302 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
4303 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4304 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4305 		txq->txcsum++;	/* some hardware assistance provided */
4306 
4307 	/* VLAN tag insertion */
4308 	if (needs_vlan_insertion(m0)) {
4309 		ctrl1 |= F_TXPKT_VLAN_VLD |
4310 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
4311 		txq->vlan_insertion++;
4312 	}
4313 
4314 	/* CPL header */
4315 	cpl->ctrl0 = txq->cpl_ctrl0;
4316 	cpl->pack = 0;
4317 	cpl->len = htobe16(pktlen);
4318 	cpl->ctrl1 = htobe64(ctrl1);
4319 
4320 	/* SGL */
4321 	dst = (void *)(cpl + 1);
4322 
4323 	/*
4324 	 * A packet using TSO will use up an entire descriptor for the
4325 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
4326 	 * If this descriptor is the last descriptor in the ring, wrap
4327 	 * around to the front of the ring explicitly for the start of
4328 	 * the sgl.
4329 	 */
4330 	if (dst == (void *)&eq->desc[eq->sidx]) {
4331 		dst = (void *)&eq->desc[0];
4332 		write_gl_to_txd(txq, m0, &dst, 0);
4333 	} else
		/* Ask for wrap checks only if the WR crosses the ring end. */
4334 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4335 	txq->sgl_wrs++;
4336 
4337 	txq->txpkt_wrs++;
4338 
	/* Record the mbuf and descriptor count in the sdesc at pidx. */
4339 	txsd = &txq->sdesc[eq->pidx];
4340 	txsd->m = m0;
4341 	txsd->desc_used = ndesc;
4342 
4343 	return (ndesc);
4344 }
4345 
4346 /*
 * Write a txpkt WR (FW_ETH_TX_PKT_WR) for this packet to the hardware
 * descriptors and record the packet in the software descriptor.  It is
 * guaranteed that enough descriptors are available.
 *
 * The payload is either described by a gather list or, for a non-TSO packet
 * that fits in imm_payload(2) bytes when at least 2 descriptors are
 * available, copied into the descriptors as immediate data.
 *
 * txq:       tx queue being written to; its lock must be held (asserted).
 * wr:        must point at eq->desc[eq->pidx] (asserted).
 * m0:        packet to transmit; must have a packet header (asserted).
 * available: # of free hardware descriptors, 0 < available < eq->sidx.
 *
 * The return value is the # of hardware descriptors used.
 *
 * NOTE(review): nothing in this function writes eq->pidx; presumably the
 * caller advances it by the return value -- confirm against the caller.
 */
4353 static u_int
4354 write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
4355     struct mbuf *m0, u_int available)
4356 {
4357 	struct sge_eq *eq = &txq->eq;
4358 	struct tx_sdesc *txsd;
4359 	struct cpl_tx_pkt_core *cpl;
4360 	uint32_t ctrl;	/* used in many unrelated places */
4361 	uint64_t ctrl1;
4362 	int len16, ndesc, pktlen, nsegs;
4363 	caddr_t dst;
4364 
4365 	TXQ_LOCK_ASSERT_OWNED(txq);
4366 	M_ASSERTPKTHDR(m0);
4367 	MPASS(available > 0 && available < eq->sidx);
4368 
4369 	len16 = mbuf_len16(m0);
4370 	nsegs = mbuf_nsegs(m0);
4371 	pktlen = m0->m_pkthdr.len;
	/*
	 * ctrl first holds the immediate data length for the WR header: the
	 * CPL header(s), plus the packet itself when it is sent as immediate
	 * data.
	 */
4372 	ctrl = sizeof(struct cpl_tx_pkt_core);
4373 	if (needs_tso(m0))
4374 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
4375 	else if (pktlen <= imm_payload(2) && available >= 2) {
4376 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
4377 		ctrl += pktlen;
4378 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
4379 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
4380 		nsegs = 0;
4381 	}
	/* len16 is in 16 byte units; round up to whole descriptors. */
4382 	ndesc = howmany(len16, EQ_ESIZE / 16);
4383 	MPASS(ndesc <= available);
4384 
4385 	/* Firmware work request header */
4386 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4387 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
4388 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
4389 
4390 	ctrl = V_FW_WR_LEN16(len16);
4391 	wr->equiq_to_len16 = htobe32(ctrl);
4392 	wr->r3 = 0;
4393 
4394 	if (needs_tso(m0)) {
		/* The LSO CPL sits between the WR header and the pkt CPL. */
4395 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
4396 
4397 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4398 		    m0->m_pkthdr.l4hlen > 0,
4399 		    ("%s: mbuf %p needs TSO but missing header lengths",
4400 			__func__, m0));
4401 
		/* Header lengths are passed in units of 4 bytes (>> 2). */
4402 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
4403 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
4404 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4405 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
4406 			ctrl |= V_LSO_ETHHDR_LEN(1);
4407 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4408 			ctrl |= F_LSO_IPV6;
4409 
4410 		lso->lso_ctrl = htobe32(ctrl);
4411 		lso->ipid_ofst = htobe16(0);
4412 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4413 		lso->seqno_offset = htobe32(0);
4414 		lso->len = htobe32(pktlen);
4415 
4416 		cpl = (void *)(lso + 1);
4417 
4418 		txq->tso_wrs++;
4419 	} else
4420 		cpl = (void *)(wr + 1);
4421 
4422 	/* Checksum offload */
4423 	ctrl1 = 0;
4424 	if (needs_l3_csum(m0) == 0)
4425 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
4426 	if (needs_l4_csum(m0) == 0)
4427 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
4428 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4429 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4430 		txq->txcsum++;	/* some hardware assistance provided */
4431 
4432 	/* VLAN tag insertion */
4433 	if (needs_vlan_insertion(m0)) {
4434 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
4435 		txq->vlan_insertion++;
4436 	}
4437 
4438 	/* CPL header */
4439 	cpl->ctrl0 = txq->cpl_ctrl0;
4440 	cpl->pack = 0;
4441 	cpl->len = htobe16(pktlen);
4442 	cpl->ctrl1 = htobe64(ctrl1);
4443 
4444 	/* SGL */
4445 	dst = (void *)(cpl + 1);
4446 	if (nsegs > 0) {
4447 
		/* Ask for wrap checks only if the WR crosses the ring end. */
4448 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
4449 		txq->sgl_wrs++;
4450 	} else {
4451 		struct mbuf *m;
4452 
		/*
		 * Immediate data: copy each mbuf in the chain into the
		 * descriptors; copy_to_txd deals with the end of the ring.
		 */
4453 		for (m = m0; m != NULL; m = m->m_next) {
4454 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
4455 #ifdef INVARIANTS
4456 			pktlen -= m->m_len;
4457 #endif
4458 		}
4459 #ifdef INVARIANTS
4460 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
4461 #endif
4462 		txq->imm_wrs++;
4463 	}
4464 
4465 	txq->txpkt_wrs++;
4466 
	/* Record the mbuf and descriptor count in the sdesc at pidx. */
4467 	txsd = &txq->sdesc[eq->pidx];
4468 	txsd->m = m0;
4469 	txsd->desc_used = ndesc;
4470 
4471 	return (ndesc);
4472 }
4473 
4474 static int
4475 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
4476 {
4477 	u_int needed, nsegs1, nsegs2, l1, l2;
4478 
4479 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
4480 		return (1);
4481 
4482 	nsegs1 = mbuf_nsegs(m);
4483 	nsegs2 = mbuf_nsegs(n);
4484 	if (nsegs1 + nsegs2 == 2) {
4485 		txp->wr_type = 1;
4486 		l1 = l2 = txpkts1_len16();
4487 	} else {
4488 		txp->wr_type = 0;
4489 		l1 = txpkts0_len16(nsegs1);
4490 		l2 = txpkts0_len16(nsegs2);
4491 	}
4492 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
4493 	needed = howmany(txp->len16, EQ_ESIZE / 16);
4494 	if (needed > SGE_MAX_WR_NDESC || needed > available)
4495 		return (1);
4496 
4497 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
4498 	if (txp->plen > 65535)
4499 		return (1);
4500 
4501 	txp->npkt = 2;
4502 	set_mbuf_len16(m, l1);
4503 	set_mbuf_len16(n, l2);
4504 
4505 	return (0);
4506 }
4507 
4508 static int
4509 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
4510 {
4511 	u_int plen, len16, needed, nsegs;
4512 
4513 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
4514 
4515 	nsegs = mbuf_nsegs(m);
4516 	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
4517 		return (1);
4518 
4519 	plen = txp->plen + m->m_pkthdr.len;
4520 	if (plen > 65535)
4521 		return (1);
4522 
4523 	if (txp->wr_type == 0)
4524 		len16 = txpkts0_len16(nsegs);
4525 	else
4526 		len16 = txpkts1_len16();
4527 	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
4528 	if (needed > SGE_MAX_WR_NDESC || needed > available)
4529 		return (1);
4530 
4531 	txp->npkt++;
4532 	txp->plen = plen;
4533 	txp->len16 += len16;
4534 	set_mbuf_len16(m, len16);
4535 
4536 	return (0);
4537 }
4538 
4539 /*
 * Write a txpkts WR (FW_ETH_TX_PKTS_WR) for the packets in txp to the
 * hardware descriptors and record them in the software descriptor.  It is
 * guaranteed that enough descriptors are available.
 *
 * txq:       tx queue being written to; its lock must be held (asserted).
 * wr:        must point at eq->desc[eq->pidx] (asserted).
 * m0:        first packet; the rest are linked through m_nextpkt and there
 *            must be at least one more (asserted).
 * txp:       summary of the WR: type (0 or 1), npkt, plen and len16.
 * available: # of free hardware descriptors, 0 < available < eq->sidx.
 *
 * The return value is the # of hardware descriptors used.
 *
 * NOTE(review): nothing in this function writes eq->pidx; presumably the
 * caller advances it by the return value -- confirm against the caller.
 */
4546 static u_int
4547 write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
4548     struct mbuf *m0, const struct txpkts *txp, u_int available)
4549 {
4550 	struct sge_eq *eq = &txq->eq;
4551 	struct tx_sdesc *txsd;
4552 	struct cpl_tx_pkt_core *cpl;
4553 	uint32_t ctrl;
4554 	uint64_t ctrl1;
4555 	int ndesc, checkwrap;
4556 	struct mbuf *m;
4557 	void *flitp;
4558 
4559 	TXQ_LOCK_ASSERT_OWNED(txq);
4560 	MPASS(txp->npkt > 0);
4561 	MPASS(txp->plen < 65536);
4562 	MPASS(m0 != NULL);
4563 	MPASS(m0->m_nextpkt != NULL);
4564 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
4565 	MPASS(available > 0 && available < eq->sidx);
4566 
	/* len16 is in 16 byte units; round up to whole descriptors. */
4567 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
4568 	MPASS(ndesc <= available);
4569 
4570 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4571 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
4572 	ctrl = V_FW_WR_LEN16(txp->len16);
4573 	wr->equiq_to_len16 = htobe32(ctrl);
4574 	wr->plen = htobe16(txp->plen);
4575 	wr->npkt = txp->npkt;
4576 	wr->r3 = 0;
4577 	wr->type = txp->wr_type;
	/* flitp tracks where the next packet's headers go. */
4578 	flitp = wr + 1;
4579 
4580 	/*
4581 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
4582 	 * set then we know the WR is going to wrap around somewhere.  We'll
4583 	 * check for that at appropriate points.
4584 	 */
4585 	checkwrap = eq->sidx - ndesc < eq->pidx;
4586 	for (m = m0; m != NULL; m = m->m_nextpkt) {
4587 		if (txp->wr_type == 0) {
			/*
			 * Type 0: each packet's CPL is preceded by a ULP
			 * master command and an immediate-data subcommand.
			 */
4588 			struct ulp_txpkt *ulpmc;
4589 			struct ulptx_idata *ulpsc;
4590 
4591 			/* ULP master command */
4592 			ulpmc = flitp;
4593 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
4594 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
4595 			ulpmc->len = htobe32(mbuf_len16(m));
4596 
4597 			/* ULP subcommand */
4598 			ulpsc = (void *)(ulpmc + 1);
4599 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
4600 			    F_ULP_TX_SC_MORE);
4601 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
4602 
4603 			cpl = (void *)(ulpsc + 1);
			/* The CPL may land exactly on the end of the ring. */
4604 			if (checkwrap &&
4605 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
4606 				cpl = (void *)&eq->desc[0];
4607 		} else {
			/* Type 1: the CPL follows directly. */
4608 			cpl = flitp;
4609 		}
4610 
4611 		/* Checksum offload */
4612 		ctrl1 = 0;
4613 		if (needs_l3_csum(m) == 0)
4614 			ctrl1 |= F_TXPKT_IPCSUM_DIS;
4615 		if (needs_l4_csum(m) == 0)
4616 			ctrl1 |= F_TXPKT_L4CSUM_DIS;
4617 		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4618 		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4619 			txq->txcsum++;	/* some hardware assistance provided */
4620 
4621 		/* VLAN tag insertion */
4622 		if (needs_vlan_insertion(m)) {
4623 			ctrl1 |= F_TXPKT_VLAN_VLD |
4624 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
4625 			txq->vlan_insertion++;
4626 		}
4627 
4628 		/* CPL header */
4629 		cpl->ctrl0 = txq->cpl_ctrl0;
4630 		cpl->pack = 0;
4631 		cpl->len = htobe16(m->m_pkthdr.len);
4632 		cpl->ctrl1 = htobe64(ctrl1);
4633 
4634 		flitp = cpl + 1;
		/* The SGL must start inside the ring; wrap if at the end. */
4635 		if (checkwrap &&
4636 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
4637 			flitp = (void *)&eq->desc[0];
4638 
		/* Writes the SGL and leaves flitp just past it. */
4639 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
4640 
4641 	}
4642 
4643 	if (txp->wr_type == 0) {
4644 		txq->txpkts0_pkts += txp->npkt;
4645 		txq->txpkts0_wrs++;
4646 	} else {
4647 		txq->txpkts1_pkts += txp->npkt;
4648 		txq->txpkts1_wrs++;
4649 	}
4650 
	/*
	 * Only the head of the m_nextpkt chain is recorded in the software
	 * descriptor at pidx, along with the descriptor count.
	 */
4651 	txsd = &txq->sdesc[eq->pidx];
4652 	txsd->m = m0;
4653 	txsd->desc_used = ndesc;
4654 
4655 	return (ndesc);
4656 }
4657 
4658 /*
 * Build the gather list for mbuf 'm' in txq->gl and write it out as a
 * ulptx_sgl starting at *to, which must be a 16 byte aligned address inside
 * the descriptor ring.  On return *to points just past the SGL (wrapped back
 * to the start of the ring if the SGL ended exactly at the end of it).
 *
 * If the SGL ends on an address that is not 16 byte aligned, this function
 * will add a 0 filled flit at the end.
 *
 * checkwrap == 0 tells this function that the SGL cannot run past the end of
 * the ring, so the wrap test is skipped entirely.
 */
4662 static void
4663 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
4664 {
4665 	struct sge_eq *eq = &txq->eq;
4666 	struct sglist *gl = txq->gl;
4667 	struct sglist_seg *seg;
4668 	__be64 *flitp, *wrap;
4669 	struct ulptx_sgl *usgl;
4670 	int i, nflits, nsegs;
4671 
4672 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
4673 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
4674 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4675 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4676 
4677 	get_pkt_gl(m, gl);
4678 	nsegs = gl->sg_nseg;
4679 	MPASS(nsegs > 0);
4680 
	/*
	 * Flits (8 bytes each): 2 for the ulptx_sgl header, which holds the
	 * first segment, then 3 for every pair of remaining segments and 2
	 * for a leftover single segment.
	 */
4681 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
4682 	flitp = (__be64 *)(*to);
	/* wrap is the first address past the last descriptor. */
4683 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
4684 	seg = &gl->sg_segs[0];
4685 	usgl = (void *)flitp;
4686 
4687 	/*
4688 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
4689 	 * ring, so we're at least 16 bytes away from the status page.  There is
4690 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
4691 	 */
4692 
4693 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
4694 	    V_ULPTX_NSGE(nsegs));
4695 	usgl->len0 = htobe32(seg->ss_len);
4696 	usgl->addr0 = htobe64(seg->ss_paddr);
4697 	seg++;
4698 
4699 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
4700 
4701 		/* Won't wrap around at all */
4702 
		/* Fill the len/addr pairs in place through the struct. */
4703 		for (i = 0; i < nsegs - 1; i++, seg++) {
4704 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
4705 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
4706 		}
		/* Odd number of extra segments: zero the unused length. */
4707 		if (i & 1)
4708 			usgl->sge[i / 2].len[1] = htobe32(0);
4709 		flitp += nflits;
4710 	} else {
4711 
4712 		/* Will wrap somewhere in the rest of the SGL */
4713 
4714 		/* 2 flits already written, write the rest flit by flit */
4715 		flitp = (void *)(usgl + 1);
4716 		for (i = 0; i < nflits - 2; i++) {
4717 			if (flitp == wrap)
4718 				flitp = (void *)eq->desc;
4719 			*flitp++ = get_flit(seg, nsegs - 1, i);
4720 		}
4721 	}
4722 
	/* Pad with a zero flit so that the SGL ends on a 16 byte boundary. */
4723 	if (nflits & 1) {
4724 		MPASS(((uintptr_t)flitp) & 0xf);
4725 		*flitp++ = 0;
4726 	}
4727 
4728 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
	/* Never hand back a pointer to the end of the ring. */
4729 	if (__predict_false(flitp == wrap))
4730 		*to = (void *)eq->desc;
4731 	else
4732 		*to = (void *)flitp;
4733 }
4734 
4735 static inline void
4736 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
4737 {
4738 
4739 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4740 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4741 
4742 	if (__predict_true((uintptr_t)(*to) + len <=
4743 	    (uintptr_t)&eq->desc[eq->sidx])) {
4744 		bcopy(from, *to, len);
4745 		(*to) += len;
4746 	} else {
4747 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
4748 
4749 		bcopy(from, *to, portion);
4750 		from += portion;
4751 		portion = len - portion;	/* remaining */
4752 		bcopy(from, (void *)eq->desc, portion);
4753 		(*to) = (caddr_t)eq->desc + portion;
4754 	}
4755 }
4756 
/*
 * Ring the doorbell of an egress queue with a count of n (n > 0), then
 * advance eq->dbidx by n, wrapping at eq->sidx.
 *
 * eq->doorbells is a bitmap of the doorbell mechanisms usable for this
 * queue; the one with the lowest bit number is used.  DOORBELL_WCWR is
 * excluded when n > 1.
 *
 * NOTE(review): if no bit is set in the bitmap none of the cases match and
 * no doorbell is written, yet dbidx is still advanced.
 */
4757 static inline void
4758 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
4759 {
4760 	u_int db;
4761 
4762 	MPASS(n > 0);
4763 
	/* Work on a local copy so the queue's bitmap isn't modified. */
4764 	db = eq->doorbells;
4765 	if (n > 1)
4766 		clrbit(&db, DOORBELL_WCWR);
	/* Barrier before any doorbell write below. */
4767 	wmb();
4768 
4769 	switch (ffs(db) - 1) {
4770 	case DOORBELL_UDB:
4771 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4772 		break;
4773 
4774 	case DOORBELL_WCWR: {
4775 		volatile uint64_t *dst, *src;
4776 		int i;
4777 
4778 		/*
4779 		 * Queues whose 128B doorbell segment fits in the page do not
4780 		 * use relative qid (udb_qid is always 0).  Only queues with
4781 		 * doorbell segments can do WCWR.
4782 		 */
4783 		KASSERT(eq->udb_qid == 0 && n == 1,
4784 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
4785 		    __func__, eq->doorbells, n, eq->dbidx, eq));
4786 
		/*
		 * Copy the whole descriptor at dbidx, 8 bytes at a time, to
		 * the address at offset UDBS_WR_OFFSET - UDBS_DB_OFFSET from
		 * the doorbell.
		 */
4787 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
4788 		    UDBS_DB_OFFSET);
4789 		i = eq->dbidx;
4790 		src = (void *)&eq->desc[i];
4791 		while (src != (void *)&eq->desc[i + 1])
4792 			*dst++ = *src++;
4793 		wmb();
4794 		break;
4795 	}
4796 
4797 	case DOORBELL_UDBWC:
		/* Same write as DOORBELL_UDB, followed by a barrier. */
4798 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4799 		wmb();
4800 		break;
4801 
4802 	case DOORBELL_KDB:
		/* Register write; uses the queue's context id as the qid. */
4803 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
4804 		    V_QID(eq->cntxt_id) | V_PIDX(n));
4805 		break;
4806 	}
4807 
4808 	IDXINCR(eq->dbidx, n, eq->sidx);
4809 }
4810 
4811 static inline u_int
4812 reclaimable_tx_desc(struct sge_eq *eq)
4813 {
4814 	uint16_t hw_cidx;
4815 
4816 	hw_cidx = read_hw_cidx(eq);
4817 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
4818 }
4819 
4820 static inline u_int
4821 total_available_tx_desc(struct sge_eq *eq)
4822 {
4823 	uint16_t hw_cidx, pidx;
4824 
4825 	hw_cidx = read_hw_cidx(eq);
4826 	pidx = eq->pidx;
4827 
4828 	if (pidx == hw_cidx)
4829 		return (eq->sidx - 1);
4830 	else
4831 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
4832 }
4833 
4834 static inline uint16_t
4835 read_hw_cidx(struct sge_eq *eq)
4836 {
4837 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
4838 	uint16_t cidx = spg->cidx;	/* stable snapshot */
4839 
4840 	return (be16toh(cidx));
4841 }
4842 
4843 /*
4844  * Reclaim 'n' descriptors approximately.
4845  */
4846 static u_int
4847 reclaim_tx_descs(struct sge_txq *txq, u_int n)
4848 {
4849 	struct tx_sdesc *txsd;
4850 	struct sge_eq *eq = &txq->eq;
4851 	u_int can_reclaim, reclaimed;
4852 
4853 	TXQ_LOCK_ASSERT_OWNED(txq);
4854 	MPASS(n > 0);
4855 
4856 	reclaimed = 0;
4857 	can_reclaim = reclaimable_tx_desc(eq);
4858 	while (can_reclaim && reclaimed < n) {
4859 		int ndesc;
4860 		struct mbuf *m, *nextpkt;
4861 
4862 		txsd = &txq->sdesc[eq->cidx];
4863 		ndesc = txsd->desc_used;
4864 
4865 		/* Firmware doesn't return "partial" credits. */
4866 		KASSERT(can_reclaim >= ndesc,
4867 		    ("%s: unexpected number of credits: %d, %d",
4868 		    __func__, can_reclaim, ndesc));
4869 
4870 		for (m = txsd->m; m != NULL; m = nextpkt) {
4871 			nextpkt = m->m_nextpkt;
4872 			m->m_nextpkt = NULL;
4873 			m_freem(m);
4874 		}
4875 		reclaimed += ndesc;
4876 		can_reclaim -= ndesc;
4877 		IDXINCR(eq->cidx, ndesc, eq->sidx);
4878 	}
4879 
4880 	return (reclaimed);
4881 }
4882 
4883 static void
4884 tx_reclaim(void *arg, int n)
4885 {
4886 	struct sge_txq *txq = arg;
4887 	struct sge_eq *eq = &txq->eq;
4888 
4889 	do {
4890 		if (TXQ_TRYLOCK(txq) == 0)
4891 			break;
4892 		n = reclaim_tx_descs(txq, 32);
4893 		if (eq->cidx == eq->pidx)
4894 			eq->equeqidx = eq->pidx;
4895 		TXQ_UNLOCK(txq);
4896 	} while (n > 0);
4897 }
4898 
4899 static __be64
4900 get_flit(struct sglist_seg *segs, int nsegs, int idx)
4901 {
4902 	int i = (idx / 3) * 2;
4903 
4904 	switch (idx % 3) {
4905 	case 0: {
4906 		__be64 rc;
4907 
4908 		rc = htobe32(segs[i].ss_len);
4909 		if (i + 1 < nsegs)
4910 			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
4911 
4912 		return (rc);
4913 	}
4914 	case 1:
4915 		return (htobe64(segs[i].ss_paddr));
4916 	case 2:
4917 		return (htobe64(segs[i + 1].ss_paddr));
4918 	}
4919 
4920 	return (0);
4921 }
4922 
4923 static void
4924 find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
4925 {
4926 	int8_t zidx, hwidx, idx;
4927 	uint16_t region1, region3;
4928 	int spare, spare_needed, n;
4929 	struct sw_zone_info *swz;
4930 	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
4931 
4932 	/*
4933 	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
4934 	 * large enough for the max payload and cluster metadata.  Otherwise
4935 	 * settle for the largest bufsize that leaves enough room in the cluster
4936 	 * for metadata.
4937 	 *
4938 	 * Without buffer packing: Look for the smallest zone which has a
4939 	 * bufsize large enough for the max payload.  Settle for the largest
4940 	 * bufsize available if there's nothing big enough for max payload.
4941 	 */
4942 	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
4943 	swz = &sc->sge.sw_zone_info[0];
4944 	hwidx = -1;
4945 	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
4946 		if (swz->size > largest_rx_cluster) {
4947 			if (__predict_true(hwidx != -1))
4948 				break;
4949 
4950 			/*
4951 			 * This is a misconfiguration.  largest_rx_cluster is
4952 			 * preventing us from finding a refill source.  See
4953 			 * dev.t5nex.<n>.buffer_sizes to figure out why.
4954 			 */
4955 			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
4956 			    " refill source for fl %p (dma %u).  Ignored.\n",
4957 			    largest_rx_cluster, fl, maxp);
4958 		}
4959 		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
4960 			hwb = &hwb_list[idx];
4961 			spare = swz->size - hwb->size;
4962 			if (spare < spare_needed)
4963 				continue;
4964 
4965 			hwidx = idx;		/* best option so far */
4966 			if (hwb->size >= maxp) {
4967 
4968 				if ((fl->flags & FL_BUF_PACKING) == 0)
4969 					goto done; /* stop looking (not packing) */
4970 
4971 				if (swz->size >= safest_rx_cluster)
4972 					goto done; /* stop looking (packing) */
4973 			}
4974 			break;		/* keep looking, next zone */
4975 		}
4976 	}
4977 done:
4978 	/* A usable hwidx has been located. */
4979 	MPASS(hwidx != -1);
4980 	hwb = &hwb_list[hwidx];
4981 	zidx = hwb->zidx;
4982 	swz = &sc->sge.sw_zone_info[zidx];
4983 	region1 = 0;
4984 	region3 = swz->size - hwb->size;
4985 
4986 	/*
4987 	 * Stay within this zone and see if there is a better match when mbuf
4988 	 * inlining is allowed.  Remember that the hwidx's are sorted in
4989 	 * decreasing order of size (so in increasing order of spare area).
4990 	 */
4991 	for (idx = hwidx; idx != -1; idx = hwb->next) {
4992 		hwb = &hwb_list[idx];
4993 		spare = swz->size - hwb->size;
4994 
4995 		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
4996 			break;
4997 
4998 		/*
4999 		 * Do not inline mbufs if doing so would violate the pad/pack
5000 		 * boundary alignment requirement.
5001 		 */
5002 		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
5003 			continue;
5004 		if (fl->flags & FL_BUF_PACKING &&
5005 		    (MSIZE % sc->params.sge.pack_boundary) != 0)
5006 			continue;
5007 
5008 		if (spare < CL_METADATA_SIZE + MSIZE)
5009 			continue;
5010 		n = (spare - CL_METADATA_SIZE) / MSIZE;
5011 		if (n > howmany(hwb->size, maxp))
5012 			break;
5013 
5014 		hwidx = idx;
5015 		if (fl->flags & FL_BUF_PACKING) {
5016 			region1 = n * MSIZE;
5017 			region3 = spare - region1;
5018 		} else {
5019 			region1 = MSIZE;
5020 			region3 = spare - region1;
5021 			break;
5022 		}
5023 	}
5024 
5025 	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
5026 	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
5027 	KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES,
5028 	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
5029 	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
5030 	    sc->sge.sw_zone_info[zidx].size,
5031 	    ("%s: bad buffer layout for fl %p, maxp %d. "
5032 		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
5033 		sc->sge.sw_zone_info[zidx].size, region1,
5034 		sc->sge.hw_buf_info[hwidx].size, region3));
5035 	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
5036 		KASSERT(region3 >= CL_METADATA_SIZE,
5037 		    ("%s: no room for metadata.  fl %p, maxp %d; "
5038 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
5039 		    sc->sge.sw_zone_info[zidx].size, region1,
5040 		    sc->sge.hw_buf_info[hwidx].size, region3));
5041 		KASSERT(region1 % MSIZE == 0,
5042 		    ("%s: bad mbuf region for fl %p, maxp %d. "
5043 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
5044 		    sc->sge.sw_zone_info[zidx].size, region1,
5045 		    sc->sge.hw_buf_info[hwidx].size, region3));
5046 	}
5047 
5048 	fl->cll_def.zidx = zidx;
5049 	fl->cll_def.hwidx = hwidx;
5050 	fl->cll_def.region1 = region1;
5051 	fl->cll_def.region3 = region3;
5052 }
5053 
5054 static void
5055 find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
5056 {
5057 	struct sge *s = &sc->sge;
5058 	struct hw_buf_info *hwb;
5059 	struct sw_zone_info *swz;
5060 	int spare;
5061 	int8_t hwidx;
5062 
5063 	if (fl->flags & FL_BUF_PACKING)
5064 		hwidx = s->safe_hwidx2;	/* with room for metadata */
5065 	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
5066 		hwidx = s->safe_hwidx2;
5067 		hwb = &s->hw_buf_info[hwidx];
5068 		swz = &s->sw_zone_info[hwb->zidx];
5069 		spare = swz->size - hwb->size;
5070 
5071 		/* no good if there isn't room for an mbuf as well */
5072 		if (spare < CL_METADATA_SIZE + MSIZE)
5073 			hwidx = s->safe_hwidx1;
5074 	} else
5075 		hwidx = s->safe_hwidx1;
5076 
5077 	if (hwidx == -1) {
5078 		/* No fallback source */
5079 		fl->cll_alt.hwidx = -1;
5080 		fl->cll_alt.zidx = -1;
5081 
5082 		return;
5083 	}
5084 
5085 	hwb = &s->hw_buf_info[hwidx];
5086 	swz = &s->sw_zone_info[hwb->zidx];
5087 	spare = swz->size - hwb->size;
5088 	fl->cll_alt.hwidx = hwidx;
5089 	fl->cll_alt.zidx = hwb->zidx;
5090 	if (allow_mbufs_in_cluster &&
5091 	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
5092 		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
5093 	else
5094 		fl->cll_alt.region1 = 0;
5095 	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
5096 }
5097 
5098 static void
5099 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
5100 {
5101 	mtx_lock(&sc->sfl_lock);
5102 	FL_LOCK(fl);
5103 	if ((fl->flags & FL_DOOMED) == 0) {
5104 		fl->flags |= FL_STARVING;
5105 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
5106 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
5107 	}
5108 	FL_UNLOCK(fl);
5109 	mtx_unlock(&sc->sfl_lock);
5110 }
5111 
5112 static void
5113 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
5114 {
5115 	struct sge_wrq *wrq = (void *)eq;
5116 
5117 	atomic_readandclear_int(&eq->equiq);
5118 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
5119 }
5120 
5121 static void
5122 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
5123 {
5124 	struct sge_txq *txq = (void *)eq;
5125 
5126 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
5127 
5128 	atomic_readandclear_int(&eq->equiq);
5129 	mp_ring_check_drainage(txq->r, 0);
5130 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
5131 }
5132 
5133 static int
5134 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
5135     struct mbuf *m)
5136 {
5137 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
5138 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
5139 	struct adapter *sc = iq->adapter;
5140 	struct sge *s = &sc->sge;
5141 	struct sge_eq *eq;
5142 	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
5143 		&handle_wrq_egr_update, &handle_eth_egr_update,
5144 		&handle_wrq_egr_update};
5145 
5146 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5147 	    rss->opcode));
5148 
5149 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
5150 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
5151 
5152 	return (0);
5153 }
5154 
5155 /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
5156 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
5157     offsetof(struct cpl_fw6_msg, data));
5158 
5159 static int
5160 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
5161 {
5162 	struct adapter *sc = iq->adapter;
5163 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
5164 
5165 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5166 	    rss->opcode));
5167 
5168 	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
5169 		const struct rss_header *rss2;
5170 
5171 		rss2 = (const struct rss_header *)&cpl->data[0];
5172 		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
5173 	}
5174 
5175 	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
5176 }
5177 
5178 /**
5179  *	t4_handle_wrerr_rpl - process a FW work request error message
5180  *	@adap: the adapter
5181  *	@rpl: start of the FW message
5182  */
5183 static int
5184 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
5185 {
5186 	u8 opcode = *(const u8 *)rpl;
5187 	const struct fw_error_cmd *e = (const void *)rpl;
5188 	unsigned int i;
5189 
5190 	if (opcode != FW_ERROR_CMD) {
5191 		log(LOG_ERR,
5192 		    "%s: Received WRERR_RPL message with opcode %#x\n",
5193 		    device_get_nameunit(adap->dev), opcode);
5194 		return (EINVAL);
5195 	}
5196 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
5197 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
5198 	    "non-fatal");
5199 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
5200 	case FW_ERROR_TYPE_EXCEPTION:
5201 		log(LOG_ERR, "exception info:\n");
5202 		for (i = 0; i < nitems(e->u.exception.info); i++)
5203 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
5204 			    be32toh(e->u.exception.info[i]));
5205 		log(LOG_ERR, "\n");
5206 		break;
5207 	case FW_ERROR_TYPE_HWMODULE:
5208 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
5209 		    be32toh(e->u.hwmodule.regaddr),
5210 		    be32toh(e->u.hwmodule.regval));
5211 		break;
5212 	case FW_ERROR_TYPE_WR:
5213 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
5214 		    be16toh(e->u.wr.cidx),
5215 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
5216 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
5217 		    be32toh(e->u.wr.eqid));
5218 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
5219 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
5220 			    e->u.wr.wrhdr[i]);
5221 		log(LOG_ERR, "\n");
5222 		break;
5223 	case FW_ERROR_TYPE_ACL:
5224 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
5225 		    be16toh(e->u.acl.cidx),
5226 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
5227 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
5228 		    be32toh(e->u.acl.eqid),
5229 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
5230 		    "MAC");
5231 		for (i = 0; i < nitems(e->u.acl.val); i++)
5232 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
5233 		log(LOG_ERR, "\n");
5234 		break;
5235 	default:
5236 		log(LOG_ERR, "type %#x\n",
5237 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
5238 		return (EINVAL);
5239 	}
5240 	return (0);
5241 }
5242 
5243 static int
5244 sysctl_uint16(SYSCTL_HANDLER_ARGS)
5245 {
5246 	uint16_t *id = arg1;
5247 	int i = *id;
5248 
5249 	return sysctl_handle_int(oidp, &i, 0, req);
5250 }
5251 
5252 static int
5253 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
5254 {
5255 	struct sge *s = arg1;
5256 	struct hw_buf_info *hwb = &s->hw_buf_info[0];
5257 	struct sw_zone_info *swz = &s->sw_zone_info[0];
5258 	int i, rc;
5259 	struct sbuf sb;
5260 	char c;
5261 
5262 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
5263 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
5264 		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
5265 			c = '*';
5266 		else
5267 			c = '\0';
5268 
5269 		sbuf_printf(&sb, "%u%c ", hwb->size, c);
5270 	}
5271 	sbuf_trim(&sb);
5272 	sbuf_finish(&sb);
5273 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
5274 	sbuf_delete(&sb);
5275 	return (rc);
5276 }
5277 
5278 static int
5279 sysctl_tc(SYSCTL_HANDLER_ARGS)
5280 {
5281 	struct vi_info *vi = arg1;
5282 	struct port_info *pi;
5283 	struct adapter *sc;
5284 	struct sge_txq *txq;
5285 	struct tx_cl_rl_params *tc;
5286 	int qidx = arg2, rc, tc_idx;
5287 	uint32_t fw_queue, fw_class;
5288 
5289 	MPASS(qidx >= 0 && qidx < vi->ntxq);
5290 	pi = vi->pi;
5291 	sc = pi->adapter;
5292 	txq = &sc->sge.txq[vi->first_txq + qidx];
5293 
5294 	tc_idx = txq->tc_idx;
5295 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
5296 	if (rc != 0 || req->newptr == NULL)
5297 		return (rc);
5298 
5299 	if (sc->flags & IS_VF)
5300 		return (EPERM);
5301 
5302 	/* Note that -1 is legitimate input (it means unbind). */
5303 	if (tc_idx < -1 || tc_idx >= sc->chip_params->nsched_cls)
5304 		return (EINVAL);
5305 
5306 	mtx_lock(&sc->tc_lock);
5307 	if (tc_idx == txq->tc_idx) {
5308 		rc = 0;		/* No change, nothing to do. */
5309 		goto done;
5310 	}
5311 
5312 	fw_queue = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
5313 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
5314 	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id);
5315 
5316 	if (tc_idx == -1)
5317 		fw_class = 0xffffffff;	/* Unbind. */
5318 	else {
5319 		/*
5320 		 * Bind to a different class.
5321 		 */
5322 		tc = &pi->sched_params->cl_rl[tc_idx];
5323 		if (tc->flags & TX_CLRL_ERROR) {
5324 			/* Previous attempt to set the cl-rl params failed. */
5325 			rc = EIO;
5326 			goto done;
5327 		} else {
5328 			/*
5329 			 * Ok to proceed.  Place a reference on the new class
5330 			 * while still holding on to the reference on the
5331 			 * previous class, if any.
5332 			 */
5333 			fw_class = tc_idx;
5334 			tc->refcount++;
5335 		}
5336 	}
5337 	mtx_unlock(&sc->tc_lock);
5338 
5339 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4stc");
5340 	if (rc)
5341 		return (rc);
5342 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class);
5343 	end_synchronized_op(sc, 0);
5344 
5345 	mtx_lock(&sc->tc_lock);
5346 	if (rc == 0) {
5347 		if (txq->tc_idx != -1) {
5348 			tc = &pi->sched_params->cl_rl[txq->tc_idx];
5349 			MPASS(tc->refcount > 0);
5350 			tc->refcount--;
5351 		}
5352 		txq->tc_idx = tc_idx;
5353 	} else if (tc_idx != -1) {
5354 		tc = &pi->sched_params->cl_rl[tc_idx];
5355 		MPASS(tc->refcount > 0);
5356 		tc->refcount--;
5357 	}
5358 done:
5359 	mtx_unlock(&sc->tc_lock);
5360 	return (rc);
5361 }
5362