1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_kern_tls.h"
36 #include "opt_ratelimit.h"
37 
38 #include <sys/types.h>
39 #include <sys/eventhandler.h>
40 #include <sys/mbuf.h>
41 #include <sys/socket.h>
42 #include <sys/kernel.h>
43 #include <sys/ktls.h>
44 #include <sys/malloc.h>
45 #include <sys/queue.h>
46 #include <sys/sbuf.h>
47 #include <sys/taskqueue.h>
48 #include <sys/time.h>
49 #include <sys/sglist.h>
50 #include <sys/sysctl.h>
51 #include <sys/smp.h>
52 #include <sys/socketvar.h>
53 #include <sys/counter.h>
54 #include <net/bpf.h>
55 #include <net/ethernet.h>
56 #include <net/if.h>
57 #include <net/if_vlan_var.h>
58 #include <net/if_vxlan.h>
59 #include <netinet/in.h>
60 #include <netinet/ip.h>
61 #include <netinet/ip6.h>
62 #include <netinet/tcp.h>
63 #include <netinet/udp.h>
64 #include <machine/in_cksum.h>
65 #include <machine/md_var.h>
66 #include <vm/vm.h>
67 #include <vm/pmap.h>
68 #ifdef DEV_NETMAP
69 #include <machine/bus.h>
70 #include <sys/selinfo.h>
71 #include <net/if_var.h>
72 #include <net/netmap.h>
73 #include <dev/netmap/netmap_kern.h>
74 #endif
75 
76 #include "common/common.h"
77 #include "common/t4_regs.h"
78 #include "common/t4_regs_values.h"
79 #include "common/t4_msg.h"
80 #include "t4_l2t.h"
81 #include "t4_mp_ring.h"
82 
83 #ifdef T4_PKT_TIMESTAMP
84 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
85 #else
86 #define RX_COPY_THRESHOLD MINCLSIZE
87 #endif
88 
89 /* Internal mbuf flags stored in PH_loc.eight[1]. */
90 #define	MC_NOMAP		0x01
91 #define	MC_RAW_WR		0x02
92 #define	MC_TLS			0x04
93 
94 /*
95  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
96  * 0-7 are valid values.
97  */
98 static int fl_pktshift = 0;
99 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
100     "payload DMA offset in rx buffer (bytes)");
101 
102 /*
103  * Pad ethernet payload up to this boundary.
104  * -1: driver should figure out a good value.
105  *  0: disable padding.
106  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
107  */
108 int fl_pad = -1;
109 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
110     "payload pad boundary (bytes)");
111 
112 /*
113  * Status page length.
114  * -1: driver should figure out a good value.
115  *  64 or 128 are the only other valid values.
116  */
117 static int spg_len = -1;
118 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
119     "status page size (bytes)");
120 
121 /*
122  * Congestion drops.
123  * -1: no congestion feedback (not recommended).
124  *  0: backpressure the channel instead of dropping packets right away.
125  *  1: no backpressure, drop packets for the congested queue immediately.
126  */
127 static int cong_drop = 0;
128 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
129     "Congestion control for RX queues (0 = backpressure, 1 = drop");
130 
131 /*
132  * Deliver multiple frames in the same free list buffer if they fit.
133  * -1: let the driver decide whether to enable buffer packing or not.
134  *  0: disable buffer packing.
135  *  1: enable buffer packing.
136  */
137 static int buffer_packing = -1;
138 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
139     0, "Enable buffer packing");
140 
141 /*
142  * Start next frame in a packed buffer at this boundary.
143  * -1: driver should figure out a good value.
144  * T4: driver will ignore this and use the same value as fl_pad above.
145  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
146  */
147 static int fl_pack = -1;
148 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
149     "payload pack boundary (bytes)");
150 
151 /*
152  * Largest rx cluster size that the driver is allowed to allocate.
153  */
154 static int largest_rx_cluster = MJUM16BYTES;
155 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
156     &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
157 
158 /*
159  * Size of cluster allocation that's most likely to succeed.  The driver will
160  * fall back to this size if it fails to allocate clusters larger than this.
161  */
162 static int safest_rx_cluster = PAGE_SIZE;
163 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
164     &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
165 
166 #ifdef RATELIMIT
167 /*
168  * Knob to control TCP timestamp rewriting, and the granularity of the tick used
169  * for rewriting.  -1 and 0-3 are all valid values.
170  * -1: hardware should leave the TCP timestamps alone.
171  * 0: 1ms
172  * 1: 100us
173  * 2: 10us
174  * 3: 1us
175  */
176 static int tsclk = -1;
177 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
178     "Control TCP timestamp rewriting when using pacing");
179 
180 static int eo_max_backlog = 1024 * 1024;
181 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
182     0, "Maximum backlog of ratelimited data per flow");
183 #endif
184 
185 /*
186  * The interrupt holdoff timers are multiplied by this value on T6+.
187  * 1 and 3-17 (both inclusive) are legal values.
188  */
189 static int tscale = 1;
190 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
191     "Interrupt holdoff timer scale on T6+");
192 
193 /*
194  * Number of LRO entries in the lro_ctrl structure per rx queue.
195  */
196 static int lro_entries = TCP_LRO_ENTRIES;
197 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
198     "Number of LRO entries per RX queue");
199 
200 /*
201  * This enables presorting of frames before they're fed into tcp_lro_rx.
202  */
203 static int lro_mbufs = 0;
204 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
205     "Enable presorting of LRO frames");
206 
207 static counter_u64_t pullups;
208 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, pullups, CTLFLAG_RD, &pullups,
209     "Number of mbuf pullups performed");
210 
211 static counter_u64_t defrags;
212 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags,
213     "Number of mbuf defrags performed");
214 
215 static int t4_tx_coalesce = 1;
216 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0,
217     "tx coalescing allowed");
218 
219 /*
220  * The driver will make aggressive attempts at tx coalescing if it sees this
221  * many packets eligible for coalescing in quick succession, with no more than
222  * the specified gap in between the eth_tx calls that delivered the packets.
223  */
224 static int t4_tx_coalesce_pkts = 32;
225 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN,
226     &t4_tx_coalesce_pkts, 0,
227     "# of consecutive packets (1 - 255) that will trigger tx coalescing");
228 static int t4_tx_coalesce_gap = 5;
229 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN,
230     &t4_tx_coalesce_gap, 0, "tx gap (in microseconds)");
231 
232 static int service_iq(struct sge_iq *, int);
233 static int service_iq_fl(struct sge_iq *, int);
234 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
235 static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *,
236     u_int);
237 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
238 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
239 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
240     uint16_t, char *);
241 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
242     int, int);
243 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
244 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
245     struct sge_iq *);
246 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
247     struct sysctl_oid *, struct sge_fl *);
248 static int alloc_fwq(struct adapter *);
249 static int free_fwq(struct adapter *);
250 static int alloc_ctrlq(struct adapter *, struct sge_wrq *, int,
251     struct sysctl_oid *);
252 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
253     struct sysctl_oid *);
254 static int free_rxq(struct vi_info *, struct sge_rxq *);
255 #ifdef TCP_OFFLOAD
256 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
257     struct sysctl_oid *);
258 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
259 #endif
260 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
261 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
262 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
263 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
264 #endif
265 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
266 static int free_eq(struct adapter *, struct sge_eq *);
267 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
268     struct sysctl_oid *);
269 static int free_wrq(struct adapter *, struct sge_wrq *);
270 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
271     struct sysctl_oid *);
272 static int free_txq(struct vi_info *, struct sge_txq *);
273 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
274 static int alloc_ofld_txq(struct vi_info *, struct sge_ofld_txq *, int,
275     struct sysctl_oid *);
276 static int free_ofld_txq(struct vi_info *, struct sge_ofld_txq *);
277 #endif
278 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
279 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
280 static int refill_fl(struct adapter *, struct sge_fl *, int);
281 static void refill_sfl(void *);
282 static int alloc_fl_sdesc(struct sge_fl *);
283 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
284 static int find_refill_source(struct adapter *, int, bool);
285 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
286 
287 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
288 static inline u_int txpkt_len16(u_int, const u_int);
289 static inline u_int txpkt_vm_len16(u_int, const u_int);
290 static inline void calculate_mbuf_len16(struct mbuf *, bool);
291 static inline u_int txpkts0_len16(u_int);
292 static inline u_int txpkts1_len16(void);
293 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
294 static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *,
295     u_int);
296 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
297     struct mbuf *);
298 static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *,
299     int, bool *);
300 static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *,
301     int, bool *);
302 static u_int write_txpkts_wr(struct adapter *, struct sge_txq *);
303 static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *);
304 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
305 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
306 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
307 static inline uint16_t read_hw_cidx(struct sge_eq *);
308 static inline u_int reclaimable_tx_desc(struct sge_eq *);
309 static inline u_int total_available_tx_desc(struct sge_eq *);
310 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
311 static void tx_reclaim(void *, int);
312 static __be64 get_flit(struct sglist_seg *, int, int);
313 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
314     struct mbuf *);
315 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
316     struct mbuf *);
317 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
318 static void wrq_tx_drain(void *, int);
319 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
320 
321 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
322 #ifdef RATELIMIT
323 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
324 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
325     struct mbuf *);
326 #endif
327 
328 static counter_u64_t extfree_refs;
329 static counter_u64_t extfree_rels;
330 
331 an_handler_t t4_an_handler;
332 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
333 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
334 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
335 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
336 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
337 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
338 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
339 
340 void
341 t4_register_an_handler(an_handler_t h)
342 {
343 	uintptr_t *loc;
344 
345 	MPASS(h == NULL || t4_an_handler == NULL);
346 
347 	loc = (uintptr_t *)&t4_an_handler;
348 	atomic_store_rel_ptr(loc, (uintptr_t)h);
349 }
350 
351 void
352 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
353 {
354 	uintptr_t *loc;
355 
356 	MPASS(type < nitems(t4_fw_msg_handler));
357 	MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
358 	/*
359 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
360 	 * handler dispatch table.  Reject any attempt to install a handler for
361 	 * this subtype.
362 	 */
363 	MPASS(type != FW_TYPE_RSSCPL);
364 	MPASS(type != FW6_TYPE_RSSCPL);
365 
366 	loc = (uintptr_t *)&t4_fw_msg_handler[type];
367 	atomic_store_rel_ptr(loc, (uintptr_t)h);
368 }
369 
370 void
371 t4_register_cpl_handler(int opcode, cpl_handler_t h)
372 {
373 	uintptr_t *loc;
374 
375 	MPASS(opcode < nitems(t4_cpl_handler));
376 	MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);
377 
378 	loc = (uintptr_t *)&t4_cpl_handler[opcode];
379 	atomic_store_rel_ptr(loc, (uintptr_t)h);
380 }
381 
382 static int
383 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
384     struct mbuf *m)
385 {
386 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
387 	u_int tid;
388 	int cookie;
389 
390 	MPASS(m == NULL);
391 
392 	tid = GET_TID(cpl);
393 	if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) {
394 		/*
395 		 * The return code for filter-write is put in the CPL cookie so
396 		 * we have to rely on the hardware tid (is_ftid) to determine
397 		 * that this is a response to a filter.
398 		 */
399 		cookie = CPL_COOKIE_FILTER;
400 	} else {
401 		cookie = G_COOKIE(cpl->cookie);
402 	}
403 	MPASS(cookie > CPL_COOKIE_RESERVED);
404 	MPASS(cookie < nitems(set_tcb_rpl_handlers));
405 
406 	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
407 }
408 
409 static int
410 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
411     struct mbuf *m)
412 {
413 	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
414 	unsigned int cookie;
415 
416 	MPASS(m == NULL);
417 
418 	cookie = GET_TID(rpl) & F_SYNC_WR ? CPL_COOKIE_TOM : CPL_COOKIE_FILTER;
419 	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
420 }
421 
422 static int
423 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
424     struct mbuf *m)
425 {
426 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
427 	u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status)));
428 
429 	MPASS(m == NULL);
430 	MPASS(cookie != CPL_COOKIE_RESERVED);
431 
432 	return (act_open_rpl_handlers[cookie](iq, rss, m));
433 }
434 
435 static int
436 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
437     struct mbuf *m)
438 {
439 	struct adapter *sc = iq->adapter;
440 	u_int cookie;
441 
442 	MPASS(m == NULL);
443 	if (is_hashfilter(sc))
444 		cookie = CPL_COOKIE_HASHFILTER;
445 	else
446 		cookie = CPL_COOKIE_TOM;
447 
448 	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
449 }
450 
451 static int
452 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
453 {
454 	struct adapter *sc = iq->adapter;
455 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
456 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
457 	u_int cookie;
458 
459 	MPASS(m == NULL);
460 	if (is_etid(sc, tid))
461 		cookie = CPL_COOKIE_ETHOFLD;
462 	else
463 		cookie = CPL_COOKIE_TOM;
464 
465 	return (fw4_ack_handlers[cookie](iq, rss, m));
466 }
467 
468 static void
469 t4_init_shared_cpl_handlers(void)
470 {
471 
472 	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
473 	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
474 	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
475 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
476 	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
477 }
478 
479 void
480 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
481 {
482 	uintptr_t *loc;
483 
484 	MPASS(opcode < nitems(t4_cpl_handler));
485 	MPASS(cookie > CPL_COOKIE_RESERVED);
486 	MPASS(cookie < NUM_CPL_COOKIES);
487 	MPASS(t4_cpl_handler[opcode] != NULL);
488 
489 	switch (opcode) {
490 	case CPL_SET_TCB_RPL:
491 		loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie];
492 		break;
493 	case CPL_L2T_WRITE_RPL:
494 		loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie];
495 		break;
496 	case CPL_ACT_OPEN_RPL:
497 		loc = (uintptr_t *)&act_open_rpl_handlers[cookie];
498 		break;
499 	case CPL_ABORT_RPL_RSS:
500 		loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie];
501 		break;
502 	case CPL_FW4_ACK:
503 		loc = (uintptr_t *)&fw4_ack_handlers[cookie];
504 		break;
505 	default:
506 		MPASS(0);
507 		return;
508 	}
509 	MPASS(h == NULL || *loc == (uintptr_t)NULL);
510 	atomic_store_rel_ptr(loc, (uintptr_t)h);
511 }
512 
513 /*
514  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
515  */
516 void
517 t4_sge_modload(void)
518 {
519 
520 	if (fl_pktshift < 0 || fl_pktshift > 7) {
521 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
522 		    " using 0 instead.\n", fl_pktshift);
523 		fl_pktshift = 0;
524 	}
525 
526 	if (spg_len != 64 && spg_len != 128) {
527 		int len;
528 
529 #if defined(__i386__) || defined(__amd64__)
530 		len = cpu_clflush_line_size > 64 ? 128 : 64;
531 #else
532 		len = 64;
533 #endif
534 		if (spg_len != -1) {
535 			printf("Invalid hw.cxgbe.spg_len value (%d),"
536 			    " using %d instead.\n", spg_len, len);
537 		}
538 		spg_len = len;
539 	}
540 
541 	if (cong_drop < -1 || cong_drop > 1) {
542 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
543 		    " using 0 instead.\n", cong_drop);
544 		cong_drop = 0;
545 	}
546 
547 	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
548 		printf("Invalid hw.cxgbe.tscale value (%d),"
549 		    " using 1 instead.\n", tscale);
550 		tscale = 1;
551 	}
552 
553 	if (largest_rx_cluster != MCLBYTES &&
554 #if MJUMPAGESIZE != MCLBYTES
555 	    largest_rx_cluster != MJUMPAGESIZE &&
556 #endif
557 	    largest_rx_cluster != MJUM9BYTES &&
558 	    largest_rx_cluster != MJUM16BYTES) {
559 		printf("Invalid hw.cxgbe.largest_rx_cluster value (%d),"
560 		    " using %d instead.\n", largest_rx_cluster, MJUM16BYTES);
561 		largest_rx_cluster = MJUM16BYTES;
562 	}
563 
564 	if (safest_rx_cluster != MCLBYTES &&
565 #if MJUMPAGESIZE != MCLBYTES
566 	    safest_rx_cluster != MJUMPAGESIZE &&
567 #endif
568 	    safest_rx_cluster != MJUM9BYTES &&
569 	    safest_rx_cluster != MJUM16BYTES) {
570 		printf("Invalid hw.cxgbe.safest_rx_cluster value (%d),"
571 		    " using %d instead.\n", safest_rx_cluster, MJUMPAGESIZE);
572 		safest_rx_cluster = MJUMPAGESIZE;
573 	}
574 
575 	extfree_refs = counter_u64_alloc(M_WAITOK);
576 	extfree_rels = counter_u64_alloc(M_WAITOK);
577 	pullups = counter_u64_alloc(M_WAITOK);
578 	defrags = counter_u64_alloc(M_WAITOK);
579 	counter_u64_zero(extfree_refs);
580 	counter_u64_zero(extfree_rels);
581 	counter_u64_zero(pullups);
582 	counter_u64_zero(defrags);
583 
584 	t4_init_shared_cpl_handlers();
585 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
586 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
587 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
588 #ifdef RATELIMIT
589 	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
590 	    CPL_COOKIE_ETHOFLD);
591 #endif
592 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
593 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
594 }
595 
596 void
597 t4_sge_modunload(void)
598 {
599 
600 	counter_u64_free(extfree_refs);
601 	counter_u64_free(extfree_rels);
602 	counter_u64_free(pullups);
603 	counter_u64_free(defrags);
604 }
605 
606 uint64_t
607 t4_sge_extfree_refs(void)
608 {
609 	uint64_t refs, rels;
610 
611 	rels = counter_u64_fetch(extfree_rels);
612 	refs = counter_u64_fetch(extfree_refs);
613 
614 	return (refs - rels);
615 }
616 
617 /* max 4096 */
618 #define MAX_PACK_BOUNDARY 512
619 
620 static inline void
621 setup_pad_and_pack_boundaries(struct adapter *sc)
622 {
623 	uint32_t v, m;
624 	int pad, pack, pad_shift;
625 
626 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
627 	    X_INGPADBOUNDARY_SHIFT;
628 	pad = fl_pad;
629 	if (fl_pad < (1 << pad_shift) ||
630 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
631 	    !powerof2(fl_pad)) {
632 		/*
633 		 * If there is any chance that we might use buffer packing and
634 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
635 		 * it to the minimum allowed in all other cases.
636 		 */
637 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
638 
639 		/*
640 		 * For fl_pad = 0 we'll still write a reasonable value to the
641 		 * register but all the freelists will opt out of padding.
642 		 * We'll complain here only if the user tried to set it to a
643 		 * value greater than 0 that was invalid.
644 		 */
645 		if (fl_pad > 0) {
646 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
647 			    " (%d), using %d instead.\n", fl_pad, pad);
648 		}
649 	}
650 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
651 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
652 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
653 
654 	if (is_t4(sc)) {
655 		if (fl_pack != -1 && fl_pack != pad) {
656 			/* Complain but carry on. */
657 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
658 			    " using %d instead.\n", fl_pack, pad);
659 		}
660 		return;
661 	}
662 
663 	pack = fl_pack;
664 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
665 	    !powerof2(fl_pack)) {
666 		if (sc->params.pci.mps > MAX_PACK_BOUNDARY)
667 			pack = MAX_PACK_BOUNDARY;
668 		else
669 			pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
670 		MPASS(powerof2(pack));
671 		if (pack < 16)
672 			pack = 16;
673 		if (pack == 32)
674 			pack = 64;
675 		if (pack > 4096)
676 			pack = 4096;
677 		if (fl_pack != -1) {
678 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
679 			    " (%d), using %d instead.\n", fl_pack, pack);
680 		}
681 	}
682 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
683 	if (pack == 16)
684 		v = V_INGPACKBOUNDARY(0);
685 	else
686 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
687 
688 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
689 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
690 }
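
/*
 * Worked example (illustrative): if pad_shift is 5 and the chosen pad is 128,
 * the code above writes V_INGPADBOUNDARY(ilog2(128) - 5) = V_INGPADBOUNDARY(2),
 * i.e. the field encodes the boundary as 1 << (field + pad_shift).  Similarly,
 * a pack boundary of 64 on T5+ is written as V_INGPACKBOUNDARY(ilog2(64) - 5) =
 * V_INGPACKBOUNDARY(1), with the field value 0 reserved for the special 16B
 * boundary.
 */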
691 
692 /*
693  * sc->params.vpd.cclk must be set up before this is called.
694  */
695 void
696 t4_tweak_chip_settings(struct adapter *sc)
697 {
698 	int i, reg;
699 	uint32_t v, m;
700 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
701 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
702 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
703 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
704 	static int sw_buf_sizes[] = {
705 		MCLBYTES,
706 #if MJUMPAGESIZE != MCLBYTES
707 		MJUMPAGESIZE,
708 #endif
709 		MJUM9BYTES,
710 		MJUM16BYTES
711 	};
712 
713 	KASSERT(sc->flags & MASTER_PF,
714 	    ("%s: trying to change chip settings when not master.", __func__));
715 
716 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
717 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
718 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
719 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
720 
721 	setup_pad_and_pack_boundaries(sc);
722 
723 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
724 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
725 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
726 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
727 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
728 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
729 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
730 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
731 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
732 
733 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
734 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
735 	reg = A_SGE_FL_BUFFER_SIZE2;
736 	for (i = 0; i < nitems(sw_buf_sizes); i++) {
737 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
738 		t4_write_reg(sc, reg, sw_buf_sizes[i]);
739 		reg += 4;
740 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
741 		t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE);
742 		reg += 4;
743 	}
744 
745 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
746 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
747 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
748 
749 	KASSERT(intr_timer[0] <= timer_max,
750 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
751 	    timer_max));
752 	for (i = 1; i < nitems(intr_timer); i++) {
753 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
754 		    ("%s: timers not listed in increasing order (%d)",
755 		    __func__, i));
756 
757 		while (intr_timer[i] > timer_max) {
758 			if (i == nitems(intr_timer) - 1) {
759 				intr_timer[i] = timer_max;
760 				break;
761 			}
762 			intr_timer[i] += intr_timer[i - 1];
763 			intr_timer[i] /= 2;
764 		}
765 	}
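	/*
	 * At this point every holdoff timer fits in the TIMERVALUE fields:
	 * any entry that exceeded timer_max was averaged with its smaller
	 * neighbour until it fit, except the last one, which is simply
	 * clamped to timer_max.
	 */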
766 
767 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
768 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
769 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
770 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
771 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
772 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
773 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
774 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
775 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
776 
777 	if (chip_id(sc) >= CHELSIO_T6) {
778 		m = V_TSCALE(M_TSCALE);
779 		if (tscale == 1)
780 			v = 0;
781 		else
782 			v = V_TSCALE(tscale - 2);
783 		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
784 
785 		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
786 			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
787 			    V_WRTHRTHRESH(M_WRTHRTHRESH);
788 			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
789 			v &= ~m;
790 			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
791 			    V_WRTHRTHRESH(16);
792 			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
793 		}
794 	}
795 
796 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
797 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
798 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
799 
800 	/*
801 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
802 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
803 	 * may have to deal with is MAXPHYS + 1 page.
804 	 */
805 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
806 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
807 
808 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
809 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
810 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
811 
812 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
813 	    F_RESETDDPOFFSET;
814 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
815 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
816 }
817 
818 /*
819  * SGE wants the buffer to be at least 64B and a multiple of 16.  Its address
820  * must be 16B aligned.  If padding is in use the buffer's start and end need
821  * to be aligned to the pad boundary as well.  We'll just make sure that the
822  * size is a multiple of the pad boundary here; it is up to the buffer
823  * allocation code to make sure the start of the buffer is aligned.
824  */
825 static inline int
826 hwsz_ok(struct adapter *sc, int hwsz)
827 {
828 	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
829 
830 	return (hwsz >= 64 && (hwsz & mask) == 0);
831 }
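
/*
 * Example: with padding enabled and a 64B pad boundary, a hardware buffer size
 * of 2048 is acceptable but 2000 is not (it is not a multiple of 64); with
 * padding disabled 2000 is fine because the size only has to be a multiple
 * of 16.
 */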
832 
833 /*
834  * Initialize the rx buffer sizes and figure out which zones the buffers will
835  * be allocated from.
836  */
837 void
838 t4_init_rx_buf_info(struct adapter *sc)
839 {
840 	struct sge *s = &sc->sge;
841 	struct sge_params *sp = &sc->params.sge;
842 	int i, j, n;
843 	static int sw_buf_sizes[] = {	/* Sorted by size */
844 		MCLBYTES,
845 #if MJUMPAGESIZE != MCLBYTES
846 		MJUMPAGESIZE,
847 #endif
848 		MJUM9BYTES,
849 		MJUM16BYTES
850 	};
851 	struct rx_buf_info *rxb;
852 
853 	s->safe_zidx = -1;
854 	rxb = &s->rx_buf_info[0];
855 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
856 		rxb->size1 = sw_buf_sizes[i];
857 		rxb->zone = m_getzone(rxb->size1);
858 		rxb->type = m_gettype(rxb->size1);
859 		rxb->size2 = 0;
860 		rxb->hwidx1 = -1;
861 		rxb->hwidx2 = -1;
862 		for (j = 0; j < SGE_FLBUF_SIZES; j++) {
863 			int hwsize = sp->sge_fl_buffer_size[j];
864 
865 			if (!hwsz_ok(sc, hwsize))
866 				continue;
867 
868 			/* hwidx for size1 */
869 			if (rxb->hwidx1 == -1 && rxb->size1 == hwsize)
870 				rxb->hwidx1 = j;
871 
872 			/* hwidx for size2 (buffer packing) */
873 			if (rxb->size1 - CL_METADATA_SIZE < hwsize)
874 				continue;
875 			n = rxb->size1 - hwsize - CL_METADATA_SIZE;
876 			if (n == 0) {
877 				rxb->hwidx2 = j;
878 				rxb->size2 = hwsize;
879 				break;	/* stop looking */
880 			}
881 			if (rxb->hwidx2 != -1) {
882 				if (n < sp->sge_fl_buffer_size[rxb->hwidx2] -
883 				    hwsize - CL_METADATA_SIZE) {
884 					rxb->hwidx2 = j;
885 					rxb->size2 = hwsize;
886 				}
887 			} else if (n <= 2 * CL_METADATA_SIZE) {
888 				rxb->hwidx2 = j;
889 				rxb->size2 = hwsize;
890 			}
891 		}
892 		if (rxb->hwidx2 != -1)
893 			sc->flags |= BUF_PACKING_OK;
894 		if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster)
895 			s->safe_zidx = i;
896 	}
897 }
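
/*
 * Example of the mapping built above: for a cluster zone whose size1 is
 * MJUMPAGESIZE, hwidx1 is the hardware buffer size that matches the full
 * cluster, while hwidx2/size2, if set, is the hardware buffer size that comes
 * closest to filling the cluster while still leaving room for the cluster
 * metadata at the tail, which is what makes buffer packing possible
 * (BUF_PACKING_OK).
 */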
898 
899 /*
900  * Verify some basic SGE settings for the PF and VF driver, and other
901  * miscellaneous settings for the PF driver.
902  */
903 int
904 t4_verify_chip_settings(struct adapter *sc)
905 {
906 	struct sge_params *sp = &sc->params.sge;
907 	uint32_t m, v, r;
908 	int rc = 0;
909 	const uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
910 
911 	m = F_RXPKTCPLMODE;
912 	v = F_RXPKTCPLMODE;
913 	r = sp->sge_control;
914 	if ((r & m) != v) {
915 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
916 		rc = EINVAL;
917 	}
918 
919 	/*
920 	 * If this changes then every single use of PAGE_SHIFT in the driver
921 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
922 	 */
923 	if (sp->page_shift != PAGE_SHIFT) {
924 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
925 		rc = EINVAL;
926 	}
927 
928 	if (sc->flags & IS_VF)
929 		return (0);
930 
931 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
932 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
933 	if (r != v) {
934 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
935 		if (sc->vres.ddp.size != 0)
936 			rc = EINVAL;
937 	}
938 
939 	m = v = F_TDDPTAGTCB;
940 	r = t4_read_reg(sc, A_ULP_RX_CTL);
941 	if ((r & m) != v) {
942 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
943 		if (sc->vres.ddp.size != 0)
944 			rc = EINVAL;
945 	}
946 
947 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
948 	    F_RESETDDPOFFSET;
949 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
950 	r = t4_read_reg(sc, A_TP_PARA_REG5);
951 	if ((r & m) != v) {
952 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
953 		if (sc->vres.ddp.size != 0)
954 			rc = EINVAL;
955 	}
956 
957 	return (rc);
958 }
959 
960 int
961 t4_create_dma_tag(struct adapter *sc)
962 {
963 	int rc;
964 
965 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
966 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
967 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
968 	    NULL, &sc->dmat);
969 	if (rc != 0) {
970 		device_printf(sc->dev,
971 		    "failed to create main DMA tag: %d\n", rc);
972 	}
973 
974 	return (rc);
975 }
976 
977 void
978 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
979     struct sysctl_oid_list *children)
980 {
981 	struct sge_params *sp = &sc->params.sge;
982 
983 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
984 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
985 	    sysctl_bufsizes, "A", "freelist buffer sizes");
986 
987 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
988 	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
989 
990 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
991 	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
992 
993 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
994 	    NULL, sp->spg_len, "status page size (bytes)");
995 
996 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
997 	    NULL, cong_drop, "congestion drop setting");
998 
999 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
1000 	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
1001 }
1002 
1003 int
1004 t4_destroy_dma_tag(struct adapter *sc)
1005 {
1006 	if (sc->dmat)
1007 		bus_dma_tag_destroy(sc->dmat);
1008 
1009 	return (0);
1010 }
1011 
1012 /*
1013  * Allocate and initialize the firmware event queue, control queues, and special
1014  * purpose rx queues owned by the adapter.
1015  *
1016  * Returns errno on failure.  Resources allocated up to that point may still be
1017  * allocated.  Caller is responsible for cleanup in case this function fails.
1018  */
1019 int
1020 t4_setup_adapter_queues(struct adapter *sc)
1021 {
1022 	struct sysctl_oid *oid;
1023 	struct sysctl_oid_list *children;
1024 	int rc, i;
1025 
1026 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
1027 
1028 	sysctl_ctx_init(&sc->ctx);
1029 	sc->flags |= ADAP_SYSCTL_CTX;
1030 
1031 	/*
1032 	 * Firmware event queue
1033 	 */
1034 	rc = alloc_fwq(sc);
1035 	if (rc != 0)
1036 		return (rc);
1037 
1038 	/*
1039 	 * That's all for the VF driver.
1040 	 */
1041 	if (sc->flags & IS_VF)
1042 		return (rc);
1043 
1044 	oid = device_get_sysctl_tree(sc->dev);
1045 	children = SYSCTL_CHILDREN(oid);
1046 
1047 	/*
1048 	 * XXX: General purpose rx queues, one per port.
1049 	 */
1050 
1051 	/*
1052 	 * Control queues, one per port.
1053 	 */
1054 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "ctrlq",
1055 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues");
1056 	for_each_port(sc, i) {
1057 		struct sge_wrq *ctrlq = &sc->sge.ctrlq[i];
1058 
1059 		rc = alloc_ctrlq(sc, ctrlq, i, oid);
1060 		if (rc != 0)
1061 			return (rc);
1062 	}
1063 
1064 	return (rc);
1065 }
1066 
1067 /*
1068  * Idempotent
1069  */
1070 int
1071 t4_teardown_adapter_queues(struct adapter *sc)
1072 {
1073 	int i;
1074 
1075 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
1076 
1077 	/* Do this before freeing the queue */
1078 	if (sc->flags & ADAP_SYSCTL_CTX) {
1079 		sysctl_ctx_free(&sc->ctx);
1080 		sc->flags &= ~ADAP_SYSCTL_CTX;
1081 	}
1082 
1083 	if (!(sc->flags & IS_VF)) {
1084 		for_each_port(sc, i)
1085 			free_wrq(sc, &sc->sge.ctrlq[i]);
1086 	}
1087 	free_fwq(sc);
1088 
1089 	return (0);
1090 }
1091 
1092 /* Maximum payload that could arrive with a single iq descriptor. */
1093 static inline int
1094 max_rx_payload(struct adapter *sc, struct ifnet *ifp, const bool ofld)
1095 {
1096 	int maxp;
1097 
1098 	/* large enough even when hw VLAN extraction is disabled */
1099 	maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
1100 	    ETHER_VLAN_ENCAP_LEN + ifp->if_mtu;
1101 	if (ofld && sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS &&
1102 	    maxp < sc->params.tp.max_rx_pdu)
1103 		maxp = sc->params.tp.max_rx_pdu;
1104 	return (maxp);
1105 }
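
/*
 * Example: with fl_pktshift = 2 and a 1500 byte MTU, maxp for a NIC queue is
 * 2 + ETHER_HDR_LEN (14) + ETHER_VLAN_ENCAP_LEN (4) + 1500 = 1520 bytes; an
 * offload queue may raise this to the TP's maximum rx PDU when TLS rx key
 * support is available.
 */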
1106 
1107 int
1108 t4_setup_vi_queues(struct vi_info *vi)
1109 {
1110 	int rc = 0, i, intr_idx, iqidx;
1111 	struct sge_rxq *rxq;
1112 	struct sge_txq *txq;
1113 #ifdef TCP_OFFLOAD
1114 	struct sge_ofld_rxq *ofld_rxq;
1115 #endif
1116 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1117 	struct sge_ofld_txq *ofld_txq;
1118 #endif
1119 #ifdef DEV_NETMAP
1120 	int saved_idx;
1121 	struct sge_nm_rxq *nm_rxq;
1122 	struct sge_nm_txq *nm_txq;
1123 #endif
1124 	char name[16];
1125 	struct port_info *pi = vi->pi;
1126 	struct adapter *sc = pi->adapter;
1127 	struct ifnet *ifp = vi->ifp;
1128 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
1129 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
1130 	int maxp;
1131 
1132 	/* Interrupt vector to start from (when using multiple vectors) */
1133 	intr_idx = vi->first_intr;
1134 
1135 #ifdef DEV_NETMAP
1136 	saved_idx = intr_idx;
1137 	if (ifp->if_capabilities & IFCAP_NETMAP) {
1138 
1139 		/* netmap is supported with direct interrupts only. */
1140 		MPASS(!forwarding_intr_to_fwq(sc));
1141 
1142 		/*
1143 		 * We don't have buffers to back the netmap rx queues
1144 		 * right now so we create the queues in a way that
1145 		 * doesn't set off any congestion signal in the chip.
1146 		 */
1147 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
1148 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues");
1149 		for_each_nm_rxq(vi, i, nm_rxq) {
1150 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
1151 			if (rc != 0)
1152 				goto done;
1153 			intr_idx++;
1154 		}
1155 
1156 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
1157 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues");
1158 		for_each_nm_txq(vi, i, nm_txq) {
1159 			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
1160 			rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid);
1161 			if (rc != 0)
1162 				goto done;
1163 		}
1164 	}
1165 
1166 	/* Normal rx queues and netmap rx queues share the same interrupts. */
1167 	intr_idx = saved_idx;
1168 #endif
1169 
1170 	/*
1171 	 * Allocate rx queues first because a default iqid is required when
1172 	 * creating a tx queue.
1173 	 */
1174 	maxp = max_rx_payload(sc, ifp, false);
1175 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
1176 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues");
1177 	for_each_rxq(vi, i, rxq) {
1178 
1179 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
1180 
1181 		snprintf(name, sizeof(name), "%s rxq%d-fl",
1182 		    device_get_nameunit(vi->dev), i);
1183 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
1184 
1185 		rc = alloc_rxq(vi, rxq,
1186 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
1187 		if (rc != 0)
1188 			goto done;
1189 		intr_idx++;
1190 	}
1191 #ifdef DEV_NETMAP
1192 	if (ifp->if_capabilities & IFCAP_NETMAP)
1193 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
1194 #endif
1195 #ifdef TCP_OFFLOAD
1196 	maxp = max_rx_payload(sc, ifp, true);
1197 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
1198 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queues for offloaded TCP connections");
1199 	for_each_ofld_rxq(vi, i, ofld_rxq) {
1200 
1201 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
1202 		    vi->qsize_rxq);
1203 
1204 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
1205 		    device_get_nameunit(vi->dev), i);
1206 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
1207 
1208 		rc = alloc_ofld_rxq(vi, ofld_rxq,
1209 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
1210 		if (rc != 0)
1211 			goto done;
1212 		intr_idx++;
1213 	}
1214 #endif
1215 
1216 	/*
1217 	 * Now the tx queues.
1218 	 */
1219 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq",
1220 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues");
1221 	for_each_txq(vi, i, txq) {
1222 		iqidx = vi->first_rxq + (i % vi->nrxq);
1223 		snprintf(name, sizeof(name), "%s txq%d",
1224 		    device_get_nameunit(vi->dev), i);
1225 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
1226 		    sc->sge.rxq[iqidx].iq.cntxt_id, name);
1227 
1228 		rc = alloc_txq(vi, txq, i, oid);
1229 		if (rc != 0)
1230 			goto done;
1231 	}
1232 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1233 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
1234 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queues for TOE/ETHOFLD");
1235 	for_each_ofld_txq(vi, i, ofld_txq) {
1236 		snprintf(name, sizeof(name), "%s ofld_txq%d",
1237 		    device_get_nameunit(vi->dev), i);
1238 		if (vi->nofldrxq > 0) {
1239 			iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq);
1240 			init_eq(sc, &ofld_txq->wrq.eq, EQ_OFLD, vi->qsize_txq,
1241 			    pi->tx_chan, sc->sge.ofld_rxq[iqidx].iq.cntxt_id,
1242 			    name);
1243 		} else {
1244 			iqidx = vi->first_rxq + (i % vi->nrxq);
1245 			init_eq(sc, &ofld_txq->wrq.eq, EQ_OFLD, vi->qsize_txq,
1246 			    pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name);
1247 		}
1248 
1249 		rc = alloc_ofld_txq(vi, ofld_txq, i, oid);
1250 		if (rc != 0)
1251 			goto done;
1252 	}
1253 #endif
1254 done:
1255 	if (rc)
1256 		t4_teardown_vi_queues(vi);
1257 
1258 	return (rc);
1259 }
1260 
1261 /*
1262  * Idempotent
1263  */
1264 int
1265 t4_teardown_vi_queues(struct vi_info *vi)
1266 {
1267 	int i;
1268 	struct sge_rxq *rxq;
1269 	struct sge_txq *txq;
1270 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1271 	struct sge_ofld_txq *ofld_txq;
1272 #endif
1273 #ifdef TCP_OFFLOAD
1274 	struct sge_ofld_rxq *ofld_rxq;
1275 #endif
1276 #ifdef DEV_NETMAP
1277 	struct sge_nm_rxq *nm_rxq;
1278 	struct sge_nm_txq *nm_txq;
1279 #endif
1280 
1281 	/* Do this before freeing the queues */
1282 	if (vi->flags & VI_SYSCTL_CTX) {
1283 		sysctl_ctx_free(&vi->ctx);
1284 		vi->flags &= ~VI_SYSCTL_CTX;
1285 	}
1286 
1287 #ifdef DEV_NETMAP
1288 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
1289 		for_each_nm_txq(vi, i, nm_txq) {
1290 			free_nm_txq(vi, nm_txq);
1291 		}
1292 
1293 		for_each_nm_rxq(vi, i, nm_rxq) {
1294 			free_nm_rxq(vi, nm_rxq);
1295 		}
1296 	}
1297 #endif
1298 
1299 	/*
1300 	 * Take down all the tx queues first, as they reference the rx queues
1301 	 * (for egress updates, etc.).
1302 	 */
1303 
1304 	for_each_txq(vi, i, txq) {
1305 		free_txq(vi, txq);
1306 	}
1307 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1308 	for_each_ofld_txq(vi, i, ofld_txq) {
1309 		free_ofld_txq(vi, ofld_txq);
1310 	}
1311 #endif
1312 
1313 	/*
1314 	 * Then take down the rx queues.
1315 	 */
1316 
1317 	for_each_rxq(vi, i, rxq) {
1318 		free_rxq(vi, rxq);
1319 	}
1320 #ifdef TCP_OFFLOAD
1321 	for_each_ofld_rxq(vi, i, ofld_rxq) {
1322 		free_ofld_rxq(vi, ofld_rxq);
1323 	}
1324 #endif
1325 
1326 	return (0);
1327 }
1328 
1329 /*
1330  * Interrupt handler when the driver is using only 1 interrupt.  This is a very
1331  * unusual scenario.
1332  *
1333  * a) Deals with errors, if any.
1334  * b) Services firmware event queue, which is taking interrupts for all other
1335  *    queues.
1336  */
1337 void
1338 t4_intr_all(void *arg)
1339 {
1340 	struct adapter *sc = arg;
1341 	struct sge_iq *fwq = &sc->sge.fwq;
1342 
1343 	MPASS(sc->intr_count == 1);
1344 
1345 	if (sc->intr_type == INTR_INTX)
1346 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1347 
1348 	t4_intr_err(arg);
1349 	t4_intr_evt(fwq);
1350 }
1351 
1352 /*
1353  * Interrupt handler for errors (installed directly when multiple interrupts are
1354  * being used, or called by t4_intr_all).
1355  */
1356 void
1357 t4_intr_err(void *arg)
1358 {
1359 	struct adapter *sc = arg;
1360 	uint32_t v;
1361 	const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
1362 
1363 	if (sc->flags & ADAP_ERR)
1364 		return;
1365 
1366 	v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
1367 	if (v & F_PFSW) {
1368 		sc->swintr++;
1369 		t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v);
1370 	}
1371 
1372 	t4_slow_intr_handler(sc, verbose);
1373 }
1374 
1375 /*
1376  * Interrupt handler for iq-only queues.  The firmware event queue is the only
1377  * such queue right now.
1378  */
1379 void
1380 t4_intr_evt(void *arg)
1381 {
1382 	struct sge_iq *iq = arg;
1383 
1384 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1385 		service_iq(iq, 0);
1386 		(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1387 	}
1388 }
1389 
1390 /*
1391  * Interrupt handler for iq+fl queues.
1392  */
1393 void
1394 t4_intr(void *arg)
1395 {
1396 	struct sge_iq *iq = arg;
1397 
1398 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1399 		service_iq_fl(iq, 0);
1400 		(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1401 	}
1402 }
1403 
1404 #ifdef DEV_NETMAP
1405 /*
1406  * Interrupt handler for netmap rx queues.
1407  */
1408 void
1409 t4_nm_intr(void *arg)
1410 {
1411 	struct sge_nm_rxq *nm_rxq = arg;
1412 
1413 	if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) {
1414 		service_nm_rxq(nm_rxq);
1415 		(void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON);
1416 	}
1417 }
1418 
1419 /*
1420  * Interrupt handler for vectors shared between NIC and netmap rx queues.
1421  */
1422 void
1423 t4_vi_intr(void *arg)
1424 {
1425 	struct irq *irq = arg;
1426 
1427 	MPASS(irq->nm_rxq != NULL);
1428 	t4_nm_intr(irq->nm_rxq);
1429 
1430 	MPASS(irq->rxq != NULL);
1431 	t4_intr(irq->rxq);
1432 }
1433 #endif
1434 
1435 /*
1436  * Deals with interrupts on an iq-only (no freelist) queue.
1437  */
1438 static int
1439 service_iq(struct sge_iq *iq, int budget)
1440 {
1441 	struct sge_iq *q;
1442 	struct adapter *sc = iq->adapter;
1443 	struct iq_desc *d = &iq->desc[iq->cidx];
1444 	int ndescs = 0, limit;
1445 	int rsp_type;
1446 	uint32_t lq;
1447 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1448 
1449 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1450 	KASSERT((iq->flags & IQ_HAS_FL) == 0,
1451 	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
1452 	    iq->flags));
1453 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
1454 	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
1455 
1456 	limit = budget ? budget : iq->qsize / 16;
1457 
1458 	/*
1459 	 * We always come back and check the descriptor ring for new indirect
1460 	 * interrupts and other responses after running a single handler.
1461 	 */
1462 	for (;;) {
1463 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1464 
1465 			rmb();
1466 
1467 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1468 			lq = be32toh(d->rsp.pldbuflen_qid);
1469 
1470 			switch (rsp_type) {
1471 			case X_RSPD_TYPE_FLBUF:
1472 				panic("%s: data for an iq (%p) with no freelist",
1473 				    __func__, iq);
1474 
1475 				/* NOTREACHED */
1476 
1477 			case X_RSPD_TYPE_CPL:
1478 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1479 				    ("%s: bad opcode %02x.", __func__,
1480 				    d->rss.opcode));
1481 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
1482 				break;
1483 
1484 			case X_RSPD_TYPE_INTR:
1485 				/*
1486 				 * There are 1K interrupt-capable queues (qids 0
1487 				 * through 1023).  A response type indicating a
1488 				 * forwarded interrupt with a qid >= 1K is an
1489 				 * iWARP async notification.
1490 				 */
1491 				if (__predict_true(lq >= 1024)) {
1492 					t4_an_handler(iq, &d->rsp);
1493 					break;
1494 				}
1495 
1496 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
1497 				    sc->sge.iq_base];
1498 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1499 				    IQS_BUSY)) {
1500 					if (service_iq_fl(q, q->qsize / 16) == 0) {
1501 						(void) atomic_cmpset_int(&q->state,
1502 						    IQS_BUSY, IQS_IDLE);
1503 					} else {
1504 						STAILQ_INSERT_TAIL(&iql, q,
1505 						    link);
1506 					}
1507 				}
1508 				break;
1509 
1510 			default:
1511 				KASSERT(0,
1512 				    ("%s: illegal response type %d on iq %p",
1513 				    __func__, rsp_type, iq));
1514 				log(LOG_ERR,
1515 				    "%s: illegal response type %d on iq %p",
1516 				    device_get_nameunit(sc->dev), rsp_type, iq);
1517 				break;
1518 			}
1519 
1520 			d++;
1521 			if (__predict_false(++iq->cidx == iq->sidx)) {
1522 				iq->cidx = 0;
1523 				iq->gen ^= F_RSPD_GEN;
1524 				d = &iq->desc[0];
1525 			}
1526 			if (__predict_false(++ndescs == limit)) {
1527 				t4_write_reg(sc, sc->sge_gts_reg,
1528 				    V_CIDXINC(ndescs) |
1529 				    V_INGRESSQID(iq->cntxt_id) |
1530 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1531 				ndescs = 0;
1532 
1533 				if (budget) {
1534 					return (EINPROGRESS);
1535 				}
1536 			}
1537 		}
1538 
1539 		if (STAILQ_EMPTY(&iql))
1540 			break;
1541 
1542 		/*
1543 		 * Process the head only, and send it to the back of the list if
1544 		 * it's still not done.
1545 		 */
1546 		q = STAILQ_FIRST(&iql);
1547 		STAILQ_REMOVE_HEAD(&iql, link);
1548 		if (service_iq_fl(q, q->qsize / 8) == 0)
1549 			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1550 		else
1551 			STAILQ_INSERT_TAIL(&iql, q, link);
1552 	}
1553 
1554 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1555 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1556 
1557 	return (0);
1558 }
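
/*
 * Note on the return values of service_iq (and service_iq_fl below): 0 means
 * the queue was drained and the caller may flip it back to IQS_IDLE;
 * EINPROGRESS is returned only when a budget was given and exhausted, telling
 * the caller that work remains and the queue must stay IQS_BUSY.
 */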
1559 
1560 static inline int
1561 sort_before_lro(struct lro_ctrl *lro)
1562 {
1563 
1564 	return (lro->lro_mbuf_max != 0);
1565 }
1566 
1567 static inline uint64_t
1568 last_flit_to_ns(struct adapter *sc, uint64_t lf)
1569 {
1570 	uint64_t n = be64toh(lf) & 0xfffffffffffffff;	/* 60b, not 64b. */
1571 
1572 	if (n > UINT64_MAX / 1000000)
1573 		return (n / sc->params.vpd.cclk * 1000000);
1574 	else
1575 		return (n * 1000000 / sc->params.vpd.cclk);
1576 }
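
/*
 * last_flit_to_ns: cclk (in kHz here) converts core-clock ticks to
 * nanoseconds as ticks * 1000000 / cclk.  The two branches avoid 64-bit
 * overflow: small timestamps are multiplied first for precision, very large
 * ones are divided first so that n * 1000000 cannot wrap.
 */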
1577 
1578 static inline void
1579 move_to_next_rxbuf(struct sge_fl *fl)
1580 {
1581 
1582 	fl->rx_offset = 0;
1583 	if (__predict_false((++fl->cidx & 7) == 0)) {
1584 		uint16_t cidx = fl->cidx >> 3;
1585 
1586 		if (__predict_false(cidx == fl->sidx))
1587 			fl->cidx = cidx = 0;
1588 		fl->hw_cidx = cidx;
1589 	}
1590 }
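
/*
 * Note: fl->cidx counts individual rx buffers while fl->hw_cidx and fl->sidx
 * are tracked at one eighth of that resolution; the "& 7" / ">> 3" arithmetic
 * above advances the hardware-visible index once for every eight buffers
 * consumed.
 */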
1591 
1592 /*
1593  * Deals with interrupts on an iq+fl queue.
1594  */
1595 static int
1596 service_iq_fl(struct sge_iq *iq, int budget)
1597 {
1598 	struct sge_rxq *rxq = iq_to_rxq(iq);
1599 	struct sge_fl *fl;
1600 	struct adapter *sc = iq->adapter;
1601 	struct iq_desc *d = &iq->desc[iq->cidx];
1602 	int ndescs, limit;
1603 	int rsp_type, starved;
1604 	uint32_t lq;
1605 	uint16_t fl_hw_cidx;
1606 	struct mbuf *m0;
1607 #if defined(INET) || defined(INET6)
1608 	const struct timeval lro_timeout = {0, sc->lro_timeout};
1609 	struct lro_ctrl *lro = &rxq->lro;
1610 #endif
1611 
1612 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1613 	MPASS(iq->flags & IQ_HAS_FL);
1614 
1615 	ndescs = 0;
1616 #if defined(INET) || defined(INET6)
1617 	if (iq->flags & IQ_ADJ_CREDIT) {
1618 		MPASS(sort_before_lro(lro));
1619 		iq->flags &= ~IQ_ADJ_CREDIT;
1620 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
1621 			tcp_lro_flush_all(lro);
1622 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
1623 			    V_INGRESSQID((u32)iq->cntxt_id) |
1624 			    V_SEINTARM(iq->intr_params));
1625 			return (0);
1626 		}
1627 		ndescs = 1;
1628 	}
1629 #else
1630 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
1631 #endif
1632 
1633 	limit = budget ? budget : iq->qsize / 16;
1634 	fl = &rxq->fl;
1635 	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
1636 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1637 
1638 		rmb();
1639 
1640 		m0 = NULL;
1641 		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1642 		lq = be32toh(d->rsp.pldbuflen_qid);
1643 
1644 		switch (rsp_type) {
1645 		case X_RSPD_TYPE_FLBUF:
1646 			if (lq & F_RSPD_NEWBUF) {
1647 				if (fl->rx_offset > 0)
1648 					move_to_next_rxbuf(fl);
1649 				lq = G_RSPD_LEN(lq);
1650 			}
1651 			if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) {
1652 				FL_LOCK(fl);
1653 				refill_fl(sc, fl, 64);
1654 				FL_UNLOCK(fl);
1655 				fl_hw_cidx = fl->hw_cidx;
1656 			}
1657 
1658 			if (d->rss.opcode == CPL_RX_PKT) {
1659 				if (__predict_true(eth_rx(sc, rxq, d, lq) == 0))
1660 					break;
1661 				goto out;
1662 			}
1663 			m0 = get_fl_payload(sc, fl, lq);
1664 			if (__predict_false(m0 == NULL))
1665 				goto out;
1666 
1667 			/* fall through */
1668 
1669 		case X_RSPD_TYPE_CPL:
1670 			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1671 			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
1672 			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1673 			break;
1674 
1675 		case X_RSPD_TYPE_INTR:
1676 
1677 			/*
1678 			 * There are 1K interrupt-capable queues (qids 0
1679 			 * through 1023).  A response type indicating a
1680 			 * forwarded interrupt with a qid >= 1K is an
1681 			 * iWARP async notification.  That is the only
1682 			 * acceptable indirect interrupt on this queue.
1683 			 */
1684 			if (__predict_false(lq < 1024)) {
1685 				panic("%s: indirect interrupt on iq_fl %p "
1686 				    "with qid %u", __func__, iq, lq);
1687 			}
1688 
1689 			t4_an_handler(iq, &d->rsp);
1690 			break;
1691 
1692 		default:
1693 			KASSERT(0, ("%s: illegal response type %d on iq %p",
1694 			    __func__, rsp_type, iq));
1695 			log(LOG_ERR, "%s: illegal response type %d on iq %p",
1696 			    device_get_nameunit(sc->dev), rsp_type, iq);
1697 			break;
1698 		}
1699 
1700 		d++;
1701 		if (__predict_false(++iq->cidx == iq->sidx)) {
1702 			iq->cidx = 0;
1703 			iq->gen ^= F_RSPD_GEN;
1704 			d = &iq->desc[0];
1705 		}
1706 		if (__predict_false(++ndescs == limit)) {
1707 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1708 			    V_INGRESSQID(iq->cntxt_id) |
1709 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1710 
1711 #if defined(INET) || defined(INET6)
1712 			if (iq->flags & IQ_LRO_ENABLED &&
1713 			    !sort_before_lro(lro) &&
1714 			    sc->lro_timeout != 0) {
1715 				tcp_lro_flush_inactive(lro, &lro_timeout);
1716 			}
1717 #endif
1718 			if (budget)
1719 				return (EINPROGRESS);
1720 			ndescs = 0;
1721 		}
1722 	}
1723 out:
1724 #if defined(INET) || defined(INET6)
1725 	if (iq->flags & IQ_LRO_ENABLED) {
1726 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
1727 			MPASS(sort_before_lro(lro));
1728 			/* hold back one credit and don't flush LRO state */
1729 			iq->flags |= IQ_ADJ_CREDIT;
1730 			ndescs--;
1731 		} else {
1732 			tcp_lro_flush_all(lro);
1733 		}
1734 	}
1735 #endif
1736 
1737 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1738 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1739 
1740 	FL_LOCK(fl);
1741 	starved = refill_fl(sc, fl, 64);
1742 	FL_UNLOCK(fl);
1743 	if (__predict_false(starved != 0))
1744 		add_fl_to_sfl(sc, fl);
1745 
1746 	return (0);
1747 }
1748 
1749 static inline struct cluster_metadata *
1750 cl_metadata(struct fl_sdesc *sd)
1751 {
1752 
1753 	return ((void *)(sd->cl + sd->moff));
1754 }
1755 
1756 static void
1757 rxb_free(struct mbuf *m)
1758 {
1759 	struct cluster_metadata *clm = m->m_ext.ext_arg1;
1760 
1761 	uma_zfree(clm->zone, clm->cl);
1762 	counter_u64_add(extfree_rels, 1);
1763 }
1764 
1765 /*
1766  * The mbuf returned comes from zone_mbuf and carries the payload in one of
1767  * these ways:
1768  * a) complete frame inside the mbuf
1769  * b) m_cljset (for clusters without metadata)
1770  * c) m_extaddref (cluster with metadata)
1771  */
1772 static struct mbuf *
1773 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1774     int remaining)
1775 {
1776 	struct mbuf *m;
1777 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1778 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
1779 	struct cluster_metadata *clm;
1780 	int len, blen;
1781 	caddr_t payload;
1782 
1783 	if (fl->flags & FL_BUF_PACKING) {
1784 		u_int l, pad;
1785 
1786 		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
1787 		len = min(remaining, blen);
1788 		payload = sd->cl + fl->rx_offset;
1789 
1790 		l = fr_offset + len;
1791 		pad = roundup2(l, fl->buf_boundary) - l;
1792 		if (fl->rx_offset + len + pad < rxb->size2)
1793 			blen = len + pad;
1794 		MPASS(fl->rx_offset + blen <= rxb->size2);
1795 	} else {
1796 		MPASS(fl->rx_offset == 0);	/* not packing */
1797 		blen = rxb->size1;
1798 		len = min(remaining, blen);
1799 		payload = sd->cl;
1800 	}
1801 
1802 	if (fr_offset == 0) {
1803 		m = m_gethdr(M_NOWAIT, MT_DATA);
1804 		if (__predict_false(m == NULL))
1805 			return (NULL);
1806 		m->m_pkthdr.len = remaining;
1807 	} else {
1808 		m = m_get(M_NOWAIT, MT_DATA);
1809 		if (__predict_false(m == NULL))
1810 			return (NULL);
1811 	}
1812 	m->m_len = len;
1813 
1814 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1815 		/* copy data to mbuf */
1816 		bcopy(payload, mtod(m, caddr_t), len);
1817 		if (fl->flags & FL_BUF_PACKING) {
1818 			fl->rx_offset += blen;
1819 			MPASS(fl->rx_offset <= rxb->size2);
1820 			if (fl->rx_offset < rxb->size2)
1821 				return (m);	/* without advancing the cidx */
1822 		}
1823 	} else if (fl->flags & FL_BUF_PACKING) {
1824 		clm = cl_metadata(sd);
1825 		if (sd->nmbuf++ == 0) {
1826 			clm->refcount = 1;
1827 			clm->zone = rxb->zone;
1828 			clm->cl = sd->cl;
1829 			counter_u64_add(extfree_refs, 1);
1830 		}
1831 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm,
1832 		    NULL);
1833 
1834 		fl->rx_offset += blen;
1835 		MPASS(fl->rx_offset <= rxb->size2);
1836 		if (fl->rx_offset < rxb->size2)
1837 			return (m);	/* without advancing the cidx */
1838 	} else {
1839 		m_cljset(m, sd->cl, rxb->type);
1840 		sd->cl = NULL;	/* consumed, not a recycle candidate */
1841 	}
1842 
1843 	move_to_next_rxbuf(fl);
1844 
1845 	return (m);
1846 }
1847 
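/*
 * Assembles the mbuf chain for one received frame of 'plen' bytes.  If an
 * mbuf allocation fails partway through, the partial chain is parked in the
 * fl (FL_BUF_RESUME) and assembly picks up where it left off on a later call.
 */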
1848 static struct mbuf *
1849 get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen)
1850 {
1851 	struct mbuf *m0, *m, **pnext;
1852 	u_int remaining;
1853 
1854 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1855 		M_ASSERTPKTHDR(fl->m0);
1856 		MPASS(fl->m0->m_pkthdr.len == plen);
1857 		MPASS(fl->remaining < plen);
1858 
1859 		m0 = fl->m0;
1860 		pnext = fl->pnext;
1861 		remaining = fl->remaining;
1862 		fl->flags &= ~FL_BUF_RESUME;
1863 		goto get_segment;
1864 	}
1865 
1866 	/*
1867 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
1868 	 * 'len' and it may span multiple hw buffers.
1869 	 */
1870 
1871 	m0 = get_scatter_segment(sc, fl, 0, plen);
1872 	if (m0 == NULL)
1873 		return (NULL);
1874 	remaining = plen - m0->m_len;
1875 	pnext = &m0->m_next;
1876 	while (remaining > 0) {
1877 get_segment:
1878 		MPASS(fl->rx_offset == 0);
1879 		m = get_scatter_segment(sc, fl, plen - remaining, remaining);
1880 		if (__predict_false(m == NULL)) {
1881 			fl->m0 = m0;
1882 			fl->pnext = pnext;
1883 			fl->remaining = remaining;
1884 			fl->flags |= FL_BUF_RESUME;
1885 			return (NULL);
1886 		}
1887 		*pnext = m;
1888 		pnext = &m->m_next;
1889 		remaining -= m->m_len;
1890 	}
1891 	*pnext = NULL;
1892 
1893 	M_ASSERTPKTHDR(m0);
1894 	return (m0);
1895 }
1896 
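/*
 * Advances the freelist state past one buffer's worth of a frame without
 * allocating an mbuf.  Used (via skip_fl_payload) when the frame has already
 * been consumed, e.g. by a pfil hook in eth_rx.
 */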
1897 static int
1898 skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1899     int remaining)
1900 {
1901 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1902 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
1903 	int len, blen;
1904 
1905 	if (fl->flags & FL_BUF_PACKING) {
1906 		u_int l, pad;
1907 
1908 		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
1909 		len = min(remaining, blen);
1910 
1911 		l = fr_offset + len;
1912 		pad = roundup2(l, fl->buf_boundary) - l;
1913 		if (fl->rx_offset + len + pad < rxb->size2)
1914 			blen = len + pad;
1915 		fl->rx_offset += blen;
1916 		MPASS(fl->rx_offset <= rxb->size2);
1917 		if (fl->rx_offset < rxb->size2)
1918 			return (len);	/* without advancing the cidx */
1919 	} else {
1920 		MPASS(fl->rx_offset == 0);	/* not packing */
1921 		blen = rxb->size1;
1922 		len = min(remaining, blen);
1923 	}
1924 	move_to_next_rxbuf(fl);
1925 	return (len);
1926 }
1927 
1928 static inline void
1929 skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen)
1930 {
1931 	int remaining, fr_offset, len;
1932 
1933 	fr_offset = 0;
1934 	remaining = plen;
1935 	while (remaining > 0) {
1936 		len = skip_scatter_segment(sc, fl, fr_offset, remaining);
1937 		fr_offset += len;
1938 		remaining -= len;
1939 	}
1940 }
1941 
1942 static inline int
1943 get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen)
1944 {
1945 	int len;
1946 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1947 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
1948 
1949 	if (fl->flags & FL_BUF_PACKING)
1950 		len = rxb->size2 - fl->rx_offset;
1951 	else
1952 		len = rxb->size1;
1953 
1954 	return (min(plen, len));
1955 }
1956 
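/*
 * Handles a single CPL_RX_PKT whose payload is in the freelist: runs pfil
 * hooks on the frame while it is still in the rx buffer, builds the mbuf
 * chain, fills in RSS/checksum/VLAN/timestamp metadata from the CPL, and
 * hands the packet to LRO or ifp->if_input.
 */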
1957 static int
1958 eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d,
1959     u_int plen)
1960 {
1961 	struct mbuf *m0;
1962 	struct ifnet *ifp = rxq->ifp;
1963 	struct sge_fl *fl = &rxq->fl;
1964 	struct vi_info *vi = ifp->if_softc;
1965 	const struct cpl_rx_pkt *cpl;
1966 #if defined(INET) || defined(INET6)
1967 	struct lro_ctrl *lro = &rxq->lro;
1968 #endif
1969 	uint16_t err_vec, tnl_type, tnlhdr_len;
1970 	static const int sw_hashtype[4][2] = {
1971 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
1972 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
1973 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
1974 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
1975 	};
1976 	static const int sw_csum_flags[2][2] = {
1977 		{
1978 			/* IP, inner IP */
1979 			CSUM_ENCAP_VXLAN |
1980 			    CSUM_L3_CALC | CSUM_L3_VALID |
1981 			    CSUM_L4_CALC | CSUM_L4_VALID |
1982 			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
1983 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
1984 
1985 			/* IP, inner IP6 */
1986 			CSUM_ENCAP_VXLAN |
1987 			    CSUM_L3_CALC | CSUM_L3_VALID |
1988 			    CSUM_L4_CALC | CSUM_L4_VALID |
1989 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
1990 		},
1991 		{
1992 			/* IP6, inner IP */
1993 			CSUM_ENCAP_VXLAN |
1994 			    CSUM_L4_CALC | CSUM_L4_VALID |
1995 			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
1996 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
1997 
1998 			/* IP6, inner IP6 */
1999 			CSUM_ENCAP_VXLAN |
2000 			    CSUM_L4_CALC | CSUM_L4_VALID |
2001 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
2002 		},
2003 	};
2004 
2005 	MPASS(plen > sc->params.sge.fl_pktshift);
2006 	if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
2007 	    __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
2008 		struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
2009 		caddr_t frame;
2010 		int rc, slen;
2011 
2012 		slen = get_segment_len(sc, fl, plen) -
2013 		    sc->params.sge.fl_pktshift;
2014 		frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift;
2015 		CURVNET_SET_QUIET(ifp->if_vnet);
2016 		rc = pfil_run_hooks(vi->pfil, frame, ifp,
2017 		    slen | PFIL_MEMPTR | PFIL_IN, NULL);
2018 		CURVNET_RESTORE();
2019 		if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) {
2020 			skip_fl_payload(sc, fl, plen);
2021 			return (0);
2022 		}
2023 		if (rc == PFIL_REALLOCED) {
2024 			skip_fl_payload(sc, fl, plen);
2025 			m0 = pfil_mem2mbuf(frame);
2026 			goto have_mbuf;
2027 		}
2028 	}
2029 
2030 	m0 = get_fl_payload(sc, fl, plen);
2031 	if (__predict_false(m0 == NULL))
2032 		return (ENOMEM);
2033 
2034 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
2035 	m0->m_len -= sc->params.sge.fl_pktshift;
2036 	m0->m_data += sc->params.sge.fl_pktshift;
2037 
2038 have_mbuf:
2039 	m0->m_pkthdr.rcvif = ifp;
2040 	M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]);
2041 	m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
2042 
2043 	cpl = (const void *)(&d->rss + 1);
2044 	if (sc->params.tp.rx_pkt_encap) {
2045 		const uint16_t ev = be16toh(cpl->err_vec);
2046 
2047 		err_vec = G_T6_COMPR_RXERR_VEC(ev);
2048 		tnl_type = G_T6_RX_TNL_TYPE(ev);
2049 		tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
2050 	} else {
2051 		err_vec = be16toh(cpl->err_vec);
2052 		tnl_type = 0;
2053 		tnlhdr_len = 0;
2054 	}
2055 	if (cpl->csum_calc && err_vec == 0) {
2056 		int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
2057 
2058 		/* checksum(s) calculated and found to be correct. */
2059 
2060 		MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
2061 		    (cpl->l2info & htobe32(F_RXF_IP6)));
2062 		m0->m_pkthdr.csum_data = be16toh(cpl->csum);
2063 		if (tnl_type == 0) {
2064 			if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) {
2065 				m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
2066 				    CSUM_L3_VALID | CSUM_L4_CALC |
2067 				    CSUM_L4_VALID;
2068 			} else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
2069 				m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
2070 				    CSUM_L4_VALID;
2071 			}
2072 			rxq->rxcsum++;
2073 		} else {
2074 			MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
2075 			if (__predict_false(cpl->ip_frag)) {
2076 				/*
2077 				 * csum_data is for the inner frame (which is an
2078 				 * IP fragment) and is not 0xffff.  There is no
2079 				 * way to pass the inner csum_data to the stack.
2080 				 * We don't want the stack to use the inner
2081 				 * csum_data to validate the outer frame or it
2082 				 * will get rejected.  So we fix csum_data here
2083 				 * and let sw do the checksum of inner IP
2084 				 * fragments.
2085 				 *
2086 				 * XXX: Need 32b for csum_data2 in an rx mbuf.
2087 				 * Maybe stuff it into rcv_tstmp?
2088 				 */
2089 				m0->m_pkthdr.csum_data = 0xffff;
2090 				if (ipv6) {
2091 					m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
2092 					    CSUM_L4_VALID;
2093 				} else {
2094 					m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
2095 					    CSUM_L3_VALID | CSUM_L4_CALC |
2096 					    CSUM_L4_VALID;
2097 				}
2098 			} else {
2099 				int outer_ipv6;
2100 
2101 				MPASS(m0->m_pkthdr.csum_data == 0xffff);
2102 
2103 				outer_ipv6 = tnlhdr_len >=
2104 				    sizeof(struct ether_header) +
2105 				    sizeof(struct ip6_hdr);
2106 				m0->m_pkthdr.csum_flags =
2107 				    sw_csum_flags[outer_ipv6][ipv6];
2108 			}
2109 			rxq->vxlan_rxcsum++;
2110 		}
2111 	}
2112 
2113 	if (cpl->vlan_ex) {
2114 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
2115 		m0->m_flags |= M_VLANTAG;
2116 		rxq->vlan_extraction++;
2117 	}
2118 
2119 	if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
2120 		/*
2121 		 * Fill up rcv_tstmp but do not set M_TSTMP.
2122 		 * rcv_tstmp is not in the format that the
2123 		 * kernel expects and we don't want to mislead
2124 		 * it.  For now this is only for custom code
2125 		 * that knows how to interpret cxgbe's stamp.
2126 		 */
2127 		m0->m_pkthdr.rcv_tstmp =
2128 		    last_flit_to_ns(sc, d->rsp.u.last_flit);
2129 #ifdef notyet
2130 		m0->m_flags |= M_TSTMP;
2131 #endif
2132 	}
2133 
2134 #ifdef NUMA
2135 	m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
2136 #endif
2137 #if defined(INET) || defined(INET6)
2138 	if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
2139 	    (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
2140 	    M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
2141 		if (sort_before_lro(lro)) {
2142 			tcp_lro_queue_mbuf(lro, m0);
2143 			return (0); /* queued for sort, then LRO */
2144 		}
2145 		if (tcp_lro_rx(lro, m0, 0) == 0)
2146 			return (0); /* queued for LRO */
2147 	}
2148 #endif
2149 	ifp->if_input(ifp, m0);
2150 
2151 	return (0);
2152 }
2153 
2154 /*
2155  * Must drain the wrq or make sure that someone else will.
2156  */
2157 static void
2158 wrq_tx_drain(void *arg, int n)
2159 {
2160 	struct sge_wrq *wrq = arg;
2161 	struct sge_eq *eq = &wrq->eq;
2162 
2163 	EQ_LOCK(eq);
2164 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2165 		drain_wrq_wr_list(wrq->adapter, wrq);
2166 	EQ_UNLOCK(eq);
2167 }
2168 
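/*
 * Copies as many queued work requests from wrq->wr_list into the hardware
 * descriptor ring as there is room for, ringing the doorbell periodically
 * along the way.
 */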
2169 static void
2170 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
2171 {
2172 	struct sge_eq *eq = &wrq->eq;
2173 	u_int available, dbdiff;	/* # of hardware descriptors */
2174 	u_int n;
2175 	struct wrqe *wr;
2176 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
2177 
2178 	EQ_LOCK_ASSERT_OWNED(eq);
2179 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
2180 	wr = STAILQ_FIRST(&wrq->wr_list);
2181 	MPASS(wr != NULL);	/* Must be called with something useful to do */
2182 	MPASS(eq->pidx == eq->dbidx);
2183 	dbdiff = 0;
2184 
2185 	do {
2186 		eq->cidx = read_hw_cidx(eq);
2187 		if (eq->pidx == eq->cidx)
2188 			available = eq->sidx - 1;
2189 		else
2190 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2191 
2192 		MPASS(wr->wrq == wrq);
2193 		n = howmany(wr->wr_len, EQ_ESIZE);
2194 		if (available < n)
2195 			break;
2196 
2197 		dst = (void *)&eq->desc[eq->pidx];
2198 		if (__predict_true(eq->sidx - eq->pidx > n)) {
2199 			/* Won't wrap, won't end exactly at the status page. */
2200 			bcopy(&wr->wr[0], dst, wr->wr_len);
2201 			eq->pidx += n;
2202 		} else {
2203 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
2204 
2205 			bcopy(&wr->wr[0], dst, first_portion);
2206 			if (wr->wr_len > first_portion) {
2207 				bcopy(&wr->wr[first_portion], &eq->desc[0],
2208 				    wr->wr_len - first_portion);
2209 			}
2210 			eq->pidx = n - (eq->sidx - eq->pidx);
2211 		}
2212 		wrq->tx_wrs_copied++;
2213 
2214 		if (available < eq->sidx / 4 &&
2215 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2216 			/*
2217 			 * XXX: This is not 100% reliable with some
2218 			 * types of WRs.  But this is a very unusual
2219 			 * situation for an ofld/ctrl queue anyway.
2220 			 */
2221 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2222 			    F_FW_WR_EQUEQ);
2223 		}
2224 
2225 		dbdiff += n;
2226 		if (dbdiff >= 16) {
2227 			ring_eq_db(sc, eq, dbdiff);
2228 			dbdiff = 0;
2229 		}
2230 
2231 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
2232 		free_wrqe(wr);
2233 		MPASS(wrq->nwr_pending > 0);
2234 		wrq->nwr_pending--;
2235 		MPASS(wrq->ndesc_needed >= n);
2236 		wrq->ndesc_needed -= n;
2237 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
2238 
2239 	if (dbdiff)
2240 		ring_eq_db(sc, eq, dbdiff);
2241 }
2242 
2243 /*
2244  * Doesn't fail.  Holds on to work requests it can't send right away.
2245  */
2246 void
2247 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
2248 {
2249 #ifdef INVARIANTS
2250 	struct sge_eq *eq = &wrq->eq;
2251 #endif
2252 
2253 	EQ_LOCK_ASSERT_OWNED(eq);
2254 	MPASS(wr != NULL);
2255 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
2256 	MPASS((wr->wr_len & 0x7) == 0);
2257 
2258 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
2259 	wrq->nwr_pending++;
2260 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
2261 
2262 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
2263 		return;	/* commit_wrq_wr will drain wr_list as well. */
2264 
2265 	drain_wrq_wr_list(sc, wrq);
2266 
2267 	/* Doorbell must have caught up to the pidx. */
2268 	MPASS(eq->pidx == eq->dbidx);
2269 }
2270 
2271 void
2272 t4_update_fl_bufsize(struct ifnet *ifp)
2273 {
2274 	struct vi_info *vi = ifp->if_softc;
2275 	struct adapter *sc = vi->adapter;
2276 	struct sge_rxq *rxq;
2277 #ifdef TCP_OFFLOAD
2278 	struct sge_ofld_rxq *ofld_rxq;
2279 #endif
2280 	struct sge_fl *fl;
2281 	int i, maxp;
2282 
2283 	maxp = max_rx_payload(sc, ifp, false);
2284 	for_each_rxq(vi, i, rxq) {
2285 		fl = &rxq->fl;
2286 
2287 		FL_LOCK(fl);
2288 		fl->zidx = find_refill_source(sc, maxp,
2289 		    fl->flags & FL_BUF_PACKING);
2290 		FL_UNLOCK(fl);
2291 	}
2292 #ifdef TCP_OFFLOAD
2293 	maxp = max_rx_payload(sc, ifp, true);
2294 	for_each_ofld_rxq(vi, i, ofld_rxq) {
2295 		fl = &ofld_rxq->fl;
2296 
2297 		FL_LOCK(fl);
2298 		fl->zidx = find_refill_source(sc, maxp,
2299 		    fl->flags & FL_BUF_PACKING);
2300 		FL_UNLOCK(fl);
2301 	}
2302 #endif
2303 }
2304 
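/*
 * Per-packet tx bookkeeping (segment count, cflags, len16, and the ethofld
 * fields under RATELIMIT) is stashed in spare pkthdr fields of the mbuf so
 * that it survives from parse_pkt() to the WR writers.
 */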
2305 static inline int
2306 mbuf_nsegs(struct mbuf *m)
2307 {
2308 
2309 	M_ASSERTPKTHDR(m);
2310 	KASSERT(m->m_pkthdr.inner_l5hlen > 0,
2311 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
2312 
2313 	return (m->m_pkthdr.inner_l5hlen);
2314 }
2315 
2316 static inline void
2317 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
2318 {
2319 
2320 	M_ASSERTPKTHDR(m);
2321 	m->m_pkthdr.inner_l5hlen = nsegs;
2322 }
2323 
2324 static inline int
2325 mbuf_cflags(struct mbuf *m)
2326 {
2327 
2328 	M_ASSERTPKTHDR(m);
2329 	return (m->m_pkthdr.PH_loc.eight[4]);
2330 }
2331 
2332 static inline void
2333 set_mbuf_cflags(struct mbuf *m, uint8_t flags)
2334 {
2335 
2336 	M_ASSERTPKTHDR(m);
2337 	m->m_pkthdr.PH_loc.eight[4] = flags;
2338 }
2339 
2340 static inline int
2341 mbuf_len16(struct mbuf *m)
2342 {
2343 	int n;
2344 
2345 	M_ASSERTPKTHDR(m);
2346 	n = m->m_pkthdr.PH_loc.eight[0];
2347 	if (!(mbuf_cflags(m) & MC_TLS))
2348 		MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2349 
2350 	return (n);
2351 }
2352 
2353 static inline void
2354 set_mbuf_len16(struct mbuf *m, uint8_t len16)
2355 {
2356 
2357 	M_ASSERTPKTHDR(m);
2358 	if (!(mbuf_cflags(m) & MC_TLS))
2359 		MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16);
2360 	m->m_pkthdr.PH_loc.eight[0] = len16;
2361 }
2362 
2363 #ifdef RATELIMIT
2364 static inline int
2365 mbuf_eo_nsegs(struct mbuf *m)
2366 {
2367 
2368 	M_ASSERTPKTHDR(m);
2369 	return (m->m_pkthdr.PH_loc.eight[1]);
2370 }
2371 
2372 static inline void
2373 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
2374 {
2375 
2376 	M_ASSERTPKTHDR(m);
2377 	m->m_pkthdr.PH_loc.eight[1] = nsegs;
2378 }
2379 
2380 static inline int
2381 mbuf_eo_len16(struct mbuf *m)
2382 {
2383 	int n;
2384 
2385 	M_ASSERTPKTHDR(m);
2386 	n = m->m_pkthdr.PH_loc.eight[2];
2387 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2388 
2389 	return (n);
2390 }
2391 
2392 static inline void
2393 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
2394 {
2395 
2396 	M_ASSERTPKTHDR(m);
2397 	m->m_pkthdr.PH_loc.eight[2] = len16;
2398 }
2399 
2400 static inline int
2401 mbuf_eo_tsclk_tsoff(struct mbuf *m)
2402 {
2403 
2404 	M_ASSERTPKTHDR(m);
2405 	return (m->m_pkthdr.PH_loc.eight[3]);
2406 }
2407 
2408 static inline void
2409 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
2410 {
2411 
2412 	M_ASSERTPKTHDR(m);
2413 	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
2414 }
2415 
2416 static inline int
2417 needs_eo(struct m_snd_tag *mst)
2418 {
2419 
2420 	return (mst != NULL && mst->type == IF_SND_TAG_TYPE_RATE_LIMIT);
2421 }
2422 #endif
2423 
2424 /*
2425  * Try to allocate an mbuf to contain a raw work request.  To make it
2426  * easy to construct the work request, don't allocate a chain but a
2427  * single mbuf.
2428  */
2429 struct mbuf *
2430 alloc_wr_mbuf(int len, int how)
2431 {
2432 	struct mbuf *m;
2433 
2434 	if (len <= MHLEN)
2435 		m = m_gethdr(how, MT_DATA);
2436 	else if (len <= MCLBYTES)
2437 		m = m_getcl(how, MT_DATA, M_PKTHDR);
2438 	else
2439 		m = NULL;
2440 	if (m == NULL)
2441 		return (NULL);
2442 	m->m_pkthdr.len = len;
2443 	m->m_len = len;
2444 	set_mbuf_cflags(m, MC_RAW_WR);
2445 	set_mbuf_len16(m, howmany(len, 16));
2446 	return (m);
2447 }
2448 
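/*
 * The needs_*() helpers below classify an outbound mbuf by its csum_flags
 * and m_flags so the WR writers know which offloads to request from the
 * hardware.
 */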
2449 static inline bool
2450 needs_hwcsum(struct mbuf *m)
2451 {
2452 	const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
2453 	    CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
2454 	    CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
2455 	    CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
2456 	    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
2457 
2458 	M_ASSERTPKTHDR(m);
2459 
2460 	return (m->m_pkthdr.csum_flags & csum_flags);
2461 }
2462 
2463 static inline bool
2464 needs_tso(struct mbuf *m)
2465 {
2466 	const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
2467 	    CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
2468 
2469 	M_ASSERTPKTHDR(m);
2470 
2471 	return (m->m_pkthdr.csum_flags & csum_flags);
2472 }
2473 
2474 static inline bool
2475 needs_vxlan_csum(struct mbuf *m)
2476 {
2477 
2478 	M_ASSERTPKTHDR(m);
2479 
2480 	return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN);
2481 }
2482 
2483 static inline bool
2484 needs_vxlan_tso(struct mbuf *m)
2485 {
2486 	const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
2487 	    CSUM_INNER_IP6_TSO;
2488 
2489 	M_ASSERTPKTHDR(m);
2490 
2491 	return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
2492 	    (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);
2493 }
2494 
2495 static inline bool
2496 needs_inner_tcp_csum(struct mbuf *m)
2497 {
2498 	const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
2499 
2500 	M_ASSERTPKTHDR(m);
2501 
2502 	return (m->m_pkthdr.csum_flags & csum_flags);
2503 }
2504 
2505 static inline bool
2506 needs_l3_csum(struct mbuf *m)
2507 {
2508 	const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
2509 	    CSUM_INNER_IP_TSO;
2510 
2511 	M_ASSERTPKTHDR(m);
2512 
2513 	return (m->m_pkthdr.csum_flags & csum_flags);
2514 }
2515 
2516 static inline bool
2517 needs_outer_tcp_csum(struct mbuf *m)
2518 {
2519 	const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
2520 	    CSUM_IP6_TSO;
2521 
2522 	M_ASSERTPKTHDR(m);
2523 
2524 	return (m->m_pkthdr.csum_flags & csum_flags);
2525 }
2526 
2527 #ifdef RATELIMIT
2528 static inline bool
2529 needs_outer_l4_csum(struct mbuf *m)
2530 {
2531 	const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
2532 	    CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
2533 
2534 	M_ASSERTPKTHDR(m);
2535 
2536 	return (m->m_pkthdr.csum_flags & csum_flags);
2537 }
2538 
2539 static inline bool
2540 needs_outer_udp_csum(struct mbuf *m)
2541 {
2542 	const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;
2543 
2544 	M_ASSERTPKTHDR(m);
2545 
2546 	return (m->m_pkthdr.csum_flags & csum_flags);
2547 }
2548 #endif
2549 
2550 static inline bool
2551 needs_vlan_insertion(struct mbuf *m)
2552 {
2553 
2554 	M_ASSERTPKTHDR(m);
2555 
2556 	return (m->m_flags & M_VLANTAG);
2557 }
2558 
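/*
 * Advances 'len' bytes into the mbuf chain starting at (*pm, *poffset),
 * updates both to the new position, and returns a pointer to it.
 */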
2559 static void *
2560 m_advance(struct mbuf **pm, int *poffset, int len)
2561 {
2562 	struct mbuf *m = *pm;
2563 	int offset = *poffset;
2564 	uintptr_t p = 0;
2565 
2566 	MPASS(len > 0);
2567 
2568 	for (;;) {
2569 		if (offset + len < m->m_len) {
2570 			offset += len;
2571 			p = mtod(m, uintptr_t) + offset;
2572 			break;
2573 		}
2574 		len -= m->m_len - offset;
2575 		m = m->m_next;
2576 		offset = 0;
2577 		MPASS(m != NULL);
2578 	}
2579 	*poffset = offset;
2580 	*pm = m;
2581 	return ((void *)p);
2582 }
2583 
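/*
 * Counts the DMA segments needed for the part of an M_EXTPG mbuf that starts
 * 'skip' bytes in.  *nextaddr carries the expected next physical address so
 * that runs contiguous with the previous mbuf are not counted twice.
 */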
2584 static inline int
2585 count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
2586 {
2587 	vm_paddr_t paddr;
2588 	int i, len, off, pglen, pgoff, seglen, segoff;
2589 	int nsegs = 0;
2590 
2591 	M_ASSERTEXTPG(m);
2592 	off = mtod(m, vm_offset_t);
2593 	len = m->m_len;
2594 	off += skip;
2595 	len -= skip;
2596 
2597 	if (m->m_epg_hdrlen != 0) {
2598 		if (off >= m->m_epg_hdrlen) {
2599 			off -= m->m_epg_hdrlen;
2600 		} else {
2601 			seglen = m->m_epg_hdrlen - off;
2602 			segoff = off;
2603 			seglen = min(seglen, len);
2604 			off = 0;
2605 			len -= seglen;
2606 			paddr = pmap_kextract(
2607 			    (vm_offset_t)&m->m_epg_hdr[segoff]);
2608 			if (*nextaddr != paddr)
2609 				nsegs++;
2610 			*nextaddr = paddr + seglen;
2611 		}
2612 	}
2613 	pgoff = m->m_epg_1st_off;
2614 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
2615 		pglen = m_epg_pagelen(m, i, pgoff);
2616 		if (off >= pglen) {
2617 			off -= pglen;
2618 			pgoff = 0;
2619 			continue;
2620 		}
2621 		seglen = pglen - off;
2622 		segoff = pgoff + off;
2623 		off = 0;
2624 		seglen = min(seglen, len);
2625 		len -= seglen;
2626 		paddr = m->m_epg_pa[i] + segoff;
2627 		if (*nextaddr != paddr)
2628 			nsegs++;
2629 		*nextaddr = paddr + seglen;
2630 		pgoff = 0;
2631 	}
2632 	if (len != 0) {
2633 		seglen = min(len, m->m_epg_trllen - off);
2634 		len -= seglen;
2635 		paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]);
2636 		if (*nextaddr != paddr)
2637 			nsegs++;
2638 		*nextaddr = paddr + seglen;
2639 	}
2640 
2641 	return (nsegs);
2642 }
2643 
2644 
2645 /*
2646  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
2647  * must have at least one mbuf that's not empty.  It is possible for this
2648  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
2649  */
2650 static inline int
2651 count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
2652 {
2653 	vm_paddr_t nextaddr, paddr;
2654 	vm_offset_t va;
2655 	int len, nsegs;
2656 
2657 	M_ASSERTPKTHDR(m);
2658 	MPASS(m->m_pkthdr.len > 0);
2659 	MPASS(m->m_pkthdr.len >= skip);
2660 
2661 	nsegs = 0;
2662 	nextaddr = 0;
2663 	for (; m; m = m->m_next) {
2664 		len = m->m_len;
2665 		if (__predict_false(len == 0))
2666 			continue;
2667 		if (skip >= len) {
2668 			skip -= len;
2669 			continue;
2670 		}
2671 		if ((m->m_flags & M_EXTPG) != 0) {
2672 			*cflags |= MC_NOMAP;
2673 			nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
2674 			skip = 0;
2675 			continue;
2676 		}
2677 		va = mtod(m, vm_offset_t) + skip;
2678 		len -= skip;
2679 		skip = 0;
2680 		paddr = pmap_kextract(va);
2681 		nsegs += sglist_count((void *)(uintptr_t)va, len);
2682 		if (paddr == nextaddr)
2683 			nsegs--;
2684 		nextaddr = pmap_kextract(va + len - 1) + 1;
2685 	}
2686 
2687 	return (nsegs);
2688 }
2689 
2690 /*
2691  * The maximum number of segments that can fit in a WR.
2692  */
2693 static int
2694 max_nsegs_allowed(struct mbuf *m, bool vm_wr)
2695 {
2696 
2697 	if (vm_wr) {
2698 		if (needs_tso(m))
2699 			return (TX_SGL_SEGS_VM_TSO);
2700 		return (TX_SGL_SEGS_VM);
2701 	}
2702 
2703 	if (needs_tso(m)) {
2704 		if (needs_vxlan_tso(m))
2705 			return (TX_SGL_SEGS_VXLAN_TSO);
2706 		else
2707 			return (TX_SGL_SEGS_TSO);
2708 	}
2709 
2710 	return (TX_SGL_SEGS);
2711 }
2712 
2713 /*
2714  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
2715  * a) caller can assume it's been freed if this function returns with an error.
2716  * b) it may get defragged up if the gather list is too long for the hardware.
2717  */
2718 int
2719 parse_pkt(struct mbuf **mp, bool vm_wr)
2720 {
2721 	struct mbuf *m0 = *mp, *m;
2722 	int rc, nsegs, defragged = 0, offset;
2723 	struct ether_header *eh;
2724 	void *l3hdr;
2725 #if defined(INET) || defined(INET6)
2726 	struct tcphdr *tcp;
2727 #endif
2728 #if defined(KERN_TLS) || defined(RATELIMIT)
2729 	struct m_snd_tag *mst;
2730 #endif
2731 	uint16_t eh_type;
2732 	uint8_t cflags;
2733 
2734 	cflags = 0;
2735 	M_ASSERTPKTHDR(m0);
2736 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2737 		rc = EINVAL;
2738 fail:
2739 		m_freem(m0);
2740 		*mp = NULL;
2741 		return (rc);
2742 	}
2743 restart:
2744 	/*
2745 	 * First count the number of gather list segments in the payload.
2746 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
2747 	 */
2748 	M_ASSERTPKTHDR(m0);
2749 	MPASS(m0->m_pkthdr.len > 0);
2750 	nsegs = count_mbuf_nsegs(m0, 0, &cflags);
2751 #if defined(KERN_TLS) || defined(RATELIMIT)
2752 	if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG)
2753 		mst = m0->m_pkthdr.snd_tag;
2754 	else
2755 		mst = NULL;
2756 #endif
2757 #ifdef KERN_TLS
2758 	if (mst != NULL && mst->type == IF_SND_TAG_TYPE_TLS) {
2759 		int len16;
2760 
2761 		cflags |= MC_TLS;
2762 		set_mbuf_cflags(m0, cflags);
2763 		rc = t6_ktls_parse_pkt(m0, &nsegs, &len16);
2764 		if (rc != 0)
2765 			goto fail;
2766 		set_mbuf_nsegs(m0, nsegs);
2767 		set_mbuf_len16(m0, len16);
2768 		return (0);
2769 	}
2770 #endif
2771 	if (nsegs > max_nsegs_allowed(m0, vm_wr)) {
2772 		if (defragged++ > 0) {
2773 			rc = EFBIG;
2774 			goto fail;
2775 		}
2776 		counter_u64_add(defrags, 1);
2777 		if ((m = m_defrag(m0, M_NOWAIT)) == NULL) {
2778 			rc = ENOMEM;
2779 			goto fail;
2780 		}
2781 		*mp = m0 = m;	/* update caller's copy after defrag */
2782 		goto restart;
2783 	}
2784 
2785 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
2786 	    !(cflags & MC_NOMAP))) {
2787 		counter_u64_add(pullups, 1);
2788 		m0 = m_pullup(m0, m0->m_pkthdr.len);
2789 		if (m0 == NULL) {
2790 			/* Should have left well enough alone. */
2791 			rc = EFBIG;
2792 			goto fail;
2793 		}
2794 		*mp = m0;	/* update caller's copy after pullup */
2795 		goto restart;
2796 	}
2797 	set_mbuf_nsegs(m0, nsegs);
2798 	set_mbuf_cflags(m0, cflags);
2799 	calculate_mbuf_len16(m0, vm_wr);
2800 
2801 #ifdef RATELIMIT
2802 	/*
2803 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
2804 	 * checksumming is enabled.  needs_outer_l4_csum happens to check for
2805 	 * all the right things.
2806 	 */
2807 	if (__predict_false(needs_eo(mst) && !needs_outer_l4_csum(m0))) {
2808 		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
2809 		m0->m_pkthdr.snd_tag = NULL;
2810 		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
2811 		mst = NULL;
2812 	}
2813 #endif
2814 
2815 	if (!needs_hwcsum(m0)
2816 #ifdef RATELIMIT
2817 	    && !needs_eo(mst)
2818 #endif
2819 	)
2820 		return (0);
2821 
2822 	m = m0;
2823 	eh = mtod(m, struct ether_header *);
2824 	eh_type = ntohs(eh->ether_type);
2825 	if (eh_type == ETHERTYPE_VLAN) {
2826 		struct ether_vlan_header *evh = (void *)eh;
2827 
2828 		eh_type = ntohs(evh->evl_proto);
2829 		m0->m_pkthdr.l2hlen = sizeof(*evh);
2830 	} else
2831 		m0->m_pkthdr.l2hlen = sizeof(*eh);
2832 
2833 	offset = 0;
2834 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2835 
2836 	switch (eh_type) {
2837 #ifdef INET6
2838 	case ETHERTYPE_IPV6:
2839 		m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
2840 		break;
2841 #endif
2842 #ifdef INET
2843 	case ETHERTYPE_IP:
2844 	{
2845 		struct ip *ip = l3hdr;
2846 
2847 		if (needs_vxlan_csum(m0)) {
2848 			/* Driver will do the outer IP hdr checksum. */
2849 			ip->ip_sum = 0;
2850 			if (needs_vxlan_tso(m0)) {
2851 				const uint16_t ipl = ip->ip_len;
2852 
2853 				ip->ip_len = 0;
2854 				ip->ip_sum = ~in_cksum_hdr(ip);
2855 				ip->ip_len = ipl;
2856 			} else
2857 				ip->ip_sum = in_cksum_hdr(ip);
2858 		}
2859 		m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
2860 		break;
2861 	}
2862 #endif
2863 	default:
2864 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
2865 		    " with the same INET/INET6 options as the kernel.",
2866 		    __func__, eh_type);
2867 	}
2868 
2869 	if (needs_vxlan_csum(m0)) {
2870 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
2871 		m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
2872 
2873 		/* Inner headers. */
2874 		eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
2875 		    sizeof(struct udphdr) + sizeof(struct vxlan_header));
2876 		eh_type = ntohs(eh->ether_type);
2877 		if (eh_type == ETHERTYPE_VLAN) {
2878 			struct ether_vlan_header *evh = (void *)eh;
2879 
2880 			eh_type = ntohs(evh->evl_proto);
2881 			m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
2882 		} else
2883 			m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
2884 		l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
2885 
2886 		switch (eh_type) {
2887 #ifdef INET6
2888 		case ETHERTYPE_IPV6:
2889 			m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
2890 			break;
2891 #endif
2892 #ifdef INET
2893 		case ETHERTYPE_IP:
2894 		{
2895 			struct ip *ip = l3hdr;
2896 
2897 			m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
2898 			break;
2899 		}
2900 #endif
2901 		default:
2902 			panic("%s: VXLAN hw offload requested with unknown "
2903 			    "ethertype 0x%04x.  if_cxgbe must be compiled"
2904 			    " with the same INET/INET6 options as the kernel.",
2905 			    __func__, eh_type);
2906 		}
2907 #if defined(INET) || defined(INET6)
2908 		if (needs_inner_tcp_csum(m0)) {
2909 			tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
2910 			m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
2911 		}
2912 #endif
2913 		MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
2914 		m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
2915 		    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
2916 		    CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
2917 		    CSUM_ENCAP_VXLAN;
2918 	}
2919 
2920 #if defined(INET) || defined(INET6)
2921 	if (needs_outer_tcp_csum(m0)) {
2922 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2923 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2924 #ifdef RATELIMIT
2925 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
2926 			set_mbuf_eo_tsclk_tsoff(m0,
2927 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
2928 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
2929 		} else
2930 			set_mbuf_eo_tsclk_tsoff(m0, 0);
2931 	} else if (needs_outer_udp_csum(m0)) {
2932 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
2933 #endif
2934 	}
2935 #ifdef RATELIMIT
2936 	if (needs_eo(mst)) {
2937 		u_int immhdrs;
2938 
2939 		/* EO WRs have the headers in the WR and not the GL. */
2940 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
2941 		    m0->m_pkthdr.l4hlen;
2942 		cflags = 0;
2943 		nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
2944 		MPASS(cflags == mbuf_cflags(m0));
2945 		set_mbuf_eo_nsegs(m0, nsegs);
2946 		set_mbuf_eo_len16(m0,
2947 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
2948 	}
2949 #endif
2950 #endif
2951 	MPASS(m0 == *mp);
2952 	return (0);
2953 }
2954 
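/*
 * start_wrq_wr/commit_wrq_wr let a caller build a work request directly in
 * the descriptor ring (or in a separately allocated wrqe on the slowpath)
 * without holding the EQ lock while it fills the WR in.  A minimal,
 * hypothetical caller might look like this (fw_foo_wr is a stand-in for any
 * firmware WR struct):
 *
 *	struct wrq_cookie cookie;
 *	struct fw_foo_wr *wr;
 *
 *	wr = start_wrq_wr(wrq, howmany(sizeof(*wr), 16), &cookie);
 *	if (wr == NULL)
 *		return (ENOMEM);
 *	(fill in *wr here)
 *	commit_wrq_wr(wrq, wr, &cookie);
 */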
2955 void *
2956 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2957 {
2958 	struct sge_eq *eq = &wrq->eq;
2959 	struct adapter *sc = wrq->adapter;
2960 	int ndesc, available;
2961 	struct wrqe *wr;
2962 	void *w;
2963 
2964 	MPASS(len16 > 0);
2965 	ndesc = tx_len16_to_desc(len16);
2966 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2967 
2968 	EQ_LOCK(eq);
2969 
2970 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2971 		drain_wrq_wr_list(sc, wrq);
2972 
2973 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
2974 slowpath:
2975 		EQ_UNLOCK(eq);
2976 		wr = alloc_wrqe(len16 * 16, wrq);
2977 		if (__predict_false(wr == NULL))
2978 			return (NULL);
2979 		cookie->pidx = -1;
2980 		cookie->ndesc = ndesc;
2981 		return (&wr->wr);
2982 	}
2983 
2984 	eq->cidx = read_hw_cidx(eq);
2985 	if (eq->pidx == eq->cidx)
2986 		available = eq->sidx - 1;
2987 	else
2988 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2989 	if (available < ndesc)
2990 		goto slowpath;
2991 
2992 	cookie->pidx = eq->pidx;
2993 	cookie->ndesc = ndesc;
2994 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
2995 
2996 	w = &eq->desc[eq->pidx];
2997 	IDXINCR(eq->pidx, ndesc, eq->sidx);
2998 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
2999 		w = &wrq->ss[0];
3000 		wrq->ss_pidx = cookie->pidx;
3001 		wrq->ss_len = len16 * 16;
3002 	}
3003 
3004 	EQ_UNLOCK(eq);
3005 
3006 	return (w);
3007 }
3008 
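/*
 * Finishes a work request started with start_wrq_wr.  A direct write that
 * wrapped past the end of the ring is copied out of the spill buffer, and
 * the doorbell is rung only when this is the oldest incomplete WR; otherwise
 * its descriptors are folded into a neighboring cookie and rung later.
 */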
3009 void
3010 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
3011 {
3012 	struct sge_eq *eq = &wrq->eq;
3013 	struct adapter *sc = wrq->adapter;
3014 	int ndesc, pidx;
3015 	struct wrq_cookie *prev, *next;
3016 
3017 	if (cookie->pidx == -1) {
3018 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
3019 
3020 		t4_wrq_tx(sc, wr);
3021 		return;
3022 	}
3023 
3024 	if (__predict_false(w == &wrq->ss[0])) {
3025 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
3026 
3027 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
3028 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
3029 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
3030 		wrq->tx_wrs_ss++;
3031 	} else
3032 		wrq->tx_wrs_direct++;
3033 
3034 	EQ_LOCK(eq);
3035 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
3036 	pidx = cookie->pidx;
3037 	MPASS(pidx >= 0 && pidx < eq->sidx);
3038 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
3039 	next = TAILQ_NEXT(cookie, link);
3040 	if (prev == NULL) {
3041 		MPASS(pidx == eq->dbidx);
3042 		if (next == NULL || ndesc >= 16) {
3043 			int available;
3044 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
3045 
3046 			/*
3047 			 * Note that the WR via which we'll request tx updates
3048 			 * is at pidx and not eq->pidx, which has moved on
3049 			 * already.
3050 			 */
3051 			dst = (void *)&eq->desc[pidx];
3052 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
3053 			if (available < eq->sidx / 4 &&
3054 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
3055 				/*
3056 				 * XXX: This is not 100% reliable with some
3057 				 * types of WRs.  But this is a very unusual
3058 				 * situation for an ofld/ctrl queue anyway.
3059 				 */
3060 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
3061 				    F_FW_WR_EQUEQ);
3062 			}
3063 
3064 			ring_eq_db(wrq->adapter, eq, ndesc);
3065 		} else {
3066 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
3067 			next->pidx = pidx;
3068 			next->ndesc += ndesc;
3069 		}
3070 	} else {
3071 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
3072 		prev->ndesc += ndesc;
3073 	}
3074 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
3075 
3076 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
3077 		drain_wrq_wr_list(sc, wrq);
3078 
3079 #ifdef INVARIANTS
3080 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
3081 		/* Doorbell must have caught up to the pidx. */
3082 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
3083 	}
3084 #endif
3085 	EQ_UNLOCK(eq);
3086 }
3087 
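/* mp_ring callback: tx may resume once more than 1/8 of the EQ is free. */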
3088 static u_int
3089 can_resume_eth_tx(struct mp_ring *r)
3090 {
3091 	struct sge_eq *eq = r->cookie;
3092 
3093 	return (total_available_tx_desc(eq) > eq->sidx / 8);
3094 }
3095 
3096 static inline bool
3097 cannot_use_txpkts(struct mbuf *m)
3098 {
3099 	/* maybe put a GL limit too, to avoid silliness? */
3100 
3101 	return (needs_tso(m) || (mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0);
3102 }
3103 
3104 static inline int
3105 discard_tx(struct sge_eq *eq)
3106 {
3107 
3108 	return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED);
3109 }
3110 
3111 static inline int
3112 wr_can_update_eq(void *p)
3113 {
3114 	struct fw_eth_tx_pkts_wr *wr = p;
3115 
3116 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
3117 	case FW_ULPTX_WR:
3118 	case FW_ETH_TX_PKT_WR:
3119 	case FW_ETH_TX_PKTS_WR:
3120 	case FW_ETH_TX_PKTS2_WR:
3121 	case FW_ETH_TX_PKT_VM_WR:
3122 	case FW_ETH_TX_PKTS_VM_WR:
3123 		return (1);
3124 	default:
3125 		return (0);
3126 	}
3127 }
3128 
3129 static inline void
3130 set_txupdate_flags(struct sge_txq *txq, u_int avail,
3131     struct fw_eth_tx_pkt_wr *wr)
3132 {
3133 	struct sge_eq *eq = &txq->eq;
3134 	struct txpkts *txp = &txq->txp;
3135 
3136 	if ((txp->npkt > 0 || avail < eq->sidx / 2) &&
3137 	    atomic_cmpset_int(&eq->equiq, 0, 1)) {
3138 		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
3139 		eq->equeqidx = eq->pidx;
3140 	} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
3141 		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
3142 		eq->equeqidx = eq->pidx;
3143 	}
3144 }
3145 
3146 #if defined(__i386__) || defined(__amd64__)
3147 extern uint64_t tsc_freq;
3148 #endif
3149 
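/*
 * Records the time of this transmit attempt and reports whether it follows
 * the previous one within the tx coalescing gap (t4_tx_coalesce_gap us on
 * x86; the gap is effectively 0 elsewhere).
 */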
3150 static inline bool
3151 record_eth_tx_time(struct sge_txq *txq)
3152 {
3153 	const uint64_t cycles = get_cyclecount();
3154 	const uint64_t last_tx = txq->last_tx;
3155 #if defined(__i386__) || defined(__amd64__)
3156 	const uint64_t itg = tsc_freq * t4_tx_coalesce_gap / 1000000;
3157 #else
3158 	const uint64_t itg = 0;
3159 #endif
3160 
3161 	MPASS(cycles >= last_tx);
3162 	txq->last_tx = cycles;
3163 	return (cycles - last_tx < itg);
3164 }
3165 
3166 /*
3167  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
3168  * be consumed.  Return the actual number consumed.  0 indicates a stall.
3169  */
3170 static u_int
3171 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
3172 {
3173 	struct sge_txq *txq = r->cookie;
3174 	struct ifnet *ifp = txq->ifp;
3175 	struct sge_eq *eq = &txq->eq;
3176 	struct txpkts *txp = &txq->txp;
3177 	struct vi_info *vi = ifp->if_softc;
3178 	struct adapter *sc = vi->adapter;
3179 	u_int total, remaining;		/* # of packets */
3180 	u_int n, avail, dbdiff;		/* # of hardware descriptors */
3181 	int i, rc;
3182 	struct mbuf *m0;
3183 	bool snd, recent_tx;
3184 	void *wr;	/* start of the last WR written to the ring */
3185 
3186 	TXQ_LOCK_ASSERT_OWNED(txq);
3187 	recent_tx = record_eth_tx_time(txq);
3188 
3189 	remaining = IDXDIFF(pidx, cidx, r->size);
3190 	if (__predict_false(discard_tx(eq))) {
3191 		for (i = 0; i < txp->npkt; i++)
3192 			m_freem(txp->mb[i]);
3193 		txp->npkt = 0;
3194 		while (cidx != pidx) {
3195 			m0 = r->items[cidx];
3196 			m_freem(m0);
3197 			if (++cidx == r->size)
3198 				cidx = 0;
3199 		}
3200 		reclaim_tx_descs(txq, eq->sidx);
3201 		*coalescing = false;
3202 		return (remaining);	/* emptied */
3203 	}
3204 
3205 	/* How many hardware descriptors do we have readily available. */
3206 	if (eq->pidx == eq->cidx)
3207 		avail = eq->sidx - 1;
3208 	else
3209 		avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
3210 
3211 	total = 0;
3212 	if (remaining == 0) {
3213 		txp->score = 0;
3214 		txq->txpkts_flush++;
3215 		goto send_txpkts;
3216 	}
3217 
3218 	dbdiff = 0;
3219 	MPASS(remaining > 0);
3220 	while (remaining > 0) {
3221 		m0 = r->items[cidx];
3222 		M_ASSERTPKTHDR(m0);
3223 		MPASS(m0->m_nextpkt == NULL);
3224 
3225 		if (avail < 2 * SGE_MAX_WR_NDESC)
3226 			avail += reclaim_tx_descs(txq, 64);
3227 
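		/*
		 * Coalescing heuristic: the score counts back-to-back
		 * transmits (recent_tx) and resets for packets that cannot go
		 * into a txpkts WR.  Coalescing is attempted when a txpkts is
		 * already open, more packets are waiting, the score is high
		 * enough, or a tx update is outstanding.
		 */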
3228 		if (t4_tx_coalesce == 0 && txp->npkt == 0)
3229 			goto skip_coalescing;
3230 		if (cannot_use_txpkts(m0))
3231 			txp->score = 0;
3232 		else if (recent_tx) {
3233 			if (++txp->score == 0)
3234 				txp->score = UINT8_MAX;
3235 		} else
3236 			txp->score = 1;
3237 		if (txp->npkt > 0 || remaining > 1 ||
3238 		    txp->score >= t4_tx_coalesce_pkts ||
3239 		    atomic_load_int(&txq->eq.equiq) != 0) {
3240 			if (vi->flags & TX_USES_VM_WR)
3241 				rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
3242 			else
3243 				rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
3244 		} else {
3245 			snd = false;
3246 			rc = EINVAL;
3247 		}
3248 		if (snd) {
3249 			MPASS(txp->npkt > 0);
3250 			for (i = 0; i < txp->npkt; i++)
3251 				ETHER_BPF_MTAP(ifp, txp->mb[i]);
3252 			if (txp->npkt > 1) {
3253 				MPASS(avail >= tx_len16_to_desc(txp->len16));
3254 				if (vi->flags & TX_USES_VM_WR)
3255 					n = write_txpkts_vm_wr(sc, txq);
3256 				else
3257 					n = write_txpkts_wr(sc, txq);
3258 			} else {
3259 				MPASS(avail >=
3260 				    tx_len16_to_desc(mbuf_len16(txp->mb[0])));
3261 				if (vi->flags & TX_USES_VM_WR)
3262 					n = write_txpkt_vm_wr(sc, txq,
3263 					    txp->mb[0]);
3264 				else
3265 					n = write_txpkt_wr(sc, txq, txp->mb[0],
3266 					    avail);
3267 			}
3268 			MPASS(n <= SGE_MAX_WR_NDESC);
3269 			avail -= n;
3270 			dbdiff += n;
3271 			wr = &eq->desc[eq->pidx];
3272 			IDXINCR(eq->pidx, n, eq->sidx);
3273 			txp->npkt = 0;	/* emptied */
3274 		}
3275 		if (rc == 0) {
3276 			/* m0 was coalesced into txq->txpkts. */
3277 			goto next_mbuf;
3278 		}
3279 		if (rc == EAGAIN) {
3280 			/*
3281 			 * m0 is suitable for tx coalescing but could not be
3282 			 * combined with the existing txq->txpkts, which has now
3283 			 * been transmitted.  Start a new txpkts with m0.
3284 			 */
3285 			MPASS(snd);
3286 			MPASS(txp->npkt == 0);
3287 			continue;
3288 		}
3289 
3290 		MPASS(rc != 0 && rc != EAGAIN);
3291 		MPASS(txp->npkt == 0);
3292 skip_coalescing:
3293 		n = tx_len16_to_desc(mbuf_len16(m0));
3294 		if (__predict_false(avail < n)) {
3295 			avail += reclaim_tx_descs(txq, min(n, 32));
3296 			if (avail < n)
3297 				break;	/* out of descriptors */
3298 		}
3299 
3300 		wr = &eq->desc[eq->pidx];
3301 		if (mbuf_cflags(m0) & MC_RAW_WR) {
3302 			n = write_raw_wr(txq, wr, m0, avail);
3303 #ifdef KERN_TLS
3304 		} else if (mbuf_cflags(m0) & MC_TLS) {
3305 			ETHER_BPF_MTAP(ifp, m0);
3306 			n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0),
3307 			    avail);
3308 #endif
3309 		} else {
3310 			ETHER_BPF_MTAP(ifp, m0);
3311 			if (vi->flags & TX_USES_VM_WR)
3312 				n = write_txpkt_vm_wr(sc, txq, m0);
3313 			else
3314 				n = write_txpkt_wr(sc, txq, m0, avail);
3315 		}
3316 		MPASS(n >= 1 && n <= avail);
3317 		if (!(mbuf_cflags(m0) & MC_TLS))
3318 			MPASS(n <= SGE_MAX_WR_NDESC);
3319 
3320 		avail -= n;
3321 		dbdiff += n;
3322 		IDXINCR(eq->pidx, n, eq->sidx);
3323 
3324 		if (dbdiff >= 512 / EQ_ESIZE) {	/* X_FETCHBURSTMAX_512B */
3325 			if (wr_can_update_eq(wr))
3326 				set_txupdate_flags(txq, avail, wr);
3327 			ring_eq_db(sc, eq, dbdiff);
3328 			avail += reclaim_tx_descs(txq, 32);
3329 			dbdiff = 0;
3330 		}
3331 next_mbuf:
3332 		total++;
3333 		remaining--;
3334 		if (__predict_false(++cidx == r->size))
3335 			cidx = 0;
3336 	}
3337 	if (dbdiff != 0) {
3338 		if (wr_can_update_eq(wr))
3339 			set_txupdate_flags(txq, avail, wr);
3340 		ring_eq_db(sc, eq, dbdiff);
3341 		reclaim_tx_descs(txq, 32);
3342 	} else if (eq->pidx == eq->cidx && txp->npkt > 0 &&
3343 	    atomic_load_int(&txq->eq.equiq) == 0) {
3344 		/*
3345 		 * If nothing was submitted to the chip for tx (it was coalesced
3346 		 * into txpkts instead) and there is no tx update outstanding
3347 		 * then we need to send txpkts now.
3348 		 */
3349 send_txpkts:
3350 		MPASS(txp->npkt > 0);
3351 		for (i = 0; i < txp->npkt; i++)
3352 			ETHER_BPF_MTAP(ifp, txp->mb[i]);
3353 		if (txp->npkt > 1) {
3354 			MPASS(avail >= tx_len16_to_desc(txp->len16));
3355 			if (vi->flags & TX_USES_VM_WR)
3356 				n = write_txpkts_vm_wr(sc, txq);
3357 			else
3358 				n = write_txpkts_wr(sc, txq);
3359 		} else {
3360 			MPASS(avail >=
3361 			    tx_len16_to_desc(mbuf_len16(txp->mb[0])));
3362 			if (vi->flags & TX_USES_VM_WR)
3363 				n = write_txpkt_vm_wr(sc, txq, txp->mb[0]);
3364 			else
3365 				n = write_txpkt_wr(sc, txq, txp->mb[0], avail);
3366 		}
3367 		MPASS(n <= SGE_MAX_WR_NDESC);
3368 		wr = &eq->desc[eq->pidx];
3369 		IDXINCR(eq->pidx, n, eq->sidx);
3370 		txp->npkt = 0;	/* emptied */
3371 
3372 		MPASS(wr_can_update_eq(wr));
3373 		set_txupdate_flags(txq, avail - n, wr);
3374 		ring_eq_db(sc, eq, n);
3375 		reclaim_tx_descs(txq, 32);
3376 	}
3377 	*coalescing = txp->npkt > 0;
3378 
3379 	return (total);
3380 }
3381 
3382 static inline void
3383 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
3384     int qsize)
3385 {
3386 
3387 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
3388 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
3389 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
3390 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
3391 
3392 	iq->flags = 0;
3393 	iq->adapter = sc;
3394 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
3395 	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
3396 	if (pktc_idx >= 0) {
3397 		iq->intr_params |= F_QINTR_CNT_EN;
3398 		iq->intr_pktc_idx = pktc_idx;
3399 	}
3400 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
3401 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
3402 }
3403 
3404 static inline void
3405 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
3406 {
3407 
3408 	fl->qsize = qsize;
3409 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
3410 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
3411 	if (sc->flags & BUF_PACKING_OK &&
3412 	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
3413 	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
3414 		fl->flags |= FL_BUF_PACKING;
3415 	fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING);
3416 	fl->safe_zidx = sc->sge.safe_zidx;
3417 }
3418 
3419 static inline void
3420 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
3421     uint8_t tx_chan, uint16_t iqid, char *name)
3422 {
3423 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
3424 
3425 	eq->flags = eqtype & EQ_TYPEMASK;
3426 	eq->tx_chan = tx_chan;
3427 	eq->iqid = iqid;
3428 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
3429 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
3430 }
3431 
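/*
 * Allocates a descriptor ring: a 512B-aligned, physically contiguous, zeroed,
 * DMA-coherent buffer of 'len' bytes.  The DMA tag, map, bus address, and
 * kernel virtual address are returned through the out parameters.
 */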
3432 int
3433 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
3434     bus_dmamap_t *map, bus_addr_t *pa, void **va)
3435 {
3436 	int rc;
3437 
3438 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
3439 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
3440 	if (rc != 0) {
3441 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
3442 		goto done;
3443 	}
3444 
3445 	rc = bus_dmamem_alloc(*tag, va,
3446 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
3447 	if (rc != 0) {
3448 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
3449 		goto done;
3450 	}
3451 
3452 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
3453 	if (rc != 0) {
3454 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
3455 		goto done;
3456 	}
3457 done:
3458 	if (rc)
3459 		free_ring(sc, *tag, *map, *pa, *va);
3460 
3461 	return (rc);
3462 }
3463 
3464 int
3465 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
3466     bus_addr_t pa, void *va)
3467 {
3468 	if (pa)
3469 		bus_dmamap_unload(tag, map);
3470 	if (va)
3471 		bus_dmamem_free(tag, va, map);
3472 	if (tag)
3473 		bus_dma_tag_destroy(tag);
3474 
3475 	return (0);
3476 }
3477 
3478 /*
3479  * Allocates the ring for an ingress queue and an optional freelist.  If the
3480  * freelist is specified it will be allocated and then associated with the
3481  * ingress queue.
3482  *
3483  * Returns errno on failure.  Resources allocated up to that point may still be
3484  * allocated.  Caller is responsible for cleanup in case this function fails.
3485  *
3486  * If the ingress queue will take interrupts directly then the intr_idx
3487  * specifies the vector, starting from 0.  -1 means the interrupts for this
3488  * queue should be forwarded to the fwq.
3489  */
3490 static int
3491 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
3492     int intr_idx, int cong)
3493 {
3494 	int rc, i, cntxt_id;
3495 	size_t len;
3496 	struct fw_iq_cmd c;
3497 	struct port_info *pi = vi->pi;
3498 	struct adapter *sc = iq->adapter;
3499 	struct sge_params *sp = &sc->params.sge;
3500 	__be32 v = 0;
3501 
3502 	len = iq->qsize * IQ_ESIZE;
3503 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
3504 	    (void **)&iq->desc);
3505 	if (rc != 0)
3506 		return (rc);
3507 
3508 	bzero(&c, sizeof(c));
3509 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
3510 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
3511 	    V_FW_IQ_CMD_VFN(0));
3512 
3513 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
3514 	    FW_LEN16(c));
3515 
3516 	/* Special handling for firmware event queue */
3517 	if (iq == &sc->sge.fwq)
3518 		v |= F_FW_IQ_CMD_IQASYNCH;
3519 
3520 	if (intr_idx < 0) {
3521 		/* Forwarded interrupts, all headed to fwq */
3522 		v |= F_FW_IQ_CMD_IQANDST;
3523 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
3524 	} else {
3525 		KASSERT(intr_idx < sc->intr_count,
3526 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
3527 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
3528 	}
3529 
3530 	c.type_to_iqandstindex = htobe32(v |
3531 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
3532 	    V_FW_IQ_CMD_VIID(vi->viid) |
3533 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
3534 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
3535 	    F_FW_IQ_CMD_IQGTSMODE |
3536 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
3537 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
3538 	c.iqsize = htobe16(iq->qsize);
3539 	c.iqaddr = htobe64(iq->ba);
3540 	if (cong >= 0)
3541 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
3542 
3543 	if (fl) {
3544 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
3545 
3546 		len = fl->qsize * EQ_ESIZE;
3547 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
3548 		    &fl->ba, (void **)&fl->desc);
3549 		if (rc)
3550 			return (rc);
3551 
3552 		/* Allocate space for one software descriptor per buffer. */
3553 		rc = alloc_fl_sdesc(fl);
3554 		if (rc != 0) {
3555 			device_printf(sc->dev,
3556 			    "failed to setup fl software descriptors: %d\n",
3557 			    rc);
3558 			return (rc);
3559 		}
3560 
3561 		if (fl->flags & FL_BUF_PACKING) {
3562 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
3563 			fl->buf_boundary = sp->pack_boundary;
3564 		} else {
3565 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
3566 			fl->buf_boundary = 16;
3567 		}
3568 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
3569 			fl->buf_boundary = sp->pad_boundary;
3570 
3571 		c.iqns_to_fl0congen |=
3572 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
3573 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
3574 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
3575 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
3576 			    0));
3577 		if (cong >= 0) {
3578 			c.iqns_to_fl0congen |=
3579 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
3580 				    F_FW_IQ_CMD_FL0CONGCIF |
3581 				    F_FW_IQ_CMD_FL0CONGEN);
3582 		}
3583 		c.fl0dcaen_to_fl0cidxfthresh =
3584 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
3585 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) |
3586 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
3587 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
3588 		c.fl0size = htobe16(fl->qsize);
3589 		c.fl0addr = htobe64(fl->ba);
3590 	}
3591 
3592 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3593 	if (rc != 0) {
3594 		device_printf(sc->dev,
3595 		    "failed to create ingress queue: %d\n", rc);
3596 		return (rc);
3597 	}
3598 
3599 	iq->cidx = 0;
3600 	iq->gen = F_RSPD_GEN;
3601 	iq->intr_next = iq->intr_params;
3602 	iq->cntxt_id = be16toh(c.iqid);
3603 	iq->abs_id = be16toh(c.physiqid);
3604 	iq->flags |= IQ_ALLOCATED;
3605 
3606 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
3607 	if (cntxt_id >= sc->sge.iqmap_sz) {
3608 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
3609 		    cntxt_id, sc->sge.iqmap_sz - 1);
3610 	}
3611 	sc->sge.iqmap[cntxt_id] = iq;
3612 
3613 	if (fl) {
3614 		u_int qid;
3615 
3616 		iq->flags |= IQ_HAS_FL;
3617 		fl->cntxt_id = be16toh(c.fl0id);
3618 		fl->pidx = fl->cidx = 0;
3619 
3620 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
3621 		if (cntxt_id >= sc->sge.eqmap_sz) {
3622 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
3623 			    __func__, cntxt_id, sc->sge.eqmap_sz - 1);
3624 		}
3625 		sc->sge.eqmap[cntxt_id] = (void *)fl;
3626 
3627 		qid = fl->cntxt_id;
3628 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
3629 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
3630 			uint32_t mask = (1 << s_qpp) - 1;
3631 			volatile uint8_t *udb;
3632 
3633 			udb = sc->udbs_base + UDBS_DB_OFFSET;
3634 			udb += (qid >> s_qpp) << PAGE_SHIFT;
3635 			qid &= mask;
3636 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
3637 				udb += qid << UDBS_SEG_SHIFT;
3638 				qid = 0;
3639 			}
3640 			fl->udb = (volatile void *)udb;
3641 		}
3642 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
3643 
3644 		FL_LOCK(fl);
3645 		/* Enough to make sure the SGE doesn't think it's starved */
3646 		refill_fl(sc, fl, fl->lowat);
3647 		FL_UNLOCK(fl);
3648 	}
3649 
3650 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
3651 		uint32_t param, val;
3652 
3653 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
3654 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
3655 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
3656 		if (cong == 0)
3657 			val = 1 << 19;
3658 		else {
3659 			val = 2 << 19;
3660 			for (i = 0; i < 4; i++) {
3661 				if (cong & (1 << i))
3662 					val |= 1 << (i << 2);
3663 			}
3664 		}
3665 
3666 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
3667 		if (rc != 0) {
3668 			/* report error but carry on */
3669 			device_printf(sc->dev,
3670 			    "failed to set congestion manager context for "
3671 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
3672 		}
3673 	}
3674 
3675 	/* Enable IQ interrupts */
3676 	atomic_store_rel_int(&iq->state, IQS_IDLE);
3677 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
3678 	    V_INGRESSQID(iq->cntxt_id));
3679 
3680 	return (0);
3681 }
3682 
3683 static int
3684 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
3685 {
3686 	int rc;
3687 	struct adapter *sc = iq->adapter;
3688 	device_t dev;
3689 
3690 	if (sc == NULL)
3691 		return (0);	/* nothing to do */
3692 
3693 	dev = vi ? vi->dev : sc->dev;
3694 
3695 	if (iq->flags & IQ_ALLOCATED) {
3696 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
3697 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
3698 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
3699 		if (rc != 0) {
3700 			device_printf(dev,
3701 			    "failed to free queue %p: %d\n", iq, rc);
3702 			return (rc);
3703 		}
3704 		iq->flags &= ~IQ_ALLOCATED;
3705 	}
3706 
3707 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
3708 
3709 	bzero(iq, sizeof(*iq));
3710 
3711 	if (fl) {
3712 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
3713 		    fl->desc);
3714 
3715 		if (fl->sdesc)
3716 			free_fl_sdesc(sc, fl);
3717 
3718 		if (mtx_initialized(&fl->fl_lock))
3719 			mtx_destroy(&fl->fl_lock);
3720 
3721 		bzero(fl, sizeof(*fl));
3722 	}
3723 
3724 	return (0);
3725 }
3726 
3727 static void
3728 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
3729     struct sge_iq *iq)
3730 {
3731 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3732 
3733 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
3734 	    "bus address of descriptor ring");
3735 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3736 	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
3737 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
3738 	    &iq->abs_id, 0, "absolute id of the queue");
3739 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3740 	    &iq->cntxt_id, 0, "SGE context id of the queue");
3741 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &iq->cidx,
3742 	    0, "consumer index");
3743 }
3744 
3745 static void
3746 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
3747     struct sysctl_oid *oid, struct sge_fl *fl)
3748 {
3749 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3750 
3751 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl",
3752 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
3753 	children = SYSCTL_CHILDREN(oid);
3754 
3755 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3756 	    &fl->ba, "bus address of descriptor ring");
3757 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3758 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
3759 	    "desc ring size in bytes");
3760 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3761 	    &fl->cntxt_id, 0, "SGE context id of the freelist");
3762 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
3763 	    fl_pad ? 1 : 0, "padding enabled");
3764 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
3765 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
3766 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
3767 	    0, "consumer index");
3768 	if (fl->flags & FL_BUF_PACKING) {
3769 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
3770 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
3771 	}
3772 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
3773 	    0, "producer index");
3774 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
3775 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
3776 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
3777 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
3778 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
3779 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
3780 }
3781 
3782 static int
3783 alloc_fwq(struct adapter *sc)
3784 {
3785 	int rc, intr_idx;
3786 	struct sge_iq *fwq = &sc->sge.fwq;
3787 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
3788 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3789 
3790 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
3791 	if (sc->flags & IS_VF)
3792 		intr_idx = 0;
3793 	else
3794 		intr_idx = sc->intr_count > 1 ? 1 : 0;
3795 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
3796 	if (rc != 0) {
3797 		device_printf(sc->dev,
3798 		    "failed to create firmware event queue: %d\n", rc);
3799 		return (rc);
3800 	}
3801 
3802 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq",
3803 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "firmware event queue");
3804 	add_iq_sysctls(&sc->ctx, oid, fwq);
3805 
3806 	return (0);
3807 }
3808 
3809 static int
3810 free_fwq(struct adapter *sc)
3811 {
3812 	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
3813 }
3814 
3815 static int
3816 alloc_ctrlq(struct adapter *sc, struct sge_wrq *ctrlq, int idx,
3817     struct sysctl_oid *oid)
3818 {
3819 	int rc;
3820 	char name[16];
3821 	struct sysctl_oid_list *children;
3822 
3823 	snprintf(name, sizeof(name), "%s ctrlq%d", device_get_nameunit(sc->dev),
3824 	    idx);
3825 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan,
3826 	    sc->sge.fwq.cntxt_id, name);
3827 
3828 	children = SYSCTL_CHILDREN(oid);
3829 	snprintf(name, sizeof(name), "%d", idx);
3830 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, name,
3831 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ctrl queue");
3832 	rc = alloc_wrq(sc, NULL, ctrlq, oid);
3833 
3834 	return (rc);
3835 }
3836 
3837 int
3838 tnl_cong(struct port_info *pi, int drop)
3839 {
3840 
3841 	if (drop == -1)
3842 		return (-1);
3843 	else if (drop == 1)
3844 		return (0);
3845 	else
3846 		return (pi->rx_e_chan_map);
3847 }
3848 
3849 static int
3850 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
3851     struct sysctl_oid *oid)
3852 {
3853 	int rc;
3854 	struct adapter *sc = vi->adapter;
3855 	struct sysctl_oid_list *children;
3856 	char name[16];
3857 
3858 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
3859 	    tnl_cong(vi->pi, cong_drop));
3860 	if (rc != 0)
3861 		return (rc);
3862 
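	/*
	 * A VF's absolute (physical) queue ids are offset from its context
	 * ids by a constant.  Record the offset from the first queue and
	 * verify that every other queue agrees; on the PF the two ids are
	 * expected to match (offset 0).
	 */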
3863 	if (idx == 0)
3864 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
3865 	else
3866 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
3867 		    ("iq_base mismatch"));
3868 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
3869 	    ("PF with non-zero iq_base"));
3870 
3871 	/*
3872 	 * The freelist is just barely above the starvation threshold right now;
3873 	 * fill it up a bit more.
3874 	 */
3875 	FL_LOCK(&rxq->fl);
3876 	refill_fl(sc, &rxq->fl, 128);
3877 	FL_UNLOCK(&rxq->fl);
3878 
3879 #if defined(INET) || defined(INET6)
3880 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
3881 	if (rc != 0)
3882 		return (rc);
3883 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
3884 
3885 	if (vi->ifp->if_capenable & IFCAP_LRO)
3886 		rxq->iq.flags |= IQ_LRO_ENABLED;
3887 #endif
3888 	if (vi->ifp->if_capenable & IFCAP_HWRXTSTMP)
3889 		rxq->iq.flags |= IQ_RX_TIMESTAMP;
3890 	rxq->ifp = vi->ifp;
3891 
3892 	children = SYSCTL_CHILDREN(oid);
3893 
3894 	snprintf(name, sizeof(name), "%d", idx);
3895 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
3896 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
3897 	children = SYSCTL_CHILDREN(oid);
3898 
3899 	add_iq_sysctls(&vi->ctx, oid, &rxq->iq);
3900 #if defined(INET) || defined(INET6)
3901 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
3902 	    &rxq->lro.lro_queued, 0, NULL);
3903 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
3904 	    &rxq->lro.lro_flushed, 0, NULL);
3905 #endif
3906 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
3907 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
3908 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
3909 	    CTLFLAG_RD, &rxq->vlan_extraction,
3910 	    "# of times hardware extracted 802.1Q tag");
3911 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_rxcsum",
3912 	    CTLFLAG_RD, &rxq->vxlan_rxcsum,
3913 	    "# of times hardware assisted with inner checksum (VXLAN)");
3914 
3915 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
3916 
3917 	return (rc);
3918 }
3919 
3920 static int
3921 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
3922 {
3923 	int rc;
3924 
3925 #if defined(INET) || defined(INET6)
3926 	if (rxq->lro.ifp) {
3927 		tcp_lro_free(&rxq->lro);
3928 		rxq->lro.ifp = NULL;
3929 	}
3930 #endif
3931 
3932 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
3933 	if (rc == 0)
3934 		bzero(rxq, sizeof(*rxq));
3935 
3936 	return (rc);
3937 }
3938 
3939 #ifdef TCP_OFFLOAD
3940 static int
3941 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
3942     int intr_idx, int idx, struct sysctl_oid *oid)
3943 {
3944 	struct port_info *pi = vi->pi;
3945 	int rc;
3946 	struct sysctl_oid_list *children;
3947 	char name[16];
3948 
3949 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
3950 	if (rc != 0)
3951 		return (rc);
3952 
3953 	children = SYSCTL_CHILDREN(oid);
3954 
3955 	snprintf(name, sizeof(name), "%d", idx);
3956 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
3957 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue");
3958 	add_iq_sysctls(&vi->ctx, oid, &ofld_rxq->iq);
3959 	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
3960 
3961 	SYSCTL_ADD_ULONG(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
3962 	    "rx_toe_tls_records", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_records,
3963 	    "# of TOE TLS records received");
3964 	SYSCTL_ADD_ULONG(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
3965 	    "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets,
3966 	    "# of payload octets in received TOE TLS records");
3967 
3968 	return (rc);
3969 }
3970 
3971 static int
3972 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
3973 {
3974 	int rc;
3975 
3976 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
3977 	if (rc == 0)
3978 		bzero(ofld_rxq, sizeof(*ofld_rxq));
3979 
3980 	return (rc);
3981 }
3982 #endif
3983 
3984 /*
3985  * Returns a reasonable automatic cidx flush threshold for a given queue size.
3986  */
3987 static u_int
3988 qsize_to_fthresh(int qsize)
3989 {
3990 	u_int fthresh;
3991 
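	/*
	 * The flush threshold is programmed as a log2 value, so round the
	 * queue size up to a power of 2, take its log2, and clamp to the
	 * largest threshold that can be encoded (X_CIDXFLUSHTHRESH_128).
	 */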
3992 	while (!powerof2(qsize))
3993 		qsize++;
3994 	fthresh = ilog2(qsize);
3995 	if (fthresh > X_CIDXFLUSHTHRESH_128)
3996 		fthresh = X_CIDXFLUSHTHRESH_128;
3997 
3998 	return (fthresh);
3999 }
4000 
4001 static int
4002 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
4003 {
4004 	int rc, cntxt_id;
4005 	struct fw_eq_ctrl_cmd c;
4006 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4007 
4008 	bzero(&c, sizeof(c));
4009 
4010 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
4011 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
4012 	    V_FW_EQ_CTRL_CMD_VFN(0));
4013 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
4014 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
4015 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
4016 	c.physeqid_pkd = htobe32(0);
4017 	c.fetchszm_to_iqid =
4018 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
4019 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
4020 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
4021 	c.dcaen_to_eqsize =
4022 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
4023 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
4024 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
4025 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
4026 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
4027 	c.eqaddr = htobe64(eq->ba);
4028 
4029 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
4030 	if (rc != 0) {
4031 		device_printf(sc->dev,
4032 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
4033 		return (rc);
4034 	}
4035 	eq->flags |= EQ_ALLOCATED;
4036 
4037 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
4038 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
4039 	if (cntxt_id >= sc->sge.eqmap_sz)
4040 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
4041 		cntxt_id, sc->sge.eqmap_sz - 1);
4042 	sc->sge.eqmap[cntxt_id] = eq;
4043 
4044 	return (rc);
4045 }
4046 
4047 static int
4048 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
4049 {
4050 	int rc, cntxt_id;
4051 	struct fw_eq_eth_cmd c;
4052 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4053 
4054 	bzero(&c, sizeof(c));
4055 
4056 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
4057 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
4058 	    V_FW_EQ_ETH_CMD_VFN(0));
4059 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
4060 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
4061 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
4062 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
4063 	c.fetchszm_to_iqid =
4064 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
4065 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
4066 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
4067 	c.dcaen_to_eqsize =
4068 	    htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
4069 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
4070 		V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
4071 		V_FW_EQ_ETH_CMD_EQSIZE(qsize));
4072 	c.eqaddr = htobe64(eq->ba);
4073 
4074 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
4075 	if (rc != 0) {
4076 		device_printf(vi->dev,
4077 		    "failed to create Ethernet egress queue: %d\n", rc);
4078 		return (rc);
4079 	}
4080 	eq->flags |= EQ_ALLOCATED;
4081 
4082 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
4083 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
4084 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
4085 	if (cntxt_id >= sc->sge.eqmap_sz)
4086 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
4087 		cntxt_id, sc->sge.eqmap_sz - 1);
4088 	sc->sge.eqmap[cntxt_id] = eq;
4089 
4090 	return (rc);
4091 }
4092 
4093 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4094 static int
4095 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
4096 {
4097 	int rc, cntxt_id;
4098 	struct fw_eq_ofld_cmd c;
4099 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4100 
4101 	bzero(&c, sizeof(c));
4102 
4103 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
4104 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
4105 	    V_FW_EQ_OFLD_CMD_VFN(0));
4106 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
4107 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
4108 	c.fetchszm_to_iqid =
4109 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
4110 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
4111 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
4112 	c.dcaen_to_eqsize =
4113 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
4114 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
4115 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
4116 		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
4117 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
4118 	c.eqaddr = htobe64(eq->ba);
4119 
4120 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
4121 	if (rc != 0) {
4122 		device_printf(vi->dev,
4123 		    "failed to create egress queue for TCP offload: %d\n", rc);
4124 		return (rc);
4125 	}
4126 	eq->flags |= EQ_ALLOCATED;
4127 
4128 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
4129 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
4130 	if (cntxt_id >= sc->sge.eqmap_sz)
4131 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
4132 		cntxt_id, sc->sge.eqmap_sz - 1);
4133 	sc->sge.eqmap[cntxt_id] = eq;
4134 
4135 	return (rc);
4136 }
4137 #endif
4138 
4139 static int
4140 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
4141 {
4142 	int rc, qsize;
4143 	size_t len;
4144 
4145 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
4146 
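	/*
	 * The ring holds sidx usable descriptors plus the status page that
	 * the hardware appends after them.
	 */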
4147 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4148 	len = qsize * EQ_ESIZE;
4149 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
4150 	    &eq->ba, (void **)&eq->desc);
4151 	if (rc)
4152 		return (rc);
4153 
4154 	eq->pidx = eq->cidx = eq->dbidx = 0;
4155 	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
4156 	eq->equeqidx = 0;
4157 	eq->doorbells = sc->doorbells;
4158 
4159 	switch (eq->flags & EQ_TYPEMASK) {
4160 	case EQ_CTRL:
4161 		rc = ctrl_eq_alloc(sc, eq);
4162 		break;
4163 
4164 	case EQ_ETH:
4165 		rc = eth_eq_alloc(sc, vi, eq);
4166 		break;
4167 
4168 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4169 	case EQ_OFLD:
4170 		rc = ofld_eq_alloc(sc, vi, eq);
4171 		break;
4172 #endif
4173 
4174 	default:
4175 		panic("%s: invalid eq type %d.", __func__,
4176 		    eq->flags & EQ_TYPEMASK);
4177 	}
4178 	if (rc != 0) {
4179 		device_printf(sc->dev,
4180 		    "failed to allocate egress queue (%d): %d\n",
4181 		    eq->flags & EQ_TYPEMASK, rc);
4182 	}
4183 
4184 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
4185 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
4186 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
4187 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
4188 		uint32_t mask = (1 << s_qpp) - 1;
4189 		volatile uint8_t *udb;
4190 
4191 		udb = sc->udbs_base + UDBS_DB_OFFSET;
4192 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
4193 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
4194 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
4195 			clrbit(&eq->doorbells, DOORBELL_WCWR);
4196 		else {
4197 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
4198 			eq->udb_qid = 0;
4199 		}
4200 		eq->udb = (volatile void *)udb;
4201 	}
4202 
4203 	return (rc);
4204 }
4205 
4206 static int
4207 free_eq(struct adapter *sc, struct sge_eq *eq)
4208 {
4209 	int rc;
4210 
4211 	if (eq->flags & EQ_ALLOCATED) {
4212 		switch (eq->flags & EQ_TYPEMASK) {
4213 		case EQ_CTRL:
4214 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
4215 			    eq->cntxt_id);
4216 			break;
4217 
4218 		case EQ_ETH:
4219 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
4220 			    eq->cntxt_id);
4221 			break;
4222 
4223 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4224 		case EQ_OFLD:
4225 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
4226 			    eq->cntxt_id);
4227 			break;
4228 #endif
4229 
4230 		default:
4231 			panic("%s: invalid eq type %d.", __func__,
4232 			    eq->flags & EQ_TYPEMASK);
4233 		}
4234 		if (rc != 0) {
4235 			device_printf(sc->dev,
4236 			    "failed to free egress queue (%d): %d\n",
4237 			    eq->flags & EQ_TYPEMASK, rc);
4238 			return (rc);
4239 		}
4240 		eq->flags &= ~EQ_ALLOCATED;
4241 	}
4242 
4243 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
4244 
4245 	if (mtx_initialized(&eq->eq_lock))
4246 		mtx_destroy(&eq->eq_lock);
4247 
4248 	bzero(eq, sizeof(*eq));
4249 	return (0);
4250 }
4251 
4252 static int
4253 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
4254     struct sysctl_oid *oid)
4255 {
4256 	int rc;
4257 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
4258 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
4259 
4260 	rc = alloc_eq(sc, vi, &wrq->eq);
4261 	if (rc)
4262 		return (rc);
4263 
4264 	wrq->adapter = sc;
4265 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
4266 	TAILQ_INIT(&wrq->incomplete_wrs);
4267 	STAILQ_INIT(&wrq->wr_list);
4268 	wrq->nwr_pending = 0;
4269 	wrq->ndesc_needed = 0;
4270 
4271 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
4272 	    &wrq->eq.ba, "bus address of descriptor ring");
4273 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
4274 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
4275 	    "desc ring size in bytes");
4276 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
4277 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
4278 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
4279 	    &wrq->eq.cidx, 0, "consumer index");
4280 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
4281 	    &wrq->eq.pidx, 0, "producer index");
4282 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
4283 	    wrq->eq.sidx, "status page index");
4284 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
4285 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
4286 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
4287 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
4288 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
4289 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
4290 
4291 	return (rc);
4292 }
4293 
4294 static int
4295 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
4296 {
4297 	int rc;
4298 
4299 	rc = free_eq(sc, &wrq->eq);
4300 	if (rc)
4301 		return (rc);
4302 
4303 	bzero(wrq, sizeof(*wrq));
4304 	return (0);
4305 }
4306 
4307 static int
4308 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
4309     struct sysctl_oid *oid)
4310 {
4311 	int rc;
4312 	struct port_info *pi = vi->pi;
4313 	struct adapter *sc = pi->adapter;
4314 	struct sge_eq *eq = &txq->eq;
4315 	struct txpkts *txp;
4316 	char name[16];
4317 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
4318 
4319 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
4320 	    M_CXGBE, &eq->eq_lock, M_WAITOK);
4321 	if (rc != 0) {
4322 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
4323 		return (rc);
4324 	}
4325 
4326 	rc = alloc_eq(sc, vi, eq);
4327 	if (rc != 0) {
4328 		mp_ring_free(txq->r);
4329 		txq->r = NULL;
4330 		return (rc);
4331 	}
4332 
4333 	/* Can't fail after this point. */
4334 
4335 	if (idx == 0)
4336 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
4337 	else
4338 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
4339 		    ("eq_base mismatch"));
4340 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
4341 	    ("PF with non-zero eq_base"));
4342 
4343 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
4344 	txq->ifp = vi->ifp;
4345 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
4346 	if (vi->flags & TX_USES_VM_WR)
4347 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
4348 		    V_TXPKT_INTF(pi->tx_chan));
4349 	else
4350 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
4351 		    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
4352 		    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
4353 	txq->tc_idx = -1;
4354 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
4355 	    M_ZERO | M_WAITOK);
4356 
4357 	txp = &txq->txp;
4358 	MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
4359 	txq->txp.max_npkt = min(nitems(txp->mb),
4360 	    sc->params.max_pkts_per_eth_tx_pkts_wr);
4361 	if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF))
4362 		txq->txp.max_npkt--;
4363 
4364 	snprintf(name, sizeof(name), "%d", idx);
4365 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
4366 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue");
4367 	children = SYSCTL_CHILDREN(oid);
4368 
4369 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
4370 	    &eq->ba, "bus address of descriptor ring");
4371 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
4372 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
4373 	    "desc ring size in bytes");
4374 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
4375 	    &eq->abs_id, 0, "absolute id of the queue");
4376 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
4377 	    &eq->cntxt_id, 0, "SGE context id of the queue");
4378 	SYSCTL_ADD_U16(&vi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
4379 	    &eq->cidx, 0, "consumer index");
4380 	SYSCTL_ADD_U16(&vi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
4381 	    &eq->pidx, 0, "producer index");
4382 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
4383 	    eq->sidx, "status page index");
4384 
4385 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
4386 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, idx, sysctl_tc,
4387 	    "I", "traffic class (-1 means none)");
4388 
4389 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
4390 	    &txq->txcsum, "# of times hardware assisted with checksum");
4391 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
4392 	    CTLFLAG_RD, &txq->vlan_insertion,
4393 	    "# of times hardware inserted 802.1Q tag");
4394 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
4395 	    &txq->tso_wrs, "# of TSO work requests");
4396 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
4397 	    &txq->imm_wrs, "# of work requests with immediate data");
4398 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
4399 	    &txq->sgl_wrs, "# of work requests with direct SGL");
4400 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
4401 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
4402 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
4403 	    CTLFLAG_RD, &txq->txpkts0_wrs,
4404 	    "# of txpkts (type 0) work requests");
4405 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
4406 	    CTLFLAG_RD, &txq->txpkts1_wrs,
4407 	    "# of txpkts (type 1) work requests");
4408 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
4409 	    CTLFLAG_RD, &txq->txpkts0_pkts,
4410 	    "# of frames tx'd using type0 txpkts work requests");
4411 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
4412 	    CTLFLAG_RD, &txq->txpkts1_pkts,
4413 	    "# of frames tx'd using type1 txpkts work requests");
4414 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts_flush",
4415 	    CTLFLAG_RD, &txq->txpkts_flush,
4416 	    "# of times txpkts had to be flushed out by an egress-update");
4417 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
4418 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
4419 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs",
4420 	    CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
4421 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_txcsum",
4422 	    CTLFLAG_RD, &txq->vxlan_txcsum,
4423 	    "# of times hardware assisted with inner checksums (VXLAN)");
4424 
4425 #ifdef KERN_TLS
4426 	if (is_ktls(sc)) {
4427 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4428 		    "kern_tls_records", CTLFLAG_RD, &txq->kern_tls_records,
4429 		    "# of NIC TLS records transmitted");
4430 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4431 		    "kern_tls_short", CTLFLAG_RD, &txq->kern_tls_short,
4432 		    "# of short NIC TLS records transmitted");
4433 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4434 		    "kern_tls_partial", CTLFLAG_RD, &txq->kern_tls_partial,
4435 		    "# of partial NIC TLS records transmitted");
4436 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4437 		    "kern_tls_full", CTLFLAG_RD, &txq->kern_tls_full,
4438 		    "# of full NIC TLS records transmitted");
4439 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4440 		    "kern_tls_octets", CTLFLAG_RD, &txq->kern_tls_octets,
4441 		    "# of payload octets in transmitted NIC TLS records");
4442 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4443 		    "kern_tls_waste", CTLFLAG_RD, &txq->kern_tls_waste,
4444 		    "# of octets DMA'd but not transmitted in NIC TLS records");
4445 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4446 		    "kern_tls_options", CTLFLAG_RD, &txq->kern_tls_options,
4447 		    "# of NIC TLS options-only packets transmitted");
4448 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4449 		    "kern_tls_header", CTLFLAG_RD, &txq->kern_tls_header,
4450 		    "# of NIC TLS header-only packets transmitted");
4451 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4452 		    "kern_tls_fin", CTLFLAG_RD, &txq->kern_tls_fin,
4453 		    "# of NIC TLS FIN-only packets transmitted");
4454 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4455 		    "kern_tls_fin_short", CTLFLAG_RD, &txq->kern_tls_fin_short,
4456 		    "# of NIC TLS padded FIN packets on short TLS records");
4457 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4458 		    "kern_tls_cbc", CTLFLAG_RD, &txq->kern_tls_cbc,
4459 		    "# of NIC TLS sessions using AES-CBC");
4460 		SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO,
4461 		    "kern_tls_gcm", CTLFLAG_RD, &txq->kern_tls_gcm,
4462 		    "# of NIC TLS sessions using AES-GCM");
4463 	}
4464 #endif
4465 	mp_ring_sysctls(txq->r, &vi->ctx, children);
4466 
4467 	return (0);
4468 }
4469 
4470 static int
4471 free_txq(struct vi_info *vi, struct sge_txq *txq)
4472 {
4473 	int rc;
4474 	struct adapter *sc = vi->adapter;
4475 	struct sge_eq *eq = &txq->eq;
4476 
4477 	rc = free_eq(sc, eq);
4478 	if (rc)
4479 		return (rc);
4480 
4481 	sglist_free(txq->gl);
4482 	free(txq->sdesc, M_CXGBE);
4483 	mp_ring_free(txq->r);
4484 
4485 	bzero(txq, sizeof(*txq));
4486 	return (0);
4487 }
4488 
4489 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4490 static int
4491 alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx,
4492     struct sysctl_oid *oid)
4493 {
4494 	struct adapter *sc = vi->adapter;
4495 	struct sysctl_oid_list *children;
4496 	char name[16];
4497 	int rc;
4498 
4499 	children = SYSCTL_CHILDREN(oid);
4500 
4501 	snprintf(name, sizeof(name), "%d", idx);
4502 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
4503 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue");
4504 	children = SYSCTL_CHILDREN(oid);
4505 
4506 	rc = alloc_wrq(sc, vi, &ofld_txq->wrq, oid);
4507 	if (rc != 0)
4508 		return (rc);
4509 
4510 	ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK);
4511 	ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK);
4512 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO,
4513 	    "tx_toe_tls_records", CTLFLAG_RD, &ofld_txq->tx_toe_tls_records,
4514 	    "# of TOE TLS records transmitted");
4515 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO,
4516 	    "tx_toe_tls_octets", CTLFLAG_RD, &ofld_txq->tx_toe_tls_octets,
4517 	    "# of payload octets in transmitted TOE TLS records");
4518 
4519 	return (rc);
4520 }
4521 
4522 static int
4523 free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq)
4524 {
4525 	struct adapter *sc = vi->adapter;
4526 	int rc;
4527 
4528 	rc = free_wrq(sc, &ofld_txq->wrq);
4529 	if (rc != 0)
4530 		return (rc);
4531 
4532 	counter_u64_free(ofld_txq->tx_toe_tls_records);
4533 	counter_u64_free(ofld_txq->tx_toe_tls_octets);
4534 
4535 	bzero(ofld_txq, sizeof(*ofld_txq));
4536 	return (0);
4537 }
4538 #endif
4539 
4540 static void
4541 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
4542 {
4543 	bus_addr_t *ba = arg;
4544 
4545 	KASSERT(nseg == 1,
4546 	    ("%s meant for single segment mappings only.", __func__));
4547 
4548 	*ba = error ? 0 : segs->ds_addr;
4549 }
4550 
4551 static inline void
4552 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
4553 {
4554 	uint32_t n, v;
4555 
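	/*
	 * fl->pidx counts buffers while the doorbell works in hardware
	 * descriptors of 8 buffers each, hence the >> 3.  'n' is the number
	 * of descriptors filled since the doorbell was last rung (dbidx).
	 */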
4556 	n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx);
4557 	MPASS(n > 0);
4558 
4559 	wmb();
4560 	v = fl->dbval | V_PIDX(n);
4561 	if (fl->udb)
4562 		*fl->udb = htole32(v);
4563 	else
4564 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
4565 	IDXINCR(fl->dbidx, n, fl->sidx);
4566 }
4567 
4568 /*
4569  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
4570  * recycled do not count towards this allocation budget.
4571  *
4572  * Returns non-zero to indicate that this freelist should be added to the list
4573  * of starving freelists.
4574  */
4575 static int
4576 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
4577 {
4578 	__be64 *d;
4579 	struct fl_sdesc *sd;
4580 	uintptr_t pa;
4581 	caddr_t cl;
4582 	struct rx_buf_info *rxb;
4583 	struct cluster_metadata *clm;
4584 	uint16_t max_pidx, zidx = fl->zidx;
4585 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
4586 
4587 	FL_LOCK_ASSERT_OWNED(fl);
4588 
4589 	/*
4590 	 * We always stop at the beginning of the hardware descriptor that's just
4591 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
4592 	 * which would mean an empty freelist to the chip.
4593 	 */
4594 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
4595 	if (fl->pidx == max_pidx * 8)
4596 		return (0);
4597 
4598 	d = &fl->desc[fl->pidx];
4599 	sd = &fl->sdesc[fl->pidx];
4600 	rxb = &sc->sge.rx_buf_info[zidx];
4601 
4602 	while (n > 0) {
4603 
4604 		if (sd->cl != NULL) {
4605 
4606 			if (sd->nmbuf == 0) {
4607 				/*
4608 				 * Fast recycle without involving any atomics on
4609 				 * the cluster's metadata (if the cluster has
4610 				 * metadata).  This happens when all frames
4611 				 * received in the cluster were small enough to
4612 				 * fit within a single mbuf each.
4613 				 */
4614 				fl->cl_fast_recycled++;
4615 				goto recycled;
4616 			}
4617 
4618 			/*
4619 			 * Cluster is guaranteed to have metadata.  Clusters
4620 			 * without metadata always take the fast recycle path
4621 			 * when they're recycled.
4622 			 */
4623 			clm = cl_metadata(sd);
4624 			MPASS(clm != NULL);
4625 
4626 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
4627 				fl->cl_recycled++;
4628 				counter_u64_add(extfree_rels, 1);
4629 				goto recycled;
4630 			}
4631 			sd->cl = NULL;	/* gave up my reference */
4632 		}
4633 		MPASS(sd->cl == NULL);
4634 		cl = uma_zalloc(rxb->zone, M_NOWAIT);
4635 		if (__predict_false(cl == NULL)) {
4636 			if (zidx != fl->safe_zidx) {
4637 				zidx = fl->safe_zidx;
4638 				rxb = &sc->sge.rx_buf_info[zidx];
4639 				cl = uma_zalloc(rxb->zone, M_NOWAIT);
4640 			}
4641 			if (cl == NULL)
4642 				break;
4643 		}
4644 		fl->cl_allocated++;
4645 		n--;
4646 
4647 		pa = pmap_kextract((vm_offset_t)cl);
4648 		sd->cl = cl;
4649 		sd->zidx = zidx;
4650 
4651 		if (fl->flags & FL_BUF_PACKING) {
4652 			*d = htobe64(pa | rxb->hwidx2);
4653 			sd->moff = rxb->size2;
4654 		} else {
4655 			*d = htobe64(pa | rxb->hwidx1);
4656 			sd->moff = 0;
4657 		}
4658 recycled:
4659 		sd->nmbuf = 0;
4660 		d++;
4661 		sd++;
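		/*
		 * Each hardware descriptor holds 8 buffers.  On completing a
		 * descriptor, wrap the ring if needed and ring the doorbell
		 * once 4 or more descriptors have accumulated, so the chip
		 * sees new buffers without waiting for the final doorbell.
		 */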
4662 		if (__predict_false((++fl->pidx & 7) == 0)) {
4663 			uint16_t pidx = fl->pidx >> 3;
4664 
4665 			if (__predict_false(pidx == fl->sidx)) {
4666 				fl->pidx = 0;
4667 				pidx = 0;
4668 				sd = fl->sdesc;
4669 				d = fl->desc;
4670 			}
4671 			if (n < 8 || pidx == max_pidx)
4672 				break;
4673 
4674 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
4675 				ring_fl_db(sc, fl);
4676 		}
4677 	}
4678 
4679 	if ((fl->pidx >> 3) != fl->dbidx)
4680 		ring_fl_db(sc, fl);
4681 
4682 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
4683 }
4684 
4685 /*
4686  * Attempt to refill all starving freelists.
4687  */
4688 static void
4689 refill_sfl(void *arg)
4690 {
4691 	struct adapter *sc = arg;
4692 	struct sge_fl *fl, *fl_temp;
4693 
4694 	mtx_assert(&sc->sfl_lock, MA_OWNED);
4695 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
4696 		FL_LOCK(fl);
4697 		refill_fl(sc, fl, 64);
4698 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
4699 			TAILQ_REMOVE(&sc->sfl, fl, link);
4700 			fl->flags &= ~FL_STARVING;
4701 		}
4702 		FL_UNLOCK(fl);
4703 	}
4704 
4705 	if (!TAILQ_EMPTY(&sc->sfl))
4706 		callout_schedule(&sc->sfl_callout, hz / 5);
4707 }
4708 
4709 static int
4710 alloc_fl_sdesc(struct sge_fl *fl)
4711 {
4712 
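	/* 8 software descriptors per hardware descriptor, one per buffer. */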
4713 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
4714 	    M_ZERO | M_WAITOK);
4715 
4716 	return (0);
4717 }
4718 
4719 static void
4720 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
4721 {
4722 	struct fl_sdesc *sd;
4723 	struct cluster_metadata *clm;
4724 	int i;
4725 
4726 	sd = fl->sdesc;
4727 	for (i = 0; i < fl->sidx * 8; i++, sd++) {
4728 		if (sd->cl == NULL)
4729 			continue;
4730 
4731 		if (sd->nmbuf == 0)
4732 			uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl);
4733 		else if (fl->flags & FL_BUF_PACKING) {
4734 			clm = cl_metadata(sd);
4735 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
4736 				uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone,
4737 				    sd->cl);
4738 				counter_u64_add(extfree_rels, 1);
4739 			}
4740 		}
4741 		sd->cl = NULL;
4742 	}
4743 
4744 	free(fl->sdesc, M_CXGBE);
4745 	fl->sdesc = NULL;
4746 }
4747 
4748 static inline void
4749 get_pkt_gl(struct mbuf *m, struct sglist *gl)
4750 {
4751 	int rc;
4752 
4753 	M_ASSERTPKTHDR(m);
4754 
4755 	sglist_reset(gl);
4756 	rc = sglist_append_mbuf(gl, m);
4757 	if (__predict_false(rc != 0)) {
4758 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
4759 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
4760 	}
4761 
4762 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
4763 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
4764 	    mbuf_nsegs(m), gl->sg_nseg));
4765 #if 0	/* vm_wr not readily available here. */
4766 	KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr),
4767 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
4768 		gl->sg_nseg, max_nsegs_allowed(m, vm_wr)));
4769 #endif
4770 }
4771 
4772 /*
4773  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
4774  */
4775 static inline u_int
4776 txpkt_len16(u_int nsegs, const u_int extra)
4777 {
4778 	u_int n;
4779 
4780 	MPASS(nsegs > 0);
4781 
4782 	nsegs--; /* first segment is part of ulptx_sgl */
4783 	n = extra + sizeof(struct fw_eth_tx_pkt_wr) +
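	/*
	 * The remaining segments go into ulptx_sge_pair structures: 3 flits
	 * (24 bytes) carry a pair of segments and a final odd segment takes
	 * 2 flits, which is what 8 * ((3 * nsegs) / 2 + (nsegs & 1)) adds
	 * up to.
	 */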
4784 	    sizeof(struct cpl_tx_pkt_core) +
4785 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
4786 
4787 	return (howmany(n, 16));
4788 }
4789 
4790 /*
4791  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
4792  * request header.
4793  */
4794 static inline u_int
4795 txpkt_vm_len16(u_int nsegs, const u_int extra)
4796 {
4797 	u_int n;
4798 
4799 	MPASS(nsegs > 0);
4800 
4801 	nsegs--; /* first segment is part of ulptx_sgl */
4802 	n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
4803 	    sizeof(struct cpl_tx_pkt_core) +
4804 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
4805 
4806 	return (howmany(n, 16));
4807 }
4808 
4809 static inline void
4810 calculate_mbuf_len16(struct mbuf *m, bool vm_wr)
4811 {
4812 	const int lso = sizeof(struct cpl_tx_pkt_lso_core);
4813 	const int tnl_lso = sizeof(struct cpl_tx_tnl_lso);
4814 
4815 	if (vm_wr) {
4816 		if (needs_tso(m))
4817 			set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso));
4818 		else
4819 			set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0));
4820 		return;
4821 	}
4822 
4823 	if (needs_tso(m)) {
4824 		if (needs_vxlan_tso(m))
4825 			set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso));
4826 		else
4827 			set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso));
4828 	} else
4829 		set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0));
4830 }
4831 
4832 /*
4833  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
4834  * request header.
4835  */
4836 static inline u_int
4837 txpkts0_len16(u_int nsegs)
4838 {
4839 	u_int n;
4840 
4841 	MPASS(nsegs > 0);
4842 
4843 	nsegs--; /* first segment is part of ulptx_sgl */
4844 	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
4845 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
4846 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
4847 
4848 	return (howmany(n, 16));
4849 }
4850 
4851 /*
4852  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
4853  * request header.
4854  */
4855 static inline u_int
4856 txpkts1_len16(void)
4857 {
4858 	u_int n;
4859 
4860 	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
4861 
4862 	return (howmany(n, 16));
4863 }
4864 
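/*
 * Max payload bytes that can be sent as immediate data in a txpkt work
 * request occupying 'ndesc' hardware descriptors; the WR header and the
 * CPL are carved out of the same descriptors.
 */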
4865 static inline u_int
4866 imm_payload(u_int ndesc)
4867 {
4868 	u_int n;
4869 
4870 	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
4871 	    sizeof(struct cpl_tx_pkt_core);
4872 
4873 	return (n);
4874 }
4875 
4876 static inline uint64_t
4877 csum_to_ctrl(struct adapter *sc, struct mbuf *m)
4878 {
4879 	uint64_t ctrl;
4880 	int csum_type, l2hlen, l3hlen;
4881 	int x, y;
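	/* Rows are the L4 type (TCP, UDP, none); columns are IPv4 vs IPv6. */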
4882 	static const int csum_types[3][2] = {
4883 		{TX_CSUM_TCPIP, TX_CSUM_TCPIP6},
4884 		{TX_CSUM_UDPIP, TX_CSUM_UDPIP6},
4885 		{TX_CSUM_IP, 0}
4886 	};
4887 
4888 	M_ASSERTPKTHDR(m);
4889 
4890 	if (!needs_hwcsum(m))
4891 		return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
4892 
4893 	MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN);
4894 	MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip));
4895 
4896 	if (needs_vxlan_csum(m)) {
4897 		MPASS(m->m_pkthdr.l4hlen > 0);
4898 		MPASS(m->m_pkthdr.l5hlen > 0);
4899 		MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN);
4900 		MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip));
4901 
4902 		l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
4903 		    m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen +
4904 		    m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN;
4905 		l3hlen = m->m_pkthdr.inner_l3hlen;
4906 	} else {
4907 		l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN;
4908 		l3hlen = m->m_pkthdr.l3hlen;
4909 	}
4910 
4911 	ctrl = 0;
4912 	if (!needs_l3_csum(m))
4913 		ctrl |= F_TXPKT_IPCSUM_DIS;
4914 
4915 	if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP |
4916 	    CSUM_IP6_TCP | CSUM_INNER_IP6_TCP))
4917 		x = 0;	/* TCP */
4918 	else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP |
4919 	    CSUM_IP6_UDP | CSUM_INNER_IP6_UDP))
4920 		x = 1;	/* UDP */
4921 	else
4922 		x = 2;
4923 
4924 	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP |
4925 	    CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP))
4926 		y = 0;	/* IPv4 */
4927 	else {
4928 		MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP |
4929 		    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP));
4930 		y = 1;	/* IPv6 */
4931 	}
4932 	/*
4933 	 * needs_hwcsum returned true earlier so there must be some kind of
4934 	 * checksum to calculate.
4935 	 */
4936 	csum_type = csum_types[x][y];
4937 	MPASS(csum_type != 0);
4938 	if (csum_type == TX_CSUM_IP)
4939 		ctrl |= F_TXPKT_L4CSUM_DIS;
4940 	ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen);
4941 	if (chip_id(sc) <= CHELSIO_T5)
4942 		ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen);
4943 	else
4944 		ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen);
4945 
4946 	return (ctrl);
4947 }
4948 
4949 static inline void *
4950 write_lso_cpl(void *cpl, struct mbuf *m0)
4951 {
4952 	struct cpl_tx_pkt_lso_core *lso;
4953 	uint32_t ctrl;
4954 
4955 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4956 	    m0->m_pkthdr.l4hlen > 0,
4957 	    ("%s: mbuf %p needs TSO but missing header lengths",
4958 		__func__, m0));
4959 
4960 	ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
4961 	    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
4962 	    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
4963 	    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
4964 	    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
4965 	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
4966 		ctrl |= F_LSO_IPV6;
4967 
4968 	lso = cpl;
4969 	lso->lso_ctrl = htobe32(ctrl);
4970 	lso->ipid_ofst = htobe16(0);
4971 	lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
4972 	lso->seqno_offset = htobe32(0);
4973 	lso->len = htobe32(m0->m_pkthdr.len);
4974 
4975 	return (lso + 1);
4976 }
4977 
4978 static void *
4979 write_tnl_lso_cpl(void *cpl, struct mbuf *m0)
4980 {
4981 	struct cpl_tx_tnl_lso *tnl_lso = cpl;
4982 	uint32_t ctrl;
4983 
4984 	KASSERT(m0->m_pkthdr.inner_l2hlen > 0 &&
4985 	    m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 &&
4986 	    m0->m_pkthdr.inner_l5hlen > 0,
4987 	    ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths",
4988 		__func__, m0));
4989 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
4990 	    m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0,
4991 	    ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths",
4992 		__func__, m0));
4993 
4994 	/* Outer headers. */
4995 	ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) |
4996 	    F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST |
4997 	    V_CPL_TX_TNL_LSO_ETHHDRLENOUT(
4998 		(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
4999 	    V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) |
5000 	    F_CPL_TX_TNL_LSO_IPLENSETOUT;
5001 	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
5002 		ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT;
5003 	else {
5004 		ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT |
5005 		    F_CPL_TX_TNL_LSO_IPIDINCOUT;
5006 	}
5007 	tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl);
5008 	tnl_lso->IpIdOffsetOut = 0;
5009 	tnl_lso->UdpLenSetOut_to_TnlHdrLen =
5010 		htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT |
5011 		    F_CPL_TX_TNL_LSO_UDPLENSETOUT |
5012 		    V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen +
5013 			m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen +
5014 			m0->m_pkthdr.l5hlen) |
5015 		    V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN));
5016 	tnl_lso->r1 = 0;
5017 
5018 	/* Inner headers. */
5019 	ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN(
5020 	    (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) |
5021 	    V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) |
5022 	    V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2);
5023 	if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr))
5024 		ctrl |= F_CPL_TX_TNL_LSO_IPV6;
5025 	tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl);
5026 	tnl_lso->IpIdOffset = 0;
5027 	tnl_lso->IpIdSplit_to_Mss =
5028 	    htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz));
5029 	tnl_lso->TCPSeqOffset = 0;
5030 	tnl_lso->EthLenOffset_Size =
5031 	    htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len));
5032 
5033 	return (tnl_lso + 1);
5034 }
5035 
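/* 6B dst MAC + 6B src MAC + 2B ethertype + 2B vlantci. */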
5036 #define VM_TX_L2HDR_LEN	16	/* ethmacdst to vlantci */
5037 
5038 /*
5039  * Write a VM txpkt WR for this packet to the hardware descriptors, update the
5040  * software descriptor, and advance the pidx.  It is guaranteed that enough
5041  * descriptors are available.
5042  *
5043  * The return value is the # of hardware descriptors used.
5044  */
5045 static u_int
5046 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0)
5047 {
5048 	struct sge_eq *eq;
5049 	struct fw_eth_tx_pkt_vm_wr *wr;
5050 	struct tx_sdesc *txsd;
5051 	struct cpl_tx_pkt_core *cpl;
5052 	uint32_t ctrl;	/* used in many unrelated places */
5053 	uint64_t ctrl1;
5054 	int len16, ndesc, pktlen, nsegs;
5055 	caddr_t dst;
5056 
5057 	TXQ_LOCK_ASSERT_OWNED(txq);
5058 	M_ASSERTPKTHDR(m0);
5059 
5060 	len16 = mbuf_len16(m0);
5061 	nsegs = mbuf_nsegs(m0);
5062 	pktlen = m0->m_pkthdr.len;
5063 	ctrl = sizeof(struct cpl_tx_pkt_core);
5064 	if (needs_tso(m0))
5065 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
5066 	ndesc = tx_len16_to_desc(len16);
5067 
5068 	/* Firmware work request header */
5069 	eq = &txq->eq;
5070 	wr = (void *)&eq->desc[eq->pidx];
5071 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
5072 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
5073 
5074 	ctrl = V_FW_WR_LEN16(len16);
5075 	wr->equiq_to_len16 = htobe32(ctrl);
5076 	wr->r3[0] = 0;
5077 	wr->r3[1] = 0;
5078 
5079 	/*
5080 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
5081 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
5082 	 * simpler to always copy it rather than making it
5083 	 * conditional.  Also, it seems that we do not have to set
5084 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
5085 	 */
5086 	m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst);
5087 
5088 	if (needs_tso(m0)) {
5089 		cpl = write_lso_cpl(wr + 1, m0);
5090 		txq->tso_wrs++;
5091 	} else
5092 		cpl = (void *)(wr + 1);
5093 
5094 	/* Checksum offload */
5095 	ctrl1 = csum_to_ctrl(sc, m0);
5096 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
5097 		txq->txcsum++;	/* some hardware assistance provided */
5098 
5099 	/* VLAN tag insertion */
5100 	if (needs_vlan_insertion(m0)) {
5101 		ctrl1 |= F_TXPKT_VLAN_VLD |
5102 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
5103 		txq->vlan_insertion++;
5104 	}
5105 
5106 	/* CPL header */
5107 	cpl->ctrl0 = txq->cpl_ctrl0;
5108 	cpl->pack = 0;
5109 	cpl->len = htobe16(pktlen);
5110 	cpl->ctrl1 = htobe64(ctrl1);
5111 
5112 	/* SGL */
5113 	dst = (void *)(cpl + 1);
5114 
5115 	/*
5116 	 * A packet using TSO will use up an entire descriptor for the
5117 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
5118 	 * If this descriptor is the last descriptor in the ring, wrap
5119 	 * around to the front of the ring explicitly for the start of
5120 	 * the sgl.
5121 	 */
5122 	if (dst == (void *)&eq->desc[eq->sidx]) {
5123 		dst = (void *)&eq->desc[0];
5124 		write_gl_to_txd(txq, m0, &dst, 0);
5125 	} else
5126 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
5127 	txq->sgl_wrs++;
5128 	txq->txpkt_wrs++;
5129 
5130 	txsd = &txq->sdesc[eq->pidx];
5131 	txsd->m = m0;
5132 	txsd->desc_used = ndesc;
5133 
5134 	return (ndesc);
5135 }
5136 
5137 /*
5138  * Write a raw WR to the hardware descriptors, update the software
5139  * descriptor, and advance the pidx.  It is guaranteed that enough
5140  * descriptors are available.
5141  *
5142  * The return value is the # of hardware descriptors used.
5143  */
5144 static u_int
5145 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
5146 {
5147 	struct sge_eq *eq = &txq->eq;
5148 	struct tx_sdesc *txsd;
5149 	struct mbuf *m;
5150 	caddr_t dst;
5151 	int len16, ndesc;
5152 
5153 	len16 = mbuf_len16(m0);
5154 	ndesc = tx_len16_to_desc(len16);
5155 	MPASS(ndesc <= available);
5156 
5157 	dst = wr;
5158 	for (m = m0; m != NULL; m = m->m_next)
5159 		copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
5160 
5161 	txq->raw_wrs++;
5162 
5163 	txsd = &txq->sdesc[eq->pidx];
5164 	txsd->m = m0;
5165 	txsd->desc_used = ndesc;
5166 
5167 	return (ndesc);
5168 }
5169 
5170 /*
5171  * Write a txpkt WR for this packet to the hardware descriptors, update the
5172  * software descriptor, and advance the pidx.  It is guaranteed that enough
5173  * descriptors are available.
5174  *
5175  * The return value is the # of hardware descriptors used.
5176  */
5177 static u_int
5178 write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0,
5179     u_int available)
5180 {
5181 	struct sge_eq *eq;
5182 	struct fw_eth_tx_pkt_wr *wr;
5183 	struct tx_sdesc *txsd;
5184 	struct cpl_tx_pkt_core *cpl;
5185 	uint32_t ctrl;	/* used in many unrelated places */
5186 	uint64_t ctrl1;
5187 	int len16, ndesc, pktlen, nsegs;
5188 	caddr_t dst;
5189 
5190 	TXQ_LOCK_ASSERT_OWNED(txq);
5191 	M_ASSERTPKTHDR(m0);
5192 
5193 	len16 = mbuf_len16(m0);
5194 	nsegs = mbuf_nsegs(m0);
5195 	pktlen = m0->m_pkthdr.len;
5196 	ctrl = sizeof(struct cpl_tx_pkt_core);
5197 	if (needs_tso(m0)) {
5198 		if (needs_vxlan_tso(m0))
5199 			ctrl += sizeof(struct cpl_tx_tnl_lso);
5200 		else
5201 			ctrl += sizeof(struct cpl_tx_pkt_lso_core);
5202 	} else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
5203 	    available >= 2) {
5204 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
5205 		ctrl += pktlen;
5206 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
5207 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
5208 		nsegs = 0;
5209 	}
5210 	ndesc = tx_len16_to_desc(len16);
5211 	MPASS(ndesc <= available);
5212 
5213 	/* Firmware work request header */
5214 	eq = &txq->eq;
5215 	wr = (void *)&eq->desc[eq->pidx];
5216 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
5217 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
5218 
5219 	ctrl = V_FW_WR_LEN16(len16);
5220 	wr->equiq_to_len16 = htobe32(ctrl);
5221 	wr->r3 = 0;
5222 
5223 	if (needs_tso(m0)) {
5224 		if (needs_vxlan_tso(m0)) {
5225 			cpl = write_tnl_lso_cpl(wr + 1, m0);
5226 			txq->vxlan_tso_wrs++;
5227 		} else {
5228 			cpl = write_lso_cpl(wr + 1, m0);
5229 			txq->tso_wrs++;
5230 		}
5231 	} else
5232 		cpl = (void *)(wr + 1);
5233 
5234 	/* Checksum offload */
5235 	ctrl1 = csum_to_ctrl(sc, m0);
5236 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
5237 		/* some hardware assistance provided */
5238 		if (needs_vxlan_csum(m0))
5239 			txq->vxlan_txcsum++;
5240 		else
5241 			txq->txcsum++;
5242 	}
5243 
5244 	/* VLAN tag insertion */
5245 	if (needs_vlan_insertion(m0)) {
5246 		ctrl1 |= F_TXPKT_VLAN_VLD |
5247 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
5248 		txq->vlan_insertion++;
5249 	}
5250 
5251 	/* CPL header */
5252 	cpl->ctrl0 = txq->cpl_ctrl0;
5253 	cpl->pack = 0;
5254 	cpl->len = htobe16(pktlen);
5255 	cpl->ctrl1 = htobe64(ctrl1);
5256 
5257 	/* SGL */
5258 	dst = (void *)(cpl + 1);
5259 	if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx]))
5260 		dst = (caddr_t)&eq->desc[0];
5261 	if (nsegs > 0) {
5262 
5263 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
5264 		txq->sgl_wrs++;
5265 	} else {
5266 		struct mbuf *m;
5267 
5268 		for (m = m0; m != NULL; m = m->m_next) {
5269 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
5270 #ifdef INVARIANTS
5271 			pktlen -= m->m_len;
5272 #endif
5273 		}
5274 #ifdef INVARIANTS
5275 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
5276 #endif
5277 		txq->imm_wrs++;
5278 	}
5279 
5280 	txq->txpkt_wrs++;
5281 
5282 	txsd = &txq->sdesc[eq->pidx];
5283 	txsd->m = m0;
5284 	txsd->desc_used = ndesc;
5285 
5286 	return (ndesc);
5287 }
5288 
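/*
 * Returns true if the frame's Ethernet header differs from the one saved in
 * the txpkts state.  Frames coalesced into a single VM work request share a
 * single copy of the L2 header, so they must all match.
 */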
5289 static inline bool
5290 cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
5291 {
5292 	int len;
5293 
5294 	MPASS(txp->npkt > 0);
5295 	MPASS(m->m_len >= VM_TX_L2HDR_LEN);
5296 
5297 	if (txp->ethtype == be16toh(ETHERTYPE_VLAN))
5298 		len = VM_TX_L2HDR_LEN;
5299 	else
5300 		len = sizeof(struct ether_header);
5301 
5302 	return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0);
5303 }
5304 
5305 static inline void
5306 save_l2hdr(struct txpkts *txp, struct mbuf *m)
5307 {
5308 	MPASS(m->m_len >= VM_TX_L2HDR_LEN);
5309 
5310 	memcpy(&txp->ethmacdst[0], mtod(m, const void *), VM_TX_L2HDR_LEN);
5311 }
5312 
5313 static int
5314 add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
5315     int avail, bool *send)
5316 {
5317 	struct txpkts *txp = &txq->txp;
5318 
5319 	/* Cannot have TSO and coalesce at the same time. */
5320 	if (cannot_use_txpkts(m)) {
5321 cannot_coalesce:
5322 		*send = txp->npkt > 0;
5323 		return (EINVAL);
5324 	}
5325 
5326 	/* VF allows coalescing of type 1 (1 GL) only */
5327 	if (mbuf_nsegs(m) > 1)
5328 		goto cannot_coalesce;
5329 
5330 	*send = false;
5331 	if (txp->npkt > 0) {
5332 		MPASS(tx_len16_to_desc(txp->len16) <= avail);
5333 		MPASS(txp->npkt < txp->max_npkt);
5334 		MPASS(txp->wr_type == 1);	/* VF supports type 1 only */
5335 
5336 		if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) {
5337 retry_after_send:
5338 			*send = true;
5339 			return (EAGAIN);
5340 		}
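		/* The WR's plen field is only 16 bits wide. */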
5341 		if (m->m_pkthdr.len + txp->plen > 65535)
5342 			goto retry_after_send;
5343 		if (cmp_l2hdr(txp, m))
5344 			goto retry_after_send;
5345 
5346 		txp->len16 += txpkts1_len16();
5347 		txp->plen += m->m_pkthdr.len;
5348 		txp->mb[txp->npkt++] = m;
5349 		if (txp->npkt == txp->max_npkt)
5350 			*send = true;
5351 	} else {
5352 		txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) +
5353 		    txpkts1_len16();
5354 		if (tx_len16_to_desc(txp->len16) > avail)
5355 			goto cannot_coalesce;
5356 		txp->npkt = 1;
5357 		txp->wr_type = 1;
5358 		txp->plen = m->m_pkthdr.len;
5359 		txp->mb[0] = m;
5360 		save_l2hdr(txp, m);
5361 	}
5362 	return (0);
5363 }
5364 
5365 static int
5366 add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
5367     int avail, bool *send)
5368 {
5369 	struct txpkts *txp = &txq->txp;
5370 	int nsegs;
5371 
5372 	MPASS(!(sc->flags & IS_VF));
5373 
5374 	/* Cannot have TSO and coalesce at the same time. */
5375 	if (cannot_use_txpkts(m)) {
5376 cannot_coalesce:
5377 		*send = txp->npkt > 0;
5378 		return (EINVAL);
5379 	}
5380 
5381 	*send = false;
5382 	nsegs = mbuf_nsegs(m);
5383 	if (txp->npkt == 0) {
5384 		if (m->m_pkthdr.len > 65535)
5385 			goto cannot_coalesce;
5386 		if (nsegs > 1) {
5387 			txp->wr_type = 0;
5388 			txp->len16 =
5389 			    howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
5390 			    txpkts0_len16(nsegs);
5391 		} else {
5392 			txp->wr_type = 1;
5393 			txp->len16 =
5394 			    howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
5395 			    txpkts1_len16();
5396 		}
5397 		if (tx_len16_to_desc(txp->len16) > avail)
5398 			goto cannot_coalesce;
5399 		txp->npkt = 1;
5400 		txp->plen = m->m_pkthdr.len;
5401 		txp->mb[0] = m;
5402 	} else {
5403 		MPASS(tx_len16_to_desc(txp->len16) <= avail);
5404 		MPASS(txp->npkt < txp->max_npkt);
5405 
5406 		if (m->m_pkthdr.len + txp->plen > 65535) {
5407 retry_after_send:
5408 			*send = true;
5409 			return (EAGAIN);
5410 		}
5411 
5412 		MPASS(txp->wr_type == 0 || txp->wr_type == 1);
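		/*
		 * Type 0 WRs carry a ULP header and an SGL per packet, so
		 * their total size is also capped by the largest WR the SGE
		 * accepts (SGE_MAX_WR_NDESC descriptors).  Type 1 WRs grow by
		 * a fixed len16 per packet and are limited only by the
		 * descriptors currently available.
		 */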
5413 		if (txp->wr_type == 0) {
5414 			if (tx_len16_to_desc(txp->len16 +
5415 			    txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC))
5416 				goto retry_after_send;
5417 			txp->len16 += txpkts0_len16(nsegs);
5418 		} else {
5419 			if (nsegs != 1)
5420 				goto retry_after_send;
5421 			if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) >
5422 			    avail)
5423 				goto retry_after_send;
5424 			txp->len16 += txpkts1_len16();
5425 		}
5426 
5427 		txp->plen += m->m_pkthdr.len;
5428 		txp->mb[txp->npkt++] = m;
5429 		if (txp->npkt == txp->max_npkt)
5430 			*send = true;
5431 	}
5432 	return (0);
5433 }
5434 
5435 /*
5436  * Write a txpkts WR for the packets in txp to the hardware descriptors, update
5437  * the software descriptor, and advance the pidx.  It is guaranteed that enough
5438  * descriptors are available.
5439  *
5440  * The return value is the # of hardware descriptors used.
5441  */
5442 static u_int
5443 write_txpkts_wr(struct adapter *sc, struct sge_txq *txq)
5444 {
5445 	const struct txpkts *txp = &txq->txp;
5446 	struct sge_eq *eq = &txq->eq;
5447 	struct fw_eth_tx_pkts_wr *wr;
5448 	struct tx_sdesc *txsd;
5449 	struct cpl_tx_pkt_core *cpl;
5450 	uint64_t ctrl1;
5451 	int ndesc, i, checkwrap;
5452 	struct mbuf *m, *last;
5453 	void *flitp;
5454 
5455 	TXQ_LOCK_ASSERT_OWNED(txq);
5456 	MPASS(txp->npkt > 0);
5457 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
5458 
5459 	wr = (void *)&eq->desc[eq->pidx];
5460 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
5461 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
5462 	wr->plen = htobe16(txp->plen);
5463 	wr->npkt = txp->npkt;
5464 	wr->r3 = 0;
5465 	wr->type = txp->wr_type;
5466 	flitp = wr + 1;
5467 
5468 	/*
5469 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
5470 	 * set then we know the WR is going to wrap around somewhere.  We'll
5471 	 * check for that at appropriate points.
5472 	 */
5473 	ndesc = tx_len16_to_desc(txp->len16);
5474 	last = NULL;
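	/*
	 * True iff pidx + ndesc > sidx, i.e. this WR runs past the end of the
	 * descriptor ring and part of it has to be written at desc[0].
	 */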
5475 	checkwrap = eq->sidx - ndesc < eq->pidx;
5476 	for (i = 0; i < txp->npkt; i++) {
5477 		m = txp->mb[i];
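		/*
		 * Type 0: each packet's CPL rides inside a ULP_TX_PKT master
		 * command with an immediate-data subcommand; type 1 writes
		 * the CPL directly.
		 */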
5478 		if (txp->wr_type == 0) {
5479 			struct ulp_txpkt *ulpmc;
5480 			struct ulptx_idata *ulpsc;
5481 
5482 			/* ULP master command */
5483 			ulpmc = flitp;
5484 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
5485 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
5486 			ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m)));
5487 
5488 			/* ULP subcommand */
5489 			ulpsc = (void *)(ulpmc + 1);
5490 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
5491 			    F_ULP_TX_SC_MORE);
5492 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
5493 
5494 			cpl = (void *)(ulpsc + 1);
5495 			if (checkwrap &&
5496 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
5497 				cpl = (void *)&eq->desc[0];
5498 		} else {
5499 			cpl = flitp;
5500 		}
5501 
5502 		/* Checksum offload */
5503 		ctrl1 = csum_to_ctrl(sc, m);
5504 		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
5505 			/* some hardware assistance provided */
5506 			if (needs_vxlan_csum(m))
5507 				txq->vxlan_txcsum++;
5508 			else
5509 				txq->txcsum++;
5510 		}
5511 
5512 		/* VLAN tag insertion */
5513 		if (needs_vlan_insertion(m)) {
5514 			ctrl1 |= F_TXPKT_VLAN_VLD |
5515 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
5516 			txq->vlan_insertion++;
5517 		}
5518 
5519 		/* CPL header */
5520 		cpl->ctrl0 = txq->cpl_ctrl0;
5521 		cpl->pack = 0;
5522 		cpl->len = htobe16(m->m_pkthdr.len);
5523 		cpl->ctrl1 = htobe64(ctrl1);
5524 
5525 		flitp = cpl + 1;
5526 		if (checkwrap &&
5527 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
5528 			flitp = (void *)&eq->desc[0];
5529 
5530 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
5531 
5532 		if (last != NULL)
5533 			last->m_nextpkt = m;
5534 		last = m;
5535 	}
5536 
5537 	txq->sgl_wrs++;
5538 	if (txp->wr_type == 0) {
5539 		txq->txpkts0_pkts += txp->npkt;
5540 		txq->txpkts0_wrs++;
5541 	} else {
5542 		txq->txpkts1_pkts += txp->npkt;
5543 		txq->txpkts1_wrs++;
5544 	}
5545 
5546 	txsd = &txq->sdesc[eq->pidx];
5547 	txsd->m = txp->mb[0];
5548 	txsd->desc_used = ndesc;
5549 
5550 	return (ndesc);
5551 }
5552 
5553 static u_int
5554 write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq)
5555 {
5556 	const struct txpkts *txp = &txq->txp;
5557 	struct sge_eq *eq = &txq->eq;
5558 	struct fw_eth_tx_pkts_vm_wr *wr;
5559 	struct tx_sdesc *txsd;
5560 	struct cpl_tx_pkt_core *cpl;
5561 	uint64_t ctrl1;
5562 	int ndesc, i;
5563 	struct mbuf *m, *last;
5564 	void *flitp;
5565 
5566 	TXQ_LOCK_ASSERT_OWNED(txq);
5567 	MPASS(txp->npkt > 0);
5568 	MPASS(txp->wr_type == 1);	/* VF supports type 1 only */
5569 	MPASS(txp->mb[0] != NULL);
5570 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
5571 
5572 	wr = (void *)&eq->desc[eq->pidx];
5573 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR));
5574 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
5575 	wr->r3 = 0;
5576 	wr->plen = htobe16(txp->plen);
5577 	wr->npkt = txp->npkt;
5578 	wr->r4 = 0;
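	/*
	 * Every packet in a VM WR shares the L2 header that save_l2hdr()
	 * captured when the first packet was added; hand it to the firmware.
	 */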
5579 	memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16);
5580 	flitp = wr + 1;
5581 
5582 	/*
5583 	 * At this point we are 32B into a hardware descriptor.  Each mbuf in
5584 	 * the WR will take 32B so we check for the end of the descriptor ring
5585 	 * before writing odd mbufs (mb[1], 3, 5, ..)
5586 	 */
5587 	ndesc = tx_len16_to_desc(txp->len16);
5588 	last = NULL;
5589 	for (i = 0; i < txp->npkt; i++) {
5590 		m = txp->mb[i];
5591 		if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
5592 			flitp = &eq->desc[0];
5593 		cpl = flitp;
5594 
5595 		/* Checksum offload */
5596 		ctrl1 = csum_to_ctrl(sc, m);
5597 		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
5598 			txq->txcsum++;	/* some hardware assistance provided */
5599 
5600 		/* VLAN tag insertion */
5601 		if (needs_vlan_insertion(m)) {
5602 			ctrl1 |= F_TXPKT_VLAN_VLD |
5603 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
5604 			txq->vlan_insertion++;
5605 		}
5606 
5607 		/* CPL header */
5608 		cpl->ctrl0 = txq->cpl_ctrl0;
5609 		cpl->pack = 0;
5610 		cpl->len = htobe16(m->m_pkthdr.len);
5611 		cpl->ctrl1 = htobe64(ctrl1);
5612 
5613 		flitp = cpl + 1;
5614 		MPASS(mbuf_nsegs(m) == 1);
5615 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0);
5616 
5617 		if (last != NULL)
5618 			last->m_nextpkt = m;
5619 		last = m;
5620 	}
5621 
5622 	txq->sgl_wrs++;
5623 	txq->txpkts1_pkts += txp->npkt;
5624 	txq->txpkts1_wrs++;
5625 
5626 	txsd = &txq->sdesc[eq->pidx];
5627 	txsd->m = txp->mb[0];
5628 	txsd->desc_used = ndesc;
5629 
5630 	return (ndesc);
5631 }
5632 
5633 /*
5634  * If the SGL ends on an address that is not 16 byte aligned, this function will
5635  * add a 0 filled flit at the end.
5636  */
5637 static void
5638 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
5639 {
5640 	struct sge_eq *eq = &txq->eq;
5641 	struct sglist *gl = txq->gl;
5642 	struct sglist_seg *seg;
5643 	__be64 *flitp, *wrap;
5644 	struct ulptx_sgl *usgl;
5645 	int i, nflits, nsegs;
5646 
5647 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
5648 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
5649 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
5650 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
5651 
5652 	get_pkt_gl(m, gl);
5653 	nsegs = gl->sg_nseg;
5654 	MPASS(nsegs > 0);
5655 
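	/*
	 * 2 flits for the ulptx_sgl header (cmd_nsge, len0, addr0), 3 flits
	 * for every additional pair of segments, and 2 more for a leftover
	 * odd segment (its unused len slot is zeroed below).  E.g. nsegs = 3
	 * needs 3 + 0 + 2 = 5 flits (40 bytes).
	 */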
5656 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
5657 	flitp = (__be64 *)(*to);
5658 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
5659 	seg = &gl->sg_segs[0];
5660 	usgl = (void *)flitp;
5661 
5662 	/*
5663 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
5664 	 * ring, so we're at least 16 bytes away from the status page.  There is
5665 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
5666 	 */
5667 
5668 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
5669 	    V_ULPTX_NSGE(nsegs));
5670 	usgl->len0 = htobe32(seg->ss_len);
5671 	usgl->addr0 = htobe64(seg->ss_paddr);
5672 	seg++;
5673 
5674 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
5675 
5676 		/* Won't wrap around at all */
5677 
5678 		for (i = 0; i < nsegs - 1; i++, seg++) {
5679 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
5680 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
5681 		}
5682 		if (i & 1)
5683 			usgl->sge[i / 2].len[1] = htobe32(0);
5684 		flitp += nflits;
5685 	} else {
5686 
5687 		/* Will wrap somewhere in the rest of the SGL */
5688 
5689 		/* 2 flits already written, write the rest flit by flit */
5690 		flitp = (void *)(usgl + 1);
5691 		for (i = 0; i < nflits - 2; i++) {
5692 			if (flitp == wrap)
5693 				flitp = (void *)eq->desc;
5694 			*flitp++ = get_flit(seg, nsegs - 1, i);
5695 		}
5696 	}
5697 
5698 	if (nflits & 1) {
5699 		MPASS(((uintptr_t)flitp) & 0xf);
5700 		*flitp++ = 0;
5701 	}
5702 
5703 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
5704 	if (__predict_false(flitp == wrap))
5705 		*to = (void *)eq->desc;
5706 	else
5707 		*to = (void *)flitp;
5708 }
5709 
5710 static inline void
5711 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
5712 {
5713 
5714 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
5715 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
5716 
5717 	if (__predict_true((uintptr_t)(*to) + len <=
5718 	    (uintptr_t)&eq->desc[eq->sidx])) {
5719 		bcopy(from, *to, len);
5720 		(*to) += len;
5721 	} else {
5722 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
5723 
5724 		bcopy(from, *to, portion);
5725 		from += portion;
5726 		portion = len - portion;	/* remaining */
5727 		bcopy(from, (void *)eq->desc, portion);
5728 		(*to) = (caddr_t)eq->desc + portion;
5729 	}
5730 }
5731 
5732 static inline void
5733 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
5734 {
5735 	u_int db;
5736 
5737 	MPASS(n > 0);
5738 
5739 	db = eq->doorbells;
5740 	if (n > 1)
5741 		clrbit(&db, DOORBELL_WCWR);
5742 	wmb();
5743 
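	/*
	 * Use the first doorbell mechanism still set in db.  WCWR was knocked
	 * out above for multi-descriptor updates because the WCWR path copies
	 * exactly one descriptor through the write-combined window.
	 */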
5744 	switch (ffs(db) - 1) {
5745 	case DOORBELL_UDB:
5746 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
5747 		break;
5748 
5749 	case DOORBELL_WCWR: {
5750 		volatile uint64_t *dst, *src;
5751 		int i;
5752 
5753 		/*
5754 		 * Queues whose 128B doorbell segment fits in the page do not
5755 		 * use relative qid (udb_qid is always 0).  Only such queues
5756 		 * can do WCWR, and only for a single descriptor at a time.
5757 		 */
5758 		KASSERT(eq->udb_qid == 0 && n == 1,
5759 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
5760 		    __func__, eq->doorbells, n, eq->dbidx, eq));
5761 
5762 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
5763 		    UDBS_DB_OFFSET);
5764 		i = eq->dbidx;
5765 		src = (void *)&eq->desc[i];
5766 		while (src != (void *)&eq->desc[i + 1])
5767 			*dst++ = *src++;
5768 		wmb();
5769 		break;
5770 	}
5771 
5772 	case DOORBELL_UDBWC:
5773 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
5774 		wmb();
5775 		break;
5776 
5777 	case DOORBELL_KDB:
5778 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
5779 		    V_QID(eq->cntxt_id) | V_PIDX(n));
5780 		break;
5781 	}
5782 
5783 	IDXINCR(eq->dbidx, n, eq->sidx);
5784 }
5785 
5786 static inline u_int
5787 reclaimable_tx_desc(struct sge_eq *eq)
5788 {
5789 	uint16_t hw_cidx;
5790 
5791 	hw_cidx = read_hw_cidx(eq);
5792 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
5793 }
5794 
5795 static inline u_int
5796 total_available_tx_desc(struct sge_eq *eq)
5797 {
5798 	uint16_t hw_cidx, pidx;
5799 
5800 	hw_cidx = read_hw_cidx(eq);
5801 	pidx = eq->pidx;
5802 
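	/*
	 * One descriptor is always left unused so that a completely full ring
	 * is never mistaken for an empty one (pidx == cidx means empty).
	 */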
5803 	if (pidx == hw_cidx)
5804 		return (eq->sidx - 1);
5805 	else
5806 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
5807 }
5808 
5809 static inline uint16_t
5810 read_hw_cidx(struct sge_eq *eq)
5811 {
5812 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
5813 	uint16_t cidx = spg->cidx;	/* stable snapshot */
5814 
5815 	return (be16toh(cidx));
5816 }
5817 
5818 /*
5819  * Reclaim roughly 'n' descriptors; reclamation proceeds a whole WR at a time.
5820  */
5821 static u_int
5822 reclaim_tx_descs(struct sge_txq *txq, u_int n)
5823 {
5824 	struct tx_sdesc *txsd;
5825 	struct sge_eq *eq = &txq->eq;
5826 	u_int can_reclaim, reclaimed;
5827 
5828 	TXQ_LOCK_ASSERT_OWNED(txq);
5829 	MPASS(n > 0);
5830 
5831 	reclaimed = 0;
5832 	can_reclaim = reclaimable_tx_desc(eq);
5833 	while (can_reclaim && reclaimed < n) {
5834 		int ndesc;
5835 		struct mbuf *m, *nextpkt;
5836 
5837 		txsd = &txq->sdesc[eq->cidx];
5838 		ndesc = txsd->desc_used;
5839 
5840 		/* Firmware doesn't return "partial" credits. */
5841 		KASSERT(can_reclaim >= ndesc,
5842 		    ("%s: unexpected number of credits: %d, %d",
5843 		    __func__, can_reclaim, ndesc));
5844 		KASSERT(ndesc != 0,
5845 		    ("%s: descriptor with no credits: cidx %d",
5846 		    __func__, eq->cidx));
5847 
5848 		for (m = txsd->m; m != NULL; m = nextpkt) {
5849 			nextpkt = m->m_nextpkt;
5850 			m->m_nextpkt = NULL;
5851 			m_freem(m);
5852 		}
5853 		reclaimed += ndesc;
5854 		can_reclaim -= ndesc;
5855 		IDXINCR(eq->cidx, ndesc, eq->sidx);
5856 	}
5857 
5858 	return (reclaimed);
5859 }
5860 
5861 static void
5862 tx_reclaim(void *arg, int n)
5863 {
5864 	struct sge_txq *txq = arg;
5865 	struct sge_eq *eq = &txq->eq;
5866 
5867 	do {
5868 		if (TXQ_TRYLOCK(txq) == 0)
5869 			break;
5870 		n = reclaim_tx_descs(txq, 32);
5871 		if (eq->cidx == eq->pidx)
5872 			eq->equeqidx = eq->pidx;
5873 		TXQ_UNLOCK(txq);
5874 	} while (n > 0);
5875 }
5876 
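/*
 * Return flit 'idx' of an SGL whose first segment already went into the
 * ulptx_sgl header.  The remaining segments are packed in pairs laid out as
 * len[0]|len[1], addr[0], addr[1], hence the modulo-3 walk below.
 */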
5877 static __be64
5878 get_flit(struct sglist_seg *segs, int nsegs, int idx)
5879 {
5880 	int i = (idx / 3) * 2;
5881 
5882 	switch (idx % 3) {
5883 	case 0: {
5884 		uint64_t rc;
5885 
5886 		rc = (uint64_t)segs[i].ss_len << 32;
5887 		if (i + 1 < nsegs)
5888 			rc |= (uint64_t)(segs[i + 1].ss_len);
5889 
5890 		return (htobe64(rc));
5891 	}
5892 	case 1:
5893 		return (htobe64(segs[i].ss_paddr));
5894 	case 2:
5895 		return (htobe64(segs[i + 1].ss_paddr));
5896 	}
5897 
5898 	return (0);
5899 }
5900 
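/*
 * Pick the software zone to refill a freelist with: the smallest zone whose
 * usable payload holds 'maxp' bytes, or failing that the largest eligible
 * zone.  Returns an index into sc->sge.rx_buf_info[], or -1 if no zone is
 * usable.
 */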
5901 static int
5902 find_refill_source(struct adapter *sc, int maxp, bool packing)
5903 {
5904 	int i, zidx = -1;
5905 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
5906 
5907 	if (packing) {
5908 		for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
5909 			if (rxb->hwidx2 == -1)
5910 				continue;
5911 			if (rxb->size1 < PAGE_SIZE &&
5912 			    rxb->size1 < largest_rx_cluster)
5913 				continue;
5914 			if (rxb->size1 > largest_rx_cluster)
5915 				break;
5916 			MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE);
5917 			if (rxb->size2 >= maxp)
5918 				return (i);
5919 			zidx = i;
5920 		}
5921 	} else {
5922 		for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
5923 			if (rxb->hwidx1 == -1)
5924 				continue;
5925 			if (rxb->size1 > largest_rx_cluster)
5926 				break;
5927 			if (rxb->size1 >= maxp)
5928 				return (i);
5929 			zidx = i;
5930 		}
5931 	}
5932 
5933 	return (zidx);
5934 }
5935 
5936 static void
5937 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
5938 {
5939 	mtx_lock(&sc->sfl_lock);
5940 	FL_LOCK(fl);
5941 	if ((fl->flags & FL_DOOMED) == 0) {
5942 		fl->flags |= FL_STARVING;
5943 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
5944 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
5945 	}
5946 	FL_UNLOCK(fl);
5947 	mtx_unlock(&sc->sfl_lock);
5948 }
5949 
5950 static void
5951 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
5952 {
5953 	struct sge_wrq *wrq = (void *)eq;
5954 
5955 	atomic_readandclear_int(&eq->equiq);
5956 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
5957 }
5958 
5959 static void
5960 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
5961 {
5962 	struct sge_txq *txq = (void *)eq;
5963 
5964 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
5965 
5966 	atomic_readandclear_int(&eq->equiq);
5967 	if (mp_ring_is_idle(txq->r))
5968 		taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
5969 	else
5970 		mp_ring_check_drainage(txq->r, 64);
5971 }
5972 
5973 static int
5974 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
5975     struct mbuf *m)
5976 {
5977 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
5978 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
5979 	struct adapter *sc = iq->adapter;
5980 	struct sge *s = &sc->sge;
5981 	struct sge_eq *eq;
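	/* Handlers indexed by the EQ_TYPEMASK bits of eq->flags. */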
5982 	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
5983 		&handle_wrq_egr_update, &handle_eth_egr_update,
5984 		&handle_wrq_egr_update};
5985 
5986 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
5987 	    rss->opcode));
5988 
5989 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
5990 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
5991 
5992 	return (0);
5993 }
5994 
5995 /* handle_fw_msg works for both fw4_msg and fw6_msg: 'data' is at the same offset in both (asserted below) */
5996 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
5997     offsetof(struct cpl_fw6_msg, data));
5998 
5999 static int
6000 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
6001 {
6002 	struct adapter *sc = iq->adapter;
6003 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
6004 
6005 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
6006 	    rss->opcode));
6007 
6008 	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
6009 		const struct rss_header *rss2;
6010 
6011 		rss2 = (const struct rss_header *)&cpl->data[0];
6012 		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
6013 	}
6014 
6015 	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
6016 }
6017 
6018 /**
6019  *	t4_handle_wrerr_rpl - process a FW work request error message
6020  *	@adap: the adapter
6021  *	@rpl: start of the FW message
6022  */
6023 static int
6024 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
6025 {
6026 	u8 opcode = *(const u8 *)rpl;
6027 	const struct fw_error_cmd *e = (const void *)rpl;
6028 	unsigned int i;
6029 
6030 	if (opcode != FW_ERROR_CMD) {
6031 		log(LOG_ERR,
6032 		    "%s: Received WRERR_RPL message with opcode %#x\n",
6033 		    device_get_nameunit(adap->dev), opcode);
6034 		return (EINVAL);
6035 	}
6036 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
6037 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
6038 	    "non-fatal");
6039 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
6040 	case FW_ERROR_TYPE_EXCEPTION:
6041 		log(LOG_ERR, "exception info:\n");
6042 		for (i = 0; i < nitems(e->u.exception.info); i++)
6043 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
6044 			    be32toh(e->u.exception.info[i]));
6045 		log(LOG_ERR, "\n");
6046 		break;
6047 	case FW_ERROR_TYPE_HWMODULE:
6048 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
6049 		    be32toh(e->u.hwmodule.regaddr),
6050 		    be32toh(e->u.hwmodule.regval));
6051 		break;
6052 	case FW_ERROR_TYPE_WR:
6053 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
6054 		    be16toh(e->u.wr.cidx),
6055 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
6056 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
6057 		    be32toh(e->u.wr.eqid));
6058 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
6059 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
6060 			    e->u.wr.wrhdr[i]);
6061 		log(LOG_ERR, "\n");
6062 		break;
6063 	case FW_ERROR_TYPE_ACL:
6064 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
6065 		    be16toh(e->u.acl.cidx),
6066 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
6067 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
6068 		    be32toh(e->u.acl.eqid),
6069 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
6070 		    "MAC");
6071 		for (i = 0; i < nitems(e->u.acl.val); i++)
6072 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
6073 		log(LOG_ERR, "\n");
6074 		break;
6075 	default:
6076 		log(LOG_ERR, "type %#x\n",
6077 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
6078 		return (EINVAL);
6079 	}
6080 	return (0);
6081 }
6082 
6083 static inline bool
6084 bufidx_used(struct adapter *sc, int idx)
6085 {
6086 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
6087 	int i;
6088 
6089 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
6090 		if (rxb->size1 > largest_rx_cluster)
6091 			continue;
6092 		if (rxb->hwidx1 == idx || rxb->hwidx2 == idx)
6093 			return (true);
6094 	}
6095 
6096 	return (false);
6097 }
6098 
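/*
 * Report the hardware free-list buffer sizes; a '*' marks sizes that are
 * referenced by one of the driver's software rx buffer zones.
 */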
6099 static int
6100 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
6101 {
6102 	struct adapter *sc = arg1;
6103 	struct sge_params *sp = &sc->params.sge;
6104 	int i, rc;
6105 	struct sbuf sb;
6106 	char c;
6107 
6108 	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
6109 	for (i = 0; i < SGE_FLBUF_SIZES; i++) {
6110 		if (bufidx_used(sc, i))
6111 			c = '*';
6112 		else
6113 			c = '\0';
6114 
6115 		sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c);
6116 	}
6117 	sbuf_trim(&sb);
6118 	sbuf_finish(&sb);
6119 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
6120 	sbuf_delete(&sb);
6121 	return (rc);
6122 }
6123 
6124 #ifdef RATELIMIT
6125 /*
6126  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
6127  */
6128 static inline u_int
6129 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
6130 {
6131 	u_int n;
6132 
6133 	MPASS(immhdrs > 0);
6134 
6135 	n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
6136 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
6137 	if (__predict_false(nsegs == 0))
6138 		goto done;
6139 
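	/*
	 * The remaining segments pack two per 24B ulptx_sge_pair; a leftover
	 * odd segment still consumes a full 16B.
	 */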
6140 	nsegs--; /* first segment is part of ulptx_sgl */
6141 	n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
6142 	if (tso)
6143 		n += sizeof(struct cpl_tx_pkt_lso_core);
6144 
6145 done:
6146 	return (howmany(n, 16));
6147 }
6148 
6149 #define ETID_FLOWC_NPARAMS 6
6150 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
6151     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
6152 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
6153 
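/*
 * Send the FLOWC work request that binds this etid to its PF/VF, channel,
 * port, ingress queue, and scheduling class, and marks the flow established.
 * It must be the first WR on the flow and consumes ETID_FLOWC_LEN16 credits.
 */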
6154 static int
6155 send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi,
6156     struct vi_info *vi)
6157 {
6158 	struct wrq_cookie cookie;
6159 	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
6160 	struct fw_flowc_wr *flowc;
6161 
6162 	mtx_assert(&cst->lock, MA_OWNED);
6163 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
6164 	    EO_FLOWC_PENDING);
6165 
6166 	flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLOWC_LEN16, &cookie);
6167 	if (__predict_false(flowc == NULL))
6168 		return (ENOMEM);
6169 
6170 	bzero(flowc, ETID_FLOWC_LEN);
6171 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
6172 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
6173 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
6174 	    V_FW_WR_FLOWID(cst->etid));
6175 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
6176 	flowc->mnemval[0].val = htobe32(pfvf);
6177 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
6178 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
6179 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
6180 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
6181 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
6182 	flowc->mnemval[3].val = htobe32(cst->iqid);
6183 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
6184 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
6185 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
6186 	flowc->mnemval[5].val = htobe32(cst->schedcl);
6187 
6188 	commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);
6189 
6190 	cst->flags &= ~EO_FLOWC_PENDING;
6191 	cst->flags |= EO_FLOWC_RPL_PENDING;
6192 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
6193 	cst->tx_credits -= ETID_FLOWC_LEN16;
6194 
6195 	return (0);
6196 }
6197 
6198 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
6199 
6200 void
6201 send_etid_flush_wr(struct cxgbe_rate_tag *cst)
6202 {
6203 	struct fw_flowc_wr *flowc;
6204 	struct wrq_cookie cookie;
6205 
6206 	mtx_assert(&cst->lock, MA_OWNED);
6207 
6208 	flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLUSH_LEN16, &cookie);
6209 	if (__predict_false(flowc == NULL))
6210 		CXGBE_UNIMPLEMENTED(__func__);
6211 
6212 	bzero(flowc, ETID_FLUSH_LEN16 * 16);
6213 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
6214 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
6215 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
6216 	    V_FW_WR_FLOWID(cst->etid));
6217 
6218 	commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);
6219 
6220 	cst->flags |= EO_FLUSH_RPL_PENDING;
6221 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
6222 	cst->tx_credits -= ETID_FLUSH_LEN16;
6223 	cst->ncompl++;
6224 }
6225 
6226 static void
6227 write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr,
6228     struct mbuf *m0, int compl)
6229 {
6230 	struct cpl_tx_pkt_core *cpl;
6231 	uint64_t ctrl1;
6232 	uint32_t ctrl;	/* used in many unrelated places */
6233 	int len16, pktlen, nsegs, immhdrs;
6234 	caddr_t dst;
6235 	uintptr_t p;
6236 	struct ulptx_sgl *usgl;
6237 	struct sglist sg;
6238 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
6239 
6240 	mtx_assert(&cst->lock, MA_OWNED);
6241 	M_ASSERTPKTHDR(m0);
6242 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
6243 	    m0->m_pkthdr.l4hlen > 0,
6244 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
6245 
6246 	len16 = mbuf_eo_len16(m0);
6247 	nsegs = mbuf_eo_nsegs(m0);
6248 	pktlen = m0->m_pkthdr.len;
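	/*
	 * 'ctrl' is reused here as the WR's immediate-data length: the CPL
	 * (plus the LSO CPL for TSO) and the L2/L3/L4 headers copied inline.
	 */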
6249 	ctrl = sizeof(struct cpl_tx_pkt_core);
6250 	if (needs_tso(m0))
6251 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
6252 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
6253 	ctrl += immhdrs;
6254 
6255 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
6256 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
6257 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
6258 	    V_FW_WR_FLOWID(cst->etid));
6259 	wr->r3 = 0;
6260 	if (needs_outer_udp_csum(m0)) {
6261 		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
6262 		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
6263 		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
6264 		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
6265 		wr->u.udpseg.rtplen = 0;
6266 		wr->u.udpseg.r4 = 0;
6267 		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
6268 		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
6269 		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
6270 		cpl = (void *)(wr + 1);
6271 	} else {
6272 		MPASS(needs_outer_tcp_csum(m0));
6273 		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
6274 		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
6275 		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
6276 		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
6277 		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
6278 		wr->u.tcpseg.r4 = 0;
6279 		wr->u.tcpseg.r5 = 0;
6280 		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
6281 
6282 		if (needs_tso(m0)) {
6283 			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
6284 
6285 			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
6286 
6287 			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
6288 			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
6289 			    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
6290 				ETHER_HDR_LEN) >> 2) |
6291 			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
6292 			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
6293 			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
6294 				ctrl |= F_LSO_IPV6;
6295 			lso->lso_ctrl = htobe32(ctrl);
6296 			lso->ipid_ofst = htobe16(0);
6297 			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
6298 			lso->seqno_offset = htobe32(0);
6299 			lso->len = htobe32(pktlen);
6300 
6301 			cpl = (void *)(lso + 1);
6302 		} else {
6303 			wr->u.tcpseg.mss = htobe16(0xffff);
6304 			cpl = (void *)(wr + 1);
6305 		}
6306 	}
6307 
6308 	/* Checksum offload must be requested for ethofld. */
6309 	MPASS(needs_outer_l4_csum(m0));
6310 	ctrl1 = csum_to_ctrl(cst->adapter, m0);
6311 
6312 	/* VLAN tag insertion */
6313 	if (needs_vlan_insertion(m0)) {
6314 		ctrl1 |= F_TXPKT_VLAN_VLD |
6315 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
6316 	}
6317 
6318 	/* CPL header */
6319 	cpl->ctrl0 = cst->ctrl0;
6320 	cpl->pack = 0;
6321 	cpl->len = htobe16(pktlen);
6322 	cpl->ctrl1 = htobe64(ctrl1);
6323 
6324 	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
6325 	p = (uintptr_t)(cpl + 1);
6326 	m_copydata(m0, 0, immhdrs, (void *)p);
6327 
6328 	/* SGL */
6329 	dst = (void *)(cpl + 1);
6330 	if (nsegs > 0) {
6331 		int i, pad;
6332 
6333 		/* Zero-pad up to the next 16-byte boundary if not already aligned. */
6334 		p += immhdrs;
6335 		pad = 16 - (immhdrs & 0xf);
6336 		bzero((void *)p, pad);
6337 
6338 		usgl = (void *)(p + pad);
6339 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
6340 		    V_ULPTX_NSGE(nsegs));
6341 
6342 		sglist_init(&sg, nitems(segs), segs);
6343 		for (; m0 != NULL; m0 = m0->m_next) {
6344 			if (__predict_false(m0->m_len == 0))
6345 				continue;
6346 			if (immhdrs >= m0->m_len) {
6347 				immhdrs -= m0->m_len;
6348 				continue;
6349 			}
6350 			if (m0->m_flags & M_EXTPG)
6351 				sglist_append_mbuf_epg(&sg, m0,
6352 				    mtod(m0, vm_offset_t), m0->m_len);
6353 			else
6354 				sglist_append(&sg, mtod(m0, char *) + immhdrs,
6355 				    m0->m_len - immhdrs);
6356 			immhdrs = 0;
6357 		}
6358 		MPASS(sg.sg_nseg == nsegs);
6359 
6360 		/*
6361 		 * Zero pad last 8B in case the WR doesn't end on a 16B
6362 		 * boundary.
6363 		 */
6364 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
6365 
6366 		usgl->len0 = htobe32(segs[0].ss_len);
6367 		usgl->addr0 = htobe64(segs[0].ss_paddr);
6368 		for (i = 0; i < nsegs - 1; i++) {
6369 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
6370 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
6371 		}
6372 		if (i & 1)
6373 			usgl->sge[i / 2].len[1] = htobe32(0);
6374 	}
6375 
6376 }
6377 
6378 static void
6379 ethofld_tx(struct cxgbe_rate_tag *cst)
6380 {
6381 	struct mbuf *m;
6382 	struct wrq_cookie cookie;
6383 	int next_credits, compl;
6384 	struct fw_eth_tx_eo_wr *wr;
6385 
6386 	mtx_assert(&cst->lock, MA_OWNED);
6387 
6388 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
6389 		M_ASSERTPKTHDR(m);
6390 
6391 		/* How many len16 credits do we need to send this mbuf? */
6392 		next_credits = mbuf_eo_len16(m);
6393 		MPASS(next_credits > 0);
6394 		if (next_credits > cst->tx_credits) {
6395 			/*
6396 			 * Tx will make progress eventually because there is at
6397 			 * least one outstanding fw4_ack that will return
6398 			 * credits and kick the tx.
6399 			 */
6400 			MPASS(cst->ncompl > 0);
6401 			return;
6402 		}
6403 		wr = start_wrq_wr(&cst->eo_txq->wrq, next_credits, &cookie);
6404 		if (__predict_false(wr == NULL)) {
6405 			/* XXX: wishful thinking, not a real assertion. */
6406 			MPASS(cst->ncompl > 0);
6407 			return;
6408 		}
6409 		cst->tx_credits -= next_credits;
6410 		cst->tx_nocompl += next_credits;
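		/*
		 * Request a completion if none is outstanding or if half of
		 * the tag's credits have been consumed without one, so that
		 * credits (fw4_ack) keep flowing back.
		 */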
6411 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
6412 		ETHER_BPF_MTAP(cst->com.ifp, m);
6413 		write_ethofld_wr(cst, wr, m, compl);
6414 		commit_wrq_wr(&cst->eo_txq->wrq, wr, &cookie);
6415 		if (compl) {
6416 			cst->ncompl++;
6417 			cst->tx_nocompl	= 0;
6418 		}
6419 		(void) mbufq_dequeue(&cst->pending_tx);
6420 
6421 		/*
6422 		 * Drop the mbuf's reference on the tag now rather
6423 		 * than waiting until m_freem().  This ensures that
6424 		 * cxgbe_rate_tag_free gets called when the inp drops
6425 		 * its reference on the tag, there are no more mbufs
6426 		 * in the pending_tx queue, and any pending flush
6427 		 * request can be issued.  Otherwise, if the last mbuf
6428 		 * doesn't request a completion, the etid would never
6429 		 * be released.
6430 		 */
6431 		m->m_pkthdr.snd_tag = NULL;
6432 		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
6433 		m_snd_tag_rele(&cst->com);
6434 
6435 		mbufq_enqueue(&cst->pending_fwack, m);
6436 	}
6437 }
6438 
6439 int
6440 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
6441 {
6442 	struct cxgbe_rate_tag *cst;
6443 	int rc;
6444 
6445 	MPASS(m0->m_nextpkt == NULL);
6446 	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
6447 	MPASS(m0->m_pkthdr.snd_tag != NULL);
6448 	cst = mst_to_crt(m0->m_pkthdr.snd_tag);
6449 
6450 	mtx_lock(&cst->lock);
6451 	MPASS(cst->flags & EO_SND_TAG_REF);
6452 
6453 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
6454 		struct vi_info *vi = ifp->if_softc;
6455 		struct port_info *pi = vi->pi;
6456 		struct adapter *sc = pi->adapter;
6457 		const uint32_t rss_mask = vi->rss_size - 1;
6458 		uint32_t rss_hash;
6459 
6460 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
6461 		if (M_HASHTYPE_ISHASH(m0))
6462 			rss_hash = m0->m_pkthdr.flowid;
6463 		else
6464 			rss_hash = arc4random();
6465 		/* We assume RSS hashing */
6466 		cst->iqid = vi->rss[rss_hash & rss_mask];
6467 		cst->eo_txq += rss_hash % vi->nofldtxq;
6468 		rc = send_etid_flowc_wr(cst, pi, vi);
6469 		if (rc != 0)
6470 			goto done;
6471 	}
6472 
6473 	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
6474 		rc = ENOBUFS;
6475 		goto done;
6476 	}
6477 
6478 	mbufq_enqueue(&cst->pending_tx, m0);
6479 	cst->plen += m0->m_pkthdr.len;
6480 
6481 	/*
6482 	 * Hold an extra reference on the tag while generating work
6483 	 * requests to ensure that we don't try to free the tag during
6484 	 * ethofld_tx() in case we are sending the final mbuf after
6485 	 * the inp was freed.
6486 	 */
6487 	m_snd_tag_ref(&cst->com);
6488 	ethofld_tx(cst);
6489 	mtx_unlock(&cst->lock);
6490 	m_snd_tag_rele(&cst->com);
6491 	return (0);
6492 
6493 done:
6494 	mtx_unlock(&cst->lock);
6495 	if (__predict_false(rc != 0))
6496 		m_freem(m0);
6497 	return (rc);
6498 }
6499 
6500 static int
6501 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
6502 {
6503 	struct adapter *sc = iq->adapter;
6504 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
6505 	struct mbuf *m;
6506 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
6507 	struct cxgbe_rate_tag *cst;
6508 	uint8_t credits = cpl->credits;
6509 
6510 	cst = lookup_etid(sc, etid);
6511 	mtx_lock(&cst->lock);
6512 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
6513 		MPASS(credits >= ETID_FLOWC_LEN16);
6514 		credits -= ETID_FLOWC_LEN16;
6515 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
6516 	}
6517 
6518 	KASSERT(cst->ncompl > 0,
6519 	    ("%s: etid %u (%p) wasn't expecting completion.",
6520 	    __func__, etid, cst));
6521 	cst->ncompl--;
6522 
6523 	while (credits > 0) {
6524 		m = mbufq_dequeue(&cst->pending_fwack);
6525 		if (__predict_false(m == NULL)) {
6526 			/*
6527 			 * The remaining credits are for the final flush that
6528 			 * was issued when the tag was freed by the kernel.
6529 			 */
6530 			MPASS((cst->flags &
6531 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
6532 			    EO_FLUSH_RPL_PENDING);
6533 			MPASS(credits == ETID_FLUSH_LEN16);
6534 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
6535 			MPASS(cst->ncompl == 0);
6536 
6537 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
6538 			cst->tx_credits += cpl->credits;
6539 			cxgbe_rate_tag_free_locked(cst);
6540 			return (0);	/* cst is gone. */
6541 		}
6542 		KASSERT(m != NULL,
6543 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
6544 		    credits));
6545 		KASSERT(credits >= mbuf_eo_len16(m),
6546 		    ("%s: too few credits (%u, %u, %u)", __func__,
6547 		    cpl->credits, credits, mbuf_eo_len16(m)));
6548 		credits -= mbuf_eo_len16(m);
6549 		cst->plen -= m->m_pkthdr.len;
6550 		m_freem(m);
6551 	}
6552 
6553 	cst->tx_credits += cpl->credits;
6554 	MPASS(cst->tx_credits <= cst->tx_total);
6555 
6556 	if (cst->flags & EO_SND_TAG_REF) {
6557 		/*
6558 		 * As with ethofld_transmit(), hold an extra reference
6559 		 * so that the tag is stable across ethofld_tx().
6560 		 */
6561 		m_snd_tag_ref(&cst->com);
6562 		m = mbufq_first(&cst->pending_tx);
6563 		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
6564 			ethofld_tx(cst);
6565 		mtx_unlock(&cst->lock);
6566 		m_snd_tag_rele(&cst->com);
6567 	} else {
6568 		/*
6569 		 * There shouldn't be any pending packets if the tag
6570 		 * was freed by the kernel since any pending packet
6571 		 * should hold a reference to the tag.
6572 		 */
6573 		MPASS(mbufq_first(&cst->pending_tx) == NULL);
6574 		mtx_unlock(&cst->lock);
6575 	}
6576 
6577 	return (0);
6578 }
6579 #endif
6580