1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2011, 2025 Chelsio Communications.
5 * Written by: Navdeep Parhar <np@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_kern_tls.h"
33 #include "opt_ratelimit.h"
34
35 #include <sys/types.h>
36 #include <sys/eventhandler.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/kernel.h>
40 #include <sys/ktls.h>
41 #include <sys/malloc.h>
42 #include <sys/msan.h>
43 #include <sys/queue.h>
44 #include <sys/sbuf.h>
45 #include <sys/taskqueue.h>
46 #include <sys/time.h>
47 #include <sys/sglist.h>
48 #include <sys/sysctl.h>
49 #include <sys/smp.h>
50 #include <sys/socketvar.h>
51 #include <sys/counter.h>
52 #include <net/bpf.h>
53 #include <net/ethernet.h>
54 #include <net/if.h>
55 #include <net/if_vlan_var.h>
56 #include <net/if_vxlan.h>
57 #include <netinet/in.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip6.h>
60 #include <netinet/tcp.h>
61 #include <netinet/udp.h>
62 #include <machine/in_cksum.h>
63 #include <machine/md_var.h>
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 #ifdef DEV_NETMAP
67 #include <machine/bus.h>
68 #include <sys/selinfo.h>
69 #include <net/if_var.h>
70 #include <net/netmap.h>
71 #include <dev/netmap/netmap_kern.h>
72 #endif
73
74 #include "common/common.h"
75 #include "common/t4_regs.h"
76 #include "common/t4_regs_values.h"
77 #include "common/t4_msg.h"
78 #include "t4_l2t.h"
79 #include "t4_mp_ring.h"
80
81 #define RX_COPY_THRESHOLD MINCLSIZE
82
83 /*
84 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
85 * 0-7 are valid values.
86 */
87 static int fl_pktshift = 0;
88 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
89 "payload DMA offset in rx buffer (bytes)");
90
91 /*
92 * Pad ethernet payload up to this boundary.
93 * -1: driver should figure out a good value.
94 * 0: disable padding.
95 * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
96 */
97 int fl_pad = -1;
98 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
99 "payload pad boundary (bytes)");
100
101 /*
102 * Status page length.
103 * -1: driver should figure out a good value.
104 * 64 or 128 are the only other valid values.
105 */
106 static int spg_len = -1;
107 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
108 "status page size (bytes)");
109
110 /*
111 * Congestion drops.
112 * -1: no congestion feedback (not recommended).
113 * 0: backpressure the channel instead of dropping packets right away.
114 * 1: no backpressure, drop packets for the congested queue immediately.
115 * 2: both backpressure and drop.
116 */
117 static int cong_drop = 0;
118 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
119 "Congestion control for NIC RX queues (0 = backpressure, 1 = drop, 2 = both");
120 #ifdef TCP_OFFLOAD
121 static int ofld_cong_drop = 0;
122 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ofld_cong_drop, CTLFLAG_RDTUN, &ofld_cong_drop, 0,
123 "Congestion control for TOE RX queues (0 = backpressure, 1 = drop, 2 = both");
124 #endif
125
126 /*
127 * Deliver multiple frames in the same free list buffer if they fit.
128 * -1: let the driver decide whether to enable buffer packing or not.
129 * 0: disable buffer packing.
130 * 1: enable buffer packing.
131 */
132 static int buffer_packing = -1;
133 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
134 0, "Enable buffer packing");
135
136 /*
137 * Start next frame in a packed buffer at this boundary.
138 * -1: driver should figure out a good value.
139 * T4: driver will ignore this and use the same value as fl_pad above.
140 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
141 */
142 static int fl_pack = -1;
143 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
144 "payload pack boundary (bytes)");
145
146 /*
147 * Largest rx cluster size that the driver is allowed to allocate.
148 */
149 static int largest_rx_cluster = MJUM16BYTES;
150 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
151 &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
152
153 /*
154 * Size of cluster allocation that's most likely to succeed. The driver will
155 * fall back to this size if it fails to allocate clusters larger than this.
156 */
157 static int safest_rx_cluster = PAGE_SIZE;
158 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
159 &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
160
161 #ifdef RATELIMIT
162 /*
163 * Knob to control TCP timestamp rewriting, and the granularity of the tick used
164 * for rewriting. -1 and 0-3 are all valid values.
165 * -1: hardware should leave the TCP timestamps alone.
166 * 0: 1ms
167 * 1: 100us
168 * 2: 10us
169 * 3: 1us
170 */
171 static int tsclk = -1;
172 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
173 "Control TCP timestamp rewriting when using pacing");
174
175 static int eo_max_backlog = 1024 * 1024;
176 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
177 0, "Maximum backlog of ratelimited data per flow");
178 #endif
179
180 /*
181 * The interrupt holdoff timers are multiplied by this value on T6+.
182 * 1 and 3-17 (both inclusive) are legal values.
183 */
184 static int tscale = 1;
185 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
186 "Interrupt holdoff timer scale on T6+");
187
188 /*
189 * Number of LRO entries in the lro_ctrl structure per rx queue.
190 */
191 static int lro_entries = TCP_LRO_ENTRIES;
192 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
193 "Number of LRO entries per RX queue");
194
195 /*
196 * This enables presorting of frames before they're fed into tcp_lro_rx.
197 */
198 static int lro_mbufs = 0;
199 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
200 "Enable presorting of LRO frames");
201
202 static counter_u64_t pullups;
203 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, pullups, CTLFLAG_RD, &pullups,
204 "Number of mbuf pullups performed");
205
206 static counter_u64_t defrags;
207 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags,
208 "Number of mbuf defrags performed");
209
210 static int t4_tx_coalesce = 1;
211 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0,
212 "tx coalescing allowed");
213
214 /*
215 * The driver will make aggressive attempts at tx coalescing if it sees this
216 * many packets eligible for coalescing in quick succession, with no more than
217 * the specified gap in between the eth_tx calls that delivered the packets.
218 */
219 static int t4_tx_coalesce_pkts = 32;
220 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN,
221 &t4_tx_coalesce_pkts, 0,
222 "# of consecutive packets (1 - 255) that will trigger tx coalescing");
223 static int t4_tx_coalesce_gap = 5;
224 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN,
225 &t4_tx_coalesce_gap, 0, "tx gap (in microseconds)");
226
227 static int service_iq(struct sge_iq *, int);
228 static int service_iq_fl(struct sge_iq *, int);
229 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
230 static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *,
231 u_int);
232 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
233 int, int, int);
234 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
235 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
236 struct sge_iq *, char *);
237 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
238 struct sysctl_ctx_list *, struct sysctl_oid *);
239 static void free_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *);
240 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
241 struct sge_iq *);
242 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
243 struct sysctl_oid *, struct sge_fl *);
244 static int alloc_iq_fl_hwq(struct vi_info *, struct sge_iq *, struct sge_fl *);
245 static int free_iq_fl_hwq(struct adapter *, struct sge_iq *, struct sge_fl *);
246 static int alloc_fwq(struct adapter *);
247 static void free_fwq(struct adapter *);
248 static int alloc_ctrlq(struct adapter *, int);
249 static void free_ctrlq(struct adapter *, int);
250 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, int);
251 static void free_rxq(struct vi_info *, struct sge_rxq *);
252 static void add_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
253 struct sge_rxq *);
254 #ifdef TCP_OFFLOAD
255 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
256 int);
257 static void free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
258 static void add_ofld_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
259 struct sge_ofld_rxq *);
260 #endif
261 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *, int);
262 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *,
263 int);
264 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
265 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *,
266 int);
267 #endif
268 static int alloc_eq(struct adapter *, struct sge_eq *, struct sysctl_ctx_list *,
269 struct sysctl_oid *);
270 static void free_eq(struct adapter *, struct sge_eq *);
271 static void add_eq_sysctls(struct adapter *, struct sysctl_ctx_list *,
272 struct sysctl_oid *, struct sge_eq *);
273 static int alloc_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *,
274 int);
275 static int free_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *);
276 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
277 struct sysctl_ctx_list *, struct sysctl_oid *);
278 static void free_wrq(struct adapter *, struct sge_wrq *);
279 static void add_wrq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
280 struct sge_wrq *);
281 static int alloc_txq(struct vi_info *, struct sge_txq *, int);
282 static void free_txq(struct vi_info *, struct sge_txq *);
283 static void add_txq_sysctls(struct vi_info *, struct sysctl_ctx_list *,
284 struct sysctl_oid *, struct sge_txq *);
285 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
286 static int alloc_ofld_txq(struct vi_info *, struct sge_ofld_txq *, int);
287 static void free_ofld_txq(struct vi_info *, struct sge_ofld_txq *);
288 static void add_ofld_txq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
289 struct sge_ofld_txq *);
290 #endif
291 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
292 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
293 static int refill_fl(struct adapter *, struct sge_fl *, int);
294 static void refill_sfl(void *);
295 static int find_refill_source(struct adapter *, int, bool);
296 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
297
298 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
299 static inline u_int txpkt_len16(u_int, const u_int);
300 static inline u_int txpkt_vm_len16(u_int, const u_int);
301 static inline void calculate_mbuf_len16(struct mbuf *, bool);
302 static inline u_int txpkts0_len16(u_int);
303 static inline u_int txpkts1_len16(void);
304 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
305 static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *,
306 u_int);
307 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
308 struct mbuf *);
309 static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *,
310 int, bool *);
311 static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *,
312 int, bool *);
313 static u_int write_txpkts_wr(struct adapter *, struct sge_txq *);
314 static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *);
315 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
316 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
317 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
318 static inline uint16_t read_hw_cidx(struct sge_eq *);
319 static inline u_int reclaimable_tx_desc(struct sge_eq *);
320 static inline u_int total_available_tx_desc(struct sge_eq *);
321 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
322 static void tx_reclaim(void *, int);
323 static __be64 get_flit(struct sglist_seg *, int, int);
324 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
325 struct mbuf *);
326 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
327 struct mbuf *);
328 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
329 static void wrq_tx_drain(void *, int);
330 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
331
332 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
333 #ifdef RATELIMIT
334 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
335 struct mbuf *);
336 #if defined(INET) || defined(INET6)
337 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
338 static int ethofld_transmit(if_t, struct mbuf *);
339 #endif
340 #endif
341
342 static counter_u64_t extfree_refs;
343 static counter_u64_t extfree_rels;
344
345 an_handler_t t4_an_handler;
346 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
347 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
348 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
349 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
350 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
351 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
352 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
353 cpl_handler_t fw6_pld_handlers[NUM_CPL_FW6_COOKIES];
354
355 void
356 t4_register_an_handler(an_handler_t h)
357 {
358 uintptr_t *loc;
359
360 MPASS(h == NULL || t4_an_handler == NULL);
361
362 loc = (uintptr_t *)&t4_an_handler;
363 atomic_store_rel_ptr(loc, (uintptr_t)h);
364 }
365
366 void
367 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
368 {
369 uintptr_t *loc;
370
371 MPASS(type < nitems(t4_fw_msg_handler));
372 MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
373 /*
374 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
375 * handler dispatch table. Reject any attempt to install a handler for
376 * this subtype.
377 */
378 MPASS(type != FW_TYPE_RSSCPL);
379 MPASS(type != FW6_TYPE_RSSCPL);
380
381 loc = (uintptr_t *)&t4_fw_msg_handler[type];
382 atomic_store_rel_ptr(loc, (uintptr_t)h);
383 }
384
385 void
386 t4_register_cpl_handler(int opcode, cpl_handler_t h)
387 {
388 uintptr_t *loc;
389
390 MPASS(opcode < nitems(t4_cpl_handler));
391 MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);
392
393 loc = (uintptr_t *)&t4_cpl_handler[opcode];
394 atomic_store_rel_ptr(loc, (uintptr_t)h);
395 }
396
397 static int
398 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
399 struct mbuf *m)
400 {
401 const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
402 u_int tid;
403 int cookie;
404
405 MPASS(m == NULL);
406
407 tid = GET_TID(cpl);
408 if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) {
409 /*
410 * The return code for filter-write is put in the CPL cookie so
411 * we have to rely on the hardware tid (is_ftid) to determine
412 * that this is a response to a filter.
413 */
414 cookie = CPL_COOKIE_FILTER;
415 } else {
416 cookie = G_COOKIE(cpl->cookie);
417 }
418 MPASS(cookie > CPL_COOKIE_RESERVED);
419 MPASS(cookie < nitems(set_tcb_rpl_handlers));
420
421 return (set_tcb_rpl_handlers[cookie](iq, rss, m));
422 }
423
424 static int
425 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
426 struct mbuf *m)
427 {
428 const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
429 unsigned int cookie;
430
431 MPASS(m == NULL);
432
433 cookie = GET_TID(rpl) & F_SYNC_WR ? CPL_COOKIE_TOM : CPL_COOKIE_FILTER;
434 return (l2t_write_rpl_handlers[cookie](iq, rss, m));
435 }
436
437 static int
438 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
439 struct mbuf *m)
440 {
441 const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
442 u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status)));
443
444 MPASS(m == NULL);
445 MPASS(cookie != CPL_COOKIE_RESERVED);
446
447 return (act_open_rpl_handlers[cookie](iq, rss, m));
448 }
449
450 static int
451 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
452 struct mbuf *m)
453 {
454 struct adapter *sc = iq->adapter;
455 u_int cookie;
456
457 MPASS(m == NULL);
458 if (is_hashfilter(sc))
459 cookie = CPL_COOKIE_HASHFILTER;
460 else
461 cookie = CPL_COOKIE_TOM;
462
463 return (abort_rpl_rss_handlers[cookie](iq, rss, m));
464 }
465
466 static int
467 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
468 {
469 struct adapter *sc = iq->adapter;
470 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
471 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
472 u_int cookie;
473
474 MPASS(m == NULL);
475 if (is_etid(sc, tid))
476 cookie = CPL_COOKIE_ETHOFLD;
477 else
478 cookie = CPL_COOKIE_TOM;
479
480 return (fw4_ack_handlers[cookie](iq, rss, m));
481 }
482
483 static int
484 fw6_pld_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
485 {
486 const struct cpl_fw6_pld *cpl;
487 uint64_t cookie;
488
489 if (m != NULL)
490 cpl = mtod(m, const void *);
491 else
492 cpl = (const void *)(rss + 1);
493 cookie = be64toh(cpl->data[1]) & CPL_FW6_COOKIE_MASK;
494
495 return (fw6_pld_handlers[cookie](iq, rss, m));
496 }
497
498 static void
499 t4_init_shared_cpl_handlers(void)
500 {
501
502 t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
503 t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
504 t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
505 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
506 t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
507 t4_register_cpl_handler(CPL_FW6_PLD, fw6_pld_handler);
508 }
509
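/*
 * Install (or, with h == NULL, clear) the handler for a CPL opcode that is
 * shared between multiple consumers.  The cookie selects the per-consumer
 * slot in the corresponding *_handlers[] table consulted by the shared
 * dispatchers above.
 */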
510 void
511 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
512 {
513 uintptr_t *loc;
514
515 MPASS(opcode < nitems(t4_cpl_handler));
516 if (opcode == CPL_FW6_PLD) {
517 MPASS(cookie < NUM_CPL_FW6_COOKIES);
518 } else {
519 MPASS(cookie > CPL_COOKIE_RESERVED);
520 MPASS(cookie < NUM_CPL_COOKIES);
521 }
522 MPASS(t4_cpl_handler[opcode] != NULL);
523
524 switch (opcode) {
525 case CPL_SET_TCB_RPL:
526 loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie];
527 break;
528 case CPL_L2T_WRITE_RPL:
529 loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie];
530 break;
531 case CPL_ACT_OPEN_RPL:
532 loc = (uintptr_t *)&act_open_rpl_handlers[cookie];
533 break;
534 case CPL_ABORT_RPL_RSS:
535 loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie];
536 break;
537 case CPL_FW4_ACK:
538 loc = (uintptr_t *)&fw4_ack_handlers[cookie];
539 break;
540 case CPL_FW6_PLD:
541 loc = (uintptr_t *)&fw6_pld_handlers[cookie];
542 break;
543 default:
544 MPASS(0);
545 return;
546 }
547 MPASS(h == NULL || *loc == (uintptr_t)NULL);
548 atomic_store_rel_ptr(loc, (uintptr_t)h);
549 }
550
551 /*
552 * Called on MOD_LOAD. Validates and calculates the SGE tunables.
553 */
554 void
555 t4_sge_modload(void)
556 {
557
558 if (fl_pktshift < 0 || fl_pktshift > 7) {
559 printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
560 " using 0 instead.\n", fl_pktshift);
561 fl_pktshift = 0;
562 }
563
564 if (spg_len != 64 && spg_len != 128) {
565 int len;
566
567 #if defined(__i386__) || defined(__amd64__)
568 len = cpu_clflush_line_size > 64 ? 128 : 64;
569 #else
570 len = 64;
571 #endif
572 if (spg_len != -1) {
573 printf("Invalid hw.cxgbe.spg_len value (%d),"
574 " using %d instead.\n", spg_len, len);
575 }
576 spg_len = len;
577 }
578
579 if (cong_drop < -1 || cong_drop > 2) {
580 printf("Invalid hw.cxgbe.cong_drop value (%d),"
581 " using 0 instead.\n", cong_drop);
582 cong_drop = 0;
583 }
584 #ifdef TCP_OFFLOAD
585 if (ofld_cong_drop < -1 || ofld_cong_drop > 2) {
586 printf("Invalid hw.cxgbe.ofld_cong_drop value (%d),"
587 " using 0 instead.\n", ofld_cong_drop);
588 ofld_cong_drop = 0;
589 }
590 #endif
591
592 if (tscale != 1 && (tscale < 3 || tscale > 17)) {
593 printf("Invalid hw.cxgbe.tscale value (%d),"
594 " using 1 instead.\n", tscale);
595 tscale = 1;
596 }
597
598 if (largest_rx_cluster != MCLBYTES &&
599 #if MJUMPAGESIZE != MCLBYTES
600 largest_rx_cluster != MJUMPAGESIZE &&
601 #endif
602 largest_rx_cluster != MJUM9BYTES &&
603 largest_rx_cluster != MJUM16BYTES) {
604 printf("Invalid hw.cxgbe.largest_rx_cluster value (%d),"
605 " using %d instead.\n", largest_rx_cluster, MJUM16BYTES);
606 largest_rx_cluster = MJUM16BYTES;
607 }
608
609 if (safest_rx_cluster != MCLBYTES &&
610 #if MJUMPAGESIZE != MCLBYTES
611 safest_rx_cluster != MJUMPAGESIZE &&
612 #endif
613 safest_rx_cluster != MJUM9BYTES &&
614 safest_rx_cluster != MJUM16BYTES) {
615 printf("Invalid hw.cxgbe.safest_rx_cluster value (%d),"
616 " using %d instead.\n", safest_rx_cluster, MJUMPAGESIZE);
617 safest_rx_cluster = MJUMPAGESIZE;
618 }
619
620 extfree_refs = counter_u64_alloc(M_WAITOK);
621 extfree_rels = counter_u64_alloc(M_WAITOK);
622 pullups = counter_u64_alloc(M_WAITOK);
623 defrags = counter_u64_alloc(M_WAITOK);
624 counter_u64_zero(extfree_refs);
625 counter_u64_zero(extfree_rels);
626 counter_u64_zero(pullups);
627 counter_u64_zero(defrags);
628
629 t4_init_shared_cpl_handlers();
630 t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
631 t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
632 t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
633 #ifdef RATELIMIT
634 t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
635 CPL_COOKIE_ETHOFLD);
636 #endif
637 t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
638 t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
639 }
640
641 void
642 t4_sge_modunload(void)
643 {
644
645 counter_u64_free(extfree_refs);
646 counter_u64_free(extfree_rels);
647 counter_u64_free(pullups);
648 counter_u64_free(defrags);
649 }
650
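/*
 * Number of cluster references handed to the network stack (via m_extaddref
 * in get_scatter_segment) that have not been released by rxb_free yet.
 */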
651 uint64_t
652 t4_sge_extfree_refs(void)
653 {
654 uint64_t refs, rels;
655
656 rels = counter_u64_fetch(extfree_rels);
657 refs = counter_u64_fetch(extfree_refs);
658
659 return (refs - rels);
660 }
661
662 /* max 4096 */
663 #define MAX_PACK_BOUNDARY 512
664
665 static inline void
666 setup_pad_and_pack_boundaries(struct adapter *sc)
667 {
668 uint32_t v, m;
669 int pad, pack, pad_shift;
670
671 pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
672 X_INGPADBOUNDARY_SHIFT;
673 pad = fl_pad;
674 if (fl_pad < (1 << pad_shift) ||
675 fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
676 !powerof2(fl_pad)) {
677 /*
678 * If there is any chance that we might use buffer packing and
679 * the chip is a T4, then pick 64 as the pad/pack boundary. Set
680 * it to the minimum allowed in all other cases.
681 */
682 pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
683
684 /*
685 * For fl_pad = 0 we'll still write a reasonable value to the
686 * register but all the freelists will opt out of padding.
687 * We'll complain here only if the user tried to set it to a
688 * value greater than 0 that was invalid.
689 */
690 if (fl_pad > 0) {
691 device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
692 " (%d), using %d instead.\n", fl_pad, pad);
693 }
694 }
695 m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
696 v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
697 t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
698
699 if (is_t4(sc)) {
700 if (fl_pack != -1 && fl_pack != pad) {
701 /* Complain but carry on. */
702 device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
703 " using %d instead.\n", fl_pack, pad);
704 }
705 return;
706 }
707
708 pack = fl_pack;
709 if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
710 !powerof2(fl_pack)) {
711 if (sc->params.pci.mps > MAX_PACK_BOUNDARY)
712 pack = MAX_PACK_BOUNDARY;
713 else
714 pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
715 MPASS(powerof2(pack));
716 if (pack < 16)
717 pack = 16;
718 if (pack == 32)
719 pack = 64;
720 if (pack > 4096)
721 pack = 4096;
722 if (fl_pack != -1) {
723 device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
724 " (%d), using %d instead.\n", fl_pack, pack);
725 }
726 }
727 m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
728 if (pack == 16)
729 v = V_INGPACKBOUNDARY(0);
730 else
731 v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
732
733 MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */
734 t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
735 }
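/*
 * A quick sketch of the encodings used above (derived from the code, not a
 * normative description of the hardware): the pad boundary is written as
 * ilog2(pad) - pad_shift, so with pad_shift = 5 a 64B boundary is encoded as
 * 1; the pack boundary is written as ilog2(pack) - 5, with 0 meaning 16B.
 */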
736
737 /*
738 * adap->params.vpd.cclk must be set up before this is called.
739 */
740 void
741 t4_tweak_chip_settings(struct adapter *sc)
742 {
743 int i, reg;
744 uint32_t v, m;
745 int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
746 int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
747 int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
748 uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
749 static int sw_buf_sizes[] = {
750 MCLBYTES,
751 #if MJUMPAGESIZE != MCLBYTES
752 MJUMPAGESIZE,
753 #endif
754 MJUM9BYTES,
755 MJUM16BYTES
756 };
757
758 KASSERT(sc->flags & MASTER_PF,
759 ("%s: trying to change chip settings when not master.", __func__));
760
761 m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
762 v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
763 V_EGRSTATUSPAGESIZE(spg_len == 128);
764 t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
765
766 setup_pad_and_pack_boundaries(sc);
767
768 v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
769 V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
770 V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
771 V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
772 V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
773 V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
774 V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
775 V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
776 t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
777
778 t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
779 t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
780 reg = A_SGE_FL_BUFFER_SIZE2;
781 for (i = 0; i < nitems(sw_buf_sizes); i++) {
782 MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
783 t4_write_reg(sc, reg, sw_buf_sizes[i]);
784 reg += 4;
785 MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
786 t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE);
787 reg += 4;
788 }
789
790 v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
791 V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
792 t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
793
794 KASSERT(intr_timer[0] <= timer_max,
795 ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
796 timer_max));
797 for (i = 1; i < nitems(intr_timer); i++) {
798 KASSERT(intr_timer[i] >= intr_timer[i - 1],
799 ("%s: timers not listed in increasing order (%d)",
800 __func__, i));
801
802 while (intr_timer[i] > timer_max) {
803 if (i == nitems(intr_timer) - 1) {
804 intr_timer[i] = timer_max;
805 break;
806 }
807 intr_timer[i] += intr_timer[i - 1];
808 intr_timer[i] /= 2;
809 }
810 }
811
812 v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
813 V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
814 t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
815 v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
816 V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
817 t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
818 v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
819 V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
820 t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
821
822 if (chip_id(sc) >= CHELSIO_T6) {
823 m = V_TSCALE(M_TSCALE);
824 if (tscale == 1)
825 v = 0;
826 else
827 v = V_TSCALE(tscale - 2);
828 t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
829
830 if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
831 t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
832 if (chip_id(sc) >= CHELSIO_T7) {
833 v |= F_GLFL;
834 } else {
835 m = V_RDTHRESHOLD(M_RDTHRESHOLD) |
836 F_WRTHRTHRESHEN |
837 V_WRTHRTHRESH(M_WRTHRTHRESH);
838 v &= ~m;
839 v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
840 V_WRTHRTHRESH(16);
841 }
842 t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
843 }
844 }
845
846 /* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
847 v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
848 t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
849
850 /*
851 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP. These have been
852 * chosen with MAXPHYS = 128K in mind. The largest DDP buffer that we
853 * may have to deal with is MAXPHYS + 1 page.
854 */
855 v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
856 t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
857
858 /* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
859 m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
860 if (sc->nvmecaps != 0) {
861 /* Request DDP status bit for NVMe PDU completions. */
862 m |= F_NVME_TCP_DDP_VAL_EN;
863 v |= F_NVME_TCP_DDP_VAL_EN;
864 }
865 t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
866
867 m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
868 F_RESETDDPOFFSET;
869 v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
870 t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
871 }
872
873 /*
874 * SGE wants the buffer to be at least 64B and then a multiple of 16. Its
875 * address must be 16B aligned. If padding is in use the buffer's start and end
876 * need to be aligned to the pad boundary as well. We'll just make sure that
877 * the size is a multiple of the pad boundary here, it is up to the buffer
878 * allocation code to make sure the start of the buffer is aligned.
879 */
880 static inline int
881 hwsz_ok(struct adapter *sc, int hwsz)
882 {
883 int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
884
885 return (hwsz >= 64 && (hwsz & mask) == 0);
886 }
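/*
 * For example: with padding enabled and a 64B pad boundary, a hardware buffer
 * size must be a multiple of 64; with padding disabled it only needs to be a
 * multiple of 16.  Either way it must be at least 64B.
 */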
887
888 /*
889 * Initialize the rx buffer sizes and figure out which zones the buffers will
890 * be allocated from.
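 *
 * For each supported cluster size (size1) this records the hardware
 * free-list buffer size that matches it exactly (hwidx1) and, when buffer
 * packing is possible, the hardware size that best uses the cluster while
 * leaving room for the cluster metadata at the end (size2/hwidx2).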
891 */
892 void
893 t4_init_rx_buf_info(struct adapter *sc)
894 {
895 struct sge *s = &sc->sge;
896 struct sge_params *sp = &sc->params.sge;
897 int i, j, n;
898 static int sw_buf_sizes[] = { /* Sorted by size */
899 MCLBYTES,
900 #if MJUMPAGESIZE != MCLBYTES
901 MJUMPAGESIZE,
902 #endif
903 MJUM9BYTES,
904 MJUM16BYTES
905 };
906 struct rx_buf_info *rxb;
907
908 s->safe_zidx = -1;
909 rxb = &s->rx_buf_info[0];
910 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
911 rxb->size1 = sw_buf_sizes[i];
912 rxb->zone = m_getzone(rxb->size1);
913 rxb->type = m_gettype(rxb->size1);
914 rxb->size2 = 0;
915 rxb->hwidx1 = -1;
916 rxb->hwidx2 = -1;
917 for (j = 0; j < SGE_FLBUF_SIZES; j++) {
918 int hwsize = sp->sge_fl_buffer_size[j];
919
920 if (!hwsz_ok(sc, hwsize))
921 continue;
922
923 /* hwidx for size1 */
924 if (rxb->hwidx1 == -1 && rxb->size1 == hwsize)
925 rxb->hwidx1 = j;
926
927 /* hwidx for size2 (buffer packing) */
928 if (rxb->size1 - CL_METADATA_SIZE < hwsize)
929 continue;
930 n = rxb->size1 - hwsize - CL_METADATA_SIZE;
931 if (n == 0) {
932 rxb->hwidx2 = j;
933 rxb->size2 = hwsize;
934 break; /* stop looking */
935 }
936 if (rxb->hwidx2 != -1) {
937 if (n < sp->sge_fl_buffer_size[rxb->hwidx2] -
938 hwsize - CL_METADATA_SIZE) {
939 rxb->hwidx2 = j;
940 rxb->size2 = hwsize;
941 }
942 } else if (n <= 2 * CL_METADATA_SIZE) {
943 rxb->hwidx2 = j;
944 rxb->size2 = hwsize;
945 }
946 }
947 if (rxb->hwidx2 != -1)
948 sc->flags |= BUF_PACKING_OK;
949 if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster)
950 s->safe_zidx = i;
951 }
952 }
953
954 /*
955 * Verify some basic SGE settings for the PF and VF driver, and other
956 * miscellaneous settings for the PF driver.
957 */
958 int
959 t4_verify_chip_settings(struct adapter *sc)
960 {
961 struct sge_params *sp = &sc->params.sge;
962 uint32_t m, v, r;
963 int rc = 0;
964 const uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
965
966 m = F_RXPKTCPLMODE;
967 v = F_RXPKTCPLMODE;
968 r = sp->sge_control;
969 if ((r & m) != v) {
970 device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
971 rc = EINVAL;
972 }
973
974 /*
975 * If this changes then every single use of PAGE_SHIFT in the driver
976 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
977 */
978 if (sp->page_shift != PAGE_SHIFT) {
979 device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
980 rc = EINVAL;
981 }
982
983 if (sc->flags & IS_VF)
984 return (0);
985
986 v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
987 r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
988 if (r != v) {
989 device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
990 if (sc->vres.ddp.size != 0)
991 rc = EINVAL;
992 }
993
994 m = v = F_TDDPTAGTCB;
995 r = t4_read_reg(sc, A_ULP_RX_CTL);
996 if ((r & m) != v) {
997 device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
998 if (sc->vres.ddp.size != 0)
999 rc = EINVAL;
1000 }
1001
1002 m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
1003 F_RESETDDPOFFSET;
1004 v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
1005 r = t4_read_reg(sc, A_TP_PARA_REG5);
1006 if ((r & m) != v) {
1007 device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
1008 if (sc->vres.ddp.size != 0)
1009 rc = EINVAL;
1010 }
1011
1012 return (rc);
1013 }
1014
1015 int
1016 t4_create_dma_tag(struct adapter *sc)
1017 {
1018 int rc;
1019
1020 rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
1021 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
1022 BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
1023 NULL, &sc->dmat);
1024 if (rc != 0) {
1025 device_printf(sc->dev,
1026 "failed to create main DMA tag: %d\n", rc);
1027 }
1028
1029 return (rc);
1030 }
1031
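/*
 * Adapter-wide SGE sysctls.  These are attached under the nexus device's
 * sysctl tree, e.g. dev.t5nex.0.buffer_sizes (the device name varies with
 * the chip generation).
 */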
1032 void
1033 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
1034 struct sysctl_oid_list *children)
1035 {
1036 struct sge_params *sp = &sc->params.sge;
1037
1038 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
1039 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1040 sysctl_bufsizes, "A", "freelist buffer sizes");
1041
1042 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
1043 NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
1044
1045 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
1046 NULL, sp->pad_boundary, "payload pad boundary (bytes)");
1047
1048 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
1049 NULL, sp->spg_len, "status page size (bytes)");
1050
1051 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
1052 NULL, cong_drop, "congestion drop setting");
1053 #ifdef TCP_OFFLOAD
1054 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ofld_cong_drop", CTLFLAG_RD,
1055 NULL, ofld_cong_drop, "congestion drop setting");
1056 #endif
1057
1058 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
1059 NULL, sp->pack_boundary, "payload pack boundary (bytes)");
1060 }
1061
1062 int
1063 t4_destroy_dma_tag(struct adapter *sc)
1064 {
1065 if (sc->dmat)
1066 bus_dma_tag_destroy(sc->dmat);
1067
1068 return (0);
1069 }
1070
1071 /*
1072 * Allocate and initialize the firmware event queue, control queues, and special
1073 * purpose rx queues owned by the adapter.
1074 *
1075 * Returns errno on failure. Resources allocated up to that point may still be
1076 * allocated. Caller is responsible for cleanup in case this function fails.
1077 */
1078 int
1079 t4_setup_adapter_queues(struct adapter *sc)
1080 {
1081 int rc, i;
1082
1083 ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
1084
1085 /*
1086 * Firmware event queue
1087 */
1088 rc = alloc_fwq(sc);
1089 if (rc != 0)
1090 return (rc);
1091
1092 /*
1093 * That's all for the VF driver.
1094 */
1095 if (sc->flags & IS_VF)
1096 return (rc);
1097
1098 /*
1099 * XXX: General purpose rx queues, one per port.
1100 */
1101
1102 /*
1103 * Control queues. At least one per port and per internal core.
1104 */
1105 for (i = 0; i < sc->sge.nctrlq; i++) {
1106 rc = alloc_ctrlq(sc, i);
1107 if (rc != 0)
1108 return (rc);
1109 }
1110
1111 return (rc);
1112 }
1113
1114 /*
1115 * Idempotent
1116 */
1117 int
1118 t4_teardown_adapter_queues(struct adapter *sc)
1119 {
1120 int i;
1121
1122 ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
1123
1124 if (sc->sge.ctrlq != NULL) {
1125 MPASS(!(sc->flags & IS_VF)); /* VFs don't allocate ctrlq. */
1126 for (i = 0; i < sc->sge.nctrlq; i++)
1127 free_ctrlq(sc, i);
1128 }
1129 free_fwq(sc);
1130
1131 return (0);
1132 }
1133
1134 /* Maximum payload that could arrive with a single iq descriptor. */
1135 static inline int
1136 max_rx_payload(struct adapter *sc, if_t ifp, const bool ofld)
1137 {
1138 int maxp;
1139
1140 /* large enough even when hw VLAN extraction is disabled */
1141 maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
1142 ETHER_VLAN_ENCAP_LEN + if_getmtu(ifp);
1143 if (ofld && sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS &&
1144 maxp < sc->params.tp.max_rx_pdu)
1145 maxp = sc->params.tp.max_rx_pdu;
1146 return (maxp);
1147 }
1148
1149 int
1150 t4_setup_vi_queues(struct vi_info *vi)
1151 {
1152 int rc = 0, i, intr_idx;
1153 struct sge_rxq *rxq;
1154 struct sge_txq *txq;
1155 #ifdef TCP_OFFLOAD
1156 struct sge_ofld_rxq *ofld_rxq;
1157 #endif
1158 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1159 struct sge_ofld_txq *ofld_txq;
1160 #endif
1161 #ifdef DEV_NETMAP
1162 int saved_idx, iqidx;
1163 struct sge_nm_rxq *nm_rxq;
1164 struct sge_nm_txq *nm_txq;
1165 #endif
1166 struct adapter *sc = vi->adapter;
1167 if_t ifp = vi->ifp;
1168 int maxp;
1169
1170 /* Interrupt vector to start from (when using multiple vectors) */
1171 intr_idx = vi->first_intr;
1172
1173 #ifdef DEV_NETMAP
1174 saved_idx = intr_idx;
1175 if (if_getcapabilities(ifp) & IFCAP_NETMAP) {
1176
1177 /* netmap is supported with direct interrupts only. */
1178 MPASS(!forwarding_intr_to_fwq(sc));
1179 MPASS(vi->first_intr >= 0);
1180
1181 /*
1182 * We don't have buffers to back the netmap rx queues
1183 * right now so we create the queues in a way that
1184 * doesn't set off any congestion signal in the chip.
1185 */
1186 for_each_nm_rxq(vi, i, nm_rxq) {
1187 rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i);
1188 if (rc != 0)
1189 goto done;
1190 intr_idx++;
1191 }
1192
1193 for_each_nm_txq(vi, i, nm_txq) {
1194 iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
1195 rc = alloc_nm_txq(vi, nm_txq, iqidx, i);
1196 if (rc != 0)
1197 goto done;
1198 }
1199 }
1200
1201 /* Normal rx queues and netmap rx queues share the same interrupts. */
1202 intr_idx = saved_idx;
1203 #endif
1204
1205 /*
1206 * Allocate rx queues first because a default iqid is required when
1207 * creating a tx queue.
1208 */
1209 maxp = max_rx_payload(sc, ifp, false);
1210 for_each_rxq(vi, i, rxq) {
1211 rc = alloc_rxq(vi, rxq, i, intr_idx, maxp);
1212 if (rc != 0)
1213 goto done;
1214 if (!forwarding_intr_to_fwq(sc))
1215 intr_idx++;
1216 }
1217 #ifdef DEV_NETMAP
1218 if (if_getcapabilities(ifp) & IFCAP_NETMAP)
1219 intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
1220 #endif
1221 #ifdef TCP_OFFLOAD
1222 maxp = max_rx_payload(sc, ifp, true);
1223 for_each_ofld_rxq(vi, i, ofld_rxq) {
1224 rc = alloc_ofld_rxq(vi, ofld_rxq, i, intr_idx, maxp);
1225 if (rc != 0)
1226 goto done;
1227 if (!forwarding_intr_to_fwq(sc))
1228 intr_idx++;
1229 }
1230 #endif
1231
1232 /*
1233 * Now the tx queues.
1234 */
1235 for_each_txq(vi, i, txq) {
1236 rc = alloc_txq(vi, txq, i);
1237 if (rc != 0)
1238 goto done;
1239 }
1240 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1241 for_each_ofld_txq(vi, i, ofld_txq) {
1242 rc = alloc_ofld_txq(vi, ofld_txq, i);
1243 if (rc != 0)
1244 goto done;
1245 }
1246 #endif
1247 done:
1248 if (rc)
1249 t4_teardown_vi_queues(vi);
1250
1251 return (rc);
1252 }
1253
1254 /*
1255 * Idempotent
1256 */
1257 int
1258 t4_teardown_vi_queues(struct vi_info *vi)
1259 {
1260 int i;
1261 struct sge_rxq *rxq;
1262 struct sge_txq *txq;
1263 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1264 struct sge_ofld_txq *ofld_txq;
1265 #endif
1266 #ifdef TCP_OFFLOAD
1267 struct sge_ofld_rxq *ofld_rxq;
1268 #endif
1269 #ifdef DEV_NETMAP
1270 struct sge_nm_rxq *nm_rxq;
1271 struct sge_nm_txq *nm_txq;
1272 #endif
1273
1274 #ifdef DEV_NETMAP
1275 if (if_getcapabilities(vi->ifp) & IFCAP_NETMAP) {
1276 for_each_nm_txq(vi, i, nm_txq) {
1277 free_nm_txq(vi, nm_txq);
1278 }
1279
1280 for_each_nm_rxq(vi, i, nm_rxq) {
1281 free_nm_rxq(vi, nm_rxq);
1282 }
1283 }
1284 #endif
1285
1286 /*
1287 * Take down all the tx queues first, as they reference the rx queues
1288 * (for egress updates, etc.).
1289 */
1290
1291 for_each_txq(vi, i, txq) {
1292 free_txq(vi, txq);
1293 }
1294 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
1295 for_each_ofld_txq(vi, i, ofld_txq) {
1296 free_ofld_txq(vi, ofld_txq);
1297 }
1298 #endif
1299
1300 /*
1301 * Then take down the rx queues.
1302 */
1303
1304 for_each_rxq(vi, i, rxq) {
1305 free_rxq(vi, rxq);
1306 }
1307 #ifdef TCP_OFFLOAD
1308 for_each_ofld_rxq(vi, i, ofld_rxq) {
1309 free_ofld_rxq(vi, ofld_rxq);
1310 }
1311 #endif
1312
1313 return (0);
1314 }
1315
1316 /*
1317 * Interrupt handler when the driver is using only 1 interrupt. This is a very
1318 * unusual scenario.
1319 *
1320 * a) Deals with errors, if any.
1321 * b) Services firmware event queue, which is taking interrupts for all other
1322 * queues.
1323 */
1324 void
1325 t4_intr_all(void *arg)
1326 {
1327 struct adapter *sc = arg;
1328 struct sge_iq *fwq = &sc->sge.fwq;
1329
1330 MPASS(sc->intr_count == 1);
1331
1332 if (sc->intr_type == INTR_INTX)
1333 t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1334
1335 t4_intr_err(arg);
1336 t4_intr_evt(fwq);
1337 }
1338
1339 /*
1340 * Interrupt handler for errors (installed directly when multiple interrupts are
1341 * being used, or called by t4_intr_all).
1342 */
1343 void
1344 t4_intr_err(void *arg)
1345 {
1346 struct adapter *sc = arg;
1347 uint32_t v;
1348
1349 if (atomic_load_int(&sc->error_flags) & ADAP_FATAL_ERR)
1350 return;
1351
1352 v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
1353 if (v & F_PFSW) {
1354 sc->swintr++;
1355 t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v);
1356 }
1357
1358 if (t4_slow_intr_handler(sc, sc->intr_flags))
1359 t4_fatal_err(sc, false);
1360 }
1361
1362 /*
1363 * Interrupt handler for iq-only queues. The firmware event queue is the only
1364 * such queue right now.
1365 */
1366 void
1367 t4_intr_evt(void *arg)
1368 {
1369 struct sge_iq *iq = arg;
1370
1371 if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1372 service_iq(iq, 0);
1373 (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1374 }
1375 }
1376
1377 /*
1378 * Interrupt handler for iq+fl queues.
1379 */
1380 void
1381 t4_intr(void *arg)
1382 {
1383 struct sge_iq *iq = arg;
1384
1385 if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1386 service_iq_fl(iq, 0);
1387 (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1388 }
1389 }
1390
1391 #ifdef DEV_NETMAP
1392 /*
1393 * Interrupt handler for netmap rx queues.
1394 */
1395 void
1396 t4_nm_intr(void *arg)
1397 {
1398 struct sge_nm_rxq *nm_rxq = arg;
1399
1400 if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) {
1401 service_nm_rxq(nm_rxq);
1402 (void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON);
1403 }
1404 }
1405
1406 /*
1407 * Interrupt handler for vectors shared between NIC and netmap rx queues.
1408 */
1409 void
1410 t4_vi_intr(void *arg)
1411 {
1412 struct irq *irq = arg;
1413
1414 MPASS(irq->nm_rxq != NULL);
1415 t4_nm_intr(irq->nm_rxq);
1416
1417 MPASS(irq->rxq != NULL);
1418 t4_intr(irq->rxq);
1419 }
1420 #endif
1421
1422 /*
1423 * Deals with interrupts on an iq-only (no freelist) queue.
1424 */
1425 static int
1426 service_iq(struct sge_iq *iq, int budget)
1427 {
1428 struct sge_iq *q;
1429 struct adapter *sc = iq->adapter;
1430 struct iq_desc *d = &iq->desc[iq->cidx];
1431 int ndescs = 0, limit;
1432 int rsp_type;
1433 uint32_t lq;
1434 STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1435
1436 KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1437 KASSERT((iq->flags & IQ_HAS_FL) == 0,
1438 ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
1439 iq->flags));
1440 MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
1441 MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
1442
1443 limit = budget ? budget : iq->qsize / 16;
1444
1445 /*
1446 * We always come back and check the descriptor ring for new indirect
1447 * interrupts and other responses after running a single handler.
1448 */
1449 for (;;) {
1450 while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1451
1452 rmb();
1453
1454 rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1455 lq = be32toh(d->rsp.pldbuflen_qid);
1456
1457 switch (rsp_type) {
1458 case X_RSPD_TYPE_FLBUF:
1459 panic("%s: data for an iq (%p) with no freelist",
1460 __func__, iq);
1461
1462 /* NOTREACHED */
1463
1464 case X_RSPD_TYPE_CPL:
1465 KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1466 ("%s: bad opcode %02x.", __func__,
1467 d->rss.opcode));
1468 t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
1469 break;
1470
1471 case X_RSPD_TYPE_INTR:
1472 /*
1473 * There are 1K interrupt-capable queues (qids 0
1474 * through 1023). A response type indicating a
1475 * forwarded interrupt with a qid >= 1K is an
1476 * iWARP async notification.
1477 */
1478 if (__predict_true(lq >= 1024)) {
1479 t4_an_handler(iq, &d->rsp);
1480 break;
1481 }
1482
1483 q = sc->sge.iqmap[lq - sc->sge.iq_start -
1484 sc->sge.iq_base];
1485 if (atomic_cmpset_int(&q->state, IQS_IDLE,
1486 IQS_BUSY)) {
1487 if (service_iq_fl(q, q->qsize / 16) == 0) {
1488 (void) atomic_cmpset_int(&q->state,
1489 IQS_BUSY, IQS_IDLE);
1490 } else {
1491 STAILQ_INSERT_TAIL(&iql, q,
1492 link);
1493 }
1494 }
1495 break;
1496
1497 default:
1498 KASSERT(0,
1499 ("%s: illegal response type %d on iq %p",
1500 __func__, rsp_type, iq));
1501 log(LOG_ERR,
1502 "%s: illegal response type %d on iq %p",
1503 device_get_nameunit(sc->dev), rsp_type, iq);
1504 break;
1505 }
1506
1507 d++;
1508 if (__predict_false(++iq->cidx == iq->sidx)) {
1509 iq->cidx = 0;
1510 iq->gen ^= F_RSPD_GEN;
1511 d = &iq->desc[0];
1512 }
1513 if (__predict_false(++ndescs == limit)) {
1514 t4_write_reg(sc, sc->sge_gts_reg,
1515 V_CIDXINC(ndescs) |
1516 V_INGRESSQID(iq->cntxt_id) |
1517 V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1518 ndescs = 0;
1519
1520 if (budget) {
1521 return (EINPROGRESS);
1522 }
1523 }
1524 }
1525
1526 if (STAILQ_EMPTY(&iql))
1527 break;
1528
1529 /*
1530 * Process the head only, and send it to the back of the list if
1531 * it's still not done.
1532 */
1533 q = STAILQ_FIRST(&iql);
1534 STAILQ_REMOVE_HEAD(&iql, link);
1535 if (service_iq_fl(q, q->qsize / 8) == 0)
1536 (void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1537 else
1538 STAILQ_INSERT_TAIL(&iql, q, link);
1539 }
1540
1541 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1542 V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1543
1544 return (0);
1545 }
1546
1547 #if defined(INET) || defined(INET6)
1548 static inline int
1549 sort_before_lro(struct lro_ctrl *lro)
1550 {
1551
1552 return (lro->lro_mbuf_max != 0);
1553 }
1554 #endif
1555
1556 static inline uint64_t
1557 t4_tstmp_to_ns(struct adapter *sc, uint64_t hw_tstmp)
1558 {
1559 struct clock_sync *cur, dcur;
1560 uint64_t hw_clocks;
1561 uint64_t hw_clk_div;
1562 sbintime_t sbt_cur_to_prev, sbt;
1563 seqc_t gen;
1564
1565 for (;;) {
1566 cur = &sc->cal_info[sc->cal_current];
1567 gen = seqc_read(&cur->gen);
1568 if (gen == 0)
1569 return (0);
1570 dcur = *cur;
1571 if (seqc_consistent(&cur->gen, gen))
1572 break;
1573 }
1574
1575 /*
1576 * Our goal here is to have a result that is:
1577 *
1578 * ( (cur_time - prev_time) )
1579 * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time
1580 * ( (hw_cur - hw_prev) )
1581 *
1582 * With the constraints that we cannot use float and we
1583 * don't want to overflow the uint64_t numbers we are using.
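 *
 * The calibration snapshot (hw_prev/hw_cur, sbt_prev/sbt_cur) was read under
 * the seqc generation check above, so a concurrent update of the snapshot is
 * detected and the read is retried.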
1584 */
1585 hw_clocks = hw_tstmp - dcur.hw_prev;
1586 sbt_cur_to_prev = (dcur.sbt_cur - dcur.sbt_prev);
1587 hw_clk_div = dcur.hw_cur - dcur.hw_prev;
1588 sbt = hw_clocks * sbt_cur_to_prev / hw_clk_div + dcur.sbt_prev;
1589 return (sbttons(sbt));
1590 }
1591
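/*
 * Advance the free list's software consumer index to the next buffer.  The
 * index shared with the hardware (hw_cidx) is maintained in units of 8
 * buffers, so it is only updated on every 8th buffer.
 */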
1592 static inline void
1593 move_to_next_rxbuf(struct sge_fl *fl)
1594 {
1595
1596 fl->rx_offset = 0;
1597 if (__predict_false((++fl->cidx & 7) == 0)) {
1598 uint16_t cidx = fl->cidx >> 3;
1599
1600 if (__predict_false(cidx == fl->sidx))
1601 fl->cidx = cidx = 0;
1602 fl->hw_cidx = cidx;
1603 }
1604 }
1605
1606 /*
1607 * Deals with interrupts on an iq+fl queue.
1608 */
1609 static int
1610 service_iq_fl(struct sge_iq *iq, int budget)
1611 {
1612 struct sge_rxq *rxq = iq_to_rxq(iq);
1613 struct sge_fl *fl;
1614 struct adapter *sc = iq->adapter;
1615 struct iq_desc *d = &iq->desc[iq->cidx];
1616 int ndescs, limit;
1617 int rsp_type, starved;
1618 uint32_t lq;
1619 uint16_t fl_hw_cidx;
1620 struct mbuf *m0;
1621 #if defined(INET) || defined(INET6)
1622 const struct timeval lro_timeout = {0, sc->lro_timeout};
1623 struct lro_ctrl *lro = &rxq->lro;
1624 #endif
1625
1626 KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1627 MPASS(iq->flags & IQ_HAS_FL);
1628
1629 ndescs = 0;
1630 #if defined(INET) || defined(INET6)
1631 if (iq->flags & IQ_ADJ_CREDIT) {
1632 MPASS(sort_before_lro(lro));
1633 iq->flags &= ~IQ_ADJ_CREDIT;
1634 if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
1635 tcp_lro_flush_all(lro);
1636 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
1637 V_INGRESSQID((u32)iq->cntxt_id) |
1638 V_SEINTARM(iq->intr_params));
1639 return (0);
1640 }
1641 ndescs = 1;
1642 }
1643 #else
1644 MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
1645 #endif
1646
1647 limit = budget ? budget : iq->qsize / 16;
1648 fl = &rxq->fl;
1649 fl_hw_cidx = fl->hw_cidx; /* stable snapshot */
1650 while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1651
1652 rmb();
1653
1654 m0 = NULL;
1655 rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1656 lq = be32toh(d->rsp.pldbuflen_qid);
1657
1658 switch (rsp_type) {
1659 case X_RSPD_TYPE_FLBUF:
1660 if (lq & F_RSPD_NEWBUF) {
1661 if (fl->rx_offset > 0)
1662 move_to_next_rxbuf(fl);
1663 lq = G_RSPD_LEN(lq);
1664 }
1665 if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) {
1666 FL_LOCK(fl);
1667 refill_fl(sc, fl, 64);
1668 FL_UNLOCK(fl);
1669 fl_hw_cidx = fl->hw_cidx;
1670 }
1671
1672 if (d->rss.opcode == CPL_RX_PKT) {
1673 if (__predict_true(eth_rx(sc, rxq, d, lq) == 0))
1674 break;
1675 goto out;
1676 }
1677 m0 = get_fl_payload(sc, fl, lq);
1678 if (__predict_false(m0 == NULL))
1679 goto out;
1680
1681 /* fall through */
1682
1683 case X_RSPD_TYPE_CPL:
1684 KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1685 ("%s: bad opcode %02x.", __func__, d->rss.opcode));
1686 t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1687 break;
1688
1689 case X_RSPD_TYPE_INTR:
1690
1691 /*
1692 * There are 1K interrupt-capable queues (qids 0
1693 * through 1023). A response type indicating a
1694 * forwarded interrupt with a qid >= 1K is an
1695 * iWARP async notification. That is the only
1696 * acceptable indirect interrupt on this queue.
1697 */
1698 if (__predict_false(lq < 1024)) {
1699 panic("%s: indirect interrupt on iq_fl %p "
1700 "with qid %u", __func__, iq, lq);
1701 }
1702
1703 t4_an_handler(iq, &d->rsp);
1704 break;
1705
1706 default:
1707 KASSERT(0, ("%s: illegal response type %d on iq %p",
1708 __func__, rsp_type, iq));
1709 log(LOG_ERR, "%s: illegal response type %d on iq %p",
1710 device_get_nameunit(sc->dev), rsp_type, iq);
1711 break;
1712 }
1713
1714 d++;
1715 if (__predict_false(++iq->cidx == iq->sidx)) {
1716 iq->cidx = 0;
1717 iq->gen ^= F_RSPD_GEN;
1718 d = &iq->desc[0];
1719 }
1720 if (__predict_false(++ndescs == limit)) {
1721 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1722 V_INGRESSQID(iq->cntxt_id) |
1723 V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1724
1725 #if defined(INET) || defined(INET6)
1726 if (iq->flags & IQ_LRO_ENABLED &&
1727 !sort_before_lro(lro) &&
1728 sc->lro_timeout != 0) {
1729 tcp_lro_flush_inactive(lro, &lro_timeout);
1730 }
1731 #endif
1732 if (budget)
1733 return (EINPROGRESS);
1734 ndescs = 0;
1735 }
1736 }
1737 out:
1738 #if defined(INET) || defined(INET6)
1739 if (iq->flags & IQ_LRO_ENABLED) {
1740 if (ndescs > 0 && lro->lro_mbuf_count > 8) {
1741 MPASS(sort_before_lro(lro));
1742 /* hold back one credit and don't flush LRO state */
1743 iq->flags |= IQ_ADJ_CREDIT;
1744 ndescs--;
1745 } else {
1746 tcp_lro_flush_all(lro);
1747 }
1748 }
1749 #endif
1750
1751 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
1752 V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1753
1754 FL_LOCK(fl);
1755 starved = refill_fl(sc, fl, 64);
1756 FL_UNLOCK(fl);
1757 if (__predict_false(starved != 0))
1758 add_fl_to_sfl(sc, fl);
1759
1760 return (0);
1761 }
1762
1763 static inline struct cluster_metadata *
1764 cl_metadata(struct fl_sdesc *sd)
1765 {
1766
1767 return ((void *)(sd->cl + sd->moff));
1768 }
1769
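/*
 * m_ext free routine for clusters handed to the stack with m_extaddref.
 * Returns the cluster to its zone and accounts for the release.
 */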
1770 static void
1771 rxb_free(struct mbuf *m)
1772 {
1773 struct cluster_metadata *clm = m->m_ext.ext_arg1;
1774
1775 uma_zfree(clm->zone, clm->cl);
1776 counter_u64_add(extfree_rels, 1);
1777 }
1778
1779 /*
1780 * The mbuf returned comes from zone_mbuf and carries the payload in one of
1781 * these ways:
1782 * a) complete frame inside the mbuf
1783 * b) m_cljset (for clusters without metadata)
1784 * c) m_extaddref (cluster with metadata)
1785 */
1786 static struct mbuf *
1787 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1788 int remaining)
1789 {
1790 struct mbuf *m;
1791 struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1792 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
1793 struct cluster_metadata *clm;
1794 int len, blen;
1795 caddr_t payload;
1796
1797 if (fl->flags & FL_BUF_PACKING) {
1798 u_int l, pad;
1799
1800 blen = rxb->size2 - fl->rx_offset; /* max possible in this buf */
1801 len = min(remaining, blen);
1802 payload = sd->cl + fl->rx_offset;
1803
1804 l = fr_offset + len;
1805 pad = roundup2(l, fl->buf_boundary) - l;
1806 if (fl->rx_offset + len + pad < rxb->size2)
1807 blen = len + pad;
1808 MPASS(fl->rx_offset + blen <= rxb->size2);
1809 } else {
1810 MPASS(fl->rx_offset == 0); /* not packing */
1811 blen = rxb->size1;
1812 len = min(remaining, blen);
1813 payload = sd->cl;
1814 }
1815
1816 if (fr_offset == 0) {
1817 m = m_gethdr(M_NOWAIT, MT_DATA);
1818 if (__predict_false(m == NULL))
1819 return (NULL);
1820 m->m_pkthdr.len = remaining;
1821 } else {
1822 m = m_get(M_NOWAIT, MT_DATA);
1823 if (__predict_false(m == NULL))
1824 return (NULL);
1825 }
1826 m->m_len = len;
1827 kmsan_mark(payload, len, KMSAN_STATE_INITED);
1828
1829 if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1830 /* copy data to mbuf */
1831 bcopy(payload, mtod(m, caddr_t), len);
1832 if (fl->flags & FL_BUF_PACKING) {
1833 fl->rx_offset += blen;
1834 MPASS(fl->rx_offset <= rxb->size2);
1835 if (fl->rx_offset < rxb->size2)
1836 return (m); /* without advancing the cidx */
1837 }
1838 } else if (fl->flags & FL_BUF_PACKING) {
1839 clm = cl_metadata(sd);
1840 if (sd->nmbuf++ == 0) {
1841 clm->refcount = 1;
1842 clm->zone = rxb->zone;
1843 clm->cl = sd->cl;
1844 counter_u64_add(extfree_refs, 1);
1845 }
1846 m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm,
1847 NULL);
1848
1849 fl->rx_offset += blen;
1850 MPASS(fl->rx_offset <= rxb->size2);
1851 if (fl->rx_offset < rxb->size2)
1852 return (m); /* without advancing the cidx */
1853 } else {
1854 m_cljset(m, sd->cl, rxb->type);
1855 sd->cl = NULL; /* consumed, not a recycle candidate */
1856 }
1857
1858 move_to_next_rxbuf(fl);
1859
1860 return (m);
1861 }
1862
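/*
 * Assembles the complete payload of a frame (plen bytes) into an mbuf chain,
 * pulling one segment per freelist buffer.  If an mbuf allocation fails midway
 * the partial chain is stashed in the fl and FL_BUF_RESUME is set so that the
 * next call picks up where this one left off.
 */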
1863 static struct mbuf *
1864 get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen)
1865 {
1866 struct mbuf *m0, *m, **pnext;
1867 u_int remaining;
1868
1869 if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1870 M_ASSERTPKTHDR(fl->m0);
1871 MPASS(fl->m0->m_pkthdr.len == plen);
1872 MPASS(fl->remaining < plen);
1873
1874 m0 = fl->m0;
1875 pnext = fl->pnext;
1876 remaining = fl->remaining;
1877 fl->flags &= ~FL_BUF_RESUME;
1878 goto get_segment;
1879 }
1880
1881 /*
1882 * Payload starts at rx_offset in the current hw buffer. Its length is
1883 * 'plen' and it may span multiple hw buffers.
1884 */
1885
1886 m0 = get_scatter_segment(sc, fl, 0, plen);
1887 if (m0 == NULL)
1888 return (NULL);
1889 remaining = plen - m0->m_len;
1890 pnext = &m0->m_next;
1891 while (remaining > 0) {
1892 get_segment:
1893 MPASS(fl->rx_offset == 0);
1894 m = get_scatter_segment(sc, fl, plen - remaining, remaining);
1895 if (__predict_false(m == NULL)) {
1896 fl->m0 = m0;
1897 fl->pnext = pnext;
1898 fl->remaining = remaining;
1899 fl->flags |= FL_BUF_RESUME;
1900 return (NULL);
1901 }
1902 *pnext = m;
1903 pnext = &m->m_next;
1904 remaining -= m->m_len;
1905 }
1906 *pnext = NULL;
1907
1908 M_ASSERTPKTHDR(m0);
1909 return (m0);
1910 }
1911
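/*
 * Like get_scatter_segment but discards the data instead of wrapping it in an
 * mbuf.  Returns the number of payload bytes consumed from the current
 * freelist buffer.
 */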
1912 static int
1913 skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1914 int remaining)
1915 {
1916 struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1917 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
1918 int len, blen;
1919
1920 if (fl->flags & FL_BUF_PACKING) {
1921 u_int l, pad;
1922
1923 blen = rxb->size2 - fl->rx_offset; /* max possible in this buf */
1924 len = min(remaining, blen);
1925
1926 l = fr_offset + len;
1927 pad = roundup2(l, fl->buf_boundary) - l;
1928 if (fl->rx_offset + len + pad < rxb->size2)
1929 blen = len + pad;
1930 fl->rx_offset += blen;
1931 MPASS(fl->rx_offset <= rxb->size2);
1932 if (fl->rx_offset < rxb->size2)
1933 return (len); /* without advancing the cidx */
1934 } else {
1935 MPASS(fl->rx_offset == 0); /* not packing */
1936 blen = rxb->size1;
1937 len = min(remaining, blen);
1938 }
1939 move_to_next_rxbuf(fl);
1940 return (len);
1941 }
1942
1943 static inline void
1944 skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen)
1945 {
1946 int remaining, fr_offset, len;
1947
1948 fr_offset = 0;
1949 remaining = plen;
1950 while (remaining > 0) {
1951 len = skip_scatter_segment(sc, fl, fr_offset, remaining);
1952 fr_offset += len;
1953 remaining -= len;
1954 }
1955 }
1956
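/*
 * Number of payload bytes (at most plen) available in the current freelist
 * buffer without moving on to the next one.
 */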
1957 static inline int
1958 get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen)
1959 {
1960 int len;
1961 struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1962 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
1963
1964 if (fl->flags & FL_BUF_PACKING)
1965 len = rxb->size2 - fl->rx_offset;
1966 else
1967 len = rxb->size1;
1968
1969 return (min(plen, len));
1970 }
1971
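/*
 * Uses the CPL_RX_PKT header to fill in the mbuf's checksum flags (including
 * the inner/outer flags for VXLAN encapsulated traffic) and VLAN tag.
 */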
1972 static void
1973 handle_cpl_rx_pkt(struct adapter *sc, struct sge_rxq *rxq,
1974 const struct cpl_rx_pkt *cpl, struct mbuf *m0)
1975 {
1976 if_t ifp = rxq->ifp;
1977 uint16_t err_vec, tnl_type, tnlhdr_len;
1978 static const int sw_csum_flags[2][2] = {
1979 {
1980 /* IP, inner IP */
1981 CSUM_ENCAP_VXLAN |
1982 CSUM_L3_CALC | CSUM_L3_VALID |
1983 CSUM_L4_CALC | CSUM_L4_VALID |
1984 CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
1985 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
1986
1987 /* IP, inner IP6 */
1988 CSUM_ENCAP_VXLAN |
1989 CSUM_L3_CALC | CSUM_L3_VALID |
1990 CSUM_L4_CALC | CSUM_L4_VALID |
1991 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
1992 },
1993 {
1994 /* IP6, inner IP */
1995 CSUM_ENCAP_VXLAN |
1996 CSUM_L4_CALC | CSUM_L4_VALID |
1997 CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
1998 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
1999
2000 /* IP6, inner IP6 */
2001 CSUM_ENCAP_VXLAN |
2002 CSUM_L4_CALC | CSUM_L4_VALID |
2003 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
2004 },
2005 };
2006
2007 if (sc->params.tp.rx_pkt_encap) {
2008 const uint16_t ev = be16toh(cpl->err_vec);
2009
2010 err_vec = G_T6_COMPR_RXERR_VEC(ev);
2011 tnl_type = G_T6_RX_TNL_TYPE(ev);
2012 tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
2013 } else {
2014 err_vec = be16toh(cpl->err_vec);
2015 tnl_type = 0;
2016 tnlhdr_len = 0;
2017 }
2018 if (cpl->csum_calc && err_vec == 0) {
2019 int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
2020
2021 /* checksum(s) calculated and found to be correct. */
2022
2023 MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
2024 (cpl->l2info & htobe32(F_RXF_IP6)));
2025 m0->m_pkthdr.csum_data = be16toh(cpl->csum);
2026 if (tnl_type == 0) {
2027 if (!ipv6 && if_getcapenable(ifp) & IFCAP_RXCSUM) {
2028 m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
2029 CSUM_L3_VALID | CSUM_L4_CALC |
2030 CSUM_L4_VALID;
2031 } else if (ipv6 && if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) {
2032 m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
2033 CSUM_L4_VALID;
2034 }
2035 rxq->rxcsum++;
2036 } else {
2037 MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
2038
2039 M_HASHTYPE_SETINNER(m0);
2040 if (__predict_false(cpl->ip_frag)) {
2041 /*
2042 * csum_data is for the inner frame (which is an
2043 * IP fragment) and is not 0xffff. There is no
2044 * way to pass the inner csum_data to the stack.
2045 * We don't want the stack to use the inner
2046 * csum_data to validate the outer frame or it
2047 * will get rejected. So we fix csum_data here
2048 * and let sw do the checksum of inner IP
2049 * fragments.
2050 *
2051 * XXX: Need 32b for csum_data2 in an rx mbuf.
2052 * Maybe stuff it into rcv_tstmp?
2053 */
2054 m0->m_pkthdr.csum_data = 0xffff;
2055 if (ipv6) {
2056 m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
2057 CSUM_L4_VALID;
2058 } else {
2059 m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
2060 CSUM_L3_VALID | CSUM_L4_CALC |
2061 CSUM_L4_VALID;
2062 }
2063 } else {
2064 int outer_ipv6;
2065
2066 MPASS(m0->m_pkthdr.csum_data == 0xffff);
2067
2068 outer_ipv6 = tnlhdr_len >=
2069 sizeof(struct ether_header) +
2070 sizeof(struct ip6_hdr);
2071 m0->m_pkthdr.csum_flags =
2072 sw_csum_flags[outer_ipv6][ipv6];
2073 }
2074 rxq->vxlan_rxcsum++;
2075 }
2076 }
2077
2078 if (cpl->vlan_ex) {
2079 if (sc->flags & IS_VF && sc->vlan_id) {
2080 /*
2081 * HW is not set up correctly if the extracted vlan_id does
2082 * not match the VF's setting.
2083 */
2084 MPASS(be16toh(cpl->vlan) == sc->vlan_id);
2085 } else {
2086 m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
2087 m0->m_flags |= M_VLANTAG;
2088 rxq->vlan_extraction++;
2089 }
2090 }
2091 }
2092
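/*
 * Deals with a single CPL_RX_PKT: runs the pfil hook on the frame in place
 * where possible, builds the mbuf chain from the freelist, fills in the RSS
 * hash, flow id, and rx timestamp, and then hands the packet to LRO or
 * directly to the ifnet.
 */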
2093 static int
2094 eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d,
2095 u_int plen)
2096 {
2097 struct mbuf *m0;
2098 if_t ifp = rxq->ifp;
2099 struct sge_fl *fl = &rxq->fl;
2100 struct vi_info *vi = if_getsoftc(ifp);
2101 #if defined(INET) || defined(INET6)
2102 struct lro_ctrl *lro = &rxq->lro;
2103 #endif
2104 int rc;
2105 const uint8_t fl_pktshift = sc->params.sge.fl_pktshift;
2106 static const uint8_t sw_hashtype[4][2] = {
2107 {M_HASHTYPE_NONE, M_HASHTYPE_NONE},
2108 {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
2109 {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
2110 {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
2111 };
2112
2113 MPASS(plen > fl_pktshift);
2114 if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
2115 __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
2116 struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
2117 caddr_t frame;
2118 const int slen = get_segment_len(sc, fl, plen) - fl_pktshift;
2119
2120 frame = sd->cl + fl->rx_offset + fl_pktshift;
2121 CURVNET_SET_QUIET(if_getvnet(ifp));
2122 rc = pfil_mem_in(vi->pfil, frame, slen, ifp, &m0);
2123 CURVNET_RESTORE();
2124 if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) {
2125 skip_fl_payload(sc, fl, plen);
2126 return (0);
2127 }
2128 if (rc == PFIL_REALLOCED) {
2129 skip_fl_payload(sc, fl, plen);
2130 goto have_mbuf;
2131 }
2132 }
2133
2134 m0 = get_fl_payload(sc, fl, plen);
2135 if (__predict_false(m0 == NULL))
2136 return (ENOMEM);
2137 m0->m_pkthdr.len -= fl_pktshift;
2138 m0->m_len -= fl_pktshift;
2139 m0->m_data += fl_pktshift;
2140
2141 have_mbuf:
2142 m0->m_pkthdr.rcvif = ifp;
2143 M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]);
2144 m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
2145 #ifdef NUMA
2146 m0->m_pkthdr.numa_domain = if_getnumadomain(ifp);
2147 #endif
2148 if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
2149 /*
2150 * Fill in rcv_tstmp and set M_TSTMP if we get a non-zero value back
2151 * from t4_tstmp_to_ns(). The descriptor has a 60b timestamp.
2152 */
2153 m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc,
2154 be64toh(d->rsp.u.last_flit) & 0x0fffffffffffffffULL);
2155 if (m0->m_pkthdr.rcv_tstmp != 0)
2156 m0->m_flags |= M_TSTMP;
2157 }
2158
2159 handle_cpl_rx_pkt(sc, rxq, (const void *)(&d->rss + 1), m0);
2160
2161 #if defined(INET) || defined(INET6)
2162 if (rxq->iq.flags & IQ_LRO_ENABLED &&
2163 (m0->m_pkthdr.rsstype & M_HASHTYPE_INNER) == 0 &&
2164 (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
2165 M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
2166 if (sort_before_lro(lro)) {
2167 tcp_lro_queue_mbuf(lro, m0);
2168 return (0); /* queued for sort, then LRO */
2169 }
2170 if (tcp_lro_rx(lro, m0, 0) == 0)
2171 return (0); /* queued for LRO */
2172 }
2173 #endif
2174 if_input(ifp, m0);
2175
2176 return (0);
2177 }
2178
2179 /*
2180 * Must drain the wrq or make sure that someone else will.
2181 */
2182 static void
2183 wrq_tx_drain(void *arg, int n)
2184 {
2185 struct sge_wrq *wrq = arg;
2186 struct sge_eq *eq = &wrq->eq;
2187
2188 EQ_LOCK(eq);
2189 if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2190 drain_wrq_wr_list(wrq->adapter, wrq);
2191 EQ_UNLOCK(eq);
2192 }
2193
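/*
 * Copies as many queued work requests as will fit into the descriptor ring
 * and rings the doorbell.  Called with the EQ lock held and at least one WR
 * on wr_list.
 */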
2194 static void
2195 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
2196 {
2197 struct sge_eq *eq = &wrq->eq;
2198 u_int available, dbdiff; /* # of hardware descriptors */
2199 u_int n;
2200 struct wrqe *wr;
2201 struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */
2202
2203 EQ_LOCK_ASSERT_OWNED(eq);
2204 MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
2205 wr = STAILQ_FIRST(&wrq->wr_list);
2206 MPASS(wr != NULL); /* Must be called with something useful to do */
2207 MPASS(eq->pidx == eq->dbidx);
2208 dbdiff = 0;
2209
2210 do {
2211 eq->cidx = read_hw_cidx(eq);
2212 if (eq->pidx == eq->cidx)
2213 available = eq->sidx - 1;
2214 else
2215 available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2216
2217 MPASS(wr->wrq == wrq);
2218 n = howmany(wr->wr_len, EQ_ESIZE);
2219 if (available < n)
2220 break;
2221
2222 dst = (void *)&eq->desc[eq->pidx];
2223 if (__predict_true(eq->sidx - eq->pidx > n)) {
2224 /* Won't wrap, won't end exactly at the status page. */
2225 bcopy(&wr->wr[0], dst, wr->wr_len);
2226 eq->pidx += n;
2227 } else {
2228 int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
2229
2230 bcopy(&wr->wr[0], dst, first_portion);
2231 if (wr->wr_len > first_portion) {
2232 bcopy(&wr->wr[first_portion], &eq->desc[0],
2233 wr->wr_len - first_portion);
2234 }
2235 eq->pidx = n - (eq->sidx - eq->pidx);
2236 }
2237 wrq->tx_wrs_copied++;
2238
2239 if (available < eq->sidx / 4 &&
2240 atomic_cmpset_int(&eq->equiq, 0, 1)) {
2241 /*
2242 * XXX: This is not 100% reliable with some
2243 * types of WRs. But this is a very unusual
2244 * situation for an ofld/ctrl queue anyway.
2245 */
2246 dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2247 F_FW_WR_EQUEQ);
2248 }
2249
2250 dbdiff += n;
2251 if (dbdiff >= 16) {
2252 ring_eq_db(sc, eq, dbdiff);
2253 dbdiff = 0;
2254 }
2255
2256 STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
2257 free_wrqe(wr);
2258 MPASS(wrq->nwr_pending > 0);
2259 wrq->nwr_pending--;
2260 MPASS(wrq->ndesc_needed >= n);
2261 wrq->ndesc_needed -= n;
2262 } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
2263
2264 if (dbdiff)
2265 ring_eq_db(sc, eq, dbdiff);
2266 }
2267
2268 /*
2269 * Doesn't fail. Holds on to work requests it can't send right away.
2270 */
2271 void
2272 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
2273 {
2274 #ifdef INVARIANTS
2275 struct sge_eq *eq = &wrq->eq;
2276 #endif
2277
2278 EQ_LOCK_ASSERT_OWNED(eq);
2279 MPASS(wr != NULL);
2280 MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
2281 MPASS((wr->wr_len & 0x7) == 0);
2282
2283 STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
2284 wrq->nwr_pending++;
2285 wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
2286
2287 if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
2288 return; /* commit_wrq_wr will drain wr_list as well. */
2289
2290 drain_wrq_wr_list(sc, wrq);
2291
2292 /* Doorbell must have caught up to the pidx. */
2293 MPASS(eq->pidx == eq->dbidx);
2294 }
2295
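/*
 * Recalculates the preferred rx buffer zone for every freelist of the VI,
 * typically after an MTU change.
 */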
2296 void
2297 t4_update_fl_bufsize(if_t ifp)
2298 {
2299 struct vi_info *vi = if_getsoftc(ifp);
2300 struct adapter *sc = vi->adapter;
2301 struct sge_rxq *rxq;
2302 #ifdef TCP_OFFLOAD
2303 struct sge_ofld_rxq *ofld_rxq;
2304 #endif
2305 struct sge_fl *fl;
2306 int i, maxp;
2307
2308 maxp = max_rx_payload(sc, ifp, false);
2309 for_each_rxq(vi, i, rxq) {
2310 fl = &rxq->fl;
2311
2312 FL_LOCK(fl);
2313 fl->zidx = find_refill_source(sc, maxp,
2314 fl->flags & FL_BUF_PACKING);
2315 FL_UNLOCK(fl);
2316 }
2317 #ifdef TCP_OFFLOAD
2318 maxp = max_rx_payload(sc, ifp, true);
2319 for_each_ofld_rxq(vi, i, ofld_rxq) {
2320 fl = &ofld_rxq->fl;
2321
2322 FL_LOCK(fl);
2323 fl->zidx = find_refill_source(sc, maxp,
2324 fl->flags & FL_BUF_PACKING);
2325 FL_UNLOCK(fl);
2326 }
2327 #endif
2328 }
2329
2330 #ifdef RATELIMIT
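/*
 * Ethofld (rate limited tx) state is stashed in spare bytes of the mbuf
 * packet header: PH_loc.eight[1] holds nsegs, [2] holds len16, and [3] holds
 * the combined tsclk/tsoff value.
 */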
2331 static inline int
2332 mbuf_eo_nsegs(struct mbuf *m)
2333 {
2334
2335 M_ASSERTPKTHDR(m);
2336 return (m->m_pkthdr.PH_loc.eight[1]);
2337 }
2338
2339 #if defined(INET) || defined(INET6)
2340 static inline void
2341 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
2342 {
2343
2344 M_ASSERTPKTHDR(m);
2345 m->m_pkthdr.PH_loc.eight[1] = nsegs;
2346 }
2347 #endif
2348
2349 static inline int
2350 mbuf_eo_len16(struct mbuf *m)
2351 {
2352 int n;
2353
2354 M_ASSERTPKTHDR(m);
2355 n = m->m_pkthdr.PH_loc.eight[2];
2356 MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
2357
2358 return (n);
2359 }
2360
2361 #if defined(INET) || defined(INET6)
2362 static inline void
2363 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
2364 {
2365
2366 M_ASSERTPKTHDR(m);
2367 m->m_pkthdr.PH_loc.eight[2] = len16;
2368 }
2369 #endif
2370
2371 static inline int
2372 mbuf_eo_tsclk_tsoff(struct mbuf *m)
2373 {
2374
2375 M_ASSERTPKTHDR(m);
2376 return (m->m_pkthdr.PH_loc.eight[3]);
2377 }
2378
2379 #if defined(INET) || defined(INET6)
2380 static inline void
2381 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
2382 {
2383
2384 M_ASSERTPKTHDR(m);
2385 m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
2386 }
2387 #endif
2388
2389 static inline int
2390 needs_eo(struct m_snd_tag *mst)
2391 {
2392
2393 return (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_RATE_LIMIT);
2394 }
2395 #endif
2396
2397 /*
2398 * Try to allocate an mbuf to hold a raw work request.  A single mbuf
2399 * (never a chain) is used so that the work request can be constructed
2400 * in one contiguous buffer.
2401 */
2402 struct mbuf *
2403 alloc_wr_mbuf(int len, int how)
2404 {
2405 struct mbuf *m;
2406
2407 if (len <= MHLEN)
2408 m = m_gethdr(how, MT_DATA);
2409 else if (len <= MCLBYTES)
2410 m = m_getcl(how, MT_DATA, M_PKTHDR);
2411 else
2412 m = NULL;
2413 if (m == NULL)
2414 return (NULL);
2415 m->m_pkthdr.len = len;
2416 m->m_len = len;
2417 set_mbuf_cflags(m, MC_RAW_WR);
2418 set_mbuf_len16(m, howmany(len, 16));
2419 return (m);
2420 }
2421
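/*
 * The needs_* helpers below inspect the mbuf's offload metadata (csum_flags,
 * M_VLANTAG) to decide which checksum, TSO, and VLAN offloads the tx work
 * request must ask the hardware for.
 */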
2422 static inline bool
2423 needs_hwcsum(struct mbuf *m)
2424 {
2425 const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
2426 CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
2427 CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
2428 CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
2429 CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
2430
2431 M_ASSERTPKTHDR(m);
2432
2433 return (m->m_pkthdr.csum_flags & csum_flags);
2434 }
2435
2436 static inline bool
2437 needs_tso(struct mbuf *m)
2438 {
2439 const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
2440 CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
2441
2442 M_ASSERTPKTHDR(m);
2443
2444 return (m->m_pkthdr.csum_flags & csum_flags);
2445 }
2446
2447 static inline bool
2448 needs_vxlan_csum(struct mbuf *m)
2449 {
2450
2451 M_ASSERTPKTHDR(m);
2452
2453 return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN);
2454 }
2455
2456 static inline bool
2457 needs_vxlan_tso(struct mbuf *m)
2458 {
2459 const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
2460 CSUM_INNER_IP6_TSO;
2461
2462 M_ASSERTPKTHDR(m);
2463
2464 return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
2465 (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);
2466 }
2467
2468 #if defined(INET) || defined(INET6)
2469 static inline bool
2470 needs_inner_tcp_csum(struct mbuf *m)
2471 {
2472 const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
2473
2474 M_ASSERTPKTHDR(m);
2475
2476 return (m->m_pkthdr.csum_flags & csum_flags);
2477 }
2478 #endif
2479
2480 static inline bool
2481 needs_l3_csum(struct mbuf *m)
2482 {
2483 const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
2484 CSUM_INNER_IP_TSO;
2485
2486 M_ASSERTPKTHDR(m);
2487
2488 return (m->m_pkthdr.csum_flags & csum_flags);
2489 }
2490
2491 static inline bool
2492 needs_outer_tcp_csum(struct mbuf *m)
2493 {
2494 const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
2495 CSUM_IP6_TSO;
2496
2497 M_ASSERTPKTHDR(m);
2498
2499 return (m->m_pkthdr.csum_flags & csum_flags);
2500 }
2501
2502 #ifdef RATELIMIT
2503 static inline bool
2504 needs_outer_l4_csum(struct mbuf *m)
2505 {
2506 const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
2507 CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
2508
2509 M_ASSERTPKTHDR(m);
2510
2511 return (m->m_pkthdr.csum_flags & csum_flags);
2512 }
2513
2514 static inline bool
2515 needs_outer_udp_csum(struct mbuf *m)
2516 {
2517 const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;
2518
2519 M_ASSERTPKTHDR(m);
2520
2521 return (m->m_pkthdr.csum_flags & csum_flags);
2522 }
2523 #endif
2524
2525 static inline bool
2526 needs_vlan_insertion(struct mbuf *m)
2527 {
2528
2529 M_ASSERTPKTHDR(m);
2530
2531 return (m->m_flags & M_VLANTAG);
2532 }
2533
2534 #if defined(INET) || defined(INET6)
2535 static void *
2536 m_advance(struct mbuf **pm, int *poffset, int len)
2537 {
2538 struct mbuf *m = *pm;
2539 int offset = *poffset;
2540 uintptr_t p = 0;
2541
2542 MPASS(len > 0);
2543
2544 for (;;) {
2545 if (offset + len < m->m_len) {
2546 offset += len;
2547 p = mtod(m, uintptr_t) + offset;
2548 break;
2549 }
2550 len -= m->m_len - offset;
2551 m = m->m_next;
2552 offset = 0;
2553 MPASS(m != NULL);
2554 }
2555 *poffset = offset;
2556 *pm = m;
2557 return ((void *)p);
2558 }
2559 #endif
2560
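/*
 * Counts the physical segments contributed by an unmapped (M_EXTPG) mbuf,
 * skipping the first 'skip' bytes.  *nextaddr carries the expected next
 * physical address so that ranges contiguous with the previous segment are
 * not counted twice.
 */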
2561 static inline int
2562 count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
2563 {
2564 vm_paddr_t paddr;
2565 int i, len, off, pglen, pgoff, seglen, segoff;
2566 int nsegs = 0;
2567
2568 M_ASSERTEXTPG(m);
2569 off = mtod(m, vm_offset_t);
2570 len = m->m_len;
2571 off += skip;
2572 len -= skip;
2573
2574 if (m->m_epg_hdrlen != 0) {
2575 if (off >= m->m_epg_hdrlen) {
2576 off -= m->m_epg_hdrlen;
2577 } else {
2578 seglen = m->m_epg_hdrlen - off;
2579 segoff = off;
2580 seglen = min(seglen, len);
2581 off = 0;
2582 len -= seglen;
2583 paddr = pmap_kextract(
2584 (vm_offset_t)&m->m_epg_hdr[segoff]);
2585 if (*nextaddr != paddr)
2586 nsegs++;
2587 *nextaddr = paddr + seglen;
2588 }
2589 }
2590 pgoff = m->m_epg_1st_off;
2591 for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
2592 pglen = m_epg_pagelen(m, i, pgoff);
2593 if (off >= pglen) {
2594 off -= pglen;
2595 pgoff = 0;
2596 continue;
2597 }
2598 seglen = pglen - off;
2599 segoff = pgoff + off;
2600 off = 0;
2601 seglen = min(seglen, len);
2602 len -= seglen;
2603 paddr = m->m_epg_pa[i] + segoff;
2604 if (*nextaddr != paddr)
2605 nsegs++;
2606 *nextaddr = paddr + seglen;
2607 pgoff = 0;
2608 }
2609 if (len != 0) {
2610 seglen = min(len, m->m_epg_trllen - off);
2611 len -= seglen;
2612 paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]);
2613 if (*nextaddr != paddr)
2614 nsegs++;
2615 *nextaddr = paddr + seglen;
2616 }
2617
2618 return (nsegs);
2619 }
2620
2621
2622 /*
2623 * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
2624 * must have at least one mbuf that's not empty. It is possible for this
2625 * routine to return 0 if skip accounts for all the contents of the mbuf chain.
2626 */
2627 static inline int
2628 count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
2629 {
2630 vm_paddr_t nextaddr, paddr;
2631 vm_offset_t va;
2632 int len, nsegs;
2633
2634 M_ASSERTPKTHDR(m);
2635 MPASS(m->m_pkthdr.len > 0);
2636 MPASS(m->m_pkthdr.len >= skip);
2637
2638 nsegs = 0;
2639 nextaddr = 0;
2640 for (; m; m = m->m_next) {
2641 len = m->m_len;
2642 if (__predict_false(len == 0))
2643 continue;
2644 if (skip >= len) {
2645 skip -= len;
2646 continue;
2647 }
2648 if ((m->m_flags & M_EXTPG) != 0) {
2649 *cflags |= MC_NOMAP;
2650 nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
2651 skip = 0;
2652 continue;
2653 }
2654 va = mtod(m, vm_offset_t) + skip;
2655 len -= skip;
2656 skip = 0;
2657 paddr = pmap_kextract(va);
2658 nsegs += sglist_count((void *)(uintptr_t)va, len);
2659 if (paddr == nextaddr)
2660 nsegs--;
2661 nextaddr = pmap_kextract(va + len - 1) + 1;
2662 }
2663
2664 return (nsegs);
2665 }
2666
2667 /*
2668 * The maximum number of segments that can fit in a WR.
2669 */
2670 static int
2671 max_nsegs_allowed(struct mbuf *m, bool vm_wr)
2672 {
2673
2674 if (vm_wr) {
2675 if (needs_tso(m))
2676 return (TX_SGL_SEGS_VM_TSO);
2677 return (TX_SGL_SEGS_VM);
2678 }
2679
2680 if (needs_tso(m)) {
2681 if (needs_vxlan_tso(m))
2682 return (TX_SGL_SEGS_VXLAN_TSO);
2683 else
2684 return (TX_SGL_SEGS_TSO);
2685 }
2686
2687 return (TX_SGL_SEGS);
2688 }
2689
2690 static struct timeval txerr_ratecheck = {0};
2691 static const struct timeval txerr_interval = {3, 0};
2692
2693 /*
2694 * Analyze the mbuf to determine its tx needs. The mbuf passed in may change:
2695 * a) caller can assume it's been freed if this function returns with an error.
2696 * b) it may get defragged up if the gather list is too long for the hardware.
2697 */
2698 int
2699 parse_pkt(struct mbuf **mp, bool vm_wr)
2700 {
2701 struct mbuf *m0 = *mp, *m;
2702 int rc, nsegs, defragged = 0;
2703 struct ether_header *eh;
2704 #ifdef INET
2705 void *l3hdr;
2706 #endif
2707 #if defined(INET) || defined(INET6)
2708 int offset;
2709 struct tcphdr *tcp;
2710 #endif
2711 #if defined(KERN_TLS) || defined(RATELIMIT)
2712 struct m_snd_tag *mst;
2713 #endif
2714 uint16_t eh_type;
2715 uint8_t cflags;
2716
2717 cflags = 0;
2718 M_ASSERTPKTHDR(m0);
2719 if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2720 rc = EINVAL;
2721 fail:
2722 m_freem(m0);
2723 *mp = NULL;
2724 return (rc);
2725 }
2726 restart:
2727 /*
2728 * First count the number of gather list segments in the payload.
2729 * Defrag the mbuf if nsegs exceeds the hardware limit.
2730 */
2731 M_ASSERTPKTHDR(m0);
2732 MPASS(m0->m_pkthdr.len > 0);
2733 nsegs = count_mbuf_nsegs(m0, 0, &cflags);
2734 #if defined(KERN_TLS) || defined(RATELIMIT)
2735 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG)
2736 mst = m0->m_pkthdr.snd_tag;
2737 else
2738 mst = NULL;
2739 #endif
2740 #ifdef KERN_TLS
2741 if (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_TLS) {
2742 struct vi_info *vi = if_getsoftc(mst->ifp);
2743
2744 cflags |= MC_TLS;
2745 set_mbuf_cflags(m0, cflags);
2746 if (is_t6(vi->pi->adapter))
2747 rc = t6_ktls_parse_pkt(m0);
2748 else
2749 rc = t7_ktls_parse_pkt(m0);
2750 if (rc != 0)
2751 goto fail;
2752 return (EINPROGRESS);
2753 }
2754 #endif
2755 if (nsegs > max_nsegs_allowed(m0, vm_wr)) {
2756 if (defragged++ > 0) {
2757 rc = EFBIG;
2758 goto fail;
2759 }
2760 counter_u64_add(defrags, 1);
2761 if ((m = m_defrag(m0, M_NOWAIT)) == NULL) {
2762 rc = ENOMEM;
2763 goto fail;
2764 }
2765 *mp = m0 = m; /* update caller's copy after defrag */
2766 goto restart;
2767 }
2768
2769 if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
2770 !(cflags & MC_NOMAP))) {
2771 counter_u64_add(pullups, 1);
2772 m0 = m_pullup(m0, m0->m_pkthdr.len);
2773 if (m0 == NULL) {
2774 /* Should have left well enough alone. */
2775 rc = EFBIG;
2776 goto fail;
2777 }
2778 *mp = m0; /* update caller's copy after pullup */
2779 goto restart;
2780 }
2781 set_mbuf_nsegs(m0, nsegs);
2782 set_mbuf_cflags(m0, cflags);
2783 calculate_mbuf_len16(m0, vm_wr);
2784
2785 #ifdef RATELIMIT
2786 /*
2787 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
2788 * checksumming is enabled. needs_outer_l4_csum happens to check for
2789 * all the right things.
2790 */
2791 if (__predict_false(needs_eo(mst) && !needs_outer_l4_csum(m0))) {
2792 m_snd_tag_rele(m0->m_pkthdr.snd_tag);
2793 m0->m_pkthdr.snd_tag = NULL;
2794 m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
2795 mst = NULL;
2796 }
2797 #endif
2798
2799 if (!needs_hwcsum(m0)
2800 #ifdef RATELIMIT
2801 && !needs_eo(mst)
2802 #endif
2803 )
2804 return (0);
2805
2806 m = m0;
2807 eh = mtod(m, struct ether_header *);
2808 eh_type = ntohs(eh->ether_type);
2809 if (eh_type == ETHERTYPE_VLAN) {
2810 struct ether_vlan_header *evh = (void *)eh;
2811
2812 eh_type = ntohs(evh->evl_proto);
2813 m0->m_pkthdr.l2hlen = sizeof(*evh);
2814 } else
2815 m0->m_pkthdr.l2hlen = sizeof(*eh);
2816
2817 #if defined(INET) || defined(INET6)
2818 offset = 0;
2819 #ifdef INET
2820 l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2821 #else
2822 m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2823 #endif
2824 #endif
2825
2826 switch (eh_type) {
2827 #ifdef INET6
2828 case ETHERTYPE_IPV6:
2829 m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
2830 break;
2831 #endif
2832 #ifdef INET
2833 case ETHERTYPE_IP:
2834 {
2835 struct ip *ip = l3hdr;
2836
2837 if (needs_vxlan_csum(m0)) {
2838 /* Driver will do the outer IP hdr checksum. */
2839 ip->ip_sum = 0;
2840 if (needs_vxlan_tso(m0)) {
2841 const uint16_t ipl = ip->ip_len;
2842
2843 ip->ip_len = 0;
2844 ip->ip_sum = ~in_cksum_hdr(ip);
2845 ip->ip_len = ipl;
2846 } else
2847 ip->ip_sum = in_cksum_hdr(ip);
2848 }
2849 m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
2850 break;
2851 }
2852 #endif
2853 default:
2854 if (ratecheck(&txerr_ratecheck, &txerr_interval)) {
2855 log(LOG_ERR, "%s: ethertype 0x%04x unknown. "
2856 "if_cxgbe must be compiled with the same "
2857 "INET/INET6 options as the kernel.\n", __func__,
2858 eh_type);
2859 }
2860 rc = EINVAL;
2861 goto fail;
2862 }
2863
2864 #if defined(INET) || defined(INET6)
2865 if (needs_vxlan_csum(m0)) {
2866 m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
2867 m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
2868
2869 /* Inner headers. */
2870 eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
2871 sizeof(struct udphdr) + sizeof(struct vxlan_header));
2872 eh_type = ntohs(eh->ether_type);
2873 if (eh_type == ETHERTYPE_VLAN) {
2874 struct ether_vlan_header *evh = (void *)eh;
2875
2876 eh_type = ntohs(evh->evl_proto);
2877 m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
2878 } else
2879 m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
2880 #ifdef INET
2881 l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
2882 #else
2883 m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
2884 #endif
2885
2886 switch (eh_type) {
2887 #ifdef INET6
2888 case ETHERTYPE_IPV6:
2889 m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
2890 break;
2891 #endif
2892 #ifdef INET
2893 case ETHERTYPE_IP:
2894 {
2895 struct ip *ip = l3hdr;
2896
2897 m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
2898 break;
2899 }
2900 #endif
2901 default:
2902 if (ratecheck(&txerr_ratecheck, &txerr_interval)) {
2903 log(LOG_ERR, "%s: VXLAN hw offload requested "
2904 "with unknown ethertype 0x%04x. if_cxgbe "
2905 "must be compiled with the same INET/INET6 "
2906 "options as the kernel.\n", __func__,
2907 eh_type);
2908 }
2909 rc = EINVAL;
2910 goto fail;
2911 }
2912 if (needs_inner_tcp_csum(m0)) {
2913 tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
2914 m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
2915 }
2916 MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
2917 m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
2918 CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
2919 CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
2920 CSUM_ENCAP_VXLAN;
2921 }
2922
2923 if (needs_outer_tcp_csum(m0)) {
2924 tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2925 m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2926 #ifdef RATELIMIT
2927 if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
2928 set_mbuf_eo_tsclk_tsoff(m0,
2929 V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
2930 V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
2931 } else
2932 set_mbuf_eo_tsclk_tsoff(m0, 0);
2933 } else if (needs_outer_udp_csum(m0)) {
2934 m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
2935 #endif
2936 }
2937 #ifdef RATELIMIT
2938 if (needs_eo(mst)) {
2939 u_int immhdrs;
2940
2941 /* EO WRs have the headers in the WR and not the GL. */
2942 immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
2943 m0->m_pkthdr.l4hlen;
2944 cflags = 0;
2945 nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
2946 MPASS(cflags == mbuf_cflags(m0));
2947 set_mbuf_eo_nsegs(m0, nsegs);
2948 set_mbuf_eo_len16(m0,
2949 txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
2950 rc = ethofld_transmit(mst->ifp, m0);
2951 if (rc != 0)
2952 goto fail;
2953 return (EINPROGRESS);
2954 }
2955 #endif
2956 #endif
2957 MPASS(m0 == *mp);
2958 return (0);
2959 }
2960
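/*
 * Reserves room for a work request of len16 * 16 bytes.  Returns a pointer
 * directly into the descriptor ring when there is space and no backlog, or
 * into a temporary buffer (a spare wrqe, or the wrq's scratch space for WRs
 * that would wrap) otherwise.  Must be paired with commit_wrq_wr.
 */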
2961 void *
2962 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2963 {
2964 struct sge_eq *eq = &wrq->eq;
2965 struct adapter *sc = wrq->adapter;
2966 int ndesc, available;
2967 struct wrqe *wr;
2968 void *w;
2969
2970 MPASS(len16 > 0);
2971 ndesc = tx_len16_to_desc(len16);
2972 MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2973
2974 EQ_LOCK(eq);
2975 if (__predict_false((eq->flags & EQ_HW_ALLOCATED) == 0)) {
2976 EQ_UNLOCK(eq);
2977 return (NULL);
2978 }
2979
2980 if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2981 drain_wrq_wr_list(sc, wrq);
2982
2983 if (!STAILQ_EMPTY(&wrq->wr_list)) {
2984 slowpath:
2985 EQ_UNLOCK(eq);
2986 wr = alloc_wrqe(len16 * 16, wrq);
2987 if (__predict_false(wr == NULL))
2988 return (NULL);
2989 cookie->pidx = -1;
2990 cookie->ndesc = ndesc;
2991 return (&wr->wr);
2992 }
2993
2994 eq->cidx = read_hw_cidx(eq);
2995 if (eq->pidx == eq->cidx)
2996 available = eq->sidx - 1;
2997 else
2998 available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2999 if (available < ndesc)
3000 goto slowpath;
3001
3002 cookie->pidx = eq->pidx;
3003 cookie->ndesc = ndesc;
3004 TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
3005
3006 w = &eq->desc[eq->pidx];
3007 IDXINCR(eq->pidx, ndesc, eq->sidx);
3008 if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
3009 w = &wrq->ss[0];
3010 wrq->ss_pidx = cookie->pidx;
3011 wrq->ss_len = len16 * 16;
3012 }
3013
3014 EQ_UNLOCK(eq);
3015
3016 return (w);
3017 }
3018
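/*
 * Completes a work request started with start_wrq_wr.  WRs staged in the
 * spare/scratch buffer are copied into the ring here.  The doorbell is rung
 * only when this WR is at the head of the incomplete list; otherwise its
 * descriptors are folded into a neighbouring cookie and rung later.
 */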
3019 void
3020 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
3021 {
3022 struct sge_eq *eq = &wrq->eq;
3023 struct adapter *sc = wrq->adapter;
3024 int ndesc, pidx;
3025 struct wrq_cookie *prev, *next;
3026
3027 if (cookie->pidx == -1) {
3028 struct wrqe *wr = __containerof(w, struct wrqe, wr);
3029
3030 t4_wrq_tx(sc, wr);
3031 return;
3032 }
3033
3034 if (__predict_false(w == &wrq->ss[0])) {
3035 int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
3036
3037 MPASS(wrq->ss_len > n); /* WR had better wrap around. */
3038 bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
3039 bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
3040 wrq->tx_wrs_ss++;
3041 } else
3042 wrq->tx_wrs_direct++;
3043
3044 EQ_LOCK(eq);
3045 ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */
3046 pidx = cookie->pidx;
3047 MPASS(pidx >= 0 && pidx < eq->sidx);
3048 prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
3049 next = TAILQ_NEXT(cookie, link);
3050 if (prev == NULL) {
3051 MPASS(pidx == eq->dbidx);
3052 if (next == NULL || ndesc >= 16) {
3053 int available;
3054 struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */
3055
3056 /*
3057 * Note that the WR via which we'll request tx updates
3058 * is at pidx and not eq->pidx, which has moved on
3059 * already.
3060 */
3061 dst = (void *)&eq->desc[pidx];
3062 available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
3063 if (available < eq->sidx / 4 &&
3064 atomic_cmpset_int(&eq->equiq, 0, 1)) {
3065 /*
3066 * XXX: This is not 100% reliable with some
3067 * types of WRs. But this is a very unusual
3068 * situation for an ofld/ctrl queue anyway.
3069 */
3070 dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
3071 F_FW_WR_EQUEQ);
3072 }
3073
3074 if (__predict_true(eq->flags & EQ_HW_ALLOCATED))
3075 ring_eq_db(wrq->adapter, eq, ndesc);
3076 else
3077 IDXINCR(eq->dbidx, ndesc, eq->sidx);
3078 } else {
3079 MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
3080 next->pidx = pidx;
3081 next->ndesc += ndesc;
3082 }
3083 } else {
3084 MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
3085 prev->ndesc += ndesc;
3086 }
3087 TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
3088
3089 if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
3090 drain_wrq_wr_list(sc, wrq);
3091
3092 #ifdef INVARIANTS
3093 if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
3094 /* Doorbell must have caught up to the pidx. */
3095 MPASS(wrq->eq.pidx == wrq->eq.dbidx);
3096 }
3097 #endif
3098 EQ_UNLOCK(eq);
3099 }
3100
3101 static u_int
3102 can_resume_eth_tx(struct mp_ring *r)
3103 {
3104 struct sge_eq *eq = r->cookie;
3105
3106 return (total_available_tx_desc(eq) > eq->sidx / 8);
3107 }
3108
3109 static inline bool
3110 cannot_use_txpkts(struct mbuf *m)
3111 {
3112 /* maybe put a GL limit too, to avoid silliness? */
3113
3114 return (needs_tso(m) || (mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0);
3115 }
3116
3117 static inline int
3118 discard_tx(struct sge_eq *eq)
3119 {
3120
3121 return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED);
3122 }
3123
3124 static inline int
3125 wr_can_update_eq(void *p)
3126 {
3127 struct fw_eth_tx_pkts_wr *wr = p;
3128
3129 switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
3130 case FW_ULPTX_WR:
3131 case FW_ETH_TX_PKT_WR:
3132 case FW_ETH_TX_PKTS_WR:
3133 case FW_ETH_TX_PKTS2_WR:
3134 case FW_ETH_TX_PKT_VM_WR:
3135 case FW_ETH_TX_PKTS_VM_WR:
3136 return (1);
3137 default:
3138 return (0);
3139 }
3140 }
3141
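/*
 * Asks the hardware for an egress update (EQUEQ, and EQUIQ as well when
 * packets are waiting to be coalesced or free descriptors are running low),
 * and otherwise only every 32 descriptors, so that tx reclamation keeps
 * making progress without flooding the queue with updates.
 */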
3142 static inline void
3143 set_txupdate_flags(struct sge_txq *txq, u_int avail,
3144 struct fw_eth_tx_pkt_wr *wr)
3145 {
3146 struct sge_eq *eq = &txq->eq;
3147 struct txpkts *txp = &txq->txp;
3148
3149 if ((txp->npkt > 0 || avail < eq->sidx / 2) &&
3150 atomic_cmpset_int(&eq->equiq, 0, 1)) {
3151 wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
3152 eq->equeqidx = eq->pidx;
3153 } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
3154 wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
3155 eq->equeqidx = eq->pidx;
3156 }
3157 }
3158
3159 #if defined(__i386__) || defined(__amd64__)
3160 extern uint64_t tsc_freq;
3161 #endif
3162
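/*
 * Records the time of this tx and reports whether the previous one was within
 * the coalescing gap (t4_tx_coalesce_gap microseconds, converted to TSC
 * cycles on x86; effectively never on other platforms).
 */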
3163 static inline bool
3164 record_eth_tx_time(struct sge_txq *txq)
3165 {
3166 const uint64_t cycles = get_cyclecount();
3167 const uint64_t last_tx = txq->last_tx;
3168 #if defined(__i386__) || defined(__amd64__)
3169 const uint64_t itg = tsc_freq * t4_tx_coalesce_gap / 1000000;
3170 #else
3171 const uint64_t itg = 0;
3172 #endif
3173
3174 MPASS(cycles >= last_tx);
3175 txq->last_tx = cycles;
3176 return (cycles - last_tx < itg);
3177 }
3178
3179 /*
3180 * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
3181 * be consumed. Return the actual number consumed. 0 indicates a stall.
3182 */
3183 static u_int
3184 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
3185 {
3186 struct sge_txq *txq = r->cookie;
3187 if_t ifp = txq->ifp;
3188 struct sge_eq *eq = &txq->eq;
3189 struct txpkts *txp = &txq->txp;
3190 struct vi_info *vi = if_getsoftc(ifp);
3191 struct adapter *sc = vi->adapter;
3192 u_int total, remaining; /* # of packets */
3193 u_int n, avail, dbdiff; /* # of hardware descriptors */
3194 int i, rc;
3195 struct mbuf *m0;
3196 bool snd, recent_tx;
3197 void *wr; /* start of the last WR written to the ring */
3198
3199 TXQ_LOCK_ASSERT_OWNED(txq);
3200 recent_tx = record_eth_tx_time(txq);
3201
3202 remaining = IDXDIFF(pidx, cidx, r->size);
3203 if (__predict_false(discard_tx(eq))) {
3204 for (i = 0; i < txp->npkt; i++)
3205 m_freem(txp->mb[i]);
3206 txp->npkt = 0;
3207 while (cidx != pidx) {
3208 m0 = r->items[cidx];
3209 m_freem(m0);
3210 if (++cidx == r->size)
3211 cidx = 0;
3212 }
3213 reclaim_tx_descs(txq, eq->sidx);
3214 *coalescing = false;
3215 return (remaining); /* emptied */
3216 }
3217
3218 /* How many hardware descriptors do we have readily available. */
3219 if (eq->pidx == eq->cidx)
3220 avail = eq->sidx - 1;
3221 else
3222 avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
3223
3224 total = 0;
3225 if (remaining == 0) {
3226 txp->score = 0;
3227 txq->txpkts_flush++;
3228 goto send_txpkts;
3229 }
3230
3231 dbdiff = 0;
3232 MPASS(remaining > 0);
3233 while (remaining > 0) {
3234 m0 = r->items[cidx];
3235 M_ASSERTPKTHDR(m0);
3236 MPASS(m0->m_nextpkt == NULL);
3237
3238 if (avail < 2 * SGE_MAX_WR_NDESC)
3239 avail += reclaim_tx_descs(txq, 64);
3240
3241 if (t4_tx_coalesce == 0 && txp->npkt == 0)
3242 goto skip_coalescing;
3243 if (cannot_use_txpkts(m0))
3244 txp->score = 0;
3245 else if (recent_tx) {
3246 if (++txp->score == 0)
3247 txp->score = UINT8_MAX;
3248 } else
3249 txp->score = 1;
3250 if (txp->npkt > 0 || remaining > 1 ||
3251 txp->score >= t4_tx_coalesce_pkts ||
3252 atomic_load_int(&txq->eq.equiq) != 0) {
3253 if (vi->flags & TX_USES_VM_WR)
3254 rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
3255 else
3256 rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
3257 } else {
3258 snd = false;
3259 rc = EINVAL;
3260 }
3261 if (snd) {
3262 MPASS(txp->npkt > 0);
3263 for (i = 0; i < txp->npkt; i++)
3264 ETHER_BPF_MTAP(ifp, txp->mb[i]);
3265 if (txp->npkt > 1) {
3266 MPASS(avail >= tx_len16_to_desc(txp->len16));
3267 if (vi->flags & TX_USES_VM_WR)
3268 n = write_txpkts_vm_wr(sc, txq);
3269 else
3270 n = write_txpkts_wr(sc, txq);
3271 } else {
3272 MPASS(avail >=
3273 tx_len16_to_desc(mbuf_len16(txp->mb[0])));
3274 if (vi->flags & TX_USES_VM_WR)
3275 n = write_txpkt_vm_wr(sc, txq,
3276 txp->mb[0]);
3277 else
3278 n = write_txpkt_wr(sc, txq, txp->mb[0],
3279 avail);
3280 }
3281 MPASS(n <= SGE_MAX_WR_NDESC);
3282 avail -= n;
3283 dbdiff += n;
3284 wr = &eq->desc[eq->pidx];
3285 IDXINCR(eq->pidx, n, eq->sidx);
3286 txp->npkt = 0; /* emptied */
3287 }
3288 if (rc == 0) {
3289 /* m0 was coalesced into txq->txpkts. */
3290 goto next_mbuf;
3291 }
3292 if (rc == EAGAIN) {
3293 /*
3294 * m0 is suitable for tx coalescing but could not be
3295 * combined with the existing txq->txpkts, which has now
3296 * been transmitted. Start a new txpkts with m0.
3297 */
3298 MPASS(snd);
3299 MPASS(txp->npkt == 0);
3300 continue;
3301 }
3302
3303 MPASS(rc != 0 && rc != EAGAIN);
3304 MPASS(txp->npkt == 0);
3305 skip_coalescing:
3306 n = tx_len16_to_desc(mbuf_len16(m0));
3307 if (__predict_false(avail < n)) {
3308 avail += reclaim_tx_descs(txq, min(n, 32));
3309 if (avail < n)
3310 break; /* out of descriptors */
3311 }
3312
3313 wr = &eq->desc[eq->pidx];
3314 if (mbuf_cflags(m0) & MC_RAW_WR) {
3315 n = write_raw_wr(txq, wr, m0, avail);
3316 #ifdef KERN_TLS
3317 } else if (mbuf_cflags(m0) & MC_TLS) {
3318 ETHER_BPF_MTAP(ifp, m0);
3319 if (is_t6(sc))
3320 n = t6_ktls_write_wr(txq, wr, m0, avail);
3321 else
3322 n = t7_ktls_write_wr(txq, wr, m0, avail);
3323 #endif
3324 } else {
3325 ETHER_BPF_MTAP(ifp, m0);
3326 if (vi->flags & TX_USES_VM_WR)
3327 n = write_txpkt_vm_wr(sc, txq, m0);
3328 else
3329 n = write_txpkt_wr(sc, txq, m0, avail);
3330 }
3331 MPASS(n >= 1 && n <= avail);
3332 if (!(mbuf_cflags(m0) & MC_TLS))
3333 MPASS(n <= SGE_MAX_WR_NDESC);
3334
3335 avail -= n;
3336 dbdiff += n;
3337 IDXINCR(eq->pidx, n, eq->sidx);
3338
3339 if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */
3340 if (wr_can_update_eq(wr))
3341 set_txupdate_flags(txq, avail, wr);
3342 ring_eq_db(sc, eq, dbdiff);
3343 avail += reclaim_tx_descs(txq, 32);
3344 dbdiff = 0;
3345 }
3346 next_mbuf:
3347 total++;
3348 remaining--;
3349 if (__predict_false(++cidx == r->size))
3350 cidx = 0;
3351 }
3352 if (dbdiff != 0) {
3353 if (wr_can_update_eq(wr))
3354 set_txupdate_flags(txq, avail, wr);
3355 ring_eq_db(sc, eq, dbdiff);
3356 reclaim_tx_descs(txq, 32);
3357 } else if (eq->pidx == eq->cidx && txp->npkt > 0 &&
3358 atomic_load_int(&txq->eq.equiq) == 0) {
3359 /*
3360 * If nothing was submitted to the chip for tx (it was coalesced
3361 * into txpkts instead) and there is no tx update outstanding
3362 * then we need to send txpkts now.
3363 */
3364 send_txpkts:
3365 MPASS(txp->npkt > 0);
3366 for (i = 0; i < txp->npkt; i++)
3367 ETHER_BPF_MTAP(ifp, txp->mb[i]);
3368 if (txp->npkt > 1) {
3369 MPASS(avail >= tx_len16_to_desc(txp->len16));
3370 if (vi->flags & TX_USES_VM_WR)
3371 n = write_txpkts_vm_wr(sc, txq);
3372 else
3373 n = write_txpkts_wr(sc, txq);
3374 } else {
3375 MPASS(avail >=
3376 tx_len16_to_desc(mbuf_len16(txp->mb[0])));
3377 if (vi->flags & TX_USES_VM_WR)
3378 n = write_txpkt_vm_wr(sc, txq, txp->mb[0]);
3379 else
3380 n = write_txpkt_wr(sc, txq, txp->mb[0], avail);
3381 }
3382 MPASS(n <= SGE_MAX_WR_NDESC);
3383 wr = &eq->desc[eq->pidx];
3384 IDXINCR(eq->pidx, n, eq->sidx);
3385 txp->npkt = 0; /* emptied */
3386
3387 MPASS(wr_can_update_eq(wr));
3388 set_txupdate_flags(txq, avail - n, wr);
3389 ring_eq_db(sc, eq, n);
3390 reclaim_tx_descs(txq, 32);
3391 }
3392 *coalescing = txp->npkt > 0;
3393
3394 return (total);
3395 }
3396
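/*
 * Initializes the software state of an ingress queue (interrupt parameters,
 * size, congestion setting).  The hardware queue itself is created later by
 * alloc_iq_fl_hwq.
 */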
3397 static inline void
3398 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
3399 int qsize, int intr_idx, int cong, int qtype)
3400 {
3401
3402 KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
3403 ("%s: bad tmr_idx %d", __func__, tmr_idx));
3404 KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */
3405 ("%s: bad pktc_idx %d", __func__, pktc_idx));
3406 KASSERT(intr_idx >= -1 && intr_idx < sc->intr_count,
3407 ("%s: bad intr_idx %d", __func__, intr_idx));
3408 KASSERT(qtype == FW_IQ_IQTYPE_OTHER || qtype == FW_IQ_IQTYPE_NIC ||
3409 qtype == FW_IQ_IQTYPE_OFLD, ("%s: bad qtype %d", __func__, qtype));
3410
3411 iq->flags = 0;
3412 iq->state = IQS_DISABLED;
3413 iq->adapter = sc;
3414 iq->qtype = qtype;
3415 iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
3416 iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
3417 if (pktc_idx >= 0) {
3418 iq->intr_params |= F_QINTR_CNT_EN;
3419 iq->intr_pktc_idx = pktc_idx;
3420 }
3421 iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */
3422 iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
3423 iq->intr_idx = intr_idx;
3424 iq->cong_drop = cong;
3425 }
3426
3427 static inline void
3428 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
3429 {
3430 struct sge_params *sp = &sc->params.sge;
3431
3432 fl->qsize = qsize;
3433 fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
3434 strlcpy(fl->lockname, name, sizeof(fl->lockname));
3435 mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
3436 if (sc->flags & BUF_PACKING_OK &&
3437 ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */
3438 (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
3439 fl->flags |= FL_BUF_PACKING;
3440 fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING);
3441 fl->safe_zidx = sc->sge.safe_zidx;
3442 if (fl->flags & FL_BUF_PACKING) {
3443 fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
3444 fl->buf_boundary = sp->pack_boundary;
3445 } else {
3446 fl->lowat = roundup2(sp->fl_starve_threshold, 8);
3447 fl->buf_boundary = 16;
3448 }
3449 if (fl_pad && fl->buf_boundary < sp->pad_boundary)
3450 fl->buf_boundary = sp->pad_boundary;
3451 }
3452
3453 static inline void
3454 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
3455 uint8_t port_id, struct sge_iq *iq, char *name)
3456 {
3457 KASSERT(eqtype >= EQ_CTRL && eqtype <= EQ_OFLD,
3458 ("%s: bad qtype %d", __func__, eqtype));
3459
3460 eq->type = eqtype;
3461 eq->port_id = port_id;
3462 eq->tx_chan = sc->port[port_id]->tx_chan;
3463 eq->hw_port = sc->port[port_id]->hw_port;
3464 eq->iq = iq;
3465 eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
3466 strlcpy(eq->lockname, name, sizeof(eq->lockname));
3467 mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
3468 }
3469
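/*
 * Allocates a physically contiguous, zeroed DMA ring of 'len' bytes and
 * returns its tag, map, bus address, and kernel virtual address.
 */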
3470 int
3471 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
3472 bus_dmamap_t *map, bus_addr_t *pa, void **va)
3473 {
3474 int rc;
3475
3476 rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
3477 BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
3478 if (rc != 0) {
3479 CH_ERR(sc, "cannot allocate DMA tag: %d\n", rc);
3480 goto done;
3481 }
3482
3483 rc = bus_dmamem_alloc(*tag, va,
3484 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
3485 if (rc != 0) {
3486 CH_ERR(sc, "cannot allocate DMA memory: %d\n", rc);
3487 goto done;
3488 }
3489
3490 rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
3491 if (rc != 0) {
3492 CH_ERR(sc, "cannot load DMA map: %d\n", rc);
3493 goto done;
3494 }
3495 done:
3496 if (rc)
3497 free_ring(sc, *tag, *map, *pa, *va);
3498
3499 return (rc);
3500 }
3501
3502 int
3503 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
3504 bus_addr_t pa, void *va)
3505 {
3506 if (pa)
3507 bus_dmamap_unload(tag, map);
3508 if (va)
3509 bus_dmamem_free(tag, va, map);
3510 if (tag)
3511 bus_dma_tag_destroy(tag);
3512
3513 return (0);
3514 }
3515
3516 /*
3517 * Allocates the software resources (mainly memory and sysctl nodes) for an
3518 * ingress queue and an optional freelist.
3519 *
3520 * Sets IQ_SW_ALLOCATED and returns 0 on success.
3521 */
3522 static int
3523 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
3524 struct sysctl_ctx_list *ctx, struct sysctl_oid *oid)
3525 {
3526 int rc;
3527 size_t len;
3528 struct adapter *sc = vi->adapter;
3529
3530 MPASS(!(iq->flags & IQ_SW_ALLOCATED));
3531
3532 len = iq->qsize * IQ_ESIZE;
3533 rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
3534 (void **)&iq->desc);
3535 if (rc != 0)
3536 return (rc);
3537
3538 if (fl) {
3539 len = fl->qsize * EQ_ESIZE;
3540 rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
3541 &fl->ba, (void **)&fl->desc);
3542 if (rc) {
3543 free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba,
3544 iq->desc);
3545 return (rc);
3546 }
3547
3548 /* Allocate space for one software descriptor per buffer. */
3549 fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc),
3550 M_CXGBE, M_ZERO | M_WAITOK);
3551
3552 add_fl_sysctls(sc, ctx, oid, fl);
3553 iq->flags |= IQ_HAS_FL;
3554 }
3555 add_iq_sysctls(ctx, oid, iq);
3556 iq->flags |= IQ_SW_ALLOCATED;
3557
3558 return (0);
3559 }
3560
3561 /*
3562 * Frees all software resources (memory and locks) associated with an ingress
3563 * queue and an optional freelist.
3564 */
3565 static void
3566 free_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
3567 {
3568 MPASS(iq->flags & IQ_SW_ALLOCATED);
3569
3570 if (fl) {
3571 MPASS(iq->flags & IQ_HAS_FL);
3572 free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc);
3573 free_fl_buffers(sc, fl);
3574 free(fl->sdesc, M_CXGBE);
3575 mtx_destroy(&fl->fl_lock);
3576 bzero(fl, sizeof(*fl));
3577 }
3578 free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
3579 bzero(iq, sizeof(*iq));
3580 }
3581
3582 /*
3583 * Allocates a hardware ingress queue and an optional freelist that will be
3584 * associated with it.
3585 *
3586 * Returns errno on failure. Resources allocated up to that point may still be
3587 * allocated. Caller is responsible for cleanup in case this function fails.
3588 */
3589 static int
3590 alloc_iq_fl_hwq(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
3591 {
3592 int rc, cntxt_id, cong_map;
3593 struct fw_iq_cmd c;
3594 struct adapter *sc = vi->adapter;
3595 struct port_info *pi = vi->pi;
3596 __be32 v = 0;
3597
3598 MPASS(!(iq->flags & IQ_HW_ALLOCATED));
3599
3600 bzero(&c, sizeof(c));
3601 c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
3602 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
3603 V_FW_IQ_CMD_VFN(0));
3604
3605 c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
3606 FW_LEN16(c));
3607
3608 /* Special handling for firmware event queue */
3609 if (iq == &sc->sge.fwq)
3610 v |= F_FW_IQ_CMD_IQASYNCH;
3611
3612 if (iq->intr_idx < 0) {
3613 /* Forwarded interrupts, all headed to fwq */
3614 v |= F_FW_IQ_CMD_IQANDST;
3615 v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
3616 } else {
3617 KASSERT(iq->intr_idx < sc->intr_count,
3618 ("%s: invalid direct intr_idx %d", __func__, iq->intr_idx));
3619 v |= V_FW_IQ_CMD_IQANDSTINDEX(iq->intr_idx);
3620 }
3621
3622 bzero(iq->desc, iq->qsize * IQ_ESIZE);
3623 c.type_to_iqandstindex = htobe32(v |
3624 V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
3625 V_FW_IQ_CMD_VIID(vi->viid) |
3626 V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
3627 c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->hw_port) |
3628 F_FW_IQ_CMD_IQGTSMODE |
3629 V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
3630 V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
3631 c.iqsize = htobe16(iq->qsize);
3632 c.iqaddr = htobe64(iq->ba);
3633 c.iqns_to_fl0congen = htobe32(V_FW_IQ_CMD_IQTYPE(iq->qtype));
3634 if (iq->cong_drop != -1) {
3635 if (iq->qtype == IQ_ETH) {
3636 if (chip_id(sc) >= CHELSIO_T7)
3637 cong_map = 1 << pi->hw_port;
3638 else
3639 cong_map = pi->rx_e_chan_map;
3640 } else
3641 cong_map = 0;
3642 c.iqns_to_fl0congen |= htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
3643 }
3644
3645 if (fl) {
3646 bzero(fl->desc, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len);
3647 c.iqns_to_fl0congen |=
3648 htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
3649 F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
3650 (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
3651 (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
3652 0));
3653 if (iq->cong_drop != -1) {
3654 c.iqns_to_fl0congen |=
3655 htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong_map) |
3656 F_FW_IQ_CMD_FL0CONGCIF |
3657 F_FW_IQ_CMD_FL0CONGEN);
3658 }
3659 c.fl0dcaen_to_fl0cidxfthresh =
3660 htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
3661 X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) |
3662 V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
3663 X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
3664 c.fl0size = htobe16(fl->qsize);
3665 c.fl0addr = htobe64(fl->ba);
3666 }
3667
3668 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3669 if (rc != 0) {
3670 CH_ERR(sc, "failed to create hw ingress queue: %d\n", rc);
3671 return (rc);
3672 }
3673
3674 iq->cidx = 0;
3675 iq->gen = F_RSPD_GEN;
3676 iq->cntxt_id = be16toh(c.iqid);
3677 iq->abs_id = be16toh(c.physiqid);
3678
3679 cntxt_id = iq->cntxt_id - sc->sge.iq_start;
3680 if (cntxt_id >= sc->sge.iqmap_sz) {
3681 panic("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
3682 cntxt_id, sc->sge.iqmap_sz - 1);
3683 }
3684 sc->sge.iqmap[cntxt_id] = iq;
3685
3686 if (fl) {
3687 u_int qid;
3688 #ifdef INVARIANTS
3689 int i;
3690
3691 MPASS(!(fl->flags & FL_BUF_RESUME));
3692 for (i = 0; i < fl->sidx * 8; i++)
3693 MPASS(fl->sdesc[i].cl == NULL);
3694 #endif
3695 fl->cntxt_id = be16toh(c.fl0id);
3696 fl->pidx = fl->cidx = fl->hw_cidx = fl->dbidx = 0;
3697 fl->rx_offset = 0;
3698 fl->flags &= ~(FL_STARVING | FL_DOOMED);
3699
3700 cntxt_id = fl->cntxt_id - sc->sge.eq_start;
3701 if (cntxt_id >= sc->sge.eqmap_sz) {
3702 panic("%s: fl->cntxt_id (%d) more than the max (%d)",
3703 __func__, cntxt_id, sc->sge.eqmap_sz - 1);
3704 }
3705 sc->sge.eqmap[cntxt_id] = (void *)fl;
3706
3707 qid = fl->cntxt_id;
3708 if (isset(&sc->doorbells, DOORBELL_UDB)) {
3709 uint32_t s_qpp = sc->params.sge.eq_s_qpp;
3710 uint32_t mask = (1 << s_qpp) - 1;
3711 volatile uint8_t *udb;
3712
3713 udb = sc->udbs_base + UDBS_DB_OFFSET;
3714 udb += (qid >> s_qpp) << PAGE_SHIFT;
3715 qid &= mask;
3716 if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
3717 udb += qid << UDBS_SEG_SHIFT;
3718 qid = 0;
3719 }
3720 fl->udb = (volatile void *)udb;
3721 }
3722 fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
3723
3724 FL_LOCK(fl);
3725 /* Enough to make sure the SGE doesn't think it's starved */
3726 refill_fl(sc, fl, fl->lowat);
3727 FL_UNLOCK(fl);
3728 }
3729
3730 if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) &&
3731 iq->cong_drop != -1) {
3732 t4_sge_set_conm_context(sc, iq->cntxt_id, iq->cong_drop,
3733 cong_map);
3734 }
3735
3736 /* Enable IQ interrupts */
3737 atomic_store_rel_int(&iq->state, IQS_IDLE);
3738 t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
3739 V_INGRESSQID(iq->cntxt_id));
3740
3741 iq->flags |= IQ_HW_ALLOCATED;
3742
3743 return (0);
3744 }
3745
3746 static int
3747 free_iq_fl_hwq(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
3748 {
3749 int rc;
3750
3751 MPASS(iq->flags & IQ_HW_ALLOCATED);
3752 rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
3753 iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff);
3754 if (rc != 0) {
3755 CH_ERR(sc, "failed to free iq %p: %d\n", iq, rc);
3756 return (rc);
3757 }
3758 iq->flags &= ~IQ_HW_ALLOCATED;
3759
3760 return (0);
3761 }
3762
3763 static void
3764 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
3765 struct sge_iq *iq)
3766 {
3767 struct sysctl_oid_list *children;
3768
3769 if (ctx == NULL || oid == NULL)
3770 return;
3771
3772 children = SYSCTL_CHILDREN(oid);
3773 SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
3774 "bus address of descriptor ring");
3775 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3776 iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
3777 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
3778 &iq->abs_id, 0, "absolute id of the queue");
3779 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3780 &iq->cntxt_id, 0, "SGE context id of the queue");
3781 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &iq->cidx,
3782 0, "consumer index");
3783 }
3784
3785 static void
3786 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
3787 struct sysctl_oid *oid, struct sge_fl *fl)
3788 {
3789 struct sysctl_oid_list *children;
3790
3791 if (ctx == NULL || oid == NULL)
3792 return;
3793
3794 children = SYSCTL_CHILDREN(oid);
3795 oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl",
3796 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
3797 children = SYSCTL_CHILDREN(oid);
3798
3799 SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
3800 &fl->ba, "bus address of descriptor ring");
3801 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
3802 fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
3803 "desc ring size in bytes");
3804 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3805 &fl->cntxt_id, 0, "SGE context id of the freelist");
3806 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
3807 fl_pad ? 1 : 0, "padding enabled");
3808 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
3809 fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
3810 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
3811 0, "consumer index");
3812 if (fl->flags & FL_BUF_PACKING) {
3813 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
3814 CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
3815 }
3816 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
3817 0, "producer index");
3818 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
3819 CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
3820 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
3821 CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
3822 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
3823 CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
3824 }
3825
3826 /*
3827 * Idempotent.
3828 */
3829 static int
3830 alloc_fwq(struct adapter *sc)
3831 {
3832 int rc, intr_idx;
3833 struct sge_iq *fwq = &sc->sge.fwq;
3834 struct vi_info *vi = &sc->port[0]->vi[0];
3835
3836 if (!(fwq->flags & IQ_SW_ALLOCATED)) {
3837 MPASS(!(fwq->flags & IQ_HW_ALLOCATED));
3838
3839 if (sc->flags & IS_VF)
3840 intr_idx = 0;
3841 else
3842 intr_idx = sc->intr_count > 1 ? 1 : 0;
3843 init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, intr_idx, -1, IQ_OTHER);
3844 rc = alloc_iq_fl(vi, fwq, NULL, &sc->ctx, sc->fwq_oid);
3845 if (rc != 0) {
3846 CH_ERR(sc, "failed to allocate fwq: %d\n", rc);
3847 return (rc);
3848 }
3849 MPASS(fwq->flags & IQ_SW_ALLOCATED);
3850 }
3851
3852 if (!(fwq->flags & IQ_HW_ALLOCATED)) {
3853 MPASS(fwq->flags & IQ_SW_ALLOCATED);
3854
3855 rc = alloc_iq_fl_hwq(vi, fwq, NULL);
3856 if (rc != 0) {
3857 CH_ERR(sc, "failed to create hw fwq: %d\n", rc);
3858 return (rc);
3859 }
3860 MPASS(fwq->flags & IQ_HW_ALLOCATED);
3861 }
3862
3863 return (0);
3864 }
3865
3866 /*
3867 * Idempotent.
3868 */
3869 static void
3870 free_fwq(struct adapter *sc)
3871 {
3872 struct sge_iq *fwq = &sc->sge.fwq;
3873
3874 if (fwq->flags & IQ_HW_ALLOCATED) {
3875 MPASS(fwq->flags & IQ_SW_ALLOCATED);
3876 free_iq_fl_hwq(sc, fwq, NULL);
3877 MPASS(!(fwq->flags & IQ_HW_ALLOCATED));
3878 }
3879
3880 if (fwq->flags & IQ_SW_ALLOCATED) {
3881 MPASS(!(fwq->flags & IQ_HW_ALLOCATED));
3882 free_iq_fl(sc, fwq, NULL);
3883 MPASS(!(fwq->flags & IQ_SW_ALLOCATED));
3884 }
3885 }
3886
3887 /*
3888 * Idempotent.
3889 */
3890 static int
3891 alloc_ctrlq(struct adapter *sc, int idx)
3892 {
3893 int rc;
3894 char name[16];
3895 struct sysctl_oid *oid;
3896 struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx];
3897
3898 MPASS(idx < sc->sge.nctrlq);
3899
3900 if (!(ctrlq->eq.flags & EQ_SW_ALLOCATED)) {
3901 MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED));
3902
3903 snprintf(name, sizeof(name), "%d", idx);
3904 oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->ctrlq_oid),
3905 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3906 "ctrl queue");
3907
3908 snprintf(name, sizeof(name), "%s ctrlq%d",
3909 device_get_nameunit(sc->dev), idx);
3910 init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE,
3911 idx % sc->params.nports, &sc->sge.fwq, name);
3912 rc = alloc_wrq(sc, NULL, ctrlq, &sc->ctx, oid);
3913 if (rc != 0) {
3914 CH_ERR(sc, "failed to allocate ctrlq%d: %d\n", idx, rc);
3915 sysctl_remove_oid(oid, 1, 1);
3916 return (rc);
3917 }
3918 MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED);
3919 }
3920
3921 if (!(ctrlq->eq.flags & EQ_HW_ALLOCATED)) {
3922 MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED);
3923 MPASS(ctrlq->nwr_pending == 0);
3924 MPASS(ctrlq->ndesc_needed == 0);
3925
3926 rc = alloc_eq_hwq(sc, NULL, &ctrlq->eq, idx);
3927 if (rc != 0) {
3928 CH_ERR(sc, "failed to create hw ctrlq%d: %d\n", idx, rc);
3929 return (rc);
3930 }
3931 MPASS(ctrlq->eq.flags & EQ_HW_ALLOCATED);
3932 }
3933
3934 return (0);
3935 }
3936
3937 /*
3938 * Idempotent.
3939 */
3940 static void
3941 free_ctrlq(struct adapter *sc, int idx)
3942 {
3943 struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx];
3944
3945 if (ctrlq->eq.flags & EQ_HW_ALLOCATED) {
3946 MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED);
3947 free_eq_hwq(sc, NULL, &ctrlq->eq);
3948 MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED));
3949 }
3950
3951 if (ctrlq->eq.flags & EQ_SW_ALLOCATED) {
3952 MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED));
3953 free_wrq(sc, ctrlq);
3954 MPASS(!(ctrlq->eq.flags & EQ_SW_ALLOCATED));
3955 }
3956 }
3957
3958 int
3959 t4_sge_set_conm_context(struct adapter *sc, int cntxt_id, int cong_drop,
3960 int cong_map)
3961 {
3962 const int cng_ch_bits_log = sc->chip_params->cng_ch_bits_log;
3963 uint32_t param, val;
3964 uint16_t ch_map;
3965 int cong_mode, rc, i;
3966
3967 if (chip_id(sc) < CHELSIO_T5)
3968 return (ENOTSUP);
3969
3970 /* Convert the driver knob to the mode understood by the firmware. */
3971 switch (cong_drop) {
3972 case -1:
3973 cong_mode = X_CONMCTXT_CNGTPMODE_DISABLE;
3974 break;
3975 case 0:
3976 cong_mode = X_CONMCTXT_CNGTPMODE_CHANNEL;
3977 break;
3978 case 1:
3979 cong_mode = X_CONMCTXT_CNGTPMODE_QUEUE;
3980 break;
3981 case 2:
3982 cong_mode = X_CONMCTXT_CNGTPMODE_BOTH;
3983 break;
3984 default:
3985 MPASS(0);
3986 CH_ERR(sc, "cong_drop = %d is invalid (ingress queue %d).\n",
3987 cong_drop, cntxt_id);
3988 return (EINVAL);
3989 }
3990
3991 param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
3992 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
3993 V_FW_PARAMS_PARAM_YZ(cntxt_id);
3994 if (chip_id(sc) >= CHELSIO_T7) {
3995 val = V_T7_DMAQ_CONM_CTXT_CNGTPMODE(cong_mode) |
3996 V_T7_DMAQ_CONM_CTXT_CH_VEC(cong_map);
3997 } else {
3998 val = V_CONMCTXT_CNGTPMODE(cong_mode);
3999 if (cong_mode == X_CONMCTXT_CNGTPMODE_CHANNEL ||
4000 cong_mode == X_CONMCTXT_CNGTPMODE_BOTH) {
4001 for (i = 0, ch_map = 0; i < 4; i++) {
4002 if (cong_map & (1 << i))
4003 ch_map |= 1 << (i << cng_ch_bits_log);
4004 }
4005 val |= V_CONMCTXT_CNGCHMAP(ch_map);
4006 }
4007 }
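	/*
	 * Example of the pre-T7 channel map expansion above, with an
	 * assumed cng_ch_bits_log of 2: cong_map = 0x5 (channels 0 and 2)
	 * sets ch_map bits 0 << 2 = 0 and 2 << 2 = 8, i.e. ch_map = 0x101,
	 * which is then placed in the context via V_CONMCTXT_CNGCHMAP().
	 */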
4008 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
4009 if (rc != 0) {
4010 CH_ERR(sc, "failed to set congestion manager context "
4011 "for ingress queue %d: %d\n", cntxt_id, rc);
4012 }
4013
4014 return (rc);
4015 }
4016
4017 /*
4018 * Idempotent.
4019 */
4020 static int
4021 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int idx, int intr_idx,
4022 int maxp)
4023 {
4024 int rc;
4025 struct adapter *sc = vi->adapter;
4026 if_t ifp = vi->ifp;
4027 struct sysctl_oid *oid;
4028 char name[16];
4029
4030 if (!(rxq->iq.flags & IQ_SW_ALLOCATED)) {
4031 MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
4032 #if defined(INET) || defined(INET6)
4033 rc = tcp_lro_init_args(&rxq->lro, ifp, lro_entries, lro_mbufs);
4034 if (rc != 0)
4035 return (rc);
4036 MPASS(rxq->lro.ifp == ifp); /* also indicates LRO init'ed */
4037 #endif
4038 rxq->ifp = ifp;
4039
4040 snprintf(name, sizeof(name), "%d", idx);
4041 oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->rxq_oid),
4042 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
4043 "rx queue");
4044
4045 init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq,
4046 intr_idx, cong_drop, IQ_ETH);
4047 #if defined(INET) || defined(INET6)
4048 if (if_getcapenable(ifp) & IFCAP_LRO)
4049 rxq->iq.flags |= IQ_LRO_ENABLED;
4050 #endif
4051 if (if_getcapenable(ifp) & IFCAP_HWRXTSTMP)
4052 rxq->iq.flags |= IQ_RX_TIMESTAMP;
4053 snprintf(name, sizeof(name), "%s rxq%d-fl",
4054 device_get_nameunit(vi->dev), idx);
4055 init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
4056 rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, &vi->ctx, oid);
4057 if (rc != 0) {
4058 CH_ERR(vi, "failed to allocate rxq%d: %d\n", idx, rc);
4059 sysctl_remove_oid(oid, 1, 1);
4060 #if defined(INET) || defined(INET6)
4061 tcp_lro_free(&rxq->lro);
4062 rxq->lro.ifp = NULL;
4063 #endif
4064 return (rc);
4065 }
4066 MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
4067 add_rxq_sysctls(&vi->ctx, oid, rxq);
4068 }
4069
4070 if (!(rxq->iq.flags & IQ_HW_ALLOCATED)) {
4071 MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
4072 rc = alloc_iq_fl_hwq(vi, &rxq->iq, &rxq->fl);
4073 if (rc != 0) {
4074 CH_ERR(vi, "failed to create hw rxq%d: %d\n", idx, rc);
4075 return (rc);
4076 }
4077 MPASS(rxq->iq.flags & IQ_HW_ALLOCATED);
4078
4079 if (idx == 0)
4080 sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
4081 else
4082 KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
4083 ("iq_base mismatch"));
4084 KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
4085 ("PF with non-zero iq_base"));
4086
4087 /*
4088 * The freelist is just barely above the starvation threshold
4089 		 * right now; fill it up a bit more.
4090 */
4091 FL_LOCK(&rxq->fl);
4092 refill_fl(sc, &rxq->fl, 128);
4093 FL_UNLOCK(&rxq->fl);
4094 }
4095
4096 return (0);
4097 }
4098
4099 /*
4100 * Idempotent.
4101 */
4102 static void
4103 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
4104 {
4105 if (rxq->iq.flags & IQ_HW_ALLOCATED) {
4106 MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
4107 free_iq_fl_hwq(vi->adapter, &rxq->iq, &rxq->fl);
4108 MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
4109 }
4110
4111 if (rxq->iq.flags & IQ_SW_ALLOCATED) {
4112 MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
4113 #if defined(INET) || defined(INET6)
4114 tcp_lro_free(&rxq->lro);
4115 #endif
4116 free_iq_fl(vi->adapter, &rxq->iq, &rxq->fl);
4117 MPASS(!(rxq->iq.flags & IQ_SW_ALLOCATED));
4118 bzero(rxq, sizeof(*rxq));
4119 }
4120 }
4121
4122 static void
4123 add_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
4124 struct sge_rxq *rxq)
4125 {
4126 struct sysctl_oid_list *children;
4127
4128 if (ctx == NULL || oid == NULL)
4129 return;
4130
4131 children = SYSCTL_CHILDREN(oid);
4132 #if defined(INET) || defined(INET6)
4133 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
4134 &rxq->lro.lro_queued, 0, NULL);
4135 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
4136 &rxq->lro.lro_flushed, 0, NULL);
4137 #endif
4138 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
4139 &rxq->rxcsum, "# of times hardware assisted with checksum");
4140 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD,
4141 &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag");
4142 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_rxcsum", CTLFLAG_RD,
4143 &rxq->vxlan_rxcsum,
4144 "# of times hardware assisted with inner checksum (VXLAN)");
4145 }
4146
4147 #ifdef TCP_OFFLOAD
4148 /*
4149 * Idempotent.
4150 */
4151 static int
4152 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int idx,
4153 int intr_idx, int maxp)
4154 {
4155 int rc;
4156 struct adapter *sc = vi->adapter;
4157 struct sysctl_oid *oid;
4158 char name[16];
4159
4160 if (!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)) {
4161 MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
4162
4163 snprintf(name, sizeof(name), "%d", idx);
4164 oid = SYSCTL_ADD_NODE(&vi->ctx,
4165 SYSCTL_CHILDREN(vi->ofld_rxq_oid), OID_AUTO, name,
4166 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload rx queue");
4167
4168 init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
4169 vi->qsize_rxq, intr_idx, ofld_cong_drop, IQ_OFLD);
4170 snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
4171 device_get_nameunit(vi->dev), idx);
4172 init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
4173 rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, &vi->ctx,
4174 oid);
4175 if (rc != 0) {
4176 CH_ERR(vi, "failed to allocate ofld_rxq%d: %d\n", idx,
4177 rc);
4178 sysctl_remove_oid(oid, 1, 1);
4179 return (rc);
4180 }
4181 MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
4182 ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK);
4183 ofld_rxq->rx_iscsi_ddp_setup_error =
4184 counter_u64_alloc(M_WAITOK);
4185 ofld_rxq->rx_nvme_ddp_setup_ok = counter_u64_alloc(M_WAITOK);
4186 ofld_rxq->rx_nvme_ddp_setup_no_stag =
4187 counter_u64_alloc(M_WAITOK);
4188 ofld_rxq->rx_nvme_ddp_setup_error =
4189 counter_u64_alloc(M_WAITOK);
4190 ofld_rxq->rx_nvme_ddp_octets = counter_u64_alloc(M_WAITOK);
4191 ofld_rxq->rx_nvme_ddp_pdus = counter_u64_alloc(M_WAITOK);
4192 ofld_rxq->rx_nvme_fl_octets = counter_u64_alloc(M_WAITOK);
4193 ofld_rxq->rx_nvme_fl_pdus = counter_u64_alloc(M_WAITOK);
4194 ofld_rxq->rx_nvme_invalid_headers = counter_u64_alloc(M_WAITOK);
4195 ofld_rxq->rx_nvme_header_digest_errors =
4196 counter_u64_alloc(M_WAITOK);
4197 ofld_rxq->rx_nvme_data_digest_errors =
4198 counter_u64_alloc(M_WAITOK);
4199 ofld_rxq->ddp_buffer_alloc = counter_u64_alloc(M_WAITOK);
4200 ofld_rxq->ddp_buffer_reuse = counter_u64_alloc(M_WAITOK);
4201 ofld_rxq->ddp_buffer_free = counter_u64_alloc(M_WAITOK);
4202 add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq);
4203 }
4204
4205 if (!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)) {
4206 MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
4207 rc = alloc_iq_fl_hwq(vi, &ofld_rxq->iq, &ofld_rxq->fl);
4208 if (rc != 0) {
4209 CH_ERR(vi, "failed to create hw ofld_rxq%d: %d\n", idx,
4210 rc);
4211 return (rc);
4212 }
4213 MPASS(ofld_rxq->iq.flags & IQ_HW_ALLOCATED);
4214 }
4215 return (rc);
4216 }
4217
4218 /*
4219 * Idempotent.
4220 */
4221 static void
4222 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
4223 {
4224 if (ofld_rxq->iq.flags & IQ_HW_ALLOCATED) {
4225 MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
4226 free_iq_fl_hwq(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl);
4227 MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
4228 }
4229
4230 if (ofld_rxq->iq.flags & IQ_SW_ALLOCATED) {
4231 MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
4232 free_iq_fl(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl);
4233 MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED));
4234 counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok);
4235 counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error);
4236 counter_u64_free(ofld_rxq->rx_nvme_ddp_setup_ok);
4237 counter_u64_free(ofld_rxq->rx_nvme_ddp_setup_no_stag);
4238 counter_u64_free(ofld_rxq->rx_nvme_ddp_setup_error);
4239 counter_u64_free(ofld_rxq->rx_nvme_ddp_octets);
4240 counter_u64_free(ofld_rxq->rx_nvme_ddp_pdus);
4241 counter_u64_free(ofld_rxq->rx_nvme_fl_octets);
4242 counter_u64_free(ofld_rxq->rx_nvme_fl_pdus);
4243 counter_u64_free(ofld_rxq->rx_nvme_invalid_headers);
4244 counter_u64_free(ofld_rxq->rx_nvme_header_digest_errors);
4245 counter_u64_free(ofld_rxq->rx_nvme_data_digest_errors);
4246 counter_u64_free(ofld_rxq->ddp_buffer_alloc);
4247 counter_u64_free(ofld_rxq->ddp_buffer_reuse);
4248 counter_u64_free(ofld_rxq->ddp_buffer_free);
4249 bzero(ofld_rxq, sizeof(*ofld_rxq));
4250 }
4251 }
4252
4253 static void
4254 add_ofld_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
4255 struct sge_ofld_rxq *ofld_rxq)
4256 {
4257 struct sysctl_oid_list *children, *top;
4258
4259 if (ctx == NULL || oid == NULL)
4260 return;
4261
4262 top = children = SYSCTL_CHILDREN(oid);
4263 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "rx_aio_ddp_jobs",
4264 CTLFLAG_RD, &ofld_rxq->rx_aio_ddp_jobs, 0,
4265 "# of aio_read(2) jobs completed via DDP");
4266 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "rx_aio_ddp_octets",
4267 CTLFLAG_RD, &ofld_rxq->rx_aio_ddp_octets, 0,
4268 "# of octets placed directly for aio_read(2) jobs");
4269 SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
4270 "rx_toe_tls_records", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_records,
4271 "# of TOE TLS records received");
4272 SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
4273 "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets,
4274 "# of payload octets in received TOE TLS records");
4275 SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
4276 "rx_toe_ddp_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_ddp_octets,
4277 "# of payload octets received via TCP DDP");
4278 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
4279 "ddp_buffer_alloc", CTLFLAG_RD, &ofld_rxq->ddp_buffer_alloc,
4280 "# of DDP RCV buffers allocated");
4281 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
4282 "ddp_buffer_reuse", CTLFLAG_RD, &ofld_rxq->ddp_buffer_reuse,
4283 "# of DDP RCV buffers reused");
4284 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
4285 "ddp_buffer_free", CTLFLAG_RD, &ofld_rxq->ddp_buffer_free,
4286 "# of DDP RCV buffers freed");
4287
4288 oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi",
4289 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics");
4290 children = SYSCTL_CHILDREN(oid);
4291
4292 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_ok",
4293 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_ok,
4294 "# of times DDP buffer was setup successfully.");
4295 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_error",
4296 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_error,
4297 "# of times DDP buffer setup failed.");
4298 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_octets",
4299 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_octets, 0,
4300 "# of octets placed directly");
4301 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_pdus",
4302 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_pdus, 0,
4303 "# of PDUs with data placed directly.");
4304 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_octets",
4305 CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_octets, 0,
4306 "# of data octets delivered in freelist");
4307 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_pdus",
4308 CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_pdus, 0,
4309 "# of PDUs with data delivered in freelist");
4310 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "padding_errors",
4311 CTLFLAG_RD, &ofld_rxq->rx_iscsi_padding_errors, 0,
4312 "# of PDUs with invalid padding");
4313 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "header_digest_errors",
4314 CTLFLAG_RD, &ofld_rxq->rx_iscsi_header_digest_errors, 0,
4315 "# of PDUs with invalid header digests");
4316 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "data_digest_errors",
4317 CTLFLAG_RD, &ofld_rxq->rx_iscsi_data_digest_errors, 0,
4318 "# of PDUs with invalid data digests");
4319
4320 oid = SYSCTL_ADD_NODE(ctx, top, OID_AUTO, "nvme",
4321 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE NVMe statistics");
4322 children = SYSCTL_CHILDREN(oid);
4323
4324 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_ok",
4325 CTLFLAG_RD, &ofld_rxq->rx_nvme_ddp_setup_ok,
4326 "# of times DDP buffer was setup successfully");
4327 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_no_stag",
4328 CTLFLAG_RD, &ofld_rxq->rx_nvme_ddp_setup_no_stag,
4329 "# of times STAG was not available for DDP buffer setup");
4330 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_error",
4331 CTLFLAG_RD, &ofld_rxq->rx_nvme_ddp_setup_error,
4332 "# of times DDP buffer setup failed");
4333 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_octets",
4334 CTLFLAG_RD, &ofld_rxq->rx_nvme_ddp_octets,
4335 "# of octets placed directly");
4336 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_pdus",
4337 CTLFLAG_RD, &ofld_rxq->rx_nvme_ddp_pdus,
4338 "# of PDUs with data placed directly");
4339 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fl_octets",
4340 CTLFLAG_RD, &ofld_rxq->rx_nvme_fl_octets,
4341 "# of data octets delivered in freelist");
4342 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fl_pdus",
4343 CTLFLAG_RD, &ofld_rxq->rx_nvme_fl_pdus,
4344 "# of PDUs with data delivered in freelist");
4345 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "invalid_headers",
4346 CTLFLAG_RD, &ofld_rxq->rx_nvme_invalid_headers,
4347 "# of PDUs with invalid header field");
4348 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "header_digest_errors",
4349 CTLFLAG_RD, &ofld_rxq->rx_nvme_header_digest_errors,
4350 "# of PDUs with invalid header digests");
4351 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "data_digest_errors",
4352 CTLFLAG_RD, &ofld_rxq->rx_nvme_data_digest_errors,
4353 "# of PDUs with invalid data digests");
4354 }
4355 #endif
4356
4357 /*
4358 * Returns a reasonable automatic cidx flush threshold for a given queue size.
4359 */
4360 static u_int
4361 qsize_to_fthresh(int qsize)
4362 {
4363 u_int fthresh;
4364
4365 fthresh = qsize == 0 ? 0 : order_base_2(qsize);
4366 if (fthresh > X_CIDXFLUSHTHRESH_128)
4367 fthresh = X_CIDXFLUSHTHRESH_128;
4368
4369 return (fthresh);
4370 }
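/*
 * For example, assuming the X_CIDXFLUSHTHRESH_* constants encode powers of
 * two (so X_CIDXFLUSHTHRESH_128 is the encoding for a threshold of 128
 * descriptors): a 64-entry queue yields order_base_2(64) = 6, while a
 * 1024-entry queue computes 10 and is clamped to X_CIDXFLUSHTHRESH_128.
 */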
4371
4372 static int
4373 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq, int idx)
4374 {
4375 int rc, cntxt_id, core;
4376 struct fw_eq_ctrl_cmd c;
4377 int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
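	/*
	 * qsize as passed to the firmware covers the descriptor ring plus
	 * the status page appended to it.  As an illustration, if the
	 * status page occupies one descriptor's worth of space (64B
	 * descriptors and a 64B status page; spg_len is chip/configuration
	 * dependent), qsize is simply sidx + 1.
	 */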
4378
4379 core = sc->params.tid_qid_sel_mask != 0 ? idx % sc->params.ncores : 0;
4380 bzero(&c, sizeof(c));
4381
4382 c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
4383 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
4384 V_FW_EQ_CTRL_CMD_VFN(0));
4385 c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
4386 V_FW_EQ_CTRL_CMD_COREGROUP(core) |
4387 F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
4388 c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
4389 c.physeqid_pkd = htobe32(0);
4390 c.fetchszm_to_iqid =
4391 htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
4392 V_FW_EQ_CTRL_CMD_PCIECHN(eq->hw_port) |
4393 F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
4394 c.dcaen_to_eqsize =
4395 htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
4396 X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
4397 V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
4398 V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
4399 V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
4400 c.eqaddr = htobe64(eq->ba);
4401
4402 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
4403 if (rc != 0) {
4404 CH_ERR(sc, "failed to create hw ctrlq for port %d: %d\n",
4405 eq->port_id, rc);
4406 return (rc);
4407 }
4408
4409 eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
4410 eq->abs_id = G_FW_EQ_CTRL_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
4411 cntxt_id = eq->cntxt_id - sc->sge.eq_start;
4412 if (cntxt_id >= sc->sge.eqmap_sz)
4413 panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
4414 cntxt_id, sc->sge.eqmap_sz - 1);
4415 sc->sge.eqmap[cntxt_id] = eq;
4416
4417 return (rc);
4418 }
4419
4420 static int
4421 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq, int idx)
4422 {
4423 int rc, cntxt_id, core;
4424 struct fw_eq_eth_cmd c;
4425 int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4426
4427 core = sc->params.ncores > 1 ? idx % sc->params.ncores : 0;
4428 bzero(&c, sizeof(c));
4429
4430 c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
4431 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
4432 V_FW_EQ_ETH_CMD_VFN(0));
4433 c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
4434 V_FW_EQ_ETH_CMD_COREGROUP(core) |
4435 F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
4436 c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
4437 F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
4438 c.fetchszm_to_iqid =
4439 htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
4440 V_FW_EQ_ETH_CMD_PCIECHN(eq->hw_port) | F_FW_EQ_ETH_CMD_FETCHRO |
4441 V_FW_EQ_ETH_CMD_IQID(eq->iqid));
4442 c.dcaen_to_eqsize =
4443 htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
4444 X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
4445 V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
4446 V_FW_EQ_ETH_CMD_EQSIZE(qsize));
4447 c.eqaddr = htobe64(eq->ba);
4448
4449 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
4450 if (rc != 0) {
4451 device_printf(vi->dev,
4452 "failed to create Ethernet egress queue: %d\n", rc);
4453 return (rc);
4454 }
4455
4456 eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
4457 eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
4458 cntxt_id = eq->cntxt_id - sc->sge.eq_start;
4459 if (cntxt_id >= sc->sge.eqmap_sz)
4460 panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
4461 cntxt_id, sc->sge.eqmap_sz - 1);
4462 sc->sge.eqmap[cntxt_id] = eq;
4463
4464 return (rc);
4465 }
4466
4467 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4468 /*
4469 * ncores number of uP cores.
4470 * nq number of queues for this VI
4471 * idx queue index
4472 */
4473 static inline int
4474 qidx_to_core(int ncores, int nq, int idx)
4475 {
4476 MPASS(nq % ncores == 0);
4477 MPASS(idx >= 0 && idx < nq);
4478
4479 return (idx * ncores / nq);
4480 }
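/*
 * For example, with ncores = 4 uP cores and nq = 8 queues for the VI,
 * queue indices 0-1 map to core 0, 2-3 to core 1, 4-5 to core 2, and
 * 6-7 to core 3.  The idx * ncores / nq arithmetic relies on nq being a
 * multiple of ncores, which the MPASS above asserts.
 */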
4481
4482 static int
4483 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq,
4484 int idx)
4485 {
4486 int rc, cntxt_id, core;
4487 struct fw_eq_ofld_cmd c;
4488 int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4489
4490 if (sc->params.tid_qid_sel_mask != 0)
4491 core = qidx_to_core(sc->params.ncores, vi->nofldtxq, idx);
4492 else
4493 core = 0;
4494
4495 bzero(&c, sizeof(c));
4496
4497 c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
4498 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
4499 V_FW_EQ_OFLD_CMD_VFN(0));
4500 c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
4501 V_FW_EQ_OFLD_CMD_COREGROUP(core) |
4502 F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
4503 c.fetchszm_to_iqid =
4504 htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
4505 V_FW_EQ_OFLD_CMD_PCIECHN(eq->hw_port) |
4506 F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
4507 c.dcaen_to_eqsize =
4508 htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
4509 X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
4510 V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
4511 V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
4512 V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
4513 c.eqaddr = htobe64(eq->ba);
4514
4515 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
4516 if (rc != 0) {
4517 device_printf(vi->dev,
4518 "failed to create egress queue for TCP offload: %d\n", rc);
4519 return (rc);
4520 }
4521
4522 eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
4523 eq->abs_id = G_FW_EQ_OFLD_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
4524 cntxt_id = eq->cntxt_id - sc->sge.eq_start;
4525 if (cntxt_id >= sc->sge.eqmap_sz)
4526 panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
4527 cntxt_id, sc->sge.eqmap_sz - 1);
4528 sc->sge.eqmap[cntxt_id] = eq;
4529
4530 return (rc);
4531 }
4532 #endif
4533
4534 /* SW only */
4535 static int
4536 alloc_eq(struct adapter *sc, struct sge_eq *eq, struct sysctl_ctx_list *ctx,
4537 struct sysctl_oid *oid)
4538 {
4539 int rc, qsize;
4540 size_t len;
4541
4542 MPASS(!(eq->flags & EQ_SW_ALLOCATED));
4543
4544 qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
4545 len = qsize * EQ_ESIZE;
4546 rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba,
4547 (void **)&eq->desc);
4548 if (rc)
4549 return (rc);
4550 if (ctx != NULL && oid != NULL)
4551 add_eq_sysctls(sc, ctx, oid, eq);
4552 eq->flags |= EQ_SW_ALLOCATED;
4553
4554 return (0);
4555 }
4556
4557 /* SW only */
4558 static void
4559 free_eq(struct adapter *sc, struct sge_eq *eq)
4560 {
4561 MPASS(eq->flags & EQ_SW_ALLOCATED);
4562 if (eq->type == EQ_ETH)
4563 MPASS(eq->pidx == eq->cidx);
4564
4565 free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
4566 mtx_destroy(&eq->eq_lock);
4567 bzero(eq, sizeof(*eq));
4568 }
4569
4570 static void
4571 add_eq_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
4572 struct sysctl_oid *oid, struct sge_eq *eq)
4573 {
4574 struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
4575
4576 SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba,
4577 "bus address of descriptor ring");
4578 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
4579 eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
4580 "desc ring size in bytes");
4581 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
4582 &eq->abs_id, 0, "absolute id of the queue");
4583 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
4584 &eq->cntxt_id, 0, "SGE context id of the queue");
4585 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &eq->cidx,
4586 0, "consumer index");
4587 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &eq->pidx,
4588 0, "producer index");
4589 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
4590 eq->sidx, "status page index");
4591 }
4592
4593 static int
4594 alloc_eq_hwq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq, int idx)
4595 {
4596 int rc;
4597
4598 MPASS(!(eq->flags & EQ_HW_ALLOCATED));
4599
4600 eq->iqid = eq->iq->cntxt_id;
4601 eq->pidx = eq->cidx = eq->dbidx = 0;
4602 /* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
4603 eq->equeqidx = 0;
4604 eq->doorbells = sc->doorbells;
4605 bzero(eq->desc, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len);
4606
4607 switch (eq->type) {
4608 case EQ_CTRL:
4609 rc = ctrl_eq_alloc(sc, eq, idx);
4610 break;
4611
4612 case EQ_ETH:
4613 rc = eth_eq_alloc(sc, vi, eq, idx);
4614 break;
4615
4616 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4617 case EQ_OFLD:
4618 rc = ofld_eq_alloc(sc, vi, eq, idx);
4619 break;
4620 #endif
4621
4622 default:
4623 panic("%s: invalid eq type %d.", __func__, eq->type);
4624 }
4625 if (rc != 0) {
4626 CH_ERR(sc, "failed to allocate egress queue(%d): %d\n",
4627 eq->type, rc);
4628 return (rc);
4629 }
4630
4631 if (isset(&eq->doorbells, DOORBELL_UDB) ||
4632 isset(&eq->doorbells, DOORBELL_UDBWC) ||
4633 isset(&eq->doorbells, DOORBELL_WCWR)) {
4634 uint32_t s_qpp = sc->params.sge.eq_s_qpp;
4635 uint32_t mask = (1 << s_qpp) - 1;
4636 volatile uint8_t *udb;
4637
4638 udb = sc->udbs_base + UDBS_DB_OFFSET;
4639 udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */
4640 eq->udb_qid = eq->cntxt_id & mask; /* id in page */
4641 if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
4642 clrbit(&eq->doorbells, DOORBELL_WCWR);
4643 else {
4644 udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */
4645 eq->udb_qid = 0;
4646 }
4647 eq->udb = (volatile void *)udb;
4648 }
4649
4650 eq->flags |= EQ_HW_ALLOCATED;
4651 return (0);
4652 }
4653
4654 static int
4655 free_eq_hwq(struct adapter *sc, struct vi_info *vi __unused, struct sge_eq *eq)
4656 {
4657 int rc;
4658
4659 MPASS(eq->flags & EQ_HW_ALLOCATED);
4660
4661 switch (eq->type) {
4662 case EQ_CTRL:
4663 rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
4664 break;
4665 case EQ_ETH:
4666 rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
4667 break;
4668 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4669 case EQ_OFLD:
4670 rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
4671 break;
4672 #endif
4673 default:
4674 panic("%s: invalid eq type %d.", __func__, eq->type);
4675 }
4676 if (rc != 0) {
4677 CH_ERR(sc, "failed to free eq (type %d): %d\n", eq->type, rc);
4678 return (rc);
4679 }
4680 eq->flags &= ~EQ_HW_ALLOCATED;
4681
4682 return (0);
4683 }
4684
4685 static int
4686 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
4687 struct sysctl_ctx_list *ctx, struct sysctl_oid *oid)
4688 {
4689 struct sge_eq *eq = &wrq->eq;
4690 int rc;
4691
4692 MPASS(!(eq->flags & EQ_SW_ALLOCATED));
4693
4694 rc = alloc_eq(sc, eq, ctx, oid);
4695 if (rc)
4696 return (rc);
4697 MPASS(eq->flags & EQ_SW_ALLOCATED);
4698 /* Can't fail after this. */
4699
4700 wrq->adapter = sc;
4701 TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
4702 TAILQ_INIT(&wrq->incomplete_wrs);
4703 STAILQ_INIT(&wrq->wr_list);
4704 wrq->nwr_pending = 0;
4705 wrq->ndesc_needed = 0;
4706 add_wrq_sysctls(ctx, oid, wrq);
4707
4708 return (0);
4709 }
4710
4711 static void
4712 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
4713 {
4714 free_eq(sc, &wrq->eq);
4715 MPASS(wrq->nwr_pending == 0);
4716 MPASS(wrq->ndesc_needed == 0);
4717 MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
4718 MPASS(STAILQ_EMPTY(&wrq->wr_list));
4719 bzero(wrq, sizeof(*wrq));
4720 }
4721
4722 static void
4723 add_wrq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
4724 struct sge_wrq *wrq)
4725 {
4726 struct sysctl_oid_list *children;
4727
4728 if (ctx == NULL || oid == NULL)
4729 return;
4730
4731 children = SYSCTL_CHILDREN(oid);
4732 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
4733 &wrq->tx_wrs_direct, "# of work requests (direct)");
4734 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
4735 &wrq->tx_wrs_copied, "# of work requests (copied)");
4736 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
4737 &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
4738 }
4739
4740 /*
4741 * Idempotent.
4742 */
4743 static int
4744 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx)
4745 {
4746 int rc, iqidx;
4747 struct port_info *pi = vi->pi;
4748 struct adapter *sc = vi->adapter;
4749 struct sge_eq *eq = &txq->eq;
4750 struct txpkts *txp;
4751 char name[16];
4752 struct sysctl_oid *oid;
4753
4754 if (!(eq->flags & EQ_SW_ALLOCATED)) {
4755 MPASS(!(eq->flags & EQ_HW_ALLOCATED));
4756
4757 snprintf(name, sizeof(name), "%d", idx);
4758 oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->txq_oid),
4759 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
4760 "tx queue");
4761
4762 iqidx = vi->first_rxq + (idx % vi->nrxq);
4763 snprintf(name, sizeof(name), "%s txq%d",
4764 device_get_nameunit(vi->dev), idx);
4765 init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->port_id,
4766 &sc->sge.rxq[iqidx].iq, name);
4767
4768 rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx,
4769 can_resume_eth_tx, M_CXGBE, &eq->eq_lock, M_WAITOK);
4770 if (rc != 0) {
4771 CH_ERR(vi, "failed to allocate mp_ring for txq%d: %d\n",
4772 idx, rc);
4773 failed:
4774 sysctl_remove_oid(oid, 1, 1);
4775 return (rc);
4776 }
4777
4778 rc = alloc_eq(sc, eq, &vi->ctx, oid);
4779 if (rc) {
4780 CH_ERR(vi, "failed to allocate txq%d: %d\n", idx, rc);
4781 mp_ring_free(txq->r);
4782 goto failed;
4783 }
4784 MPASS(eq->flags & EQ_SW_ALLOCATED);
4785 /* Can't fail after this point. */
4786
4787 TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
4788 txq->ifp = vi->ifp;
4789 txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
4790 txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
4791 M_ZERO | M_WAITOK);
4792
4793 add_txq_sysctls(vi, &vi->ctx, oid, txq);
4794 }
4795
4796 if (!(eq->flags & EQ_HW_ALLOCATED)) {
4797 MPASS(eq->flags & EQ_SW_ALLOCATED);
4798 rc = alloc_eq_hwq(sc, vi, eq, idx);
4799 if (rc != 0) {
4800 CH_ERR(vi, "failed to create hw txq%d: %d\n", idx, rc);
4801 return (rc);
4802 }
4803 MPASS(eq->flags & EQ_HW_ALLOCATED);
4804 /* Can't fail after this point. */
4805
4806 if (idx == 0)
4807 sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
4808 else
4809 KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
4810 ("eq_base mismatch"));
4811 KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
4812 ("PF with non-zero eq_base"));
4813
4814 txp = &txq->txp;
4815 MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
4816 txq->txp.max_npkt = min(nitems(txp->mb),
4817 sc->params.max_pkts_per_eth_tx_pkts_wr);
4818 if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF))
4819 txq->txp.max_npkt--;
4820
4821 if (vi->flags & TX_USES_VM_WR)
4822 txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
4823 V_TXPKT_INTF(pi->hw_port));
4824 else
4825 txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
4826 V_TXPKT_INTF(pi->hw_port) | V_TXPKT_PF(sc->pf) |
4827 V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
4828
4829 txq->tc_idx = -1;
4830 }
4831
4832 return (0);
4833 }
4834
4835 /*
4836 * Idempotent.
4837 */
4838 static void
4839 free_txq(struct vi_info *vi, struct sge_txq *txq)
4840 {
4841 struct adapter *sc = vi->adapter;
4842 struct sge_eq *eq = &txq->eq;
4843
4844 if (eq->flags & EQ_HW_ALLOCATED) {
4845 MPASS(eq->flags & EQ_SW_ALLOCATED);
4846 free_eq_hwq(sc, NULL, eq);
4847 MPASS(!(eq->flags & EQ_HW_ALLOCATED));
4848 }
4849
4850 if (eq->flags & EQ_SW_ALLOCATED) {
4851 MPASS(!(eq->flags & EQ_HW_ALLOCATED));
4852 sglist_free(txq->gl);
4853 free(txq->sdesc, M_CXGBE);
4854 mp_ring_free(txq->r);
4855 free_eq(sc, eq);
4856 MPASS(!(eq->flags & EQ_SW_ALLOCATED));
4857 bzero(txq, sizeof(*txq));
4858 }
4859 }
4860
4861 static void
4862 add_txq_sysctls(struct vi_info *vi, struct sysctl_ctx_list *ctx,
4863 struct sysctl_oid *oid, struct sge_txq *txq)
4864 {
4865 struct adapter *sc;
4866 struct sysctl_oid_list *children;
4867
4868 if (ctx == NULL || oid == NULL)
4869 return;
4870
4871 sc = vi->adapter;
4872 children = SYSCTL_CHILDREN(oid);
4873
4874 mp_ring_sysctls(txq->r, ctx, children);
4875
4876 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tc",
4877 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, txq - sc->sge.txq,
4878 sysctl_tc, "I", "traffic class (-1 means none)");
4879
4880 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
4881 &txq->txcsum, "# of times hardware assisted with checksum");
4882 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD,
4883 &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag");
4884 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
4885 &txq->tso_wrs, "# of TSO work requests");
4886 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
4887 &txq->imm_wrs, "# of work requests with immediate data");
4888 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
4889 &txq->sgl_wrs, "# of work requests with direct SGL");
4890 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
4891 &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
4892 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD,
4893 &txq->txpkts0_wrs, "# of txpkts (type 0) work requests");
4894 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD,
4895 &txq->txpkts1_wrs, "# of txpkts (type 1) work requests");
4896 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD,
4897 &txq->txpkts0_pkts,
4898 "# of frames tx'd using type0 txpkts work requests");
4899 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD,
4900 &txq->txpkts1_pkts,
4901 "# of frames tx'd using type1 txpkts work requests");
4902 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts_flush", CTLFLAG_RD,
4903 &txq->txpkts_flush,
4904 "# of times txpkts had to be flushed out by an egress-update");
4905 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
4906 &txq->raw_wrs, "# of raw work requests (non-packets)");
4907 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_tso_wrs", CTLFLAG_RD,
4908 &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
4909 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_txcsum", CTLFLAG_RD,
4910 &txq->vxlan_txcsum,
4911 "# of times hardware assisted with inner checksums (VXLAN)");
4912
4913 #ifdef KERN_TLS
4914 if (is_ktls(sc)) {
4915 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_records",
4916 CTLFLAG_RD, &txq->kern_tls_records,
4917 "# of NIC TLS records transmitted");
4918 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_short",
4919 CTLFLAG_RD, &txq->kern_tls_short,
4920 "# of short NIC TLS records transmitted");
4921 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_partial",
4922 CTLFLAG_RD, &txq->kern_tls_partial,
4923 "# of partial NIC TLS records transmitted");
4924 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_full",
4925 CTLFLAG_RD, &txq->kern_tls_full,
4926 "# of full NIC TLS records transmitted");
4927 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_octets",
4928 CTLFLAG_RD, &txq->kern_tls_octets,
4929 "# of payload octets in transmitted NIC TLS records");
4930 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_waste",
4931 CTLFLAG_RD, &txq->kern_tls_waste,
4932 "# of octets DMAd but not transmitted in NIC TLS records");
4933 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_header",
4934 CTLFLAG_RD, &txq->kern_tls_header,
4935 "# of NIC TLS header-only packets transmitted");
4936 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin_short",
4937 CTLFLAG_RD, &txq->kern_tls_fin_short,
4938 "# of NIC TLS padded FIN packets on short TLS records");
4939 if (is_t6(sc)) {
4940 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4941 "kern_tls_options", CTLFLAG_RD,
4942 &txq->kern_tls_options,
4943 "# of NIC TLS options-only packets transmitted");
4944 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4945 "kern_tls_fin", CTLFLAG_RD, &txq->kern_tls_fin,
4946 "# of NIC TLS FIN-only packets transmitted");
4947 } else {
4948 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4949 "kern_tls_ghash_received", CTLFLAG_RD,
4950 &txq->kern_tls_ghash_received,
4951 "# of NIC TLS GHASHes received");
4952 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4953 "kern_tls_ghash_requested", CTLFLAG_RD,
4954 &txq->kern_tls_ghash_requested,
4955 "# of NIC TLS GHASHes requested");
4956 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4957 "kern_tls_lso", CTLFLAG_RD,
4958 &txq->kern_tls_lso,
4959 "# of NIC TLS records transmitted using LSO");
4960 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4961 "kern_tls_partial_ghash", CTLFLAG_RD,
4962 &txq->kern_tls_partial_ghash,
4963 "# of NIC TLS records encrypted using a partial GHASH");
4964 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4965 "kern_tls_splitmode", CTLFLAG_RD,
4966 &txq->kern_tls_splitmode,
4967 "# of NIC TLS records using SplitMode");
4968 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO,
4969 "kern_tls_trailer", CTLFLAG_RD,
4970 &txq->kern_tls_trailer,
4971 "# of NIC TLS trailer-only packets transmitted");
4972 }
4973 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_cbc",
4974 CTLFLAG_RD, &txq->kern_tls_cbc,
4975 "# of NIC TLS sessions using AES-CBC");
4976 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_gcm",
4977 CTLFLAG_RD, &txq->kern_tls_gcm,
4978 "# of NIC TLS sessions using AES-GCM");
4979 }
4980 #endif
4981 }
4982
4983 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
4984 /*
4985 * Idempotent.
4986 */
4987 static int
4988 alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx)
4989 {
4990 struct sysctl_oid *oid;
4991 struct port_info *pi = vi->pi;
4992 struct adapter *sc = vi->adapter;
4993 struct sge_eq *eq = &ofld_txq->wrq.eq;
4994 int rc, iqidx;
4995 char name[16];
4996
4997 MPASS(idx >= 0);
4998 MPASS(idx < vi->nofldtxq);
4999
5000 if (!(eq->flags & EQ_SW_ALLOCATED)) {
5001 snprintf(name, sizeof(name), "%d", idx);
5002 oid = SYSCTL_ADD_NODE(&vi->ctx,
5003 SYSCTL_CHILDREN(vi->ofld_txq_oid), OID_AUTO, name,
5004 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue");
5005
5006 snprintf(name, sizeof(name), "%s ofld_txq%d",
5007 device_get_nameunit(vi->dev), idx);
5008 if (vi->nofldrxq > 0) {
5009 iqidx = vi->first_ofld_rxq + (idx % vi->nofldrxq);
5010 init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->port_id,
5011 &sc->sge.ofld_rxq[iqidx].iq, name);
5012 } else {
5013 iqidx = vi->first_rxq + (idx % vi->nrxq);
5014 init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->port_id,
5015 &sc->sge.rxq[iqidx].iq, name);
5016 }
5017
5018 rc = alloc_wrq(sc, vi, &ofld_txq->wrq, &vi->ctx, oid);
5019 if (rc != 0) {
5020 CH_ERR(vi, "failed to allocate ofld_txq%d: %d\n", idx,
5021 rc);
5022 sysctl_remove_oid(oid, 1, 1);
5023 return (rc);
5024 }
5025 MPASS(eq->flags & EQ_SW_ALLOCATED);
5026 /* Can't fail after this point. */
5027
5028 ofld_txq->tx_iscsi_pdus = counter_u64_alloc(M_WAITOK);
5029 ofld_txq->tx_iscsi_octets = counter_u64_alloc(M_WAITOK);
5030 ofld_txq->tx_iscsi_iso_wrs = counter_u64_alloc(M_WAITOK);
5031 ofld_txq->tx_nvme_pdus = counter_u64_alloc(M_WAITOK);
5032 ofld_txq->tx_nvme_octets = counter_u64_alloc(M_WAITOK);
5033 ofld_txq->tx_nvme_iso_wrs = counter_u64_alloc(M_WAITOK);
5034 ofld_txq->tx_aio_jobs = counter_u64_alloc(M_WAITOK);
5035 ofld_txq->tx_aio_octets = counter_u64_alloc(M_WAITOK);
5036 ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK);
5037 ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK);
5038 add_ofld_txq_sysctls(&vi->ctx, oid, ofld_txq);
5039 }
5040
5041 if (!(eq->flags & EQ_HW_ALLOCATED)) {
5042 MPASS(eq->flags & EQ_SW_ALLOCATED);
5043 MPASS(ofld_txq->wrq.nwr_pending == 0);
5044 MPASS(ofld_txq->wrq.ndesc_needed == 0);
5045 rc = alloc_eq_hwq(sc, vi, eq, idx);
5046 if (rc != 0) {
5047 CH_ERR(vi, "failed to create hw ofld_txq%d: %d\n", idx,
5048 rc);
5049 return (rc);
5050 }
5051 MPASS(eq->flags & EQ_HW_ALLOCATED);
5052 }
5053
5054 return (0);
5055 }
5056
5057 /*
5058 * Idempotent.
5059 */
5060 static void
5061 free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq)
5062 {
5063 struct adapter *sc = vi->adapter;
5064 struct sge_eq *eq = &ofld_txq->wrq.eq;
5065
5066 if (eq->flags & EQ_HW_ALLOCATED) {
5067 MPASS(eq->flags & EQ_SW_ALLOCATED);
5068 free_eq_hwq(sc, NULL, eq);
5069 MPASS(!(eq->flags & EQ_HW_ALLOCATED));
5070 }
5071
5072 if (eq->flags & EQ_SW_ALLOCATED) {
5073 MPASS(!(eq->flags & EQ_HW_ALLOCATED));
5074 counter_u64_free(ofld_txq->tx_iscsi_pdus);
5075 counter_u64_free(ofld_txq->tx_iscsi_octets);
5076 counter_u64_free(ofld_txq->tx_iscsi_iso_wrs);
5077 counter_u64_free(ofld_txq->tx_nvme_pdus);
5078 counter_u64_free(ofld_txq->tx_nvme_octets);
5079 counter_u64_free(ofld_txq->tx_nvme_iso_wrs);
5080 counter_u64_free(ofld_txq->tx_aio_jobs);
5081 counter_u64_free(ofld_txq->tx_aio_octets);
5082 counter_u64_free(ofld_txq->tx_toe_tls_records);
5083 counter_u64_free(ofld_txq->tx_toe_tls_octets);
5084 free_wrq(sc, &ofld_txq->wrq);
5085 MPASS(!(eq->flags & EQ_SW_ALLOCATED));
5086 bzero(ofld_txq, sizeof(*ofld_txq));
5087 }
5088 }
5089
5090 static void
5091 add_ofld_txq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
5092 struct sge_ofld_txq *ofld_txq)
5093 {
5094 struct sysctl_oid_list *children;
5095
5096 if (ctx == NULL || oid == NULL)
5097 return;
5098
5099 children = SYSCTL_CHILDREN(oid);
5100 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_pdus",
5101 CTLFLAG_RD, &ofld_txq->tx_iscsi_pdus,
5102 "# of iSCSI PDUs transmitted");
5103 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_octets",
5104 CTLFLAG_RD, &ofld_txq->tx_iscsi_octets,
5105 "# of payload octets in transmitted iSCSI PDUs");
5106 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_iso_wrs",
5107 CTLFLAG_RD, &ofld_txq->tx_iscsi_iso_wrs,
5108 "# of iSCSI segmentation offload work requests");
5109 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_nvme_pdus",
5110 CTLFLAG_RD, &ofld_txq->tx_nvme_pdus,
5111 "# of NVMe PDUs transmitted");
5112 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_nvme_octets",
5113 CTLFLAG_RD, &ofld_txq->tx_nvme_octets,
5114 "# of payload octets in transmitted NVMe PDUs");
5115 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_nvme_iso_wrs",
5116 CTLFLAG_RD, &ofld_txq->tx_nvme_iso_wrs,
5117 "# of NVMe segmentation offload work requests");
5118 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_aio_jobs",
5119 CTLFLAG_RD, &ofld_txq->tx_aio_jobs,
5120 "# of zero-copy aio_write(2) jobs transmitted");
5121 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_aio_octets",
5122 CTLFLAG_RD, &ofld_txq->tx_aio_octets,
5123 "# of payload octets in transmitted zero-copy aio_write(2) jobs");
5124 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_records",
5125 CTLFLAG_RD, &ofld_txq->tx_toe_tls_records,
5126 "# of TOE TLS records transmitted");
5127 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_octets",
5128 CTLFLAG_RD, &ofld_txq->tx_toe_tls_octets,
5129 "# of payload octets in transmitted TOE TLS records");
5130 }
5131 #endif
5132
5133 static void
5134 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
5135 {
5136 bus_addr_t *ba = arg;
5137
5138 KASSERT(nseg == 1,
5139 ("%s meant for single segment mappings only.", __func__));
5140
5141 *ba = error ? 0 : segs->ds_addr;
5142 }
5143
5144 static inline void
5145 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
5146 {
5147 uint32_t n, v;
5148
5149 n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx);
5150 MPASS(n > 0);
5151
5152 wmb();
5153 v = fl->dbval | V_PIDX(n);
5154 if (fl->udb)
5155 *fl->udb = htole32(v);
5156 else
5157 t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
5158 IDXINCR(fl->dbidx, n, fl->sidx);
5159 }
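/*
 * Example of the doorbell bookkeeping in ring_fl_db() with illustrative
 * numbers: for a freelist with sidx = 128 hardware descriptors, dbidx = 120
 * and a software pidx of 16 (pidx counts buffers, 8 per descriptor, so
 * pidx >> 3 = 2), IDXDIFF(2, 120, 128) wraps around the ring and yields
 * n = 10 newly filled descriptors to advertise; IDXINCR then advances
 * dbidx to 2.
 */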
5160
5161 /*
5162 * Fills up the freelist by allocating up to 'n' buffers. Buffers that are
5163 * recycled do not count towards this allocation budget.
5164 *
5165 * Returns non-zero to indicate that this freelist should be added to the list
5166 * of starving freelists.
5167 */
5168 static int
5169 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
5170 {
5171 __be64 *d;
5172 struct fl_sdesc *sd;
5173 uintptr_t pa;
5174 caddr_t cl;
5175 struct rx_buf_info *rxb;
5176 struct cluster_metadata *clm;
5177 uint16_t max_pidx, zidx = fl->zidx;
5178 uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */
5179
5180 FL_LOCK_ASSERT_OWNED(fl);
5181
5182 /*
5183 * We always stop at the beginning of the hardware descriptor that's just
5184 * before the one with the hw cidx. This is to avoid hw pidx = hw cidx,
5185 * which would mean an empty freelist to the chip.
5186 */
5187 max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
5188 if (fl->pidx == max_pidx * 8)
5189 return (0);
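	/*
	 * Example of the stop rule above (illustrative numbers): with
	 * sidx = 128 and a snapshot hw_cidx of 5, max_pidx is 4, so the
	 * software pidx (8 buffer slots per hardware descriptor) is never
	 * advanced past 4 * 8 = 32 even if the budget 'n' hasn't been
	 * exhausted.
	 */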
5190
5191 d = &fl->desc[fl->pidx];
5192 sd = &fl->sdesc[fl->pidx];
5193 rxb = &sc->sge.rx_buf_info[zidx];
5194
5195 while (n > 0) {
5196
5197 if (sd->cl != NULL) {
5198
5199 if (sd->nmbuf == 0) {
5200 /*
5201 * Fast recycle without involving any atomics on
5202 * the cluster's metadata (if the cluster has
5203 * metadata). This happens when all frames
5204 * received in the cluster were small enough to
5205 * fit within a single mbuf each.
5206 */
5207 fl->cl_fast_recycled++;
5208 goto recycled;
5209 }
5210
5211 /*
5212 * Cluster is guaranteed to have metadata. Clusters
5213 * without metadata always take the fast recycle path
5214 * when they're recycled.
5215 */
5216 clm = cl_metadata(sd);
5217 MPASS(clm != NULL);
5218
5219 if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
5220 fl->cl_recycled++;
5221 counter_u64_add(extfree_rels, 1);
5222 goto recycled;
5223 }
5224 sd->cl = NULL; /* gave up my reference */
5225 }
5226 MPASS(sd->cl == NULL);
5227 cl = uma_zalloc(rxb->zone, M_NOWAIT);
5228 if (__predict_false(cl == NULL)) {
5229 if (zidx != fl->safe_zidx) {
5230 zidx = fl->safe_zidx;
5231 rxb = &sc->sge.rx_buf_info[zidx];
5232 cl = uma_zalloc(rxb->zone, M_NOWAIT);
5233 }
5234 if (cl == NULL)
5235 break;
5236 }
5237 fl->cl_allocated++;
5238 n--;
5239
5240 pa = pmap_kextract((vm_offset_t)cl);
5241 sd->cl = cl;
5242 sd->zidx = zidx;
5243
5244 if (fl->flags & FL_BUF_PACKING) {
5245 *d = htobe64(pa | rxb->hwidx2);
5246 sd->moff = rxb->size2;
5247 } else {
5248 *d = htobe64(pa | rxb->hwidx1);
5249 sd->moff = 0;
5250 }
5251 recycled:
5252 sd->nmbuf = 0;
5253 d++;
5254 sd++;
5255 if (__predict_false((++fl->pidx & 7) == 0)) {
5256 uint16_t pidx = fl->pidx >> 3;
5257
5258 if (__predict_false(pidx == fl->sidx)) {
5259 fl->pidx = 0;
5260 pidx = 0;
5261 sd = fl->sdesc;
5262 d = fl->desc;
5263 }
5264 if (n < 8 || pidx == max_pidx)
5265 break;
5266
5267 if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
5268 ring_fl_db(sc, fl);
5269 }
5270 }
5271
5272 if ((fl->pidx >> 3) != fl->dbidx)
5273 ring_fl_db(sc, fl);
5274
5275 return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
5276 }
5277
5278 /*
5279 * Attempt to refill all starving freelists.
5280 */
5281 static void
5282 refill_sfl(void *arg)
5283 {
5284 struct adapter *sc = arg;
5285 struct sge_fl *fl, *fl_temp;
5286
5287 mtx_assert(&sc->sfl_lock, MA_OWNED);
5288 TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
5289 FL_LOCK(fl);
5290 refill_fl(sc, fl, 64);
5291 if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
5292 TAILQ_REMOVE(&sc->sfl, fl, link);
5293 fl->flags &= ~FL_STARVING;
5294 }
5295 FL_UNLOCK(fl);
5296 }
5297
5298 if (!TAILQ_EMPTY(&sc->sfl))
5299 callout_schedule(&sc->sfl_callout, hz / 5);
5300 }
5301
5302 /*
5303 * Release the driver's reference on all buffers in the given freelist. Buffers
5304 * with kernel references cannot be freed and will prevent the driver from being
5305 * unloaded safely.
5306 */
5307 void
5308 free_fl_buffers(struct adapter *sc, struct sge_fl *fl)
5309 {
5310 struct fl_sdesc *sd;
5311 struct cluster_metadata *clm;
5312 int i;
5313
5314 sd = fl->sdesc;
5315 for (i = 0; i < fl->sidx * 8; i++, sd++) {
5316 if (sd->cl == NULL)
5317 continue;
5318
5319 if (sd->nmbuf == 0)
5320 uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl);
5321 else if (fl->flags & FL_BUF_PACKING) {
5322 clm = cl_metadata(sd);
5323 if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
5324 uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone,
5325 sd->cl);
5326 counter_u64_add(extfree_rels, 1);
5327 }
5328 }
5329 sd->cl = NULL;
5330 }
5331
5332 if (fl->flags & FL_BUF_RESUME) {
5333 m_freem(fl->m0);
5334 fl->flags &= ~FL_BUF_RESUME;
5335 }
5336 }
5337
5338 static inline void
5339 get_pkt_gl(struct mbuf *m, struct sglist *gl)
5340 {
5341 int rc;
5342
5343 M_ASSERTPKTHDR(m);
5344
5345 sglist_reset(gl);
5346 rc = sglist_append_mbuf(gl, m);
5347 if (__predict_false(rc != 0)) {
5348 panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
5349 "with %d.", __func__, m, mbuf_nsegs(m), rc);
5350 }
5351
5352 KASSERT(gl->sg_nseg == mbuf_nsegs(m),
5353 ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
5354 mbuf_nsegs(m), gl->sg_nseg));
5355 #if 0 /* vm_wr not readily available here. */
5356 KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr),
5357 ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
5358 gl->sg_nseg, max_nsegs_allowed(m, vm_wr)));
5359 #endif
5360 }
5361
5362 /*
5363 * len16 for a txpkt WR with a GL. Includes the firmware work request header.
5364 */
5365 static inline u_int
5366 txpkt_len16(u_int nsegs, const u_int extra)
5367 {
5368 u_int n;
5369
5370 MPASS(nsegs > 0);
5371
5372 nsegs--; /* first segment is part of ulptx_sgl */
5373 n = extra + sizeof(struct fw_eth_tx_pkt_wr) +
5374 sizeof(struct cpl_tx_pkt_core) +
5375 sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
5376
5377 return (howmany(n, 16));
5378 }
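
/*
 * Illustrative example (not part of the original source; it assumes the usual
 * 16-byte sizes of the WR header, the CPL, and struct ulptx_sgl): a 4-segment
 * packet with no extra LSO CPL leaves 3 segments for the trailing part of the
 * SGL, so the WR needs
 *	16 + 16 + 16 + 8 * ((3 * 3) / 2 + (3 & 1)) = 88 bytes,
 * which howmany(88, 16) rounds up to 6 len16 units.  The same
 * 3-flits-per-2-segments formula appears in txpkt_vm_len16() and
 * txpkts0_len16() below.
 */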
5379
5380 /*
5381 * len16 for a txpkt_vm WR with a GL. Includes the firmware work
5382 * request header.
5383 */
5384 static inline u_int
5385 txpkt_vm_len16(u_int nsegs, const u_int extra)
5386 {
5387 u_int n;
5388
5389 MPASS(nsegs > 0);
5390
5391 nsegs--; /* first segment is part of ulptx_sgl */
5392 n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
5393 sizeof(struct cpl_tx_pkt_core) +
5394 sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
5395
5396 return (howmany(n, 16));
5397 }
5398
5399 static inline void
5400 calculate_mbuf_len16(struct mbuf *m, bool vm_wr)
5401 {
5402 const int lso = sizeof(struct cpl_tx_pkt_lso_core);
5403 const int tnl_lso = sizeof(struct cpl_tx_tnl_lso);
5404
5405 if (vm_wr) {
5406 if (needs_tso(m))
5407 set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso));
5408 else
5409 set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0));
5410 return;
5411 }
5412
5413 if (needs_tso(m)) {
5414 if (needs_vxlan_tso(m))
5415 set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso));
5416 else
5417 set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso));
5418 } else
5419 set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0));
5420 }
5421
5422 /*
5423 * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work
5424 * request header.
5425 */
5426 static inline u_int
5427 txpkts0_len16(u_int nsegs)
5428 {
5429 u_int n;
5430
5431 MPASS(nsegs > 0);
5432
5433 nsegs--; /* first segment is part of ulptx_sgl */
5434 n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
5435 sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
5436 8 * ((3 * nsegs) / 2 + (nsegs & 1));
5437
5438 return (howmany(n, 16));
5439 }
5440
5441 /*
5442 * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work
5443 * request header.
5444 */
5445 static inline u_int
5446 txpkts1_len16(void)
5447 {
5448 u_int n;
5449
5450 n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
5451
5452 return (howmany(n, 16));
5453 }
5454
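/*
 * Maximum number of bytes of immediate packet data that fit in 'ndesc'
 * hardware descriptors alongside the txpkt WR header and CPL.
 */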
5455 static inline u_int
5456 imm_payload(u_int ndesc)
5457 {
5458 u_int n;
5459
5460 n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
5461 sizeof(struct cpl_tx_pkt_core);
5462
5463 return (n);
5464 }
5465
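/*
 * Returns the ctrl1 bits of the TX_PKT CPL that request the checksum offloads
 * this mbuf needs, or the bits that disable both IP and L4 checksum offload if
 * no hardware assistance was requested.
 */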
5466 static inline uint64_t
5467 csum_to_ctrl(struct adapter *sc, struct mbuf *m)
5468 {
5469 uint64_t ctrl;
5470 int csum_type, l2hlen, l3hlen;
5471 int x, y;
5472 static const int csum_types[3][2] = {
5473 {TX_CSUM_TCPIP, TX_CSUM_TCPIP6},
5474 {TX_CSUM_UDPIP, TX_CSUM_UDPIP6},
5475 {TX_CSUM_IP, 0}
5476 };
5477
5478 M_ASSERTPKTHDR(m);
5479
5480 if (!needs_hwcsum(m))
5481 return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
5482
5483 MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN);
5484 MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip));
5485
5486 if (needs_vxlan_csum(m)) {
5487 MPASS(m->m_pkthdr.l4hlen > 0);
5488 MPASS(m->m_pkthdr.l5hlen > 0);
5489 MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN);
5490 MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip));
5491
5492 l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
5493 m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen +
5494 m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN;
5495 l3hlen = m->m_pkthdr.inner_l3hlen;
5496 } else {
5497 l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN;
5498 l3hlen = m->m_pkthdr.l3hlen;
5499 }
5500
5501 ctrl = 0;
5502 if (!needs_l3_csum(m))
5503 ctrl |= F_TXPKT_IPCSUM_DIS;
5504
5505 if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP |
5506 CSUM_IP6_TCP | CSUM_INNER_IP6_TCP))
5507 x = 0; /* TCP */
5508 else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP |
5509 CSUM_IP6_UDP | CSUM_INNER_IP6_UDP))
5510 x = 1; /* UDP */
5511 else
5512 x = 2;
5513
5514 if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP |
5515 CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP))
5516 y = 0; /* IPv4 */
5517 else {
5518 MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP |
5519 CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP));
5520 y = 1; /* IPv6 */
5521 }
5522 /*
5523 * needs_hwcsum returned true earlier so there must be some kind of
5524 * checksum to calculate.
5525 */
5526 csum_type = csum_types[x][y];
5527 MPASS(csum_type != 0);
5528 if (csum_type == TX_CSUM_IP)
5529 ctrl |= F_TXPKT_L4CSUM_DIS;
5530 ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen);
5531 if (chip_id(sc) <= CHELSIO_T5)
5532 ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen);
5533 else
5534 ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen);
5535
5536 return (ctrl);
5537 }
5538
5539 static inline void *
5540 write_lso_cpl(void *cpl, struct mbuf *m0)
5541 {
5542 struct cpl_tx_pkt_lso_core *lso;
5543 uint32_t ctrl;
5544
5545 KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
5546 m0->m_pkthdr.l4hlen > 0,
5547 ("%s: mbuf %p needs TSO but missing header lengths",
5548 __func__, m0));
5549
5550 ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
5551 F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
5552 V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
5553 V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
5554 V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
5555 if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
5556 ctrl |= F_LSO_IPV6;
5557
5558 lso = cpl;
5559 lso->lso_ctrl = htobe32(ctrl);
5560 lso->ipid_ofst = htobe16(0);
5561 lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
5562 lso->seqno_offset = htobe32(0);
5563 lso->len = htobe32(m0->m_pkthdr.len);
5564
5565 return (lso + 1);
5566 }
5567
5568 static void *
5569 write_tnl_lso_cpl(void *cpl, struct mbuf *m0)
5570 {
5571 struct cpl_tx_tnl_lso *tnl_lso = cpl;
5572 uint32_t ctrl;
5573
5574 KASSERT(m0->m_pkthdr.inner_l2hlen > 0 &&
5575 m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 &&
5576 m0->m_pkthdr.inner_l5hlen > 0,
5577 ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths",
5578 __func__, m0));
5579 KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
5580 m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0,
5581 ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths",
5582 __func__, m0));
5583
5584 /* Outer headers. */
5585 ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) |
5586 F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST |
5587 V_CPL_TX_TNL_LSO_ETHHDRLENOUT(
5588 (m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
5589 V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) |
5590 F_CPL_TX_TNL_LSO_IPLENSETOUT;
5591 if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
5592 ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT;
5593 else {
5594 ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT |
5595 F_CPL_TX_TNL_LSO_IPIDINCOUT;
5596 }
5597 tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl);
5598 tnl_lso->IpIdOffsetOut = 0;
5599 tnl_lso->UdpLenSetOut_to_TnlHdrLen =
5600 htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT |
5601 F_CPL_TX_TNL_LSO_UDPLENSETOUT |
5602 V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen +
5603 m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen +
5604 m0->m_pkthdr.l5hlen) |
5605 V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN));
5606 tnl_lso->ipsecen_to_rocev2 = 0;
5607 tnl_lso->roce_eth = 0;
5608
5609 /* Inner headers. */
5610 ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN(
5611 (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) |
5612 V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) |
5613 V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2);
5614 if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr))
5615 ctrl |= F_CPL_TX_TNL_LSO_IPV6;
5616 tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl);
5617 tnl_lso->IpIdOffset = 0;
5618 tnl_lso->IpIdSplit_to_Mss =
5619 htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz));
5620 tnl_lso->TCPSeqOffset = 0;
5621 tnl_lso->EthLenOffset_Size =
5622 htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len));
5623
5624 return (tnl_lso + 1);
5625 }
5626
5627 #define VM_TX_L2HDR_LEN 16 /* ethmacdst to vlantci */
5628
5629 /*
5630 * Write a VM txpkt WR for this packet to the hardware descriptors, update the
5631 * software descriptor, and advance the pidx. It is guaranteed that enough
5632 * descriptors are available.
5633 *
5634 * The return value is the # of hardware descriptors used.
5635 */
5636 static u_int
5637 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0)
5638 {
5639 struct sge_eq *eq;
5640 struct fw_eth_tx_pkt_vm_wr *wr;
5641 struct tx_sdesc *txsd;
5642 struct cpl_tx_pkt_core *cpl;
5643 uint32_t ctrl; /* used in many unrelated places */
5644 uint64_t ctrl1;
5645 int len16, ndesc, pktlen;
5646 caddr_t dst;
5647
5648 TXQ_LOCK_ASSERT_OWNED(txq);
5649 M_ASSERTPKTHDR(m0);
5650
5651 len16 = mbuf_len16(m0);
5652 pktlen = m0->m_pkthdr.len;
5653 ctrl = sizeof(struct cpl_tx_pkt_core);
5654 if (needs_tso(m0))
5655 ctrl += sizeof(struct cpl_tx_pkt_lso_core);
5656 ndesc = tx_len16_to_desc(len16);
5657
5658 /* Firmware work request header */
5659 eq = &txq->eq;
5660 wr = (void *)&eq->desc[eq->pidx];
5661 wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
5662 V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
5663
5664 ctrl = V_FW_WR_LEN16(len16);
5665 wr->equiq_to_len16 = htobe32(ctrl);
5666 wr->r3[0] = 0;
5667 wr->r3[1] = 0;
5668
5669 /*
5670 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
5671 * vlantci is ignored unless the ethtype is 0x8100, so it's
5672 * simpler to always copy it rather than making it
5673 * conditional. Also, it seems that we do not have to set
5674 * vlantci or fake the ethtype when doing VLAN tag insertion.
5675 */
5676 m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst);
5677
5678 if (needs_tso(m0)) {
5679 cpl = write_lso_cpl(wr + 1, m0);
5680 txq->tso_wrs++;
5681 } else
5682 cpl = (void *)(wr + 1);
5683
5684 /* Checksum offload */
5685 ctrl1 = csum_to_ctrl(sc, m0);
5686 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
5687 txq->txcsum++; /* some hardware assistance provided */
5688
5689 /* VLAN tag insertion */
5690 if (needs_vlan_insertion(m0)) {
5691 ctrl1 |= F_TXPKT_VLAN_VLD |
5692 V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
5693 txq->vlan_insertion++;
5694 } else if (sc->vlan_id)
5695 ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(sc->vlan_id);
5696
5697 /* CPL header */
5698 cpl->ctrl0 = txq->cpl_ctrl0;
5699 cpl->pack = 0;
5700 cpl->len = htobe16(pktlen);
5701 cpl->ctrl1 = htobe64(ctrl1);
5702
5703 /* SGL */
5704 dst = (void *)(cpl + 1);
5705
5706 /*
5707 * A packet using TSO will use up an entire descriptor for the
5708 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
5709 * If this descriptor is the last descriptor in the ring, wrap
5710 * around to the front of the ring explicitly for the start of
5711 * the sgl.
5712 */
5713 if (dst == (void *)&eq->desc[eq->sidx]) {
5714 dst = (void *)&eq->desc[0];
5715 write_gl_to_txd(txq, m0, &dst, 0);
5716 } else
5717 write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
5718 txq->sgl_wrs++;
5719 txq->txpkt_wrs++;
5720
5721 txsd = &txq->sdesc[eq->pidx];
5722 txsd->m = m0;
5723 txsd->desc_used = ndesc;
5724
5725 return (ndesc);
5726 }
5727
5728 /*
5729 * Write a raw WR to the hardware descriptors, update the software
5730 * descriptor, and advance the pidx. It is guaranteed that enough
5731 * descriptors are available.
5732 *
5733 * The return value is the # of hardware descriptors used.
5734 */
5735 static u_int
5736 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
5737 {
5738 struct sge_eq *eq = &txq->eq;
5739 struct tx_sdesc *txsd;
5740 struct mbuf *m;
5741 caddr_t dst;
5742 int len16, ndesc;
5743
5744 len16 = mbuf_len16(m0);
5745 ndesc = tx_len16_to_desc(len16);
5746 MPASS(ndesc <= available);
5747
5748 dst = wr;
5749 for (m = m0; m != NULL; m = m->m_next)
5750 copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
5751
5752 txq->raw_wrs++;
5753
5754 txsd = &txq->sdesc[eq->pidx];
5755 txsd->m = m0;
5756 txsd->desc_used = ndesc;
5757
5758 return (ndesc);
5759 }
5760
5761 /*
5762 * Write a txpkt WR for this packet to the hardware descriptors, update the
5763 * software descriptor, and advance the pidx. It is guaranteed that enough
5764 * descriptors are available.
5765 *
5766 * The return value is the # of hardware descriptors used.
5767 */
5768 static u_int
5769 write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0,
5770 u_int available)
5771 {
5772 struct sge_eq *eq;
5773 struct fw_eth_tx_pkt_wr *wr;
5774 struct tx_sdesc *txsd;
5775 struct cpl_tx_pkt_core *cpl;
5776 uint32_t ctrl; /* used in many unrelated places */
5777 uint64_t ctrl1;
5778 int len16, ndesc, pktlen, nsegs;
5779 caddr_t dst;
5780
5781 TXQ_LOCK_ASSERT_OWNED(txq);
5782 M_ASSERTPKTHDR(m0);
5783
5784 len16 = mbuf_len16(m0);
5785 nsegs = mbuf_nsegs(m0);
5786 pktlen = m0->m_pkthdr.len;
5787 ctrl = sizeof(struct cpl_tx_pkt_core);
5788 if (needs_tso(m0)) {
5789 if (needs_vxlan_tso(m0))
5790 ctrl += sizeof(struct cpl_tx_tnl_lso);
5791 else
5792 ctrl += sizeof(struct cpl_tx_pkt_lso_core);
5793 } else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
5794 available >= 2) {
5795 /* Immediate data. Recalculate len16 and set nsegs to 0. */
5796 ctrl += pktlen;
5797 len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
5798 sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
5799 nsegs = 0;
5800 }
5801 ndesc = tx_len16_to_desc(len16);
5802 MPASS(ndesc <= available);
5803
5804 /* Firmware work request header */
5805 eq = &txq->eq;
5806 wr = (void *)&eq->desc[eq->pidx];
5807 wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
5808 V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
5809
5810 ctrl = V_FW_WR_LEN16(len16);
5811 wr->equiq_to_len16 = htobe32(ctrl);
5812 wr->r3 = 0;
5813
5814 if (needs_tso(m0)) {
5815 if (needs_vxlan_tso(m0)) {
5816 cpl = write_tnl_lso_cpl(wr + 1, m0);
5817 txq->vxlan_tso_wrs++;
5818 } else {
5819 cpl = write_lso_cpl(wr + 1, m0);
5820 txq->tso_wrs++;
5821 }
5822 } else
5823 cpl = (void *)(wr + 1);
5824
5825 /* Checksum offload */
5826 ctrl1 = csum_to_ctrl(sc, m0);
5827 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
5828 /* some hardware assistance provided */
5829 if (needs_vxlan_csum(m0))
5830 txq->vxlan_txcsum++;
5831 else
5832 txq->txcsum++;
5833 }
5834
5835 /* VLAN tag insertion */
5836 if (needs_vlan_insertion(m0)) {
5837 ctrl1 |= F_TXPKT_VLAN_VLD |
5838 V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
5839 txq->vlan_insertion++;
5840 }
5841
5842 /* CPL header */
5843 cpl->ctrl0 = txq->cpl_ctrl0;
5844 cpl->pack = 0;
5845 cpl->len = htobe16(pktlen);
5846 cpl->ctrl1 = htobe64(ctrl1);
5847
5848 /* SGL */
5849 dst = (void *)(cpl + 1);
5850 if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx]))
5851 dst = (caddr_t)&eq->desc[0];
5852 if (nsegs > 0) {
5853
5854 write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
5855 txq->sgl_wrs++;
5856 } else {
5857 struct mbuf *m;
5858
5859 for (m = m0; m != NULL; m = m->m_next) {
5860 copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
5861 #ifdef INVARIANTS
5862 pktlen -= m->m_len;
5863 #endif
5864 }
5865 #ifdef INVARIANTS
5866 KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
5867 #endif
5868 txq->imm_wrs++;
5869 }
5870
5871 txq->txpkt_wrs++;
5872
5873 txsd = &txq->sdesc[eq->pidx];
5874 txsd->m = m0;
5875 txsd->desc_used = ndesc;
5876
5877 return (ndesc);
5878 }
5879
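/*
 * Returns true if the Ethernet header of 'm' differs from the one saved in the
 * txpkts work request under construction, in which case the mbuf cannot be
 * coalesced into it.
 */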
5880 static inline bool
5881 cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
5882 {
5883 int len;
5884
5885 MPASS(txp->npkt > 0);
5886 MPASS(m->m_len >= VM_TX_L2HDR_LEN);
5887
5888 if (txp->ethtype == be16toh(ETHERTYPE_VLAN))
5889 len = VM_TX_L2HDR_LEN;
5890 else
5891 len = sizeof(struct ether_header);
5892
5893 return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0);
5894 }
5895
5896 static inline void
5897 save_l2hdr(struct txpkts *txp, struct mbuf *m)
5898 {
5899 MPASS(m->m_len >= VM_TX_L2HDR_LEN);
5900
5901 memcpy(&txp->ethmacdst[0], mtod(m, const void *), VM_TX_L2HDR_LEN);
5902 }
5903
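/*
 * Try to add 'm' to the coalesced txpkts work request being built (VF variant;
 * add_to_txpkts_pf below follows the same contract).  Returns 0 if the mbuf
 * was added, EAGAIN if the pending WR must be written out before this mbuf is
 * retried, or EINVAL if this mbuf cannot be coalesced and should be handled on
 * its own.  *send is set when the pending WR should be sent now.
 */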
5904 static int
5905 add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
5906 int avail, bool *send)
5907 {
5908 struct txpkts *txp = &txq->txp;
5909
5910 /* Cannot have TSO and coalesce at the same time. */
5911 if (cannot_use_txpkts(m)) {
5912 cannot_coalesce:
5913 *send = txp->npkt > 0;
5914 return (EINVAL);
5915 }
5916
5917 /* VF allows coalescing of type 1 (1 GL) only */
5918 if (mbuf_nsegs(m) > 1)
5919 goto cannot_coalesce;
5920
5921 *send = false;
5922 if (txp->npkt > 0) {
5923 MPASS(tx_len16_to_desc(txp->len16) <= avail);
5924 MPASS(txp->npkt < txp->max_npkt);
5925 MPASS(txp->wr_type == 1); /* VF supports type 1 only */
5926
5927 if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) {
5928 retry_after_send:
5929 *send = true;
5930 return (EAGAIN);
5931 }
5932 if (m->m_pkthdr.len + txp->plen > 65535)
5933 goto retry_after_send;
5934 if (cmp_l2hdr(txp, m))
5935 goto retry_after_send;
5936
5937 txp->len16 += txpkts1_len16();
5938 txp->plen += m->m_pkthdr.len;
5939 txp->mb[txp->npkt++] = m;
5940 if (txp->npkt == txp->max_npkt)
5941 *send = true;
5942 } else {
5943 txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) +
5944 txpkts1_len16();
5945 if (tx_len16_to_desc(txp->len16) > avail)
5946 goto cannot_coalesce;
5947 txp->npkt = 1;
5948 txp->wr_type = 1;
5949 txp->plen = m->m_pkthdr.len;
5950 txp->mb[0] = m;
5951 save_l2hdr(txp, m);
5952 }
5953 return (0);
5954 }
5955
5956 static int
5957 add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
5958 int avail, bool *send)
5959 {
5960 struct txpkts *txp = &txq->txp;
5961 int nsegs;
5962
5963 MPASS(!(sc->flags & IS_VF));
5964
5965 /* Cannot have TSO and coalesce at the same time. */
5966 if (cannot_use_txpkts(m)) {
5967 cannot_coalesce:
5968 *send = txp->npkt > 0;
5969 return (EINVAL);
5970 }
5971
5972 *send = false;
5973 nsegs = mbuf_nsegs(m);
5974 if (txp->npkt == 0) {
5975 if (m->m_pkthdr.len > 65535)
5976 goto cannot_coalesce;
5977 if (nsegs > 1) {
5978 txp->wr_type = 0;
5979 txp->len16 =
5980 howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
5981 txpkts0_len16(nsegs);
5982 } else {
5983 txp->wr_type = 1;
5984 txp->len16 =
5985 howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
5986 txpkts1_len16();
5987 }
5988 if (tx_len16_to_desc(txp->len16) > avail)
5989 goto cannot_coalesce;
5990 txp->npkt = 1;
5991 txp->plen = m->m_pkthdr.len;
5992 txp->mb[0] = m;
5993 } else {
5994 MPASS(tx_len16_to_desc(txp->len16) <= avail);
5995 MPASS(txp->npkt < txp->max_npkt);
5996
5997 if (m->m_pkthdr.len + txp->plen > 65535) {
5998 retry_after_send:
5999 *send = true;
6000 return (EAGAIN);
6001 }
6002
6003 MPASS(txp->wr_type == 0 || txp->wr_type == 1);
6004 if (txp->wr_type == 0) {
6005 if (tx_len16_to_desc(txp->len16 +
6006 txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC))
6007 goto retry_after_send;
6008 txp->len16 += txpkts0_len16(nsegs);
6009 } else {
6010 if (nsegs != 1)
6011 goto retry_after_send;
6012 if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) >
6013 avail)
6014 goto retry_after_send;
6015 txp->len16 += txpkts1_len16();
6016 }
6017
6018 txp->plen += m->m_pkthdr.len;
6019 txp->mb[txp->npkt++] = m;
6020 if (txp->npkt == txp->max_npkt)
6021 *send = true;
6022 }
6023 return (0);
6024 }
6025
6026 /*
6027 * Write a txpkts WR for the packets in txp to the hardware descriptors, update
6028 * the software descriptor, and advance the pidx. It is guaranteed that enough
6029 * descriptors are available.
6030 *
6031 * The return value is the # of hardware descriptors used.
6032 */
6033 static u_int
6034 write_txpkts_wr(struct adapter *sc, struct sge_txq *txq)
6035 {
6036 const struct txpkts *txp = &txq->txp;
6037 struct sge_eq *eq = &txq->eq;
6038 struct fw_eth_tx_pkts_wr *wr;
6039 struct tx_sdesc *txsd;
6040 struct cpl_tx_pkt_core *cpl;
6041 uint64_t ctrl1;
6042 int ndesc, i, checkwrap;
6043 struct mbuf *m, *last;
6044 void *flitp;
6045
6046 TXQ_LOCK_ASSERT_OWNED(txq);
6047 MPASS(txp->npkt > 0);
6048 MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
6049
6050 wr = (void *)&eq->desc[eq->pidx];
6051 wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
6052 wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
6053 wr->plen = htobe16(txp->plen);
6054 wr->npkt = txp->npkt;
6055 wr->r3 = 0;
6056 wr->type = txp->wr_type;
6057 flitp = wr + 1;
6058
6059 /*
6060 * At this point we are 16B into a hardware descriptor. If checkwrap is
6061 * set then we know the WR is going to wrap around somewhere. We'll
6062 * check for that at appropriate points.
6063 */
6064 ndesc = tx_len16_to_desc(txp->len16);
6065 last = NULL;
6066 checkwrap = eq->sidx - ndesc < eq->pidx;
6067 for (i = 0; i < txp->npkt; i++) {
6068 m = txp->mb[i];
6069 if (txp->wr_type == 0) {
6070 struct ulp_txpkt *ulpmc;
6071 struct ulptx_idata *ulpsc;
6072
6073 /* ULP master command */
6074 ulpmc = flitp;
6075 ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
6076 V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
6077 ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m)));
6078
6079 /* ULP subcommand */
6080 ulpsc = (void *)(ulpmc + 1);
6081 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
6082 F_ULP_TX_SC_MORE);
6083 ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
6084
6085 cpl = (void *)(ulpsc + 1);
6086 if (checkwrap &&
6087 (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
6088 cpl = (void *)&eq->desc[0];
6089 } else {
6090 cpl = flitp;
6091 }
6092
6093 /* Checksum offload */
6094 ctrl1 = csum_to_ctrl(sc, m);
6095 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
6096 /* some hardware assistance provided */
6097 if (needs_vxlan_csum(m))
6098 txq->vxlan_txcsum++;
6099 else
6100 txq->txcsum++;
6101 }
6102
6103 /* VLAN tag insertion */
6104 if (needs_vlan_insertion(m)) {
6105 ctrl1 |= F_TXPKT_VLAN_VLD |
6106 V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
6107 txq->vlan_insertion++;
6108 }
6109
6110 /* CPL header */
6111 cpl->ctrl0 = txq->cpl_ctrl0;
6112 cpl->pack = 0;
6113 cpl->len = htobe16(m->m_pkthdr.len);
6114 cpl->ctrl1 = htobe64(ctrl1);
6115
6116 flitp = cpl + 1;
6117 if (checkwrap &&
6118 (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
6119 flitp = (void *)&eq->desc[0];
6120
6121 write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
6122
6123 if (last != NULL)
6124 last->m_nextpkt = m;
6125 last = m;
6126 }
6127
6128 txq->sgl_wrs++;
6129 if (txp->wr_type == 0) {
6130 txq->txpkts0_pkts += txp->npkt;
6131 txq->txpkts0_wrs++;
6132 } else {
6133 txq->txpkts1_pkts += txp->npkt;
6134 txq->txpkts1_wrs++;
6135 }
6136
6137 txsd = &txq->sdesc[eq->pidx];
6138 txsd->m = txp->mb[0];
6139 txsd->desc_used = ndesc;
6140
6141 return (ndesc);
6142 }
6143
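/*
 * Like write_txpkts_wr, but for the VM/VF flavor of the work request: type 1
 * coalescing only, with the saved Ethernet header copied into the WR itself.
 */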
6144 static u_int
6145 write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq)
6146 {
6147 const struct txpkts *txp = &txq->txp;
6148 struct sge_eq *eq = &txq->eq;
6149 struct fw_eth_tx_pkts_vm_wr *wr;
6150 struct tx_sdesc *txsd;
6151 struct cpl_tx_pkt_core *cpl;
6152 uint64_t ctrl1;
6153 int ndesc, i;
6154 struct mbuf *m, *last;
6155 void *flitp;
6156
6157 TXQ_LOCK_ASSERT_OWNED(txq);
6158 MPASS(txp->npkt > 0);
6159 MPASS(txp->wr_type == 1); /* VF supports type 1 only */
6160 MPASS(txp->mb[0] != NULL);
6161 MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
6162
6163 wr = (void *)&eq->desc[eq->pidx];
6164 wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR));
6165 wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
6166 wr->r3 = 0;
6167 wr->plen = htobe16(txp->plen);
6168 wr->npkt = txp->npkt;
6169 wr->r4 = 0;
6170 memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16);
6171 flitp = wr + 1;
6172
6173 /*
6174 * At this point we are 32B into a hardware descriptor. Each mbuf in
6175 * the WR will take 32B so we check for the end of the descriptor ring
6176 * before writing odd mbufs (mb[1], 3, 5, ..)
6177 */
6178 ndesc = tx_len16_to_desc(txp->len16);
6179 last = NULL;
6180 for (i = 0; i < txp->npkt; i++) {
6181 m = txp->mb[i];
6182 if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
6183 flitp = &eq->desc[0];
6184 cpl = flitp;
6185
6186 /* Checksum offload */
6187 ctrl1 = csum_to_ctrl(sc, m);
6188 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
6189 txq->txcsum++; /* some hardware assistance provided */
6190
6191 /* VLAN tag insertion */
6192 if (needs_vlan_insertion(m)) {
6193 ctrl1 |= F_TXPKT_VLAN_VLD |
6194 V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
6195 txq->vlan_insertion++;
6196 } else if (sc->vlan_id)
6197 ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(sc->vlan_id);
6198
6199 /* CPL header */
6200 cpl->ctrl0 = txq->cpl_ctrl0;
6201 cpl->pack = 0;
6202 cpl->len = htobe16(m->m_pkthdr.len);
6203 cpl->ctrl1 = htobe64(ctrl1);
6204
6205 flitp = cpl + 1;
6206 MPASS(mbuf_nsegs(m) == 1);
6207 write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0);
6208
6209 if (last != NULL)
6210 last->m_nextpkt = m;
6211 last = m;
6212 }
6213
6214 txq->sgl_wrs++;
6215 txq->txpkts1_pkts += txp->npkt;
6216 txq->txpkts1_wrs++;
6217
6218 txsd = &txq->sdesc[eq->pidx];
6219 txsd->m = txp->mb[0];
6220 txsd->desc_used = ndesc;
6221
6222 return (ndesc);
6223 }
6224
6225 /*
6226 * If the SGL ends on an address that is not 16 byte aligned, this function will
6227 * add a 0 filled flit at the end.
6228 */
6229 static void
6230 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
6231 {
6232 struct sge_eq *eq = &txq->eq;
6233 struct sglist *gl = txq->gl;
6234 struct sglist_seg *seg;
6235 __be64 *flitp, *wrap;
6236 struct ulptx_sgl *usgl;
6237 int i, nflits, nsegs;
6238
6239 KASSERT(((uintptr_t)(*to) & 0xf) == 0,
6240 ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
6241 MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
6242 MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
6243
6244 get_pkt_gl(m, gl);
6245 nsegs = gl->sg_nseg;
6246 MPASS(nsegs > 0);
6247
6248 nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
6249 flitp = (__be64 *)(*to);
6250 wrap = (__be64 *)(&eq->desc[eq->sidx]);
6251 seg = &gl->sg_segs[0];
6252 usgl = (void *)flitp;
6253
6254 /*
6255 * We start at a 16 byte boundary somewhere inside the tx descriptor
6256 * ring, so we're at least 16 bytes away from the status page. There is
6257 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
6258 */
6259
6260 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
6261 V_ULPTX_NSGE(nsegs));
6262 usgl->len0 = htobe32(seg->ss_len);
6263 usgl->addr0 = htobe64(seg->ss_paddr);
6264 seg++;
6265
6266 if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
6267
6268 /* Won't wrap around at all */
6269
6270 for (i = 0; i < nsegs - 1; i++, seg++) {
6271 usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
6272 usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
6273 }
6274 if (i & 1)
6275 usgl->sge[i / 2].len[1] = htobe32(0);
6276 flitp += nflits;
6277 } else {
6278
6279 /* Will wrap somewhere in the rest of the SGL */
6280
6281 /* 2 flits already written, write the rest flit by flit */
6282 flitp = (void *)(usgl + 1);
6283 for (i = 0; i < nflits - 2; i++) {
6284 if (flitp == wrap)
6285 flitp = (void *)eq->desc;
6286 *flitp++ = get_flit(seg, nsegs - 1, i);
6287 }
6288 }
6289
6290 if (nflits & 1) {
6291 MPASS(((uintptr_t)flitp) & 0xf);
6292 *flitp++ = 0;
6293 }
6294
6295 MPASS((((uintptr_t)flitp) & 0xf) == 0);
6296 if (__predict_false(flitp == wrap))
6297 *to = (void *)eq->desc;
6298 else
6299 *to = (void *)flitp;
6300 }
6301
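/*
 * Copy 'len' bytes from 'from' into the descriptor ring at '*to', wrapping
 * around to the start of the ring if the copy runs past the end, and advance
 * '*to' past the bytes just written.
 */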
6302 static inline void
6303 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
6304 {
6305
6306 MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
6307 MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
6308
6309 if (__predict_true((uintptr_t)(*to) + len <=
6310 (uintptr_t)&eq->desc[eq->sidx])) {
6311 bcopy(from, *to, len);
6312 (*to) += len;
6313 } else {
6314 int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
6315
6316 bcopy(from, *to, portion);
6317 from += portion;
6318 portion = len - portion; /* remaining */
6319 bcopy(from, (void *)eq->desc, portion);
6320 (*to) = (caddr_t)eq->desc + portion;
6321 }
6322 }
6323
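/*
 * Notify the hardware that 'n' new descriptors are ready on this egress queue,
 * using the best doorbell mechanism available to the queue, and advance dbidx.
 */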
6324 static inline void
6325 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
6326 {
6327 u_int db;
6328
6329 MPASS(n > 0);
6330
6331 db = eq->doorbells;
6332 if (n > 1)
6333 clrbit(&db, DOORBELL_WCWR);
6334 wmb();
6335
6336 switch (ffs(db) - 1) {
6337 case DOORBELL_UDB:
6338 *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
6339 break;
6340
6341 case DOORBELL_WCWR: {
6342 volatile uint64_t *dst, *src;
6343 int i;
6344
6345 /*
6346 * Queues whose 128B doorbell segment fits in the page do not
6347 * use relative qid (udb_qid is always 0). Only queues with
6348 * doorbell segments can do WCWR.
6349 */
6350 KASSERT(eq->udb_qid == 0 && n == 1,
6351 ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
6352 __func__, eq->doorbells, n, eq->dbidx, eq));
6353
6354 dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
6355 UDBS_DB_OFFSET);
6356 i = eq->dbidx;
6357 src = (void *)&eq->desc[i];
6358 while (src != (void *)&eq->desc[i + 1])
6359 *dst++ = *src++;
6360 wmb();
6361 break;
6362 }
6363
6364 case DOORBELL_UDBWC:
6365 *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
6366 wmb();
6367 break;
6368
6369 case DOORBELL_KDB:
6370 t4_write_reg(sc, sc->sge_kdoorbell_reg,
6371 V_QID(eq->cntxt_id) | V_PIDX(n));
6372 break;
6373 }
6374
6375 IDXINCR(eq->dbidx, n, eq->sidx);
6376 }
6377
6378 static inline u_int
6379 reclaimable_tx_desc(struct sge_eq *eq)
6380 {
6381 uint16_t hw_cidx;
6382
6383 hw_cidx = read_hw_cidx(eq);
6384 return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
6385 }
6386
6387 static inline u_int
6388 total_available_tx_desc(struct sge_eq *eq)
6389 {
6390 uint16_t hw_cidx, pidx;
6391
6392 hw_cidx = read_hw_cidx(eq);
6393 pidx = eq->pidx;
6394
6395 if (pidx == hw_cidx)
6396 return (eq->sidx - 1);
6397 else
6398 return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
6399 }
6400
6401 static inline uint16_t
6402 read_hw_cidx(struct sge_eq *eq)
6403 {
6404 struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
6405 uint16_t cidx = spg->cidx; /* stable snapshot */
6406
6407 return (be16toh(cidx));
6408 }
6409
6410 /*
6411 * Reclaim 'n' descriptors approximately.
6412 */
6413 static u_int
6414 reclaim_tx_descs(struct sge_txq *txq, u_int n)
6415 {
6416 struct tx_sdesc *txsd;
6417 struct sge_eq *eq = &txq->eq;
6418 u_int can_reclaim, reclaimed;
6419
6420 TXQ_LOCK_ASSERT_OWNED(txq);
6421 MPASS(n > 0);
6422
6423 reclaimed = 0;
6424 can_reclaim = reclaimable_tx_desc(eq);
6425 while (can_reclaim && reclaimed < n) {
6426 int ndesc;
6427 struct mbuf *m, *nextpkt;
6428
6429 txsd = &txq->sdesc[eq->cidx];
6430 ndesc = txsd->desc_used;
6431
6432 /* Firmware doesn't return "partial" credits. */
6433 KASSERT(can_reclaim >= ndesc,
6434 ("%s: unexpected number of credits: %d, %d",
6435 __func__, can_reclaim, ndesc));
6436 KASSERT(ndesc != 0,
6437 ("%s: descriptor with no credits: cidx %d",
6438 __func__, eq->cidx));
6439
6440 for (m = txsd->m; m != NULL; m = nextpkt) {
6441 nextpkt = m->m_nextpkt;
6442 m->m_nextpkt = NULL;
6443 m_freem(m);
6444 }
6445 reclaimed += ndesc;
6446 can_reclaim -= ndesc;
6447 IDXINCR(eq->cidx, ndesc, eq->sidx);
6448 }
6449
6450 return (reclaimed);
6451 }
6452
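/*
 * Taskqueue handler that reclaims completed tx descriptors for as long as it
 * can take the queue lock without blocking.
 */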
6453 static void
6454 tx_reclaim(void *arg, int n)
6455 {
6456 struct sge_txq *txq = arg;
6457 struct sge_eq *eq = &txq->eq;
6458
6459 do {
6460 if (TXQ_TRYLOCK(txq) == 0)
6461 break;
6462 n = reclaim_tx_descs(txq, 32);
6463 if (eq->cidx == eq->pidx)
6464 eq->equeqidx = eq->pidx;
6465 TXQ_UNLOCK(txq);
6466 } while (n > 0);
6467 }
6468
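/*
 * Returns flit #idx of the SGL that describes the segments after the first one
 * (the first segment is carried in the ulptx_sgl header itself).  Segments are
 * packed two per three flits: one flit holding both lengths followed by the
 * two addresses.
 */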
6469 static __be64
6470 get_flit(struct sglist_seg *segs, int nsegs, int idx)
6471 {
6472 int i = (idx / 3) * 2;
6473
6474 switch (idx % 3) {
6475 case 0: {
6476 uint64_t rc;
6477
6478 rc = (uint64_t)segs[i].ss_len << 32;
6479 if (i + 1 < nsegs)
6480 rc |= (uint64_t)(segs[i + 1].ss_len);
6481
6482 return (htobe64(rc));
6483 }
6484 case 1:
6485 return (htobe64(segs[i].ss_paddr));
6486 case 2:
6487 return (htobe64(segs[i + 1].ss_paddr));
6488 }
6489
6490 return (0);
6491 }
6492
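/*
 * Pick the software zone to refill a freelist from: ideally one whose buffers
 * can hold 'maxp' bytes of payload, otherwise the largest usable zone.
 * Returns an index into sc->sge.rx_buf_info, or -1 if no zone is usable.
 */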
6493 static int
6494 find_refill_source(struct adapter *sc, int maxp, bool packing)
6495 {
6496 int i, zidx = -1;
6497 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
6498
6499 if (packing) {
6500 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
6501 if (rxb->hwidx2 == -1)
6502 continue;
6503 if (rxb->size1 < PAGE_SIZE &&
6504 rxb->size1 < largest_rx_cluster)
6505 continue;
6506 if (rxb->size1 > largest_rx_cluster)
6507 break;
6508 MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE);
6509 if (rxb->size2 >= maxp)
6510 return (i);
6511 zidx = i;
6512 }
6513 } else {
6514 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
6515 if (rxb->hwidx1 == -1)
6516 continue;
6517 if (rxb->size1 > largest_rx_cluster)
6518 break;
6519 if (rxb->size1 >= maxp)
6520 return (i);
6521 zidx = i;
6522 }
6523 }
6524
6525 return (zidx);
6526 }
6527
6528 static void
6529 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
6530 {
6531 mtx_lock(&sc->sfl_lock);
6532 FL_LOCK(fl);
6533 if ((fl->flags & FL_DOOMED) == 0) {
6534 fl->flags |= FL_STARVING;
6535 TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
6536 callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
6537 }
6538 FL_UNLOCK(fl);
6539 mtx_unlock(&sc->sfl_lock);
6540 }
6541
6542 static void
6543 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
6544 {
6545 struct sge_wrq *wrq = (void *)eq;
6546
6547 atomic_readandclear_int(&eq->equiq);
6548 taskqueue_enqueue(sc->tq[eq->port_id], &wrq->wrq_tx_task);
6549 }
6550
6551 static void
6552 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
6553 {
6554 struct sge_txq *txq = (void *)eq;
6555
6556 MPASS(eq->type == EQ_ETH);
6557
6558 atomic_readandclear_int(&eq->equiq);
6559 if (mp_ring_is_idle(txq->r))
6560 taskqueue_enqueue(sc->tq[eq->port_id], &txq->tx_reclaim_task);
6561 else
6562 mp_ring_check_drainage(txq->r, 64);
6563 }
6564
6565 static int
6566 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
6567 struct mbuf *m)
6568 {
6569 const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
6570 unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
6571 struct adapter *sc = iq->adapter;
6572 struct sge *s = &sc->sge;
6573 struct sge_eq *eq;
6574 static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
6575 &handle_wrq_egr_update, &handle_eth_egr_update,
6576 &handle_wrq_egr_update};
6577
6578 KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
6579 rss->opcode));
6580
6581 eq = s->eqmap[qid - s->eq_start - s->eq_base];
6582 (*h[eq->type])(sc, eq);
6583
6584 return (0);
6585 }
6586
6587 /* handle_fw_msg works for both fw4_msg and fw6_msg because the data offsets match (asserted below). */
6588 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
6589 offsetof(struct cpl_fw6_msg, data));
6590
6591 static int
6592 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
6593 {
6594 struct adapter *sc = iq->adapter;
6595 const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
6596
6597 KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
6598 rss->opcode));
6599
6600 if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
6601 const struct rss_header *rss2;
6602
6603 rss2 = (const struct rss_header *)&cpl->data[0];
6604 return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
6605 }
6606
6607 return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
6608 }
6609
6610 /**
6611 * t4_handle_wrerr_rpl - process a FW work request error message
6612 * @adap: the adapter
6613 * @rpl: start of the FW message
6614 */
6615 static int
6616 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
6617 {
6618 u8 opcode = *(const u8 *)rpl;
6619 const struct fw_error_cmd *e = (const void *)rpl;
6620 unsigned int i;
6621
6622 if (opcode != FW_ERROR_CMD) {
6623 log(LOG_ERR,
6624 "%s: Received WRERR_RPL message with opcode %#x\n",
6625 device_get_nameunit(adap->dev), opcode);
6626 return (EINVAL);
6627 }
6628 log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
6629 G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
6630 "non-fatal");
6631 switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
6632 case FW_ERROR_TYPE_EXCEPTION:
6633 log(LOG_ERR, "exception info:\n");
6634 for (i = 0; i < nitems(e->u.exception.info); i++)
6635 log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
6636 be32toh(e->u.exception.info[i]));
6637 log(LOG_ERR, "\n");
6638 break;
6639 case FW_ERROR_TYPE_HWMODULE:
6640 log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
6641 be32toh(e->u.hwmodule.regaddr),
6642 be32toh(e->u.hwmodule.regval));
6643 break;
6644 case FW_ERROR_TYPE_WR:
6645 log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
6646 be16toh(e->u.wr.cidx),
6647 G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
6648 G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
6649 be32toh(e->u.wr.eqid));
6650 for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
6651 log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
6652 e->u.wr.wrhdr[i]);
6653 log(LOG_ERR, "\n");
6654 break;
6655 case FW_ERROR_TYPE_ACL:
6656 log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
6657 be16toh(e->u.acl.cidx),
6658 G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
6659 G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
6660 be32toh(e->u.acl.eqid),
6661 G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
6662 "MAC");
6663 for (i = 0; i < nitems(e->u.acl.val); i++)
6664 log(LOG_ERR, " %02x", e->u.acl.val[i]);
6665 log(LOG_ERR, "\n");
6666 break;
6667 default:
6668 log(LOG_ERR, "type %#x\n",
6669 G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
6670 return (EINVAL);
6671 }
6672 return (0);
6673 }
6674
6675 static inline bool
6676 bufidx_used(struct adapter *sc, int idx)
6677 {
6678 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
6679 int i;
6680
6681 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
6682 if (rxb->size1 > largest_rx_cluster)
6683 continue;
6684 if (rxb->hwidx1 == idx || rxb->hwidx2 == idx)
6685 return (true);
6686 }
6687
6688 return (false);
6689 }
6690
6691 static int
6692 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
6693 {
6694 struct adapter *sc = arg1;
6695 struct sge_params *sp = &sc->params.sge;
6696 int i, rc;
6697 struct sbuf sb;
6698 char c;
6699
6700 sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
6701 for (i = 0; i < SGE_FLBUF_SIZES; i++) {
6702 if (bufidx_used(sc, i))
6703 c = '*';
6704 else
6705 c = '\0';
6706
6707 sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c);
6708 }
6709 sbuf_trim(&sb);
6710 sbuf_finish(&sb);
6711 rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
6712 sbuf_delete(&sb);
6713 return (rc);
6714 }
6715
6716 #ifdef RATELIMIT
6717 #if defined(INET) || defined(INET6)
6718 /*
6719 * len16 for a txpkt_eo WR with a GL. Includes the firmware work request header.
6720 */
6721 static inline u_int
6722 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
6723 {
6724 u_int n;
6725
6726 MPASS(immhdrs > 0);
6727
6728 n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
6729 sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
6730 if (__predict_false(nsegs == 0))
6731 goto done;
6732
6733 nsegs--; /* first segment is part of ulptx_sgl */
6734 n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
6735 if (tso)
6736 n += sizeof(struct cpl_tx_pkt_lso_core);
6737
6738 done:
6739 return (howmany(n, 16));
6740 }
6741 #endif
6742
6743 #define ETID_FLOWC_NPARAMS 6
6744 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
6745 ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
6746 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
6747
6748 #if defined(INET) || defined(INET6)
6749 static int
6750 send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi,
6751 struct vi_info *vi)
6752 {
6753 struct wrq_cookie cookie;
6754 u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
6755 struct fw_flowc_wr *flowc;
6756
6757 mtx_assert(&cst->lock, MA_OWNED);
6758 MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
6759 EO_FLOWC_PENDING);
6760
6761 flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLOWC_LEN16, &cookie);
6762 if (__predict_false(flowc == NULL))
6763 return (ENOMEM);
6764
6765 bzero(flowc, ETID_FLOWC_LEN);
6766 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
6767 V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
6768 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
6769 V_FW_WR_FLOWID(cst->etid));
6770 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
6771 flowc->mnemval[0].val = htobe32(pfvf);
6772 /* Firmware expects hw port and will translate to channel itself. */
6773 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
6774 flowc->mnemval[1].val = htobe32(pi->hw_port);
6775 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
6776 flowc->mnemval[2].val = htobe32(pi->hw_port);
6777 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
6778 flowc->mnemval[3].val = htobe32(cst->iqid);
6779 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
6780 flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
6781 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
6782 flowc->mnemval[5].val = htobe32(cst->schedcl);
6783
6784 commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);
6785
6786 cst->flags &= ~EO_FLOWC_PENDING;
6787 cst->flags |= EO_FLOWC_RPL_PENDING;
6788 MPASS(cst->tx_credits >= ETID_FLOWC_LEN16); /* flowc is first WR. */
6789 cst->tx_credits -= ETID_FLOWC_LEN16;
6790
6791 return (0);
6792 }
6793 #endif
6794
6795 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
6796
6797 void
6798 send_etid_flush_wr(struct cxgbe_rate_tag *cst)
6799 {
6800 struct fw_flowc_wr *flowc;
6801 struct wrq_cookie cookie;
6802
6803 mtx_assert(&cst->lock, MA_OWNED);
6804
6805 flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLUSH_LEN16, &cookie);
6806 if (__predict_false(flowc == NULL))
6807 CXGBE_UNIMPLEMENTED(__func__);
6808
6809 bzero(flowc, ETID_FLUSH_LEN16 * 16);
6810 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
6811 V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
6812 flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
6813 V_FW_WR_FLOWID(cst->etid));
6814
6815 commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);
6816
6817 cst->flags |= EO_FLUSH_RPL_PENDING;
6818 MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
6819 cst->tx_credits -= ETID_FLUSH_LEN16;
6820 cst->ncompl++;
6821 }
6822
6823 static void
6824 write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr,
6825 struct mbuf *m0, int compl)
6826 {
6827 struct cpl_tx_pkt_core *cpl;
6828 uint64_t ctrl1;
6829 uint32_t ctrl; /* used in many unrelated places */
6830 int len16, pktlen, nsegs, immhdrs;
6831 uintptr_t p;
6832 struct ulptx_sgl *usgl;
6833 struct sglist sg;
6834 struct sglist_seg segs[38]; /* XXX: find real limit. XXX: get off the stack */
6835
6836 mtx_assert(&cst->lock, MA_OWNED);
6837 M_ASSERTPKTHDR(m0);
6838 KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
6839 m0->m_pkthdr.l4hlen > 0,
6840 ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
6841
6842 len16 = mbuf_eo_len16(m0);
6843 nsegs = mbuf_eo_nsegs(m0);
6844 pktlen = m0->m_pkthdr.len;
6845 ctrl = sizeof(struct cpl_tx_pkt_core);
6846 if (needs_tso(m0))
6847 ctrl += sizeof(struct cpl_tx_pkt_lso_core);
6848 immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
6849 ctrl += immhdrs;
6850
6851 wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
6852 V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
6853 wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
6854 V_FW_WR_FLOWID(cst->etid));
6855 wr->r3 = 0;
6856 if (needs_outer_udp_csum(m0)) {
6857 wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
6858 wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
6859 wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
6860 wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
6861 wr->u.udpseg.rtplen = 0;
6862 wr->u.udpseg.r4 = 0;
6863 wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
6864 wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
6865 wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
6866 cpl = (void *)(wr + 1);
6867 } else {
6868 MPASS(needs_outer_tcp_csum(m0));
6869 wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
6870 wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
6871 wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
6872 wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
6873 wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
6874 wr->u.tcpseg.r4 = 0;
6875 wr->u.tcpseg.r5 = 0;
6876 wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
6877
6878 if (needs_tso(m0)) {
6879 struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
6880
6881 wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
6882
6883 ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
6884 F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
6885 V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
6886 ETHER_HDR_LEN) >> 2) |
6887 V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
6888 V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
6889 if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
6890 ctrl |= F_LSO_IPV6;
6891 lso->lso_ctrl = htobe32(ctrl);
6892 lso->ipid_ofst = htobe16(0);
6893 lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
6894 lso->seqno_offset = htobe32(0);
6895 lso->len = htobe32(pktlen);
6896
6897 cpl = (void *)(lso + 1);
6898 } else {
6899 wr->u.tcpseg.mss = htobe16(0xffff);
6900 cpl = (void *)(wr + 1);
6901 }
6902 }
6903
6904 /* Checksum offload must be requested for ethofld. */
6905 MPASS(needs_outer_l4_csum(m0));
6906 ctrl1 = csum_to_ctrl(cst->adapter, m0);
6907
6908 /* VLAN tag insertion */
6909 if (needs_vlan_insertion(m0)) {
6910 ctrl1 |= F_TXPKT_VLAN_VLD |
6911 V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
6912 }
6913
6914 /* CPL header */
6915 cpl->ctrl0 = cst->ctrl0;
6916 cpl->pack = 0;
6917 cpl->len = htobe16(pktlen);
6918 cpl->ctrl1 = htobe64(ctrl1);
6919
6920 /* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
6921 p = (uintptr_t)(cpl + 1);
6922 m_copydata(m0, 0, immhdrs, (void *)p);
6923
6924 /* SGL */
6925 if (nsegs > 0) {
6926 int i, pad;
6927
6928 /* zero-pad up to the next 16-byte boundary, if not already 16-byte aligned */
6929 p += immhdrs;
6930 pad = 16 - (immhdrs & 0xf);
6931 bzero((void *)p, pad);
6932
6933 usgl = (void *)(p + pad);
6934 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
6935 V_ULPTX_NSGE(nsegs));
6936
6937 sglist_init(&sg, nitems(segs), segs);
6938 for (; m0 != NULL; m0 = m0->m_next) {
6939 if (__predict_false(m0->m_len == 0))
6940 continue;
6941 if (immhdrs >= m0->m_len) {
6942 immhdrs -= m0->m_len;
6943 continue;
6944 }
6945 if (m0->m_flags & M_EXTPG)
6946 sglist_append_mbuf_epg(&sg, m0,
6947 mtod(m0, vm_offset_t), m0->m_len);
6948 else
6949 sglist_append(&sg, mtod(m0, char *) + immhdrs,
6950 m0->m_len - immhdrs);
6951 immhdrs = 0;
6952 }
6953 MPASS(sg.sg_nseg == nsegs);
6954
6955 /*
6956 * Zero pad last 8B in case the WR doesn't end on a 16B
6957 * boundary.
6958 */
6959 *(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
6960
6961 usgl->len0 = htobe32(segs[0].ss_len);
6962 usgl->addr0 = htobe64(segs[0].ss_paddr);
6963 for (i = 0; i < nsegs - 1; i++) {
6964 usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
6965 usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
6966 }
6967 if (i & 1)
6968 usgl->sge[i / 2].len[1] = htobe32(0);
6969 }
6970
6971 }
6972
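/*
 * Write work requests for as many pending ethofld mbufs as the available tx
 * credits allow.  Called with the rate tag lock held.
 */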
6973 static void
6974 ethofld_tx(struct cxgbe_rate_tag *cst)
6975 {
6976 struct mbuf *m;
6977 struct wrq_cookie cookie;
6978 int next_credits, compl;
6979 struct fw_eth_tx_eo_wr *wr;
6980
6981 mtx_assert(&cst->lock, MA_OWNED);
6982
6983 while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
6984 M_ASSERTPKTHDR(m);
6985
6986 /* How many len16 credits do we need to send this mbuf. */
6987 next_credits = mbuf_eo_len16(m);
6988 MPASS(next_credits > 0);
6989 if (next_credits > cst->tx_credits) {
6990 /*
6991 * Tx will make progress eventually because there is at
6992 * least one outstanding fw4_ack that will return
6993 * credits and kick the tx.
6994 */
6995 MPASS(cst->ncompl > 0);
6996 return;
6997 }
6998 wr = start_wrq_wr(&cst->eo_txq->wrq, next_credits, &cookie);
6999 if (__predict_false(wr == NULL)) {
7000 /* XXX: wishful thinking, not a real assertion. */
7001 MPASS(cst->ncompl > 0);
7002 return;
7003 }
7004 cst->tx_credits -= next_credits;
7005 cst->tx_nocompl += next_credits;
7006 compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
7007 ETHER_BPF_MTAP(cst->com.ifp, m);
7008 write_ethofld_wr(cst, wr, m, compl);
7009 commit_wrq_wr(&cst->eo_txq->wrq, wr, &cookie);
7010 if (compl) {
7011 cst->ncompl++;
7012 cst->tx_nocompl = 0;
7013 }
7014 (void) mbufq_dequeue(&cst->pending_tx);
7015
7016 /*
7017 * Drop the mbuf's reference on the tag now rather
7018 * than waiting until m_freem(). This ensures that
7019 * cxgbe_rate_tag_free gets called when the inp drops
7020 * its reference on the tag and there are no more
7021 * mbufs in the pending_tx queue and can flush any
7022 * pending requests. Otherwise if the last mbuf
7023 * doesn't request a completion the etid will never be
7024 * released.
7025 */
7026 m->m_pkthdr.snd_tag = NULL;
7027 m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
7028 m_snd_tag_rele(&cst->com);
7029
7030 mbufq_enqueue(&cst->pending_fwack, m);
7031 }
7032 }
7033
7034 #if defined(INET) || defined(INET6)
7035 static int
7036 ethofld_transmit(if_t ifp, struct mbuf *m0)
7037 {
7038 struct cxgbe_rate_tag *cst;
7039 int rc;
7040
7041 MPASS(m0->m_nextpkt == NULL);
7042 MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
7043 MPASS(m0->m_pkthdr.snd_tag != NULL);
7044 cst = mst_to_crt(m0->m_pkthdr.snd_tag);
7045
7046 mtx_lock(&cst->lock);
7047 MPASS(cst->flags & EO_SND_TAG_REF);
7048
7049 if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
7050 struct vi_info *vi = if_getsoftc(ifp);
7051 struct port_info *pi = vi->pi;
7052 struct adapter *sc = pi->adapter;
7053 const uint32_t rss_mask = vi->rss_size - 1;
7054 uint32_t rss_hash;
7055
7056 cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
7057 if (M_HASHTYPE_ISHASH(m0))
7058 rss_hash = m0->m_pkthdr.flowid;
7059 else
7060 rss_hash = arc4random();
7061 /* We assume RSS hashing */
7062 cst->iqid = vi->rss[rss_hash & rss_mask];
7063 cst->eo_txq += rss_hash % vi->nofldtxq;
7064 rc = send_etid_flowc_wr(cst, pi, vi);
7065 if (rc != 0)
7066 goto done;
7067 }
7068
7069 if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
7070 rc = ENOBUFS;
7071 goto done;
7072 }
7073
7074 mbufq_enqueue(&cst->pending_tx, m0);
7075 cst->plen += m0->m_pkthdr.len;
7076
7077 /*
7078 * Hold an extra reference on the tag while generating work
7079 * requests to ensure that we don't try to free the tag during
7080 * ethofld_tx() in case we are sending the final mbuf after
7081 * the inp was freed.
7082 */
7083 m_snd_tag_ref(&cst->com);
7084 ethofld_tx(cst);
7085 mtx_unlock(&cst->lock);
7086 m_snd_tag_rele(&cst->com);
7087 return (0);
7088
7089 done:
7090 mtx_unlock(&cst->lock);
7091 return (rc);
7092 }
7093 #endif
7094
7095 static int
7096 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
7097 {
7098 struct adapter *sc = iq->adapter;
7099 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
7100 struct mbuf *m;
7101 u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
7102 struct cxgbe_rate_tag *cst;
7103 uint8_t credits = cpl->credits;
7104
7105 cst = lookup_etid(sc, etid);
7106 mtx_lock(&cst->lock);
7107 if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
7108 MPASS(credits >= ETID_FLOWC_LEN16);
7109 credits -= ETID_FLOWC_LEN16;
7110 cst->flags &= ~EO_FLOWC_RPL_PENDING;
7111 }
7112
7113 KASSERT(cst->ncompl > 0,
7114 ("%s: etid %u (%p) wasn't expecting completion.",
7115 __func__, etid, cst));
7116 cst->ncompl--;
7117
7118 while (credits > 0) {
7119 m = mbufq_dequeue(&cst->pending_fwack);
7120 if (__predict_false(m == NULL)) {
7121 /*
7122 * The remaining credits are for the final flush that
7123 * was issued when the tag was freed by the kernel.
7124 */
7125 MPASS((cst->flags &
7126 (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
7127 EO_FLUSH_RPL_PENDING);
7128 MPASS(credits == ETID_FLUSH_LEN16);
7129 MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
7130 MPASS(cst->ncompl == 0);
7131
7132 cst->flags &= ~EO_FLUSH_RPL_PENDING;
7133 cst->tx_credits += cpl->credits;
7134 cxgbe_rate_tag_free_locked(cst);
7135 return (0); /* cst is gone. */
7136 }
7137 KASSERT(m != NULL,
7138 ("%s: too many credits (%u, %u)", __func__, cpl->credits,
7139 credits));
7140 KASSERT(credits >= mbuf_eo_len16(m),
7141 ("%s: too few credits (%u, %u, %u)", __func__,
7142 cpl->credits, credits, mbuf_eo_len16(m)));
7143 credits -= mbuf_eo_len16(m);
7144 cst->plen -= m->m_pkthdr.len;
7145 m_freem(m);
7146 }
7147
7148 cst->tx_credits += cpl->credits;
7149 MPASS(cst->tx_credits <= cst->tx_total);
7150
7151 if (cst->flags & EO_SND_TAG_REF) {
7152 /*
7153 * As with ethofld_transmit(), hold an extra reference
7154 * so that the tag is stable across ethofld_tx().
7155 */
7156 m_snd_tag_ref(&cst->com);
7157 m = mbufq_first(&cst->pending_tx);
7158 if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
7159 ethofld_tx(cst);
7160 mtx_unlock(&cst->lock);
7161 m_snd_tag_rele(&cst->com);
7162 } else {
7163 /*
7164 * There shouldn't be any pending packets if the tag
7165 * was freed by the kernel since any pending packet
7166 * should hold a reference to the tag.
7167 */
7168 MPASS(mbufq_first(&cst->pending_tx) == NULL);
7169 mtx_unlock(&cst->lock);
7170 }
7171
7172 return (0);
7173 }
7174 #endif
7175