1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #include <sys/types.h> 39 #include <sys/eventhandler.h> 40 #include <sys/mbuf.h> 41 #include <sys/socket.h> 42 #include <sys/kernel.h> 43 #include <sys/ktls.h> 44 #include <sys/malloc.h> 45 #include <sys/msan.h> 46 #include <sys/queue.h> 47 #include <sys/sbuf.h> 48 #include <sys/taskqueue.h> 49 #include <sys/time.h> 50 #include <sys/sglist.h> 51 #include <sys/sysctl.h> 52 #include <sys/smp.h> 53 #include <sys/socketvar.h> 54 #include <sys/counter.h> 55 #include <net/bpf.h> 56 #include <net/ethernet.h> 57 #include <net/if.h> 58 #include <net/if_vlan_var.h> 59 #include <net/if_vxlan.h> 60 #include <netinet/in.h> 61 #include <netinet/ip.h> 62 #include <netinet/ip6.h> 63 #include <netinet/tcp.h> 64 #include <netinet/udp.h> 65 #include <machine/in_cksum.h> 66 #include <machine/md_var.h> 67 #include <vm/vm.h> 68 #include <vm/pmap.h> 69 #ifdef DEV_NETMAP 70 #include <machine/bus.h> 71 #include <sys/selinfo.h> 72 #include <net/if_var.h> 73 #include <net/netmap.h> 74 #include <dev/netmap/netmap_kern.h> 75 #endif 76 77 #include "common/common.h" 78 #include "common/t4_regs.h" 79 #include "common/t4_regs_values.h" 80 #include "common/t4_msg.h" 81 #include "t4_l2t.h" 82 #include "t4_mp_ring.h" 83 84 #ifdef T4_PKT_TIMESTAMP 85 #define RX_COPY_THRESHOLD (MINCLSIZE - 8) 86 #else 87 #define RX_COPY_THRESHOLD MINCLSIZE 88 #endif 89 90 /* Internal mbuf flags stored in PH_loc.eight[1]. */ 91 #define MC_NOMAP 0x01 92 #define MC_RAW_WR 0x02 93 #define MC_TLS 0x04 94 95 /* 96 * Ethernet frames are DMA'd at this byte offset into the freelist buffer. 97 * 0-7 are valid values. 98 */ 99 static int fl_pktshift = 0; 100 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0, 101 "payload DMA offset in rx buffer (bytes)"); 102 103 /* 104 * Pad ethernet payload up to this boundary. 105 * -1: driver should figure out a good value. 
 * 0: disable padding.
 * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
 */
int fl_pad = -1;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
    "payload pad boundary (bytes)");

/*
 * Status page length.
 * -1: driver should figure out a good value.
 * 64 or 128 are the only other valid values.
 */
static int spg_len = -1;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
    "status page size (bytes)");

/*
 * Congestion drops.
 * -1: no congestion feedback (not recommended).
 * 0: backpressure the channel instead of dropping packets right away.
 * 1: no backpressure, drop packets for the congested queue immediately.
 * 2: both backpressure and drop.
 */
static int cong_drop = 0;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
    "Congestion control for NIC RX queues (0 = backpressure, 1 = drop, 2 = both)");
#ifdef TCP_OFFLOAD
static int ofld_cong_drop = 0;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, ofld_cong_drop, CTLFLAG_RDTUN, &ofld_cong_drop, 0,
    "Congestion control for TOE RX queues (0 = backpressure, 1 = drop, 2 = both)");
#endif

/*
 * Deliver multiple frames in the same free list buffer if they fit.
 * -1: let the driver decide whether to enable buffer packing or not.
 * 0: disable buffer packing.
 * 1: enable buffer packing.
 */
static int buffer_packing = -1;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
    0, "Enable buffer packing");

/*
 * Start next frame in a packed buffer at this boundary.
 * -1: driver should figure out a good value.
 * T4: driver will ignore this and use the same value as fl_pad above.
 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
 */
static int fl_pack = -1;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
    "payload pack boundary (bytes)");

/*
 * Largest rx cluster size that the driver is allowed to allocate.
 */
static int largest_rx_cluster = MJUM16BYTES;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
    &largest_rx_cluster, 0, "Largest rx cluster (bytes)");

/*
 * Size of cluster allocation that's most likely to succeed.  The driver will
 * fall back to this size if it fails to allocate clusters larger than this.
 */
static int safest_rx_cluster = PAGE_SIZE;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
    &safest_rx_cluster, 0, "Safe rx cluster (bytes)");

#ifdef RATELIMIT
/*
 * Knob to control TCP timestamp rewriting, and the granularity of the tick used
 * for rewriting.  -1 and 0-3 are all valid values.
 * -1: hardware should leave the TCP timestamps alone.
 * 0: 1ms
 * 1: 100us
 * 2: 10us
 * 3: 1us
 */
static int tsclk = -1;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
    "Control TCP timestamp rewriting when using pacing");

static int eo_max_backlog = 1024 * 1024;
SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
    0, "Maximum backlog of ratelimited data per flow");
#endif

/*
 * The interrupt holdoff timers are multiplied by this value on T6+.
 * 1 and 3-17 (both inclusive) are legal values.
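 * This knob is a loader tunable (CTLFLAG_RDTUN); for example, setting
 * hw.cxgbe.tscale=2 in /boot/loader.conf would make every holdoff timer
 * twice as long on T6 and later chips.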
195 */ 196 static int tscale = 1; 197 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0, 198 "Interrupt holdoff timer scale on T6+"); 199 200 /* 201 * Number of LRO entries in the lro_ctrl structure per rx queue. 202 */ 203 static int lro_entries = TCP_LRO_ENTRIES; 204 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0, 205 "Number of LRO entries per RX queue"); 206 207 /* 208 * This enables presorting of frames before they're fed into tcp_lro_rx. 209 */ 210 static int lro_mbufs = 0; 211 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0, 212 "Enable presorting of LRO frames"); 213 214 static counter_u64_t pullups; 215 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, pullups, CTLFLAG_RD, &pullups, 216 "Number of mbuf pullups performed"); 217 218 static counter_u64_t defrags; 219 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags, 220 "Number of mbuf defrags performed"); 221 222 static int t4_tx_coalesce = 1; 223 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0, 224 "tx coalescing allowed"); 225 226 /* 227 * The driver will make aggressive attempts at tx coalescing if it sees these 228 * many packets eligible for coalescing in quick succession, with no more than 229 * the specified gap in between the eth_tx calls that delivered the packets. 230 */ 231 static int t4_tx_coalesce_pkts = 32; 232 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN, 233 &t4_tx_coalesce_pkts, 0, 234 "# of consecutive packets (1 - 255) that will trigger tx coalescing"); 235 static int t4_tx_coalesce_gap = 5; 236 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN, 237 &t4_tx_coalesce_gap, 0, "tx gap (in microseconds)"); 238 239 static int service_iq(struct sge_iq *, int); 240 static int service_iq_fl(struct sge_iq *, int); 241 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); 242 static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *, 243 u_int); 244 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int, 245 int, int, int); 246 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); 247 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t, 248 struct sge_iq *, char *); 249 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, 250 struct sysctl_ctx_list *, struct sysctl_oid *); 251 static void free_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *); 252 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, 253 struct sge_iq *); 254 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *, 255 struct sysctl_oid *, struct sge_fl *); 256 static int alloc_iq_fl_hwq(struct vi_info *, struct sge_iq *, struct sge_fl *); 257 static int free_iq_fl_hwq(struct adapter *, struct sge_iq *, struct sge_fl *); 258 static int alloc_fwq(struct adapter *); 259 static void free_fwq(struct adapter *); 260 static int alloc_ctrlq(struct adapter *, int); 261 static void free_ctrlq(struct adapter *, int); 262 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, int); 263 static void free_rxq(struct vi_info *, struct sge_rxq *); 264 static void add_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, 265 struct sge_rxq *); 266 #ifdef TCP_OFFLOAD 267 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, 268 int); 269 static void free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); 270 
static void add_ofld_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, 271 struct sge_ofld_rxq *); 272 #endif 273 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); 274 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); 275 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 276 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); 277 #endif 278 static int alloc_eq(struct adapter *, struct sge_eq *, struct sysctl_ctx_list *, 279 struct sysctl_oid *); 280 static void free_eq(struct adapter *, struct sge_eq *); 281 static void add_eq_sysctls(struct adapter *, struct sysctl_ctx_list *, 282 struct sysctl_oid *, struct sge_eq *); 283 static int alloc_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *); 284 static int free_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *); 285 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, 286 struct sysctl_ctx_list *, struct sysctl_oid *); 287 static void free_wrq(struct adapter *, struct sge_wrq *); 288 static void add_wrq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, 289 struct sge_wrq *); 290 static int alloc_txq(struct vi_info *, struct sge_txq *, int); 291 static void free_txq(struct vi_info *, struct sge_txq *); 292 static void add_txq_sysctls(struct vi_info *, struct sysctl_ctx_list *, 293 struct sysctl_oid *, struct sge_txq *); 294 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 295 static int alloc_ofld_txq(struct vi_info *, struct sge_ofld_txq *, int); 296 static void free_ofld_txq(struct vi_info *, struct sge_ofld_txq *); 297 static void add_ofld_txq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, 298 struct sge_ofld_txq *); 299 #endif 300 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); 301 static inline void ring_fl_db(struct adapter *, struct sge_fl *); 302 static int refill_fl(struct adapter *, struct sge_fl *, int); 303 static void refill_sfl(void *); 304 static int find_refill_source(struct adapter *, int, bool); 305 static void add_fl_to_sfl(struct adapter *, struct sge_fl *); 306 307 static inline void get_pkt_gl(struct mbuf *, struct sglist *); 308 static inline u_int txpkt_len16(u_int, const u_int); 309 static inline u_int txpkt_vm_len16(u_int, const u_int); 310 static inline void calculate_mbuf_len16(struct mbuf *, bool); 311 static inline u_int txpkts0_len16(u_int); 312 static inline u_int txpkts1_len16(void); 313 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); 314 static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *, 315 u_int); 316 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, 317 struct mbuf *); 318 static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *, 319 int, bool *); 320 static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *, 321 int, bool *); 322 static u_int write_txpkts_wr(struct adapter *, struct sge_txq *); 323 static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *); 324 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); 325 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); 326 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); 327 static inline uint16_t read_hw_cidx(struct sge_eq *); 328 static inline u_int reclaimable_tx_desc(struct sge_eq *); 329 static inline u_int total_available_tx_desc(struct sge_eq *); 330 static u_int reclaim_tx_descs(struct sge_txq *, u_int); 331 
static void tx_reclaim(void *, int); 332 static __be64 get_flit(struct sglist_seg *, int, int); 333 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, 334 struct mbuf *); 335 static int handle_fw_msg(struct sge_iq *, const struct rss_header *, 336 struct mbuf *); 337 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *); 338 static void wrq_tx_drain(void *, int); 339 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); 340 341 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); 342 #ifdef RATELIMIT 343 #if defined(INET) || defined(INET6) 344 static inline u_int txpkt_eo_len16(u_int, u_int, u_int); 345 #endif 346 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *, 347 struct mbuf *); 348 static int ethofld_transmit(struct ifnet *, struct mbuf *); 349 #endif 350 351 static counter_u64_t extfree_refs; 352 static counter_u64_t extfree_rels; 353 354 an_handler_t t4_an_handler; 355 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES]; 356 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS]; 357 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES]; 358 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES]; 359 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES]; 360 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES]; 361 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES]; 362 363 void 364 t4_register_an_handler(an_handler_t h) 365 { 366 uintptr_t *loc; 367 368 MPASS(h == NULL || t4_an_handler == NULL); 369 370 loc = (uintptr_t *)&t4_an_handler; 371 atomic_store_rel_ptr(loc, (uintptr_t)h); 372 } 373 374 void 375 t4_register_fw_msg_handler(int type, fw_msg_handler_t h) 376 { 377 uintptr_t *loc; 378 379 MPASS(type < nitems(t4_fw_msg_handler)); 380 MPASS(h == NULL || t4_fw_msg_handler[type] == NULL); 381 /* 382 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL 383 * handler dispatch table. Reject any attempt to install a handler for 384 * this subtype. 385 */ 386 MPASS(type != FW_TYPE_RSSCPL); 387 MPASS(type != FW6_TYPE_RSSCPL); 388 389 loc = (uintptr_t *)&t4_fw_msg_handler[type]; 390 atomic_store_rel_ptr(loc, (uintptr_t)h); 391 } 392 393 void 394 t4_register_cpl_handler(int opcode, cpl_handler_t h) 395 { 396 uintptr_t *loc; 397 398 MPASS(opcode < nitems(t4_cpl_handler)); 399 MPASS(h == NULL || t4_cpl_handler[opcode] == NULL); 400 401 loc = (uintptr_t *)&t4_cpl_handler[opcode]; 402 atomic_store_rel_ptr(loc, (uintptr_t)h); 403 } 404 405 static int 406 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, 407 struct mbuf *m) 408 { 409 const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); 410 u_int tid; 411 int cookie; 412 413 MPASS(m == NULL); 414 415 tid = GET_TID(cpl); 416 if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) { 417 /* 418 * The return code for filter-write is put in the CPL cookie so 419 * we have to rely on the hardware tid (is_ftid) to determine 420 * that this is a response to a filter. 421 */ 422 cookie = CPL_COOKIE_FILTER; 423 } else { 424 cookie = G_COOKIE(cpl->cookie); 425 } 426 MPASS(cookie > CPL_COOKIE_RESERVED); 427 MPASS(cookie < nitems(set_tcb_rpl_handlers)); 428 429 return (set_tcb_rpl_handlers[cookie](iq, rss, m)); 430 } 431 432 static int 433 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, 434 struct mbuf *m) 435 { 436 const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); 437 unsigned int cookie; 438 439 MPASS(m == NULL); 440 441 cookie = GET_TID(rpl) & F_SYNC_WR ? 
CPL_COOKIE_TOM : CPL_COOKIE_FILTER; 442 return (l2t_write_rpl_handlers[cookie](iq, rss, m)); 443 } 444 445 static int 446 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, 447 struct mbuf *m) 448 { 449 const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); 450 u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status))); 451 452 MPASS(m == NULL); 453 MPASS(cookie != CPL_COOKIE_RESERVED); 454 455 return (act_open_rpl_handlers[cookie](iq, rss, m)); 456 } 457 458 static int 459 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss, 460 struct mbuf *m) 461 { 462 struct adapter *sc = iq->adapter; 463 u_int cookie; 464 465 MPASS(m == NULL); 466 if (is_hashfilter(sc)) 467 cookie = CPL_COOKIE_HASHFILTER; 468 else 469 cookie = CPL_COOKIE_TOM; 470 471 return (abort_rpl_rss_handlers[cookie](iq, rss, m)); 472 } 473 474 static int 475 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 476 { 477 struct adapter *sc = iq->adapter; 478 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 479 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 480 u_int cookie; 481 482 MPASS(m == NULL); 483 if (is_etid(sc, tid)) 484 cookie = CPL_COOKIE_ETHOFLD; 485 else 486 cookie = CPL_COOKIE_TOM; 487 488 return (fw4_ack_handlers[cookie](iq, rss, m)); 489 } 490 491 static void 492 t4_init_shared_cpl_handlers(void) 493 { 494 495 t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler); 496 t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler); 497 t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler); 498 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler); 499 t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler); 500 } 501 502 void 503 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie) 504 { 505 uintptr_t *loc; 506 507 MPASS(opcode < nitems(t4_cpl_handler)); 508 MPASS(cookie > CPL_COOKIE_RESERVED); 509 MPASS(cookie < NUM_CPL_COOKIES); 510 MPASS(t4_cpl_handler[opcode] != NULL); 511 512 switch (opcode) { 513 case CPL_SET_TCB_RPL: 514 loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie]; 515 break; 516 case CPL_L2T_WRITE_RPL: 517 loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie]; 518 break; 519 case CPL_ACT_OPEN_RPL: 520 loc = (uintptr_t *)&act_open_rpl_handlers[cookie]; 521 break; 522 case CPL_ABORT_RPL_RSS: 523 loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie]; 524 break; 525 case CPL_FW4_ACK: 526 loc = (uintptr_t *)&fw4_ack_handlers[cookie]; 527 break; 528 default: 529 MPASS(0); 530 return; 531 } 532 MPASS(h == NULL || *loc == (uintptr_t)NULL); 533 atomic_store_rel_ptr(loc, (uintptr_t)h); 534 } 535 536 /* 537 * Called on MOD_LOAD. Validates and calculates the SGE tunables. 538 */ 539 void 540 t4_sge_modload(void) 541 { 542 543 if (fl_pktshift < 0 || fl_pktshift > 7) { 544 printf("Invalid hw.cxgbe.fl_pktshift value (%d)," 545 " using 0 instead.\n", fl_pktshift); 546 fl_pktshift = 0; 547 } 548 549 if (spg_len != 64 && spg_len != 128) { 550 int len; 551 552 #if defined(__i386__) || defined(__amd64__) 553 len = cpu_clflush_line_size > 64 ? 
128 : 64; 554 #else 555 len = 64; 556 #endif 557 if (spg_len != -1) { 558 printf("Invalid hw.cxgbe.spg_len value (%d)," 559 " using %d instead.\n", spg_len, len); 560 } 561 spg_len = len; 562 } 563 564 if (cong_drop < -1 || cong_drop > 2) { 565 printf("Invalid hw.cxgbe.cong_drop value (%d)," 566 " using 0 instead.\n", cong_drop); 567 cong_drop = 0; 568 } 569 #ifdef TCP_OFFLOAD 570 if (ofld_cong_drop < -1 || ofld_cong_drop > 2) { 571 printf("Invalid hw.cxgbe.ofld_cong_drop value (%d)," 572 " using 0 instead.\n", ofld_cong_drop); 573 ofld_cong_drop = 0; 574 } 575 #endif 576 577 if (tscale != 1 && (tscale < 3 || tscale > 17)) { 578 printf("Invalid hw.cxgbe.tscale value (%d)," 579 " using 1 instead.\n", tscale); 580 tscale = 1; 581 } 582 583 if (largest_rx_cluster != MCLBYTES && 584 largest_rx_cluster != MJUMPAGESIZE && 585 largest_rx_cluster != MJUM9BYTES && 586 largest_rx_cluster != MJUM16BYTES) { 587 printf("Invalid hw.cxgbe.largest_rx_cluster value (%d)," 588 " using %d instead.\n", largest_rx_cluster, MJUM16BYTES); 589 largest_rx_cluster = MJUM16BYTES; 590 } 591 592 if (safest_rx_cluster != MCLBYTES && 593 safest_rx_cluster != MJUMPAGESIZE && 594 safest_rx_cluster != MJUM9BYTES && 595 safest_rx_cluster != MJUM16BYTES) { 596 printf("Invalid hw.cxgbe.safest_rx_cluster value (%d)," 597 " using %d instead.\n", safest_rx_cluster, MJUMPAGESIZE); 598 safest_rx_cluster = MJUMPAGESIZE; 599 } 600 601 extfree_refs = counter_u64_alloc(M_WAITOK); 602 extfree_rels = counter_u64_alloc(M_WAITOK); 603 pullups = counter_u64_alloc(M_WAITOK); 604 defrags = counter_u64_alloc(M_WAITOK); 605 counter_u64_zero(extfree_refs); 606 counter_u64_zero(extfree_rels); 607 counter_u64_zero(pullups); 608 counter_u64_zero(defrags); 609 610 t4_init_shared_cpl_handlers(); 611 t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg); 612 t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg); 613 t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update); 614 #ifdef RATELIMIT 615 t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack, 616 CPL_COOKIE_ETHOFLD); 617 #endif 618 t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); 619 t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl); 620 } 621 622 void 623 t4_sge_modunload(void) 624 { 625 626 counter_u64_free(extfree_refs); 627 counter_u64_free(extfree_rels); 628 counter_u64_free(pullups); 629 counter_u64_free(defrags); 630 } 631 632 uint64_t 633 t4_sge_extfree_refs(void) 634 { 635 uint64_t refs, rels; 636 637 rels = counter_u64_fetch(extfree_rels); 638 refs = counter_u64_fetch(extfree_refs); 639 640 return (refs - rels); 641 } 642 643 /* max 4096 */ 644 #define MAX_PACK_BOUNDARY 512 645 646 static inline void 647 setup_pad_and_pack_boundaries(struct adapter *sc) 648 { 649 uint32_t v, m; 650 int pad, pack, pad_shift; 651 652 pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT : 653 X_INGPADBOUNDARY_SHIFT; 654 pad = fl_pad; 655 if (fl_pad < (1 << pad_shift) || 656 fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) || 657 !powerof2(fl_pad)) { 658 /* 659 * If there is any chance that we might use buffer packing and 660 * the chip is a T4, then pick 64 as the pad/pack boundary. Set 661 * it to the minimum allowed in all other cases. 662 */ 663 pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift; 664 665 /* 666 * For fl_pad = 0 we'll still write a reasonable value to the 667 * register but all the freelists will opt out of padding. 668 * We'll complain here only if the user tried to set it to a 669 * value greater than 0 that was invalid. 
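                 * For example, fl_pad = 48 is rejected because it is not a
                 * power of 2, and a power of 2 outside the chip's supported
                 * range (below 1 << pad_shift or above the maximum pad
                 * boundary) is rejected as well.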
670 */ 671 if (fl_pad > 0) { 672 device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" 673 " (%d), using %d instead.\n", fl_pad, pad); 674 } 675 } 676 m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); 677 v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift); 678 t4_set_reg_field(sc, A_SGE_CONTROL, m, v); 679 680 if (is_t4(sc)) { 681 if (fl_pack != -1 && fl_pack != pad) { 682 /* Complain but carry on. */ 683 device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," 684 " using %d instead.\n", fl_pack, pad); 685 } 686 return; 687 } 688 689 pack = fl_pack; 690 if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || 691 !powerof2(fl_pack)) { 692 if (sc->params.pci.mps > MAX_PACK_BOUNDARY) 693 pack = MAX_PACK_BOUNDARY; 694 else 695 pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); 696 MPASS(powerof2(pack)); 697 if (pack < 16) 698 pack = 16; 699 if (pack == 32) 700 pack = 64; 701 if (pack > 4096) 702 pack = 4096; 703 if (fl_pack != -1) { 704 device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" 705 " (%d), using %d instead.\n", fl_pack, pack); 706 } 707 } 708 m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); 709 if (pack == 16) 710 v = V_INGPACKBOUNDARY(0); 711 else 712 v = V_INGPACKBOUNDARY(ilog2(pack) - 5); 713 714 MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ 715 t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); 716 } 717 718 /* 719 * adap->params.vpd.cclk must be set up before this is called. 720 */ 721 void 722 t4_tweak_chip_settings(struct adapter *sc) 723 { 724 int i, reg; 725 uint32_t v, m; 726 int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; 727 int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; 728 int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ 729 uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); 730 static int sw_buf_sizes[] = { 731 MCLBYTES, 732 MJUMPAGESIZE, 733 MJUM9BYTES, 734 MJUM16BYTES 735 }; 736 737 KASSERT(sc->flags & MASTER_PF, 738 ("%s: trying to change chip settings when not master.", __func__)); 739 740 m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; 741 v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | 742 V_EGRSTATUSPAGESIZE(spg_len == 128); 743 t4_set_reg_field(sc, A_SGE_CONTROL, m, v); 744 745 setup_pad_and_pack_boundaries(sc); 746 747 v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | 748 V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | 749 V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | 750 V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | 751 V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | 752 V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | 753 V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | 754 V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); 755 t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); 756 757 t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096); 758 t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536); 759 reg = A_SGE_FL_BUFFER_SIZE2; 760 for (i = 0; i < nitems(sw_buf_sizes); i++) { 761 MPASS(reg <= A_SGE_FL_BUFFER_SIZE15); 762 t4_write_reg(sc, reg, sw_buf_sizes[i]); 763 reg += 4; 764 MPASS(reg <= A_SGE_FL_BUFFER_SIZE15); 765 t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE); 766 reg += 4; 767 } 768 769 v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | 770 V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); 771 t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); 772 773 KASSERT(intr_timer[0] <= timer_max, 774 ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], 775 timer_max)); 776 for (i = 1; i < nitems(intr_timer); i++) { 777 KASSERT(intr_timer[i] >= intr_timer[i - 1], 778 ("%s: timers not listed in increasing order (%d)", 779 __func__, i)); 780 781 while (intr_timer[i] 
> timer_max) { 782 if (i == nitems(intr_timer) - 1) { 783 intr_timer[i] = timer_max; 784 break; 785 } 786 intr_timer[i] += intr_timer[i - 1]; 787 intr_timer[i] /= 2; 788 } 789 } 790 791 v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | 792 V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); 793 t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); 794 v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | 795 V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); 796 t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); 797 v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | 798 V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); 799 t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); 800 801 if (chip_id(sc) >= CHELSIO_T6) { 802 m = V_TSCALE(M_TSCALE); 803 if (tscale == 1) 804 v = 0; 805 else 806 v = V_TSCALE(tscale - 2); 807 t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v); 808 809 if (sc->debug_flags & DF_DISABLE_TCB_CACHE) { 810 m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN | 811 V_WRTHRTHRESH(M_WRTHRTHRESH); 812 t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1); 813 v &= ~m; 814 v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN | 815 V_WRTHRTHRESH(16); 816 t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1); 817 } 818 } 819 820 /* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */ 821 v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); 822 t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); 823 824 /* 825 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP. These have been 826 * chosen with MAXPHYS = 128K in mind. The largest DDP buffer that we 827 * may have to deal with is MAXPHYS + 1 page. 828 */ 829 v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4); 830 t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v); 831 832 /* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */ 833 m = v = F_TDDPTAGTCB | F_ISCSITAGTCB; 834 t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); 835 836 m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | 837 F_RESETDDPOFFSET; 838 v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; 839 t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); 840 } 841 842 /* 843 * SGE wants the buffer to be at least 64B and then a multiple of 16. Its 844 * address mut be 16B aligned. If padding is in use the buffer's start and end 845 * need to be aligned to the pad boundary as well. We'll just make sure that 846 * the size is a multiple of the pad boundary here, it is up to the buffer 847 * allocation code to make sure the start of the buffer is aligned. 848 */ 849 static inline int 850 hwsz_ok(struct adapter *sc, int hwsz) 851 { 852 int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1; 853 854 return (hwsz >= 64 && (hwsz & mask) == 0); 855 } 856 857 /* 858 * Initialize the rx buffer sizes and figure out which zones the buffers will 859 * be allocated from. 
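 * For each software cluster size (size1) we also look for a slightly smaller
 * hardware buffer size (size2) that leaves room for the cluster metadata at
 * the end of the cluster; finding such a size for at least one zone is what
 * allows buffer packing (BUF_PACKING_OK) to be offered.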
860 */ 861 void 862 t4_init_rx_buf_info(struct adapter *sc) 863 { 864 struct sge *s = &sc->sge; 865 struct sge_params *sp = &sc->params.sge; 866 int i, j, n; 867 static int sw_buf_sizes[] = { /* Sorted by size */ 868 MCLBYTES, 869 MJUMPAGESIZE, 870 MJUM9BYTES, 871 MJUM16BYTES 872 }; 873 struct rx_buf_info *rxb; 874 875 s->safe_zidx = -1; 876 rxb = &s->rx_buf_info[0]; 877 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { 878 rxb->size1 = sw_buf_sizes[i]; 879 rxb->zone = m_getzone(rxb->size1); 880 rxb->type = m_gettype(rxb->size1); 881 rxb->size2 = 0; 882 rxb->hwidx1 = -1; 883 rxb->hwidx2 = -1; 884 for (j = 0; j < SGE_FLBUF_SIZES; j++) { 885 int hwsize = sp->sge_fl_buffer_size[j]; 886 887 if (!hwsz_ok(sc, hwsize)) 888 continue; 889 890 /* hwidx for size1 */ 891 if (rxb->hwidx1 == -1 && rxb->size1 == hwsize) 892 rxb->hwidx1 = j; 893 894 /* hwidx for size2 (buffer packing) */ 895 if (rxb->size1 - CL_METADATA_SIZE < hwsize) 896 continue; 897 n = rxb->size1 - hwsize - CL_METADATA_SIZE; 898 if (n == 0) { 899 rxb->hwidx2 = j; 900 rxb->size2 = hwsize; 901 break; /* stop looking */ 902 } 903 if (rxb->hwidx2 != -1) { 904 if (n < sp->sge_fl_buffer_size[rxb->hwidx2] - 905 hwsize - CL_METADATA_SIZE) { 906 rxb->hwidx2 = j; 907 rxb->size2 = hwsize; 908 } 909 } else if (n <= 2 * CL_METADATA_SIZE) { 910 rxb->hwidx2 = j; 911 rxb->size2 = hwsize; 912 } 913 } 914 if (rxb->hwidx2 != -1) 915 sc->flags |= BUF_PACKING_OK; 916 if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster) 917 s->safe_zidx = i; 918 } 919 } 920 921 /* 922 * Verify some basic SGE settings for the PF and VF driver, and other 923 * miscellaneous settings for the PF driver. 924 */ 925 int 926 t4_verify_chip_settings(struct adapter *sc) 927 { 928 struct sge_params *sp = &sc->params.sge; 929 uint32_t m, v, r; 930 int rc = 0; 931 const uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); 932 933 m = F_RXPKTCPLMODE; 934 v = F_RXPKTCPLMODE; 935 r = sp->sge_control; 936 if ((r & m) != v) { 937 device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); 938 rc = EINVAL; 939 } 940 941 /* 942 * If this changes then every single use of PAGE_SHIFT in the driver 943 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift. 
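 * In other words, the driver currently assumes that the page size programmed
 * into the chip matches the kernel's PAGE_SIZE; a mismatch is reported as a
 * configuration error (EINVAL) below.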
944 */ 945 if (sp->page_shift != PAGE_SHIFT) { 946 device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); 947 rc = EINVAL; 948 } 949 950 if (sc->flags & IS_VF) 951 return (0); 952 953 v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); 954 r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); 955 if (r != v) { 956 device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); 957 if (sc->vres.ddp.size != 0) 958 rc = EINVAL; 959 } 960 961 m = v = F_TDDPTAGTCB; 962 r = t4_read_reg(sc, A_ULP_RX_CTL); 963 if ((r & m) != v) { 964 device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); 965 if (sc->vres.ddp.size != 0) 966 rc = EINVAL; 967 } 968 969 m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | 970 F_RESETDDPOFFSET; 971 v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; 972 r = t4_read_reg(sc, A_TP_PARA_REG5); 973 if ((r & m) != v) { 974 device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); 975 if (sc->vres.ddp.size != 0) 976 rc = EINVAL; 977 } 978 979 return (rc); 980 } 981 982 int 983 t4_create_dma_tag(struct adapter *sc) 984 { 985 int rc; 986 987 rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, 988 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, 989 BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, 990 NULL, &sc->dmat); 991 if (rc != 0) { 992 device_printf(sc->dev, 993 "failed to create main DMA tag: %d\n", rc); 994 } 995 996 return (rc); 997 } 998 999 void 1000 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, 1001 struct sysctl_oid_list *children) 1002 { 1003 struct sge_params *sp = &sc->params.sge; 1004 1005 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", 1006 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1007 sysctl_bufsizes, "A", "freelist buffer sizes"); 1008 1009 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, 1010 NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)"); 1011 1012 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, 1013 NULL, sp->pad_boundary, "payload pad boundary (bytes)"); 1014 1015 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, 1016 NULL, sp->spg_len, "status page size (bytes)"); 1017 1018 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, 1019 NULL, cong_drop, "congestion drop setting"); 1020 #ifdef TCP_OFFLOAD 1021 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ofld_cong_drop", CTLFLAG_RD, 1022 NULL, ofld_cong_drop, "congestion drop setting"); 1023 #endif 1024 1025 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, 1026 NULL, sp->pack_boundary, "payload pack boundary (bytes)"); 1027 } 1028 1029 int 1030 t4_destroy_dma_tag(struct adapter *sc) 1031 { 1032 if (sc->dmat) 1033 bus_dma_tag_destroy(sc->dmat); 1034 1035 return (0); 1036 } 1037 1038 /* 1039 * Allocate and initialize the firmware event queue, control queues, and special 1040 * purpose rx queues owned by the adapter. 1041 * 1042 * Returns errno on failure. Resources allocated up to that point may still be 1043 * allocated. Caller is responsible for cleanup in case this function fails. 1044 */ 1045 int 1046 t4_setup_adapter_queues(struct adapter *sc) 1047 { 1048 int rc, i; 1049 1050 ADAPTER_LOCK_ASSERT_NOTOWNED(sc); 1051 1052 /* 1053 * Firmware event queue 1054 */ 1055 rc = alloc_fwq(sc); 1056 if (rc != 0) 1057 return (rc); 1058 1059 /* 1060 * That's all for the VF driver. 1061 */ 1062 if (sc->flags & IS_VF) 1063 return (rc); 1064 1065 /* 1066 * XXX: General purpose rx queues, one per port. 
1067 */ 1068 1069 /* 1070 * Control queues, one per port. 1071 */ 1072 for_each_port(sc, i) { 1073 rc = alloc_ctrlq(sc, i); 1074 if (rc != 0) 1075 return (rc); 1076 } 1077 1078 return (rc); 1079 } 1080 1081 /* 1082 * Idempotent 1083 */ 1084 int 1085 t4_teardown_adapter_queues(struct adapter *sc) 1086 { 1087 int i; 1088 1089 ADAPTER_LOCK_ASSERT_NOTOWNED(sc); 1090 1091 if (sc->sge.ctrlq != NULL) { 1092 MPASS(!(sc->flags & IS_VF)); /* VFs don't allocate ctrlq. */ 1093 for_each_port(sc, i) 1094 free_ctrlq(sc, i); 1095 } 1096 free_fwq(sc); 1097 1098 return (0); 1099 } 1100 1101 /* Maximum payload that could arrive with a single iq descriptor. */ 1102 static inline int 1103 max_rx_payload(struct adapter *sc, struct ifnet *ifp, const bool ofld) 1104 { 1105 int maxp; 1106 1107 /* large enough even when hw VLAN extraction is disabled */ 1108 maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN + 1109 ETHER_VLAN_ENCAP_LEN + ifp->if_mtu; 1110 if (ofld && sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS && 1111 maxp < sc->params.tp.max_rx_pdu) 1112 maxp = sc->params.tp.max_rx_pdu; 1113 return (maxp); 1114 } 1115 1116 int 1117 t4_setup_vi_queues(struct vi_info *vi) 1118 { 1119 int rc = 0, i, intr_idx; 1120 struct sge_rxq *rxq; 1121 struct sge_txq *txq; 1122 #ifdef TCP_OFFLOAD 1123 struct sge_ofld_rxq *ofld_rxq; 1124 #endif 1125 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 1126 struct sge_ofld_txq *ofld_txq; 1127 #endif 1128 #ifdef DEV_NETMAP 1129 int saved_idx, iqidx; 1130 struct sge_nm_rxq *nm_rxq; 1131 struct sge_nm_txq *nm_txq; 1132 #endif 1133 struct adapter *sc = vi->adapter; 1134 struct ifnet *ifp = vi->ifp; 1135 int maxp; 1136 1137 /* Interrupt vector to start from (when using multiple vectors) */ 1138 intr_idx = vi->first_intr; 1139 1140 #ifdef DEV_NETMAP 1141 saved_idx = intr_idx; 1142 if (ifp->if_capabilities & IFCAP_NETMAP) { 1143 1144 /* netmap is supported with direct interrupts only. */ 1145 MPASS(!forwarding_intr_to_fwq(sc)); 1146 MPASS(vi->first_intr >= 0); 1147 1148 /* 1149 * We don't have buffers to back the netmap rx queues 1150 * right now so we create the queues in a way that 1151 * doesn't set off any congestion signal in the chip. 1152 */ 1153 for_each_nm_rxq(vi, i, nm_rxq) { 1154 rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i); 1155 if (rc != 0) 1156 goto done; 1157 intr_idx++; 1158 } 1159 1160 for_each_nm_txq(vi, i, nm_txq) { 1161 iqidx = vi->first_nm_rxq + (i % vi->nnmrxq); 1162 rc = alloc_nm_txq(vi, nm_txq, iqidx, i); 1163 if (rc != 0) 1164 goto done; 1165 } 1166 } 1167 1168 /* Normal rx queues and netmap rx queues share the same interrupts. */ 1169 intr_idx = saved_idx; 1170 #endif 1171 1172 /* 1173 * Allocate rx queues first because a default iqid is required when 1174 * creating a tx queue. 1175 */ 1176 maxp = max_rx_payload(sc, ifp, false); 1177 for_each_rxq(vi, i, rxq) { 1178 rc = alloc_rxq(vi, rxq, i, intr_idx, maxp); 1179 if (rc != 0) 1180 goto done; 1181 if (!forwarding_intr_to_fwq(sc)) 1182 intr_idx++; 1183 } 1184 #ifdef DEV_NETMAP 1185 if (ifp->if_capabilities & IFCAP_NETMAP) 1186 intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq); 1187 #endif 1188 #ifdef TCP_OFFLOAD 1189 maxp = max_rx_payload(sc, ifp, true); 1190 for_each_ofld_rxq(vi, i, ofld_rxq) { 1191 rc = alloc_ofld_rxq(vi, ofld_rxq, i, intr_idx, maxp); 1192 if (rc != 0) 1193 goto done; 1194 if (!forwarding_intr_to_fwq(sc)) 1195 intr_idx++; 1196 } 1197 #endif 1198 1199 /* 1200 * Now the tx queues. 
1201 */ 1202 for_each_txq(vi, i, txq) { 1203 rc = alloc_txq(vi, txq, i); 1204 if (rc != 0) 1205 goto done; 1206 } 1207 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 1208 for_each_ofld_txq(vi, i, ofld_txq) { 1209 rc = alloc_ofld_txq(vi, ofld_txq, i); 1210 if (rc != 0) 1211 goto done; 1212 } 1213 #endif 1214 done: 1215 if (rc) 1216 t4_teardown_vi_queues(vi); 1217 1218 return (rc); 1219 } 1220 1221 /* 1222 * Idempotent 1223 */ 1224 int 1225 t4_teardown_vi_queues(struct vi_info *vi) 1226 { 1227 int i; 1228 struct sge_rxq *rxq; 1229 struct sge_txq *txq; 1230 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 1231 struct sge_ofld_txq *ofld_txq; 1232 #endif 1233 #ifdef TCP_OFFLOAD 1234 struct sge_ofld_rxq *ofld_rxq; 1235 #endif 1236 #ifdef DEV_NETMAP 1237 struct sge_nm_rxq *nm_rxq; 1238 struct sge_nm_txq *nm_txq; 1239 #endif 1240 1241 #ifdef DEV_NETMAP 1242 if (vi->ifp->if_capabilities & IFCAP_NETMAP) { 1243 for_each_nm_txq(vi, i, nm_txq) { 1244 free_nm_txq(vi, nm_txq); 1245 } 1246 1247 for_each_nm_rxq(vi, i, nm_rxq) { 1248 free_nm_rxq(vi, nm_rxq); 1249 } 1250 } 1251 #endif 1252 1253 /* 1254 * Take down all the tx queues first, as they reference the rx queues 1255 * (for egress updates, etc.). 1256 */ 1257 1258 for_each_txq(vi, i, txq) { 1259 free_txq(vi, txq); 1260 } 1261 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 1262 for_each_ofld_txq(vi, i, ofld_txq) { 1263 free_ofld_txq(vi, ofld_txq); 1264 } 1265 #endif 1266 1267 /* 1268 * Then take down the rx queues. 1269 */ 1270 1271 for_each_rxq(vi, i, rxq) { 1272 free_rxq(vi, rxq); 1273 } 1274 #ifdef TCP_OFFLOAD 1275 for_each_ofld_rxq(vi, i, ofld_rxq) { 1276 free_ofld_rxq(vi, ofld_rxq); 1277 } 1278 #endif 1279 1280 return (0); 1281 } 1282 1283 /* 1284 * Interrupt handler when the driver is using only 1 interrupt. This is a very 1285 * unusual scenario. 1286 * 1287 * a) Deals with errors, if any. 1288 * b) Services firmware event queue, which is taking interrupts for all other 1289 * queues. 1290 */ 1291 void 1292 t4_intr_all(void *arg) 1293 { 1294 struct adapter *sc = arg; 1295 struct sge_iq *fwq = &sc->sge.fwq; 1296 1297 MPASS(sc->intr_count == 1); 1298 1299 if (sc->intr_type == INTR_INTX) 1300 t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); 1301 1302 t4_intr_err(arg); 1303 t4_intr_evt(fwq); 1304 } 1305 1306 /* 1307 * Interrupt handler for errors (installed directly when multiple interrupts are 1308 * being used, or called by t4_intr_all). 1309 */ 1310 void 1311 t4_intr_err(void *arg) 1312 { 1313 struct adapter *sc = arg; 1314 uint32_t v; 1315 const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0; 1316 1317 if (atomic_load_int(&sc->error_flags) & ADAP_FATAL_ERR) 1318 return; 1319 1320 v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE)); 1321 if (v & F_PFSW) { 1322 sc->swintr++; 1323 t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v); 1324 } 1325 1326 if (t4_slow_intr_handler(sc, verbose)) 1327 t4_fatal_err(sc, false); 1328 } 1329 1330 /* 1331 * Interrupt handler for iq-only queues. The firmware event queue is the only 1332 * such queue right now. 1333 */ 1334 void 1335 t4_intr_evt(void *arg) 1336 { 1337 struct sge_iq *iq = arg; 1338 1339 if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { 1340 service_iq(iq, 0); 1341 (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); 1342 } 1343 } 1344 1345 /* 1346 * Interrupt handler for iq+fl queues. 
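 * The handler claims the queue by atomically moving it from IQS_IDLE to
 * IQS_BUSY before servicing it, so at most one thread works on a given iq at
 * a time; if the queue is already busy the interrupt is simply ignored.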
1347 */ 1348 void 1349 t4_intr(void *arg) 1350 { 1351 struct sge_iq *iq = arg; 1352 1353 if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { 1354 service_iq_fl(iq, 0); 1355 (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); 1356 } 1357 } 1358 1359 #ifdef DEV_NETMAP 1360 /* 1361 * Interrupt handler for netmap rx queues. 1362 */ 1363 void 1364 t4_nm_intr(void *arg) 1365 { 1366 struct sge_nm_rxq *nm_rxq = arg; 1367 1368 if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) { 1369 service_nm_rxq(nm_rxq); 1370 (void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON); 1371 } 1372 } 1373 1374 /* 1375 * Interrupt handler for vectors shared between NIC and netmap rx queues. 1376 */ 1377 void 1378 t4_vi_intr(void *arg) 1379 { 1380 struct irq *irq = arg; 1381 1382 MPASS(irq->nm_rxq != NULL); 1383 t4_nm_intr(irq->nm_rxq); 1384 1385 MPASS(irq->rxq != NULL); 1386 t4_intr(irq->rxq); 1387 } 1388 #endif 1389 1390 /* 1391 * Deals with interrupts on an iq-only (no freelist) queue. 1392 */ 1393 static int 1394 service_iq(struct sge_iq *iq, int budget) 1395 { 1396 struct sge_iq *q; 1397 struct adapter *sc = iq->adapter; 1398 struct iq_desc *d = &iq->desc[iq->cidx]; 1399 int ndescs = 0, limit; 1400 int rsp_type; 1401 uint32_t lq; 1402 STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); 1403 1404 KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); 1405 KASSERT((iq->flags & IQ_HAS_FL) == 0, 1406 ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq, 1407 iq->flags)); 1408 MPASS((iq->flags & IQ_ADJ_CREDIT) == 0); 1409 MPASS((iq->flags & IQ_LRO_ENABLED) == 0); 1410 1411 limit = budget ? budget : iq->qsize / 16; 1412 1413 /* 1414 * We always come back and check the descriptor ring for new indirect 1415 * interrupts and other responses after running a single handler. 1416 */ 1417 for (;;) { 1418 while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { 1419 1420 rmb(); 1421 1422 rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); 1423 lq = be32toh(d->rsp.pldbuflen_qid); 1424 1425 switch (rsp_type) { 1426 case X_RSPD_TYPE_FLBUF: 1427 panic("%s: data for an iq (%p) with no freelist", 1428 __func__, iq); 1429 1430 /* NOTREACHED */ 1431 1432 case X_RSPD_TYPE_CPL: 1433 KASSERT(d->rss.opcode < NUM_CPL_CMDS, 1434 ("%s: bad opcode %02x.", __func__, 1435 d->rss.opcode)); 1436 t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL); 1437 break; 1438 1439 case X_RSPD_TYPE_INTR: 1440 /* 1441 * There are 1K interrupt-capable queues (qids 0 1442 * through 1023). A response type indicating a 1443 * forwarded interrupt with a qid >= 1K is an 1444 * iWARP async notification. 
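				 * A forwarded interrupt with a qid below 1K
				 * refers to one of our own queues: it is
				 * looked up in sc->sge.iqmap (after adjusting
				 * for iq_start/iq_base) and serviced here.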
1445 */ 1446 if (__predict_true(lq >= 1024)) { 1447 t4_an_handler(iq, &d->rsp); 1448 break; 1449 } 1450 1451 q = sc->sge.iqmap[lq - sc->sge.iq_start - 1452 sc->sge.iq_base]; 1453 if (atomic_cmpset_int(&q->state, IQS_IDLE, 1454 IQS_BUSY)) { 1455 if (service_iq_fl(q, q->qsize / 16) == 0) { 1456 (void) atomic_cmpset_int(&q->state, 1457 IQS_BUSY, IQS_IDLE); 1458 } else { 1459 STAILQ_INSERT_TAIL(&iql, q, 1460 link); 1461 } 1462 } 1463 break; 1464 1465 default: 1466 KASSERT(0, 1467 ("%s: illegal response type %d on iq %p", 1468 __func__, rsp_type, iq)); 1469 log(LOG_ERR, 1470 "%s: illegal response type %d on iq %p", 1471 device_get_nameunit(sc->dev), rsp_type, iq); 1472 break; 1473 } 1474 1475 d++; 1476 if (__predict_false(++iq->cidx == iq->sidx)) { 1477 iq->cidx = 0; 1478 iq->gen ^= F_RSPD_GEN; 1479 d = &iq->desc[0]; 1480 } 1481 if (__predict_false(++ndescs == limit)) { 1482 t4_write_reg(sc, sc->sge_gts_reg, 1483 V_CIDXINC(ndescs) | 1484 V_INGRESSQID(iq->cntxt_id) | 1485 V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); 1486 ndescs = 0; 1487 1488 if (budget) { 1489 return (EINPROGRESS); 1490 } 1491 } 1492 } 1493 1494 if (STAILQ_EMPTY(&iql)) 1495 break; 1496 1497 /* 1498 * Process the head only, and send it to the back of the list if 1499 * it's still not done. 1500 */ 1501 q = STAILQ_FIRST(&iql); 1502 STAILQ_REMOVE_HEAD(&iql, link); 1503 if (service_iq_fl(q, q->qsize / 8) == 0) 1504 (void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); 1505 else 1506 STAILQ_INSERT_TAIL(&iql, q, link); 1507 } 1508 1509 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | 1510 V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); 1511 1512 return (0); 1513 } 1514 1515 #if defined(INET) || defined(INET6) 1516 static inline int 1517 sort_before_lro(struct lro_ctrl *lro) 1518 { 1519 1520 return (lro->lro_mbuf_max != 0); 1521 } 1522 #endif 1523 1524 #define CGBE_SHIFT_SCALE 10 1525 1526 static inline uint64_t 1527 t4_tstmp_to_ns(struct adapter *sc, uint64_t lf) 1528 { 1529 struct clock_sync *cur, dcur; 1530 uint64_t hw_clocks; 1531 uint64_t hw_clk_div; 1532 sbintime_t sbt_cur_to_prev, sbt; 1533 uint64_t hw_tstmp = lf & 0xfffffffffffffffULL; /* 60b, not 64b. */ 1534 seqc_t gen; 1535 1536 for (;;) { 1537 cur = &sc->cal_info[sc->cal_current]; 1538 gen = seqc_read(&cur->gen); 1539 if (gen == 0) 1540 return (0); 1541 dcur = *cur; 1542 if (seqc_consistent(&cur->gen, gen)) 1543 break; 1544 } 1545 1546 /* 1547 * Our goal here is to have a result that is: 1548 * 1549 * ( (cur_time - prev_time) ) 1550 * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time 1551 * ( (hw_cur - hw_prev) ) 1552 * 1553 * With the constraints that we cannot use float and we 1554 * don't want to overflow the uint64_t numbers we are using. 1555 */ 1556 hw_clocks = hw_tstmp - dcur.hw_prev; 1557 sbt_cur_to_prev = (dcur.sbt_cur - dcur.sbt_prev); 1558 hw_clk_div = dcur.hw_cur - dcur.hw_prev; 1559 sbt = hw_clocks * sbt_cur_to_prev / hw_clk_div + dcur.sbt_prev; 1560 return (sbttons(sbt)); 1561 } 1562 1563 static inline void 1564 move_to_next_rxbuf(struct sge_fl *fl) 1565 { 1566 1567 fl->rx_offset = 0; 1568 if (__predict_false((++fl->cidx & 7) == 0)) { 1569 uint16_t cidx = fl->cidx >> 3; 1570 1571 if (__predict_false(cidx == fl->sidx)) 1572 fl->cidx = cidx = 0; 1573 fl->hw_cidx = cidx; 1574 } 1575 } 1576 1577 /* 1578 * Deals with interrupts on an iq+fl queue. 
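 * A non-zero 'budget' caps the number of descriptors processed in one call;
 * EINPROGRESS is returned if that cap is reached and more work may remain.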
1579 */ 1580 static int 1581 service_iq_fl(struct sge_iq *iq, int budget) 1582 { 1583 struct sge_rxq *rxq = iq_to_rxq(iq); 1584 struct sge_fl *fl; 1585 struct adapter *sc = iq->adapter; 1586 struct iq_desc *d = &iq->desc[iq->cidx]; 1587 int ndescs, limit; 1588 int rsp_type, starved; 1589 uint32_t lq; 1590 uint16_t fl_hw_cidx; 1591 struct mbuf *m0; 1592 #if defined(INET) || defined(INET6) 1593 const struct timeval lro_timeout = {0, sc->lro_timeout}; 1594 struct lro_ctrl *lro = &rxq->lro; 1595 #endif 1596 1597 KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); 1598 MPASS(iq->flags & IQ_HAS_FL); 1599 1600 ndescs = 0; 1601 #if defined(INET) || defined(INET6) 1602 if (iq->flags & IQ_ADJ_CREDIT) { 1603 MPASS(sort_before_lro(lro)); 1604 iq->flags &= ~IQ_ADJ_CREDIT; 1605 if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) { 1606 tcp_lro_flush_all(lro); 1607 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) | 1608 V_INGRESSQID((u32)iq->cntxt_id) | 1609 V_SEINTARM(iq->intr_params)); 1610 return (0); 1611 } 1612 ndescs = 1; 1613 } 1614 #else 1615 MPASS((iq->flags & IQ_ADJ_CREDIT) == 0); 1616 #endif 1617 1618 limit = budget ? budget : iq->qsize / 16; 1619 fl = &rxq->fl; 1620 fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ 1621 while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { 1622 1623 rmb(); 1624 1625 m0 = NULL; 1626 rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); 1627 lq = be32toh(d->rsp.pldbuflen_qid); 1628 1629 switch (rsp_type) { 1630 case X_RSPD_TYPE_FLBUF: 1631 if (lq & F_RSPD_NEWBUF) { 1632 if (fl->rx_offset > 0) 1633 move_to_next_rxbuf(fl); 1634 lq = G_RSPD_LEN(lq); 1635 } 1636 if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) { 1637 FL_LOCK(fl); 1638 refill_fl(sc, fl, 64); 1639 FL_UNLOCK(fl); 1640 fl_hw_cidx = fl->hw_cidx; 1641 } 1642 1643 if (d->rss.opcode == CPL_RX_PKT) { 1644 if (__predict_true(eth_rx(sc, rxq, d, lq) == 0)) 1645 break; 1646 goto out; 1647 } 1648 m0 = get_fl_payload(sc, fl, lq); 1649 if (__predict_false(m0 == NULL)) 1650 goto out; 1651 1652 /* fall through */ 1653 1654 case X_RSPD_TYPE_CPL: 1655 KASSERT(d->rss.opcode < NUM_CPL_CMDS, 1656 ("%s: bad opcode %02x.", __func__, d->rss.opcode)); 1657 t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0); 1658 break; 1659 1660 case X_RSPD_TYPE_INTR: 1661 1662 /* 1663 * There are 1K interrupt-capable queues (qids 0 1664 * through 1023). A response type indicating a 1665 * forwarded interrupt with a qid >= 1K is an 1666 * iWARP async notification. That is the only 1667 * acceptable indirect interrupt on this queue. 
1668 */ 1669 if (__predict_false(lq < 1024)) { 1670 panic("%s: indirect interrupt on iq_fl %p " 1671 "with qid %u", __func__, iq, lq); 1672 } 1673 1674 t4_an_handler(iq, &d->rsp); 1675 break; 1676 1677 default: 1678 KASSERT(0, ("%s: illegal response type %d on iq %p", 1679 __func__, rsp_type, iq)); 1680 log(LOG_ERR, "%s: illegal response type %d on iq %p", 1681 device_get_nameunit(sc->dev), rsp_type, iq); 1682 break; 1683 } 1684 1685 d++; 1686 if (__predict_false(++iq->cidx == iq->sidx)) { 1687 iq->cidx = 0; 1688 iq->gen ^= F_RSPD_GEN; 1689 d = &iq->desc[0]; 1690 } 1691 if (__predict_false(++ndescs == limit)) { 1692 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | 1693 V_INGRESSQID(iq->cntxt_id) | 1694 V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); 1695 1696 #if defined(INET) || defined(INET6) 1697 if (iq->flags & IQ_LRO_ENABLED && 1698 !sort_before_lro(lro) && 1699 sc->lro_timeout != 0) { 1700 tcp_lro_flush_inactive(lro, &lro_timeout); 1701 } 1702 #endif 1703 if (budget) 1704 return (EINPROGRESS); 1705 ndescs = 0; 1706 } 1707 } 1708 out: 1709 #if defined(INET) || defined(INET6) 1710 if (iq->flags & IQ_LRO_ENABLED) { 1711 if (ndescs > 0 && lro->lro_mbuf_count > 8) { 1712 MPASS(sort_before_lro(lro)); 1713 /* hold back one credit and don't flush LRO state */ 1714 iq->flags |= IQ_ADJ_CREDIT; 1715 ndescs--; 1716 } else { 1717 tcp_lro_flush_all(lro); 1718 } 1719 } 1720 #endif 1721 1722 t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | 1723 V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); 1724 1725 FL_LOCK(fl); 1726 starved = refill_fl(sc, fl, 64); 1727 FL_UNLOCK(fl); 1728 if (__predict_false(starved != 0)) 1729 add_fl_to_sfl(sc, fl); 1730 1731 return (0); 1732 } 1733 1734 static inline struct cluster_metadata * 1735 cl_metadata(struct fl_sdesc *sd) 1736 { 1737 1738 return ((void *)(sd->cl + sd->moff)); 1739 } 1740 1741 static void 1742 rxb_free(struct mbuf *m) 1743 { 1744 struct cluster_metadata *clm = m->m_ext.ext_arg1; 1745 1746 uma_zfree(clm->zone, clm->cl); 1747 counter_u64_add(extfree_rels, 1); 1748 } 1749 1750 /* 1751 * The mbuf returned comes from zone_muf and carries the payload in one of these 1752 * ways 1753 * a) complete frame inside the mbuf 1754 * b) m_cljset (for clusters without metadata) 1755 * d) m_extaddref (cluster with metadata) 1756 */ 1757 static struct mbuf * 1758 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, 1759 int remaining) 1760 { 1761 struct mbuf *m; 1762 struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; 1763 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx]; 1764 struct cluster_metadata *clm; 1765 int len, blen; 1766 caddr_t payload; 1767 1768 if (fl->flags & FL_BUF_PACKING) { 1769 u_int l, pad; 1770 1771 blen = rxb->size2 - fl->rx_offset; /* max possible in this buf */ 1772 len = min(remaining, blen); 1773 payload = sd->cl + fl->rx_offset; 1774 1775 l = fr_offset + len; 1776 pad = roundup2(l, fl->buf_boundary) - l; 1777 if (fl->rx_offset + len + pad < rxb->size2) 1778 blen = len + pad; 1779 MPASS(fl->rx_offset + blen <= rxb->size2); 1780 } else { 1781 MPASS(fl->rx_offset == 0); /* not packing */ 1782 blen = rxb->size1; 1783 len = min(remaining, blen); 1784 payload = sd->cl; 1785 } 1786 1787 if (fr_offset == 0) { 1788 m = m_gethdr(M_NOWAIT, MT_DATA); 1789 if (__predict_false(m == NULL)) 1790 return (NULL); 1791 m->m_pkthdr.len = remaining; 1792 } else { 1793 m = m_get(M_NOWAIT, MT_DATA); 1794 if (__predict_false(m == NULL)) 1795 return (NULL); 1796 } 1797 m->m_len = len; 1798 kmsan_mark(payload, len, 
KMSAN_STATE_INITED); 1799 1800 if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { 1801 /* copy data to mbuf */ 1802 bcopy(payload, mtod(m, caddr_t), len); 1803 if (fl->flags & FL_BUF_PACKING) { 1804 fl->rx_offset += blen; 1805 MPASS(fl->rx_offset <= rxb->size2); 1806 if (fl->rx_offset < rxb->size2) 1807 return (m); /* without advancing the cidx */ 1808 } 1809 } else if (fl->flags & FL_BUF_PACKING) { 1810 clm = cl_metadata(sd); 1811 if (sd->nmbuf++ == 0) { 1812 clm->refcount = 1; 1813 clm->zone = rxb->zone; 1814 clm->cl = sd->cl; 1815 counter_u64_add(extfree_refs, 1); 1816 } 1817 m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm, 1818 NULL); 1819 1820 fl->rx_offset += blen; 1821 MPASS(fl->rx_offset <= rxb->size2); 1822 if (fl->rx_offset < rxb->size2) 1823 return (m); /* without advancing the cidx */ 1824 } else { 1825 m_cljset(m, sd->cl, rxb->type); 1826 sd->cl = NULL; /* consumed, not a recycle candidate */ 1827 } 1828 1829 move_to_next_rxbuf(fl); 1830 1831 return (m); 1832 } 1833 1834 static struct mbuf * 1835 get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen) 1836 { 1837 struct mbuf *m0, *m, **pnext; 1838 u_int remaining; 1839 1840 if (__predict_false(fl->flags & FL_BUF_RESUME)) { 1841 M_ASSERTPKTHDR(fl->m0); 1842 MPASS(fl->m0->m_pkthdr.len == plen); 1843 MPASS(fl->remaining < plen); 1844 1845 m0 = fl->m0; 1846 pnext = fl->pnext; 1847 remaining = fl->remaining; 1848 fl->flags &= ~FL_BUF_RESUME; 1849 goto get_segment; 1850 } 1851 1852 /* 1853 * Payload starts at rx_offset in the current hw buffer. Its length is 1854 * 'len' and it may span multiple hw buffers. 1855 */ 1856 1857 m0 = get_scatter_segment(sc, fl, 0, plen); 1858 if (m0 == NULL) 1859 return (NULL); 1860 remaining = plen - m0->m_len; 1861 pnext = &m0->m_next; 1862 while (remaining > 0) { 1863 get_segment: 1864 MPASS(fl->rx_offset == 0); 1865 m = get_scatter_segment(sc, fl, plen - remaining, remaining); 1866 if (__predict_false(m == NULL)) { 1867 fl->m0 = m0; 1868 fl->pnext = pnext; 1869 fl->remaining = remaining; 1870 fl->flags |= FL_BUF_RESUME; 1871 return (NULL); 1872 } 1873 *pnext = m; 1874 pnext = &m->m_next; 1875 remaining -= m->m_len; 1876 } 1877 *pnext = NULL; 1878 1879 M_ASSERTPKTHDR(m0); 1880 return (m0); 1881 } 1882 1883 static int 1884 skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, 1885 int remaining) 1886 { 1887 struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; 1888 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx]; 1889 int len, blen; 1890 1891 if (fl->flags & FL_BUF_PACKING) { 1892 u_int l, pad; 1893 1894 blen = rxb->size2 - fl->rx_offset; /* max possible in this buf */ 1895 len = min(remaining, blen); 1896 1897 l = fr_offset + len; 1898 pad = roundup2(l, fl->buf_boundary) - l; 1899 if (fl->rx_offset + len + pad < rxb->size2) 1900 blen = len + pad; 1901 fl->rx_offset += blen; 1902 MPASS(fl->rx_offset <= rxb->size2); 1903 if (fl->rx_offset < rxb->size2) 1904 return (len); /* without advancing the cidx */ 1905 } else { 1906 MPASS(fl->rx_offset == 0); /* not packing */ 1907 blen = rxb->size1; 1908 len = min(remaining, blen); 1909 } 1910 move_to_next_rxbuf(fl); 1911 return (len); 1912 } 1913 1914 static inline void 1915 skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen) 1916 { 1917 int remaining, fr_offset, len; 1918 1919 fr_offset = 0; 1920 remaining = plen; 1921 while (remaining > 0) { 1922 len = skip_scatter_segment(sc, fl, fr_offset, remaining); 1923 fr_offset += len; 1924 remaining -= len; 1925 } 1926 } 1927 1928 static inline int 1929 
get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen) 1930 { 1931 int len; 1932 struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; 1933 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx]; 1934 1935 if (fl->flags & FL_BUF_PACKING) 1936 len = rxb->size2 - fl->rx_offset; 1937 else 1938 len = rxb->size1; 1939 1940 return (min(plen, len)); 1941 } 1942 1943 static int 1944 eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d, 1945 u_int plen) 1946 { 1947 struct mbuf *m0; 1948 struct ifnet *ifp = rxq->ifp; 1949 struct sge_fl *fl = &rxq->fl; 1950 struct vi_info *vi = ifp->if_softc; 1951 const struct cpl_rx_pkt *cpl; 1952 #if defined(INET) || defined(INET6) 1953 struct lro_ctrl *lro = &rxq->lro; 1954 #endif 1955 uint16_t err_vec, tnl_type, tnlhdr_len; 1956 static const int sw_hashtype[4][2] = { 1957 {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, 1958 {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, 1959 {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6}, 1960 {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6}, 1961 }; 1962 static const int sw_csum_flags[2][2] = { 1963 { 1964 /* IP, inner IP */ 1965 CSUM_ENCAP_VXLAN | 1966 CSUM_L3_CALC | CSUM_L3_VALID | 1967 CSUM_L4_CALC | CSUM_L4_VALID | 1968 CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | 1969 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, 1970 1971 /* IP, inner IP6 */ 1972 CSUM_ENCAP_VXLAN | 1973 CSUM_L3_CALC | CSUM_L3_VALID | 1974 CSUM_L4_CALC | CSUM_L4_VALID | 1975 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, 1976 }, 1977 { 1978 /* IP6, inner IP */ 1979 CSUM_ENCAP_VXLAN | 1980 CSUM_L4_CALC | CSUM_L4_VALID | 1981 CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | 1982 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, 1983 1984 /* IP6, inner IP6 */ 1985 CSUM_ENCAP_VXLAN | 1986 CSUM_L4_CALC | CSUM_L4_VALID | 1987 CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, 1988 }, 1989 }; 1990 1991 MPASS(plen > sc->params.sge.fl_pktshift); 1992 if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) && 1993 __predict_true((fl->flags & FL_BUF_RESUME) == 0)) { 1994 struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; 1995 caddr_t frame; 1996 int rc, slen; 1997 1998 slen = get_segment_len(sc, fl, plen) - 1999 sc->params.sge.fl_pktshift; 2000 frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift; 2001 CURVNET_SET_QUIET(ifp->if_vnet); 2002 rc = pfil_mem_in(vi->pfil, frame, slen, ifp, &m0); 2003 CURVNET_RESTORE(); 2004 if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) { 2005 skip_fl_payload(sc, fl, plen); 2006 return (0); 2007 } 2008 if (rc == PFIL_REALLOCED) { 2009 skip_fl_payload(sc, fl, plen); 2010 goto have_mbuf; 2011 } 2012 } 2013 2014 m0 = get_fl_payload(sc, fl, plen); 2015 if (__predict_false(m0 == NULL)) 2016 return (ENOMEM); 2017 2018 m0->m_pkthdr.len -= sc->params.sge.fl_pktshift; 2019 m0->m_len -= sc->params.sge.fl_pktshift; 2020 m0->m_data += sc->params.sge.fl_pktshift; 2021 2022 have_mbuf: 2023 m0->m_pkthdr.rcvif = ifp; 2024 M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]); 2025 m0->m_pkthdr.flowid = be32toh(d->rss.hash_val); 2026 2027 cpl = (const void *)(&d->rss + 1); 2028 if (sc->params.tp.rx_pkt_encap) { 2029 const uint16_t ev = be16toh(cpl->err_vec); 2030 2031 err_vec = G_T6_COMPR_RXERR_VEC(ev); 2032 tnl_type = G_T6_RX_TNL_TYPE(ev); 2033 tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev); 2034 } else { 2035 err_vec = be16toh(cpl->err_vec); 2036 tnl_type = 0; 2037 tnlhdr_len = 0; 2038 } 2039 if (cpl->csum_calc && err_vec == 0) { 2040 int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6)); 2041 2042 /* checksum(s) calculated and found to be correct. 
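 * The code below translates the hardware's verdict into mbuf csum_flags,
 * handling plain and VXLAN-encapsulated (tnl_type != 0) traffic separately.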
*/ 2043 2044 MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^ 2045 (cpl->l2info & htobe32(F_RXF_IP6))); 2046 m0->m_pkthdr.csum_data = be16toh(cpl->csum); 2047 if (tnl_type == 0) { 2048 if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) { 2049 m0->m_pkthdr.csum_flags = CSUM_L3_CALC | 2050 CSUM_L3_VALID | CSUM_L4_CALC | 2051 CSUM_L4_VALID; 2052 } else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) { 2053 m0->m_pkthdr.csum_flags = CSUM_L4_CALC | 2054 CSUM_L4_VALID; 2055 } 2056 rxq->rxcsum++; 2057 } else { 2058 MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN); 2059 2060 M_HASHTYPE_SETINNER(m0); 2061 if (__predict_false(cpl->ip_frag)) { 2062 /* 2063 * csum_data is for the inner frame (which is an 2064 * IP fragment) and is not 0xffff. There is no 2065 * way to pass the inner csum_data to the stack. 2066 * We don't want the stack to use the inner 2067 * csum_data to validate the outer frame or it 2068 * will get rejected. So we fix csum_data here 2069 * and let sw do the checksum of inner IP 2070 * fragments. 2071 * 2072 * XXX: Need 32b for csum_data2 in an rx mbuf. 2073 * Maybe stuff it into rcv_tstmp? 2074 */ 2075 m0->m_pkthdr.csum_data = 0xffff; 2076 if (ipv6) { 2077 m0->m_pkthdr.csum_flags = CSUM_L4_CALC | 2078 CSUM_L4_VALID; 2079 } else { 2080 m0->m_pkthdr.csum_flags = CSUM_L3_CALC | 2081 CSUM_L3_VALID | CSUM_L4_CALC | 2082 CSUM_L4_VALID; 2083 } 2084 } else { 2085 int outer_ipv6; 2086 2087 MPASS(m0->m_pkthdr.csum_data == 0xffff); 2088 2089 outer_ipv6 = tnlhdr_len >= 2090 sizeof(struct ether_header) + 2091 sizeof(struct ip6_hdr); 2092 m0->m_pkthdr.csum_flags = 2093 sw_csum_flags[outer_ipv6][ipv6]; 2094 } 2095 rxq->vxlan_rxcsum++; 2096 } 2097 } 2098 2099 if (cpl->vlan_ex) { 2100 m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); 2101 m0->m_flags |= M_VLANTAG; 2102 rxq->vlan_extraction++; 2103 } 2104 2105 if (rxq->iq.flags & IQ_RX_TIMESTAMP) { 2106 /* 2107 * Fill up rcv_tstmp but do not set M_TSTMP as 2108 * long as we get a non-zero back from t4_tstmp_to_ns(). 2109 */ 2110 m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc, 2111 be64toh(d->rsp.u.last_flit)); 2112 if (m0->m_pkthdr.rcv_tstmp != 0) 2113 m0->m_flags |= M_TSTMP; 2114 } 2115 2116 #ifdef NUMA 2117 m0->m_pkthdr.numa_domain = ifp->if_numa_domain; 2118 #endif 2119 #if defined(INET) || defined(INET6) 2120 if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 && 2121 (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 || 2122 M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) { 2123 if (sort_before_lro(lro)) { 2124 tcp_lro_queue_mbuf(lro, m0); 2125 return (0); /* queued for sort, then LRO */ 2126 } 2127 if (tcp_lro_rx(lro, m0, 0) == 0) 2128 return (0); /* queued for LRO */ 2129 } 2130 #endif 2131 ifp->if_input(ifp, m0); 2132 2133 return (0); 2134 } 2135 2136 /* 2137 * Must drain the wrq or make sure that someone else will. 
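 * This is the wrq's deferred tx handler (note the taskqueue callback
 * signature).  It drains the pending WR list only when no partially
 * committed WR is outstanding; otherwise commit_wrq_wr drains the list.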
2138 */ 2139 static void 2140 wrq_tx_drain(void *arg, int n) 2141 { 2142 struct sge_wrq *wrq = arg; 2143 struct sge_eq *eq = &wrq->eq; 2144 2145 EQ_LOCK(eq); 2146 if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) 2147 drain_wrq_wr_list(wrq->adapter, wrq); 2148 EQ_UNLOCK(eq); 2149 } 2150 2151 static void 2152 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) 2153 { 2154 struct sge_eq *eq = &wrq->eq; 2155 u_int available, dbdiff; /* # of hardware descriptors */ 2156 u_int n; 2157 struct wrqe *wr; 2158 struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ 2159 2160 EQ_LOCK_ASSERT_OWNED(eq); 2161 MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); 2162 wr = STAILQ_FIRST(&wrq->wr_list); 2163 MPASS(wr != NULL); /* Must be called with something useful to do */ 2164 MPASS(eq->pidx == eq->dbidx); 2165 dbdiff = 0; 2166 2167 do { 2168 eq->cidx = read_hw_cidx(eq); 2169 if (eq->pidx == eq->cidx) 2170 available = eq->sidx - 1; 2171 else 2172 available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; 2173 2174 MPASS(wr->wrq == wrq); 2175 n = howmany(wr->wr_len, EQ_ESIZE); 2176 if (available < n) 2177 break; 2178 2179 dst = (void *)&eq->desc[eq->pidx]; 2180 if (__predict_true(eq->sidx - eq->pidx > n)) { 2181 /* Won't wrap, won't end exactly at the status page. */ 2182 bcopy(&wr->wr[0], dst, wr->wr_len); 2183 eq->pidx += n; 2184 } else { 2185 int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; 2186 2187 bcopy(&wr->wr[0], dst, first_portion); 2188 if (wr->wr_len > first_portion) { 2189 bcopy(&wr->wr[first_portion], &eq->desc[0], 2190 wr->wr_len - first_portion); 2191 } 2192 eq->pidx = n - (eq->sidx - eq->pidx); 2193 } 2194 wrq->tx_wrs_copied++; 2195 2196 if (available < eq->sidx / 4 && 2197 atomic_cmpset_int(&eq->equiq, 0, 1)) { 2198 /* 2199 * XXX: This is not 100% reliable with some 2200 * types of WRs. But this is a very unusual 2201 * situation for an ofld/ctrl queue anyway. 2202 */ 2203 dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | 2204 F_FW_WR_EQUEQ); 2205 } 2206 2207 dbdiff += n; 2208 if (dbdiff >= 16) { 2209 ring_eq_db(sc, eq, dbdiff); 2210 dbdiff = 0; 2211 } 2212 2213 STAILQ_REMOVE_HEAD(&wrq->wr_list, link); 2214 free_wrqe(wr); 2215 MPASS(wrq->nwr_pending > 0); 2216 wrq->nwr_pending--; 2217 MPASS(wrq->ndesc_needed >= n); 2218 wrq->ndesc_needed -= n; 2219 } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); 2220 2221 if (dbdiff) 2222 ring_eq_db(sc, eq, dbdiff); 2223 } 2224 2225 /* 2226 * Doesn't fail. Holds on to work requests it can't send right away. 2227 */ 2228 void 2229 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) 2230 { 2231 #ifdef INVARIANTS 2232 struct sge_eq *eq = &wrq->eq; 2233 #endif 2234 2235 EQ_LOCK_ASSERT_OWNED(eq); 2236 MPASS(wr != NULL); 2237 MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); 2238 MPASS((wr->wr_len & 0x7) == 0); 2239 2240 STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); 2241 wrq->nwr_pending++; 2242 wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); 2243 2244 if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) 2245 return; /* commit_wrq_wr will drain wr_list as well. */ 2246 2247 drain_wrq_wr_list(sc, wrq); 2248 2249 /* Doorbell must have caught up to the pidx. 
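 * (drain_wrq_wr_list rings the doorbell for everything it copies into
 * the descriptor ring, so dbidx has caught up with pidx here.)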
*/ 2250 MPASS(eq->pidx == eq->dbidx); 2251 } 2252 2253 void 2254 t4_update_fl_bufsize(struct ifnet *ifp) 2255 { 2256 struct vi_info *vi = ifp->if_softc; 2257 struct adapter *sc = vi->adapter; 2258 struct sge_rxq *rxq; 2259 #ifdef TCP_OFFLOAD 2260 struct sge_ofld_rxq *ofld_rxq; 2261 #endif 2262 struct sge_fl *fl; 2263 int i, maxp; 2264 2265 maxp = max_rx_payload(sc, ifp, false); 2266 for_each_rxq(vi, i, rxq) { 2267 fl = &rxq->fl; 2268 2269 FL_LOCK(fl); 2270 fl->zidx = find_refill_source(sc, maxp, 2271 fl->flags & FL_BUF_PACKING); 2272 FL_UNLOCK(fl); 2273 } 2274 #ifdef TCP_OFFLOAD 2275 maxp = max_rx_payload(sc, ifp, true); 2276 for_each_ofld_rxq(vi, i, ofld_rxq) { 2277 fl = &ofld_rxq->fl; 2278 2279 FL_LOCK(fl); 2280 fl->zidx = find_refill_source(sc, maxp, 2281 fl->flags & FL_BUF_PACKING); 2282 FL_UNLOCK(fl); 2283 } 2284 #endif 2285 } 2286 2287 static inline int 2288 mbuf_nsegs(struct mbuf *m) 2289 { 2290 2291 M_ASSERTPKTHDR(m); 2292 KASSERT(m->m_pkthdr.inner_l5hlen > 0, 2293 ("%s: mbuf %p missing information on # of segments.", __func__, m)); 2294 2295 return (m->m_pkthdr.inner_l5hlen); 2296 } 2297 2298 static inline void 2299 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) 2300 { 2301 2302 M_ASSERTPKTHDR(m); 2303 m->m_pkthdr.inner_l5hlen = nsegs; 2304 } 2305 2306 static inline int 2307 mbuf_cflags(struct mbuf *m) 2308 { 2309 2310 M_ASSERTPKTHDR(m); 2311 return (m->m_pkthdr.PH_loc.eight[4]); 2312 } 2313 2314 static inline void 2315 set_mbuf_cflags(struct mbuf *m, uint8_t flags) 2316 { 2317 2318 M_ASSERTPKTHDR(m); 2319 m->m_pkthdr.PH_loc.eight[4] = flags; 2320 } 2321 2322 static inline int 2323 mbuf_len16(struct mbuf *m) 2324 { 2325 int n; 2326 2327 M_ASSERTPKTHDR(m); 2328 n = m->m_pkthdr.PH_loc.eight[0]; 2329 if (!(mbuf_cflags(m) & MC_TLS)) 2330 MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); 2331 2332 return (n); 2333 } 2334 2335 static inline void 2336 set_mbuf_len16(struct mbuf *m, uint8_t len16) 2337 { 2338 2339 M_ASSERTPKTHDR(m); 2340 if (!(mbuf_cflags(m) & MC_TLS)) 2341 MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16); 2342 m->m_pkthdr.PH_loc.eight[0] = len16; 2343 } 2344 2345 #ifdef RATELIMIT 2346 static inline int 2347 mbuf_eo_nsegs(struct mbuf *m) 2348 { 2349 2350 M_ASSERTPKTHDR(m); 2351 return (m->m_pkthdr.PH_loc.eight[1]); 2352 } 2353 2354 #if defined(INET) || defined(INET6) 2355 static inline void 2356 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs) 2357 { 2358 2359 M_ASSERTPKTHDR(m); 2360 m->m_pkthdr.PH_loc.eight[1] = nsegs; 2361 } 2362 #endif 2363 2364 static inline int 2365 mbuf_eo_len16(struct mbuf *m) 2366 { 2367 int n; 2368 2369 M_ASSERTPKTHDR(m); 2370 n = m->m_pkthdr.PH_loc.eight[2]; 2371 MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); 2372 2373 return (n); 2374 } 2375 2376 #if defined(INET) || defined(INET6) 2377 static inline void 2378 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16) 2379 { 2380 2381 M_ASSERTPKTHDR(m); 2382 m->m_pkthdr.PH_loc.eight[2] = len16; 2383 } 2384 #endif 2385 2386 static inline int 2387 mbuf_eo_tsclk_tsoff(struct mbuf *m) 2388 { 2389 2390 M_ASSERTPKTHDR(m); 2391 return (m->m_pkthdr.PH_loc.eight[3]); 2392 } 2393 2394 #if defined(INET) || defined(INET6) 2395 static inline void 2396 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff) 2397 { 2398 2399 M_ASSERTPKTHDR(m); 2400 m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff; 2401 } 2402 #endif 2403 2404 static inline int 2405 needs_eo(struct m_snd_tag *mst) 2406 { 2407 2408 return (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_RATE_LIMIT); 2409 } 2410 #endif 2411 2412 /* 2413 * Try to allocate an mbuf to 
contain a raw work request. To make it 2414 * easy to construct the work request, don't allocate a chain but a 2415 * single mbuf. 2416 */ 2417 struct mbuf * 2418 alloc_wr_mbuf(int len, int how) 2419 { 2420 struct mbuf *m; 2421 2422 if (len <= MHLEN) 2423 m = m_gethdr(how, MT_DATA); 2424 else if (len <= MCLBYTES) 2425 m = m_getcl(how, MT_DATA, M_PKTHDR); 2426 else 2427 m = NULL; 2428 if (m == NULL) 2429 return (NULL); 2430 m->m_pkthdr.len = len; 2431 m->m_len = len; 2432 set_mbuf_cflags(m, MC_RAW_WR); 2433 set_mbuf_len16(m, howmany(len, 16)); 2434 return (m); 2435 } 2436 2437 static inline bool 2438 needs_hwcsum(struct mbuf *m) 2439 { 2440 const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | 2441 CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | 2442 CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP | 2443 CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP | 2444 CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO; 2445 2446 M_ASSERTPKTHDR(m); 2447 2448 return (m->m_pkthdr.csum_flags & csum_flags); 2449 } 2450 2451 static inline bool 2452 needs_tso(struct mbuf *m) 2453 { 2454 const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO | 2455 CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; 2456 2457 M_ASSERTPKTHDR(m); 2458 2459 return (m->m_pkthdr.csum_flags & csum_flags); 2460 } 2461 2462 static inline bool 2463 needs_vxlan_csum(struct mbuf *m) 2464 { 2465 2466 M_ASSERTPKTHDR(m); 2467 2468 return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN); 2469 } 2470 2471 static inline bool 2472 needs_vxlan_tso(struct mbuf *m) 2473 { 2474 const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO | 2475 CSUM_INNER_IP6_TSO; 2476 2477 M_ASSERTPKTHDR(m); 2478 2479 return ((m->m_pkthdr.csum_flags & csum_flags) != 0 && 2480 (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN); 2481 } 2482 2483 #if defined(INET) || defined(INET6) 2484 static inline bool 2485 needs_inner_tcp_csum(struct mbuf *m) 2486 { 2487 const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; 2488 2489 M_ASSERTPKTHDR(m); 2490 2491 return (m->m_pkthdr.csum_flags & csum_flags); 2492 } 2493 #endif 2494 2495 static inline bool 2496 needs_l3_csum(struct mbuf *m) 2497 { 2498 const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP | 2499 CSUM_INNER_IP_TSO; 2500 2501 M_ASSERTPKTHDR(m); 2502 2503 return (m->m_pkthdr.csum_flags & csum_flags); 2504 } 2505 2506 static inline bool 2507 needs_outer_tcp_csum(struct mbuf *m) 2508 { 2509 const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP | 2510 CSUM_IP6_TSO; 2511 2512 M_ASSERTPKTHDR(m); 2513 2514 return (m->m_pkthdr.csum_flags & csum_flags); 2515 } 2516 2517 #ifdef RATELIMIT 2518 static inline bool 2519 needs_outer_l4_csum(struct mbuf *m) 2520 { 2521 const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO | 2522 CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO; 2523 2524 M_ASSERTPKTHDR(m); 2525 2526 return (m->m_pkthdr.csum_flags & csum_flags); 2527 } 2528 2529 static inline bool 2530 needs_outer_udp_csum(struct mbuf *m) 2531 { 2532 const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP; 2533 2534 M_ASSERTPKTHDR(m); 2535 2536 return (m->m_pkthdr.csum_flags & csum_flags); 2537 } 2538 #endif 2539 2540 static inline bool 2541 needs_vlan_insertion(struct mbuf *m) 2542 { 2543 2544 M_ASSERTPKTHDR(m); 2545 2546 return (m->m_flags & M_VLANTAG); 2547 } 2548 2549 #if defined(INET) || defined(INET6) 2550 static void * 2551 m_advance(struct mbuf **pm, int *poffset, int len) 2552 { 2553 struct mbuf *m = *pm; 2554 int offset = *poffset; 2555 uintptr_t p = 0; 2556 2557 
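	/*
	 * Advance len bytes from the current position (*pm, *poffset) and
	 * return a pointer to the new position, updating *pm and *poffset.
	 * The chain must be long enough.
	 */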
MPASS(len > 0); 2558 2559 for (;;) { 2560 if (offset + len < m->m_len) { 2561 offset += len; 2562 p = mtod(m, uintptr_t) + offset; 2563 break; 2564 } 2565 len -= m->m_len - offset; 2566 m = m->m_next; 2567 offset = 0; 2568 MPASS(m != NULL); 2569 } 2570 *poffset = offset; 2571 *pm = m; 2572 return ((void *)p); 2573 } 2574 #endif 2575 2576 static inline int 2577 count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr) 2578 { 2579 vm_paddr_t paddr; 2580 int i, len, off, pglen, pgoff, seglen, segoff; 2581 int nsegs = 0; 2582 2583 M_ASSERTEXTPG(m); 2584 off = mtod(m, vm_offset_t); 2585 len = m->m_len; 2586 off += skip; 2587 len -= skip; 2588 2589 if (m->m_epg_hdrlen != 0) { 2590 if (off >= m->m_epg_hdrlen) { 2591 off -= m->m_epg_hdrlen; 2592 } else { 2593 seglen = m->m_epg_hdrlen - off; 2594 segoff = off; 2595 seglen = min(seglen, len); 2596 off = 0; 2597 len -= seglen; 2598 paddr = pmap_kextract( 2599 (vm_offset_t)&m->m_epg_hdr[segoff]); 2600 if (*nextaddr != paddr) 2601 nsegs++; 2602 *nextaddr = paddr + seglen; 2603 } 2604 } 2605 pgoff = m->m_epg_1st_off; 2606 for (i = 0; i < m->m_epg_npgs && len > 0; i++) { 2607 pglen = m_epg_pagelen(m, i, pgoff); 2608 if (off >= pglen) { 2609 off -= pglen; 2610 pgoff = 0; 2611 continue; 2612 } 2613 seglen = pglen - off; 2614 segoff = pgoff + off; 2615 off = 0; 2616 seglen = min(seglen, len); 2617 len -= seglen; 2618 paddr = m->m_epg_pa[i] + segoff; 2619 if (*nextaddr != paddr) 2620 nsegs++; 2621 *nextaddr = paddr + seglen; 2622 pgoff = 0; 2623 }; 2624 if (len != 0) { 2625 seglen = min(len, m->m_epg_trllen - off); 2626 len -= seglen; 2627 paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]); 2628 if (*nextaddr != paddr) 2629 nsegs++; 2630 *nextaddr = paddr + seglen; 2631 } 2632 2633 return (nsegs); 2634 } 2635 2636 2637 /* 2638 * Can deal with empty mbufs in the chain that have m_len = 0, but the chain 2639 * must have at least one mbuf that's not empty. It is possible for this 2640 * routine to return 0 if skip accounts for all the contents of the mbuf chain. 2641 */ 2642 static inline int 2643 count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags) 2644 { 2645 vm_paddr_t nextaddr, paddr; 2646 vm_offset_t va; 2647 int len, nsegs; 2648 2649 M_ASSERTPKTHDR(m); 2650 MPASS(m->m_pkthdr.len > 0); 2651 MPASS(m->m_pkthdr.len >= skip); 2652 2653 nsegs = 0; 2654 nextaddr = 0; 2655 for (; m; m = m->m_next) { 2656 len = m->m_len; 2657 if (__predict_false(len == 0)) 2658 continue; 2659 if (skip >= len) { 2660 skip -= len; 2661 continue; 2662 } 2663 if ((m->m_flags & M_EXTPG) != 0) { 2664 *cflags |= MC_NOMAP; 2665 nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr); 2666 skip = 0; 2667 continue; 2668 } 2669 va = mtod(m, vm_offset_t) + skip; 2670 len -= skip; 2671 skip = 0; 2672 paddr = pmap_kextract(va); 2673 nsegs += sglist_count((void *)(uintptr_t)va, len); 2674 if (paddr == nextaddr) 2675 nsegs--; 2676 nextaddr = pmap_kextract(va + len - 1) + 1; 2677 } 2678 2679 return (nsegs); 2680 } 2681 2682 /* 2683 * The maximum number of segments that can fit in a WR. 
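 * The limit depends on whether a VM or a PF work request will be used
 * and on whether TSO (including VXLAN TSO) is required.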
2684 */ 2685 static int 2686 max_nsegs_allowed(struct mbuf *m, bool vm_wr) 2687 { 2688 2689 if (vm_wr) { 2690 if (needs_tso(m)) 2691 return (TX_SGL_SEGS_VM_TSO); 2692 return (TX_SGL_SEGS_VM); 2693 } 2694 2695 if (needs_tso(m)) { 2696 if (needs_vxlan_tso(m)) 2697 return (TX_SGL_SEGS_VXLAN_TSO); 2698 else 2699 return (TX_SGL_SEGS_TSO); 2700 } 2701 2702 return (TX_SGL_SEGS); 2703 } 2704 2705 static struct timeval txerr_ratecheck = {0}; 2706 static const struct timeval txerr_interval = {3, 0}; 2707 2708 /* 2709 * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: 2710 * a) caller can assume it's been freed if this function returns with an error. 2711 * b) it may get defragged up if the gather list is too long for the hardware. 2712 */ 2713 int 2714 parse_pkt(struct mbuf **mp, bool vm_wr) 2715 { 2716 struct mbuf *m0 = *mp, *m; 2717 int rc, nsegs, defragged = 0; 2718 struct ether_header *eh; 2719 #ifdef INET 2720 void *l3hdr; 2721 #endif 2722 #if defined(INET) || defined(INET6) 2723 int offset; 2724 struct tcphdr *tcp; 2725 #endif 2726 #if defined(KERN_TLS) || defined(RATELIMIT) 2727 struct m_snd_tag *mst; 2728 #endif 2729 uint16_t eh_type; 2730 uint8_t cflags; 2731 2732 cflags = 0; 2733 M_ASSERTPKTHDR(m0); 2734 if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { 2735 rc = EINVAL; 2736 fail: 2737 m_freem(m0); 2738 *mp = NULL; 2739 return (rc); 2740 } 2741 restart: 2742 /* 2743 * First count the number of gather list segments in the payload. 2744 * Defrag the mbuf if nsegs exceeds the hardware limit. 2745 */ 2746 M_ASSERTPKTHDR(m0); 2747 MPASS(m0->m_pkthdr.len > 0); 2748 nsegs = count_mbuf_nsegs(m0, 0, &cflags); 2749 #if defined(KERN_TLS) || defined(RATELIMIT) 2750 if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) 2751 mst = m0->m_pkthdr.snd_tag; 2752 else 2753 mst = NULL; 2754 #endif 2755 #ifdef KERN_TLS 2756 if (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_TLS) { 2757 int len16; 2758 2759 cflags |= MC_TLS; 2760 set_mbuf_cflags(m0, cflags); 2761 rc = t6_ktls_parse_pkt(m0, &nsegs, &len16); 2762 if (rc != 0) 2763 goto fail; 2764 set_mbuf_nsegs(m0, nsegs); 2765 set_mbuf_len16(m0, len16); 2766 return (0); 2767 } 2768 #endif 2769 if (nsegs > max_nsegs_allowed(m0, vm_wr)) { 2770 if (defragged++ > 0) { 2771 rc = EFBIG; 2772 goto fail; 2773 } 2774 counter_u64_add(defrags, 1); 2775 if ((m = m_defrag(m0, M_NOWAIT)) == NULL) { 2776 rc = ENOMEM; 2777 goto fail; 2778 } 2779 *mp = m0 = m; /* update caller's copy after defrag */ 2780 goto restart; 2781 } 2782 2783 if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN && 2784 !(cflags & MC_NOMAP))) { 2785 counter_u64_add(pullups, 1); 2786 m0 = m_pullup(m0, m0->m_pkthdr.len); 2787 if (m0 == NULL) { 2788 /* Should have left well enough alone. */ 2789 rc = EFBIG; 2790 goto fail; 2791 } 2792 *mp = m0; /* update caller's copy after pullup */ 2793 goto restart; 2794 } 2795 set_mbuf_nsegs(m0, nsegs); 2796 set_mbuf_cflags(m0, cflags); 2797 calculate_mbuf_len16(m0, vm_wr); 2798 2799 #ifdef RATELIMIT 2800 /* 2801 * Ethofld is limited to TCP and UDP for now, and only when L4 hw 2802 * checksumming is enabled. needs_outer_l4_csum happens to check for 2803 * all the right things. 
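 * If those conditions are not met the send tag is released below and
 * the packet continues down the regular tx path.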
2804 */ 2805 if (__predict_false(needs_eo(mst) && !needs_outer_l4_csum(m0))) { 2806 m_snd_tag_rele(m0->m_pkthdr.snd_tag); 2807 m0->m_pkthdr.snd_tag = NULL; 2808 m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; 2809 mst = NULL; 2810 } 2811 #endif 2812 2813 if (!needs_hwcsum(m0) 2814 #ifdef RATELIMIT 2815 && !needs_eo(mst) 2816 #endif 2817 ) 2818 return (0); 2819 2820 m = m0; 2821 eh = mtod(m, struct ether_header *); 2822 eh_type = ntohs(eh->ether_type); 2823 if (eh_type == ETHERTYPE_VLAN) { 2824 struct ether_vlan_header *evh = (void *)eh; 2825 2826 eh_type = ntohs(evh->evl_proto); 2827 m0->m_pkthdr.l2hlen = sizeof(*evh); 2828 } else 2829 m0->m_pkthdr.l2hlen = sizeof(*eh); 2830 2831 #if defined(INET) || defined(INET6) 2832 offset = 0; 2833 #ifdef INET 2834 l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); 2835 #else 2836 m_advance(&m, &offset, m0->m_pkthdr.l2hlen); 2837 #endif 2838 #endif 2839 2840 switch (eh_type) { 2841 #ifdef INET6 2842 case ETHERTYPE_IPV6: 2843 m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr); 2844 break; 2845 #endif 2846 #ifdef INET 2847 case ETHERTYPE_IP: 2848 { 2849 struct ip *ip = l3hdr; 2850 2851 if (needs_vxlan_csum(m0)) { 2852 /* Driver will do the outer IP hdr checksum. */ 2853 ip->ip_sum = 0; 2854 if (needs_vxlan_tso(m0)) { 2855 const uint16_t ipl = ip->ip_len; 2856 2857 ip->ip_len = 0; 2858 ip->ip_sum = ~in_cksum_hdr(ip); 2859 ip->ip_len = ipl; 2860 } else 2861 ip->ip_sum = in_cksum_hdr(ip); 2862 } 2863 m0->m_pkthdr.l3hlen = ip->ip_hl << 2; 2864 break; 2865 } 2866 #endif 2867 default: 2868 if (ratecheck(&txerr_ratecheck, &txerr_interval)) { 2869 log(LOG_ERR, "%s: ethertype 0x%04x unknown. " 2870 "if_cxgbe must be compiled with the same " 2871 "INET/INET6 options as the kernel.\n", __func__, 2872 eh_type); 2873 } 2874 rc = EINVAL; 2875 goto fail; 2876 } 2877 2878 #if defined(INET) || defined(INET6) 2879 if (needs_vxlan_csum(m0)) { 2880 m0->m_pkthdr.l4hlen = sizeof(struct udphdr); 2881 m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header); 2882 2883 /* Inner headers. */ 2884 eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen + 2885 sizeof(struct udphdr) + sizeof(struct vxlan_header)); 2886 eh_type = ntohs(eh->ether_type); 2887 if (eh_type == ETHERTYPE_VLAN) { 2888 struct ether_vlan_header *evh = (void *)eh; 2889 2890 eh_type = ntohs(evh->evl_proto); 2891 m0->m_pkthdr.inner_l2hlen = sizeof(*evh); 2892 } else 2893 m0->m_pkthdr.inner_l2hlen = sizeof(*eh); 2894 #ifdef INET 2895 l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); 2896 #else 2897 m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); 2898 #endif 2899 2900 switch (eh_type) { 2901 #ifdef INET6 2902 case ETHERTYPE_IPV6: 2903 m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr); 2904 break; 2905 #endif 2906 #ifdef INET 2907 case ETHERTYPE_IP: 2908 { 2909 struct ip *ip = l3hdr; 2910 2911 m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2; 2912 break; 2913 } 2914 #endif 2915 default: 2916 if (ratecheck(&txerr_ratecheck, &txerr_interval)) { 2917 log(LOG_ERR, "%s: VXLAN hw offload requested" 2918 "with unknown ethertype 0x%04x. 
if_cxgbe " 2919 "must be compiled with the same INET/INET6 " 2920 "options as the kernel.\n", __func__, 2921 eh_type); 2922 } 2923 rc = EINVAL; 2924 goto fail; 2925 } 2926 if (needs_inner_tcp_csum(m0)) { 2927 tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen); 2928 m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4; 2929 } 2930 MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); 2931 m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP | 2932 CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP | 2933 CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | 2934 CSUM_ENCAP_VXLAN; 2935 } 2936 2937 if (needs_outer_tcp_csum(m0)) { 2938 tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); 2939 m0->m_pkthdr.l4hlen = tcp->th_off * 4; 2940 #ifdef RATELIMIT 2941 if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) { 2942 set_mbuf_eo_tsclk_tsoff(m0, 2943 V_FW_ETH_TX_EO_WR_TSCLK(tsclk) | 2944 V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1)); 2945 } else 2946 set_mbuf_eo_tsclk_tsoff(m0, 0); 2947 } else if (needs_outer_udp_csum(m0)) { 2948 m0->m_pkthdr.l4hlen = sizeof(struct udphdr); 2949 #endif 2950 } 2951 #ifdef RATELIMIT 2952 if (needs_eo(mst)) { 2953 u_int immhdrs; 2954 2955 /* EO WRs have the headers in the WR and not the GL. */ 2956 immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + 2957 m0->m_pkthdr.l4hlen; 2958 cflags = 0; 2959 nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags); 2960 MPASS(cflags == mbuf_cflags(m0)); 2961 set_mbuf_eo_nsegs(m0, nsegs); 2962 set_mbuf_eo_len16(m0, 2963 txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0))); 2964 rc = ethofld_transmit(mst->ifp, m0); 2965 if (rc != 0) 2966 goto fail; 2967 return (EINPROGRESS); 2968 } 2969 #endif 2970 #endif 2971 MPASS(m0 == *mp); 2972 return (0); 2973 } 2974 2975 void * 2976 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) 2977 { 2978 struct sge_eq *eq = &wrq->eq; 2979 struct adapter *sc = wrq->adapter; 2980 int ndesc, available; 2981 struct wrqe *wr; 2982 void *w; 2983 2984 MPASS(len16 > 0); 2985 ndesc = tx_len16_to_desc(len16); 2986 MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); 2987 2988 EQ_LOCK(eq); 2989 2990 if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) 2991 drain_wrq_wr_list(sc, wrq); 2992 2993 if (!STAILQ_EMPTY(&wrq->wr_list)) { 2994 slowpath: 2995 EQ_UNLOCK(eq); 2996 wr = alloc_wrqe(len16 * 16, wrq); 2997 if (__predict_false(wr == NULL)) 2998 return (NULL); 2999 cookie->pidx = -1; 3000 cookie->ndesc = ndesc; 3001 return (&wr->wr); 3002 } 3003 3004 eq->cidx = read_hw_cidx(eq); 3005 if (eq->pidx == eq->cidx) 3006 available = eq->sidx - 1; 3007 else 3008 available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; 3009 if (available < ndesc) 3010 goto slowpath; 3011 3012 cookie->pidx = eq->pidx; 3013 cookie->ndesc = ndesc; 3014 TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); 3015 3016 w = &eq->desc[eq->pidx]; 3017 IDXINCR(eq->pidx, ndesc, eq->sidx); 3018 if (__predict_false(cookie->pidx + ndesc > eq->sidx)) { 3019 w = &wrq->ss[0]; 3020 wrq->ss_pidx = cookie->pidx; 3021 wrq->ss_len = len16 * 16; 3022 } 3023 3024 EQ_UNLOCK(eq); 3025 3026 return (w); 3027 } 3028 3029 void 3030 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) 3031 { 3032 struct sge_eq *eq = &wrq->eq; 3033 struct adapter *sc = wrq->adapter; 3034 int ndesc, pidx; 3035 struct wrq_cookie *prev, *next; 3036 3037 if (cookie->pidx == -1) { 3038 struct wrqe *wr = __containerof(w, struct wrqe, wr); 3039 3040 t4_wrq_tx(sc, wr); 3041 return; 3042 } 3043 3044 if (__predict_false(w == &wrq->ss[0])) { 3045 int n 
= (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; 3046 3047 MPASS(wrq->ss_len > n); /* WR had better wrap around. */ 3048 bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); 3049 bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); 3050 wrq->tx_wrs_ss++; 3051 } else 3052 wrq->tx_wrs_direct++; 3053 3054 EQ_LOCK(eq); 3055 ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ 3056 pidx = cookie->pidx; 3057 MPASS(pidx >= 0 && pidx < eq->sidx); 3058 prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); 3059 next = TAILQ_NEXT(cookie, link); 3060 if (prev == NULL) { 3061 MPASS(pidx == eq->dbidx); 3062 if (next == NULL || ndesc >= 16) { 3063 int available; 3064 struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ 3065 3066 /* 3067 * Note that the WR via which we'll request tx updates 3068 * is at pidx and not eq->pidx, which has moved on 3069 * already. 3070 */ 3071 dst = (void *)&eq->desc[pidx]; 3072 available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; 3073 if (available < eq->sidx / 4 && 3074 atomic_cmpset_int(&eq->equiq, 0, 1)) { 3075 /* 3076 * XXX: This is not 100% reliable with some 3077 * types of WRs. But this is a very unusual 3078 * situation for an ofld/ctrl queue anyway. 3079 */ 3080 dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | 3081 F_FW_WR_EQUEQ); 3082 } 3083 3084 ring_eq_db(wrq->adapter, eq, ndesc); 3085 } else { 3086 MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); 3087 next->pidx = pidx; 3088 next->ndesc += ndesc; 3089 } 3090 } else { 3091 MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); 3092 prev->ndesc += ndesc; 3093 } 3094 TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); 3095 3096 if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) 3097 drain_wrq_wr_list(sc, wrq); 3098 3099 #ifdef INVARIANTS 3100 if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { 3101 /* Doorbell must have caught up to the pidx. */ 3102 MPASS(wrq->eq.pidx == wrq->eq.dbidx); 3103 } 3104 #endif 3105 EQ_UNLOCK(eq); 3106 } 3107 3108 static u_int 3109 can_resume_eth_tx(struct mp_ring *r) 3110 { 3111 struct sge_eq *eq = r->cookie; 3112 3113 return (total_available_tx_desc(eq) > eq->sidx / 8); 3114 } 3115 3116 static inline bool 3117 cannot_use_txpkts(struct mbuf *m) 3118 { 3119 /* maybe put a GL limit too, to avoid silliness? 
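 * TSO packets, raw work requests, and TLS mbufs each need a work
 * request of their own and cannot be coalesced into a txpkts WR.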
*/ 3120 3121 return (needs_tso(m) || (mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0); 3122 } 3123 3124 static inline int 3125 discard_tx(struct sge_eq *eq) 3126 { 3127 3128 return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED); 3129 } 3130 3131 static inline int 3132 wr_can_update_eq(void *p) 3133 { 3134 struct fw_eth_tx_pkts_wr *wr = p; 3135 3136 switch (G_FW_WR_OP(be32toh(wr->op_pkd))) { 3137 case FW_ULPTX_WR: 3138 case FW_ETH_TX_PKT_WR: 3139 case FW_ETH_TX_PKTS_WR: 3140 case FW_ETH_TX_PKTS2_WR: 3141 case FW_ETH_TX_PKT_VM_WR: 3142 case FW_ETH_TX_PKTS_VM_WR: 3143 return (1); 3144 default: 3145 return (0); 3146 } 3147 } 3148 3149 static inline void 3150 set_txupdate_flags(struct sge_txq *txq, u_int avail, 3151 struct fw_eth_tx_pkt_wr *wr) 3152 { 3153 struct sge_eq *eq = &txq->eq; 3154 struct txpkts *txp = &txq->txp; 3155 3156 if ((txp->npkt > 0 || avail < eq->sidx / 2) && 3157 atomic_cmpset_int(&eq->equiq, 0, 1)) { 3158 wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); 3159 eq->equeqidx = eq->pidx; 3160 } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { 3161 wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); 3162 eq->equeqidx = eq->pidx; 3163 } 3164 } 3165 3166 #if defined(__i386__) || defined(__amd64__) 3167 extern uint64_t tsc_freq; 3168 #endif 3169 3170 static inline bool 3171 record_eth_tx_time(struct sge_txq *txq) 3172 { 3173 const uint64_t cycles = get_cyclecount(); 3174 const uint64_t last_tx = txq->last_tx; 3175 #if defined(__i386__) || defined(__amd64__) 3176 const uint64_t itg = tsc_freq * t4_tx_coalesce_gap / 1000000; 3177 #else 3178 const uint64_t itg = 0; 3179 #endif 3180 3181 MPASS(cycles >= last_tx); 3182 txq->last_tx = cycles; 3183 return (cycles - last_tx < itg); 3184 } 3185 3186 /* 3187 * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to 3188 * be consumed. Return the actual number consumed. 0 indicates a stall. 3189 */ 3190 static u_int 3191 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) 3192 { 3193 struct sge_txq *txq = r->cookie; 3194 struct ifnet *ifp = txq->ifp; 3195 struct sge_eq *eq = &txq->eq; 3196 struct txpkts *txp = &txq->txp; 3197 struct vi_info *vi = ifp->if_softc; 3198 struct adapter *sc = vi->adapter; 3199 u_int total, remaining; /* # of packets */ 3200 u_int n, avail, dbdiff; /* # of hardware descriptors */ 3201 int i, rc; 3202 struct mbuf *m0; 3203 bool snd, recent_tx; 3204 void *wr; /* start of the last WR written to the ring */ 3205 3206 TXQ_LOCK_ASSERT_OWNED(txq); 3207 recent_tx = record_eth_tx_time(txq); 3208 3209 remaining = IDXDIFF(pidx, cidx, r->size); 3210 if (__predict_false(discard_tx(eq))) { 3211 for (i = 0; i < txp->npkt; i++) 3212 m_freem(txp->mb[i]); 3213 txp->npkt = 0; 3214 while (cidx != pidx) { 3215 m0 = r->items[cidx]; 3216 m_freem(m0); 3217 if (++cidx == r->size) 3218 cidx = 0; 3219 } 3220 reclaim_tx_descs(txq, eq->sidx); 3221 *coalescing = false; 3222 return (remaining); /* emptied */ 3223 } 3224 3225 /* How many hardware descriptors do we have readily available. 
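 * One descriptor is always held back so that an empty ring (pidx == cidx)
 * can be told apart from a full one.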
*/ 3226 if (eq->pidx == eq->cidx) 3227 avail = eq->sidx - 1; 3228 else 3229 avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; 3230 3231 total = 0; 3232 if (remaining == 0) { 3233 txp->score = 0; 3234 txq->txpkts_flush++; 3235 goto send_txpkts; 3236 } 3237 3238 dbdiff = 0; 3239 MPASS(remaining > 0); 3240 while (remaining > 0) { 3241 m0 = r->items[cidx]; 3242 M_ASSERTPKTHDR(m0); 3243 MPASS(m0->m_nextpkt == NULL); 3244 3245 if (avail < 2 * SGE_MAX_WR_NDESC) 3246 avail += reclaim_tx_descs(txq, 64); 3247 3248 if (t4_tx_coalesce == 0 && txp->npkt == 0) 3249 goto skip_coalescing; 3250 if (cannot_use_txpkts(m0)) 3251 txp->score = 0; 3252 else if (recent_tx) { 3253 if (++txp->score == 0) 3254 txp->score = UINT8_MAX; 3255 } else 3256 txp->score = 1; 3257 if (txp->npkt > 0 || remaining > 1 || 3258 txp->score >= t4_tx_coalesce_pkts || 3259 atomic_load_int(&txq->eq.equiq) != 0) { 3260 if (vi->flags & TX_USES_VM_WR) 3261 rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd); 3262 else 3263 rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd); 3264 } else { 3265 snd = false; 3266 rc = EINVAL; 3267 } 3268 if (snd) { 3269 MPASS(txp->npkt > 0); 3270 for (i = 0; i < txp->npkt; i++) 3271 ETHER_BPF_MTAP(ifp, txp->mb[i]); 3272 if (txp->npkt > 1) { 3273 MPASS(avail >= tx_len16_to_desc(txp->len16)); 3274 if (vi->flags & TX_USES_VM_WR) 3275 n = write_txpkts_vm_wr(sc, txq); 3276 else 3277 n = write_txpkts_wr(sc, txq); 3278 } else { 3279 MPASS(avail >= 3280 tx_len16_to_desc(mbuf_len16(txp->mb[0]))); 3281 if (vi->flags & TX_USES_VM_WR) 3282 n = write_txpkt_vm_wr(sc, txq, 3283 txp->mb[0]); 3284 else 3285 n = write_txpkt_wr(sc, txq, txp->mb[0], 3286 avail); 3287 } 3288 MPASS(n <= SGE_MAX_WR_NDESC); 3289 avail -= n; 3290 dbdiff += n; 3291 wr = &eq->desc[eq->pidx]; 3292 IDXINCR(eq->pidx, n, eq->sidx); 3293 txp->npkt = 0; /* emptied */ 3294 } 3295 if (rc == 0) { 3296 /* m0 was coalesced into txq->txpkts. */ 3297 goto next_mbuf; 3298 } 3299 if (rc == EAGAIN) { 3300 /* 3301 * m0 is suitable for tx coalescing but could not be 3302 * combined with the existing txq->txpkts, which has now 3303 * been transmitted. Start a new txpkts with m0. 
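 * cidx is not advanced in this case, so the same mbuf is examined
 * again on the next pass through the loop.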
3304 */ 3305 MPASS(snd); 3306 MPASS(txp->npkt == 0); 3307 continue; 3308 } 3309 3310 MPASS(rc != 0 && rc != EAGAIN); 3311 MPASS(txp->npkt == 0); 3312 skip_coalescing: 3313 n = tx_len16_to_desc(mbuf_len16(m0)); 3314 if (__predict_false(avail < n)) { 3315 avail += reclaim_tx_descs(txq, min(n, 32)); 3316 if (avail < n) 3317 break; /* out of descriptors */ 3318 } 3319 3320 wr = &eq->desc[eq->pidx]; 3321 if (mbuf_cflags(m0) & MC_RAW_WR) { 3322 n = write_raw_wr(txq, wr, m0, avail); 3323 #ifdef KERN_TLS 3324 } else if (mbuf_cflags(m0) & MC_TLS) { 3325 ETHER_BPF_MTAP(ifp, m0); 3326 n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0), 3327 avail); 3328 #endif 3329 } else { 3330 ETHER_BPF_MTAP(ifp, m0); 3331 if (vi->flags & TX_USES_VM_WR) 3332 n = write_txpkt_vm_wr(sc, txq, m0); 3333 else 3334 n = write_txpkt_wr(sc, txq, m0, avail); 3335 } 3336 MPASS(n >= 1 && n <= avail); 3337 if (!(mbuf_cflags(m0) & MC_TLS)) 3338 MPASS(n <= SGE_MAX_WR_NDESC); 3339 3340 avail -= n; 3341 dbdiff += n; 3342 IDXINCR(eq->pidx, n, eq->sidx); 3343 3344 if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */ 3345 if (wr_can_update_eq(wr)) 3346 set_txupdate_flags(txq, avail, wr); 3347 ring_eq_db(sc, eq, dbdiff); 3348 avail += reclaim_tx_descs(txq, 32); 3349 dbdiff = 0; 3350 } 3351 next_mbuf: 3352 total++; 3353 remaining--; 3354 if (__predict_false(++cidx == r->size)) 3355 cidx = 0; 3356 } 3357 if (dbdiff != 0) { 3358 if (wr_can_update_eq(wr)) 3359 set_txupdate_flags(txq, avail, wr); 3360 ring_eq_db(sc, eq, dbdiff); 3361 reclaim_tx_descs(txq, 32); 3362 } else if (eq->pidx == eq->cidx && txp->npkt > 0 && 3363 atomic_load_int(&txq->eq.equiq) == 0) { 3364 /* 3365 * If nothing was submitted to the chip for tx (it was coalesced 3366 * into txpkts instead) and there is no tx update outstanding 3367 * then we need to send txpkts now. 
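 * The send_txpkts label is also reached directly when the ring is empty
 * and a previously coalesced txpkts is all that is left to flush.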
3368 */ 3369 send_txpkts: 3370 MPASS(txp->npkt > 0); 3371 for (i = 0; i < txp->npkt; i++) 3372 ETHER_BPF_MTAP(ifp, txp->mb[i]); 3373 if (txp->npkt > 1) { 3374 MPASS(avail >= tx_len16_to_desc(txp->len16)); 3375 if (vi->flags & TX_USES_VM_WR) 3376 n = write_txpkts_vm_wr(sc, txq); 3377 else 3378 n = write_txpkts_wr(sc, txq); 3379 } else { 3380 MPASS(avail >= 3381 tx_len16_to_desc(mbuf_len16(txp->mb[0]))); 3382 if (vi->flags & TX_USES_VM_WR) 3383 n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); 3384 else 3385 n = write_txpkt_wr(sc, txq, txp->mb[0], avail); 3386 } 3387 MPASS(n <= SGE_MAX_WR_NDESC); 3388 wr = &eq->desc[eq->pidx]; 3389 IDXINCR(eq->pidx, n, eq->sidx); 3390 txp->npkt = 0; /* emptied */ 3391 3392 MPASS(wr_can_update_eq(wr)); 3393 set_txupdate_flags(txq, avail - n, wr); 3394 ring_eq_db(sc, eq, n); 3395 reclaim_tx_descs(txq, 32); 3396 } 3397 *coalescing = txp->npkt > 0; 3398 3399 return (total); 3400 } 3401 3402 static inline void 3403 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, 3404 int qsize, int intr_idx, int cong, int qtype) 3405 { 3406 3407 KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, 3408 ("%s: bad tmr_idx %d", __func__, tmr_idx)); 3409 KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ 3410 ("%s: bad pktc_idx %d", __func__, pktc_idx)); 3411 KASSERT(intr_idx >= -1 && intr_idx < sc->intr_count, 3412 ("%s: bad intr_idx %d", __func__, intr_idx)); 3413 KASSERT(qtype == FW_IQ_IQTYPE_OTHER || qtype == FW_IQ_IQTYPE_NIC || 3414 qtype == FW_IQ_IQTYPE_OFLD, ("%s: bad qtype %d", __func__, qtype)); 3415 3416 iq->flags = 0; 3417 iq->state = IQS_DISABLED; 3418 iq->adapter = sc; 3419 iq->qtype = qtype; 3420 iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); 3421 iq->intr_pktc_idx = SGE_NCOUNTERS - 1; 3422 if (pktc_idx >= 0) { 3423 iq->intr_params |= F_QINTR_CNT_EN; 3424 iq->intr_pktc_idx = pktc_idx; 3425 } 3426 iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ 3427 iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE; 3428 iq->intr_idx = intr_idx; 3429 iq->cong_drop = cong; 3430 } 3431 3432 static inline void 3433 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) 3434 { 3435 struct sge_params *sp = &sc->params.sge; 3436 3437 fl->qsize = qsize; 3438 fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; 3439 strlcpy(fl->lockname, name, sizeof(fl->lockname)); 3440 mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); 3441 if (sc->flags & BUF_PACKING_OK && 3442 ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ 3443 (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ 3444 fl->flags |= FL_BUF_PACKING; 3445 fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING); 3446 fl->safe_zidx = sc->sge.safe_zidx; 3447 if (fl->flags & FL_BUF_PACKING) { 3448 fl->lowat = roundup2(sp->fl_starve_threshold2, 8); 3449 fl->buf_boundary = sp->pack_boundary; 3450 } else { 3451 fl->lowat = roundup2(sp->fl_starve_threshold, 8); 3452 fl->buf_boundary = 16; 3453 } 3454 if (fl_pad && fl->buf_boundary < sp->pad_boundary) 3455 fl->buf_boundary = sp->pad_boundary; 3456 } 3457 3458 static inline void 3459 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize, 3460 uint8_t tx_chan, struct sge_iq *iq, char *name) 3461 { 3462 KASSERT(eqtype >= EQ_CTRL && eqtype <= EQ_OFLD, 3463 ("%s: bad qtype %d", __func__, eqtype)); 3464 3465 eq->type = eqtype; 3466 eq->tx_chan = tx_chan; 3467 eq->iq = iq; 3468 eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; 3469 strlcpy(eq->lockname, name, 
sizeof(eq->lockname)); 3470 mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); 3471 } 3472 3473 int 3474 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, 3475 bus_dmamap_t *map, bus_addr_t *pa, void **va) 3476 { 3477 int rc; 3478 3479 rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, 3480 BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); 3481 if (rc != 0) { 3482 CH_ERR(sc, "cannot allocate DMA tag: %d\n", rc); 3483 goto done; 3484 } 3485 3486 rc = bus_dmamem_alloc(*tag, va, 3487 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); 3488 if (rc != 0) { 3489 CH_ERR(sc, "cannot allocate DMA memory: %d\n", rc); 3490 goto done; 3491 } 3492 3493 rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); 3494 if (rc != 0) { 3495 CH_ERR(sc, "cannot load DMA map: %d\n", rc); 3496 goto done; 3497 } 3498 done: 3499 if (rc) 3500 free_ring(sc, *tag, *map, *pa, *va); 3501 3502 return (rc); 3503 } 3504 3505 int 3506 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, 3507 bus_addr_t pa, void *va) 3508 { 3509 if (pa) 3510 bus_dmamap_unload(tag, map); 3511 if (va) 3512 bus_dmamem_free(tag, va, map); 3513 if (tag) 3514 bus_dma_tag_destroy(tag); 3515 3516 return (0); 3517 } 3518 3519 /* 3520 * Allocates the software resources (mainly memory and sysctl nodes) for an 3521 * ingress queue and an optional freelist. 3522 * 3523 * Sets IQ_SW_ALLOCATED and returns 0 on success. 3524 */ 3525 static int 3526 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl, 3527 struct sysctl_ctx_list *ctx, struct sysctl_oid *oid) 3528 { 3529 int rc; 3530 size_t len; 3531 struct adapter *sc = vi->adapter; 3532 3533 MPASS(!(iq->flags & IQ_SW_ALLOCATED)); 3534 3535 len = iq->qsize * IQ_ESIZE; 3536 rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, 3537 (void **)&iq->desc); 3538 if (rc != 0) 3539 return (rc); 3540 3541 if (fl) { 3542 len = fl->qsize * EQ_ESIZE; 3543 rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, 3544 &fl->ba, (void **)&fl->desc); 3545 if (rc) { 3546 free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, 3547 iq->desc); 3548 return (rc); 3549 } 3550 3551 /* Allocate space for one software descriptor per buffer. */ 3552 fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), 3553 M_CXGBE, M_ZERO | M_WAITOK); 3554 3555 add_fl_sysctls(sc, ctx, oid, fl); 3556 iq->flags |= IQ_HAS_FL; 3557 } 3558 add_iq_sysctls(ctx, oid, iq); 3559 iq->flags |= IQ_SW_ALLOCATED; 3560 3561 return (0); 3562 } 3563 3564 /* 3565 * Frees all software resources (memory and locks) associated with an ingress 3566 * queue and an optional freelist. 3567 */ 3568 static void 3569 free_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl) 3570 { 3571 MPASS(iq->flags & IQ_SW_ALLOCATED); 3572 3573 if (fl) { 3574 MPASS(iq->flags & IQ_HAS_FL); 3575 free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); 3576 free_fl_buffers(sc, fl); 3577 free(fl->sdesc, M_CXGBE); 3578 mtx_destroy(&fl->fl_lock); 3579 bzero(fl, sizeof(*fl)); 3580 } 3581 free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); 3582 bzero(iq, sizeof(*iq)); 3583 } 3584 3585 /* 3586 * Allocates a hardware ingress queue and an optional freelist that will be 3587 * associated with it. 3588 * 3589 * Returns errno on failure. Resources allocated up to that point may still be 3590 * allocated. Caller is responsible for cleanup in case this function fails. 
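 * Sets IQ_HW_ALLOCATED and returns 0 on success.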
3591 */ 3592 static int 3593 alloc_iq_fl_hwq(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl) 3594 { 3595 int rc, cntxt_id, cong_map; 3596 struct fw_iq_cmd c; 3597 struct adapter *sc = vi->adapter; 3598 struct port_info *pi = vi->pi; 3599 __be32 v = 0; 3600 3601 MPASS (!(iq->flags & IQ_HW_ALLOCATED)); 3602 3603 bzero(&c, sizeof(c)); 3604 c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | 3605 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | 3606 V_FW_IQ_CMD_VFN(0)); 3607 3608 c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | 3609 FW_LEN16(c)); 3610 3611 /* Special handling for firmware event queue */ 3612 if (iq == &sc->sge.fwq) 3613 v |= F_FW_IQ_CMD_IQASYNCH; 3614 3615 if (iq->intr_idx < 0) { 3616 /* Forwarded interrupts, all headed to fwq */ 3617 v |= F_FW_IQ_CMD_IQANDST; 3618 v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id); 3619 } else { 3620 KASSERT(iq->intr_idx < sc->intr_count, 3621 ("%s: invalid direct intr_idx %d", __func__, iq->intr_idx)); 3622 v |= V_FW_IQ_CMD_IQANDSTINDEX(iq->intr_idx); 3623 } 3624 3625 bzero(iq->desc, iq->qsize * IQ_ESIZE); 3626 c.type_to_iqandstindex = htobe32(v | 3627 V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | 3628 V_FW_IQ_CMD_VIID(vi->viid) | 3629 V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); 3630 c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | 3631 F_FW_IQ_CMD_IQGTSMODE | 3632 V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | 3633 V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); 3634 c.iqsize = htobe16(iq->qsize); 3635 c.iqaddr = htobe64(iq->ba); 3636 c.iqns_to_fl0congen = htobe32(V_FW_IQ_CMD_IQTYPE(iq->qtype)); 3637 if (iq->cong_drop != -1) { 3638 cong_map = iq->qtype == IQ_ETH ? pi->rx_e_chan_map : 0; 3639 c.iqns_to_fl0congen |= htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); 3640 } 3641 3642 if (fl) { 3643 bzero(fl->desc, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len); 3644 c.iqns_to_fl0congen |= 3645 htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | 3646 F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | 3647 (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | 3648 (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 3649 0)); 3650 if (iq->cong_drop != -1) { 3651 c.iqns_to_fl0congen |= 3652 htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong_map) | 3653 F_FW_IQ_CMD_FL0CONGCIF | 3654 F_FW_IQ_CMD_FL0CONGEN); 3655 } 3656 c.fl0dcaen_to_fl0cidxfthresh = 3657 htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ? 3658 X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) | 3659 V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ? 
3660 X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B)); 3661 c.fl0size = htobe16(fl->qsize); 3662 c.fl0addr = htobe64(fl->ba); 3663 } 3664 3665 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); 3666 if (rc != 0) { 3667 CH_ERR(sc, "failed to create hw ingress queue: %d\n", rc); 3668 return (rc); 3669 } 3670 3671 iq->cidx = 0; 3672 iq->gen = F_RSPD_GEN; 3673 iq->cntxt_id = be16toh(c.iqid); 3674 iq->abs_id = be16toh(c.physiqid); 3675 3676 cntxt_id = iq->cntxt_id - sc->sge.iq_start; 3677 if (cntxt_id >= sc->sge.iqmap_sz) { 3678 panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, 3679 cntxt_id, sc->sge.iqmap_sz - 1); 3680 } 3681 sc->sge.iqmap[cntxt_id] = iq; 3682 3683 if (fl) { 3684 u_int qid; 3685 #ifdef INVARIANTS 3686 int i; 3687 3688 MPASS(!(fl->flags & FL_BUF_RESUME)); 3689 for (i = 0; i < fl->sidx * 8; i++) 3690 MPASS(fl->sdesc[i].cl == NULL); 3691 #endif 3692 fl->cntxt_id = be16toh(c.fl0id); 3693 fl->pidx = fl->cidx = fl->hw_cidx = fl->dbidx = 0; 3694 fl->rx_offset = 0; 3695 fl->flags &= ~(FL_STARVING | FL_DOOMED); 3696 3697 cntxt_id = fl->cntxt_id - sc->sge.eq_start; 3698 if (cntxt_id >= sc->sge.eqmap_sz) { 3699 panic("%s: fl->cntxt_id (%d) more than the max (%d)", 3700 __func__, cntxt_id, sc->sge.eqmap_sz - 1); 3701 } 3702 sc->sge.eqmap[cntxt_id] = (void *)fl; 3703 3704 qid = fl->cntxt_id; 3705 if (isset(&sc->doorbells, DOORBELL_UDB)) { 3706 uint32_t s_qpp = sc->params.sge.eq_s_qpp; 3707 uint32_t mask = (1 << s_qpp) - 1; 3708 volatile uint8_t *udb; 3709 3710 udb = sc->udbs_base + UDBS_DB_OFFSET; 3711 udb += (qid >> s_qpp) << PAGE_SHIFT; 3712 qid &= mask; 3713 if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { 3714 udb += qid << UDBS_SEG_SHIFT; 3715 qid = 0; 3716 } 3717 fl->udb = (volatile void *)udb; 3718 } 3719 fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db; 3720 3721 FL_LOCK(fl); 3722 /* Enough to make sure the SGE doesn't think it's starved */ 3723 refill_fl(sc, fl, fl->lowat); 3724 FL_UNLOCK(fl); 3725 } 3726 3727 if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && 3728 iq->cong_drop != -1) { 3729 t4_sge_set_conm_context(sc, iq->cntxt_id, iq->cong_drop, 3730 cong_map); 3731 } 3732 3733 /* Enable IQ interrupts */ 3734 atomic_store_rel_int(&iq->state, IQS_IDLE); 3735 t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) | 3736 V_INGRESSQID(iq->cntxt_id)); 3737 3738 iq->flags |= IQ_HW_ALLOCATED; 3739 3740 return (0); 3741 } 3742 3743 static int 3744 free_iq_fl_hwq(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl) 3745 { 3746 int rc; 3747 3748 MPASS(iq->flags & IQ_HW_ALLOCATED); 3749 rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, 3750 iq->cntxt_id, fl ? 
fl->cntxt_id : 0xffff, 0xffff); 3751 if (rc != 0) { 3752 CH_ERR(sc, "failed to free iq %p: %d\n", iq, rc); 3753 return (rc); 3754 } 3755 iq->flags &= ~IQ_HW_ALLOCATED; 3756 3757 return (0); 3758 } 3759 3760 static void 3761 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, 3762 struct sge_iq *iq) 3763 { 3764 struct sysctl_oid_list *children; 3765 3766 if (ctx == NULL || oid == NULL) 3767 return; 3768 3769 children = SYSCTL_CHILDREN(oid); 3770 SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba, 3771 "bus address of descriptor ring"); 3772 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, 3773 iq->qsize * IQ_ESIZE, "descriptor ring size in bytes"); 3774 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, 3775 &iq->abs_id, 0, "absolute id of the queue"); 3776 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, 3777 &iq->cntxt_id, 0, "SGE context id of the queue"); 3778 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &iq->cidx, 3779 0, "consumer index"); 3780 } 3781 3782 static void 3783 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, 3784 struct sysctl_oid *oid, struct sge_fl *fl) 3785 { 3786 struct sysctl_oid_list *children; 3787 3788 if (ctx == NULL || oid == NULL) 3789 return; 3790 3791 children = SYSCTL_CHILDREN(oid); 3792 oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", 3793 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist"); 3794 children = SYSCTL_CHILDREN(oid); 3795 3796 SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, 3797 &fl->ba, "bus address of descriptor ring"); 3798 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, 3799 fl->sidx * EQ_ESIZE + sc->params.sge.spg_len, 3800 "desc ring size in bytes"); 3801 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, 3802 &fl->cntxt_id, 0, "SGE context id of the freelist"); 3803 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, 3804 fl_pad ? 1 : 0, "padding enabled"); 3805 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, 3806 fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled"); 3807 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 3808 0, "consumer index"); 3809 if (fl->flags & FL_BUF_PACKING) { 3810 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", 3811 CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); 3812 } 3813 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 3814 0, "producer index"); 3815 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", 3816 CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); 3817 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", 3818 CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); 3819 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", 3820 CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); 3821 } 3822 3823 /* 3824 * Idempotent. 3825 */ 3826 static int 3827 alloc_fwq(struct adapter *sc) 3828 { 3829 int rc, intr_idx; 3830 struct sge_iq *fwq = &sc->sge.fwq; 3831 struct vi_info *vi = &sc->port[0]->vi[0]; 3832 3833 if (!(fwq->flags & IQ_SW_ALLOCATED)) { 3834 MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); 3835 3836 if (sc->flags & IS_VF) 3837 intr_idx = 0; 3838 else 3839 intr_idx = sc->intr_count > 1 ? 
1 : 0; 3840 init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, intr_idx, -1, IQ_OTHER); 3841 rc = alloc_iq_fl(vi, fwq, NULL, &sc->ctx, sc->fwq_oid); 3842 if (rc != 0) { 3843 CH_ERR(sc, "failed to allocate fwq: %d\n", rc); 3844 return (rc); 3845 } 3846 MPASS(fwq->flags & IQ_SW_ALLOCATED); 3847 } 3848 3849 if (!(fwq->flags & IQ_HW_ALLOCATED)) { 3850 MPASS(fwq->flags & IQ_SW_ALLOCATED); 3851 3852 rc = alloc_iq_fl_hwq(vi, fwq, NULL); 3853 if (rc != 0) { 3854 CH_ERR(sc, "failed to create hw fwq: %d\n", rc); 3855 return (rc); 3856 } 3857 MPASS(fwq->flags & IQ_HW_ALLOCATED); 3858 } 3859 3860 return (0); 3861 } 3862 3863 /* 3864 * Idempotent. 3865 */ 3866 static void 3867 free_fwq(struct adapter *sc) 3868 { 3869 struct sge_iq *fwq = &sc->sge.fwq; 3870 3871 if (fwq->flags & IQ_HW_ALLOCATED) { 3872 MPASS(fwq->flags & IQ_SW_ALLOCATED); 3873 free_iq_fl_hwq(sc, fwq, NULL); 3874 MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); 3875 } 3876 3877 if (fwq->flags & IQ_SW_ALLOCATED) { 3878 MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); 3879 free_iq_fl(sc, fwq, NULL); 3880 MPASS(!(fwq->flags & IQ_SW_ALLOCATED)); 3881 } 3882 } 3883 3884 /* 3885 * Idempotent. 3886 */ 3887 static int 3888 alloc_ctrlq(struct adapter *sc, int idx) 3889 { 3890 int rc; 3891 char name[16]; 3892 struct sysctl_oid *oid; 3893 struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx]; 3894 3895 MPASS(idx < sc->params.nports); 3896 3897 if (!(ctrlq->eq.flags & EQ_SW_ALLOCATED)) { 3898 MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); 3899 3900 snprintf(name, sizeof(name), "%d", idx); 3901 oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->ctrlq_oid), 3902 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 3903 "ctrl queue"); 3904 3905 snprintf(name, sizeof(name), "%s ctrlq%d", 3906 device_get_nameunit(sc->dev), idx); 3907 init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, 3908 sc->port[idx]->tx_chan, &sc->sge.fwq, name); 3909 rc = alloc_wrq(sc, NULL, ctrlq, &sc->ctx, oid); 3910 if (rc != 0) { 3911 CH_ERR(sc, "failed to allocate ctrlq%d: %d\n", idx, rc); 3912 sysctl_remove_oid(oid, 1, 1); 3913 return (rc); 3914 } 3915 MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); 3916 } 3917 3918 if (!(ctrlq->eq.flags & EQ_HW_ALLOCATED)) { 3919 MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); 3920 3921 rc = alloc_eq_hwq(sc, NULL, &ctrlq->eq); 3922 if (rc != 0) { 3923 CH_ERR(sc, "failed to create hw ctrlq%d: %d\n", idx, rc); 3924 return (rc); 3925 } 3926 MPASS(ctrlq->eq.flags & EQ_HW_ALLOCATED); 3927 } 3928 3929 return (0); 3930 } 3931 3932 /* 3933 * Idempotent. 3934 */ 3935 static void 3936 free_ctrlq(struct adapter *sc, int idx) 3937 { 3938 struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx]; 3939 3940 if (ctrlq->eq.flags & EQ_HW_ALLOCATED) { 3941 MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); 3942 free_eq_hwq(sc, NULL, &ctrlq->eq); 3943 MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); 3944 } 3945 3946 if (ctrlq->eq.flags & EQ_SW_ALLOCATED) { 3947 MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); 3948 free_wrq(sc, ctrlq); 3949 MPASS(!(ctrlq->eq.flags & EQ_SW_ALLOCATED)); 3950 } 3951 } 3952 3953 int 3954 t4_sge_set_conm_context(struct adapter *sc, int cntxt_id, int cong_drop, 3955 int cong_map) 3956 { 3957 const int cng_ch_bits_log = sc->chip_params->cng_ch_bits_log; 3958 uint32_t param, val; 3959 uint16_t ch_map; 3960 int cong_mode, rc, i; 3961 3962 if (chip_id(sc) < CHELSIO_T5) 3963 return (ENOTSUP); 3964 3965 /* Convert the driver knob to the mode understood by the firmware. 
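 * (-1 selects CNGTPMODE_DISABLE; 0, 1, and 2 select CHANNEL, QUEUE, and
 * BOTH respectively.  The channel-based modes also carry a congestion
 * channel map.)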
*/ 3966 switch (cong_drop) { 3967 case -1: 3968 cong_mode = X_CONMCTXT_CNGTPMODE_DISABLE; 3969 break; 3970 case 0: 3971 cong_mode = X_CONMCTXT_CNGTPMODE_CHANNEL; 3972 break; 3973 case 1: 3974 cong_mode = X_CONMCTXT_CNGTPMODE_QUEUE; 3975 break; 3976 case 2: 3977 cong_mode = X_CONMCTXT_CNGTPMODE_BOTH; 3978 break; 3979 default: 3980 MPASS(0); 3981 CH_ERR(sc, "cong_drop = %d is invalid (ingress queue %d).\n", 3982 cong_drop, cntxt_id); 3983 return (EINVAL); 3984 } 3985 3986 param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | 3987 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | 3988 V_FW_PARAMS_PARAM_YZ(cntxt_id); 3989 val = V_CONMCTXT_CNGTPMODE(cong_mode); 3990 if (cong_mode == X_CONMCTXT_CNGTPMODE_CHANNEL || 3991 cong_mode == X_CONMCTXT_CNGTPMODE_BOTH) { 3992 for (i = 0, ch_map = 0; i < 4; i++) { 3993 if (cong_map & (1 << i)) 3994 ch_map |= 1 << (i << cng_ch_bits_log); 3995 } 3996 val |= V_CONMCTXT_CNGCHMAP(ch_map); 3997 } 3998 rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); 3999 if (rc != 0) { 4000 CH_ERR(sc, "failed to set congestion manager context " 4001 "for ingress queue %d: %d\n", cntxt_id, rc); 4002 } 4003 4004 return (rc); 4005 } 4006 4007 /* 4008 * Idempotent. 4009 */ 4010 static int 4011 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int idx, int intr_idx, 4012 int maxp) 4013 { 4014 int rc; 4015 struct adapter *sc = vi->adapter; 4016 struct ifnet *ifp = vi->ifp; 4017 struct sysctl_oid *oid; 4018 char name[16]; 4019 4020 if (!(rxq->iq.flags & IQ_SW_ALLOCATED)) { 4021 MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED)); 4022 #if defined(INET) || defined(INET6) 4023 rc = tcp_lro_init_args(&rxq->lro, ifp, lro_entries, lro_mbufs); 4024 if (rc != 0) 4025 return (rc); 4026 MPASS(rxq->lro.ifp == ifp); /* also indicates LRO init'ed */ 4027 #endif 4028 rxq->ifp = ifp; 4029 4030 snprintf(name, sizeof(name), "%d", idx); 4031 oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->rxq_oid), 4032 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 4033 "rx queue"); 4034 4035 init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq, 4036 intr_idx, cong_drop, IQ_ETH); 4037 #if defined(INET) || defined(INET6) 4038 if (ifp->if_capenable & IFCAP_LRO) 4039 rxq->iq.flags |= IQ_LRO_ENABLED; 4040 #endif 4041 if (ifp->if_capenable & IFCAP_HWRXTSTMP) 4042 rxq->iq.flags |= IQ_RX_TIMESTAMP; 4043 snprintf(name, sizeof(name), "%s rxq%d-fl", 4044 device_get_nameunit(vi->dev), idx); 4045 init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name); 4046 rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, &vi->ctx, oid); 4047 if (rc != 0) { 4048 CH_ERR(vi, "failed to allocate rxq%d: %d\n", idx, rc); 4049 sysctl_remove_oid(oid, 1, 1); 4050 #if defined(INET) || defined(INET6) 4051 tcp_lro_free(&rxq->lro); 4052 rxq->lro.ifp = NULL; 4053 #endif 4054 return (rc); 4055 } 4056 MPASS(rxq->iq.flags & IQ_SW_ALLOCATED); 4057 add_rxq_sysctls(&vi->ctx, oid, rxq); 4058 } 4059 4060 if (!(rxq->iq.flags & IQ_HW_ALLOCATED)) { 4061 MPASS(rxq->iq.flags & IQ_SW_ALLOCATED); 4062 rc = alloc_iq_fl_hwq(vi, &rxq->iq, &rxq->fl); 4063 if (rc != 0) { 4064 CH_ERR(vi, "failed to create hw rxq%d: %d\n", idx, rc); 4065 return (rc); 4066 } 4067 MPASS(rxq->iq.flags & IQ_HW_ALLOCATED); 4068 4069 if (idx == 0) 4070 sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id; 4071 else 4072 KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id, 4073 ("iq_base mismatch")); 4074 KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF, 4075 ("PF with non-zero iq_base")); 4076 4077 /* 4078 * The freelist is just barely above the starvation threshold 4079 * right now, fill
it up a bit more. 4080 */ 4081 FL_LOCK(&rxq->fl); 4082 refill_fl(sc, &rxq->fl, 128); 4083 FL_UNLOCK(&rxq->fl); 4084 } 4085 4086 return (0); 4087 } 4088 4089 /* 4090 * Idempotent. 4091 */ 4092 static void 4093 free_rxq(struct vi_info *vi, struct sge_rxq *rxq) 4094 { 4095 if (rxq->iq.flags & IQ_HW_ALLOCATED) { 4096 MPASS(rxq->iq.flags & IQ_SW_ALLOCATED); 4097 free_iq_fl_hwq(vi->adapter, &rxq->iq, &rxq->fl); 4098 MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED)); 4099 } 4100 4101 if (rxq->iq.flags & IQ_SW_ALLOCATED) { 4102 MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED)); 4103 #if defined(INET) || defined(INET6) 4104 tcp_lro_free(&rxq->lro); 4105 #endif 4106 free_iq_fl(vi->adapter, &rxq->iq, &rxq->fl); 4107 MPASS(!(rxq->iq.flags & IQ_SW_ALLOCATED)); 4108 bzero(rxq, sizeof(*rxq)); 4109 } 4110 } 4111 4112 static void 4113 add_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, 4114 struct sge_rxq *rxq) 4115 { 4116 struct sysctl_oid_list *children; 4117 4118 if (ctx == NULL || oid == NULL) 4119 return; 4120 4121 children = SYSCTL_CHILDREN(oid); 4122 #if defined(INET) || defined(INET6) 4123 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, 4124 &rxq->lro.lro_queued, 0, NULL); 4125 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, 4126 &rxq->lro.lro_flushed, 0, NULL); 4127 #endif 4128 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, 4129 &rxq->rxcsum, "# of times hardware assisted with checksum"); 4130 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, 4131 &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); 4132 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_rxcsum", CTLFLAG_RD, 4133 &rxq->vxlan_rxcsum, 4134 "# of times hardware assisted with inner checksum (VXLAN)"); 4135 } 4136 4137 #ifdef TCP_OFFLOAD 4138 /* 4139 * Idempotent. 
4140 */ 4141 static int 4142 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int idx, 4143 int intr_idx, int maxp) 4144 { 4145 int rc; 4146 struct adapter *sc = vi->adapter; 4147 struct sysctl_oid *oid; 4148 char name[16]; 4149 4150 if (!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)) { 4151 MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)); 4152 4153 snprintf(name, sizeof(name), "%d", idx); 4154 oid = SYSCTL_ADD_NODE(&vi->ctx, 4155 SYSCTL_CHILDREN(vi->ofld_rxq_oid), OID_AUTO, name, 4156 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload rx queue"); 4157 4158 init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx, 4159 vi->qsize_rxq, intr_idx, ofld_cong_drop, IQ_OFLD); 4160 snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", 4161 device_get_nameunit(vi->dev), idx); 4162 init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name); 4163 rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, &vi->ctx, 4164 oid); 4165 if (rc != 0) { 4166 CH_ERR(vi, "failed to allocate ofld_rxq%d: %d\n", idx, 4167 rc); 4168 sysctl_remove_oid(oid, 1, 1); 4169 return (rc); 4170 } 4171 MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED); 4172 ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK); 4173 ofld_rxq->rx_iscsi_ddp_setup_error = 4174 counter_u64_alloc(M_WAITOK); 4175 add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq); 4176 } 4177 4178 if (!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)) { 4179 MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED); 4180 rc = alloc_iq_fl_hwq(vi, &ofld_rxq->iq, &ofld_rxq->fl); 4181 if (rc != 0) { 4182 CH_ERR(vi, "failed to create hw ofld_rxq%d: %d\n", idx, 4183 rc); 4184 return (rc); 4185 } 4186 MPASS(ofld_rxq->iq.flags & IQ_HW_ALLOCATED); 4187 } 4188 return (rc); 4189 } 4190 4191 /* 4192 * Idempotent. 4193 */ 4194 static void 4195 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq) 4196 { 4197 if (ofld_rxq->iq.flags & IQ_HW_ALLOCATED) { 4198 MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED); 4199 free_iq_fl_hwq(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl); 4200 MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)); 4201 } 4202 4203 if (ofld_rxq->iq.flags & IQ_SW_ALLOCATED) { 4204 MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)); 4205 free_iq_fl(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl); 4206 MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)); 4207 counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok); 4208 counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error); 4209 bzero(ofld_rxq, sizeof(*ofld_rxq)); 4210 } 4211 } 4212 4213 static void 4214 add_ofld_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, 4215 struct sge_ofld_rxq *ofld_rxq) 4216 { 4217 struct sysctl_oid_list *children; 4218 4219 if (ctx == NULL || oid == NULL) 4220 return; 4221 4222 children = SYSCTL_CHILDREN(oid); 4223 SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, 4224 "rx_toe_tls_records", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_records, 4225 "# of TOE TLS records received"); 4226 SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, 4227 "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets, 4228 "# of payload octets in received TOE TLS records"); 4229 4230 oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi", 4231 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics"); 4232 children = SYSCTL_CHILDREN(oid); 4233 4234 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_ok", 4235 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_ok, 4236 "# of times DDP buffer was setup successfully."); 4237 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_error", 4238 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_error, 4239 "# of times DDP buffer 
setup failed."); 4240 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_octets", 4241 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_octets, 0, 4242 "# of octets placed directly"); 4243 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_pdus", 4244 CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_pdus, 0, 4245 "# of PDUs with data placed directly."); 4246 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_octets", 4247 CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_octets, 0, 4248 "# of data octets delivered in freelist"); 4249 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_pdus", 4250 CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_pdus, 0, 4251 "# of PDUs with data delivered in freelist"); 4252 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "padding_errors", 4253 CTLFLAG_RD, &ofld_rxq->rx_iscsi_padding_errors, 0, 4254 "# of PDUs with invalid padding"); 4255 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "header_digest_errors", 4256 CTLFLAG_RD, &ofld_rxq->rx_iscsi_header_digest_errors, 0, 4257 "# of PDUs with invalid header digests"); 4258 SYSCTL_ADD_U64(ctx, children, OID_AUTO, "data_digest_errors", 4259 CTLFLAG_RD, &ofld_rxq->rx_iscsi_data_digest_errors, 0, 4260 "# of PDUs with invalid data digests"); 4261 } 4262 #endif 4263 4264 /* 4265 * Returns a reasonable automatic cidx flush threshold for a given queue size. 4266 */ 4267 static u_int 4268 qsize_to_fthresh(int qsize) 4269 { 4270 u_int fthresh; 4271 4272 while (!powerof2(qsize)) 4273 qsize++; 4274 fthresh = ilog2(qsize); 4275 if (fthresh > X_CIDXFLUSHTHRESH_128) 4276 fthresh = X_CIDXFLUSHTHRESH_128; 4277 4278 return (fthresh); 4279 } 4280 4281 static int 4282 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) 4283 { 4284 int rc, cntxt_id; 4285 struct fw_eq_ctrl_cmd c; 4286 int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; 4287 4288 bzero(&c, sizeof(c)); 4289 4290 c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | 4291 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | 4292 V_FW_EQ_CTRL_CMD_VFN(0)); 4293 c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | 4294 F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); 4295 c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); 4296 c.physeqid_pkd = htobe32(0); 4297 c.fetchszm_to_iqid = 4298 htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | 4299 V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | 4300 F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); 4301 c.dcaen_to_eqsize = 4302 htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? 
4303 X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | 4304 V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | 4305 V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) | 4306 V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); 4307 c.eqaddr = htobe64(eq->ba); 4308 4309 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); 4310 if (rc != 0) { 4311 CH_ERR(sc, "failed to create hw ctrlq for tx_chan %d: %d\n", 4312 eq->tx_chan, rc); 4313 return (rc); 4314 } 4315 4316 eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); 4317 eq->abs_id = G_FW_EQ_CTRL_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); 4318 cntxt_id = eq->cntxt_id - sc->sge.eq_start; 4319 if (cntxt_id >= sc->sge.eqmap_sz) 4320 panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, 4321 cntxt_id, sc->sge.eqmap_sz - 1); 4322 sc->sge.eqmap[cntxt_id] = eq; 4323 4324 return (rc); 4325 } 4326 4327 static int 4328 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) 4329 { 4330 int rc, cntxt_id; 4331 struct fw_eq_eth_cmd c; 4332 int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; 4333 4334 bzero(&c, sizeof(c)); 4335 4336 c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | 4337 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | 4338 V_FW_EQ_ETH_CMD_VFN(0)); 4339 c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | 4340 F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); 4341 c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | 4342 F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); 4343 c.fetchszm_to_iqid = 4344 htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | 4345 V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | 4346 V_FW_EQ_ETH_CMD_IQID(eq->iqid)); 4347 c.dcaen_to_eqsize = 4348 htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? 4349 X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | 4350 V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | 4351 V_FW_EQ_ETH_CMD_EQSIZE(qsize)); 4352 c.eqaddr = htobe64(eq->ba); 4353 4354 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); 4355 if (rc != 0) { 4356 device_printf(vi->dev, 4357 "failed to create Ethernet egress queue: %d\n", rc); 4358 return (rc); 4359 } 4360 4361 eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); 4362 eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); 4363 cntxt_id = eq->cntxt_id - sc->sge.eq_start; 4364 if (cntxt_id >= sc->sge.eqmap_sz) 4365 panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, 4366 cntxt_id, sc->sge.eqmap_sz - 1); 4367 sc->sge.eqmap[cntxt_id] = eq; 4368 4369 return (rc); 4370 } 4371 4372 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 4373 static int 4374 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) 4375 { 4376 int rc, cntxt_id; 4377 struct fw_eq_ofld_cmd c; 4378 int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; 4379 4380 bzero(&c, sizeof(c)); 4381 4382 c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | 4383 F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | 4384 V_FW_EQ_OFLD_CMD_VFN(0)); 4385 c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | 4386 F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); 4387 c.fetchszm_to_iqid = 4388 htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | 4389 V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | 4390 F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); 4391 c.dcaen_to_eqsize = 4392 htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? 
4393 X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | 4394 V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | 4395 V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) | 4396 V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); 4397 c.eqaddr = htobe64(eq->ba); 4398 4399 rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); 4400 if (rc != 0) { 4401 device_printf(vi->dev, 4402 "failed to create egress queue for TCP offload: %d\n", rc); 4403 return (rc); 4404 } 4405 4406 eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); 4407 eq->abs_id = G_FW_EQ_OFLD_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); 4408 cntxt_id = eq->cntxt_id - sc->sge.eq_start; 4409 if (cntxt_id >= sc->sge.eqmap_sz) 4410 panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, 4411 cntxt_id, sc->sge.eqmap_sz - 1); 4412 sc->sge.eqmap[cntxt_id] = eq; 4413 4414 return (rc); 4415 } 4416 #endif 4417 4418 /* SW only */ 4419 static int 4420 alloc_eq(struct adapter *sc, struct sge_eq *eq, struct sysctl_ctx_list *ctx, 4421 struct sysctl_oid *oid) 4422 { 4423 int rc, qsize; 4424 size_t len; 4425 4426 MPASS(!(eq->flags & EQ_SW_ALLOCATED)); 4427 4428 qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; 4429 len = qsize * EQ_ESIZE; 4430 rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, 4431 (void **)&eq->desc); 4432 if (rc) 4433 return (rc); 4434 if (ctx != NULL && oid != NULL) 4435 add_eq_sysctls(sc, ctx, oid, eq); 4436 eq->flags |= EQ_SW_ALLOCATED; 4437 4438 return (0); 4439 } 4440 4441 /* SW only */ 4442 static void 4443 free_eq(struct adapter *sc, struct sge_eq *eq) 4444 { 4445 MPASS(eq->flags & EQ_SW_ALLOCATED); 4446 if (eq->type == EQ_ETH) 4447 MPASS(eq->pidx == eq->cidx); 4448 4449 free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); 4450 mtx_destroy(&eq->eq_lock); 4451 bzero(eq, sizeof(*eq)); 4452 } 4453 4454 static void 4455 add_eq_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, 4456 struct sysctl_oid *oid, struct sge_eq *eq) 4457 { 4458 struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); 4459 4460 SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba, 4461 "bus address of descriptor ring"); 4462 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, 4463 eq->sidx * EQ_ESIZE + sc->params.sge.spg_len, 4464 "desc ring size in bytes"); 4465 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, 4466 &eq->abs_id, 0, "absolute id of the queue"); 4467 SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, 4468 &eq->cntxt_id, 0, "SGE context id of the queue"); 4469 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &eq->cidx, 4470 0, "consumer index"); 4471 SYSCTL_ADD_U16(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &eq->pidx, 4472 0, "producer index"); 4473 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL, 4474 eq->sidx, "status page index"); 4475 } 4476 4477 static int 4478 alloc_eq_hwq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) 4479 { 4480 int rc; 4481 4482 MPASS(!(eq->flags & EQ_HW_ALLOCATED)); 4483 4484 eq->iqid = eq->iq->cntxt_id; 4485 eq->pidx = eq->cidx = eq->dbidx = 0; 4486 /* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. 
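 * It is still reset here so that queues which do use it (the Ethernet tx
 * path tracks in it the pidx at which the last egress update was
 * requested) always start from a known state.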
*/ 4487 eq->equeqidx = 0; 4488 eq->doorbells = sc->doorbells; 4489 bzero(eq->desc, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len); 4490 4491 switch (eq->type) { 4492 case EQ_CTRL: 4493 rc = ctrl_eq_alloc(sc, eq); 4494 break; 4495 4496 case EQ_ETH: 4497 rc = eth_eq_alloc(sc, vi, eq); 4498 break; 4499 4500 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 4501 case EQ_OFLD: 4502 rc = ofld_eq_alloc(sc, vi, eq); 4503 break; 4504 #endif 4505 4506 default: 4507 panic("%s: invalid eq type %d.", __func__, eq->type); 4508 } 4509 if (rc != 0) { 4510 CH_ERR(sc, "failed to allocate egress queue(%d): %d\n", 4511 eq->type, rc); 4512 return (rc); 4513 } 4514 4515 if (isset(&eq->doorbells, DOORBELL_UDB) || 4516 isset(&eq->doorbells, DOORBELL_UDBWC) || 4517 isset(&eq->doorbells, DOORBELL_WCWR)) { 4518 uint32_t s_qpp = sc->params.sge.eq_s_qpp; 4519 uint32_t mask = (1 << s_qpp) - 1; 4520 volatile uint8_t *udb; 4521 4522 udb = sc->udbs_base + UDBS_DB_OFFSET; 4523 udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ 4524 eq->udb_qid = eq->cntxt_id & mask; /* id in page */ 4525 if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) 4526 clrbit(&eq->doorbells, DOORBELL_WCWR); 4527 else { 4528 udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ 4529 eq->udb_qid = 0; 4530 } 4531 eq->udb = (volatile void *)udb; 4532 } 4533 4534 eq->flags |= EQ_HW_ALLOCATED; 4535 return (0); 4536 } 4537 4538 static int 4539 free_eq_hwq(struct adapter *sc, struct vi_info *vi __unused, struct sge_eq *eq) 4540 { 4541 int rc; 4542 4543 MPASS(eq->flags & EQ_HW_ALLOCATED); 4544 4545 switch (eq->type) { 4546 case EQ_CTRL: 4547 rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); 4548 break; 4549 case EQ_ETH: 4550 rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); 4551 break; 4552 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 4553 case EQ_OFLD: 4554 rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); 4555 break; 4556 #endif 4557 default: 4558 panic("%s: invalid eq type %d.", __func__, eq->type); 4559 } 4560 if (rc != 0) { 4561 CH_ERR(sc, "failed to free eq (type %d): %d\n", eq->type, rc); 4562 return (rc); 4563 } 4564 eq->flags &= ~EQ_HW_ALLOCATED; 4565 4566 return (0); 4567 } 4568 4569 static int 4570 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq, 4571 struct sysctl_ctx_list *ctx, struct sysctl_oid *oid) 4572 { 4573 struct sge_eq *eq = &wrq->eq; 4574 int rc; 4575 4576 MPASS(!(eq->flags & EQ_SW_ALLOCATED)); 4577 4578 rc = alloc_eq(sc, eq, ctx, oid); 4579 if (rc) 4580 return (rc); 4581 MPASS(eq->flags & EQ_SW_ALLOCATED); 4582 /* Can't fail after this. 
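 * Everything below is plain software initialization of the wrq (drain
 * task, work request lists, counters, sysctls) and has no failure path,
 * so the EQ_SW_ALLOCATED state set above stays accurate.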
*/ 4583 4584 wrq->adapter = sc; 4585 TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); 4586 TAILQ_INIT(&wrq->incomplete_wrs); 4587 STAILQ_INIT(&wrq->wr_list); 4588 wrq->nwr_pending = 0; 4589 wrq->ndesc_needed = 0; 4590 add_wrq_sysctls(ctx, oid, wrq); 4591 4592 return (0); 4593 } 4594 4595 static void 4596 free_wrq(struct adapter *sc, struct sge_wrq *wrq) 4597 { 4598 free_eq(sc, &wrq->eq); 4599 MPASS(wrq->nwr_pending == 0); 4600 MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); 4601 MPASS(STAILQ_EMPTY(&wrq->wr_list)); 4602 bzero(wrq, sizeof(*wrq)); 4603 } 4604 4605 static void 4606 add_wrq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, 4607 struct sge_wrq *wrq) 4608 { 4609 struct sysctl_oid_list *children; 4610 4611 if (ctx == NULL || oid == NULL) 4612 return; 4613 4614 children = SYSCTL_CHILDREN(oid); 4615 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, 4616 &wrq->tx_wrs_direct, "# of work requests (direct)"); 4617 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, 4618 &wrq->tx_wrs_copied, "# of work requests (copied)"); 4619 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD, 4620 &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)"); 4621 } 4622 4623 /* 4624 * Idempotent. 4625 */ 4626 static int 4627 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx) 4628 { 4629 int rc, iqidx; 4630 struct port_info *pi = vi->pi; 4631 struct adapter *sc = vi->adapter; 4632 struct sge_eq *eq = &txq->eq; 4633 struct txpkts *txp; 4634 char name[16]; 4635 struct sysctl_oid *oid; 4636 4637 if (!(eq->flags & EQ_SW_ALLOCATED)) { 4638 MPASS(!(eq->flags & EQ_HW_ALLOCATED)); 4639 4640 snprintf(name, sizeof(name), "%d", idx); 4641 oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->txq_oid), 4642 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 4643 "tx queue"); 4644 4645 iqidx = vi->first_rxq + (idx % vi->nrxq); 4646 snprintf(name, sizeof(name), "%s txq%d", 4647 device_get_nameunit(vi->dev), idx); 4648 init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, 4649 &sc->sge.rxq[iqidx].iq, name); 4650 4651 rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, 4652 can_resume_eth_tx, M_CXGBE, &eq->eq_lock, M_WAITOK); 4653 if (rc != 0) { 4654 CH_ERR(vi, "failed to allocate mp_ring for txq%d: %d\n", 4655 idx, rc); 4656 failed: 4657 sysctl_remove_oid(oid, 1, 1); 4658 return (rc); 4659 } 4660 4661 rc = alloc_eq(sc, eq, &vi->ctx, oid); 4662 if (rc) { 4663 CH_ERR(vi, "failed to allocate txq%d: %d\n", idx, rc); 4664 mp_ring_free(txq->r); 4665 goto failed; 4666 } 4667 MPASS(eq->flags & EQ_SW_ALLOCATED); 4668 /* Can't fail after this point. */ 4669 4670 TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); 4671 txq->ifp = vi->ifp; 4672 txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); 4673 txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, 4674 M_ZERO | M_WAITOK); 4675 4676 add_txq_sysctls(vi, &vi->ctx, oid, txq); 4677 } 4678 4679 if (!(eq->flags & EQ_HW_ALLOCATED)) { 4680 MPASS(eq->flags & EQ_SW_ALLOCATED); 4681 rc = alloc_eq_hwq(sc, vi, eq); 4682 if (rc != 0) { 4683 CH_ERR(vi, "failed to create hw txq%d: %d\n", idx, rc); 4684 return (rc); 4685 } 4686 MPASS(eq->flags & EQ_HW_ALLOCATED); 4687 /* Can't fail after this point. 
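 * What follows only records bookkeeping derived from the new hardware
 * queue: the eq_base offset, the txpkts coalescing limit, and the
 * precomputed CPL control word.  None of it can fail.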
*/ 4688 4689 if (idx == 0) 4690 sc->sge.eq_base = eq->abs_id - eq->cntxt_id; 4691 else 4692 KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id, 4693 ("eq_base mismatch")); 4694 KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF, 4695 ("PF with non-zero eq_base")); 4696 4697 txp = &txq->txp; 4698 MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr); 4699 txq->txp.max_npkt = min(nitems(txp->mb), 4700 sc->params.max_pkts_per_eth_tx_pkts_wr); 4701 if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF)) 4702 txq->txp.max_npkt--; 4703 4704 if (vi->flags & TX_USES_VM_WR) 4705 txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | 4706 V_TXPKT_INTF(pi->tx_chan)); 4707 else 4708 txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | 4709 V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | 4710 V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); 4711 4712 txq->tc_idx = -1; 4713 } 4714 4715 return (0); 4716 } 4717 4718 /* 4719 * Idempotent. 4720 */ 4721 static void 4722 free_txq(struct vi_info *vi, struct sge_txq *txq) 4723 { 4724 struct adapter *sc = vi->adapter; 4725 struct sge_eq *eq = &txq->eq; 4726 4727 if (eq->flags & EQ_HW_ALLOCATED) { 4728 MPASS(eq->flags & EQ_SW_ALLOCATED); 4729 free_eq_hwq(sc, NULL, eq); 4730 MPASS(!(eq->flags & EQ_HW_ALLOCATED)); 4731 } 4732 4733 if (eq->flags & EQ_SW_ALLOCATED) { 4734 MPASS(!(eq->flags & EQ_HW_ALLOCATED)); 4735 sglist_free(txq->gl); 4736 free(txq->sdesc, M_CXGBE); 4737 mp_ring_free(txq->r); 4738 free_eq(sc, eq); 4739 MPASS(!(eq->flags & EQ_SW_ALLOCATED)); 4740 bzero(txq, sizeof(*txq)); 4741 } 4742 } 4743 4744 static void 4745 add_txq_sysctls(struct vi_info *vi, struct sysctl_ctx_list *ctx, 4746 struct sysctl_oid *oid, struct sge_txq *txq) 4747 { 4748 struct adapter *sc; 4749 struct sysctl_oid_list *children; 4750 4751 if (ctx == NULL || oid == NULL) 4752 return; 4753 4754 sc = vi->adapter; 4755 children = SYSCTL_CHILDREN(oid); 4756 4757 mp_ring_sysctls(txq->r, ctx, children); 4758 4759 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tc", 4760 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, txq - sc->sge.txq, 4761 sysctl_tc, "I", "traffic class (-1 means none)"); 4762 4763 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, 4764 &txq->txcsum, "# of times hardware assisted with checksum"); 4765 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, 4766 &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); 4767 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, 4768 &txq->tso_wrs, "# of TSO work requests"); 4769 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, 4770 &txq->imm_wrs, "# of work requests with immediate data"); 4771 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, 4772 &txq->sgl_wrs, "# of work requests with direct SGL"); 4773 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, 4774 &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); 4775 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD, 4776 &txq->txpkts0_wrs, "# of txpkts (type 0) work requests"); 4777 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD, 4778 &txq->txpkts1_wrs, "# of txpkts (type 1) work requests"); 4779 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD, 4780 &txq->txpkts0_pkts, 4781 "# of frames tx'd using type0 txpkts work requests"); 4782 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD, 4783 &txq->txpkts1_pkts, 4784 "# of frames tx'd using type1 txpkts work requests"); 4785 
SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts_flush", CTLFLAG_RD, 4786 &txq->txpkts_flush, 4787 "# of times txpkts had to be flushed out by an egress-update"); 4788 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD, 4789 &txq->raw_wrs, "# of raw work requests (non-packets)"); 4790 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_tso_wrs", CTLFLAG_RD, 4791 &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests"); 4792 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_txcsum", CTLFLAG_RD, 4793 &txq->vxlan_txcsum, 4794 "# of times hardware assisted with inner checksums (VXLAN)"); 4795 4796 #ifdef KERN_TLS 4797 if (is_ktls(sc)) { 4798 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_records", 4799 CTLFLAG_RD, &txq->kern_tls_records, 4800 "# of NIC TLS records transmitted"); 4801 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_short", 4802 CTLFLAG_RD, &txq->kern_tls_short, 4803 "# of short NIC TLS records transmitted"); 4804 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_partial", 4805 CTLFLAG_RD, &txq->kern_tls_partial, 4806 "# of partial NIC TLS records transmitted"); 4807 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_full", 4808 CTLFLAG_RD, &txq->kern_tls_full, 4809 "# of full NIC TLS records transmitted"); 4810 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_octets", 4811 CTLFLAG_RD, &txq->kern_tls_octets, 4812 "# of payload octets in transmitted NIC TLS records"); 4813 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_waste", 4814 CTLFLAG_RD, &txq->kern_tls_waste, 4815 "# of octets DMAd but not transmitted in NIC TLS records"); 4816 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_options", 4817 CTLFLAG_RD, &txq->kern_tls_options, 4818 "# of NIC TLS options-only packets transmitted"); 4819 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_header", 4820 CTLFLAG_RD, &txq->kern_tls_header, 4821 "# of NIC TLS header-only packets transmitted"); 4822 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin", 4823 CTLFLAG_RD, &txq->kern_tls_fin, 4824 "# of NIC TLS FIN-only packets transmitted"); 4825 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin_short", 4826 CTLFLAG_RD, &txq->kern_tls_fin_short, 4827 "# of NIC TLS padded FIN packets on short TLS records"); 4828 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_cbc", 4829 CTLFLAG_RD, &txq->kern_tls_cbc, 4830 "# of NIC TLS sessions using AES-CBC"); 4831 SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_gcm", 4832 CTLFLAG_RD, &txq->kern_tls_gcm, 4833 "# of NIC TLS sessions using AES-GCM"); 4834 } 4835 #endif 4836 } 4837 4838 #if defined(TCP_OFFLOAD) || defined(RATELIMIT) 4839 /* 4840 * Idempotent. 
4841 */ 4842 static int 4843 alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx) 4844 { 4845 struct sysctl_oid *oid; 4846 struct port_info *pi = vi->pi; 4847 struct adapter *sc = vi->adapter; 4848 struct sge_eq *eq = &ofld_txq->wrq.eq; 4849 int rc, iqidx; 4850 char name[16]; 4851 4852 MPASS(idx >= 0); 4853 MPASS(idx < vi->nofldtxq); 4854 4855 if (!(eq->flags & EQ_SW_ALLOCATED)) { 4856 snprintf(name, sizeof(name), "%d", idx); 4857 oid = SYSCTL_ADD_NODE(&vi->ctx, 4858 SYSCTL_CHILDREN(vi->ofld_txq_oid), OID_AUTO, name, 4859 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue"); 4860 4861 snprintf(name, sizeof(name), "%s ofld_txq%d", 4862 device_get_nameunit(vi->dev), idx); 4863 if (vi->nofldrxq > 0) { 4864 iqidx = vi->first_ofld_rxq + (idx % vi->nofldrxq); 4865 init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, 4866 &sc->sge.ofld_rxq[iqidx].iq, name); 4867 } else { 4868 iqidx = vi->first_rxq + (idx % vi->nrxq); 4869 init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, 4870 &sc->sge.rxq[iqidx].iq, name); 4871 } 4872 4873 rc = alloc_wrq(sc, vi, &ofld_txq->wrq, &vi->ctx, oid); 4874 if (rc != 0) { 4875 CH_ERR(vi, "failed to allocate ofld_txq%d: %d\n", idx, 4876 rc); 4877 sysctl_remove_oid(oid, 1, 1); 4878 return (rc); 4879 } 4880 MPASS(eq->flags & EQ_SW_ALLOCATED); 4881 /* Can't fail after this point. */ 4882 4883 ofld_txq->tx_iscsi_pdus = counter_u64_alloc(M_WAITOK); 4884 ofld_txq->tx_iscsi_octets = counter_u64_alloc(M_WAITOK); 4885 ofld_txq->tx_iscsi_iso_wrs = counter_u64_alloc(M_WAITOK); 4886 ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK); 4887 ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK); 4888 add_ofld_txq_sysctls(&vi->ctx, oid, ofld_txq); 4889 } 4890 4891 if (!(eq->flags & EQ_HW_ALLOCATED)) { 4892 rc = alloc_eq_hwq(sc, vi, eq); 4893 if (rc != 0) { 4894 CH_ERR(vi, "failed to create hw ofld_txq%d: %d\n", idx, 4895 rc); 4896 return (rc); 4897 } 4898 MPASS(eq->flags & EQ_HW_ALLOCATED); 4899 } 4900 4901 return (0); 4902 } 4903 4904 /* 4905 * Idempotent. 
4906 */ 4907 static void 4908 free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq) 4909 { 4910 struct adapter *sc = vi->adapter; 4911 struct sge_eq *eq = &ofld_txq->wrq.eq; 4912 4913 if (eq->flags & EQ_HW_ALLOCATED) { 4914 MPASS(eq->flags & EQ_SW_ALLOCATED); 4915 free_eq_hwq(sc, NULL, eq); 4916 MPASS(!(eq->flags & EQ_HW_ALLOCATED)); 4917 } 4918 4919 if (eq->flags & EQ_SW_ALLOCATED) { 4920 MPASS(!(eq->flags & EQ_HW_ALLOCATED)); 4921 counter_u64_free(ofld_txq->tx_iscsi_pdus); 4922 counter_u64_free(ofld_txq->tx_iscsi_octets); 4923 counter_u64_free(ofld_txq->tx_iscsi_iso_wrs); 4924 counter_u64_free(ofld_txq->tx_toe_tls_records); 4925 counter_u64_free(ofld_txq->tx_toe_tls_octets); 4926 free_wrq(sc, &ofld_txq->wrq); 4927 MPASS(!(eq->flags & EQ_SW_ALLOCATED)); 4928 bzero(ofld_txq, sizeof(*ofld_txq)); 4929 } 4930 } 4931 4932 static void 4933 add_ofld_txq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, 4934 struct sge_ofld_txq *ofld_txq) 4935 { 4936 struct sysctl_oid_list *children; 4937 4938 if (ctx == NULL || oid == NULL) 4939 return; 4940 4941 children = SYSCTL_CHILDREN(oid); 4942 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_pdus", 4943 CTLFLAG_RD, &ofld_txq->tx_iscsi_pdus, 4944 "# of iSCSI PDUs transmitted"); 4945 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_octets", 4946 CTLFLAG_RD, &ofld_txq->tx_iscsi_octets, 4947 "# of payload octets in transmitted iSCSI PDUs"); 4948 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_iso_wrs", 4949 CTLFLAG_RD, &ofld_txq->tx_iscsi_iso_wrs, 4950 "# of iSCSI segmentation offload work requests"); 4951 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_records", 4952 CTLFLAG_RD, &ofld_txq->tx_toe_tls_records, 4953 "# of TOE TLS records transmitted"); 4954 SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_octets", 4955 CTLFLAG_RD, &ofld_txq->tx_toe_tls_octets, 4956 "# of payload octets in transmitted TOE TLS records"); 4957 } 4958 #endif 4959 4960 static void 4961 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) 4962 { 4963 bus_addr_t *ba = arg; 4964 4965 KASSERT(nseg == 1, 4966 ("%s meant for single segment mappings only.", __func__)); 4967 4968 *ba = error ? 0 : segs->ds_addr; 4969 } 4970 4971 static inline void 4972 ring_fl_db(struct adapter *sc, struct sge_fl *fl) 4973 { 4974 uint32_t n, v; 4975 4976 n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx); 4977 MPASS(n > 0); 4978 4979 wmb(); 4980 v = fl->dbval | V_PIDX(n); 4981 if (fl->udb) 4982 *fl->udb = htole32(v); 4983 else 4984 t4_write_reg(sc, sc->sge_kdoorbell_reg, v); 4985 IDXINCR(fl->dbidx, n, fl->sidx); 4986 } 4987 4988 /* 4989 * Fills up the freelist by allocating up to 'n' buffers. Buffers that are 4990 * recycled do not count towards this allocation budget. 4991 * 4992 * Returns non-zero to indicate that this freelist should be added to the list 4993 * of starving freelists. 4994 */ 4995 static int 4996 refill_fl(struct adapter *sc, struct sge_fl *fl, int n) 4997 { 4998 __be64 *d; 4999 struct fl_sdesc *sd; 5000 uintptr_t pa; 5001 caddr_t cl; 5002 struct rx_buf_info *rxb; 5003 struct cluster_metadata *clm; 5004 uint16_t max_pidx, zidx = fl->zidx; 5005 uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ 5006 5007 FL_LOCK_ASSERT_OWNED(fl); 5008 5009 /* 5010 * We always stop at the beginning of the hardware descriptor that's just 5011 * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, 5012 * which would mean an empty freelist to the chip. 
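 * For illustration: with fl->sidx = 64 (512 buffer slots) and hw_cidx = 10,
 * max_pidx is 9 and refilling stops once fl->pidx reaches 72, i.e. slot 0
 * of hardware descriptor 9.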
5013 */ 5014 max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; 5015 if (fl->pidx == max_pidx * 8) 5016 return (0); 5017 5018 d = &fl->desc[fl->pidx]; 5019 sd = &fl->sdesc[fl->pidx]; 5020 rxb = &sc->sge.rx_buf_info[zidx]; 5021 5022 while (n > 0) { 5023 5024 if (sd->cl != NULL) { 5025 5026 if (sd->nmbuf == 0) { 5027 /* 5028 * Fast recycle without involving any atomics on 5029 * the cluster's metadata (if the cluster has 5030 * metadata). This happens when all frames 5031 * received in the cluster were small enough to 5032 * fit within a single mbuf each. 5033 */ 5034 fl->cl_fast_recycled++; 5035 goto recycled; 5036 } 5037 5038 /* 5039 * Cluster is guaranteed to have metadata. Clusters 5040 * without metadata always take the fast recycle path 5041 * when they're recycled. 5042 */ 5043 clm = cl_metadata(sd); 5044 MPASS(clm != NULL); 5045 5046 if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { 5047 fl->cl_recycled++; 5048 counter_u64_add(extfree_rels, 1); 5049 goto recycled; 5050 } 5051 sd->cl = NULL; /* gave up my reference */ 5052 } 5053 MPASS(sd->cl == NULL); 5054 cl = uma_zalloc(rxb->zone, M_NOWAIT); 5055 if (__predict_false(cl == NULL)) { 5056 if (zidx != fl->safe_zidx) { 5057 zidx = fl->safe_zidx; 5058 rxb = &sc->sge.rx_buf_info[zidx]; 5059 cl = uma_zalloc(rxb->zone, M_NOWAIT); 5060 } 5061 if (cl == NULL) 5062 break; 5063 } 5064 fl->cl_allocated++; 5065 n--; 5066 5067 pa = pmap_kextract((vm_offset_t)cl); 5068 sd->cl = cl; 5069 sd->zidx = zidx; 5070 5071 if (fl->flags & FL_BUF_PACKING) { 5072 *d = htobe64(pa | rxb->hwidx2); 5073 sd->moff = rxb->size2; 5074 } else { 5075 *d = htobe64(pa | rxb->hwidx1); 5076 sd->moff = 0; 5077 } 5078 recycled: 5079 sd->nmbuf = 0; 5080 d++; 5081 sd++; 5082 if (__predict_false((++fl->pidx & 7) == 0)) { 5083 uint16_t pidx = fl->pidx >> 3; 5084 5085 if (__predict_false(pidx == fl->sidx)) { 5086 fl->pidx = 0; 5087 pidx = 0; 5088 sd = fl->sdesc; 5089 d = fl->desc; 5090 } 5091 if (n < 8 || pidx == max_pidx) 5092 break; 5093 5094 if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) 5095 ring_fl_db(sc, fl); 5096 } 5097 } 5098 5099 if ((fl->pidx >> 3) != fl->dbidx) 5100 ring_fl_db(sc, fl); 5101 5102 return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); 5103 } 5104 5105 /* 5106 * Attempt to refill all starving freelists. 5107 */ 5108 static void 5109 refill_sfl(void *arg) 5110 { 5111 struct adapter *sc = arg; 5112 struct sge_fl *fl, *fl_temp; 5113 5114 mtx_assert(&sc->sfl_lock, MA_OWNED); 5115 TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { 5116 FL_LOCK(fl); 5117 refill_fl(sc, fl, 64); 5118 if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { 5119 TAILQ_REMOVE(&sc->sfl, fl, link); 5120 fl->flags &= ~FL_STARVING; 5121 } 5122 FL_UNLOCK(fl); 5123 } 5124 5125 if (!TAILQ_EMPTY(&sc->sfl)) 5126 callout_schedule(&sc->sfl_callout, hz / 5); 5127 } 5128 5129 /* 5130 * Release the driver's reference on all buffers in the given freelist. Buffers 5131 * with kernel references cannot be freed and will prevent the driver from being 5132 * unloaded safely. 
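 * For packing buffers only the driver's own reference is dropped here; a
 * cluster whose refcount does not reach zero is freed later, when the
 * remaining kernel references are released.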
5133 */ 5134 void 5135 free_fl_buffers(struct adapter *sc, struct sge_fl *fl) 5136 { 5137 struct fl_sdesc *sd; 5138 struct cluster_metadata *clm; 5139 int i; 5140 5141 sd = fl->sdesc; 5142 for (i = 0; i < fl->sidx * 8; i++, sd++) { 5143 if (sd->cl == NULL) 5144 continue; 5145 5146 if (sd->nmbuf == 0) 5147 uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl); 5148 else if (fl->flags & FL_BUF_PACKING) { 5149 clm = cl_metadata(sd); 5150 if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { 5151 uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, 5152 sd->cl); 5153 counter_u64_add(extfree_rels, 1); 5154 } 5155 } 5156 sd->cl = NULL; 5157 } 5158 5159 if (fl->flags & FL_BUF_RESUME) { 5160 m_freem(fl->m0); 5161 fl->flags &= ~FL_BUF_RESUME; 5162 } 5163 } 5164 5165 static inline void 5166 get_pkt_gl(struct mbuf *m, struct sglist *gl) 5167 { 5168 int rc; 5169 5170 M_ASSERTPKTHDR(m); 5171 5172 sglist_reset(gl); 5173 rc = sglist_append_mbuf(gl, m); 5174 if (__predict_false(rc != 0)) { 5175 panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " 5176 "with %d.", __func__, m, mbuf_nsegs(m), rc); 5177 } 5178 5179 KASSERT(gl->sg_nseg == mbuf_nsegs(m), 5180 ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, 5181 mbuf_nsegs(m), gl->sg_nseg)); 5182 #if 0 /* vm_wr not readily available here. */ 5183 KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr), 5184 ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, 5185 gl->sg_nseg, max_nsegs_allowed(m, vm_wr))); 5186 #endif 5187 } 5188 5189 /* 5190 * len16 for a txpkt WR with a GL. Includes the firmware work request header. 5191 */ 5192 static inline u_int 5193 txpkt_len16(u_int nsegs, const u_int extra) 5194 { 5195 u_int n; 5196 5197 MPASS(nsegs > 0); 5198 5199 nsegs--; /* first segment is part of ulptx_sgl */ 5200 n = extra + sizeof(struct fw_eth_tx_pkt_wr) + 5201 sizeof(struct cpl_tx_pkt_core) + 5202 sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); 5203 5204 return (howmany(n, 16)); 5205 } 5206 5207 /* 5208 * len16 for a txpkt_vm WR with a GL. Includes the firmware work 5209 * request header. 5210 */ 5211 static inline u_int 5212 txpkt_vm_len16(u_int nsegs, const u_int extra) 5213 { 5214 u_int n; 5215 5216 MPASS(nsegs > 0); 5217 5218 nsegs--; /* first segment is part of ulptx_sgl */ 5219 n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) + 5220 sizeof(struct cpl_tx_pkt_core) + 5221 sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); 5222 5223 return (howmany(n, 16)); 5224 } 5225 5226 static inline void 5227 calculate_mbuf_len16(struct mbuf *m, bool vm_wr) 5228 { 5229 const int lso = sizeof(struct cpl_tx_pkt_lso_core); 5230 const int tnl_lso = sizeof(struct cpl_tx_tnl_lso); 5231 5232 if (vm_wr) { 5233 if (needs_tso(m)) 5234 set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso)); 5235 else 5236 set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0)); 5237 return; 5238 } 5239 5240 if (needs_tso(m)) { 5241 if (needs_vxlan_tso(m)) 5242 set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso)); 5243 else 5244 set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso)); 5245 } else 5246 set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0)); 5247 } 5248 5249 /* 5250 * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work 5251 * request header. 
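 * Example: nsegs = 4 leaves 3 segments outside the embedded ulptx_sgl,
 * adding 8 * ((3 * 3) / 2 + (3 & 1)) = 40 bytes of SGL on top of the
 * ulp_txpkt, ulptx_idata, cpl_tx_pkt_core, and ulptx_sgl headers before
 * rounding up to 16-byte units.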
5252 */ 5253 static inline u_int 5254 txpkts0_len16(u_int nsegs) 5255 { 5256 u_int n; 5257 5258 MPASS(nsegs > 0); 5259 5260 nsegs--; /* first segment is part of ulptx_sgl */ 5261 n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + 5262 sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 5263 8 * ((3 * nsegs) / 2 + (nsegs & 1)); 5264 5265 return (howmany(n, 16)); 5266 } 5267 5268 /* 5269 * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work 5270 * request header. 5271 */ 5272 static inline u_int 5273 txpkts1_len16(void) 5274 { 5275 u_int n; 5276 5277 n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); 5278 5279 return (howmany(n, 16)); 5280 } 5281 5282 static inline u_int 5283 imm_payload(u_int ndesc) 5284 { 5285 u_int n; 5286 5287 n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - 5288 sizeof(struct cpl_tx_pkt_core); 5289 5290 return (n); 5291 } 5292 5293 static inline uint64_t 5294 csum_to_ctrl(struct adapter *sc, struct mbuf *m) 5295 { 5296 uint64_t ctrl; 5297 int csum_type, l2hlen, l3hlen; 5298 int x, y; 5299 static const int csum_types[3][2] = { 5300 {TX_CSUM_TCPIP, TX_CSUM_TCPIP6}, 5301 {TX_CSUM_UDPIP, TX_CSUM_UDPIP6}, 5302 {TX_CSUM_IP, 0} 5303 }; 5304 5305 M_ASSERTPKTHDR(m); 5306 5307 if (!needs_hwcsum(m)) 5308 return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS); 5309 5310 MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN); 5311 MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip)); 5312 5313 if (needs_vxlan_csum(m)) { 5314 MPASS(m->m_pkthdr.l4hlen > 0); 5315 MPASS(m->m_pkthdr.l5hlen > 0); 5316 MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN); 5317 MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip)); 5318 5319 l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + 5320 m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen + 5321 m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN; 5322 l3hlen = m->m_pkthdr.inner_l3hlen; 5323 } else { 5324 l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN; 5325 l3hlen = m->m_pkthdr.l3hlen; 5326 } 5327 5328 ctrl = 0; 5329 if (!needs_l3_csum(m)) 5330 ctrl |= F_TXPKT_IPCSUM_DIS; 5331 5332 if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP | 5333 CSUM_IP6_TCP | CSUM_INNER_IP6_TCP)) 5334 x = 0; /* TCP */ 5335 else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP | 5336 CSUM_IP6_UDP | CSUM_INNER_IP6_UDP)) 5337 x = 1; /* UDP */ 5338 else 5339 x = 2; 5340 5341 if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP | 5342 CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP)) 5343 y = 0; /* IPv4 */ 5344 else { 5345 MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | 5346 CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP)); 5347 y = 1; /* IPv6 */ 5348 } 5349 /* 5350 * needs_hwcsum returned true earlier so there must be some kind of 5351 * checksum to calculate. 
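 * For example, CSUM_IP_TCP selects x = 0, y = 0 (TX_CSUM_TCPIP) and
 * CSUM_IP6_UDP selects x = 1, y = 1 (TX_CSUM_UDPIP6).  The zero entry at
 * [2][1] (IP-only checksum over IPv6) is unreachable, which is what the
 * MPASS below checks.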
5352 */ 5353 csum_type = csum_types[x][y]; 5354 MPASS(csum_type != 0); 5355 if (csum_type == TX_CSUM_IP) 5356 ctrl |= F_TXPKT_L4CSUM_DIS; 5357 ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen); 5358 if (chip_id(sc) <= CHELSIO_T5) 5359 ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen); 5360 else 5361 ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen); 5362 5363 return (ctrl); 5364 } 5365 5366 static inline void * 5367 write_lso_cpl(void *cpl, struct mbuf *m0) 5368 { 5369 struct cpl_tx_pkt_lso_core *lso; 5370 uint32_t ctrl; 5371 5372 KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && 5373 m0->m_pkthdr.l4hlen > 0, 5374 ("%s: mbuf %p needs TSO but missing header lengths", 5375 __func__, m0)); 5376 5377 ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | 5378 F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | 5379 V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | 5380 V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | 5381 V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); 5382 if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) 5383 ctrl |= F_LSO_IPV6; 5384 5385 lso = cpl; 5386 lso->lso_ctrl = htobe32(ctrl); 5387 lso->ipid_ofst = htobe16(0); 5388 lso->mss = htobe16(m0->m_pkthdr.tso_segsz); 5389 lso->seqno_offset = htobe32(0); 5390 lso->len = htobe32(m0->m_pkthdr.len); 5391 5392 return (lso + 1); 5393 } 5394 5395 static void * 5396 write_tnl_lso_cpl(void *cpl, struct mbuf *m0) 5397 { 5398 struct cpl_tx_tnl_lso *tnl_lso = cpl; 5399 uint32_t ctrl; 5400 5401 KASSERT(m0->m_pkthdr.inner_l2hlen > 0 && 5402 m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 && 5403 m0->m_pkthdr.inner_l5hlen > 0, 5404 ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths", 5405 __func__, m0)); 5406 KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && 5407 m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0, 5408 ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths", 5409 __func__, m0)); 5410 5411 /* Outer headers. */ 5412 ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) | 5413 F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST | 5414 V_CPL_TX_TNL_LSO_ETHHDRLENOUT( 5415 (m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | 5416 V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) | 5417 F_CPL_TX_TNL_LSO_IPLENSETOUT; 5418 if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) 5419 ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT; 5420 else { 5421 ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT | 5422 F_CPL_TX_TNL_LSO_IPIDINCOUT; 5423 } 5424 tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl); 5425 tnl_lso->IpIdOffsetOut = 0; 5426 tnl_lso->UdpLenSetOut_to_TnlHdrLen = 5427 htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT | 5428 F_CPL_TX_TNL_LSO_UDPLENSETOUT | 5429 V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen + 5430 m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen + 5431 m0->m_pkthdr.l5hlen) | 5432 V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN)); 5433 tnl_lso->r1 = 0; 5434 5435 /* Inner headers. 
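 * As with the outer headers, the lengths are encoded in 32-bit words
 * (hence the >> 2 shifts) and only the part of the L2 header beyond
 * ETHER_HDR_LEN is counted.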
*/ 5436 ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN( 5437 (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) | 5438 V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) | 5439 V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2); 5440 if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr)) 5441 ctrl |= F_CPL_TX_TNL_LSO_IPV6; 5442 tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl); 5443 tnl_lso->IpIdOffset = 0; 5444 tnl_lso->IpIdSplit_to_Mss = 5445 htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz)); 5446 tnl_lso->TCPSeqOffset = 0; 5447 tnl_lso->EthLenOffset_Size = 5448 htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len)); 5449 5450 return (tnl_lso + 1); 5451 } 5452 5453 #define VM_TX_L2HDR_LEN 16 /* ethmacdst to vlantci */ 5454 5455 /* 5456 * Write a VM txpkt WR for this packet to the hardware descriptors, update the 5457 * software descriptor, and advance the pidx. It is guaranteed that enough 5458 * descriptors are available. 5459 * 5460 * The return value is the # of hardware descriptors used. 5461 */ 5462 static u_int 5463 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0) 5464 { 5465 struct sge_eq *eq; 5466 struct fw_eth_tx_pkt_vm_wr *wr; 5467 struct tx_sdesc *txsd; 5468 struct cpl_tx_pkt_core *cpl; 5469 uint32_t ctrl; /* used in many unrelated places */ 5470 uint64_t ctrl1; 5471 int len16, ndesc, pktlen; 5472 caddr_t dst; 5473 5474 TXQ_LOCK_ASSERT_OWNED(txq); 5475 M_ASSERTPKTHDR(m0); 5476 5477 len16 = mbuf_len16(m0); 5478 pktlen = m0->m_pkthdr.len; 5479 ctrl = sizeof(struct cpl_tx_pkt_core); 5480 if (needs_tso(m0)) 5481 ctrl += sizeof(struct cpl_tx_pkt_lso_core); 5482 ndesc = tx_len16_to_desc(len16); 5483 5484 /* Firmware work request header */ 5485 eq = &txq->eq; 5486 wr = (void *)&eq->desc[eq->pidx]; 5487 wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) | 5488 V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); 5489 5490 ctrl = V_FW_WR_LEN16(len16); 5491 wr->equiq_to_len16 = htobe32(ctrl); 5492 wr->r3[0] = 0; 5493 wr->r3[1] = 0; 5494 5495 /* 5496 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci. 5497 * vlantci is ignored unless the ethtype is 0x8100, so it's 5498 * simpler to always copy it rather than making it 5499 * conditional. Also, it seems that we do not have to set 5500 * vlantci or fake the ethtype when doing VLAN tag insertion. 5501 */ 5502 m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst); 5503 5504 if (needs_tso(m0)) { 5505 cpl = write_lso_cpl(wr + 1, m0); 5506 txq->tso_wrs++; 5507 } else 5508 cpl = (void *)(wr + 1); 5509 5510 /* Checksum offload */ 5511 ctrl1 = csum_to_ctrl(sc, m0); 5512 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) 5513 txq->txcsum++; /* some hardware assistance provided */ 5514 5515 /* VLAN tag insertion */ 5516 if (needs_vlan_insertion(m0)) { 5517 ctrl1 |= F_TXPKT_VLAN_VLD | 5518 V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); 5519 txq->vlan_insertion++; 5520 } 5521 5522 /* CPL header */ 5523 cpl->ctrl0 = txq->cpl_ctrl0; 5524 cpl->pack = 0; 5525 cpl->len = htobe16(pktlen); 5526 cpl->ctrl1 = htobe64(ctrl1); 5527 5528 /* SGL */ 5529 dst = (void *)(cpl + 1); 5530 5531 /* 5532 * A packet using TSO will use up an entire descriptor for the 5533 * firmware work request header, LSO CPL, and TX_PKT_XT CPL. 5534 * If this descriptor is the last descriptor in the ring, wrap 5535 * around to the front of the ring explicitly for the start of 5536 * the sgl. 
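 * (Roughly: a 32-byte VM WR header plus two 16-byte CPLs fill the 64-byte
 * descriptor exactly; the sizes are per the firmware interface structures
 * and are given here only for illustration.)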
5537 */ 5538 if (dst == (void *)&eq->desc[eq->sidx]) { 5539 dst = (void *)&eq->desc[0]; 5540 write_gl_to_txd(txq, m0, &dst, 0); 5541 } else 5542 write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); 5543 txq->sgl_wrs++; 5544 txq->txpkt_wrs++; 5545 5546 txsd = &txq->sdesc[eq->pidx]; 5547 txsd->m = m0; 5548 txsd->desc_used = ndesc; 5549 5550 return (ndesc); 5551 } 5552 5553 /* 5554 * Write a raw WR to the hardware descriptors, update the software 5555 * descriptor, and advance the pidx. It is guaranteed that enough 5556 * descriptors are available. 5557 * 5558 * The return value is the # of hardware descriptors used. 5559 */ 5560 static u_int 5561 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available) 5562 { 5563 struct sge_eq *eq = &txq->eq; 5564 struct tx_sdesc *txsd; 5565 struct mbuf *m; 5566 caddr_t dst; 5567 int len16, ndesc; 5568 5569 len16 = mbuf_len16(m0); 5570 ndesc = tx_len16_to_desc(len16); 5571 MPASS(ndesc <= available); 5572 5573 dst = wr; 5574 for (m = m0; m != NULL; m = m->m_next) 5575 copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); 5576 5577 txq->raw_wrs++; 5578 5579 txsd = &txq->sdesc[eq->pidx]; 5580 txsd->m = m0; 5581 txsd->desc_used = ndesc; 5582 5583 return (ndesc); 5584 } 5585 5586 /* 5587 * Write a txpkt WR for this packet to the hardware descriptors, update the 5588 * software descriptor, and advance the pidx. It is guaranteed that enough 5589 * descriptors are available. 5590 * 5591 * The return value is the # of hardware descriptors used. 5592 */ 5593 static u_int 5594 write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0, 5595 u_int available) 5596 { 5597 struct sge_eq *eq; 5598 struct fw_eth_tx_pkt_wr *wr; 5599 struct tx_sdesc *txsd; 5600 struct cpl_tx_pkt_core *cpl; 5601 uint32_t ctrl; /* used in many unrelated places */ 5602 uint64_t ctrl1; 5603 int len16, ndesc, pktlen, nsegs; 5604 caddr_t dst; 5605 5606 TXQ_LOCK_ASSERT_OWNED(txq); 5607 M_ASSERTPKTHDR(m0); 5608 5609 len16 = mbuf_len16(m0); 5610 nsegs = mbuf_nsegs(m0); 5611 pktlen = m0->m_pkthdr.len; 5612 ctrl = sizeof(struct cpl_tx_pkt_core); 5613 if (needs_tso(m0)) { 5614 if (needs_vxlan_tso(m0)) 5615 ctrl += sizeof(struct cpl_tx_tnl_lso); 5616 else 5617 ctrl += sizeof(struct cpl_tx_pkt_lso_core); 5618 } else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) && 5619 available >= 2) { 5620 /* Immediate data. Recalculate len16 and set nsegs to 0. 
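 * imm_payload(2) is the room left in two hardware descriptors after the
 * WR header and CPL, so short frames are copied inline rather than being
 * sent with an SGL.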
*/ 5621 ctrl += pktlen; 5622 len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + 5623 sizeof(struct cpl_tx_pkt_core) + pktlen, 16); 5624 nsegs = 0; 5625 } 5626 ndesc = tx_len16_to_desc(len16); 5627 MPASS(ndesc <= available); 5628 5629 /* Firmware work request header */ 5630 eq = &txq->eq; 5631 wr = (void *)&eq->desc[eq->pidx]; 5632 wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | 5633 V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); 5634 5635 ctrl = V_FW_WR_LEN16(len16); 5636 wr->equiq_to_len16 = htobe32(ctrl); 5637 wr->r3 = 0; 5638 5639 if (needs_tso(m0)) { 5640 if (needs_vxlan_tso(m0)) { 5641 cpl = write_tnl_lso_cpl(wr + 1, m0); 5642 txq->vxlan_tso_wrs++; 5643 } else { 5644 cpl = write_lso_cpl(wr + 1, m0); 5645 txq->tso_wrs++; 5646 } 5647 } else 5648 cpl = (void *)(wr + 1); 5649 5650 /* Checksum offload */ 5651 ctrl1 = csum_to_ctrl(sc, m0); 5652 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { 5653 /* some hardware assistance provided */ 5654 if (needs_vxlan_csum(m0)) 5655 txq->vxlan_txcsum++; 5656 else 5657 txq->txcsum++; 5658 } 5659 5660 /* VLAN tag insertion */ 5661 if (needs_vlan_insertion(m0)) { 5662 ctrl1 |= F_TXPKT_VLAN_VLD | 5663 V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); 5664 txq->vlan_insertion++; 5665 } 5666 5667 /* CPL header */ 5668 cpl->ctrl0 = txq->cpl_ctrl0; 5669 cpl->pack = 0; 5670 cpl->len = htobe16(pktlen); 5671 cpl->ctrl1 = htobe64(ctrl1); 5672 5673 /* SGL */ 5674 dst = (void *)(cpl + 1); 5675 if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx])) 5676 dst = (caddr_t)&eq->desc[0]; 5677 if (nsegs > 0) { 5678 5679 write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); 5680 txq->sgl_wrs++; 5681 } else { 5682 struct mbuf *m; 5683 5684 for (m = m0; m != NULL; m = m->m_next) { 5685 copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); 5686 #ifdef INVARIANTS 5687 pktlen -= m->m_len; 5688 #endif 5689 } 5690 #ifdef INVARIANTS 5691 KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); 5692 #endif 5693 txq->imm_wrs++; 5694 } 5695 5696 txq->txpkt_wrs++; 5697 5698 txsd = &txq->sdesc[eq->pidx]; 5699 txsd->m = m0; 5700 txsd->desc_used = ndesc; 5701 5702 return (ndesc); 5703 } 5704 5705 static inline bool 5706 cmp_l2hdr(struct txpkts *txp, struct mbuf *m) 5707 { 5708 int len; 5709 5710 MPASS(txp->npkt > 0); 5711 MPASS(m->m_len >= VM_TX_L2HDR_LEN); 5712 5713 if (txp->ethtype == be16toh(ETHERTYPE_VLAN)) 5714 len = VM_TX_L2HDR_LEN; 5715 else 5716 len = sizeof(struct ether_header); 5717 5718 return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0); 5719 } 5720 5721 static inline void 5722 save_l2hdr(struct txpkts *txp, struct mbuf *m) 5723 { 5724 MPASS(m->m_len >= VM_TX_L2HDR_LEN); 5725 5726 memcpy(&txp->ethmacdst[0], mtod(m, const void *), VM_TX_L2HDR_LEN); 5727 } 5728 5729 static int 5730 add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, 5731 int avail, bool *send) 5732 { 5733 struct txpkts *txp = &txq->txp; 5734 5735 /* Cannot have TSO and coalesce at the same time. 
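 * A TSO packet needs its own LSO CPL, which the coalesced txpkts work
 * request formats do not carry, so such packets are sent individually.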
*/ 5736 if (cannot_use_txpkts(m)) { 5737 cannot_coalesce: 5738 *send = txp->npkt > 0; 5739 return (EINVAL); 5740 } 5741 5742 /* VF allows coalescing of type 1 (1 GL) only */ 5743 if (mbuf_nsegs(m) > 1) 5744 goto cannot_coalesce; 5745 5746 *send = false; 5747 if (txp->npkt > 0) { 5748 MPASS(tx_len16_to_desc(txp->len16) <= avail); 5749 MPASS(txp->npkt < txp->max_npkt); 5750 MPASS(txp->wr_type == 1); /* VF supports type 1 only */ 5751 5752 if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) { 5753 retry_after_send: 5754 *send = true; 5755 return (EAGAIN); 5756 } 5757 if (m->m_pkthdr.len + txp->plen > 65535) 5758 goto retry_after_send; 5759 if (cmp_l2hdr(txp, m)) 5760 goto retry_after_send; 5761 5762 txp->len16 += txpkts1_len16(); 5763 txp->plen += m->m_pkthdr.len; 5764 txp->mb[txp->npkt++] = m; 5765 if (txp->npkt == txp->max_npkt) 5766 *send = true; 5767 } else { 5768 txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) + 5769 txpkts1_len16(); 5770 if (tx_len16_to_desc(txp->len16) > avail) 5771 goto cannot_coalesce; 5772 txp->npkt = 1; 5773 txp->wr_type = 1; 5774 txp->plen = m->m_pkthdr.len; 5775 txp->mb[0] = m; 5776 save_l2hdr(txp, m); 5777 } 5778 return (0); 5779 } 5780 5781 static int 5782 add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, 5783 int avail, bool *send) 5784 { 5785 struct txpkts *txp = &txq->txp; 5786 int nsegs; 5787 5788 MPASS(!(sc->flags & IS_VF)); 5789 5790 /* Cannot have TSO and coalesce at the same time. */ 5791 if (cannot_use_txpkts(m)) { 5792 cannot_coalesce: 5793 *send = txp->npkt > 0; 5794 return (EINVAL); 5795 } 5796 5797 *send = false; 5798 nsegs = mbuf_nsegs(m); 5799 if (txp->npkt == 0) { 5800 if (m->m_pkthdr.len > 65535) 5801 goto cannot_coalesce; 5802 if (nsegs > 1) { 5803 txp->wr_type = 0; 5804 txp->len16 = 5805 howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + 5806 txpkts0_len16(nsegs); 5807 } else { 5808 txp->wr_type = 1; 5809 txp->len16 = 5810 howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + 5811 txpkts1_len16(); 5812 } 5813 if (tx_len16_to_desc(txp->len16) > avail) 5814 goto cannot_coalesce; 5815 txp->npkt = 1; 5816 txp->plen = m->m_pkthdr.len; 5817 txp->mb[0] = m; 5818 } else { 5819 MPASS(tx_len16_to_desc(txp->len16) <= avail); 5820 MPASS(txp->npkt < txp->max_npkt); 5821 5822 if (m->m_pkthdr.len + txp->plen > 65535) { 5823 retry_after_send: 5824 *send = true; 5825 return (EAGAIN); 5826 } 5827 5828 MPASS(txp->wr_type == 0 || txp->wr_type == 1); 5829 if (txp->wr_type == 0) { 5830 if (tx_len16_to_desc(txp->len16 + 5831 txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC)) 5832 goto retry_after_send; 5833 txp->len16 += txpkts0_len16(nsegs); 5834 } else { 5835 if (nsegs != 1) 5836 goto retry_after_send; 5837 if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > 5838 avail) 5839 goto retry_after_send; 5840 txp->len16 += txpkts1_len16(); 5841 } 5842 5843 txp->plen += m->m_pkthdr.len; 5844 txp->mb[txp->npkt++] = m; 5845 if (txp->npkt == txp->max_npkt) 5846 *send = true; 5847 } 5848 return (0); 5849 } 5850 5851 /* 5852 * Write a txpkts WR for the packets in txp to the hardware descriptors, update 5853 * the software descriptor, and advance the pidx. It is guaranteed that enough 5854 * descriptors are available. 5855 * 5856 * The return value is the # of hardware descriptors used. 
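 * Type 0 wraps each packet in a ulp_txpkt + ulptx_idata pair (allowing
 * multi-segment GLs); type 1 writes just a CPL and a single-segment SGL
 * per packet.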
5857 */ 5858 static u_int 5859 write_txpkts_wr(struct adapter *sc, struct sge_txq *txq) 5860 { 5861 const struct txpkts *txp = &txq->txp; 5862 struct sge_eq *eq = &txq->eq; 5863 struct fw_eth_tx_pkts_wr *wr; 5864 struct tx_sdesc *txsd; 5865 struct cpl_tx_pkt_core *cpl; 5866 uint64_t ctrl1; 5867 int ndesc, i, checkwrap; 5868 struct mbuf *m, *last; 5869 void *flitp; 5870 5871 TXQ_LOCK_ASSERT_OWNED(txq); 5872 MPASS(txp->npkt > 0); 5873 MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); 5874 5875 wr = (void *)&eq->desc[eq->pidx]; 5876 wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); 5877 wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); 5878 wr->plen = htobe16(txp->plen); 5879 wr->npkt = txp->npkt; 5880 wr->r3 = 0; 5881 wr->type = txp->wr_type; 5882 flitp = wr + 1; 5883 5884 /* 5885 * At this point we are 16B into a hardware descriptor. If checkwrap is 5886 * set then we know the WR is going to wrap around somewhere. We'll 5887 * check for that at appropriate points. 5888 */ 5889 ndesc = tx_len16_to_desc(txp->len16); 5890 last = NULL; 5891 checkwrap = eq->sidx - ndesc < eq->pidx; 5892 for (i = 0; i < txp->npkt; i++) { 5893 m = txp->mb[i]; 5894 if (txp->wr_type == 0) { 5895 struct ulp_txpkt *ulpmc; 5896 struct ulptx_idata *ulpsc; 5897 5898 /* ULP master command */ 5899 ulpmc = flitp; 5900 ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | 5901 V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); 5902 ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m))); 5903 5904 /* ULP subcommand */ 5905 ulpsc = (void *)(ulpmc + 1); 5906 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | 5907 F_ULP_TX_SC_MORE); 5908 ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); 5909 5910 cpl = (void *)(ulpsc + 1); 5911 if (checkwrap && 5912 (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) 5913 cpl = (void *)&eq->desc[0]; 5914 } else { 5915 cpl = flitp; 5916 } 5917 5918 /* Checksum offload */ 5919 ctrl1 = csum_to_ctrl(sc, m); 5920 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { 5921 /* some hardware assistance provided */ 5922 if (needs_vxlan_csum(m)) 5923 txq->vxlan_txcsum++; 5924 else 5925 txq->txcsum++; 5926 } 5927 5928 /* VLAN tag insertion */ 5929 if (needs_vlan_insertion(m)) { 5930 ctrl1 |= F_TXPKT_VLAN_VLD | 5931 V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); 5932 txq->vlan_insertion++; 5933 } 5934 5935 /* CPL header */ 5936 cpl->ctrl0 = txq->cpl_ctrl0; 5937 cpl->pack = 0; 5938 cpl->len = htobe16(m->m_pkthdr.len); 5939 cpl->ctrl1 = htobe64(ctrl1); 5940 5941 flitp = cpl + 1; 5942 if (checkwrap && 5943 (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) 5944 flitp = (void *)&eq->desc[0]; 5945 5946 write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); 5947 5948 if (last != NULL) 5949 last->m_nextpkt = m; 5950 last = m; 5951 } 5952 5953 txq->sgl_wrs++; 5954 if (txp->wr_type == 0) { 5955 txq->txpkts0_pkts += txp->npkt; 5956 txq->txpkts0_wrs++; 5957 } else { 5958 txq->txpkts1_pkts += txp->npkt; 5959 txq->txpkts1_wrs++; 5960 } 5961 5962 txsd = &txq->sdesc[eq->pidx]; 5963 txsd->m = txp->mb[0]; 5964 txsd->desc_used = ndesc; 5965 5966 return (ndesc); 5967 } 5968 5969 static u_int 5970 write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq) 5971 { 5972 const struct txpkts *txp = &txq->txp; 5973 struct sge_eq *eq = &txq->eq; 5974 struct fw_eth_tx_pkts_vm_wr *wr; 5975 struct tx_sdesc *txsd; 5976 struct cpl_tx_pkt_core *cpl; 5977 uint64_t ctrl1; 5978 int ndesc, i; 5979 struct mbuf *m, *last; 5980 void *flitp; 5981 5982 TXQ_LOCK_ASSERT_OWNED(txq); 5983 MPASS(txp->npkt > 0); 5984 MPASS(txp->wr_type == 1); /* 
VF supports type 1 only */ 5985 MPASS(txp->mb[0] != NULL); 5986 MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); 5987 5988 wr = (void *)&eq->desc[eq->pidx]; 5989 wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR)); 5990 wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); 5991 wr->r3 = 0; 5992 wr->plen = htobe16(txp->plen); 5993 wr->npkt = txp->npkt; 5994 wr->r4 = 0; 5995 memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16); 5996 flitp = wr + 1; 5997 5998 /* 5999 * At this point we are 32B into a hardware descriptor. Each mbuf in 6000 * the WR will take 32B so we check for the end of the descriptor ring 6001 * before writing odd mbufs (mb[1], 3, 5, ..) 6002 */ 6003 ndesc = tx_len16_to_desc(txp->len16); 6004 last = NULL; 6005 for (i = 0; i < txp->npkt; i++) { 6006 m = txp->mb[i]; 6007 if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) 6008 flitp = &eq->desc[0]; 6009 cpl = flitp; 6010 6011 /* Checksum offload */ 6012 ctrl1 = csum_to_ctrl(sc, m); 6013 if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) 6014 txq->txcsum++; /* some hardware assistance provided */ 6015 6016 /* VLAN tag insertion */ 6017 if (needs_vlan_insertion(m)) { 6018 ctrl1 |= F_TXPKT_VLAN_VLD | 6019 V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); 6020 txq->vlan_insertion++; 6021 } 6022 6023 /* CPL header */ 6024 cpl->ctrl0 = txq->cpl_ctrl0; 6025 cpl->pack = 0; 6026 cpl->len = htobe16(m->m_pkthdr.len); 6027 cpl->ctrl1 = htobe64(ctrl1); 6028 6029 flitp = cpl + 1; 6030 MPASS(mbuf_nsegs(m) == 1); 6031 write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0); 6032 6033 if (last != NULL) 6034 last->m_nextpkt = m; 6035 last = m; 6036 } 6037 6038 txq->sgl_wrs++; 6039 txq->txpkts1_pkts += txp->npkt; 6040 txq->txpkts1_wrs++; 6041 6042 txsd = &txq->sdesc[eq->pidx]; 6043 txsd->m = txp->mb[0]; 6044 txsd->desc_used = ndesc; 6045 6046 return (ndesc); 6047 } 6048 6049 /* 6050 * If the SGL ends on an address that is not 16 byte aligned, this function will 6051 * add a 0 filled flit at the end. 6052 */ 6053 static void 6054 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) 6055 { 6056 struct sge_eq *eq = &txq->eq; 6057 struct sglist *gl = txq->gl; 6058 struct sglist_seg *seg; 6059 __be64 *flitp, *wrap; 6060 struct ulptx_sgl *usgl; 6061 int i, nflits, nsegs; 6062 6063 KASSERT(((uintptr_t)(*to) & 0xf) == 0, 6064 ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); 6065 MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); 6066 MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); 6067 6068 get_pkt_gl(m, gl); 6069 nsegs = gl->sg_nseg; 6070 MPASS(nsegs > 0); 6071 6072 nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; 6073 flitp = (__be64 *)(*to); 6074 wrap = (__be64 *)(&eq->desc[eq->sidx]); 6075 seg = &gl->sg_segs[0]; 6076 usgl = (void *)flitp; 6077 6078 /* 6079 * We start at a 16 byte boundary somewhere inside the tx descriptor 6080 * ring, so we're at least 16 bytes away from the status page. There is 6081 * no chance of a wrap around in the middle of usgl (which is 16 bytes). 
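	 *
	 * Flit accounting example: nsegs = 3 takes 2 flits for the ulptx_sgl
	 * header (cmd_nsge/len0/addr0) plus 3 flits for the remaining
	 * len/addr pair, i.e. nflits = (3 * 2) / 2 + (2 & 1) + 2 = 5; the odd
	 * total is what triggers the zero pad flit appended further down.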
6082 */ 6083 6084 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 6085 V_ULPTX_NSGE(nsegs)); 6086 usgl->len0 = htobe32(seg->ss_len); 6087 usgl->addr0 = htobe64(seg->ss_paddr); 6088 seg++; 6089 6090 if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { 6091 6092 /* Won't wrap around at all */ 6093 6094 for (i = 0; i < nsegs - 1; i++, seg++) { 6095 usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); 6096 usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); 6097 } 6098 if (i & 1) 6099 usgl->sge[i / 2].len[1] = htobe32(0); 6100 flitp += nflits; 6101 } else { 6102 6103 /* Will wrap somewhere in the rest of the SGL */ 6104 6105 /* 2 flits already written, write the rest flit by flit */ 6106 flitp = (void *)(usgl + 1); 6107 for (i = 0; i < nflits - 2; i++) { 6108 if (flitp == wrap) 6109 flitp = (void *)eq->desc; 6110 *flitp++ = get_flit(seg, nsegs - 1, i); 6111 } 6112 } 6113 6114 if (nflits & 1) { 6115 MPASS(((uintptr_t)flitp) & 0xf); 6116 *flitp++ = 0; 6117 } 6118 6119 MPASS((((uintptr_t)flitp) & 0xf) == 0); 6120 if (__predict_false(flitp == wrap)) 6121 *to = (void *)eq->desc; 6122 else 6123 *to = (void *)flitp; 6124 } 6125 6126 static inline void 6127 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) 6128 { 6129 6130 MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); 6131 MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); 6132 6133 if (__predict_true((uintptr_t)(*to) + len <= 6134 (uintptr_t)&eq->desc[eq->sidx])) { 6135 bcopy(from, *to, len); 6136 (*to) += len; 6137 } else { 6138 int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); 6139 6140 bcopy(from, *to, portion); 6141 from += portion; 6142 portion = len - portion; /* remaining */ 6143 bcopy(from, (void *)eq->desc, portion); 6144 (*to) = (caddr_t)eq->desc + portion; 6145 } 6146 } 6147 6148 static inline void 6149 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) 6150 { 6151 u_int db; 6152 6153 MPASS(n > 0); 6154 6155 db = eq->doorbells; 6156 if (n > 1) 6157 clrbit(&db, DOORBELL_WCWR); 6158 wmb(); 6159 6160 switch (ffs(db) - 1) { 6161 case DOORBELL_UDB: 6162 *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); 6163 break; 6164 6165 case DOORBELL_WCWR: { 6166 volatile uint64_t *dst, *src; 6167 int i; 6168 6169 /* 6170 * Queues whose 128B doorbell segment fits in the page do not 6171 * use relative qid (udb_qid is always 0). Only queues with 6172 * doorbell segments can do WCWR. 
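		 * Rather than just bumping PIDX, the whole 64B descriptor is
		 * copied into the write-combined doorbell window below, which
		 * is why this path insists on a single descriptor (n == 1).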
6173 */ 6174 KASSERT(eq->udb_qid == 0 && n == 1, 6175 ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", 6176 __func__, eq->doorbells, n, eq->dbidx, eq)); 6177 6178 dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - 6179 UDBS_DB_OFFSET); 6180 i = eq->dbidx; 6181 src = (void *)&eq->desc[i]; 6182 while (src != (void *)&eq->desc[i + 1]) 6183 *dst++ = *src++; 6184 wmb(); 6185 break; 6186 } 6187 6188 case DOORBELL_UDBWC: 6189 *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); 6190 wmb(); 6191 break; 6192 6193 case DOORBELL_KDB: 6194 t4_write_reg(sc, sc->sge_kdoorbell_reg, 6195 V_QID(eq->cntxt_id) | V_PIDX(n)); 6196 break; 6197 } 6198 6199 IDXINCR(eq->dbidx, n, eq->sidx); 6200 } 6201 6202 static inline u_int 6203 reclaimable_tx_desc(struct sge_eq *eq) 6204 { 6205 uint16_t hw_cidx; 6206 6207 hw_cidx = read_hw_cidx(eq); 6208 return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); 6209 } 6210 6211 static inline u_int 6212 total_available_tx_desc(struct sge_eq *eq) 6213 { 6214 uint16_t hw_cidx, pidx; 6215 6216 hw_cidx = read_hw_cidx(eq); 6217 pidx = eq->pidx; 6218 6219 if (pidx == hw_cidx) 6220 return (eq->sidx - 1); 6221 else 6222 return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); 6223 } 6224 6225 static inline uint16_t 6226 read_hw_cidx(struct sge_eq *eq) 6227 { 6228 struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; 6229 uint16_t cidx = spg->cidx; /* stable snapshot */ 6230 6231 return (be16toh(cidx)); 6232 } 6233 6234 /* 6235 * Reclaim 'n' descriptors approximately. 6236 */ 6237 static u_int 6238 reclaim_tx_descs(struct sge_txq *txq, u_int n) 6239 { 6240 struct tx_sdesc *txsd; 6241 struct sge_eq *eq = &txq->eq; 6242 u_int can_reclaim, reclaimed; 6243 6244 TXQ_LOCK_ASSERT_OWNED(txq); 6245 MPASS(n > 0); 6246 6247 reclaimed = 0; 6248 can_reclaim = reclaimable_tx_desc(eq); 6249 while (can_reclaim && reclaimed < n) { 6250 int ndesc; 6251 struct mbuf *m, *nextpkt; 6252 6253 txsd = &txq->sdesc[eq->cidx]; 6254 ndesc = txsd->desc_used; 6255 6256 /* Firmware doesn't return "partial" credits. 
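		 * Every tx_sdesc entry was posted together with a complete
		 * work request, so the hardware cidx only ever advances in
		 * whole multiples of desc_used.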
*/ 6257 KASSERT(can_reclaim >= ndesc, 6258 ("%s: unexpected number of credits: %d, %d", 6259 __func__, can_reclaim, ndesc)); 6260 KASSERT(ndesc != 0, 6261 ("%s: descriptor with no credits: cidx %d", 6262 __func__, eq->cidx)); 6263 6264 for (m = txsd->m; m != NULL; m = nextpkt) { 6265 nextpkt = m->m_nextpkt; 6266 m->m_nextpkt = NULL; 6267 m_freem(m); 6268 } 6269 reclaimed += ndesc; 6270 can_reclaim -= ndesc; 6271 IDXINCR(eq->cidx, ndesc, eq->sidx); 6272 } 6273 6274 return (reclaimed); 6275 } 6276 6277 static void 6278 tx_reclaim(void *arg, int n) 6279 { 6280 struct sge_txq *txq = arg; 6281 struct sge_eq *eq = &txq->eq; 6282 6283 do { 6284 if (TXQ_TRYLOCK(txq) == 0) 6285 break; 6286 n = reclaim_tx_descs(txq, 32); 6287 if (eq->cidx == eq->pidx) 6288 eq->equeqidx = eq->pidx; 6289 TXQ_UNLOCK(txq); 6290 } while (n > 0); 6291 } 6292 6293 static __be64 6294 get_flit(struct sglist_seg *segs, int nsegs, int idx) 6295 { 6296 int i = (idx / 3) * 2; 6297 6298 switch (idx % 3) { 6299 case 0: { 6300 uint64_t rc; 6301 6302 rc = (uint64_t)segs[i].ss_len << 32; 6303 if (i + 1 < nsegs) 6304 rc |= (uint64_t)(segs[i + 1].ss_len); 6305 6306 return (htobe64(rc)); 6307 } 6308 case 1: 6309 return (htobe64(segs[i].ss_paddr)); 6310 case 2: 6311 return (htobe64(segs[i + 1].ss_paddr)); 6312 } 6313 6314 return (0); 6315 } 6316 6317 static int 6318 find_refill_source(struct adapter *sc, int maxp, bool packing) 6319 { 6320 int i, zidx = -1; 6321 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0]; 6322 6323 if (packing) { 6324 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { 6325 if (rxb->hwidx2 == -1) 6326 continue; 6327 if (rxb->size1 < PAGE_SIZE && 6328 rxb->size1 < largest_rx_cluster) 6329 continue; 6330 if (rxb->size1 > largest_rx_cluster) 6331 break; 6332 MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE); 6333 if (rxb->size2 >= maxp) 6334 return (i); 6335 zidx = i; 6336 } 6337 } else { 6338 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { 6339 if (rxb->hwidx1 == -1) 6340 continue; 6341 if (rxb->size1 > largest_rx_cluster) 6342 break; 6343 if (rxb->size1 >= maxp) 6344 return (i); 6345 zidx = i; 6346 } 6347 } 6348 6349 return (zidx); 6350 } 6351 6352 static void 6353 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) 6354 { 6355 mtx_lock(&sc->sfl_lock); 6356 FL_LOCK(fl); 6357 if ((fl->flags & FL_DOOMED) == 0) { 6358 fl->flags |= FL_STARVING; 6359 TAILQ_INSERT_TAIL(&sc->sfl, fl, link); 6360 callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); 6361 } 6362 FL_UNLOCK(fl); 6363 mtx_unlock(&sc->sfl_lock); 6364 } 6365 6366 static void 6367 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq) 6368 { 6369 struct sge_wrq *wrq = (void *)eq; 6370 6371 atomic_readandclear_int(&eq->equiq); 6372 taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task); 6373 } 6374 6375 static void 6376 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq) 6377 { 6378 struct sge_txq *txq = (void *)eq; 6379 6380 MPASS(eq->type == EQ_ETH); 6381 6382 atomic_readandclear_int(&eq->equiq); 6383 if (mp_ring_is_idle(txq->r)) 6384 taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); 6385 else 6386 mp_ring_check_drainage(txq->r, 64); 6387 } 6388 6389 static int 6390 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, 6391 struct mbuf *m) 6392 { 6393 const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); 6394 unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); 6395 struct adapter *sc = iq->adapter; 6396 struct sge *s = &sc->sge; 6397 struct sge_eq *eq; 6398 static void (*h[])(struct adapter *, struct sge_eq 
*) = {NULL, 6399 &handle_wrq_egr_update, &handle_eth_egr_update, 6400 &handle_wrq_egr_update}; 6401 6402 KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, 6403 rss->opcode)); 6404 6405 eq = s->eqmap[qid - s->eq_start - s->eq_base]; 6406 (*h[eq->type])(sc, eq); 6407 6408 return (0); 6409 } 6410 6411 /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */ 6412 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ 6413 offsetof(struct cpl_fw6_msg, data)); 6414 6415 static int 6416 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 6417 { 6418 struct adapter *sc = iq->adapter; 6419 const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); 6420 6421 KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, 6422 rss->opcode)); 6423 6424 if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { 6425 const struct rss_header *rss2; 6426 6427 rss2 = (const struct rss_header *)&cpl->data[0]; 6428 return (t4_cpl_handler[rss2->opcode](iq, rss2, m)); 6429 } 6430 6431 return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0])); 6432 } 6433 6434 /** 6435 * t4_handle_wrerr_rpl - process a FW work request error message 6436 * @adap: the adapter 6437 * @rpl: start of the FW message 6438 */ 6439 static int 6440 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl) 6441 { 6442 u8 opcode = *(const u8 *)rpl; 6443 const struct fw_error_cmd *e = (const void *)rpl; 6444 unsigned int i; 6445 6446 if (opcode != FW_ERROR_CMD) { 6447 log(LOG_ERR, 6448 "%s: Received WRERR_RPL message with opcode %#x\n", 6449 device_get_nameunit(adap->dev), opcode); 6450 return (EINVAL); 6451 } 6452 log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev), 6453 G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" : 6454 "non-fatal"); 6455 switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) { 6456 case FW_ERROR_TYPE_EXCEPTION: 6457 log(LOG_ERR, "exception info:\n"); 6458 for (i = 0; i < nitems(e->u.exception.info); i++) 6459 log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ", 6460 be32toh(e->u.exception.info[i])); 6461 log(LOG_ERR, "\n"); 6462 break; 6463 case FW_ERROR_TYPE_HWMODULE: 6464 log(LOG_ERR, "HW module regaddr %08x regval %08x\n", 6465 be32toh(e->u.hwmodule.regaddr), 6466 be32toh(e->u.hwmodule.regval)); 6467 break; 6468 case FW_ERROR_TYPE_WR: 6469 log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n", 6470 be16toh(e->u.wr.cidx), 6471 G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)), 6472 G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)), 6473 be32toh(e->u.wr.eqid)); 6474 for (i = 0; i < nitems(e->u.wr.wrhdr); i++) 6475 log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ", 6476 e->u.wr.wrhdr[i]); 6477 log(LOG_ERR, "\n"); 6478 break; 6479 case FW_ERROR_TYPE_ACL: 6480 log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s", 6481 be16toh(e->u.acl.cidx), 6482 G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)), 6483 G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)), 6484 be32toh(e->u.acl.eqid), 6485 G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? 
"vlanid" : 6486 "MAC"); 6487 for (i = 0; i < nitems(e->u.acl.val); i++) 6488 log(LOG_ERR, " %02x", e->u.acl.val[i]); 6489 log(LOG_ERR, "\n"); 6490 break; 6491 default: 6492 log(LOG_ERR, "type %#x\n", 6493 G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))); 6494 return (EINVAL); 6495 } 6496 return (0); 6497 } 6498 6499 static inline bool 6500 bufidx_used(struct adapter *sc, int idx) 6501 { 6502 struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0]; 6503 int i; 6504 6505 for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { 6506 if (rxb->size1 > largest_rx_cluster) 6507 continue; 6508 if (rxb->hwidx1 == idx || rxb->hwidx2 == idx) 6509 return (true); 6510 } 6511 6512 return (false); 6513 } 6514 6515 static int 6516 sysctl_bufsizes(SYSCTL_HANDLER_ARGS) 6517 { 6518 struct adapter *sc = arg1; 6519 struct sge_params *sp = &sc->params.sge; 6520 int i, rc; 6521 struct sbuf sb; 6522 char c; 6523 6524 sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND); 6525 for (i = 0; i < SGE_FLBUF_SIZES; i++) { 6526 if (bufidx_used(sc, i)) 6527 c = '*'; 6528 else 6529 c = '\0'; 6530 6531 sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c); 6532 } 6533 sbuf_trim(&sb); 6534 sbuf_finish(&sb); 6535 rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); 6536 sbuf_delete(&sb); 6537 return (rc); 6538 } 6539 6540 #ifdef RATELIMIT 6541 #if defined(INET) || defined(INET6) 6542 /* 6543 * len16 for a txpkt WR with a GL. Includes the firmware work request header. 6544 */ 6545 static inline u_int 6546 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso) 6547 { 6548 u_int n; 6549 6550 MPASS(immhdrs > 0); 6551 6552 n = roundup2(sizeof(struct fw_eth_tx_eo_wr) + 6553 sizeof(struct cpl_tx_pkt_core) + immhdrs, 16); 6554 if (__predict_false(nsegs == 0)) 6555 goto done; 6556 6557 nsegs--; /* first segment is part of ulptx_sgl */ 6558 n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); 6559 if (tso) 6560 n += sizeof(struct cpl_tx_pkt_lso_core); 6561 6562 done: 6563 return (howmany(n, 16)); 6564 } 6565 #endif 6566 6567 #define ETID_FLOWC_NPARAMS 6 6568 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \ 6569 ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16)) 6570 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16)) 6571 6572 static int 6573 send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi, 6574 struct vi_info *vi) 6575 { 6576 struct wrq_cookie cookie; 6577 u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN; 6578 struct fw_flowc_wr *flowc; 6579 6580 mtx_assert(&cst->lock, MA_OWNED); 6581 MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) == 6582 EO_FLOWC_PENDING); 6583 6584 flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLOWC_LEN16, &cookie); 6585 if (__predict_false(flowc == NULL)) 6586 return (ENOMEM); 6587 6588 bzero(flowc, ETID_FLOWC_LEN); 6589 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 6590 V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0)); 6591 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) | 6592 V_FW_WR_FLOWID(cst->etid)); 6593 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 6594 flowc->mnemval[0].val = htobe32(pfvf); 6595 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 6596 flowc->mnemval[1].val = htobe32(pi->tx_chan); 6597 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 6598 flowc->mnemval[2].val = htobe32(pi->tx_chan); 6599 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 6600 flowc->mnemval[3].val = htobe32(cst->iqid); 6601 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE; 6602 flowc->mnemval[4].val = 
htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED); 6603 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 6604 flowc->mnemval[5].val = htobe32(cst->schedcl); 6605 6606 commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie); 6607 6608 cst->flags &= ~EO_FLOWC_PENDING; 6609 cst->flags |= EO_FLOWC_RPL_PENDING; 6610 MPASS(cst->tx_credits >= ETID_FLOWC_LEN16); /* flowc is first WR. */ 6611 cst->tx_credits -= ETID_FLOWC_LEN16; 6612 6613 return (0); 6614 } 6615 6616 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16)) 6617 6618 void 6619 send_etid_flush_wr(struct cxgbe_rate_tag *cst) 6620 { 6621 struct fw_flowc_wr *flowc; 6622 struct wrq_cookie cookie; 6623 6624 mtx_assert(&cst->lock, MA_OWNED); 6625 6626 flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLUSH_LEN16, &cookie); 6627 if (__predict_false(flowc == NULL)) 6628 CXGBE_UNIMPLEMENTED(__func__); 6629 6630 bzero(flowc, ETID_FLUSH_LEN16 * 16); 6631 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 6632 V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL); 6633 flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) | 6634 V_FW_WR_FLOWID(cst->etid)); 6635 6636 commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie); 6637 6638 cst->flags |= EO_FLUSH_RPL_PENDING; 6639 MPASS(cst->tx_credits >= ETID_FLUSH_LEN16); 6640 cst->tx_credits -= ETID_FLUSH_LEN16; 6641 cst->ncompl++; 6642 } 6643 6644 static void 6645 write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr, 6646 struct mbuf *m0, int compl) 6647 { 6648 struct cpl_tx_pkt_core *cpl; 6649 uint64_t ctrl1; 6650 uint32_t ctrl; /* used in many unrelated places */ 6651 int len16, pktlen, nsegs, immhdrs; 6652 uintptr_t p; 6653 struct ulptx_sgl *usgl; 6654 struct sglist sg; 6655 struct sglist_seg segs[38]; /* XXX: find real limit. XXX: get off the stack */ 6656 6657 mtx_assert(&cst->lock, MA_OWNED); 6658 M_ASSERTPKTHDR(m0); 6659 KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && 6660 m0->m_pkthdr.l4hlen > 0, 6661 ("%s: ethofld mbuf %p is missing header lengths", __func__, m0)); 6662 6663 len16 = mbuf_eo_len16(m0); 6664 nsegs = mbuf_eo_nsegs(m0); 6665 pktlen = m0->m_pkthdr.len; 6666 ctrl = sizeof(struct cpl_tx_pkt_core); 6667 if (needs_tso(m0)) 6668 ctrl += sizeof(struct cpl_tx_pkt_lso_core); 6669 immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; 6670 ctrl += immhdrs; 6671 6672 wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) | 6673 V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl)); 6674 wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) | 6675 V_FW_WR_FLOWID(cst->etid)); 6676 wr->r3 = 0; 6677 if (needs_outer_udp_csum(m0)) { 6678 wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG; 6679 wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen; 6680 wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); 6681 wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen; 6682 wr->u.udpseg.rtplen = 0; 6683 wr->u.udpseg.r4 = 0; 6684 wr->u.udpseg.mss = htobe16(pktlen - immhdrs); 6685 wr->u.udpseg.schedpktsize = wr->u.udpseg.mss; 6686 wr->u.udpseg.plen = htobe32(pktlen - immhdrs); 6687 cpl = (void *)(wr + 1); 6688 } else { 6689 MPASS(needs_outer_tcp_csum(m0)); 6690 wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG; 6691 wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen; 6692 wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); 6693 wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen; 6694 wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0); 6695 wr->u.tcpseg.r4 = 0; 6696 wr->u.tcpseg.r5 = 0; 6697 wr->u.tcpseg.plen = htobe32(pktlen - immhdrs); 6698 6699 if (needs_tso(m0)) { 6700 struct cpl_tx_pkt_lso_core *lso = (void 
			    *)(wr + 1);

			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);

			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
			    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
			    ETHER_HDR_LEN) >> 2) |
			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
				ctrl |= F_LSO_IPV6;
			lso->lso_ctrl = htobe32(ctrl);
			lso->ipid_ofst = htobe16(0);
			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
			lso->seqno_offset = htobe32(0);
			lso->len = htobe32(pktlen);

			cpl = (void *)(lso + 1);
		} else {
			wr->u.tcpseg.mss = htobe16(0xffff);
			cpl = (void *)(wr + 1);
		}
	}

	/* Checksum offload must be requested for ethofld. */
	MPASS(needs_outer_l4_csum(m0));
	ctrl1 = csum_to_ctrl(cst->adapter, m0);

	/* VLAN tag insertion */
	if (needs_vlan_insertion(m0)) {
		ctrl1 |= F_TXPKT_VLAN_VLD |
		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
	}

	/* CPL header */
	cpl->ctrl0 = cst->ctrl0;
	cpl->pack = 0;
	cpl->len = htobe16(pktlen);
	cpl->ctrl1 = htobe64(ctrl1);

	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
	p = (uintptr_t)(cpl + 1);
	m_copydata(m0, 0, immhdrs, (void *)p);

	/* SGL */
	if (nsegs > 0) {
		int i, pad;

		/* zero-pad up to the next 16-byte boundary, if not aligned */
		p += immhdrs;
		pad = 16 - (immhdrs & 0xf);
		bzero((void *)p, pad);

		usgl = (void *)(p + pad);
		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
		    V_ULPTX_NSGE(nsegs));

		sglist_init(&sg, nitems(segs), segs);
		for (; m0 != NULL; m0 = m0->m_next) {
			if (__predict_false(m0->m_len == 0))
				continue;
			if (immhdrs >= m0->m_len) {
				immhdrs -= m0->m_len;
				continue;
			}
			if (m0->m_flags & M_EXTPG)
				sglist_append_mbuf_epg(&sg, m0,
				    mtod(m0, vm_offset_t), m0->m_len);
			else
				sglist_append(&sg, mtod(m0, char *) + immhdrs,
				    m0->m_len - immhdrs);
			immhdrs = 0;
		}
		MPASS(sg.sg_nseg == nsegs);

		/*
		 * Zero pad last 8B in case the WR doesn't end on a 16B
		 * boundary.
		 */
		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;

		usgl->len0 = htobe32(segs[0].ss_len);
		usgl->addr0 = htobe64(segs[0].ss_paddr);
		for (i = 0; i < nsegs - 1; i++) {
			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
		}
		if (i & 1)
			usgl->sge[i / 2].len[1] = htobe32(0);
	}

}

static void
ethofld_tx(struct cxgbe_rate_tag *cst)
{
	struct mbuf *m;
	struct wrq_cookie cookie;
	int next_credits, compl;
	struct fw_eth_tx_eo_wr *wr;

	mtx_assert(&cst->lock, MA_OWNED);

	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
		M_ASSERTPKTHDR(m);

		/* How many len16 credits do we need to send this mbuf? */
		next_credits = mbuf_eo_len16(m);
		MPASS(next_credits > 0);
		if (next_credits > cst->tx_credits) {
			/*
			 * Tx will make progress eventually because there is at
			 * least one outstanding fw4_ack that will return
			 * credits and kick the tx.
			 */
			MPASS(cst->ncompl > 0);
			return;
		}
		wr = start_wrq_wr(&cst->eo_txq->wrq, next_credits, &cookie);
		if (__predict_false(wr == NULL)) {
			/* XXX: wishful thinking, not a real assertion.
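			 * (start_wrq_wr most likely failed because the
			 * offload work queue is out of descriptors; an
			 * outstanding completion should eventually free up
			 * space and restart tx, as in the credit check above.)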
			 */
			MPASS(cst->ncompl > 0);
			return;
		}
		cst->tx_credits -= next_credits;
		cst->tx_nocompl += next_credits;
		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
		ETHER_BPF_MTAP(cst->com.ifp, m);
		write_ethofld_wr(cst, wr, m, compl);
		commit_wrq_wr(&cst->eo_txq->wrq, wr, &cookie);
		if (compl) {
			cst->ncompl++;
			cst->tx_nocompl = 0;
		}
		(void) mbufq_dequeue(&cst->pending_tx);

		/*
		 * Drop the mbuf's reference on the tag now rather than
		 * waiting until m_freem().  This ensures that
		 * cxgbe_rate_tag_free gets called as soon as the inp has
		 * dropped its reference on the tag and there are no more
		 * mbufs in the pending_tx queue, at which point it can flush
		 * any pending requests.  Otherwise, if the last mbuf doesn't
		 * request a completion, the etid would never be released.
		 */
		m->m_pkthdr.snd_tag = NULL;
		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
		m_snd_tag_rele(&cst->com);

		mbufq_enqueue(&cst->pending_fwack, m);
	}
}

static int
ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
{
	struct cxgbe_rate_tag *cst;
	int rc;

	MPASS(m0->m_nextpkt == NULL);
	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
	MPASS(m0->m_pkthdr.snd_tag != NULL);
	cst = mst_to_crt(m0->m_pkthdr.snd_tag);

	mtx_lock(&cst->lock);
	MPASS(cst->flags & EO_SND_TAG_REF);

	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
		struct vi_info *vi = ifp->if_softc;
		struct port_info *pi = vi->pi;
		struct adapter *sc = pi->adapter;
		const uint32_t rss_mask = vi->rss_size - 1;
		uint32_t rss_hash;

		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
		if (M_HASHTYPE_ISHASH(m0))
			rss_hash = m0->m_pkthdr.flowid;
		else
			rss_hash = arc4random();
		/* We assume RSS hashing */
		cst->iqid = vi->rss[rss_hash & rss_mask];
		cst->eo_txq += rss_hash % vi->nofldtxq;
		rc = send_etid_flowc_wr(cst, pi, vi);
		if (rc != 0)
			goto done;
	}

	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
		rc = ENOBUFS;
		goto done;
	}

	mbufq_enqueue(&cst->pending_tx, m0);
	cst->plen += m0->m_pkthdr.len;

	/*
	 * Hold an extra reference on the tag while generating work
	 * requests to ensure that we don't try to free the tag during
	 * ethofld_tx() in case we are sending the final mbuf after
	 * the inp was freed.
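	 * The extra reference taken below is dropped again right after
	 * ethofld_tx() returns, once the tag lock has been released.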
	 */
	m_snd_tag_ref(&cst->com);
	ethofld_tx(cst);
	mtx_unlock(&cst->lock);
	m_snd_tag_rele(&cst->com);
	return (0);

done:
	mtx_unlock(&cst->lock);
	return (rc);
}

static int
ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	struct mbuf *m;
	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct cxgbe_rate_tag *cst;
	uint8_t credits = cpl->credits;

	cst = lookup_etid(sc, etid);
	mtx_lock(&cst->lock);
	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
		MPASS(credits >= ETID_FLOWC_LEN16);
		credits -= ETID_FLOWC_LEN16;
		cst->flags &= ~EO_FLOWC_RPL_PENDING;
	}

	KASSERT(cst->ncompl > 0,
	    ("%s: etid %u (%p) wasn't expecting completion.",
	    __func__, etid, cst));
	cst->ncompl--;

	while (credits > 0) {
		m = mbufq_dequeue(&cst->pending_fwack);
		if (__predict_false(m == NULL)) {
			/*
			 * The remaining credits are for the final flush that
			 * was issued when the tag was freed by the kernel.
			 */
			MPASS((cst->flags &
			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
			    EO_FLUSH_RPL_PENDING);
			MPASS(credits == ETID_FLUSH_LEN16);
			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
			MPASS(cst->ncompl == 0);

			cst->flags &= ~EO_FLUSH_RPL_PENDING;
			cst->tx_credits += cpl->credits;
			cxgbe_rate_tag_free_locked(cst);
			return (0);	/* cst is gone. */
		}
		KASSERT(m != NULL,
		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
		    credits));
		KASSERT(credits >= mbuf_eo_len16(m),
		    ("%s: too few credits (%u, %u, %u)", __func__,
		    cpl->credits, credits, mbuf_eo_len16(m)));
		credits -= mbuf_eo_len16(m);
		cst->plen -= m->m_pkthdr.len;
		m_freem(m);
	}

	cst->tx_credits += cpl->credits;
	MPASS(cst->tx_credits <= cst->tx_total);

	if (cst->flags & EO_SND_TAG_REF) {
		/*
		 * As with ethofld_transmit(), hold an extra reference
		 * so that the tag is stable across ethofld_tx().
		 */
		m_snd_tag_ref(&cst->com);
		m = mbufq_first(&cst->pending_tx);
		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
			ethofld_tx(cst);
		mtx_unlock(&cst->lock);
		m_snd_tag_rele(&cst->com);
	} else {
		/*
		 * There shouldn't be any pending packets if the tag
		 * was freed by the kernel since any pending packet
		 * should hold a reference to the tag.
		 */
		MPASS(mbufq_first(&cst->pending_tx) == NULL);
		mtx_unlock(&cst->lock);
	}

	return (0);
}
#endif