/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

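/*
 * Note: the LRO parameters in this file are boot-time tunables (CTLFLAG_RDTUN).
 * Assuming SFXGE_PARAM() expands to the usual "hw.sfxge." prefix, they can be
 * set from loader.conf(5), for example:
 *
 *	hw.sfxge.lro.table_size="256"
 *	hw.sfxge.lro.chain_max="20"
 *
 * and inspected at run time with "sysctl hw.sfxge.lro".
 */
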
/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks;	/* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

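/*
 * Note that, unlike memcmp(), the value returned above is meaningful only as
 * zero (the addresses are equal) or non-zero (they differ); it does not define
 * an ordering.  The LRO lookup below relies on exactly this, e.g.
 *
 *	if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
 *	    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
 *		continue;
 */
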
#endif /* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH  64

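/*
 * Fill the RX ring towards "target" entries with freshly allocated mbuf
 * clusters.  Buffers are DMA-mapped and handed to the common code in batches
 * of SFXGE_REFILL_BATCH descriptors; if the cluster zone runs dry part-way,
 * a retry is scheduled through the refill callout.
 */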
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
						   CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_pseudo_hdr_hash_get(rxq->common,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

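/* Deliver the coalesced packet held for connection c to the stack.  The IP
 * total length (kept in host order while merging) is restored and the IPv4
 * header checksum recalculated; the TCP window, ACK and options are taken
 * from the most recently merged segment.
 */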
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

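/* Append the payload of the next in-order segment to the packet being
 * coalesced for connection c, update the (host-order) IP length and the TCP
 * PUSH flag, and deliver early if one more full-sized segment could overflow
 * the 16-bit IP length field.
 */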
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

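/* Start coalescing a new packet for connection c with mbuf as its first
 * segment.  The IP length field is converted to host byte order here so
 * that sfxge_lro_merge() can update it cheaply; sfxge_lro_deliver()
 * converts it back before handing the packet to the stack.
 */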
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *)iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *)th - eh;
	}

	hdr_length = (char *)th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *)(th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}

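/* Start tracking a new connection identified by (l2_id, conn_hash, ports),
 * reusing an entry from the free list where possible.  Tracking is refused
 * if the hash bucket already holds lro_chain_max connections.
 */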
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */

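/*
 * Process RX completions between rxq->completed and rxq->pending: drop
 * discarded and loopback self-test packets, clear checksum flags that the
 * interface capabilities do not allow, and pass each packet either into LRO
 * (for validated TCP) or straight to the stack.  At end of poll (eop) any
 * held LRO state is flushed, and the ring is topped up if it has fallen
 * below the refill threshold.
 */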
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

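/*
 * Stop a started RX queue: cancel the refill callout, request a hardware
 * flush and wait (retrying up to three times) for the flush event to be
 * delivered, then complete and free all outstanding buffers and destroy
 * the common-code queue.
 */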
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
	    evq->common, &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

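/*
 * Start the receive path: initialise the common-code RX module, work out the
 * packet buffer size and mbuf cluster zone from the MTU, prefix and alignment
 * requirements, program the RSS indirection table and Toeplitz key, start
 * every RX queue and install queue 0 as the default filter destination.
 */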
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				       sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				    EFX_RX_HASHALG_TOEPLITZ,
				    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
				    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				       toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

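/* Allocate and initialise the per-queue LRO state: a hash table of
 * lro_table_size buckets, each holding a tail queue of tracked connections,
 * plus the free and active connection lists.
 */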
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

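/* Allocate and initialise the software state for RX queue "index": the
 * descriptor ring DMA memory, buffer table entries, the software descriptor
 * array, the LRO state and the refill callout.
 */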
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
		    sfxge_rx_stats[id].name,
		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
		    sc, id, sfxge_rx_stat_handler, "IU", "");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}