1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2010-2016 Solarflare Communications Inc. 5 * All rights reserved. 6 * 7 * This software was developed in part by Philip Paeps under contract for 8 * Solarflare Communications, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright notice, 14 * this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright notice, 16 * this list of conditions and the following disclaimer in the documentation 17 * and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * The views and conclusions contained in the software and documentation are 32 * those of the authors and should not be interpreted as representing official 33 * policies, either expressed or implied, of the FreeBSD Project. 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include "opt_rss.h" 40 41 #include <sys/param.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/smp.h> 45 #include <sys/socket.h> 46 #include <sys/sysctl.h> 47 #include <sys/syslog.h> 48 #include <sys/limits.h> 49 #include <sys/syslog.h> 50 51 #include <net/ethernet.h> 52 #include <net/if.h> 53 #include <net/if_vlan_var.h> 54 55 #include <netinet/in.h> 56 #include <netinet/ip.h> 57 #include <netinet/ip6.h> 58 #include <netinet/tcp.h> 59 60 #include <machine/in_cksum.h> 61 62 #ifdef RSS 63 #include <net/rss_config.h> 64 #endif 65 66 #include "common/efx.h" 67 68 69 #include "sfxge.h" 70 #include "sfxge_rx.h" 71 72 #define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10) 73 74 #ifdef SFXGE_LRO 75 76 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 77 "Large receive offload (LRO) parameters"); 78 79 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param) 80 81 /* Size of the LRO hash table. Must be a power of 2. A larger table 82 * means we can accelerate a larger number of streams. 83 */ 84 static unsigned lro_table_size = 128; 85 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size); 86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN, 87 &lro_table_size, 0, 88 "Size of the LRO hash table (must be a power of 2)"); 89 90 /* Maximum length of a hash chain. If chains get too long then the lookup 91 * time increases and may exceed the benefit of LRO. 92 */ 93 static unsigned lro_chain_max = 20; 94 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max); 95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN, 96 &lro_chain_max, 0, 97 "The maximum length of a hash chain"); 98 99 /* Maximum time (in ticks) that a connection can be idle before it's LRO 100 * state is discarded. 101 */ 102 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */ 103 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks); 104 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN, 105 &lro_idle_ticks, 0, 106 "The maximum time (in ticks) that a connection can be idle " 107 "before it's LRO state is discarded"); 108 109 /* Number of packets with payload that must arrive in-order before a 110 * connection is eligible for LRO. The idea is we should avoid coalescing 111 * segments when the sender is in slow-start because reducing the ACK rate 112 * can damage performance. 113 */ 114 static int lro_slow_start_packets = 2000; 115 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets); 116 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN, 117 &lro_slow_start_packets, 0, 118 "Number of packets with payload that must arrive in-order before " 119 "a connection is eligible for LRO"); 120 121 /* Number of packets with payload that must arrive in-order following loss 122 * before a connection is eligible for LRO. The idea is we should avoid 123 * coalescing segments when the sender is recovering from loss, because 124 * reducing the ACK rate can damage performance. 125 */ 126 static int lro_loss_packets = 20; 127 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets); 128 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN, 129 &lro_loss_packets, 0, 130 "Number of packets with payload that must arrive in-order " 131 "following loss before a connection is eligible for LRO"); 132 133 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */ 134 #define SFXGE_LRO_L2_ID_VLAN 0x4000 135 #define SFXGE_LRO_L2_ID_IPV6 0x8000 136 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN) 137 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6)) 138 139 /* Compare IPv6 addresses, avoiding conditional branches */ 140 static unsigned long ipv6_addr_cmp(const struct in6_addr *left, 141 const struct in6_addr *right) 142 { 143 #if LONG_BIT == 64 144 const uint64_t *left64 = (const uint64_t *)left; 145 const uint64_t *right64 = (const uint64_t *)right; 146 return (left64[0] - right64[0]) | (left64[1] - right64[1]); 147 #else 148 return (left->s6_addr32[0] - right->s6_addr32[0]) | 149 (left->s6_addr32[1] - right->s6_addr32[1]) | 150 (left->s6_addr32[2] - right->s6_addr32[2]) | 151 (left->s6_addr32[3] - right->s6_addr32[3]); 152 #endif 153 } 154 155 #endif /* SFXGE_LRO */ 156 157 void 158 sfxge_rx_qflush_done(struct sfxge_rxq *rxq) 159 { 160 161 rxq->flush_state = SFXGE_FLUSH_DONE; 162 } 163 164 void 165 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq) 166 { 167 168 rxq->flush_state = SFXGE_FLUSH_FAILED; 169 } 170 171 #ifdef RSS 172 static uint8_t toep_key[RSS_KEYSIZE]; 173 #else 174 static uint8_t toep_key[] = { 175 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 176 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 177 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 178 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 179 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 180 }; 181 #endif 182 183 static void 184 sfxge_rx_post_refill(void *arg) 185 { 186 struct sfxge_rxq *rxq = arg; 187 struct sfxge_softc *sc; 188 unsigned int index; 189 struct sfxge_evq *evq; 190 uint16_t magic; 191 192 sc = rxq->sc; 193 index = rxq->index; 194 evq = sc->evq[index]; 195 magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq); 196 197 /* This is guaranteed due to the start/stop order of rx and ev */ 198 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 199 ("evq not started")); 200 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 201 ("rxq not started")); 202 efx_ev_qpost(evq->common, magic); 203 } 204 205 static void 206 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying) 207 { 208 /* Initially retry after 100 ms, but back off in case of 209 * repeated failures as we probably have to wait for the 210 * administrator to raise the pool limit. */ 211 if (retrying) 212 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz); 213 else 214 rxq->refill_delay = hz / 10; 215 216 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay, 217 sfxge_rx_post_refill, rxq); 218 } 219 220 #define SFXGE_REFILL_BATCH 64 221 222 static void 223 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying) 224 { 225 struct sfxge_softc *sc; 226 unsigned int index; 227 struct sfxge_evq *evq; 228 unsigned int batch; 229 unsigned int rxfill; 230 unsigned int mblksize; 231 int ntodo; 232 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH]; 233 234 sc = rxq->sc; 235 index = rxq->index; 236 evq = sc->evq[index]; 237 238 prefetch_read_many(sc->enp); 239 prefetch_read_many(rxq->common); 240 241 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 242 243 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 244 return; 245 246 rxfill = rxq->added - rxq->completed; 247 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries), 248 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)")); 249 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target); 250 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries), 251 ("ntodo > EFX_RQX_LIMIT(rxq->entries)")); 252 253 if (ntodo == 0) 254 return; 255 256 batch = 0; 257 mblksize = sc->rx_buffer_size - sc->rx_buffer_align; 258 while (ntodo-- > 0) { 259 unsigned int id; 260 struct sfxge_rx_sw_desc *rx_desc; 261 bus_dma_segment_t seg; 262 struct mbuf *m; 263 264 id = (rxq->added + batch) & rxq->ptr_mask; 265 rx_desc = &rxq->queue[id]; 266 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL")); 267 268 rx_desc->flags = EFX_DISCARD; 269 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, 270 sc->rx_cluster_size); 271 if (m == NULL) 272 break; 273 274 /* m_len specifies length of area to be mapped for DMA */ 275 m->m_len = mblksize; 276 m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data, 277 CACHE_LINE_SIZE); 278 m->m_data += sc->rx_buffer_align; 279 280 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg); 281 addr[batch++] = seg.ds_addr; 282 283 if (batch == SFXGE_REFILL_BATCH) { 284 efx_rx_qpost(rxq->common, addr, mblksize, batch, 285 rxq->completed, rxq->added); 286 rxq->added += batch; 287 batch = 0; 288 } 289 } 290 291 if (ntodo != 0) 292 sfxge_rx_schedule_refill(rxq, retrying); 293 294 if (batch != 0) { 295 efx_rx_qpost(rxq->common, addr, mblksize, batch, 296 rxq->completed, rxq->added); 297 rxq->added += batch; 298 } 299 300 /* Make the descriptors visible to the hardware */ 301 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map, 302 BUS_DMASYNC_PREWRITE); 303 304 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed); 305 306 /* The queue could still be empty if no descriptors were actually 307 * pushed, in which case there will be no event to cause the next 308 * refill, so we must schedule a refill ourselves. 309 */ 310 if(rxq->pushed == rxq->completed) { 311 sfxge_rx_schedule_refill(rxq, retrying); 312 } 313 } 314 315 void 316 sfxge_rx_qrefill(struct sfxge_rxq *rxq) 317 { 318 319 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 320 return; 321 322 /* Make sure the queue is full */ 323 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE); 324 } 325 326 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m) 327 { 328 struct ifnet *ifp = sc->ifnet; 329 330 m->m_pkthdr.rcvif = ifp; 331 m->m_pkthdr.csum_data = 0xffff; 332 ifp->if_input(ifp, m); 333 } 334 335 static void 336 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc) 337 { 338 struct sfxge_softc *sc = rxq->sc; 339 struct mbuf *m = rx_desc->mbuf; 340 int flags = rx_desc->flags; 341 int csum_flags; 342 343 /* Convert checksum flags */ 344 csum_flags = (flags & EFX_CKSUM_IPV4) ? 345 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0; 346 if (flags & EFX_CKSUM_TCPUDP) 347 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 348 349 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 350 m->m_pkthdr.flowid = 351 efx_pseudo_hdr_hash_get(rxq->common, 352 EFX_RX_HASHALG_TOEPLITZ, 353 mtod(m, uint8_t *)); 354 /* The hash covers a 4-tuple for TCP only */ 355 M_HASHTYPE_SET(m, 356 (flags & EFX_PKT_IPV4) ? 357 ((flags & EFX_PKT_TCP) ? 358 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) : 359 ((flags & EFX_PKT_TCP) ? 360 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6)); 361 } 362 m->m_data += sc->rx_prefix_size; 363 m->m_len = rx_desc->size - sc->rx_prefix_size; 364 m->m_pkthdr.len = m->m_len; 365 m->m_pkthdr.csum_flags = csum_flags; 366 __sfxge_rx_deliver(sc, rx_desc->mbuf); 367 368 rx_desc->flags = EFX_DISCARD; 369 rx_desc->mbuf = NULL; 370 } 371 372 #ifdef SFXGE_LRO 373 374 static void 375 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c) 376 { 377 struct sfxge_softc *sc = st->sc; 378 struct mbuf *m = c->mbuf; 379 struct tcphdr *c_th; 380 int csum_flags; 381 382 KASSERT(m, ("no mbuf to deliver")); 383 384 ++st->n_bursts; 385 386 /* Finish off packet munging and recalculate IP header checksum. */ 387 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 388 struct ip *iph = c->nh; 389 iph->ip_len = htons(iph->ip_len); 390 iph->ip_sum = 0; 391 iph->ip_sum = in_cksum_hdr(iph); 392 c_th = (struct tcphdr *)(iph + 1); 393 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | 394 CSUM_IP_CHECKED | CSUM_IP_VALID); 395 } else { 396 struct ip6_hdr *iph = c->nh; 397 iph->ip6_plen = htons(iph->ip6_plen); 398 c_th = (struct tcphdr *)(iph + 1); 399 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 400 } 401 402 c_th->th_win = c->th_last->th_win; 403 c_th->th_ack = c->th_last->th_ack; 404 if (c_th->th_off == c->th_last->th_off) { 405 /* Copy TCP options (take care to avoid going negative). */ 406 int optlen = ((c_th->th_off - 5) & 0xf) << 2u; 407 memcpy(c_th + 1, c->th_last + 1, optlen); 408 } 409 410 m->m_pkthdr.flowid = c->conn_hash; 411 M_HASHTYPE_SET(m, 412 SFXGE_LRO_CONN_IS_TCPIPV4(c) ? 413 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6); 414 415 m->m_pkthdr.csum_flags = csum_flags; 416 __sfxge_rx_deliver(sc, m); 417 418 c->mbuf = NULL; 419 c->delivered = 1; 420 } 421 422 /* Drop the given connection, and add it to the free list. */ 423 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 424 { 425 unsigned bucket; 426 427 KASSERT(!c->mbuf, ("found orphaned mbuf")); 428 429 if (c->next_buf.mbuf != NULL) { 430 sfxge_rx_deliver(rxq, &c->next_buf); 431 LIST_REMOVE(c, active_link); 432 } 433 434 bucket = c->conn_hash & rxq->lro.conns_mask; 435 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong")); 436 --rxq->lro.conns_n[bucket]; 437 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 438 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link); 439 } 440 441 /* Stop tracking connections that have gone idle in order to keep hash 442 * chains short. 443 */ 444 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now) 445 { 446 struct sfxge_lro_conn *c; 447 unsigned i; 448 449 KASSERT(LIST_EMPTY(&rxq->lro.active_conns), 450 ("found active connections")); 451 452 rxq->lro.last_purge_ticks = now; 453 for (i = 0; i <= rxq->lro.conns_mask; ++i) { 454 if (TAILQ_EMPTY(&rxq->lro.conns[i])) 455 continue; 456 457 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq); 458 if (now - c->last_pkt_ticks > lro_idle_ticks) { 459 ++rxq->lro.n_drop_idle; 460 sfxge_lro_drop(rxq, c); 461 } 462 } 463 } 464 465 static void 466 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 467 struct mbuf *mbuf, struct tcphdr *th) 468 { 469 struct tcphdr *c_th; 470 471 /* Tack the new mbuf onto the chain. */ 472 KASSERT(!mbuf->m_next, ("mbuf already chained")); 473 c->mbuf_tail->m_next = mbuf; 474 c->mbuf_tail = mbuf; 475 476 /* Increase length appropriately */ 477 c->mbuf->m_pkthdr.len += mbuf->m_len; 478 479 /* Update the connection state flags */ 480 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 481 struct ip *iph = c->nh; 482 iph->ip_len += mbuf->m_len; 483 c_th = (struct tcphdr *)(iph + 1); 484 } else { 485 struct ip6_hdr *iph = c->nh; 486 iph->ip6_plen += mbuf->m_len; 487 c_th = (struct tcphdr *)(iph + 1); 488 } 489 c_th->th_flags |= (th->th_flags & TH_PUSH); 490 c->th_last = th; 491 ++st->n_merges; 492 493 /* Pass packet up now if another segment could overflow the IP 494 * length. 495 */ 496 if (c->mbuf->m_pkthdr.len > 65536 - 9200) 497 sfxge_lro_deliver(st, c); 498 } 499 500 static void 501 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 502 struct mbuf *mbuf, void *nh, struct tcphdr *th) 503 { 504 /* Start the chain */ 505 c->mbuf = mbuf; 506 c->mbuf_tail = c->mbuf; 507 c->nh = nh; 508 c->th_last = th; 509 510 mbuf->m_pkthdr.len = mbuf->m_len; 511 512 /* Mangle header fields for later processing */ 513 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 514 struct ip *iph = nh; 515 iph->ip_len = ntohs(iph->ip_len); 516 } else { 517 struct ip6_hdr *iph = nh; 518 iph->ip6_plen = ntohs(iph->ip6_plen); 519 } 520 } 521 522 /* Try to merge or otherwise hold or deliver (as appropriate) the 523 * packet buffered for this connection (c->next_buf). Return a flag 524 * indicating whether the connection is still active for LRO purposes. 525 */ 526 static int 527 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 528 { 529 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf; 530 char *eh = c->next_eh; 531 int data_length, hdr_length, dont_merge; 532 unsigned th_seq, pkt_length; 533 struct tcphdr *th; 534 unsigned now; 535 536 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 537 struct ip *iph = c->next_nh; 538 th = (struct tcphdr *)(iph + 1); 539 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh; 540 } else { 541 struct ip6_hdr *iph = c->next_nh; 542 th = (struct tcphdr *)(iph + 1); 543 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh; 544 } 545 546 hdr_length = (char *) th + th->th_off * 4 - eh; 547 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) - 548 hdr_length); 549 th_seq = ntohl(th->th_seq); 550 dont_merge = ((data_length <= 0) 551 | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN))); 552 553 /* Check for options other than aligned timestamp. */ 554 if (th->th_off != 5) { 555 const uint32_t *opt_ptr = (const uint32_t *) (th + 1); 556 if (th->th_off == 8 && 557 opt_ptr[0] == ntohl((TCPOPT_NOP << 24) | 558 (TCPOPT_NOP << 16) | 559 (TCPOPT_TIMESTAMP << 8) | 560 TCPOLEN_TIMESTAMP)) { 561 /* timestamp option -- okay */ 562 } else { 563 dont_merge = 1; 564 } 565 } 566 567 if (__predict_false(th_seq != c->next_seq)) { 568 /* Out-of-order, so start counting again. */ 569 if (c->mbuf != NULL) 570 sfxge_lro_deliver(&rxq->lro, c); 571 c->n_in_order_pkts -= lro_loss_packets; 572 c->next_seq = th_seq + data_length; 573 ++rxq->lro.n_misorder; 574 goto deliver_buf_out; 575 } 576 c->next_seq = th_seq + data_length; 577 578 now = ticks; 579 if (now - c->last_pkt_ticks > lro_idle_ticks) { 580 ++rxq->lro.n_drop_idle; 581 if (c->mbuf != NULL) 582 sfxge_lro_deliver(&rxq->lro, c); 583 sfxge_lro_drop(rxq, c); 584 return (0); 585 } 586 c->last_pkt_ticks = ticks; 587 588 if (c->n_in_order_pkts < lro_slow_start_packets) { 589 /* May be in slow-start, so don't merge. */ 590 ++rxq->lro.n_slow_start; 591 ++c->n_in_order_pkts; 592 goto deliver_buf_out; 593 } 594 595 if (__predict_false(dont_merge)) { 596 if (c->mbuf != NULL) 597 sfxge_lro_deliver(&rxq->lro, c); 598 if (th->th_flags & (TH_FIN | TH_RST)) { 599 ++rxq->lro.n_drop_closed; 600 sfxge_lro_drop(rxq, c); 601 return (0); 602 } 603 goto deliver_buf_out; 604 } 605 606 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size; 607 608 if (__predict_true(c->mbuf != NULL)) { 609 /* Remove headers and any padding */ 610 rx_buf->mbuf->m_data += hdr_length; 611 rx_buf->mbuf->m_len = data_length; 612 613 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th); 614 } else { 615 /* Remove any padding */ 616 rx_buf->mbuf->m_len = pkt_length; 617 618 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th); 619 } 620 621 rx_buf->mbuf = NULL; 622 return (1); 623 624 deliver_buf_out: 625 sfxge_rx_deliver(rxq, rx_buf); 626 return (1); 627 } 628 629 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash, 630 uint16_t l2_id, void *nh, struct tcphdr *th) 631 { 632 unsigned bucket = conn_hash & st->conns_mask; 633 struct sfxge_lro_conn *c; 634 635 if (st->conns_n[bucket] >= lro_chain_max) { 636 ++st->n_too_many; 637 return; 638 } 639 640 if (!TAILQ_EMPTY(&st->free_conns)) { 641 c = TAILQ_FIRST(&st->free_conns); 642 TAILQ_REMOVE(&st->free_conns, c, link); 643 } else { 644 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT); 645 if (c == NULL) 646 return; 647 c->mbuf = NULL; 648 c->next_buf.mbuf = NULL; 649 } 650 651 /* Create the connection tracking data */ 652 ++st->conns_n[bucket]; 653 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link); 654 c->l2_id = l2_id; 655 c->conn_hash = conn_hash; 656 c->source = th->th_sport; 657 c->dest = th->th_dport; 658 c->n_in_order_pkts = 0; 659 c->last_pkt_ticks = *(volatile int *)&ticks; 660 c->delivered = 0; 661 ++st->n_new_stream; 662 /* NB. We don't initialise c->next_seq, and it doesn't matter what 663 * value it has. Most likely the next packet received for this 664 * connection will not match -- no harm done. 665 */ 666 } 667 668 /* Process mbuf and decide whether to dispatch it to the stack now or 669 * later. 670 */ 671 static void 672 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 673 { 674 struct sfxge_softc *sc = rxq->sc; 675 struct mbuf *m = rx_buf->mbuf; 676 struct ether_header *eh; 677 struct sfxge_lro_conn *c; 678 uint16_t l2_id; 679 uint16_t l3_proto; 680 void *nh; 681 struct tcphdr *th; 682 uint32_t conn_hash; 683 unsigned bucket; 684 685 /* Get the hardware hash */ 686 conn_hash = efx_pseudo_hdr_hash_get(rxq->common, 687 EFX_RX_HASHALG_TOEPLITZ, 688 mtod(m, uint8_t *)); 689 690 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size); 691 if (eh->ether_type == htons(ETHERTYPE_VLAN)) { 692 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh; 693 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) | 694 SFXGE_LRO_L2_ID_VLAN; 695 l3_proto = veh->evl_proto; 696 nh = veh + 1; 697 } else { 698 l2_id = 0; 699 l3_proto = eh->ether_type; 700 nh = eh + 1; 701 } 702 703 /* Check whether this is a suitable packet (unfragmented 704 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and 705 * length, and compute a hash if necessary. If not, return. 706 */ 707 if (l3_proto == htons(ETHERTYPE_IP)) { 708 struct ip *iph = nh; 709 710 KASSERT(iph->ip_p == IPPROTO_TCP, 711 ("IPv4 protocol is not TCP, but packet marker is set")); 712 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) | 713 (iph->ip_off & htons(IP_MF | IP_OFFMASK))) 714 goto deliver_now; 715 th = (struct tcphdr *)(iph + 1); 716 } else if (l3_proto == htons(ETHERTYPE_IPV6)) { 717 struct ip6_hdr *iph = nh; 718 719 KASSERT(iph->ip6_nxt == IPPROTO_TCP, 720 ("IPv6 next header is not TCP, but packet marker is set")); 721 l2_id |= SFXGE_LRO_L2_ID_IPV6; 722 th = (struct tcphdr *)(iph + 1); 723 } else { 724 goto deliver_now; 725 } 726 727 bucket = conn_hash & rxq->lro.conns_mask; 728 729 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) { 730 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash)) 731 continue; 732 if ((c->source - th->th_sport) | (c->dest - th->th_dport)) 733 continue; 734 if (c->mbuf != NULL) { 735 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 736 struct ip *c_iph, *iph = nh; 737 c_iph = c->nh; 738 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) | 739 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr)) 740 continue; 741 } else { 742 struct ip6_hdr *c_iph, *iph = nh; 743 c_iph = c->nh; 744 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) | 745 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst)) 746 continue; 747 } 748 } 749 750 /* Re-insert at head of list to reduce lookup time. */ 751 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 752 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link); 753 754 if (c->next_buf.mbuf != NULL) { 755 if (!sfxge_lro_try_merge(rxq, c)) 756 goto deliver_now; 757 } else { 758 LIST_INSERT_HEAD(&rxq->lro.active_conns, c, 759 active_link); 760 } 761 c->next_buf = *rx_buf; 762 c->next_eh = eh; 763 c->next_nh = nh; 764 765 rx_buf->mbuf = NULL; 766 rx_buf->flags = EFX_DISCARD; 767 return; 768 } 769 770 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th); 771 deliver_now: 772 sfxge_rx_deliver(rxq, rx_buf); 773 } 774 775 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 776 { 777 struct sfxge_lro_state *st = &rxq->lro; 778 struct sfxge_lro_conn *c; 779 unsigned t; 780 781 while (!LIST_EMPTY(&st->active_conns)) { 782 c = LIST_FIRST(&st->active_conns); 783 if (!c->delivered && c->mbuf != NULL) 784 sfxge_lro_deliver(st, c); 785 if (sfxge_lro_try_merge(rxq, c)) { 786 if (c->mbuf != NULL) 787 sfxge_lro_deliver(st, c); 788 LIST_REMOVE(c, active_link); 789 } 790 c->delivered = 0; 791 } 792 793 t = *(volatile int *)&ticks; 794 if (__predict_false(t != st->last_purge_ticks)) 795 sfxge_lro_purge_idle(rxq, t); 796 } 797 798 #else /* !SFXGE_LRO */ 799 800 static void 801 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 802 { 803 } 804 805 static void 806 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 807 { 808 } 809 810 #endif /* SFXGE_LRO */ 811 812 void 813 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop) 814 { 815 struct sfxge_softc *sc = rxq->sc; 816 int if_capenable = sc->ifnet->if_capenable; 817 int lro_enabled = if_capenable & IFCAP_LRO; 818 unsigned int index; 819 struct sfxge_evq *evq; 820 unsigned int completed; 821 unsigned int level; 822 struct mbuf *m; 823 struct sfxge_rx_sw_desc *prev = NULL; 824 825 index = rxq->index; 826 evq = sc->evq[index]; 827 828 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 829 830 completed = rxq->completed; 831 while (completed != rxq->pending) { 832 unsigned int id; 833 struct sfxge_rx_sw_desc *rx_desc; 834 835 id = completed++ & rxq->ptr_mask; 836 rx_desc = &rxq->queue[id]; 837 m = rx_desc->mbuf; 838 839 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 840 goto discard; 841 842 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD)) 843 goto discard; 844 845 /* Read the length from the pseudo header if required */ 846 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) { 847 uint16_t tmp_size; 848 int rc; 849 rc = efx_pseudo_hdr_pkt_length_get(rxq->common, 850 mtod(m, uint8_t *), 851 &tmp_size); 852 KASSERT(rc == 0, ("cannot get packet length: %d", rc)); 853 rx_desc->size = (int)tmp_size + sc->rx_prefix_size; 854 } 855 856 prefetch_read_many(mtod(m, caddr_t)); 857 858 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 859 case EFX_PKT_IPV4: 860 if (~if_capenable & IFCAP_RXCSUM) 861 rx_desc->flags &= 862 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP); 863 break; 864 case EFX_PKT_IPV6: 865 if (~if_capenable & IFCAP_RXCSUM_IPV6) 866 rx_desc->flags &= ~EFX_CKSUM_TCPUDP; 867 break; 868 case 0: 869 /* Check for loopback packets */ 870 { 871 struct ether_header *etherhp; 872 873 /*LINTED*/ 874 etherhp = mtod(m, struct ether_header *); 875 876 if (etherhp->ether_type == 877 htons(SFXGE_ETHERTYPE_LOOPBACK)) { 878 EFSYS_PROBE(loopback); 879 880 rxq->loopback++; 881 goto discard; 882 } 883 } 884 break; 885 default: 886 KASSERT(B_FALSE, 887 ("Rx descriptor with both IPv4 and IPv6 flags")); 888 goto discard; 889 } 890 891 /* Pass packet up the stack or into LRO (pipelined) */ 892 if (prev != NULL) { 893 if (lro_enabled && 894 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 895 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 896 sfxge_lro(rxq, prev); 897 else 898 sfxge_rx_deliver(rxq, prev); 899 } 900 prev = rx_desc; 901 continue; 902 903 discard: 904 /* Return the packet to the pool */ 905 m_free(m); 906 rx_desc->mbuf = NULL; 907 } 908 rxq->completed = completed; 909 910 level = rxq->added - rxq->completed; 911 912 /* Pass last packet up the stack or into LRO */ 913 if (prev != NULL) { 914 if (lro_enabled && 915 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 916 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 917 sfxge_lro(rxq, prev); 918 else 919 sfxge_rx_deliver(rxq, prev); 920 } 921 922 /* 923 * If there are any pending flows and this is the end of the 924 * poll then they must be completed. 925 */ 926 if (eop) 927 sfxge_lro_end_of_burst(rxq); 928 929 /* Top up the queue if necessary */ 930 if (level < rxq->refill_threshold) 931 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE); 932 } 933 934 static void 935 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index) 936 { 937 struct sfxge_rxq *rxq; 938 struct sfxge_evq *evq; 939 unsigned int count; 940 unsigned int retry = 3; 941 942 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 943 944 rxq = sc->rxq[index]; 945 evq = sc->evq[index]; 946 947 SFXGE_EVQ_LOCK(evq); 948 949 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 950 ("rxq not started")); 951 952 rxq->init_state = SFXGE_RXQ_INITIALIZED; 953 954 callout_stop(&rxq->refill_callout); 955 956 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) { 957 rxq->flush_state = SFXGE_FLUSH_PENDING; 958 959 SFXGE_EVQ_UNLOCK(evq); 960 961 /* Flush the receive queue */ 962 if (efx_rx_qflush(rxq->common) != 0) { 963 SFXGE_EVQ_LOCK(evq); 964 rxq->flush_state = SFXGE_FLUSH_FAILED; 965 break; 966 } 967 968 count = 0; 969 do { 970 /* Spin for 100 ms */ 971 DELAY(100000); 972 973 if (rxq->flush_state != SFXGE_FLUSH_PENDING) 974 break; 975 976 } while (++count < 20); 977 978 SFXGE_EVQ_LOCK(evq); 979 980 if (rxq->flush_state == SFXGE_FLUSH_PENDING) { 981 /* Flush timeout - neither done nor failed */ 982 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n", 983 device_get_nameunit(sc->dev), index); 984 rxq->flush_state = SFXGE_FLUSH_DONE; 985 } 986 retry--; 987 } 988 if (rxq->flush_state == SFXGE_FLUSH_FAILED) { 989 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n", 990 device_get_nameunit(sc->dev), index); 991 rxq->flush_state = SFXGE_FLUSH_DONE; 992 } 993 994 rxq->pending = rxq->added; 995 sfxge_rx_qcomplete(rxq, B_TRUE); 996 997 KASSERT(rxq->completed == rxq->pending, 998 ("rxq->completed != rxq->pending")); 999 1000 rxq->added = 0; 1001 rxq->pushed = 0; 1002 rxq->pending = 0; 1003 rxq->completed = 0; 1004 rxq->loopback = 0; 1005 1006 /* Destroy the common code receive queue. */ 1007 efx_rx_qdestroy(rxq->common); 1008 1009 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1010 EFX_RXQ_NBUFS(sc->rxq_entries)); 1011 1012 SFXGE_EVQ_UNLOCK(evq); 1013 } 1014 1015 static int 1016 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index) 1017 { 1018 struct sfxge_rxq *rxq; 1019 efsys_mem_t *esmp; 1020 struct sfxge_evq *evq; 1021 int rc; 1022 1023 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 1024 1025 rxq = sc->rxq[index]; 1026 esmp = &rxq->mem; 1027 evq = sc->evq[index]; 1028 1029 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1030 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1031 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 1032 ("evq->init_state != SFXGE_EVQ_STARTED")); 1033 1034 /* Program the buffer table. */ 1035 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp, 1036 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0) 1037 return (rc); 1038 1039 /* Create the common code receive queue. */ 1040 if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT, 1041 esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE, 1042 evq->common, &rxq->common)) != 0) 1043 goto fail; 1044 1045 SFXGE_EVQ_LOCK(evq); 1046 1047 /* Enable the receive queue. */ 1048 efx_rx_qenable(rxq->common); 1049 1050 rxq->init_state = SFXGE_RXQ_STARTED; 1051 rxq->flush_state = SFXGE_FLUSH_REQUIRED; 1052 1053 /* Try to fill the queue from the pool. */ 1054 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE); 1055 1056 SFXGE_EVQ_UNLOCK(evq); 1057 1058 return (0); 1059 1060 fail: 1061 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1062 EFX_RXQ_NBUFS(sc->rxq_entries)); 1063 return (rc); 1064 } 1065 1066 void 1067 sfxge_rx_stop(struct sfxge_softc *sc) 1068 { 1069 int index; 1070 1071 efx_mac_filter_default_rxq_clear(sc->enp); 1072 1073 /* Stop the receive queue(s) */ 1074 index = sc->rxq_count; 1075 while (--index >= 0) 1076 sfxge_rx_qstop(sc, index); 1077 1078 sc->rx_prefix_size = 0; 1079 sc->rx_buffer_size = 0; 1080 1081 efx_rx_fini(sc->enp); 1082 } 1083 1084 int 1085 sfxge_rx_start(struct sfxge_softc *sc) 1086 { 1087 struct sfxge_intr *intr; 1088 const efx_nic_cfg_t *encp; 1089 size_t hdrlen, align, reserved; 1090 int index; 1091 int rc; 1092 1093 intr = &sc->intr; 1094 1095 /* Initialize the common code receive module. */ 1096 if ((rc = efx_rx_init(sc->enp)) != 0) 1097 return (rc); 1098 1099 encp = efx_nic_cfg_get(sc->enp); 1100 sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu); 1101 1102 /* Calculate the receive packet buffer size. */ 1103 sc->rx_prefix_size = encp->enc_rx_prefix_size; 1104 1105 /* Ensure IP headers are 32bit aligned */ 1106 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header); 1107 sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen; 1108 1109 sc->rx_buffer_size += sc->rx_buffer_align; 1110 1111 /* Align end of packet buffer for RX DMA end padding */ 1112 align = MAX(1, encp->enc_rx_buf_align_end); 1113 EFSYS_ASSERT(ISP2(align)); 1114 sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align); 1115 1116 /* 1117 * Standard mbuf zones only guarantee pointer-size alignment; 1118 * we need extra space to align to the cache line 1119 */ 1120 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE; 1121 1122 /* Select zone for packet buffers */ 1123 if (reserved <= MCLBYTES) 1124 sc->rx_cluster_size = MCLBYTES; 1125 else if (reserved <= MJUMPAGESIZE) 1126 sc->rx_cluster_size = MJUMPAGESIZE; 1127 else if (reserved <= MJUM9BYTES) 1128 sc->rx_cluster_size = MJUM9BYTES; 1129 else 1130 sc->rx_cluster_size = MJUM16BYTES; 1131 1132 /* 1133 * Set up the scale table. Enable all hash types and hash insertion. 1134 */ 1135 for (index = 0; index < nitems(sc->rx_indir_table); index++) 1136 #ifdef RSS 1137 sc->rx_indir_table[index] = 1138 rss_get_indirection_to_bucket(index) % sc->rxq_count; 1139 #else 1140 sc->rx_indir_table[index] = index % sc->rxq_count; 1141 #endif 1142 if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT, 1143 sc->rx_indir_table, 1144 nitems(sc->rx_indir_table))) != 0) 1145 goto fail; 1146 (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT, 1147 EFX_RX_HASHALG_TOEPLITZ, 1148 EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 | 1149 EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE); 1150 1151 #ifdef RSS 1152 rss_getkey(toep_key); 1153 #endif 1154 if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT, 1155 toep_key, 1156 sizeof(toep_key))) != 0) 1157 goto fail; 1158 1159 /* Start the receive queue(s). */ 1160 for (index = 0; index < sc->rxq_count; index++) { 1161 if ((rc = sfxge_rx_qstart(sc, index)) != 0) 1162 goto fail2; 1163 } 1164 1165 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common, 1166 sc->intr.n_alloc > 1); 1167 if (rc != 0) 1168 goto fail3; 1169 1170 return (0); 1171 1172 fail3: 1173 fail2: 1174 while (--index >= 0) 1175 sfxge_rx_qstop(sc, index); 1176 1177 fail: 1178 efx_rx_fini(sc->enp); 1179 1180 return (rc); 1181 } 1182 1183 #ifdef SFXGE_LRO 1184 1185 static void sfxge_lro_init(struct sfxge_rxq *rxq) 1186 { 1187 struct sfxge_lro_state *st = &rxq->lro; 1188 unsigned i; 1189 1190 st->conns_mask = lro_table_size - 1; 1191 KASSERT(!((st->conns_mask + 1) & st->conns_mask), 1192 ("lro_table_size must be a power of 2")); 1193 st->sc = rxq->sc; 1194 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]), 1195 M_SFXGE, M_WAITOK); 1196 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]), 1197 M_SFXGE, M_WAITOK); 1198 for (i = 0; i <= st->conns_mask; ++i) { 1199 TAILQ_INIT(&st->conns[i]); 1200 st->conns_n[i] = 0; 1201 } 1202 LIST_INIT(&st->active_conns); 1203 TAILQ_INIT(&st->free_conns); 1204 } 1205 1206 static void sfxge_lro_fini(struct sfxge_rxq *rxq) 1207 { 1208 struct sfxge_lro_state *st = &rxq->lro; 1209 struct sfxge_lro_conn *c; 1210 unsigned i; 1211 1212 /* Return cleanly if sfxge_lro_init() has not been called. */ 1213 if (st->conns == NULL) 1214 return; 1215 1216 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections")); 1217 1218 for (i = 0; i <= st->conns_mask; ++i) { 1219 while (!TAILQ_EMPTY(&st->conns[i])) { 1220 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq); 1221 sfxge_lro_drop(rxq, c); 1222 } 1223 } 1224 1225 while (!TAILQ_EMPTY(&st->free_conns)) { 1226 c = TAILQ_FIRST(&st->free_conns); 1227 TAILQ_REMOVE(&st->free_conns, c, link); 1228 KASSERT(!c->mbuf, ("found orphaned mbuf")); 1229 free(c, M_SFXGE); 1230 } 1231 1232 free(st->conns_n, M_SFXGE); 1233 free(st->conns, M_SFXGE); 1234 st->conns = NULL; 1235 } 1236 1237 #else 1238 1239 static void 1240 sfxge_lro_init(struct sfxge_rxq *rxq) 1241 { 1242 } 1243 1244 static void 1245 sfxge_lro_fini(struct sfxge_rxq *rxq) 1246 { 1247 } 1248 1249 #endif /* SFXGE_LRO */ 1250 1251 static void 1252 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index) 1253 { 1254 struct sfxge_rxq *rxq; 1255 1256 rxq = sc->rxq[index]; 1257 1258 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1259 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1260 1261 /* Free the context array and the flow table. */ 1262 free(rxq->queue, M_SFXGE); 1263 sfxge_lro_fini(rxq); 1264 1265 /* Release DMA memory. */ 1266 sfxge_dma_free(&rxq->mem); 1267 1268 sc->rxq[index] = NULL; 1269 1270 free(rxq, M_SFXGE); 1271 } 1272 1273 static int 1274 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index) 1275 { 1276 struct sfxge_rxq *rxq; 1277 struct sfxge_evq *evq; 1278 efsys_mem_t *esmp; 1279 int rc; 1280 1281 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count)); 1282 1283 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK); 1284 rxq->sc = sc; 1285 rxq->index = index; 1286 rxq->entries = sc->rxq_entries; 1287 rxq->ptr_mask = rxq->entries - 1; 1288 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries); 1289 1290 sc->rxq[index] = rxq; 1291 esmp = &rxq->mem; 1292 1293 evq = sc->evq[index]; 1294 1295 /* Allocate and zero DMA space. */ 1296 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0) 1297 return (rc); 1298 1299 /* Allocate buffer table entries. */ 1300 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries), 1301 &rxq->buf_base_id); 1302 1303 /* Allocate the context array and the flow table. */ 1304 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries, 1305 M_SFXGE, M_WAITOK | M_ZERO); 1306 sfxge_lro_init(rxq); 1307 1308 callout_init(&rxq->refill_callout, 1); 1309 1310 rxq->init_state = SFXGE_RXQ_INITIALIZED; 1311 1312 return (0); 1313 } 1314 1315 static const struct { 1316 const char *name; 1317 size_t offset; 1318 } sfxge_rx_stats[] = { 1319 #define SFXGE_RX_STAT(name, member) \ 1320 { #name, offsetof(struct sfxge_rxq, member) } 1321 #ifdef SFXGE_LRO 1322 SFXGE_RX_STAT(lro_merges, lro.n_merges), 1323 SFXGE_RX_STAT(lro_bursts, lro.n_bursts), 1324 SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start), 1325 SFXGE_RX_STAT(lro_misorder, lro.n_misorder), 1326 SFXGE_RX_STAT(lro_too_many, lro.n_too_many), 1327 SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream), 1328 SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle), 1329 SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed) 1330 #endif 1331 }; 1332 1333 static int 1334 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS) 1335 { 1336 struct sfxge_softc *sc = arg1; 1337 unsigned int id = arg2; 1338 unsigned int sum, index; 1339 1340 /* Sum across all RX queues */ 1341 sum = 0; 1342 for (index = 0; index < sc->rxq_count; index++) 1343 sum += *(unsigned int *)((caddr_t)sc->rxq[index] + 1344 sfxge_rx_stats[id].offset); 1345 1346 return (SYSCTL_OUT(req, &sum, sizeof(sum))); 1347 } 1348 1349 static void 1350 sfxge_rx_stat_init(struct sfxge_softc *sc) 1351 { 1352 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev); 1353 struct sysctl_oid_list *stat_list; 1354 unsigned int id; 1355 1356 stat_list = SYSCTL_CHILDREN(sc->stats_node); 1357 1358 for (id = 0; id < nitems(sfxge_rx_stats); id++) { 1359 SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO, 1360 sfxge_rx_stats[id].name, 1361 CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 1362 sc, id, sfxge_rx_stat_handler, "IU", ""); 1363 } 1364 } 1365 1366 void 1367 sfxge_rx_fini(struct sfxge_softc *sc) 1368 { 1369 int index; 1370 1371 index = sc->rxq_count; 1372 while (--index >= 0) 1373 sfxge_rx_qfini(sc, index); 1374 1375 sc->rxq_count = 0; 1376 } 1377 1378 int 1379 sfxge_rx_init(struct sfxge_softc *sc) 1380 { 1381 struct sfxge_intr *intr; 1382 int index; 1383 int rc; 1384 1385 #ifdef SFXGE_LRO 1386 if (!ISP2(lro_table_size)) { 1387 log(LOG_ERR, "%s=%u must be power of 2", 1388 SFXGE_LRO_PARAM(table_size), lro_table_size); 1389 rc = EINVAL; 1390 goto fail_lro_table_size; 1391 } 1392 1393 if (lro_idle_ticks == 0) 1394 lro_idle_ticks = hz / 10 + 1; /* 100 ms */ 1395 #endif 1396 1397 intr = &sc->intr; 1398 1399 sc->rxq_count = intr->n_alloc; 1400 1401 KASSERT(intr->state == SFXGE_INTR_INITIALIZED, 1402 ("intr->state != SFXGE_INTR_INITIALIZED")); 1403 1404 /* Initialize the receive queue(s) - one per interrupt. */ 1405 for (index = 0; index < sc->rxq_count; index++) { 1406 if ((rc = sfxge_rx_qinit(sc, index)) != 0) 1407 goto fail; 1408 } 1409 1410 sfxge_rx_stat_init(sc); 1411 1412 return (0); 1413 1414 fail: 1415 /* Tear down the receive queue(s). */ 1416 while (--index >= 0) 1417 sfxge_rx_qfini(sc, index); 1418 1419 sc->rxq_count = 0; 1420 1421 #ifdef SFXGE_LRO 1422 fail_lro_table_size: 1423 #endif 1424 return (rc); 1425 } 1426