/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");
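
/*
 * Note: sfxge_lro_init() turns this into a bucket mask
 * (conns_mask = lro_table_size - 1), which is why the size must be a
 * power of 2.
 */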

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
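
/*
 * Refill retry helper: post a software "refill" event to this queue's event
 * queue so that the refill is retried from the event-processing context
 * (which is expected to call sfxge_rx_qrefill() when the event is handled).
 */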
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH	64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
		    CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}
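
/*
 * Top the RX queue back up to its limit. Intended to be called when the
 * "refill" software event posted by sfxge_rx_post_refill() is processed;
 * since it ends up in sfxge_rx_qfill(), the event queue lock must be held.
 */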
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
		    efx_pseudo_hdr_hash_get(rxq->common,
			EFX_RX_HASHALG_TOEPLITZ,
			mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
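
/*
 * Note: while a burst is being coalesced, sfxge_lro_start() and
 * sfxge_lro_merge() keep the IP payload length field in host byte order;
 * sfxge_lro_deliver() converts it back to network order (and, for IPv4,
 * recomputes the header checksum) before the packet is handed to the stack.
 */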

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *)iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *)th - eh;
	}

	hdr_length = (char *)th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	    hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *)(th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;
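
	/*
	 * Require a run of in-order segments before coalescing (see the
	 * lro_slow_start_packets and lro_loss_packets tunables above), so
	 * that we do not suppress ACKs while the sender is growing its
	 * congestion window.
	 */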
	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
	    EFX_RX_HASHALG_TOEPLITZ,
	    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;
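
		/*
		 * Drop descriptors completed after the queue was stopped,
		 * and anything flagged for discard (EFX_ADDR_MISMATCH or
		 * EFX_DISCARD).
		 */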
		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
			    mtod(m, uint8_t *),
			    &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
	    evq->common, &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
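
	/*
	 * An mbuf allocation failure during the initial fill is not fatal:
	 * sfxge_rx_qfill() schedules a retry through the refill callout if
	 * it cannot reach the requested fill level.
	 */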
	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
		    rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
	    sc->rx_indir_table,
	    nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
	    EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
	    toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;
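
	/*
	 * Note: without the kernel RSS option the indirection table above is
	 * a simple round-robin over the RX queues and toep_key is the
	 * well-known default Toeplitz key, so receive flows are still spread
	 * across queues but are not coordinated with the stack's RSS state.
	 */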

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
	    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	    M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	    M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
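
/*
 * Allocate and initialise the software state and DMA memory for RX queue
 * "index". The queue is not exposed to the hardware until sfxge_rx_qstart().
 */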
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
	    &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
		    sfxge_rx_stats[id].name,
		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
		    sc, id, sfxge_rx_stat_handler, "IU", "");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}