/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

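/*
 * Refill threshold: sfxge_rx_qcomplete() tops up the RX free-buffer ring
 * once the number of posted-but-uncompleted descriptors falls below 90%
 * of the queue limit.
 */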
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

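/*
 * Default RSS hash key, used only when the kernel is built without
 * "options RSS".  This is the well-known 40-byte Toeplitz sample key;
 * with RSS enabled the key is instead fetched from the kernel via
 * rss_getkey() in sfxge_rx_start().
 */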
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

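/*
 * Number of mbuf DMA addresses accumulated in a local array before being
 * posted to the hardware in a single efx_rx_qpost() call.
 */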
#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq __diagused;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
						   CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	if_t ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_pseudo_hdr_hash_get(rxq->common,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

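/*
 * Append the next in-order segment to the currently held super-packet:
 * chain the mbuf, grow the IP payload length and carry over the latest
 * TCP window, ACK and PUSH state.  The merged packet is delivered early
 * if another segment could overflow the 16-bit IP length.
 */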
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}

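/*
 * Start tracking a new connection, either by recycling an entry from the
 * free list or by allocating a fresh one, subject to the lro_chain_max
 * bound on the length of each hash chain.
 */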
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

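/*
 * Process the descriptors completed since the last call: pass each packet
 * either into LRO or straight up the stack, then top up the free-buffer
 * ring if it has fallen below the refill threshold.  Runs with the event
 * queue lock held.
 */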
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = if_getcapenable(sc->ifnet);
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq __diagused;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc __diagused;

			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

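/*
 * Program the buffer table, create and enable the common-code receive
 * queue, then post an initial fill.  Called with the adapter lock held.
 */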
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
	    evq->common, &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

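/*
 * Bring up the receive path: initialise the common RX module, size and
 * align the packet buffers, program the RSS indirection table and hash
 * key, start every receive queue and point the default MAC filter at the
 * first queue.
 */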
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(if_getmtu(sc->ifnet));

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				       sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
	    EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				       toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
		    sfxge_rx_stats[id].name,
		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
		    sc, id, sfxge_rx_stat_handler, "IU", "");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;		/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}