1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2010-2016 Solarflare Communications Inc. 5 * All rights reserved. 6 * 7 * This software was developed in part by Philip Paeps under contract for 8 * Solarflare Communications, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright notice, 14 * this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright notice, 16 * this list of conditions and the following disclaimer in the documentation 17 * and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * The views and conclusions contained in the software and documentation are 32 * those of the authors and should not be interpreted as representing official 33 * policies, either expressed or implied, of the FreeBSD Project. 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include "opt_rss.h" 40 41 #include <sys/param.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/smp.h> 45 #include <sys/socket.h> 46 #include <sys/sysctl.h> 47 #include <sys/syslog.h> 48 #include <sys/limits.h> 49 #include <sys/syslog.h> 50 51 #include <net/ethernet.h> 52 #include <net/if.h> 53 #include <net/if_vlan_var.h> 54 55 #include <netinet/in.h> 56 #include <netinet/ip.h> 57 #include <netinet/ip6.h> 58 #include <netinet/tcp.h> 59 60 #include <machine/in_cksum.h> 61 62 #ifdef RSS 63 #include <net/rss_config.h> 64 #endif 65 66 #include "common/efx.h" 67 68 69 #include "sfxge.h" 70 #include "sfxge_rx.h" 71 72 #define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10) 73 74 #ifdef SFXGE_LRO 75 76 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL, 77 "Large receive offload (LRO) parameters"); 78 79 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param) 80 81 /* Size of the LRO hash table. Must be a power of 2. A larger table 82 * means we can accelerate a larger number of streams. 83 */ 84 static unsigned lro_table_size = 128; 85 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size); 86 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN, 87 &lro_table_size, 0, 88 "Size of the LRO hash table (must be a power of 2)"); 89 90 /* Maximum length of a hash chain. If chains get too long then the lookup 91 * time increases and may exceed the benefit of LRO. 92 */ 93 static unsigned lro_chain_max = 20; 94 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max); 95 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN, 96 &lro_chain_max, 0, 97 "The maximum length of a hash chain"); 98 99 /* Maximum time (in ticks) that a connection can be idle before it's LRO 100 * state is discarded. 101 */ 102 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */ 103 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks); 104 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN, 105 &lro_idle_ticks, 0, 106 "The maximum time (in ticks) that a connection can be idle " 107 "before it's LRO state is discarded"); 108 109 /* Number of packets with payload that must arrive in-order before a 110 * connection is eligible for LRO. The idea is we should avoid coalescing 111 * segments when the sender is in slow-start because reducing the ACK rate 112 * can damage performance. 113 */ 114 static int lro_slow_start_packets = 2000; 115 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets); 116 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN, 117 &lro_slow_start_packets, 0, 118 "Number of packets with payload that must arrive in-order before " 119 "a connection is eligible for LRO"); 120 121 /* Number of packets with payload that must arrive in-order following loss 122 * before a connection is eligible for LRO. The idea is we should avoid 123 * coalescing segments when the sender is recovering from loss, because 124 * reducing the ACK rate can damage performance. 125 */ 126 static int lro_loss_packets = 20; 127 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets); 128 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN, 129 &lro_loss_packets, 0, 130 "Number of packets with payload that must arrive in-order " 131 "following loss before a connection is eligible for LRO"); 132 133 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */ 134 #define SFXGE_LRO_L2_ID_VLAN 0x4000 135 #define SFXGE_LRO_L2_ID_IPV6 0x8000 136 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN) 137 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6)) 138 139 /* Compare IPv6 addresses, avoiding conditional branches */ 140 static unsigned long ipv6_addr_cmp(const struct in6_addr *left, 141 const struct in6_addr *right) 142 { 143 #if LONG_BIT == 64 144 const uint64_t *left64 = (const uint64_t *)left; 145 const uint64_t *right64 = (const uint64_t *)right; 146 return (left64[0] - right64[0]) | (left64[1] - right64[1]); 147 #else 148 return (left->s6_addr32[0] - right->s6_addr32[0]) | 149 (left->s6_addr32[1] - right->s6_addr32[1]) | 150 (left->s6_addr32[2] - right->s6_addr32[2]) | 151 (left->s6_addr32[3] - right->s6_addr32[3]); 152 #endif 153 } 154 155 #endif /* SFXGE_LRO */ 156 157 void 158 sfxge_rx_qflush_done(struct sfxge_rxq *rxq) 159 { 160 161 rxq->flush_state = SFXGE_FLUSH_DONE; 162 } 163 164 void 165 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq) 166 { 167 168 rxq->flush_state = SFXGE_FLUSH_FAILED; 169 } 170 171 #ifdef RSS 172 static uint8_t toep_key[RSS_KEYSIZE]; 173 #else 174 static uint8_t toep_key[] = { 175 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 176 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 177 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 178 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 179 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 180 }; 181 #endif 182 183 static void 184 sfxge_rx_post_refill(void *arg) 185 { 186 struct sfxge_rxq *rxq = arg; 187 struct sfxge_softc *sc; 188 unsigned int index; 189 struct sfxge_evq *evq; 190 uint16_t magic; 191 192 sc = rxq->sc; 193 index = rxq->index; 194 evq = sc->evq[index]; 195 magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq); 196 197 /* This is guaranteed due to the start/stop order of rx and ev */ 198 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 199 ("evq not started")); 200 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 201 ("rxq not started")); 202 efx_ev_qpost(evq->common, magic); 203 } 204 205 static void 206 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying) 207 { 208 /* Initially retry after 100 ms, but back off in case of 209 * repeated failures as we probably have to wait for the 210 * administrator to raise the pool limit. */ 211 if (retrying) 212 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz); 213 else 214 rxq->refill_delay = hz / 10; 215 216 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay, 217 sfxge_rx_post_refill, rxq); 218 } 219 220 #define SFXGE_REFILL_BATCH 64 221 222 static void 223 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying) 224 { 225 struct sfxge_softc *sc; 226 unsigned int index; 227 struct sfxge_evq *evq; 228 unsigned int batch; 229 unsigned int rxfill; 230 unsigned int mblksize; 231 int ntodo; 232 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH]; 233 234 sc = rxq->sc; 235 index = rxq->index; 236 evq = sc->evq[index]; 237 238 prefetch_read_many(sc->enp); 239 prefetch_read_many(rxq->common); 240 241 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 242 243 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 244 return; 245 246 rxfill = rxq->added - rxq->completed; 247 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries), 248 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)")); 249 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target); 250 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries), 251 ("ntodo > EFX_RQX_LIMIT(rxq->entries)")); 252 253 if (ntodo == 0) 254 return; 255 256 batch = 0; 257 mblksize = sc->rx_buffer_size - sc->rx_buffer_align; 258 while (ntodo-- > 0) { 259 unsigned int id; 260 struct sfxge_rx_sw_desc *rx_desc; 261 bus_dma_segment_t seg; 262 struct mbuf *m; 263 264 id = (rxq->added + batch) & rxq->ptr_mask; 265 rx_desc = &rxq->queue[id]; 266 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL")); 267 268 rx_desc->flags = EFX_DISCARD; 269 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, 270 sc->rx_cluster_size); 271 if (m == NULL) 272 break; 273 274 /* m_len specifies length of area to be mapped for DMA */ 275 m->m_len = mblksize; 276 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE); 277 m->m_data += sc->rx_buffer_align; 278 279 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg); 280 addr[batch++] = seg.ds_addr; 281 282 if (batch == SFXGE_REFILL_BATCH) { 283 efx_rx_qpost(rxq->common, addr, mblksize, batch, 284 rxq->completed, rxq->added); 285 rxq->added += batch; 286 batch = 0; 287 } 288 } 289 290 if (ntodo != 0) 291 sfxge_rx_schedule_refill(rxq, retrying); 292 293 if (batch != 0) { 294 efx_rx_qpost(rxq->common, addr, mblksize, batch, 295 rxq->completed, rxq->added); 296 rxq->added += batch; 297 } 298 299 /* Make the descriptors visible to the hardware */ 300 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map, 301 BUS_DMASYNC_PREWRITE); 302 303 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed); 304 305 /* The queue could still be empty if no descriptors were actually 306 * pushed, in which case there will be no event to cause the next 307 * refill, so we must schedule a refill ourselves. 308 */ 309 if(rxq->pushed == rxq->completed) { 310 sfxge_rx_schedule_refill(rxq, retrying); 311 } 312 } 313 314 void 315 sfxge_rx_qrefill(struct sfxge_rxq *rxq) 316 { 317 318 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 319 return; 320 321 /* Make sure the queue is full */ 322 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE); 323 } 324 325 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m) 326 { 327 struct ifnet *ifp = sc->ifnet; 328 329 m->m_pkthdr.rcvif = ifp; 330 m->m_pkthdr.csum_data = 0xffff; 331 ifp->if_input(ifp, m); 332 } 333 334 static void 335 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc) 336 { 337 struct sfxge_softc *sc = rxq->sc; 338 struct mbuf *m = rx_desc->mbuf; 339 int flags = rx_desc->flags; 340 int csum_flags; 341 342 /* Convert checksum flags */ 343 csum_flags = (flags & EFX_CKSUM_IPV4) ? 344 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0; 345 if (flags & EFX_CKSUM_TCPUDP) 346 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 347 348 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 349 m->m_pkthdr.flowid = 350 efx_pseudo_hdr_hash_get(rxq->common, 351 EFX_RX_HASHALG_TOEPLITZ, 352 mtod(m, uint8_t *)); 353 /* The hash covers a 4-tuple for TCP only */ 354 M_HASHTYPE_SET(m, 355 (flags & EFX_PKT_IPV4) ? 356 ((flags & EFX_PKT_TCP) ? 357 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) : 358 ((flags & EFX_PKT_TCP) ? 359 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6)); 360 } 361 m->m_data += sc->rx_prefix_size; 362 m->m_len = rx_desc->size - sc->rx_prefix_size; 363 m->m_pkthdr.len = m->m_len; 364 m->m_pkthdr.csum_flags = csum_flags; 365 __sfxge_rx_deliver(sc, rx_desc->mbuf); 366 367 rx_desc->flags = EFX_DISCARD; 368 rx_desc->mbuf = NULL; 369 } 370 371 #ifdef SFXGE_LRO 372 373 static void 374 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c) 375 { 376 struct sfxge_softc *sc = st->sc; 377 struct mbuf *m = c->mbuf; 378 struct tcphdr *c_th; 379 int csum_flags; 380 381 KASSERT(m, ("no mbuf to deliver")); 382 383 ++st->n_bursts; 384 385 /* Finish off packet munging and recalculate IP header checksum. */ 386 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 387 struct ip *iph = c->nh; 388 iph->ip_len = htons(iph->ip_len); 389 iph->ip_sum = 0; 390 iph->ip_sum = in_cksum_hdr(iph); 391 c_th = (struct tcphdr *)(iph + 1); 392 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | 393 CSUM_IP_CHECKED | CSUM_IP_VALID); 394 } else { 395 struct ip6_hdr *iph = c->nh; 396 iph->ip6_plen = htons(iph->ip6_plen); 397 c_th = (struct tcphdr *)(iph + 1); 398 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 399 } 400 401 c_th->th_win = c->th_last->th_win; 402 c_th->th_ack = c->th_last->th_ack; 403 if (c_th->th_off == c->th_last->th_off) { 404 /* Copy TCP options (take care to avoid going negative). */ 405 int optlen = ((c_th->th_off - 5) & 0xf) << 2u; 406 memcpy(c_th + 1, c->th_last + 1, optlen); 407 } 408 409 m->m_pkthdr.flowid = c->conn_hash; 410 M_HASHTYPE_SET(m, 411 SFXGE_LRO_CONN_IS_TCPIPV4(c) ? 412 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6); 413 414 m->m_pkthdr.csum_flags = csum_flags; 415 __sfxge_rx_deliver(sc, m); 416 417 c->mbuf = NULL; 418 c->delivered = 1; 419 } 420 421 /* Drop the given connection, and add it to the free list. */ 422 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 423 { 424 unsigned bucket; 425 426 KASSERT(!c->mbuf, ("found orphaned mbuf")); 427 428 if (c->next_buf.mbuf != NULL) { 429 sfxge_rx_deliver(rxq, &c->next_buf); 430 LIST_REMOVE(c, active_link); 431 } 432 433 bucket = c->conn_hash & rxq->lro.conns_mask; 434 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong")); 435 --rxq->lro.conns_n[bucket]; 436 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 437 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link); 438 } 439 440 /* Stop tracking connections that have gone idle in order to keep hash 441 * chains short. 442 */ 443 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now) 444 { 445 struct sfxge_lro_conn *c; 446 unsigned i; 447 448 KASSERT(LIST_EMPTY(&rxq->lro.active_conns), 449 ("found active connections")); 450 451 rxq->lro.last_purge_ticks = now; 452 for (i = 0; i <= rxq->lro.conns_mask; ++i) { 453 if (TAILQ_EMPTY(&rxq->lro.conns[i])) 454 continue; 455 456 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq); 457 if (now - c->last_pkt_ticks > lro_idle_ticks) { 458 ++rxq->lro.n_drop_idle; 459 sfxge_lro_drop(rxq, c); 460 } 461 } 462 } 463 464 static void 465 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 466 struct mbuf *mbuf, struct tcphdr *th) 467 { 468 struct tcphdr *c_th; 469 470 /* Tack the new mbuf onto the chain. */ 471 KASSERT(!mbuf->m_next, ("mbuf already chained")); 472 c->mbuf_tail->m_next = mbuf; 473 c->mbuf_tail = mbuf; 474 475 /* Increase length appropriately */ 476 c->mbuf->m_pkthdr.len += mbuf->m_len; 477 478 /* Update the connection state flags */ 479 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 480 struct ip *iph = c->nh; 481 iph->ip_len += mbuf->m_len; 482 c_th = (struct tcphdr *)(iph + 1); 483 } else { 484 struct ip6_hdr *iph = c->nh; 485 iph->ip6_plen += mbuf->m_len; 486 c_th = (struct tcphdr *)(iph + 1); 487 } 488 c_th->th_flags |= (th->th_flags & TH_PUSH); 489 c->th_last = th; 490 ++st->n_merges; 491 492 /* Pass packet up now if another segment could overflow the IP 493 * length. 494 */ 495 if (c->mbuf->m_pkthdr.len > 65536 - 9200) 496 sfxge_lro_deliver(st, c); 497 } 498 499 static void 500 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 501 struct mbuf *mbuf, void *nh, struct tcphdr *th) 502 { 503 /* Start the chain */ 504 c->mbuf = mbuf; 505 c->mbuf_tail = c->mbuf; 506 c->nh = nh; 507 c->th_last = th; 508 509 mbuf->m_pkthdr.len = mbuf->m_len; 510 511 /* Mangle header fields for later processing */ 512 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 513 struct ip *iph = nh; 514 iph->ip_len = ntohs(iph->ip_len); 515 } else { 516 struct ip6_hdr *iph = nh; 517 iph->ip6_plen = ntohs(iph->ip6_plen); 518 } 519 } 520 521 /* Try to merge or otherwise hold or deliver (as appropriate) the 522 * packet buffered for this connection (c->next_buf). Return a flag 523 * indicating whether the connection is still active for LRO purposes. 524 */ 525 static int 526 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 527 { 528 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf; 529 char *eh = c->next_eh; 530 int data_length, hdr_length, dont_merge; 531 unsigned th_seq, pkt_length; 532 struct tcphdr *th; 533 unsigned now; 534 535 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 536 struct ip *iph = c->next_nh; 537 th = (struct tcphdr *)(iph + 1); 538 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh; 539 } else { 540 struct ip6_hdr *iph = c->next_nh; 541 th = (struct tcphdr *)(iph + 1); 542 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh; 543 } 544 545 hdr_length = (char *) th + th->th_off * 4 - eh; 546 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) - 547 hdr_length); 548 th_seq = ntohl(th->th_seq); 549 dont_merge = ((data_length <= 0) 550 | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN))); 551 552 /* Check for options other than aligned timestamp. */ 553 if (th->th_off != 5) { 554 const uint32_t *opt_ptr = (const uint32_t *) (th + 1); 555 if (th->th_off == 8 && 556 opt_ptr[0] == ntohl((TCPOPT_NOP << 24) | 557 (TCPOPT_NOP << 16) | 558 (TCPOPT_TIMESTAMP << 8) | 559 TCPOLEN_TIMESTAMP)) { 560 /* timestamp option -- okay */ 561 } else { 562 dont_merge = 1; 563 } 564 } 565 566 if (__predict_false(th_seq != c->next_seq)) { 567 /* Out-of-order, so start counting again. */ 568 if (c->mbuf != NULL) 569 sfxge_lro_deliver(&rxq->lro, c); 570 c->n_in_order_pkts -= lro_loss_packets; 571 c->next_seq = th_seq + data_length; 572 ++rxq->lro.n_misorder; 573 goto deliver_buf_out; 574 } 575 c->next_seq = th_seq + data_length; 576 577 now = ticks; 578 if (now - c->last_pkt_ticks > lro_idle_ticks) { 579 ++rxq->lro.n_drop_idle; 580 if (c->mbuf != NULL) 581 sfxge_lro_deliver(&rxq->lro, c); 582 sfxge_lro_drop(rxq, c); 583 return (0); 584 } 585 c->last_pkt_ticks = ticks; 586 587 if (c->n_in_order_pkts < lro_slow_start_packets) { 588 /* May be in slow-start, so don't merge. */ 589 ++rxq->lro.n_slow_start; 590 ++c->n_in_order_pkts; 591 goto deliver_buf_out; 592 } 593 594 if (__predict_false(dont_merge)) { 595 if (c->mbuf != NULL) 596 sfxge_lro_deliver(&rxq->lro, c); 597 if (th->th_flags & (TH_FIN | TH_RST)) { 598 ++rxq->lro.n_drop_closed; 599 sfxge_lro_drop(rxq, c); 600 return (0); 601 } 602 goto deliver_buf_out; 603 } 604 605 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size; 606 607 if (__predict_true(c->mbuf != NULL)) { 608 /* Remove headers and any padding */ 609 rx_buf->mbuf->m_data += hdr_length; 610 rx_buf->mbuf->m_len = data_length; 611 612 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th); 613 } else { 614 /* Remove any padding */ 615 rx_buf->mbuf->m_len = pkt_length; 616 617 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th); 618 } 619 620 rx_buf->mbuf = NULL; 621 return (1); 622 623 deliver_buf_out: 624 sfxge_rx_deliver(rxq, rx_buf); 625 return (1); 626 } 627 628 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash, 629 uint16_t l2_id, void *nh, struct tcphdr *th) 630 { 631 unsigned bucket = conn_hash & st->conns_mask; 632 struct sfxge_lro_conn *c; 633 634 if (st->conns_n[bucket] >= lro_chain_max) { 635 ++st->n_too_many; 636 return; 637 } 638 639 if (!TAILQ_EMPTY(&st->free_conns)) { 640 c = TAILQ_FIRST(&st->free_conns); 641 TAILQ_REMOVE(&st->free_conns, c, link); 642 } else { 643 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT); 644 if (c == NULL) 645 return; 646 c->mbuf = NULL; 647 c->next_buf.mbuf = NULL; 648 } 649 650 /* Create the connection tracking data */ 651 ++st->conns_n[bucket]; 652 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link); 653 c->l2_id = l2_id; 654 c->conn_hash = conn_hash; 655 c->source = th->th_sport; 656 c->dest = th->th_dport; 657 c->n_in_order_pkts = 0; 658 c->last_pkt_ticks = *(volatile int *)&ticks; 659 c->delivered = 0; 660 ++st->n_new_stream; 661 /* NB. We don't initialise c->next_seq, and it doesn't matter what 662 * value it has. Most likely the next packet received for this 663 * connection will not match -- no harm done. 664 */ 665 } 666 667 /* Process mbuf and decide whether to dispatch it to the stack now or 668 * later. 669 */ 670 static void 671 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 672 { 673 struct sfxge_softc *sc = rxq->sc; 674 struct mbuf *m = rx_buf->mbuf; 675 struct ether_header *eh; 676 struct sfxge_lro_conn *c; 677 uint16_t l2_id; 678 uint16_t l3_proto; 679 void *nh; 680 struct tcphdr *th; 681 uint32_t conn_hash; 682 unsigned bucket; 683 684 /* Get the hardware hash */ 685 conn_hash = efx_pseudo_hdr_hash_get(rxq->common, 686 EFX_RX_HASHALG_TOEPLITZ, 687 mtod(m, uint8_t *)); 688 689 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size); 690 if (eh->ether_type == htons(ETHERTYPE_VLAN)) { 691 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh; 692 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) | 693 SFXGE_LRO_L2_ID_VLAN; 694 l3_proto = veh->evl_proto; 695 nh = veh + 1; 696 } else { 697 l2_id = 0; 698 l3_proto = eh->ether_type; 699 nh = eh + 1; 700 } 701 702 /* Check whether this is a suitable packet (unfragmented 703 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and 704 * length, and compute a hash if necessary. If not, return. 705 */ 706 if (l3_proto == htons(ETHERTYPE_IP)) { 707 struct ip *iph = nh; 708 709 KASSERT(iph->ip_p == IPPROTO_TCP, 710 ("IPv4 protocol is not TCP, but packet marker is set")); 711 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) | 712 (iph->ip_off & htons(IP_MF | IP_OFFMASK))) 713 goto deliver_now; 714 th = (struct tcphdr *)(iph + 1); 715 } else if (l3_proto == htons(ETHERTYPE_IPV6)) { 716 struct ip6_hdr *iph = nh; 717 718 KASSERT(iph->ip6_nxt == IPPROTO_TCP, 719 ("IPv6 next header is not TCP, but packet marker is set")); 720 l2_id |= SFXGE_LRO_L2_ID_IPV6; 721 th = (struct tcphdr *)(iph + 1); 722 } else { 723 goto deliver_now; 724 } 725 726 bucket = conn_hash & rxq->lro.conns_mask; 727 728 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) { 729 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash)) 730 continue; 731 if ((c->source - th->th_sport) | (c->dest - th->th_dport)) 732 continue; 733 if (c->mbuf != NULL) { 734 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 735 struct ip *c_iph, *iph = nh; 736 c_iph = c->nh; 737 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) | 738 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr)) 739 continue; 740 } else { 741 struct ip6_hdr *c_iph, *iph = nh; 742 c_iph = c->nh; 743 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) | 744 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst)) 745 continue; 746 } 747 } 748 749 /* Re-insert at head of list to reduce lookup time. */ 750 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 751 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link); 752 753 if (c->next_buf.mbuf != NULL) { 754 if (!sfxge_lro_try_merge(rxq, c)) 755 goto deliver_now; 756 } else { 757 LIST_INSERT_HEAD(&rxq->lro.active_conns, c, 758 active_link); 759 } 760 c->next_buf = *rx_buf; 761 c->next_eh = eh; 762 c->next_nh = nh; 763 764 rx_buf->mbuf = NULL; 765 rx_buf->flags = EFX_DISCARD; 766 return; 767 } 768 769 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th); 770 deliver_now: 771 sfxge_rx_deliver(rxq, rx_buf); 772 } 773 774 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 775 { 776 struct sfxge_lro_state *st = &rxq->lro; 777 struct sfxge_lro_conn *c; 778 unsigned t; 779 780 while (!LIST_EMPTY(&st->active_conns)) { 781 c = LIST_FIRST(&st->active_conns); 782 if (!c->delivered && c->mbuf != NULL) 783 sfxge_lro_deliver(st, c); 784 if (sfxge_lro_try_merge(rxq, c)) { 785 if (c->mbuf != NULL) 786 sfxge_lro_deliver(st, c); 787 LIST_REMOVE(c, active_link); 788 } 789 c->delivered = 0; 790 } 791 792 t = *(volatile int *)&ticks; 793 if (__predict_false(t != st->last_purge_ticks)) 794 sfxge_lro_purge_idle(rxq, t); 795 } 796 797 #else /* !SFXGE_LRO */ 798 799 static void 800 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 801 { 802 } 803 804 static void 805 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 806 { 807 } 808 809 #endif /* SFXGE_LRO */ 810 811 void 812 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop) 813 { 814 struct sfxge_softc *sc = rxq->sc; 815 int if_capenable = sc->ifnet->if_capenable; 816 int lro_enabled = if_capenable & IFCAP_LRO; 817 unsigned int index; 818 struct sfxge_evq *evq; 819 unsigned int completed; 820 unsigned int level; 821 struct mbuf *m; 822 struct sfxge_rx_sw_desc *prev = NULL; 823 824 index = rxq->index; 825 evq = sc->evq[index]; 826 827 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 828 829 completed = rxq->completed; 830 while (completed != rxq->pending) { 831 unsigned int id; 832 struct sfxge_rx_sw_desc *rx_desc; 833 834 id = completed++ & rxq->ptr_mask; 835 rx_desc = &rxq->queue[id]; 836 m = rx_desc->mbuf; 837 838 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 839 goto discard; 840 841 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD)) 842 goto discard; 843 844 /* Read the length from the pseudo header if required */ 845 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) { 846 uint16_t tmp_size; 847 int rc; 848 rc = efx_pseudo_hdr_pkt_length_get(rxq->common, 849 mtod(m, uint8_t *), 850 &tmp_size); 851 KASSERT(rc == 0, ("cannot get packet length: %d", rc)); 852 rx_desc->size = (int)tmp_size + sc->rx_prefix_size; 853 } 854 855 prefetch_read_many(mtod(m, caddr_t)); 856 857 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 858 case EFX_PKT_IPV4: 859 if (~if_capenable & IFCAP_RXCSUM) 860 rx_desc->flags &= 861 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP); 862 break; 863 case EFX_PKT_IPV6: 864 if (~if_capenable & IFCAP_RXCSUM_IPV6) 865 rx_desc->flags &= ~EFX_CKSUM_TCPUDP; 866 break; 867 case 0: 868 /* Check for loopback packets */ 869 { 870 struct ether_header *etherhp; 871 872 /*LINTED*/ 873 etherhp = mtod(m, struct ether_header *); 874 875 if (etherhp->ether_type == 876 htons(SFXGE_ETHERTYPE_LOOPBACK)) { 877 EFSYS_PROBE(loopback); 878 879 rxq->loopback++; 880 goto discard; 881 } 882 } 883 break; 884 default: 885 KASSERT(B_FALSE, 886 ("Rx descriptor with both IPv4 and IPv6 flags")); 887 goto discard; 888 } 889 890 /* Pass packet up the stack or into LRO (pipelined) */ 891 if (prev != NULL) { 892 if (lro_enabled && 893 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 894 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 895 sfxge_lro(rxq, prev); 896 else 897 sfxge_rx_deliver(rxq, prev); 898 } 899 prev = rx_desc; 900 continue; 901 902 discard: 903 /* Return the packet to the pool */ 904 m_free(m); 905 rx_desc->mbuf = NULL; 906 } 907 rxq->completed = completed; 908 909 level = rxq->added - rxq->completed; 910 911 /* Pass last packet up the stack or into LRO */ 912 if (prev != NULL) { 913 if (lro_enabled && 914 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 915 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 916 sfxge_lro(rxq, prev); 917 else 918 sfxge_rx_deliver(rxq, prev); 919 } 920 921 /* 922 * If there are any pending flows and this is the end of the 923 * poll then they must be completed. 924 */ 925 if (eop) 926 sfxge_lro_end_of_burst(rxq); 927 928 /* Top up the queue if necessary */ 929 if (level < rxq->refill_threshold) 930 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE); 931 } 932 933 static void 934 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index) 935 { 936 struct sfxge_rxq *rxq; 937 struct sfxge_evq *evq; 938 unsigned int count; 939 unsigned int retry = 3; 940 941 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 942 943 rxq = sc->rxq[index]; 944 evq = sc->evq[index]; 945 946 SFXGE_EVQ_LOCK(evq); 947 948 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 949 ("rxq not started")); 950 951 rxq->init_state = SFXGE_RXQ_INITIALIZED; 952 953 callout_stop(&rxq->refill_callout); 954 955 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) { 956 rxq->flush_state = SFXGE_FLUSH_PENDING; 957 958 SFXGE_EVQ_UNLOCK(evq); 959 960 /* Flush the receive queue */ 961 if (efx_rx_qflush(rxq->common) != 0) { 962 SFXGE_EVQ_LOCK(evq); 963 rxq->flush_state = SFXGE_FLUSH_FAILED; 964 break; 965 } 966 967 count = 0; 968 do { 969 /* Spin for 100 ms */ 970 DELAY(100000); 971 972 if (rxq->flush_state != SFXGE_FLUSH_PENDING) 973 break; 974 975 } while (++count < 20); 976 977 SFXGE_EVQ_LOCK(evq); 978 979 if (rxq->flush_state == SFXGE_FLUSH_PENDING) { 980 /* Flush timeout - neither done nor failed */ 981 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n", 982 device_get_nameunit(sc->dev), index); 983 rxq->flush_state = SFXGE_FLUSH_DONE; 984 } 985 retry--; 986 } 987 if (rxq->flush_state == SFXGE_FLUSH_FAILED) { 988 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n", 989 device_get_nameunit(sc->dev), index); 990 rxq->flush_state = SFXGE_FLUSH_DONE; 991 } 992 993 rxq->pending = rxq->added; 994 sfxge_rx_qcomplete(rxq, B_TRUE); 995 996 KASSERT(rxq->completed == rxq->pending, 997 ("rxq->completed != rxq->pending")); 998 999 rxq->added = 0; 1000 rxq->pushed = 0; 1001 rxq->pending = 0; 1002 rxq->completed = 0; 1003 rxq->loopback = 0; 1004 1005 /* Destroy the common code receive queue. */ 1006 efx_rx_qdestroy(rxq->common); 1007 1008 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1009 EFX_RXQ_NBUFS(sc->rxq_entries)); 1010 1011 SFXGE_EVQ_UNLOCK(evq); 1012 } 1013 1014 static int 1015 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index) 1016 { 1017 struct sfxge_rxq *rxq; 1018 efsys_mem_t *esmp; 1019 struct sfxge_evq *evq; 1020 int rc; 1021 1022 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 1023 1024 rxq = sc->rxq[index]; 1025 esmp = &rxq->mem; 1026 evq = sc->evq[index]; 1027 1028 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1029 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1030 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 1031 ("evq->init_state != SFXGE_EVQ_STARTED")); 1032 1033 /* Program the buffer table. */ 1034 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp, 1035 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0) 1036 return (rc); 1037 1038 /* Create the common code receive queue. */ 1039 if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT, 1040 esmp, sc->rxq_entries, rxq->buf_base_id, evq->common, 1041 &rxq->common)) != 0) 1042 goto fail; 1043 1044 SFXGE_EVQ_LOCK(evq); 1045 1046 /* Enable the receive queue. */ 1047 efx_rx_qenable(rxq->common); 1048 1049 rxq->init_state = SFXGE_RXQ_STARTED; 1050 rxq->flush_state = SFXGE_FLUSH_REQUIRED; 1051 1052 /* Try to fill the queue from the pool. */ 1053 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE); 1054 1055 SFXGE_EVQ_UNLOCK(evq); 1056 1057 return (0); 1058 1059 fail: 1060 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1061 EFX_RXQ_NBUFS(sc->rxq_entries)); 1062 return (rc); 1063 } 1064 1065 void 1066 sfxge_rx_stop(struct sfxge_softc *sc) 1067 { 1068 int index; 1069 1070 efx_mac_filter_default_rxq_clear(sc->enp); 1071 1072 /* Stop the receive queue(s) */ 1073 index = sc->rxq_count; 1074 while (--index >= 0) 1075 sfxge_rx_qstop(sc, index); 1076 1077 sc->rx_prefix_size = 0; 1078 sc->rx_buffer_size = 0; 1079 1080 efx_rx_fini(sc->enp); 1081 } 1082 1083 int 1084 sfxge_rx_start(struct sfxge_softc *sc) 1085 { 1086 struct sfxge_intr *intr; 1087 const efx_nic_cfg_t *encp; 1088 size_t hdrlen, align, reserved; 1089 int index; 1090 int rc; 1091 1092 intr = &sc->intr; 1093 1094 /* Initialize the common code receive module. */ 1095 if ((rc = efx_rx_init(sc->enp)) != 0) 1096 return (rc); 1097 1098 encp = efx_nic_cfg_get(sc->enp); 1099 sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu); 1100 1101 /* Calculate the receive packet buffer size. */ 1102 sc->rx_prefix_size = encp->enc_rx_prefix_size; 1103 1104 /* Ensure IP headers are 32bit aligned */ 1105 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header); 1106 sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen; 1107 1108 sc->rx_buffer_size += sc->rx_buffer_align; 1109 1110 /* Align end of packet buffer for RX DMA end padding */ 1111 align = MAX(1, encp->enc_rx_buf_align_end); 1112 EFSYS_ASSERT(ISP2(align)); 1113 sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align); 1114 1115 /* 1116 * Standard mbuf zones only guarantee pointer-size alignment; 1117 * we need extra space to align to the cache line 1118 */ 1119 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE; 1120 1121 /* Select zone for packet buffers */ 1122 if (reserved <= MCLBYTES) 1123 sc->rx_cluster_size = MCLBYTES; 1124 else if (reserved <= MJUMPAGESIZE) 1125 sc->rx_cluster_size = MJUMPAGESIZE; 1126 else if (reserved <= MJUM9BYTES) 1127 sc->rx_cluster_size = MJUM9BYTES; 1128 else 1129 sc->rx_cluster_size = MJUM16BYTES; 1130 1131 /* 1132 * Set up the scale table. Enable all hash types and hash insertion. 1133 */ 1134 for (index = 0; index < nitems(sc->rx_indir_table); index++) 1135 #ifdef RSS 1136 sc->rx_indir_table[index] = 1137 rss_get_indirection_to_bucket(index) % sc->rxq_count; 1138 #else 1139 sc->rx_indir_table[index] = index % sc->rxq_count; 1140 #endif 1141 if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT, 1142 sc->rx_indir_table, 1143 nitems(sc->rx_indir_table))) != 0) 1144 goto fail; 1145 (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT, 1146 EFX_RX_HASHALG_TOEPLITZ, 1147 EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 | 1148 EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE); 1149 1150 #ifdef RSS 1151 rss_getkey(toep_key); 1152 #endif 1153 if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT, 1154 toep_key, 1155 sizeof(toep_key))) != 0) 1156 goto fail; 1157 1158 /* Start the receive queue(s). */ 1159 for (index = 0; index < sc->rxq_count; index++) { 1160 if ((rc = sfxge_rx_qstart(sc, index)) != 0) 1161 goto fail2; 1162 } 1163 1164 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common, 1165 sc->intr.n_alloc > 1); 1166 if (rc != 0) 1167 goto fail3; 1168 1169 return (0); 1170 1171 fail3: 1172 fail2: 1173 while (--index >= 0) 1174 sfxge_rx_qstop(sc, index); 1175 1176 fail: 1177 efx_rx_fini(sc->enp); 1178 1179 return (rc); 1180 } 1181 1182 #ifdef SFXGE_LRO 1183 1184 static void sfxge_lro_init(struct sfxge_rxq *rxq) 1185 { 1186 struct sfxge_lro_state *st = &rxq->lro; 1187 unsigned i; 1188 1189 st->conns_mask = lro_table_size - 1; 1190 KASSERT(!((st->conns_mask + 1) & st->conns_mask), 1191 ("lro_table_size must be a power of 2")); 1192 st->sc = rxq->sc; 1193 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]), 1194 M_SFXGE, M_WAITOK); 1195 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]), 1196 M_SFXGE, M_WAITOK); 1197 for (i = 0; i <= st->conns_mask; ++i) { 1198 TAILQ_INIT(&st->conns[i]); 1199 st->conns_n[i] = 0; 1200 } 1201 LIST_INIT(&st->active_conns); 1202 TAILQ_INIT(&st->free_conns); 1203 } 1204 1205 static void sfxge_lro_fini(struct sfxge_rxq *rxq) 1206 { 1207 struct sfxge_lro_state *st = &rxq->lro; 1208 struct sfxge_lro_conn *c; 1209 unsigned i; 1210 1211 /* Return cleanly if sfxge_lro_init() has not been called. */ 1212 if (st->conns == NULL) 1213 return; 1214 1215 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections")); 1216 1217 for (i = 0; i <= st->conns_mask; ++i) { 1218 while (!TAILQ_EMPTY(&st->conns[i])) { 1219 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq); 1220 sfxge_lro_drop(rxq, c); 1221 } 1222 } 1223 1224 while (!TAILQ_EMPTY(&st->free_conns)) { 1225 c = TAILQ_FIRST(&st->free_conns); 1226 TAILQ_REMOVE(&st->free_conns, c, link); 1227 KASSERT(!c->mbuf, ("found orphaned mbuf")); 1228 free(c, M_SFXGE); 1229 } 1230 1231 free(st->conns_n, M_SFXGE); 1232 free(st->conns, M_SFXGE); 1233 st->conns = NULL; 1234 } 1235 1236 #else 1237 1238 static void 1239 sfxge_lro_init(struct sfxge_rxq *rxq) 1240 { 1241 } 1242 1243 static void 1244 sfxge_lro_fini(struct sfxge_rxq *rxq) 1245 { 1246 } 1247 1248 #endif /* SFXGE_LRO */ 1249 1250 static void 1251 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index) 1252 { 1253 struct sfxge_rxq *rxq; 1254 1255 rxq = sc->rxq[index]; 1256 1257 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1258 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1259 1260 /* Free the context array and the flow table. */ 1261 free(rxq->queue, M_SFXGE); 1262 sfxge_lro_fini(rxq); 1263 1264 /* Release DMA memory. */ 1265 sfxge_dma_free(&rxq->mem); 1266 1267 sc->rxq[index] = NULL; 1268 1269 free(rxq, M_SFXGE); 1270 } 1271 1272 static int 1273 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index) 1274 { 1275 struct sfxge_rxq *rxq; 1276 struct sfxge_evq *evq; 1277 efsys_mem_t *esmp; 1278 int rc; 1279 1280 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count)); 1281 1282 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK); 1283 rxq->sc = sc; 1284 rxq->index = index; 1285 rxq->entries = sc->rxq_entries; 1286 rxq->ptr_mask = rxq->entries - 1; 1287 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries); 1288 1289 sc->rxq[index] = rxq; 1290 esmp = &rxq->mem; 1291 1292 evq = sc->evq[index]; 1293 1294 /* Allocate and zero DMA space. */ 1295 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0) 1296 return (rc); 1297 1298 /* Allocate buffer table entries. */ 1299 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries), 1300 &rxq->buf_base_id); 1301 1302 /* Allocate the context array and the flow table. */ 1303 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries, 1304 M_SFXGE, M_WAITOK | M_ZERO); 1305 sfxge_lro_init(rxq); 1306 1307 callout_init(&rxq->refill_callout, 1); 1308 1309 rxq->init_state = SFXGE_RXQ_INITIALIZED; 1310 1311 return (0); 1312 } 1313 1314 static const struct { 1315 const char *name; 1316 size_t offset; 1317 } sfxge_rx_stats[] = { 1318 #define SFXGE_RX_STAT(name, member) \ 1319 { #name, offsetof(struct sfxge_rxq, member) } 1320 #ifdef SFXGE_LRO 1321 SFXGE_RX_STAT(lro_merges, lro.n_merges), 1322 SFXGE_RX_STAT(lro_bursts, lro.n_bursts), 1323 SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start), 1324 SFXGE_RX_STAT(lro_misorder, lro.n_misorder), 1325 SFXGE_RX_STAT(lro_too_many, lro.n_too_many), 1326 SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream), 1327 SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle), 1328 SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed) 1329 #endif 1330 }; 1331 1332 static int 1333 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS) 1334 { 1335 struct sfxge_softc *sc = arg1; 1336 unsigned int id = arg2; 1337 unsigned int sum, index; 1338 1339 /* Sum across all RX queues */ 1340 sum = 0; 1341 for (index = 0; index < sc->rxq_count; index++) 1342 sum += *(unsigned int *)((caddr_t)sc->rxq[index] + 1343 sfxge_rx_stats[id].offset); 1344 1345 return (SYSCTL_OUT(req, &sum, sizeof(sum))); 1346 } 1347 1348 static void 1349 sfxge_rx_stat_init(struct sfxge_softc *sc) 1350 { 1351 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev); 1352 struct sysctl_oid_list *stat_list; 1353 unsigned int id; 1354 1355 stat_list = SYSCTL_CHILDREN(sc->stats_node); 1356 1357 for (id = 0; id < nitems(sfxge_rx_stats); id++) { 1358 SYSCTL_ADD_PROC( 1359 ctx, stat_list, 1360 OID_AUTO, sfxge_rx_stats[id].name, 1361 CTLTYPE_UINT|CTLFLAG_RD, 1362 sc, id, sfxge_rx_stat_handler, "IU", 1363 ""); 1364 } 1365 } 1366 1367 void 1368 sfxge_rx_fini(struct sfxge_softc *sc) 1369 { 1370 int index; 1371 1372 index = sc->rxq_count; 1373 while (--index >= 0) 1374 sfxge_rx_qfini(sc, index); 1375 1376 sc->rxq_count = 0; 1377 } 1378 1379 int 1380 sfxge_rx_init(struct sfxge_softc *sc) 1381 { 1382 struct sfxge_intr *intr; 1383 int index; 1384 int rc; 1385 1386 #ifdef SFXGE_LRO 1387 if (!ISP2(lro_table_size)) { 1388 log(LOG_ERR, "%s=%u must be power of 2", 1389 SFXGE_LRO_PARAM(table_size), lro_table_size); 1390 rc = EINVAL; 1391 goto fail_lro_table_size; 1392 } 1393 1394 if (lro_idle_ticks == 0) 1395 lro_idle_ticks = hz / 10 + 1; /* 100 ms */ 1396 #endif 1397 1398 intr = &sc->intr; 1399 1400 sc->rxq_count = intr->n_alloc; 1401 1402 KASSERT(intr->state == SFXGE_INTR_INITIALIZED, 1403 ("intr->state != SFXGE_INTR_INITIALIZED")); 1404 1405 /* Initialize the receive queue(s) - one per interrupt. */ 1406 for (index = 0; index < sc->rxq_count; index++) { 1407 if ((rc = sfxge_rx_qinit(sc, index)) != 0) 1408 goto fail; 1409 } 1410 1411 sfxge_rx_stat_init(sc); 1412 1413 return (0); 1414 1415 fail: 1416 /* Tear down the receive queue(s). */ 1417 while (--index >= 0) 1418 sfxge_rx_qfini(sc, index); 1419 1420 sc->rxq_count = 0; 1421 1422 #ifdef SFXGE_LRO 1423 fail_lro_table_size: 1424 #endif 1425 return (rc); 1426 } 1427