/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include <net/rss_config.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");
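
/*
 * The LRO parameters above are read-only sysctls and boot-time tunables
 * (CTLFLAG_RDTUN) under hw.sfxge.lro.  For example, assuming the standard
 * loader tunable mechanism, the hash table size can be set in loader.conf(5):
 *
 *	hw.sfxge.lro.table_size=256
 *
 * and the current values can be inspected with "sysctl hw.sfxge.lro".
 */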

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN	0x4000
#define	SFXGE_LRO_L2_ID_IPV6	0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

static uint8_t toep_key[RSS_KEYSIZE];
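
/*
 * Callout handler for a deferred refill.  The refill itself has to run with
 * the event queue lock held, so instead of refilling here we post a software
 * "refill" event to the RX queue's event queue; the event handler then
 * performs the refill in the correct context (see sfxge_rx_qrefill()).
 */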

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH	64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq __diagused;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
		    CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}
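
/*
 * Delivery path for packets that are not coalesced by LRO: sfxge_rx_deliver()
 * converts the hardware RX prefix into mbuf metadata (checksum flags and the
 * Toeplitz flow hash), strips the prefix, and __sfxge_rx_deliver() then hands
 * the packet to the stack via if_input().
 */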

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	if_t ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
		    efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}
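
/*
 * LRO connection state management: each RX queue hashes its tracked
 * connections into lro.conns (a table of TAILQs indexed by the hardware
 * Toeplitz hash).  Connections holding a buffered packet are also linked on
 * lro.active_conns so that sfxge_lro_end_of_burst() can flush them, and
 * retired connections are kept on lro.free_conns for reuse.
 */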

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	tcp_set_flags(c_th, tcp_get_flags(c_th) | (tcp_get_flags(th) & TH_PUSH));
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
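
/*
 * Note that sfxge_lro_start() converts ip_len/ip6_plen to host byte order so
 * that sfxge_lro_merge() can simply add each merged segment's payload length
 * to it.  sfxge_lro_deliver() converts the field back to network byte order
 * and recomputes the IPv4 header checksum before the coalesced packet is
 * passed up the stack.
 */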

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (tcp_get_flags(th) & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (tcp_get_flags(th) & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}
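
/*
 * The connection lookup below deliberately avoids conditional branches:
 * fields are compared by subtraction and the results OR'd together, so a
 * single test decides whether the L2 id, hardware hash, ports and (for
 * connections that are already coalescing) IP addresses all match.
 */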

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */
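
/*
 * Process the descriptors completed between rxq->completed and rxq->pending.
 * Delivery is pipelined one descriptor behind the scan so that the next
 * packet's payload can be prefetched; TCP packets with a valid checksum are
 * handed to LRO when it is enabled, everything else goes straight to the
 * stack.  At the end of a poll any held LRO state is flushed and the queue
 * is topped up if it has dropped below the refill threshold.
 */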

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = if_getcapenable(sc->ifnet);
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq __diagused;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc __diagused;

			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
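
/*
 * Stop an RX queue: mark it initialized (not started), cancel the refill
 * callout and flush the hardware queue.  Each flush attempt is polled for up
 * to two seconds (20 x 100 ms) and retried a few times before the queue is
 * forcibly treated as flushed; outstanding descriptors are then completed
 * and the common-code queue and its buffer table entries are torn down.
 */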

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
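
/*
 * Start an RX queue: program its buffer table entries, create and enable the
 * common-code receive queue and give it an initial fill of mbufs.
 */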

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
	    evq->common, &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
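
/*
 * Bring up the receive side: initialize the common-code RX module, size the
 * packet buffers for the current MTU (allowing for the RX prefix, IP header
 * alignment and DMA end padding), program the RSS indirection table and
 * Toeplitz key, start each queue and install queue 0 as the default filter
 * destination.
 */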

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(if_getmtu(sc->ifnet));

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				       sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
	    EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

	rss_getkey(toep_key);
	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
				       toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
	    &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}
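
/*
 * The per-queue LRO counters below are summed across all RX queues by
 * sfxge_rx_stat_handler() and exported read-only under the adapter's stats
 * sysctl node (created elsewhere in the driver).  Assuming the usual device
 * sysctl layout, they appear as, for example:
 *
 *	sysctl dev.sfxge.0.stats.lro_merges
 */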

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
		    sfxge_rx_stats[id].name,
		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
		    sc, id, sfxge_rx_stat_handler, "IU", "");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;		/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}