/*-
 * Copyright (c) 2010-2015 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is that we should avoid
 * coalescing segments when the sender is in slow-start because reducing
 * the ACK rate can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is that we should
 * avoid coalescing segments when the sender is recovering from loss,
 * because reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

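/*
 * Note (illustrative): since the sysctl node above hangs off _hw_sfxge and
 * the entries are marked CTLFLAG_RDTUN, these parameters are read-only
 * sysctls that can also be set as loader tunables before the driver
 * attaches.  Assuming SFXGE_PARAM() expands to the usual "hw.sfxge." prefix,
 * something like the following in /boot/loader.conf (example values only)
 * would enlarge the hash table and extend the idle timeout:
 *
 *	hw.sfxge.lro.table_size=256
 *	hw.sfxge.lro.idle_ticks=200
 */
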
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

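/*
 * RSS (Toeplitz) hash key.  The byte sequence below matches the well-known
 * 40-byte example key from Microsoft's RSS documentation, which makes hash
 * values produced by the hardware comparable with the published test
 * vectors.
 */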
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

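/*
 * Receive buffers are posted to the hardware in batches: DMA addresses are
 * collected into a small on-stack array in sfxge_rx_qfill() and handed to
 * efx_rx_qpost() once SFXGE_REFILL_BATCH entries have accumulated (or at
 * the end of the fill loop), which helps amortise the cost of updating the
 * descriptor ring.
 */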
#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

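/*
 * Hand a completed receive descriptor to the network stack.  At this point
 * the hardware RX prefix (sc->rx_prefix_size bytes) still precedes the
 * Ethernet frame; the RSS flow hash is read from it via the pseudo-header
 * accessor, after which m_data/m_len are adjusted to hide the prefix before
 * the mbuf is passed to if_input().
 */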
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_psuedo_hdr_hash_get(sc->enp,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

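/*
 * While segments are being coalesced, the IPv4 total length / IPv6 payload
 * length of the head packet is kept in host byte order (see
 * sfxge_lro_start() and sfxge_lro_merge()) so it can be accumulated cheaply.
 * Delivery below converts it back to network order and, for IPv4,
 * recomputes the header checksum before the merged packet goes up the
 * stack.
 */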
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

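/*
 * Append the payload of a new in-order segment to the mbuf chain of an
 * existing LRO packet.  The accumulated length is tracked both in
 * m_pkthdr.len and in the (host-order) IP length field; the merged packet
 * is flushed early once another segment might overflow the 16-bit IP
 * length (the "65536 - 9200" check below, the 9200 presumably being a
 * jumbo-frame-sized allowance).
 */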
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

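/*
 * Note on the connection lookup in sfxge_lro() below: candidate connections
 * are compared using subtraction and bitwise OR rather than chains of "=="
 * tests (the same idea as ipv6_addr_cmp() above), so a match is indicated
 * by an all-zero result without additional conditional branches.
 */
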
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

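/*
 * Completion processing is pipelined: the descriptor handled on the
 * previous loop iteration ("prev") is passed up the stack or into LRO only
 * after the current packet's data has been prefetched, which helps hide
 * cache miss latency on the packet headers.
 */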
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

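/*
 * Queue stop: request a hardware flush and poll for completion.  Each
 * attempt waits up to 2 seconds (20 polls of 100 ms) and the flush is
 * retried up to three times; a timeout is logged and then treated as done
 * so that teardown can proceed.
 */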
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

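/*
 * Receive buffer sizing, done below: the buffer must hold the hardware RX
 * prefix plus a maximum-sized frame, padded so that the IP header following
 * the 14-byte Ethernet header lands on a 4-byte boundary, and rounded up to
 * the controller's end-of-buffer alignment.  An extra CACHE_LINE_SIZE bytes
 * are reserved because the standard mbuf cluster zones only guarantee
 * pointer-size alignment.
 */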
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (reserved <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

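/*
 * Per-queue LRO statistics.  sfxge_rx_stat_handler() sums each counter
 * across all RX queues, so the values exported through the adapter's
 * "stats" sysctl node are adapter-wide totals (for example, something like
 * dev.sfxge.0.stats.lro_merges; the exact path depends on how stats_node is
 * attached elsewhere in the driver).
 */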
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}