/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
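/*
 * NB: lro_table_size must be a power of two because the bucket index is
 * computed as (hash & (lro_table_size - 1)); with the default of 128 the
 * mask is 0x7f.  sfxge_lro_init() asserts this at queue setup time.
 */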
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}
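/*
 * NB: the magic event posted above is delivered back through the event
 * queue and is expected to end up calling sfxge_rx_qrefill() in event
 * context; the handler lives in the event code rather than in this file,
 * so the exact dispatch path is assumed here rather than shown.
 */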
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit.
	 */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH  64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}
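/*
 * NB: both delivery paths below hand packets to the stack with
 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR set where the hardware validated the
 * TCP/UDP checksum; csum_data is set to 0xffff above so the stack treats
 * the pseudo-header checksum as already verified.  This reflects the usual
 * FreeBSD convention for that flag combination.
 */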
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
						       mtod(m, uint8_t *));
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
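/*
 * NB: while a burst is being accumulated the IP total length field of the
 * coalesced packet is kept in host byte order: sfxge_lro_start() converts
 * it with ntohs(), sfxge_lro_merge() below adds each segment's payload to
 * it directly, and sfxge_lro_deliver() converts it back with htons() and
 * recalculates the header checksum before handing the packet up.
 */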
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
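/*
 * NB: sfxge_lro_try_merge() below only coalesces segments that carry either
 * no TCP options or exactly the well-aligned timestamp option (NOP, NOP,
 * TIMESTAMP as the first option bytes, giving a data offset of 8 words).
 * Anything else sets dont_merge, so such segments are delivered
 * individually; this keeps option handling in sfxge_lro_deliver() trivial.
 */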
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}
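	/*
	 * NB: the suitability and connection-match checks below use the
	 * same branch-avoidance idiom as ipv6_addr_cmp(): fields are
	 * compared by OR-ing their differences, e.g.
	 * (c->l2_id - l2_id) | (c->conn_hash - conn_hash), which is
	 * non-zero iff at least one pair differs, so several comparisons
	 * collapse into a single conditional branch.
	 */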
	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}
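/*
 * NB: sfxge_rx_qcomplete() below delivers packets one descriptor behind
 * the completion scan (via the 'prev' pointer) so that the
 * prefetch_read_many() on the next packet's data has time to take effect
 * before that packet's headers are actually examined by LRO or the stack.
 */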
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	mtx_unlock(&evq->lock);
}
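/*
 * NB: the KASSERTs in sfxge_rx_post_refill() and sfxge_rx_qstart() rely on
 * the driver's start/stop ordering: event queues are assumed to be started
 * before their receive queues and stopped only after them, so a started
 * rxq always has a started evq to post its refill event to.  That ordering
 * is presumably enforced by the callers outside this file.
 */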
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}
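/*
 * NB: free_conns acts as a per-queue pool of connection structures:
 * sfxge_lro_new_conn() takes from it before falling back to malloc() with
 * M_NOWAIT in the receive path, sfxge_lro_drop() returns entries to it,
 * and only sfxge_lro_fini() below actually frees them.
 */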
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
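/*
 * NB: sfxge_rx_qinit() below assumes sc->rxq_entries is a power of two, so
 * that ptr_mask = entries - 1 can be used to wrap ring indices (as in
 * sfxge_rx_qfill() and sfxge_rx_qcomplete()).  One receive queue is created
 * per allocated interrupt, paired with the event queue of the same index.
 */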
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(sc->rxq_entries));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}