/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD (EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
#define RX_REFILL_THRESHOLD_2 (RX_REFILL_THRESHOLD / 2)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

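/*
 * Deferred refill: if mbuf or cluster allocation fails, the ring is
 * topped up later instead of immediately.  sfxge_rx_schedule_refill()
 * arms a callout with exponential backoff (100 ms doubling up to 10 s,
 * presumably to ride out an exhausted mbuf pool until the administrator
 * raises the limit), and the callout handler above posts a "magic"
 * software event so that the refill itself runs in event-queue context
 * under the evq lock.
 */
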
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit.
	 */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}

static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return m;
}

#define SFXGE_REFILL_BATCH 64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

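/*
 * Every receive buffer begins with a hardware prefix of sc->rx_prefix_size
 * bytes.  EFX_RX_HASH_VALUE() reads the Toeplitz RSS hash that the NIC
 * inserts into that prefix (hash insertion is enabled in sfxge_rx_start()),
 * and the prefix is then stripped by advancing m_data before the mbuf is
 * passed up.  The same hash is reused below as the LRO connection hash.
 */
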
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
		    mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

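/*
 * sfxge_lro_merge() appends the new segment's mbuf to the connection's
 * chain and grows the coalesced IP length, which is kept in host byte
 * order until delivery.  The 65536 - 9200 flush threshold leaves room
 * for one more segment of up to ~9200 bytes (presumably a jumbo-frame
 * MTU) without overflowing the 16-bit IP length field.
 */
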
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

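/*
 * sfxge_lro_try_merge() decides what to do with the packet buffered in
 * c->next_buf: merge it into the current chain, start a new chain, or
 * deliver it unmerged.  Merging is refused for out-of-order segments,
 * for segments carrying URG/SYN/RST/FIN or TCP options other than an
 * aligned timestamp, and until lro_slow_start_packets segments have
 * arrived in order, so that the ACK rate is not reduced while the
 * sender may still be in slow-start.
 */
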
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	    hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return 0;
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return 0;
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return 1;

deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return 1;
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

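/*
 * Connection lookup keys off the hardware Toeplitz hash: the bucket is
 * conn_hash & conns_mask, and candidates are then matched on l2_id
 * (VLAN tag plus address-family bit), the TCP ports and, when a chain
 * is in progress, the IP addresses.  The comparisons are written as
 * subtract-and-OR expressions rather than chained == tests to keep the
 * match branch-free, in the same style as ipv6_addr_cmp() above.
 */
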
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
	    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

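/*
 * Completion processing.  Delivery is pipelined: the loop prefetches the
 * data of the packet just completed while the previously inspected
 * descriptor is handed to LRO or the stack, overlapping the cache misses
 * on one packet with work on another.  Frames that are neither IPv4 nor
 * IPv6 are checked for the SFXGE_ETHERTYPE_LOOPBACK Ethertype (presumably
 * the driver's loopback self-test traffic) and counted and discarded
 * rather than delivered.
 */
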
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
}

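/*
 * Queue start/stop.  sfxge_rx_qstop() above waits for the flush in 100 ms
 * steps for up to 2 seconds and retries from scratch if the hardware
 * reports the flush as failed; sfxge_rx_qstart() programs the buffer
 * table entries backing the descriptor ring, creates and enables the
 * common-code receive queue, and then fills it from the mbuf pool.
 */
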
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)
		return rc;

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	return rc;
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
	    sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	    M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	    M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

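/*
 * Teardown of the LRO state has to walk both the hash buckets and the
 * free_conns list: dropped connections are recycled onto free_conns
 * rather than freed immediately, so entries may be sitting on either
 * list when the queue is finalised.
 */
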
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

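/*
 * Per-queue initialisation: allocate the rxq structure, the DMA memory
 * backing the descriptor ring, the buffer table entries that map it for
 * the NIC, the software descriptor array and the LRO state, and
 * initialise the refill callout.  One receive queue is created per
 * interrupt, paired with the event queue of the same index.
 */
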
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
	    &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
		    sfxge_rx_stats[id].offset);

	return SYSCTL_OUT(req, &sum, sizeof(sum));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}