/*-
 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD (EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
#define RX_REFILL_THRESHOLD_2 (RX_REFILL_THRESHOLD / 2)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

/* RSS Toeplitz hash key, programmed via efx_rx_scale_toeplitz_ipv4_key_set() */
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}

static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return m;
}

#define SFXGE_REFILL_BATCH 64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.header = m->m_data;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
		    mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
#endif
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	    hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return 0;
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return 0;
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return 1;

deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return 1;
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
	    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)
		return rc;

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	return rc;
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
	    sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	    M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	    M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
	    &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->intr.n_alloc; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
		    sfxge_rx_stats[id].offset);

	return SYSCTL_OUT(req, &sum, sizeof(sum));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;

	intr = &sc->intr;

	index = intr->n_alloc;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}