/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
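
/*
 * Callout handler used to request a refill: post a software event
 * (SFXGE_SW_EV_RX_QREFILL) to the RX queue's event queue so that the
 * refill itself runs in the event processing context.
 */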
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH 64
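
/*
 * Refill the RX ring with freshly allocated mbuf clusters, up to
 * 'target' outstanding descriptors. Buffers are mapped for DMA and
 * posted to the hardware in batches of SFXGE_REFILL_BATCH; if mbuf
 * allocation fails, a delayed refill is scheduled instead.
 */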
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
		    efx_psuedo_hdr_hash_get(sc->enp,
			EFX_RX_HASHALG_TOEPLITZ,
			mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
		    ((flags & EFX_PKT_TCP) ?
		    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
		    ((flags & EFX_PKT_TCP) ?
		    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
	    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
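
/*
 * Append the buffered segment's payload to the connection's current
 * mbuf chain and fold its TCP flags into the coalesced header. The
 * merged packet is delivered early if adding one more segment could
 * overflow the IP length field.
 */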
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	    hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
	    EFX_RX_HASHALG_TOEPLITZ,
	    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
		    SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
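
/*
 * Process the RX completions between 'completed' and 'pending':
 * convert hardware checksum and hash information into mbuf metadata,
 * drop errored or loopback packets, and hand each packet either to
 * LRO or straight to the stack. The ring is topped up again if it
 * has drained below the refill threshold.
 */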
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
			    mtod(m, uint8_t *),
			    &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			    (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		    (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
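
/*
 * Stop a started RX queue: cancel the refill callout, flush the queue
 * (retrying a bounded number of times if the flush does not complete),
 * reclaim all outstanding buffers and destroy the common-code queue.
 */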
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
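
/*
 * Start an RX queue: program the NIC's buffer table for the queue's
 * DMA memory, create and enable the common-code queue, then fill the
 * ring with receive buffers.
 */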
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
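
/*
 * Start the receive path: initialise the common RX module, work out
 * the packet buffer size and alignment, program the RSS indirection
 * table and Toeplitz key, start every RX queue and install the
 * default MAC filter RX queue.
 */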
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
#ifdef RSS
		sc->rx_indir_table[index] =
		    rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
	    SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
	    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	    M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	    M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
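
/*
 * Allocate and initialise the software state for one RX queue:
 * DMA memory for the descriptor ring, buffer table entries, the
 * software descriptor array, LRO state and the refill callout.
 */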
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
	    &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
		    sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
		    ctx, stat_list,
		    OID_AUTO, sfxge_rx_stats[id].name,
		    CTLTYPE_UINT|CTLFLAG_RD,
		    sc, id, sfxge_rx_stat_handler, "IU",
		    "");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}