/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"


#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
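 * If left at zero, sfxge_rx_init() picks a default of hz / 10 + 1 ticks
 * (roughly 100 ms).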
 */
static unsigned lro_idle_ticks;	/* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN	0x4000
#define	SFXGE_LRO_L2_ID_IPV6	0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c)	((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c)	(!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH	64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
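	 * The scheduled refill fires a callout that posts a software event
	 * (see sfxge_rx_post_refill()), so the retry is handled from the
	 * event queue like any other refill request.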
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_psuedo_hdr_hash_get(sc->enp,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list.
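 * If a packet is still buffered for the connection it is passed to the
 * stack first and the connection is removed from the active list.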
 */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
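 * The buffered packet is merged only if it is the next segment in
 * sequence, carries payload, has no TCP flags or options other than an
 * aligned timestamp, and the connection is past the slow-start
 * threshold; otherwise it is delivered on its own, and FIN/RST or an
 * idle timeout causes the connection state to be dropped.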
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge.
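		 * Deliver segments individually until lro_slow_start_packets
		 * in-order packets have been seen, so that the sender's
		 * congestion window can grow at the full ACK rate.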
		 */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
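	 * IPv4 packets are rejected here if they carry IP options (header
	 * length other than 20 bytes) or are fragments (MF flag or a
	 * non-zero fragment offset).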
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
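	 * sfxge_lro_end_of_burst() merges or delivers anything still held
	 * on this queue's active connection list.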
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool.
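	 * If mbuf clusters run out the fill stops early and
	 * sfxge_rx_qfill() schedules a retry through the refill callout.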
	 */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s).
	 */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries.
	 */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;		/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}