1 /*- 2 * Copyright (c) 2010-2015 Solarflare Communications Inc. 3 * All rights reserved. 4 * 5 * This software was developed in part by Philip Paeps under contract for 6 * Solarflare Communications, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright notice, 12 * this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 * 29 * The views and conclusions contained in the software and documentation are 30 * those of the authors and should not be interpreted as representing official 31 * policies, either expressed or implied, of the FreeBSD Project. 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <sys/param.h> 38 #include <sys/malloc.h> 39 #include <sys/mbuf.h> 40 #include <sys/smp.h> 41 #include <sys/socket.h> 42 #include <sys/sysctl.h> 43 #include <sys/syslog.h> 44 #include <sys/limits.h> 45 #include <sys/syslog.h> 46 47 #include <net/ethernet.h> 48 #include <net/if.h> 49 #include <net/if_vlan_var.h> 50 51 #include <netinet/in.h> 52 #include <netinet/ip.h> 53 #include <netinet/ip6.h> 54 #include <netinet/tcp.h> 55 56 #include <machine/in_cksum.h> 57 58 #include "common/efx.h" 59 60 61 #include "sfxge.h" 62 #include "sfxge_rx.h" 63 64 #define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10) 65 66 #ifdef SFXGE_LRO 67 68 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL, 69 "Large receive offload (LRO) parameters"); 70 71 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param) 72 73 /* Size of the LRO hash table. Must be a power of 2. A larger table 74 * means we can accelerate a larger number of streams. 75 */ 76 static unsigned lro_table_size = 128; 77 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size); 78 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN, 79 &lro_table_size, 0, 80 "Size of the LRO hash table (must be a power of 2)"); 81 82 /* Maximum length of a hash chain. If chains get too long then the lookup 83 * time increases and may exceed the benefit of LRO. 84 */ 85 static unsigned lro_chain_max = 20; 86 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max); 87 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN, 88 &lro_chain_max, 0, 89 "The maximum length of a hash chain"); 90 91 /* Maximum time (in ticks) that a connection can be idle before it's LRO 92 * state is discarded. 93 */ 94 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */ 95 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks); 96 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN, 97 &lro_idle_ticks, 0, 98 "The maximum time (in ticks) that a connection can be idle " 99 "before it's LRO state is discarded"); 100 101 /* Number of packets with payload that must arrive in-order before a 102 * connection is eligible for LRO. The idea is we should avoid coalescing 103 * segments when the sender is in slow-start because reducing the ACK rate 104 * can damage performance. 105 */ 106 static int lro_slow_start_packets = 2000; 107 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets); 108 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN, 109 &lro_slow_start_packets, 0, 110 "Number of packets with payload that must arrive in-order before " 111 "a connection is eligible for LRO"); 112 113 /* Number of packets with payload that must arrive in-order following loss 114 * before a connection is eligible for LRO. The idea is we should avoid 115 * coalescing segments when the sender is recovering from loss, because 116 * reducing the ACK rate can damage performance. 117 */ 118 static int lro_loss_packets = 20; 119 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets); 120 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN, 121 &lro_loss_packets, 0, 122 "Number of packets with payload that must arrive in-order " 123 "following loss before a connection is eligible for LRO"); 124 125 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */ 126 #define SFXGE_LRO_L2_ID_VLAN 0x4000 127 #define SFXGE_LRO_L2_ID_IPV6 0x8000 128 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN) 129 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6)) 130 131 /* Compare IPv6 addresses, avoiding conditional branches */ 132 static unsigned long ipv6_addr_cmp(const struct in6_addr *left, 133 const struct in6_addr *right) 134 { 135 #if LONG_BIT == 64 136 const uint64_t *left64 = (const uint64_t *)left; 137 const uint64_t *right64 = (const uint64_t *)right; 138 return (left64[0] - right64[0]) | (left64[1] - right64[1]); 139 #else 140 return (left->s6_addr32[0] - right->s6_addr32[0]) | 141 (left->s6_addr32[1] - right->s6_addr32[1]) | 142 (left->s6_addr32[2] - right->s6_addr32[2]) | 143 (left->s6_addr32[3] - right->s6_addr32[3]); 144 #endif 145 } 146 147 #endif /* SFXGE_LRO */ 148 149 void 150 sfxge_rx_qflush_done(struct sfxge_rxq *rxq) 151 { 152 153 rxq->flush_state = SFXGE_FLUSH_DONE; 154 } 155 156 void 157 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq) 158 { 159 160 rxq->flush_state = SFXGE_FLUSH_FAILED; 161 } 162 163 static uint8_t toep_key[] = { 164 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 165 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 166 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 167 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 168 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 169 }; 170 171 static void 172 sfxge_rx_post_refill(void *arg) 173 { 174 struct sfxge_rxq *rxq = arg; 175 struct sfxge_softc *sc; 176 unsigned int index; 177 struct sfxge_evq *evq; 178 uint16_t magic; 179 180 sc = rxq->sc; 181 index = rxq->index; 182 evq = sc->evq[index]; 183 184 magic = SFXGE_MAGIC_RX_QREFILL | index; 185 186 /* This is guaranteed due to the start/stop order of rx and ev */ 187 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 188 ("evq not started")); 189 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 190 ("rxq not started")); 191 efx_ev_qpost(evq->common, magic); 192 } 193 194 static void 195 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying) 196 { 197 /* Initially retry after 100 ms, but back off in case of 198 * repeated failures as we probably have to wait for the 199 * administrator to raise the pool limit. */ 200 if (retrying) 201 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz); 202 else 203 rxq->refill_delay = hz / 10; 204 205 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay, 206 sfxge_rx_post_refill, rxq); 207 } 208 209 #define SFXGE_REFILL_BATCH 64 210 211 static void 212 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying) 213 { 214 struct sfxge_softc *sc; 215 unsigned int index; 216 struct sfxge_evq *evq; 217 unsigned int batch; 218 unsigned int rxfill; 219 unsigned int mblksize; 220 int ntodo; 221 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH]; 222 223 sc = rxq->sc; 224 index = rxq->index; 225 evq = sc->evq[index]; 226 227 prefetch_read_many(sc->enp); 228 prefetch_read_many(rxq->common); 229 230 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 231 232 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 233 return; 234 235 rxfill = rxq->added - rxq->completed; 236 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries), 237 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)")); 238 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target); 239 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries), 240 ("ntodo > EFX_RQX_LIMIT(rxq->entries)")); 241 242 if (ntodo == 0) 243 return; 244 245 batch = 0; 246 mblksize = sc->rx_buffer_size - sc->rx_buffer_align; 247 while (ntodo-- > 0) { 248 unsigned int id; 249 struct sfxge_rx_sw_desc *rx_desc; 250 bus_dma_segment_t seg; 251 struct mbuf *m; 252 253 id = (rxq->added + batch) & rxq->ptr_mask; 254 rx_desc = &rxq->queue[id]; 255 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL")); 256 257 rx_desc->flags = EFX_DISCARD; 258 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, 259 sc->rx_cluster_size); 260 if (m == NULL) 261 break; 262 263 /* m_len specifies length of area to be mapped for DMA */ 264 m->m_len = mblksize; 265 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE); 266 m->m_data += sc->rx_buffer_align; 267 268 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg); 269 addr[batch++] = seg.ds_addr; 270 271 if (batch == SFXGE_REFILL_BATCH) { 272 efx_rx_qpost(rxq->common, addr, mblksize, batch, 273 rxq->completed, rxq->added); 274 rxq->added += batch; 275 batch = 0; 276 } 277 } 278 279 if (ntodo != 0) 280 sfxge_rx_schedule_refill(rxq, retrying); 281 282 if (batch != 0) { 283 efx_rx_qpost(rxq->common, addr, mblksize, batch, 284 rxq->completed, rxq->added); 285 rxq->added += batch; 286 } 287 288 /* Make the descriptors visible to the hardware */ 289 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map, 290 BUS_DMASYNC_PREWRITE); 291 292 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed); 293 294 /* The queue could still be empty if no descriptors were actually 295 * pushed, in which case there will be no event to cause the next 296 * refill, so we must schedule a refill ourselves. 297 */ 298 if(rxq->pushed == rxq->completed) { 299 sfxge_rx_schedule_refill(rxq, retrying); 300 } 301 } 302 303 void 304 sfxge_rx_qrefill(struct sfxge_rxq *rxq) 305 { 306 307 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 308 return; 309 310 /* Make sure the queue is full */ 311 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE); 312 } 313 314 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m) 315 { 316 struct ifnet *ifp = sc->ifnet; 317 318 m->m_pkthdr.rcvif = ifp; 319 m->m_pkthdr.csum_data = 0xffff; 320 ifp->if_input(ifp, m); 321 } 322 323 static void 324 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc) 325 { 326 struct mbuf *m = rx_desc->mbuf; 327 int flags = rx_desc->flags; 328 int csum_flags; 329 330 /* Convert checksum flags */ 331 csum_flags = (flags & EFX_CKSUM_IPV4) ? 332 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0; 333 if (flags & EFX_CKSUM_TCPUDP) 334 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 335 336 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 337 m->m_pkthdr.flowid = 338 efx_psuedo_hdr_hash_get(sc->enp, 339 EFX_RX_HASHALG_TOEPLITZ, 340 mtod(m, uint8_t *)); 341 /* The hash covers a 4-tuple for TCP only */ 342 M_HASHTYPE_SET(m, 343 (flags & EFX_PKT_IPV4) ? 344 ((flags & EFX_PKT_TCP) ? 345 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) : 346 ((flags & EFX_PKT_TCP) ? 347 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6)); 348 } 349 m->m_data += sc->rx_prefix_size; 350 m->m_len = rx_desc->size - sc->rx_prefix_size; 351 m->m_pkthdr.len = m->m_len; 352 m->m_pkthdr.csum_flags = csum_flags; 353 __sfxge_rx_deliver(sc, rx_desc->mbuf); 354 355 rx_desc->flags = EFX_DISCARD; 356 rx_desc->mbuf = NULL; 357 } 358 359 #ifdef SFXGE_LRO 360 361 static void 362 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c) 363 { 364 struct sfxge_softc *sc = st->sc; 365 struct mbuf *m = c->mbuf; 366 struct tcphdr *c_th; 367 int csum_flags; 368 369 KASSERT(m, ("no mbuf to deliver")); 370 371 ++st->n_bursts; 372 373 /* Finish off packet munging and recalculate IP header checksum. */ 374 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 375 struct ip *iph = c->nh; 376 iph->ip_len = htons(iph->ip_len); 377 iph->ip_sum = 0; 378 iph->ip_sum = in_cksum_hdr(iph); 379 c_th = (struct tcphdr *)(iph + 1); 380 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | 381 CSUM_IP_CHECKED | CSUM_IP_VALID); 382 } else { 383 struct ip6_hdr *iph = c->nh; 384 iph->ip6_plen = htons(iph->ip6_plen); 385 c_th = (struct tcphdr *)(iph + 1); 386 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 387 } 388 389 c_th->th_win = c->th_last->th_win; 390 c_th->th_ack = c->th_last->th_ack; 391 if (c_th->th_off == c->th_last->th_off) { 392 /* Copy TCP options (take care to avoid going negative). */ 393 int optlen = ((c_th->th_off - 5) & 0xf) << 2u; 394 memcpy(c_th + 1, c->th_last + 1, optlen); 395 } 396 397 m->m_pkthdr.flowid = c->conn_hash; 398 M_HASHTYPE_SET(m, 399 SFXGE_LRO_CONN_IS_TCPIPV4(c) ? 400 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6); 401 402 m->m_pkthdr.csum_flags = csum_flags; 403 __sfxge_rx_deliver(sc, m); 404 405 c->mbuf = NULL; 406 c->delivered = 1; 407 } 408 409 /* Drop the given connection, and add it to the free list. */ 410 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 411 { 412 unsigned bucket; 413 414 KASSERT(!c->mbuf, ("found orphaned mbuf")); 415 416 if (c->next_buf.mbuf != NULL) { 417 sfxge_rx_deliver(rxq->sc, &c->next_buf); 418 LIST_REMOVE(c, active_link); 419 } 420 421 bucket = c->conn_hash & rxq->lro.conns_mask; 422 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong")); 423 --rxq->lro.conns_n[bucket]; 424 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 425 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link); 426 } 427 428 /* Stop tracking connections that have gone idle in order to keep hash 429 * chains short. 430 */ 431 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now) 432 { 433 struct sfxge_lro_conn *c; 434 unsigned i; 435 436 KASSERT(LIST_EMPTY(&rxq->lro.active_conns), 437 ("found active connections")); 438 439 rxq->lro.last_purge_ticks = now; 440 for (i = 0; i <= rxq->lro.conns_mask; ++i) { 441 if (TAILQ_EMPTY(&rxq->lro.conns[i])) 442 continue; 443 444 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq); 445 if (now - c->last_pkt_ticks > lro_idle_ticks) { 446 ++rxq->lro.n_drop_idle; 447 sfxge_lro_drop(rxq, c); 448 } 449 } 450 } 451 452 static void 453 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 454 struct mbuf *mbuf, struct tcphdr *th) 455 { 456 struct tcphdr *c_th; 457 458 /* Tack the new mbuf onto the chain. */ 459 KASSERT(!mbuf->m_next, ("mbuf already chained")); 460 c->mbuf_tail->m_next = mbuf; 461 c->mbuf_tail = mbuf; 462 463 /* Increase length appropriately */ 464 c->mbuf->m_pkthdr.len += mbuf->m_len; 465 466 /* Update the connection state flags */ 467 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 468 struct ip *iph = c->nh; 469 iph->ip_len += mbuf->m_len; 470 c_th = (struct tcphdr *)(iph + 1); 471 } else { 472 struct ip6_hdr *iph = c->nh; 473 iph->ip6_plen += mbuf->m_len; 474 c_th = (struct tcphdr *)(iph + 1); 475 } 476 c_th->th_flags |= (th->th_flags & TH_PUSH); 477 c->th_last = th; 478 ++st->n_merges; 479 480 /* Pass packet up now if another segment could overflow the IP 481 * length. 482 */ 483 if (c->mbuf->m_pkthdr.len > 65536 - 9200) 484 sfxge_lro_deliver(st, c); 485 } 486 487 static void 488 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 489 struct mbuf *mbuf, void *nh, struct tcphdr *th) 490 { 491 /* Start the chain */ 492 c->mbuf = mbuf; 493 c->mbuf_tail = c->mbuf; 494 c->nh = nh; 495 c->th_last = th; 496 497 mbuf->m_pkthdr.len = mbuf->m_len; 498 499 /* Mangle header fields for later processing */ 500 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 501 struct ip *iph = nh; 502 iph->ip_len = ntohs(iph->ip_len); 503 } else { 504 struct ip6_hdr *iph = nh; 505 iph->ip6_plen = ntohs(iph->ip6_plen); 506 } 507 } 508 509 /* Try to merge or otherwise hold or deliver (as appropriate) the 510 * packet buffered for this connection (c->next_buf). Return a flag 511 * indicating whether the connection is still active for LRO purposes. 512 */ 513 static int 514 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 515 { 516 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf; 517 char *eh = c->next_eh; 518 int data_length, hdr_length, dont_merge; 519 unsigned th_seq, pkt_length; 520 struct tcphdr *th; 521 unsigned now; 522 523 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 524 struct ip *iph = c->next_nh; 525 th = (struct tcphdr *)(iph + 1); 526 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh; 527 } else { 528 struct ip6_hdr *iph = c->next_nh; 529 th = (struct tcphdr *)(iph + 1); 530 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh; 531 } 532 533 hdr_length = (char *) th + th->th_off * 4 - eh; 534 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) - 535 hdr_length); 536 th_seq = ntohl(th->th_seq); 537 dont_merge = ((data_length <= 0) 538 | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN))); 539 540 /* Check for options other than aligned timestamp. */ 541 if (th->th_off != 5) { 542 const uint32_t *opt_ptr = (const uint32_t *) (th + 1); 543 if (th->th_off == 8 && 544 opt_ptr[0] == ntohl((TCPOPT_NOP << 24) | 545 (TCPOPT_NOP << 16) | 546 (TCPOPT_TIMESTAMP << 8) | 547 TCPOLEN_TIMESTAMP)) { 548 /* timestamp option -- okay */ 549 } else { 550 dont_merge = 1; 551 } 552 } 553 554 if (__predict_false(th_seq != c->next_seq)) { 555 /* Out-of-order, so start counting again. */ 556 if (c->mbuf != NULL) 557 sfxge_lro_deliver(&rxq->lro, c); 558 c->n_in_order_pkts -= lro_loss_packets; 559 c->next_seq = th_seq + data_length; 560 ++rxq->lro.n_misorder; 561 goto deliver_buf_out; 562 } 563 c->next_seq = th_seq + data_length; 564 565 now = ticks; 566 if (now - c->last_pkt_ticks > lro_idle_ticks) { 567 ++rxq->lro.n_drop_idle; 568 if (c->mbuf != NULL) 569 sfxge_lro_deliver(&rxq->lro, c); 570 sfxge_lro_drop(rxq, c); 571 return (0); 572 } 573 c->last_pkt_ticks = ticks; 574 575 if (c->n_in_order_pkts < lro_slow_start_packets) { 576 /* May be in slow-start, so don't merge. */ 577 ++rxq->lro.n_slow_start; 578 ++c->n_in_order_pkts; 579 goto deliver_buf_out; 580 } 581 582 if (__predict_false(dont_merge)) { 583 if (c->mbuf != NULL) 584 sfxge_lro_deliver(&rxq->lro, c); 585 if (th->th_flags & (TH_FIN | TH_RST)) { 586 ++rxq->lro.n_drop_closed; 587 sfxge_lro_drop(rxq, c); 588 return (0); 589 } 590 goto deliver_buf_out; 591 } 592 593 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size; 594 595 if (__predict_true(c->mbuf != NULL)) { 596 /* Remove headers and any padding */ 597 rx_buf->mbuf->m_data += hdr_length; 598 rx_buf->mbuf->m_len = data_length; 599 600 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th); 601 } else { 602 /* Remove any padding */ 603 rx_buf->mbuf->m_len = pkt_length; 604 605 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th); 606 } 607 608 rx_buf->mbuf = NULL; 609 return (1); 610 611 deliver_buf_out: 612 sfxge_rx_deliver(rxq->sc, rx_buf); 613 return (1); 614 } 615 616 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash, 617 uint16_t l2_id, void *nh, struct tcphdr *th) 618 { 619 unsigned bucket = conn_hash & st->conns_mask; 620 struct sfxge_lro_conn *c; 621 622 if (st->conns_n[bucket] >= lro_chain_max) { 623 ++st->n_too_many; 624 return; 625 } 626 627 if (!TAILQ_EMPTY(&st->free_conns)) { 628 c = TAILQ_FIRST(&st->free_conns); 629 TAILQ_REMOVE(&st->free_conns, c, link); 630 } else { 631 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT); 632 if (c == NULL) 633 return; 634 c->mbuf = NULL; 635 c->next_buf.mbuf = NULL; 636 } 637 638 /* Create the connection tracking data */ 639 ++st->conns_n[bucket]; 640 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link); 641 c->l2_id = l2_id; 642 c->conn_hash = conn_hash; 643 c->source = th->th_sport; 644 c->dest = th->th_dport; 645 c->n_in_order_pkts = 0; 646 c->last_pkt_ticks = *(volatile int *)&ticks; 647 c->delivered = 0; 648 ++st->n_new_stream; 649 /* NB. We don't initialise c->next_seq, and it doesn't matter what 650 * value it has. Most likely the next packet received for this 651 * connection will not match -- no harm done. 652 */ 653 } 654 655 /* Process mbuf and decide whether to dispatch it to the stack now or 656 * later. 657 */ 658 static void 659 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 660 { 661 struct sfxge_softc *sc = rxq->sc; 662 struct mbuf *m = rx_buf->mbuf; 663 struct ether_header *eh; 664 struct sfxge_lro_conn *c; 665 uint16_t l2_id; 666 uint16_t l3_proto; 667 void *nh; 668 struct tcphdr *th; 669 uint32_t conn_hash; 670 unsigned bucket; 671 672 /* Get the hardware hash */ 673 conn_hash = efx_psuedo_hdr_hash_get(sc->enp, 674 EFX_RX_HASHALG_TOEPLITZ, 675 mtod(m, uint8_t *)); 676 677 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size); 678 if (eh->ether_type == htons(ETHERTYPE_VLAN)) { 679 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh; 680 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) | 681 SFXGE_LRO_L2_ID_VLAN; 682 l3_proto = veh->evl_proto; 683 nh = veh + 1; 684 } else { 685 l2_id = 0; 686 l3_proto = eh->ether_type; 687 nh = eh + 1; 688 } 689 690 /* Check whether this is a suitable packet (unfragmented 691 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and 692 * length, and compute a hash if necessary. If not, return. 693 */ 694 if (l3_proto == htons(ETHERTYPE_IP)) { 695 struct ip *iph = nh; 696 697 KASSERT(iph->ip_p == IPPROTO_TCP, 698 ("IPv4 protocol is not TCP, but packet marker is set")); 699 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) | 700 (iph->ip_off & htons(IP_MF | IP_OFFMASK))) 701 goto deliver_now; 702 th = (struct tcphdr *)(iph + 1); 703 } else if (l3_proto == htons(ETHERTYPE_IPV6)) { 704 struct ip6_hdr *iph = nh; 705 706 KASSERT(iph->ip6_nxt == IPPROTO_TCP, 707 ("IPv6 next header is not TCP, but packet marker is set")); 708 l2_id |= SFXGE_LRO_L2_ID_IPV6; 709 th = (struct tcphdr *)(iph + 1); 710 } else { 711 goto deliver_now; 712 } 713 714 bucket = conn_hash & rxq->lro.conns_mask; 715 716 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) { 717 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash)) 718 continue; 719 if ((c->source - th->th_sport) | (c->dest - th->th_dport)) 720 continue; 721 if (c->mbuf != NULL) { 722 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 723 struct ip *c_iph, *iph = nh; 724 c_iph = c->nh; 725 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) | 726 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr)) 727 continue; 728 } else { 729 struct ip6_hdr *c_iph, *iph = nh; 730 c_iph = c->nh; 731 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) | 732 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst)) 733 continue; 734 } 735 } 736 737 /* Re-insert at head of list to reduce lookup time. */ 738 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 739 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link); 740 741 if (c->next_buf.mbuf != NULL) { 742 if (!sfxge_lro_try_merge(rxq, c)) 743 goto deliver_now; 744 } else { 745 LIST_INSERT_HEAD(&rxq->lro.active_conns, c, 746 active_link); 747 } 748 c->next_buf = *rx_buf; 749 c->next_eh = eh; 750 c->next_nh = nh; 751 752 rx_buf->mbuf = NULL; 753 rx_buf->flags = EFX_DISCARD; 754 return; 755 } 756 757 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th); 758 deliver_now: 759 sfxge_rx_deliver(sc, rx_buf); 760 } 761 762 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 763 { 764 struct sfxge_lro_state *st = &rxq->lro; 765 struct sfxge_lro_conn *c; 766 unsigned t; 767 768 while (!LIST_EMPTY(&st->active_conns)) { 769 c = LIST_FIRST(&st->active_conns); 770 if (!c->delivered && c->mbuf != NULL) 771 sfxge_lro_deliver(st, c); 772 if (sfxge_lro_try_merge(rxq, c)) { 773 if (c->mbuf != NULL) 774 sfxge_lro_deliver(st, c); 775 LIST_REMOVE(c, active_link); 776 } 777 c->delivered = 0; 778 } 779 780 t = *(volatile int *)&ticks; 781 if (__predict_false(t != st->last_purge_ticks)) 782 sfxge_lro_purge_idle(rxq, t); 783 } 784 785 #else /* !SFXGE_LRO */ 786 787 static void 788 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 789 { 790 } 791 792 static void 793 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 794 { 795 } 796 797 #endif /* SFXGE_LRO */ 798 799 void 800 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop) 801 { 802 struct sfxge_softc *sc = rxq->sc; 803 int if_capenable = sc->ifnet->if_capenable; 804 int lro_enabled = if_capenable & IFCAP_LRO; 805 unsigned int index; 806 struct sfxge_evq *evq; 807 unsigned int completed; 808 unsigned int level; 809 struct mbuf *m; 810 struct sfxge_rx_sw_desc *prev = NULL; 811 812 index = rxq->index; 813 evq = sc->evq[index]; 814 815 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 816 817 completed = rxq->completed; 818 while (completed != rxq->pending) { 819 unsigned int id; 820 struct sfxge_rx_sw_desc *rx_desc; 821 822 id = completed++ & rxq->ptr_mask; 823 rx_desc = &rxq->queue[id]; 824 m = rx_desc->mbuf; 825 826 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 827 goto discard; 828 829 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD)) 830 goto discard; 831 832 /* Read the length from the psuedo header if required */ 833 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) { 834 uint16_t tmp_size; 835 int rc; 836 rc = efx_psuedo_hdr_pkt_length_get(sc->enp, 837 mtod(m, uint8_t *), 838 &tmp_size); 839 KASSERT(rc == 0, ("cannot get packet length: %d", rc)); 840 rx_desc->size = (int)tmp_size + sc->rx_prefix_size; 841 } 842 843 prefetch_read_many(mtod(m, caddr_t)); 844 845 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 846 case EFX_PKT_IPV4: 847 if (~if_capenable & IFCAP_RXCSUM) 848 rx_desc->flags &= 849 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP); 850 break; 851 case EFX_PKT_IPV6: 852 if (~if_capenable & IFCAP_RXCSUM_IPV6) 853 rx_desc->flags &= ~EFX_CKSUM_TCPUDP; 854 break; 855 case 0: 856 /* Check for loopback packets */ 857 { 858 struct ether_header *etherhp; 859 860 /*LINTED*/ 861 etherhp = mtod(m, struct ether_header *); 862 863 if (etherhp->ether_type == 864 htons(SFXGE_ETHERTYPE_LOOPBACK)) { 865 EFSYS_PROBE(loopback); 866 867 rxq->loopback++; 868 goto discard; 869 } 870 } 871 break; 872 default: 873 KASSERT(B_FALSE, 874 ("Rx descriptor with both IPv4 and IPv6 flags")); 875 goto discard; 876 } 877 878 /* Pass packet up the stack or into LRO (pipelined) */ 879 if (prev != NULL) { 880 if (lro_enabled && 881 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 882 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 883 sfxge_lro(rxq, prev); 884 else 885 sfxge_rx_deliver(sc, prev); 886 } 887 prev = rx_desc; 888 continue; 889 890 discard: 891 /* Return the packet to the pool */ 892 m_free(m); 893 rx_desc->mbuf = NULL; 894 } 895 rxq->completed = completed; 896 897 level = rxq->added - rxq->completed; 898 899 /* Pass last packet up the stack or into LRO */ 900 if (prev != NULL) { 901 if (lro_enabled && 902 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 903 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 904 sfxge_lro(rxq, prev); 905 else 906 sfxge_rx_deliver(sc, prev); 907 } 908 909 /* 910 * If there are any pending flows and this is the end of the 911 * poll then they must be completed. 912 */ 913 if (eop) 914 sfxge_lro_end_of_burst(rxq); 915 916 /* Top up the queue if necessary */ 917 if (level < rxq->refill_threshold) 918 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE); 919 } 920 921 static void 922 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index) 923 { 924 struct sfxge_rxq *rxq; 925 struct sfxge_evq *evq; 926 unsigned int count; 927 unsigned int retry = 3; 928 929 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 930 931 rxq = sc->rxq[index]; 932 evq = sc->evq[index]; 933 934 SFXGE_EVQ_LOCK(evq); 935 936 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 937 ("rxq not started")); 938 939 rxq->init_state = SFXGE_RXQ_INITIALIZED; 940 941 callout_stop(&rxq->refill_callout); 942 943 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) { 944 rxq->flush_state = SFXGE_FLUSH_PENDING; 945 946 SFXGE_EVQ_UNLOCK(evq); 947 948 /* Flush the receive queue */ 949 if (efx_rx_qflush(rxq->common) != 0) { 950 SFXGE_EVQ_LOCK(evq); 951 rxq->flush_state = SFXGE_FLUSH_FAILED; 952 break; 953 } 954 955 count = 0; 956 do { 957 /* Spin for 100 ms */ 958 DELAY(100000); 959 960 if (rxq->flush_state != SFXGE_FLUSH_PENDING) 961 break; 962 963 } while (++count < 20); 964 965 SFXGE_EVQ_LOCK(evq); 966 967 if (rxq->flush_state == SFXGE_FLUSH_PENDING) { 968 /* Flush timeout - neither done nor failed */ 969 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n", 970 device_get_nameunit(sc->dev), index); 971 rxq->flush_state = SFXGE_FLUSH_DONE; 972 } 973 retry--; 974 } 975 if (rxq->flush_state == SFXGE_FLUSH_FAILED) { 976 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n", 977 device_get_nameunit(sc->dev), index); 978 rxq->flush_state = SFXGE_FLUSH_DONE; 979 } 980 981 rxq->pending = rxq->added; 982 sfxge_rx_qcomplete(rxq, B_TRUE); 983 984 KASSERT(rxq->completed == rxq->pending, 985 ("rxq->completed != rxq->pending")); 986 987 rxq->added = 0; 988 rxq->pushed = 0; 989 rxq->pending = 0; 990 rxq->completed = 0; 991 rxq->loopback = 0; 992 993 /* Destroy the common code receive queue. */ 994 efx_rx_qdestroy(rxq->common); 995 996 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 997 EFX_RXQ_NBUFS(sc->rxq_entries)); 998 999 SFXGE_EVQ_UNLOCK(evq); 1000 } 1001 1002 static int 1003 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index) 1004 { 1005 struct sfxge_rxq *rxq; 1006 efsys_mem_t *esmp; 1007 struct sfxge_evq *evq; 1008 int rc; 1009 1010 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 1011 1012 rxq = sc->rxq[index]; 1013 esmp = &rxq->mem; 1014 evq = sc->evq[index]; 1015 1016 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1017 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1018 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 1019 ("evq->init_state != SFXGE_EVQ_STARTED")); 1020 1021 /* Program the buffer table. */ 1022 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp, 1023 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0) 1024 return (rc); 1025 1026 /* Create the common code receive queue. */ 1027 if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT, 1028 esmp, sc->rxq_entries, rxq->buf_base_id, evq->common, 1029 &rxq->common)) != 0) 1030 goto fail; 1031 1032 SFXGE_EVQ_LOCK(evq); 1033 1034 /* Enable the receive queue. */ 1035 efx_rx_qenable(rxq->common); 1036 1037 rxq->init_state = SFXGE_RXQ_STARTED; 1038 rxq->flush_state = SFXGE_FLUSH_REQUIRED; 1039 1040 /* Try to fill the queue from the pool. */ 1041 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE); 1042 1043 SFXGE_EVQ_UNLOCK(evq); 1044 1045 return (0); 1046 1047 fail: 1048 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1049 EFX_RXQ_NBUFS(sc->rxq_entries)); 1050 return (rc); 1051 } 1052 1053 void 1054 sfxge_rx_stop(struct sfxge_softc *sc) 1055 { 1056 int index; 1057 1058 efx_mac_filter_default_rxq_clear(sc->enp); 1059 1060 /* Stop the receive queue(s) */ 1061 index = sc->rxq_count; 1062 while (--index >= 0) 1063 sfxge_rx_qstop(sc, index); 1064 1065 sc->rx_prefix_size = 0; 1066 sc->rx_buffer_size = 0; 1067 1068 efx_rx_fini(sc->enp); 1069 } 1070 1071 int 1072 sfxge_rx_start(struct sfxge_softc *sc) 1073 { 1074 struct sfxge_intr *intr; 1075 const efx_nic_cfg_t *encp; 1076 size_t hdrlen, align, reserved; 1077 int index; 1078 int rc; 1079 1080 intr = &sc->intr; 1081 1082 /* Initialize the common code receive module. */ 1083 if ((rc = efx_rx_init(sc->enp)) != 0) 1084 return (rc); 1085 1086 encp = efx_nic_cfg_get(sc->enp); 1087 sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu); 1088 1089 /* Calculate the receive packet buffer size. */ 1090 sc->rx_prefix_size = encp->enc_rx_prefix_size; 1091 1092 /* Ensure IP headers are 32bit aligned */ 1093 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header); 1094 sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen; 1095 1096 sc->rx_buffer_size += sc->rx_buffer_align; 1097 1098 /* Align end of packet buffer for RX DMA end padding */ 1099 align = MAX(1, encp->enc_rx_buf_align_end); 1100 EFSYS_ASSERT(ISP2(align)); 1101 sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align); 1102 1103 /* 1104 * Standard mbuf zones only guarantee pointer-size alignment; 1105 * we need extra space to align to the cache line 1106 */ 1107 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE; 1108 1109 /* Select zone for packet buffers */ 1110 if (reserved <= MCLBYTES) 1111 sc->rx_cluster_size = MCLBYTES; 1112 else if (reserved <= MJUMPAGESIZE) 1113 sc->rx_cluster_size = MJUMPAGESIZE; 1114 else if (reserved <= MJUM9BYTES) 1115 sc->rx_cluster_size = MJUM9BYTES; 1116 else 1117 sc->rx_cluster_size = MJUM16BYTES; 1118 1119 /* 1120 * Set up the scale table. Enable all hash types and hash insertion. 1121 */ 1122 for (index = 0; index < SFXGE_RX_SCALE_MAX; index++) 1123 sc->rx_indir_table[index] = index % sc->rxq_count; 1124 if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table, 1125 SFXGE_RX_SCALE_MAX)) != 0) 1126 goto fail; 1127 (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ, 1128 (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) | 1129 (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE); 1130 1131 if ((rc = efx_rx_scale_key_set(sc->enp, toep_key, 1132 sizeof(toep_key))) != 0) 1133 goto fail; 1134 1135 /* Start the receive queue(s). */ 1136 for (index = 0; index < sc->rxq_count; index++) { 1137 if ((rc = sfxge_rx_qstart(sc, index)) != 0) 1138 goto fail2; 1139 } 1140 1141 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common, 1142 sc->intr.n_alloc > 1); 1143 if (rc != 0) 1144 goto fail3; 1145 1146 return (0); 1147 1148 fail3: 1149 fail2: 1150 while (--index >= 0) 1151 sfxge_rx_qstop(sc, index); 1152 1153 fail: 1154 efx_rx_fini(sc->enp); 1155 1156 return (rc); 1157 } 1158 1159 #ifdef SFXGE_LRO 1160 1161 static void sfxge_lro_init(struct sfxge_rxq *rxq) 1162 { 1163 struct sfxge_lro_state *st = &rxq->lro; 1164 unsigned i; 1165 1166 st->conns_mask = lro_table_size - 1; 1167 KASSERT(!((st->conns_mask + 1) & st->conns_mask), 1168 ("lro_table_size must be a power of 2")); 1169 st->sc = rxq->sc; 1170 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]), 1171 M_SFXGE, M_WAITOK); 1172 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]), 1173 M_SFXGE, M_WAITOK); 1174 for (i = 0; i <= st->conns_mask; ++i) { 1175 TAILQ_INIT(&st->conns[i]); 1176 st->conns_n[i] = 0; 1177 } 1178 LIST_INIT(&st->active_conns); 1179 TAILQ_INIT(&st->free_conns); 1180 } 1181 1182 static void sfxge_lro_fini(struct sfxge_rxq *rxq) 1183 { 1184 struct sfxge_lro_state *st = &rxq->lro; 1185 struct sfxge_lro_conn *c; 1186 unsigned i; 1187 1188 /* Return cleanly if sfxge_lro_init() has not been called. */ 1189 if (st->conns == NULL) 1190 return; 1191 1192 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections")); 1193 1194 for (i = 0; i <= st->conns_mask; ++i) { 1195 while (!TAILQ_EMPTY(&st->conns[i])) { 1196 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq); 1197 sfxge_lro_drop(rxq, c); 1198 } 1199 } 1200 1201 while (!TAILQ_EMPTY(&st->free_conns)) { 1202 c = TAILQ_FIRST(&st->free_conns); 1203 TAILQ_REMOVE(&st->free_conns, c, link); 1204 KASSERT(!c->mbuf, ("found orphaned mbuf")); 1205 free(c, M_SFXGE); 1206 } 1207 1208 free(st->conns_n, M_SFXGE); 1209 free(st->conns, M_SFXGE); 1210 st->conns = NULL; 1211 } 1212 1213 #else 1214 1215 static void 1216 sfxge_lro_init(struct sfxge_rxq *rxq) 1217 { 1218 } 1219 1220 static void 1221 sfxge_lro_fini(struct sfxge_rxq *rxq) 1222 { 1223 } 1224 1225 #endif /* SFXGE_LRO */ 1226 1227 static void 1228 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index) 1229 { 1230 struct sfxge_rxq *rxq; 1231 1232 rxq = sc->rxq[index]; 1233 1234 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1235 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1236 1237 /* Free the context array and the flow table. */ 1238 free(rxq->queue, M_SFXGE); 1239 sfxge_lro_fini(rxq); 1240 1241 /* Release DMA memory. */ 1242 sfxge_dma_free(&rxq->mem); 1243 1244 sc->rxq[index] = NULL; 1245 1246 free(rxq, M_SFXGE); 1247 } 1248 1249 static int 1250 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index) 1251 { 1252 struct sfxge_rxq *rxq; 1253 struct sfxge_evq *evq; 1254 efsys_mem_t *esmp; 1255 int rc; 1256 1257 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count)); 1258 1259 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK); 1260 rxq->sc = sc; 1261 rxq->index = index; 1262 rxq->entries = sc->rxq_entries; 1263 rxq->ptr_mask = rxq->entries - 1; 1264 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries); 1265 1266 sc->rxq[index] = rxq; 1267 esmp = &rxq->mem; 1268 1269 evq = sc->evq[index]; 1270 1271 /* Allocate and zero DMA space. */ 1272 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0) 1273 return (rc); 1274 1275 /* Allocate buffer table entries. */ 1276 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries), 1277 &rxq->buf_base_id); 1278 1279 /* Allocate the context array and the flow table. */ 1280 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries, 1281 M_SFXGE, M_WAITOK | M_ZERO); 1282 sfxge_lro_init(rxq); 1283 1284 callout_init(&rxq->refill_callout, 1); 1285 1286 rxq->init_state = SFXGE_RXQ_INITIALIZED; 1287 1288 return (0); 1289 } 1290 1291 static const struct { 1292 const char *name; 1293 size_t offset; 1294 } sfxge_rx_stats[] = { 1295 #define SFXGE_RX_STAT(name, member) \ 1296 { #name, offsetof(struct sfxge_rxq, member) } 1297 #ifdef SFXGE_LRO 1298 SFXGE_RX_STAT(lro_merges, lro.n_merges), 1299 SFXGE_RX_STAT(lro_bursts, lro.n_bursts), 1300 SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start), 1301 SFXGE_RX_STAT(lro_misorder, lro.n_misorder), 1302 SFXGE_RX_STAT(lro_too_many, lro.n_too_many), 1303 SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream), 1304 SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle), 1305 SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed) 1306 #endif 1307 }; 1308 1309 static int 1310 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS) 1311 { 1312 struct sfxge_softc *sc = arg1; 1313 unsigned int id = arg2; 1314 unsigned int sum, index; 1315 1316 /* Sum across all RX queues */ 1317 sum = 0; 1318 for (index = 0; index < sc->rxq_count; index++) 1319 sum += *(unsigned int *)((caddr_t)sc->rxq[index] + 1320 sfxge_rx_stats[id].offset); 1321 1322 return (SYSCTL_OUT(req, &sum, sizeof(sum))); 1323 } 1324 1325 static void 1326 sfxge_rx_stat_init(struct sfxge_softc *sc) 1327 { 1328 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev); 1329 struct sysctl_oid_list *stat_list; 1330 unsigned int id; 1331 1332 stat_list = SYSCTL_CHILDREN(sc->stats_node); 1333 1334 for (id = 0; id < nitems(sfxge_rx_stats); id++) { 1335 SYSCTL_ADD_PROC( 1336 ctx, stat_list, 1337 OID_AUTO, sfxge_rx_stats[id].name, 1338 CTLTYPE_UINT|CTLFLAG_RD, 1339 sc, id, sfxge_rx_stat_handler, "IU", 1340 ""); 1341 } 1342 } 1343 1344 void 1345 sfxge_rx_fini(struct sfxge_softc *sc) 1346 { 1347 int index; 1348 1349 index = sc->rxq_count; 1350 while (--index >= 0) 1351 sfxge_rx_qfini(sc, index); 1352 1353 sc->rxq_count = 0; 1354 } 1355 1356 int 1357 sfxge_rx_init(struct sfxge_softc *sc) 1358 { 1359 struct sfxge_intr *intr; 1360 int index; 1361 int rc; 1362 1363 #ifdef SFXGE_LRO 1364 if (!ISP2(lro_table_size)) { 1365 log(LOG_ERR, "%s=%u must be power of 2", 1366 SFXGE_LRO_PARAM(table_size), lro_table_size); 1367 rc = EINVAL; 1368 goto fail_lro_table_size; 1369 } 1370 1371 if (lro_idle_ticks == 0) 1372 lro_idle_ticks = hz / 10 + 1; /* 100 ms */ 1373 #endif 1374 1375 intr = &sc->intr; 1376 1377 sc->rxq_count = intr->n_alloc; 1378 1379 KASSERT(intr->state == SFXGE_INTR_INITIALIZED, 1380 ("intr->state != SFXGE_INTR_INITIALIZED")); 1381 1382 /* Initialize the receive queue(s) - one per interrupt. */ 1383 for (index = 0; index < sc->rxq_count; index++) { 1384 if ((rc = sfxge_rx_qinit(sc, index)) != 0) 1385 goto fail; 1386 } 1387 1388 sfxge_rx_stat_init(sc); 1389 1390 return (0); 1391 1392 fail: 1393 /* Tear down the receive queue(s). */ 1394 while (--index >= 0) 1395 sfxge_rx_qfini(sc, index); 1396 1397 sc->rxq_count = 0; 1398 1399 #ifdef SFXGE_LRO 1400 fail_lro_table_size: 1401 #endif 1402 return (rc); 1403 } 1404