1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010-2016 Solarflare Communications Inc.
5 * All rights reserved.
6 *
7 * This software was developed in part by Philip Paeps under contract for
8 * Solarflare Communications, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright notice,
14 * this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * The views and conclusions contained in the software and documentation are
32 * those of the authors and should not be interpreted as representing official
33 * policies, either expressed or implied, of the FreeBSD Project.
34 */
35
36 #include <sys/cdefs.h>
37 #include "opt_rss.h"
38
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
48
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57
58 #include <machine/in_cksum.h>
59
60 #include <net/rss_config.h>
61
62 #include "common/efx.h"
63
64 #include "sfxge.h"
65 #include "sfxge_rx.h"
66
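/*
 * A queue is topped up once its fill level drops below this threshold,
 * i.e. once fewer than 90% of EFX_RXQ_LIMIT(_entries) descriptors are
 * outstanding (see sfxge_rx_qcomplete()).
 */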
67 #define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10)
68
69 #ifdef SFXGE_LRO
70
71 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
72 "Large receive offload (LRO) parameters");
73
74 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
75
76 /* Size of the LRO hash table. Must be a power of 2. A larger table
77 * means we can accelerate a larger number of streams.
78 */
79 static unsigned lro_table_size = 128;
80 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
82 &lro_table_size, 0,
83 "Size of the LRO hash table (must be a power of 2)");
84
85 /* Maximum length of a hash chain. If chains get too long then the lookup
86 * time increases and may exceed the benefit of LRO.
87 */
88 static unsigned lro_chain_max = 20;
89 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
91 &lro_chain_max, 0,
92 "The maximum length of a hash chain");
93
94 /* Maximum time (in ticks) that a connection can be idle before its LRO
95 * state is discarded.
96 */
97 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
98 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
99 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
100 &lro_idle_ticks, 0,
101 "The maximum time (in ticks) that a connection can be idle "
102 "before it's LRO state is discarded");
103
104 /* Number of packets with payload that must arrive in-order before a
105 * connection is eligible for LRO. The idea is we should avoid coalescing
106 * segments when the sender is in slow-start because reducing the ACK rate
107 * can damage performance.
108 */
109 static int lro_slow_start_packets = 2000;
110 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
111 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
112 &lro_slow_start_packets, 0,
113 "Number of packets with payload that must arrive in-order before "
114 "a connection is eligible for LRO");
115
116 /* Number of packets with payload that must arrive in-order following loss
117 * before a connection is eligible for LRO. The idea is we should avoid
118 * coalescing segments when the sender is recovering from loss, because
119 * reducing the ACK rate can damage performance.
120 */
121 static int lro_loss_packets = 20;
122 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
123 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
124 &lro_loss_packets, 0,
125 "Number of packets with payload that must arrive in-order "
126 "following loss before a connection is eligible for LRO");
127
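/*
 * All of the above are boot-time tunables (CTLFLAG_RDTUN).  Given the
 * SFXGE_PARAM() naming they should expand to loader tunables with names
 * like
 *
 *	hw.sfxge.lro.table_size="256"
 *	hw.sfxge.lro.slow_start_packets="1000"
 *
 * in /boot/loader.conf, and the current values can be inspected at run
 * time under the hw.sfxge.lro sysctl node.
 */
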
128 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
129 #define SFXGE_LRO_L2_ID_VLAN 0x4000
130 #define SFXGE_LRO_L2_ID_IPV6 0x8000
131 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
132 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
133
134 /* Compare IPv6 addresses, avoiding conditional branches */
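/*
 * The return value is meaningful only as a zero/non-zero test: it is
 * zero if and only if the two addresses are equal, and carries no
 * ordering information.
 */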
135 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
136 const struct in6_addr *right)
137 {
138 #if LONG_BIT == 64
139 const uint64_t *left64 = (const uint64_t *)left;
140 const uint64_t *right64 = (const uint64_t *)right;
141 return (left64[0] - right64[0]) | (left64[1] - right64[1]);
142 #else
143 return (left->s6_addr32[0] - right->s6_addr32[0]) |
144 (left->s6_addr32[1] - right->s6_addr32[1]) |
145 (left->s6_addr32[2] - right->s6_addr32[2]) |
146 (left->s6_addr32[3] - right->s6_addr32[3]);
147 #endif
148 }
149
150 #endif /* SFXGE_LRO */
151
152 void
153 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
154 {
155
156 rxq->flush_state = SFXGE_FLUSH_DONE;
157 }
158
159 void
160 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
161 {
162
163 rxq->flush_state = SFXGE_FLUSH_FAILED;
164 }
165
166 static uint8_t toep_key[RSS_KEYSIZE];
167
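/*
 * Deferred refill: the refill callout does not touch the RX queue
 * directly.  Instead it posts a "magic" software event to the owning
 * event queue so that the refill is driven from the event-queue
 * handler, which runs with the EVQ lock that sfxge_rx_qfill() asserts.
 */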
168 static void
169 sfxge_rx_post_refill(void *arg)
170 {
171 struct sfxge_rxq *rxq = arg;
172 struct sfxge_softc *sc;
173 unsigned int index;
174 struct sfxge_evq *evq;
175 uint16_t magic;
176
177 sc = rxq->sc;
178 index = rxq->index;
179 evq = sc->evq[index];
180 magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
181
182 /* This is guaranteed due to the start/stop order of rx and ev */
183 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
184 ("evq not started"));
185 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
186 ("rxq not started"));
187 efx_ev_qpost(evq->common, magic);
188 }
189
190 static void
191 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
192 {
193 /* Initially retry after 100 ms, but back off in case of
194 * repeated failures as we probably have to wait for the
195 * administrator to raise the pool limit. */
196 if (retrying)
197 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
198 else
199 rxq->refill_delay = hz / 10;
200
201 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
202 sfxge_rx_post_refill, rxq);
203 }
204
205 #define SFXGE_REFILL_BATCH 64
206
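/*
 * During a refill, DMA addresses are accumulated and handed to
 * efx_rx_qpost() in batches of SFXGE_REFILL_BATCH, so a full refill
 * needs only a handful of qpost calls followed by a single
 * efx_rx_qpush() at the end.
 */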
207 static void
208 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
209 {
210 struct sfxge_softc *sc;
211 unsigned int index;
212 struct sfxge_evq *evq __diagused;
213 unsigned int batch;
214 unsigned int rxfill;
215 unsigned int mblksize;
216 int ntodo;
217 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
218
219 sc = rxq->sc;
220 index = rxq->index;
221 evq = sc->evq[index];
222
223 prefetch_read_many(sc->enp);
224 prefetch_read_many(rxq->common);
225
226 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
227
228 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
229 return;
230
231 rxfill = rxq->added - rxq->completed;
232 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
233 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
234 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
235 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
236 ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
237
238 if (ntodo == 0)
239 return;
240
241 batch = 0;
242 mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
243 while (ntodo-- > 0) {
244 unsigned int id;
245 struct sfxge_rx_sw_desc *rx_desc;
246 bus_dma_segment_t seg;
247 struct mbuf *m;
248
249 id = (rxq->added + batch) & rxq->ptr_mask;
250 rx_desc = &rxq->queue[id];
251 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
252
253 rx_desc->flags = EFX_DISCARD;
254 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
255 sc->rx_cluster_size);
256 if (m == NULL)
257 break;
258
259 /* m_len specifies length of area to be mapped for DMA */
260 m->m_len = mblksize;
261 m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
262 CACHE_LINE_SIZE);
263 m->m_data += sc->rx_buffer_align;
264
265 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
266 addr[batch++] = seg.ds_addr;
267
268 if (batch == SFXGE_REFILL_BATCH) {
269 efx_rx_qpost(rxq->common, addr, mblksize, batch,
270 rxq->completed, rxq->added);
271 rxq->added += batch;
272 batch = 0;
273 }
274 }
275
276 if (ntodo != 0)
277 sfxge_rx_schedule_refill(rxq, retrying);
278
279 if (batch != 0) {
280 efx_rx_qpost(rxq->common, addr, mblksize, batch,
281 rxq->completed, rxq->added);
282 rxq->added += batch;
283 }
284
285 /* Make the descriptors visible to the hardware */
286 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
287 BUS_DMASYNC_PREWRITE);
288
289 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
290
291 /* The queue could still be empty if no descriptors were actually
292 * pushed, in which case there will be no event to cause the next
293 * refill, so we must schedule a refill ourselves.
294 */
295 if (rxq->pushed == rxq->completed) {
296 sfxge_rx_schedule_refill(rxq, retrying);
297 }
298 }
299
300 void
301 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
302 {
303
304 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
305 return;
306
307 /* Make sure the queue is full */
308 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
309 }
310
311 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
312 {
313 if_t ifp = sc->ifnet;
314
315 m->m_pkthdr.rcvif = ifp;
316 m->m_pkthdr.csum_data = 0xffff;
317 if_input(ifp, m);
318 }
319
320 static void
321 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
322 {
323 struct sfxge_softc *sc = rxq->sc;
324 struct mbuf *m = rx_desc->mbuf;
325 int flags = rx_desc->flags;
326 int csum_flags;
327
328 /* Convert checksum flags */
329 csum_flags = (flags & EFX_CKSUM_IPV4) ?
330 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331 if (flags & EFX_CKSUM_TCPUDP)
332 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333
334 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335 m->m_pkthdr.flowid =
336 efx_pseudo_hdr_hash_get(rxq->common,
337 EFX_RX_HASHALG_TOEPLITZ,
338 mtod(m, uint8_t *));
339 /* The hash covers a 4-tuple for TCP only */
340 M_HASHTYPE_SET(m,
341 (flags & EFX_PKT_IPV4) ?
342 ((flags & EFX_PKT_TCP) ?
343 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
344 ((flags & EFX_PKT_TCP) ?
345 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
346 }
347 m->m_data += sc->rx_prefix_size;
348 m->m_len = rx_desc->size - sc->rx_prefix_size;
349 m->m_pkthdr.len = m->m_len;
350 m->m_pkthdr.csum_flags = csum_flags;
351 __sfxge_rx_deliver(sc, rx_desc->mbuf);
352
353 rx_desc->flags = EFX_DISCARD;
354 rx_desc->mbuf = NULL;
355 }
356
357 #ifdef SFXGE_LRO
358
359 static void
360 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
361 {
362 struct sfxge_softc *sc = st->sc;
363 struct mbuf *m = c->mbuf;
364 struct tcphdr *c_th;
365 int csum_flags;
366
367 KASSERT(m, ("no mbuf to deliver"));
368
369 ++st->n_bursts;
370
371 /* Finish off packet munging and recalculate IP header checksum. */
372 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
373 struct ip *iph = c->nh;
374 iph->ip_len = htons(iph->ip_len);
375 iph->ip_sum = 0;
376 iph->ip_sum = in_cksum_hdr(iph);
377 c_th = (struct tcphdr *)(iph + 1);
378 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
379 CSUM_IP_CHECKED | CSUM_IP_VALID);
380 } else {
381 struct ip6_hdr *iph = c->nh;
382 iph->ip6_plen = htons(iph->ip6_plen);
383 c_th = (struct tcphdr *)(iph + 1);
384 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
385 }
386
387 c_th->th_win = c->th_last->th_win;
388 c_th->th_ack = c->th_last->th_ack;
389 if (c_th->th_off == c->th_last->th_off) {
390 /* Copy TCP options (take care to avoid going negative). */
391 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
392 memcpy(c_th + 1, c->th_last + 1, optlen);
393 }
394
395 m->m_pkthdr.flowid = c->conn_hash;
396 M_HASHTYPE_SET(m,
397 SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
398 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
399
400 m->m_pkthdr.csum_flags = csum_flags;
401 __sfxge_rx_deliver(sc, m);
402
403 c->mbuf = NULL;
404 c->delivered = 1;
405 }
406
407 /* Drop the given connection, and add it to the free list. */
408 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
409 {
410 unsigned bucket;
411
412 KASSERT(!c->mbuf, ("found orphaned mbuf"));
413
414 if (c->next_buf.mbuf != NULL) {
415 sfxge_rx_deliver(rxq, &c->next_buf);
416 LIST_REMOVE(c, active_link);
417 }
418
419 bucket = c->conn_hash & rxq->lro.conns_mask;
420 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
421 --rxq->lro.conns_n[bucket];
422 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
423 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
424 }
425
426 /* Stop tracking connections that have gone idle in order to keep hash
427 * chains short.
428 */
429 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
430 {
431 struct sfxge_lro_conn *c;
432 unsigned i;
433
434 KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
435 ("found active connections"));
436
437 rxq->lro.last_purge_ticks = now;
438 for (i = 0; i <= rxq->lro.conns_mask; ++i) {
439 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
440 continue;
441
442 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
443 if (now - c->last_pkt_ticks > lro_idle_ticks) {
444 ++rxq->lro.n_drop_idle;
445 sfxge_lro_drop(rxq, c);
446 }
447 }
448 }
449
450 static void
451 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
452 struct mbuf *mbuf, struct tcphdr *th)
453 {
454 struct tcphdr *c_th;
455
456 /* Tack the new mbuf onto the chain. */
457 KASSERT(!mbuf->m_next, ("mbuf already chained"));
458 c->mbuf_tail->m_next = mbuf;
459 c->mbuf_tail = mbuf;
460
461 /* Increase length appropriately */
462 c->mbuf->m_pkthdr.len += mbuf->m_len;
463
464 /* Update the connection state flags */
465 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
466 struct ip *iph = c->nh;
467 iph->ip_len += mbuf->m_len;
468 c_th = (struct tcphdr *)(iph + 1);
469 } else {
470 struct ip6_hdr *iph = c->nh;
471 iph->ip6_plen += mbuf->m_len;
472 c_th = (struct tcphdr *)(iph + 1);
473 }
474 tcp_set_flags(c_th, tcp_get_flags(c_th) | (tcp_get_flags(th) & TH_PUSH));
475 c->th_last = th;
476 ++st->n_merges;
477
478 /* Pass packet up now if another segment could overflow the IP
479 * length.
480 */
481 if (c->mbuf->m_pkthdr.len > 65536 - 9200)
482 sfxge_lro_deliver(st, c);
483 }
484
485 static void
486 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
487 struct mbuf *mbuf, void *nh, struct tcphdr *th)
488 {
489 /* Start the chain */
490 c->mbuf = mbuf;
491 c->mbuf_tail = c->mbuf;
492 c->nh = nh;
493 c->th_last = th;
494
495 mbuf->m_pkthdr.len = mbuf->m_len;
496
497 /* Mangle header fields for later processing */
498 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
499 struct ip *iph = nh;
500 iph->ip_len = ntohs(iph->ip_len);
501 } else {
502 struct ip6_hdr *iph = nh;
503 iph->ip6_plen = ntohs(iph->ip6_plen);
504 }
505 }
506
507 /* Try to merge or otherwise hold or deliver (as appropriate) the
508 * packet buffered for this connection (c->next_buf). Return a flag
509 * indicating whether the connection is still active for LRO purposes.
510 */
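/*
 * As elsewhere in this file, the checks below favour bitwise '|' over
 * logical '||' so that the common path is evaluated without extra
 * conditional branches.
 */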
511 static int
512 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
513 {
514 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
515 char *eh = c->next_eh;
516 int data_length, hdr_length, dont_merge;
517 unsigned th_seq, pkt_length;
518 struct tcphdr *th;
519 unsigned now;
520
521 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
522 struct ip *iph = c->next_nh;
523 th = (struct tcphdr *)(iph + 1);
524 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
525 } else {
526 struct ip6_hdr *iph = c->next_nh;
527 th = (struct tcphdr *)(iph + 1);
528 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
529 }
530
531 hdr_length = (char *) th + th->th_off * 4 - eh;
532 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
533 hdr_length);
534 th_seq = ntohl(th->th_seq);
535 dont_merge = ((data_length <= 0)
536 | (tcp_get_flags(th) & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
537
538 /* Check for options other than aligned timestamp. */
539 if (th->th_off != 5) {
540 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
541 if (th->th_off == 8 &&
542 opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
543 (TCPOPT_NOP << 16) |
544 (TCPOPT_TIMESTAMP << 8) |
545 TCPOLEN_TIMESTAMP)) {
546 /* timestamp option -- okay */
547 } else {
548 dont_merge = 1;
549 }
550 }
551
552 if (__predict_false(th_seq != c->next_seq)) {
553 /* Out-of-order, so start counting again. */
554 if (c->mbuf != NULL)
555 sfxge_lro_deliver(&rxq->lro, c);
556 c->n_in_order_pkts -= lro_loss_packets;
557 c->next_seq = th_seq + data_length;
558 ++rxq->lro.n_misorder;
559 goto deliver_buf_out;
560 }
561 c->next_seq = th_seq + data_length;
562
563 now = ticks;
564 if (now - c->last_pkt_ticks > lro_idle_ticks) {
565 ++rxq->lro.n_drop_idle;
566 if (c->mbuf != NULL)
567 sfxge_lro_deliver(&rxq->lro, c);
568 sfxge_lro_drop(rxq, c);
569 return (0);
570 }
571 c->last_pkt_ticks = ticks;
572
573 if (c->n_in_order_pkts < lro_slow_start_packets) {
574 /* May be in slow-start, so don't merge. */
575 ++rxq->lro.n_slow_start;
576 ++c->n_in_order_pkts;
577 goto deliver_buf_out;
578 }
579
580 if (__predict_false(dont_merge)) {
581 if (c->mbuf != NULL)
582 sfxge_lro_deliver(&rxq->lro, c);
583 if (tcp_get_flags(th) & (TH_FIN | TH_RST)) {
584 ++rxq->lro.n_drop_closed;
585 sfxge_lro_drop(rxq, c);
586 return (0);
587 }
588 goto deliver_buf_out;
589 }
590
591 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
592
593 if (__predict_true(c->mbuf != NULL)) {
594 /* Remove headers and any padding */
595 rx_buf->mbuf->m_data += hdr_length;
596 rx_buf->mbuf->m_len = data_length;
597
598 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
599 } else {
600 /* Remove any padding */
601 rx_buf->mbuf->m_len = pkt_length;
602
603 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
604 }
605
606 rx_buf->mbuf = NULL;
607 return (1);
608
609 deliver_buf_out:
610 sfxge_rx_deliver(rxq, rx_buf);
611 return (1);
612 }
613
614 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
615 uint16_t l2_id, void *nh, struct tcphdr *th)
616 {
617 unsigned bucket = conn_hash & st->conns_mask;
618 struct sfxge_lro_conn *c;
619
620 if (st->conns_n[bucket] >= lro_chain_max) {
621 ++st->n_too_many;
622 return;
623 }
624
625 if (!TAILQ_EMPTY(&st->free_conns)) {
626 c = TAILQ_FIRST(&st->free_conns);
627 TAILQ_REMOVE(&st->free_conns, c, link);
628 } else {
629 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
630 if (c == NULL)
631 return;
632 c->mbuf = NULL;
633 c->next_buf.mbuf = NULL;
634 }
635
636 /* Create the connection tracking data */
637 ++st->conns_n[bucket];
638 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
639 c->l2_id = l2_id;
640 c->conn_hash = conn_hash;
641 c->source = th->th_sport;
642 c->dest = th->th_dport;
643 c->n_in_order_pkts = 0;
644 c->last_pkt_ticks = *(volatile int *)&ticks;
645 c->delivered = 0;
646 ++st->n_new_stream;
647 /* NB. We don't initialise c->next_seq, and it doesn't matter what
648 * value it has. Most likely the next packet received for this
649 * connection will not match -- no harm done.
650 */
651 }
652
653 /* Process mbuf and decide whether to dispatch it to the stack now or
654 * later.
655 */
656 static void
657 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
658 {
659 struct sfxge_softc *sc = rxq->sc;
660 struct mbuf *m = rx_buf->mbuf;
661 struct ether_header *eh;
662 struct sfxge_lro_conn *c;
663 uint16_t l2_id;
664 uint16_t l3_proto;
665 void *nh;
666 struct tcphdr *th;
667 uint32_t conn_hash;
668 unsigned bucket;
669
670 /* Get the hardware hash */
671 conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
672 EFX_RX_HASHALG_TOEPLITZ,
673 mtod(m, uint8_t *));
674
675 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
676 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
677 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
678 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
679 SFXGE_LRO_L2_ID_VLAN;
680 l3_proto = veh->evl_proto;
681 nh = veh + 1;
682 } else {
683 l2_id = 0;
684 l3_proto = eh->ether_type;
685 nh = eh + 1;
686 }
687
688 /* Check whether this is a suitable packet (unfragmented
689 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
690 * length, and compute a hash if necessary. If not, return.
691 */
692 if (l3_proto == htons(ETHERTYPE_IP)) {
693 struct ip *iph = nh;
694
695 KASSERT(iph->ip_p == IPPROTO_TCP,
696 ("IPv4 protocol is not TCP, but packet marker is set"));
697 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
698 (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
699 goto deliver_now;
700 th = (struct tcphdr *)(iph + 1);
701 } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
702 struct ip6_hdr *iph = nh;
703
704 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
705 ("IPv6 next header is not TCP, but packet marker is set"));
706 l2_id |= SFXGE_LRO_L2_ID_IPV6;
707 th = (struct tcphdr *)(iph + 1);
708 } else {
709 goto deliver_now;
710 }
711
712 bucket = conn_hash & rxq->lro.conns_mask;
713
714 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
715 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
716 continue;
717 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
718 continue;
719 if (c->mbuf != NULL) {
720 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
721 struct ip *c_iph, *iph = nh;
722 c_iph = c->nh;
723 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
724 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
725 continue;
726 } else {
727 struct ip6_hdr *c_iph, *iph = nh;
728 c_iph = c->nh;
729 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
730 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
731 continue;
732 }
733 }
734
735 /* Re-insert at head of list to reduce lookup time. */
736 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
737 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
738
739 if (c->next_buf.mbuf != NULL) {
740 if (!sfxge_lro_try_merge(rxq, c))
741 goto deliver_now;
742 } else {
743 LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
744 active_link);
745 }
746 c->next_buf = *rx_buf;
747 c->next_eh = eh;
748 c->next_nh = nh;
749
750 rx_buf->mbuf = NULL;
751 rx_buf->flags = EFX_DISCARD;
752 return;
753 }
754
755 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
756 deliver_now:
757 sfxge_rx_deliver(rxq, rx_buf);
758 }
759
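/*
 * Called at the end of an event-queue poll: deliver or merge anything
 * held on the active-connection list, then (at most once per tick)
 * purge connections that have been idle for longer than lro_idle_ticks.
 */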
760 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
761 {
762 struct sfxge_lro_state *st = &rxq->lro;
763 struct sfxge_lro_conn *c;
764 unsigned t;
765
766 while (!LIST_EMPTY(&st->active_conns)) {
767 c = LIST_FIRST(&st->active_conns);
768 if (!c->delivered && c->mbuf != NULL)
769 sfxge_lro_deliver(st, c);
770 if (sfxge_lro_try_merge(rxq, c)) {
771 if (c->mbuf != NULL)
772 sfxge_lro_deliver(st, c);
773 LIST_REMOVE(c, active_link);
774 }
775 c->delivered = 0;
776 }
777
778 t = *(volatile int *)&ticks;
779 if (__predict_false(t != st->last_purge_ticks))
780 sfxge_lro_purge_idle(rxq, t);
781 }
782
783 #else /* !SFXGE_LRO */
784
785 static void
786 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
787 {
788 }
789
790 static void
791 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
792 {
793 }
794
795 #endif /* SFXGE_LRO */
796
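/*
 * Process the descriptors completed since the last call (the range
 * [completed, pending)).  Each packet is either discarded, handed to
 * LRO (in-order TCP with a validated checksum) or delivered straight
 * to the stack; delivery runs one packet behind the loop, presumably
 * so that the prefetch issued for the current packet can complete
 * before its headers are inspected.  Finally the queue is topped up if
 * it has dropped below the refill threshold.
 */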
797 void
798 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
799 {
800 struct sfxge_softc *sc = rxq->sc;
801 int if_capenable = if_getcapenable(sc->ifnet);
802 int lro_enabled = if_capenable & IFCAP_LRO;
803 unsigned int index;
804 struct sfxge_evq *evq __diagused;
805 unsigned int completed;
806 unsigned int level;
807 struct mbuf *m;
808 struct sfxge_rx_sw_desc *prev = NULL;
809
810 index = rxq->index;
811 evq = sc->evq[index];
812
813 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
814
815 completed = rxq->completed;
816 while (completed != rxq->pending) {
817 unsigned int id;
818 struct sfxge_rx_sw_desc *rx_desc;
819
820 id = completed++ & rxq->ptr_mask;
821 rx_desc = &rxq->queue[id];
822 m = rx_desc->mbuf;
823
824 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
825 goto discard;
826
827 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
828 goto discard;
829
830 /* Read the length from the pseudo header if required */
831 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
832 uint16_t tmp_size;
833 int rc __diagused;
834
835 rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
836 mtod(m, uint8_t *),
837 &tmp_size);
838 KASSERT(rc == 0, ("cannot get packet length: %d", rc));
839 rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
840 }
841
842 prefetch_read_many(mtod(m, caddr_t));
843
844 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
845 case EFX_PKT_IPV4:
846 if (~if_capenable & IFCAP_RXCSUM)
847 rx_desc->flags &=
848 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
849 break;
850 case EFX_PKT_IPV6:
851 if (~if_capenable & IFCAP_RXCSUM_IPV6)
852 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
853 break;
854 case 0:
855 /* Check for loopback packets */
856 {
857 struct ether_header *etherhp;
858
859 /*LINTED*/
860 etherhp = mtod(m, struct ether_header *);
861
862 if (etherhp->ether_type ==
863 htons(SFXGE_ETHERTYPE_LOOPBACK)) {
864 EFSYS_PROBE(loopback);
865
866 rxq->loopback++;
867 goto discard;
868 }
869 }
870 break;
871 default:
872 KASSERT(B_FALSE,
873 ("Rx descriptor with both IPv4 and IPv6 flags"));
874 goto discard;
875 }
876
877 /* Pass packet up the stack or into LRO (pipelined) */
878 if (prev != NULL) {
879 if (lro_enabled &&
880 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
881 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
882 sfxge_lro(rxq, prev);
883 else
884 sfxge_rx_deliver(rxq, prev);
885 }
886 prev = rx_desc;
887 continue;
888
889 discard:
890 /* Return the packet to the pool */
891 m_free(m);
892 rx_desc->mbuf = NULL;
893 }
894 rxq->completed = completed;
895
896 level = rxq->added - rxq->completed;
897
898 /* Pass last packet up the stack or into LRO */
899 if (prev != NULL) {
900 if (lro_enabled &&
901 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
902 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
903 sfxge_lro(rxq, prev);
904 else
905 sfxge_rx_deliver(rxq, prev);
906 }
907
908 /*
909 * If there are any pending flows and this is the end of the
910 * poll then they must be completed.
911 */
912 if (eop)
913 sfxge_lro_end_of_burst(rxq);
914
915 /* Top up the queue if necessary */
916 if (level < rxq->refill_threshold)
917 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
918 }
919
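/*
 * Stop a queue: request a hardware flush and poll for completion for
 * up to ~2 seconds (20 x 100ms), making up to three flush attempts
 * before giving up, then complete any outstanding descriptors, destroy
 * the common code queue and release its buffer-table entries.
 */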
920 static void
921 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
922 {
923 struct sfxge_rxq *rxq;
924 struct sfxge_evq *evq;
925 unsigned int count;
926 unsigned int retry = 3;
927
928 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
929
930 rxq = sc->rxq[index];
931 evq = sc->evq[index];
932
933 SFXGE_EVQ_LOCK(evq);
934
935 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
936 ("rxq not started"));
937
938 rxq->init_state = SFXGE_RXQ_INITIALIZED;
939
940 callout_stop(&rxq->refill_callout);
941
942 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
943 rxq->flush_state = SFXGE_FLUSH_PENDING;
944
945 SFXGE_EVQ_UNLOCK(evq);
946
947 /* Flush the receive queue */
948 if (efx_rx_qflush(rxq->common) != 0) {
949 SFXGE_EVQ_LOCK(evq);
950 rxq->flush_state = SFXGE_FLUSH_FAILED;
951 break;
952 }
953
954 count = 0;
955 do {
956 /* Spin for 100 ms */
957 DELAY(100000);
958
959 if (rxq->flush_state != SFXGE_FLUSH_PENDING)
960 break;
961
962 } while (++count < 20);
963
964 SFXGE_EVQ_LOCK(evq);
965
966 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
967 /* Flush timeout - neither done nor failed */
968 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
969 device_get_nameunit(sc->dev), index);
970 rxq->flush_state = SFXGE_FLUSH_DONE;
971 }
972 retry--;
973 }
974 if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
975 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
976 device_get_nameunit(sc->dev), index);
977 rxq->flush_state = SFXGE_FLUSH_DONE;
978 }
979
980 rxq->pending = rxq->added;
981 sfxge_rx_qcomplete(rxq, B_TRUE);
982
983 KASSERT(rxq->completed == rxq->pending,
984 ("rxq->completed != rxq->pending"));
985
986 rxq->added = 0;
987 rxq->pushed = 0;
988 rxq->pending = 0;
989 rxq->completed = 0;
990 rxq->loopback = 0;
991
992 /* Destroy the common code receive queue. */
993 efx_rx_qdestroy(rxq->common);
994
995 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
996 EFX_RXQ_NBUFS(sc->rxq_entries));
997
998 SFXGE_EVQ_UNLOCK(evq);
999 }
1000
1001 static int
1002 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1003 {
1004 struct sfxge_rxq *rxq;
1005 efsys_mem_t *esmp;
1006 struct sfxge_evq *evq;
1007 int rc;
1008
1009 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1010
1011 rxq = sc->rxq[index];
1012 esmp = &rxq->mem;
1013 evq = sc->evq[index];
1014
1015 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1016 ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1017 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1018 ("evq->init_state != SFXGE_EVQ_STARTED"));
1019
1020 /* Program the buffer table. */
1021 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1022 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1023 return (rc);
1024
1025 /* Create the common code receive queue. */
1026 if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1027 esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
1028 evq->common, &rxq->common)) != 0)
1029 goto fail;
1030
1031 SFXGE_EVQ_LOCK(evq);
1032
1033 /* Enable the receive queue. */
1034 efx_rx_qenable(rxq->common);
1035
1036 rxq->init_state = SFXGE_RXQ_STARTED;
1037 rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1038
1039 /* Try to fill the queue from the pool. */
1040 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1041
1042 SFXGE_EVQ_UNLOCK(evq);
1043
1044 return (0);
1045
1046 fail:
1047 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1048 EFX_RXQ_NBUFS(sc->rxq_entries));
1049 return (rc);
1050 }
1051
1052 void
1053 sfxge_rx_stop(struct sfxge_softc *sc)
1054 {
1055 int index;
1056
1057 efx_mac_filter_default_rxq_clear(sc->enp);
1058
1059 /* Stop the receive queue(s) */
1060 index = sc->rxq_count;
1061 while (--index >= 0)
1062 sfxge_rx_qstop(sc, index);
1063
1064 sc->rx_prefix_size = 0;
1065 sc->rx_buffer_size = 0;
1066
1067 efx_rx_fini(sc->enp);
1068 }
1069
1070 int
1071 sfxge_rx_start(struct sfxge_softc *sc)
1072 {
1073 const efx_nic_cfg_t *encp;
1074 size_t hdrlen, align, reserved;
1075 int index;
1076 int rc;
1077
1078 /* Initialize the common code receive module. */
1079 if ((rc = efx_rx_init(sc->enp)) != 0)
1080 return (rc);
1081
1082 encp = efx_nic_cfg_get(sc->enp);
1083 sc->rx_buffer_size = EFX_MAC_PDU(if_getmtu(sc->ifnet));
1084
1085 /* Calculate the receive packet buffer size. */
1086 sc->rx_prefix_size = encp->enc_rx_prefix_size;
1087
1088 /* Ensure IP headers are 32bit aligned */
1089 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1090 sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1091
1092 sc->rx_buffer_size += sc->rx_buffer_align;
1093
1094 /* Align end of packet buffer for RX DMA end padding */
1095 align = MAX(1, encp->enc_rx_buf_align_end);
1096 EFSYS_ASSERT(ISP2(align));
1097 sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1098
1099 /*
1100 * Standard mbuf zones only guarantee pointer-size alignment;
1101 * we need extra space to align to the cache line
1102 */
1103 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1104
1105 /* Select zone for packet buffers */
1106 if (reserved <= MCLBYTES)
1107 sc->rx_cluster_size = MCLBYTES;
1108 else if (reserved <= MJUMPAGESIZE)
1109 sc->rx_cluster_size = MJUMPAGESIZE;
1110 else if (reserved <= MJUM9BYTES)
1111 sc->rx_cluster_size = MJUM9BYTES;
1112 else
1113 sc->rx_cluster_size = MJUM16BYTES;
1114
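/*
 * For example, with a standard 1500-byte MTU the buffer (PDU plus RX
 * prefix, alignment padding and the CACHE_LINE_SIZE reserve) should
 * still fit within a single 2KB MCLBYTES cluster; larger MTUs fall
 * through to the jumbo zones.
 */
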
1115 /*
1116 * Set up the scale table. Enable all hash types and hash insertion.
1117 */
1118 for (index = 0; index < nitems(sc->rx_indir_table); index++)
1119 #ifdef RSS
1120 sc->rx_indir_table[index] =
1121 rss_get_indirection_to_bucket(index) % sc->rxq_count;
1122 #else
1123 sc->rx_indir_table[index] = index % sc->rxq_count;
1124 #endif
1125 if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1126 sc->rx_indir_table,
1127 nitems(sc->rx_indir_table))) != 0)
1128 goto fail;
1129 (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1130 EFX_RX_HASHALG_TOEPLITZ,
1131 EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1132 EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1133
1134 rss_getkey(toep_key);
1135 if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1136 toep_key,
1137 sizeof(toep_key))) != 0)
1138 goto fail;
1139
1140 /* Start the receive queue(s). */
1141 for (index = 0; index < sc->rxq_count; index++) {
1142 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1143 goto fail2;
1144 }
1145
1146 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1147 sc->intr.n_alloc > 1);
1148 if (rc != 0)
1149 goto fail3;
1150
1151 return (0);
1152
1153 fail3:
1154 fail2:
1155 while (--index >= 0)
1156 sfxge_rx_qstop(sc, index);
1157
1158 fail:
1159 efx_rx_fini(sc->enp);
1160
1161 return (rc);
1162 }
1163
1164 #ifdef SFXGE_LRO
1165
1166 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1167 {
1168 struct sfxge_lro_state *st = &rxq->lro;
1169 unsigned i;
1170
1171 st->conns_mask = lro_table_size - 1;
1172 KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1173 ("lro_table_size must be a power of 2"));
1174 st->sc = rxq->sc;
1175 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1176 M_SFXGE, M_WAITOK);
1177 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1178 M_SFXGE, M_WAITOK);
1179 for (i = 0; i <= st->conns_mask; ++i) {
1180 TAILQ_INIT(&st->conns[i]);
1181 st->conns_n[i] = 0;
1182 }
1183 LIST_INIT(&st->active_conns);
1184 TAILQ_INIT(&st->free_conns);
1185 }
1186
1187 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1188 {
1189 struct sfxge_lro_state *st = &rxq->lro;
1190 struct sfxge_lro_conn *c;
1191 unsigned i;
1192
1193 /* Return cleanly if sfxge_lro_init() has not been called. */
1194 if (st->conns == NULL)
1195 return;
1196
1197 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1198
1199 for (i = 0; i <= st->conns_mask; ++i) {
1200 while (!TAILQ_EMPTY(&st->conns[i])) {
1201 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1202 sfxge_lro_drop(rxq, c);
1203 }
1204 }
1205
1206 while (!TAILQ_EMPTY(&st->free_conns)) {
1207 c = TAILQ_FIRST(&st->free_conns);
1208 TAILQ_REMOVE(&st->free_conns, c, link);
1209 KASSERT(!c->mbuf, ("found orphaned mbuf"));
1210 free(c, M_SFXGE);
1211 }
1212
1213 free(st->conns_n, M_SFXGE);
1214 free(st->conns, M_SFXGE);
1215 st->conns = NULL;
1216 }
1217
1218 #else
1219
1220 static void
1221 sfxge_lro_init(struct sfxge_rxq *rxq)
1222 {
1223 }
1224
1225 static void
1226 sfxge_lro_fini(struct sfxge_rxq *rxq)
1227 {
1228 }
1229
1230 #endif /* SFXGE_LRO */
1231
1232 static void
1233 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1234 {
1235 struct sfxge_rxq *rxq;
1236
1237 rxq = sc->rxq[index];
1238
1239 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1240 ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1241
1242 /* Free the context array and the flow table. */
1243 free(rxq->queue, M_SFXGE);
1244 sfxge_lro_fini(rxq);
1245
1246 /* Release DMA memory. */
1247 sfxge_dma_free(&rxq->mem);
1248
1249 sc->rxq[index] = NULL;
1250
1251 free(rxq, M_SFXGE);
1252 }
1253
1254 static int
1255 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1256 {
1257 struct sfxge_rxq *rxq;
1258 efsys_mem_t *esmp;
1259 int rc;
1260
1261 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1262
1263 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1264 rxq->sc = sc;
1265 rxq->index = index;
1266 rxq->entries = sc->rxq_entries;
1267 rxq->ptr_mask = rxq->entries - 1;
1268 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1269
1270 sc->rxq[index] = rxq;
1271 esmp = &rxq->mem;
1272
1273 /* Allocate and zero DMA space. */
1274 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1275 return (rc);
1276
1277 /* Allocate buffer table entries. */
1278 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1279 &rxq->buf_base_id);
1280
1281 /* Allocate the context array and the flow table. */
1282 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1283 M_SFXGE, M_WAITOK | M_ZERO);
1284 sfxge_lro_init(rxq);
1285
1286 callout_init(&rxq->refill_callout, 1);
1287
1288 rxq->init_state = SFXGE_RXQ_INITIALIZED;
1289
1290 return (0);
1291 }
1292
1293 static const struct {
1294 const char *name;
1295 size_t offset;
1296 } sfxge_rx_stats[] = {
1297 #define SFXGE_RX_STAT(name, member) \
1298 { #name, offsetof(struct sfxge_rxq, member) }
1299 #ifdef SFXGE_LRO
1300 SFXGE_RX_STAT(lro_merges, lro.n_merges),
1301 SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1302 SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1303 SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1304 SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1305 SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1306 SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1307 SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1308 #endif
1309 };
1310
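/*
 * Each entry above is exported as a read-only sysctl that sums the
 * per-queue counter across all RX queues, under the adapter's stats
 * node -- presumably something like
 *
 *	sysctl dev.sfxge.0.stats.lro_merges
 *
 * on the first adapter.
 */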
1311 static int
1312 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1313 {
1314 struct sfxge_softc *sc = arg1;
1315 unsigned int id = arg2;
1316 unsigned int sum, index;
1317
1318 /* Sum across all RX queues */
1319 sum = 0;
1320 for (index = 0; index < sc->rxq_count; index++)
1321 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1322 sfxge_rx_stats[id].offset);
1323
1324 return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1325 }
1326
1327 static void
1328 sfxge_rx_stat_init(struct sfxge_softc *sc)
1329 {
1330 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1331 struct sysctl_oid_list *stat_list;
1332 unsigned int id;
1333
1334 stat_list = SYSCTL_CHILDREN(sc->stats_node);
1335
1336 for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1337 SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1338 sfxge_rx_stats[id].name,
1339 CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1340 sc, id, sfxge_rx_stat_handler, "IU", "");
1341 }
1342 }
1343
1344 void
1345 sfxge_rx_fini(struct sfxge_softc *sc)
1346 {
1347 int index;
1348
1349 index = sc->rxq_count;
1350 while (--index >= 0)
1351 sfxge_rx_qfini(sc, index);
1352
1353 sc->rxq_count = 0;
1354 }
1355
1356 int
1357 sfxge_rx_init(struct sfxge_softc *sc)
1358 {
1359 struct sfxge_intr *intr;
1360 int index;
1361 int rc;
1362
1363 #ifdef SFXGE_LRO
1364 if (!ISP2(lro_table_size)) {
1365 log(LOG_ERR, "%s=%u must be power of 2",
1366 SFXGE_LRO_PARAM(table_size), lro_table_size);
1367 rc = EINVAL;
1368 goto fail_lro_table_size;
1369 }
1370
1371 if (lro_idle_ticks == 0)
1372 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1373 #endif
1374
1375 intr = &sc->intr;
1376
1377 sc->rxq_count = intr->n_alloc;
1378
1379 KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1380 ("intr->state != SFXGE_INTR_INITIALIZED"));
1381
1382 /* Initialize the receive queue(s) - one per interrupt. */
1383 for (index = 0; index < sc->rxq_count; index++) {
1384 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1385 goto fail;
1386 }
1387
1388 sfxge_rx_stat_init(sc);
1389
1390 return (0);
1391
1392 fail:
1393 /* Tear down the receive queue(s). */
1394 while (--index >= 0)
1395 sfxge_rx_qfini(sc, index);
1396
1397 sc->rxq_count = 0;
1398
1399 #ifdef SFXGE_LRO
1400 fail_lro_table_size:
1401 #endif
1402 return (rc);
1403 }
1404