/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

#define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)

/* Size of the LRO hash table. Must be a power of 2. A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
    &lro_table_size, 0,
    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
    &lro_chain_max, 0,
    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
    &lro_idle_ticks, 0,
    "The maximum time (in ticks) that a connection can be idle "
    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
    &lro_slow_start_packets, 0,
    "Number of packets with payload that must arrive in-order before "
    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
    &lro_loss_packets, 0,
    "Number of packets with payload that must arrive in-order "
    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
                                   const struct in6_addr *right)
{
#if LONG_BIT == 64
        const uint64_t *left64 = (const uint64_t *)left;
        const uint64_t *right64 = (const uint64_t *)right;
        return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
        return (left->s6_addr32[0] - right->s6_addr32[0]) |
               (left->s6_addr32[1] - right->s6_addr32[1]) |
               (left->s6_addr32[2] - right->s6_addr32[2]) |
               (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif /* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

        rxq->flush_state = SFXGE_FLUSH_FAILED;
}

#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
        0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
        0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
        0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
        0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
        0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif

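/*
 * Request a refill from event queue context. Posting a software event
 * ensures that sfxge_rx_qfill() later runs with the event queue lock
 * held, which is required before touching the receive queue.
 */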
static void
sfxge_rx_post_refill(void *arg)
{
        struct sfxge_rxq *rxq = arg;
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq;
        uint16_t magic;

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];
        magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

        /* This is guaranteed due to the start/stop order of rx and ev */
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq not started"));
        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));
        efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
        /* Initially retry after 100 ms, but back off in case of
         * repeated failures as we probably have to wait for the
         * administrator to raise the pool limit. */
        if (retrying)
                rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
        else
                rxq->refill_delay = hz / 10;

        callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
            sfxge_rx_post_refill, rxq);
}

#define SFXGE_REFILL_BATCH 64

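/*
 * Allocate mbuf clusters for up to "target" descriptors, map them for
 * DMA and post them to the common code receive queue in batches of
 * SFXGE_REFILL_BATCH. If the mbuf pool is exhausted, a retry is
 * scheduled via the refill callout.
 */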
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
        struct sfxge_softc *sc;
        unsigned int index;
        struct sfxge_evq *evq __diagused;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

        sc = rxq->sc;
        index = rxq->index;
        evq = sc->evq[index];

        prefetch_read_many(sc->enp);
        prefetch_read_many(rxq->common);

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        rxfill = rxq->added - rxq->completed;
        KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
            ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
        ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
        KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
            ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

        if (ntodo == 0)
                return;

        batch = 0;
        mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
        while (ntodo-- > 0) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;
                bus_dma_segment_t seg;
                struct mbuf *m;

                id = (rxq->added + batch) & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

                rx_desc->flags = EFX_DISCARD;
                m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
                    sc->rx_cluster_size);
                if (m == NULL)
                        break;

                /* m_len specifies length of area to be mapped for DMA */
                m->m_len = mblksize;
                m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
                    CACHE_LINE_SIZE);
                m->m_data += sc->rx_buffer_align;

                sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
                addr[batch++] = seg.ds_addr;

                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(rxq->common, addr, mblksize, batch,
                            rxq->completed, rxq->added);
                        rxq->added += batch;
                        batch = 0;
                }
        }

        if (ntodo != 0)
                sfxge_rx_schedule_refill(rxq, retrying);

        if (batch != 0) {
                efx_rx_qpost(rxq->common, addr, mblksize, batch,
                    rxq->completed, rxq->added);
                rxq->added += batch;
        }

        /* Make the descriptors visible to the hardware */
        bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
            BUS_DMASYNC_PREWRITE);

        efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

        /* The queue could still be empty if no descriptors were actually
         * pushed, in which case there will be no event to cause the next
         * refill, so we must schedule a refill ourselves.
         */
        if (rxq->pushed == rxq->completed) {
                sfxge_rx_schedule_refill(rxq, retrying);
        }
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

        if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                return;

        /* Make sure the queue is full */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
        if_t ifp = sc->ifnet;

        m->m_pkthdr.rcvif = ifp;
        m->m_pkthdr.csum_data = 0xffff;
        if_input(ifp, m);
}

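/*
 * Hand a received packet to the network stack: convert the hardware
 * checksum and hash flags into mbuf metadata, strip the prefix added
 * by the NIC and pass the mbuf to if_input() via __sfxge_rx_deliver().
 */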
static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_desc->mbuf;
        int flags = rx_desc->flags;
        int csum_flags;

        /* Convert checksum flags */
        csum_flags = (flags & EFX_CKSUM_IPV4) ?
            (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
        if (flags & EFX_CKSUM_TCPUDP)
                csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

        if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                m->m_pkthdr.flowid =
                    efx_pseudo_hdr_hash_get(rxq->common,
                        EFX_RX_HASHALG_TOEPLITZ,
                        mtod(m, uint8_t *));
                /* The hash covers a 4-tuple for TCP only */
                M_HASHTYPE_SET(m,
                    (flags & EFX_PKT_IPV4) ?
                        ((flags & EFX_PKT_TCP) ?
                            M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
                        ((flags & EFX_PKT_TCP) ?
                            M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
        }
        m->m_data += sc->rx_prefix_size;
        m->m_len = rx_desc->size - sc->rx_prefix_size;
        m->m_pkthdr.len = m->m_len;
        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, rx_desc->mbuf);

        rx_desc->flags = EFX_DISCARD;
        rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

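/*
 * Deliver a coalesced chain of segments for an LRO connection. The IP
 * length and checksum are recomputed, and the TCP window, ACK and any
 * options are taken from the most recent segment before the merged
 * packet is passed up the stack.
 */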
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
        struct sfxge_softc *sc = st->sc;
        struct mbuf *m = c->mbuf;
        struct tcphdr *c_th;
        int csum_flags;

        KASSERT(m, ("no mbuf to deliver"));

        ++st->n_bursts;

        /* Finish off packet munging and recalculate IP header checksum. */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len = htons(iph->ip_len);
                iph->ip_sum = 0;
                iph->ip_sum = in_cksum_hdr(iph);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
                    CSUM_IP_CHECKED | CSUM_IP_VALID);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen = htons(iph->ip6_plen);
                c_th = (struct tcphdr *)(iph + 1);
                csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        }

        c_th->th_win = c->th_last->th_win;
        c_th->th_ack = c->th_last->th_ack;
        if (c_th->th_off == c->th_last->th_off) {
                /* Copy TCP options (take care to avoid going negative). */
                int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
                memcpy(c_th + 1, c->th_last + 1, optlen);
        }

        m->m_pkthdr.flowid = c->conn_hash;
        M_HASHTYPE_SET(m,
            SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
                M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

        m->m_pkthdr.csum_flags = csum_flags;
        __sfxge_rx_deliver(sc, m);

        c->mbuf = NULL;
        c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        unsigned bucket;

        KASSERT(!c->mbuf, ("found orphaned mbuf"));

        if (c->next_buf.mbuf != NULL) {
                sfxge_rx_deliver(rxq, &c->next_buf);
                LIST_REMOVE(c, active_link);
        }

        bucket = c->conn_hash & rxq->lro.conns_mask;
        KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
        --rxq->lro.conns_n[bucket];
        TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
        TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
        struct sfxge_lro_conn *c;
        unsigned i;

        KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
            ("found active connections"));

        rxq->lro.last_purge_ticks = now;
        for (i = 0; i <= rxq->lro.conns_mask; ++i) {
                if (TAILQ_EMPTY(&rxq->lro.conns[i]))
                        continue;

                c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
                if (now - c->last_pkt_ticks > lro_idle_ticks) {
                        ++rxq->lro.n_drop_idle;
                        sfxge_lro_drop(rxq, c);
                }
        }
}

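/*
 * Append a new in-order segment to the connection's mbuf chain, extend
 * the IP payload length and deliver the chain early if a further
 * segment could overflow the 16-bit IP length field.
 */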
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, struct tcphdr *th)
{
        struct tcphdr *c_th;

        /* Tack the new mbuf onto the chain. */
        KASSERT(!mbuf->m_next, ("mbuf already chained"));
        c->mbuf_tail->m_next = mbuf;
        c->mbuf_tail = mbuf;

        /* Increase length appropriately */
        c->mbuf->m_pkthdr.len += mbuf->m_len;

        /* Update the connection state flags */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->nh;
                iph->ip_len += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        } else {
                struct ip6_hdr *iph = c->nh;
                iph->ip6_plen += mbuf->m_len;
                c_th = (struct tcphdr *)(iph + 1);
        }
        c_th->th_flags |= (th->th_flags & TH_PUSH);
        c->th_last = th;
        ++st->n_merges;

        /* Pass packet up now if another segment could overflow the IP
         * length.
         */
        if (c->mbuf->m_pkthdr.len > 65536 - 9200)
                sfxge_lro_deliver(st, c);
}

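/*
 * Begin coalescing with this segment as the head of a new chain, noting
 * the network and TCP headers that later merges will update. The IP
 * length is kept in host order while the chain is being built.
 */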
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
        /* Start the chain */
        c->mbuf = mbuf;
        c->mbuf_tail = c->mbuf;
        c->nh = nh;
        c->th_last = th;

        mbuf->m_pkthdr.len = mbuf->m_len;

        /* Mangle header fields for later processing */
        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = nh;
                iph->ip_len = ntohs(iph->ip_len);
        } else {
                struct ip6_hdr *iph = nh;
                iph->ip6_plen = ntohs(iph->ip6_plen);
        }
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
        struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
        char *eh = c->next_eh;
        int data_length, hdr_length, dont_merge;
        unsigned th_seq, pkt_length;
        struct tcphdr *th;
        unsigned now;

        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                struct ip *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
        } else {
                struct ip6_hdr *iph = c->next_nh;
                th = (struct tcphdr *)(iph + 1);
                pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
        }

        hdr_length = (char *) th + th->th_off * 4 - eh;
        data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
            hdr_length);
        th_seq = ntohl(th->th_seq);
        dont_merge = ((data_length <= 0)
            | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

        /* Check for options other than aligned timestamp. */
        if (th->th_off != 5) {
                const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
                if (th->th_off == 8 &&
                    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
                                        (TCPOPT_NOP << 16) |
                                        (TCPOPT_TIMESTAMP << 8) |
                                        TCPOLEN_TIMESTAMP)) {
                        /* timestamp option -- okay */
                } else {
                        dont_merge = 1;
                }
        }

        if (__predict_false(th_seq != c->next_seq)) {
                /* Out-of-order, so start counting again. */
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                c->n_in_order_pkts -= lro_loss_packets;
                c->next_seq = th_seq + data_length;
                ++rxq->lro.n_misorder;
                goto deliver_buf_out;
        }
        c->next_seq = th_seq + data_length;

        now = ticks;
        if (now - c->last_pkt_ticks > lro_idle_ticks) {
                ++rxq->lro.n_drop_idle;
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                sfxge_lro_drop(rxq, c);
                return (0);
        }
        c->last_pkt_ticks = ticks;

        if (c->n_in_order_pkts < lro_slow_start_packets) {
                /* May be in slow-start, so don't merge. */
                ++rxq->lro.n_slow_start;
                ++c->n_in_order_pkts;
                goto deliver_buf_out;
        }

        if (__predict_false(dont_merge)) {
                if (c->mbuf != NULL)
                        sfxge_lro_deliver(&rxq->lro, c);
                if (th->th_flags & (TH_FIN | TH_RST)) {
                        ++rxq->lro.n_drop_closed;
                        sfxge_lro_drop(rxq, c);
                        return (0);
                }
                goto deliver_buf_out;
        }

        rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

        if (__predict_true(c->mbuf != NULL)) {
                /* Remove headers and any padding */
                rx_buf->mbuf->m_data += hdr_length;
                rx_buf->mbuf->m_len = data_length;

                sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
        } else {
                /* Remove any padding */
                rx_buf->mbuf->m_len = pkt_length;

                sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
        }

        rx_buf->mbuf = NULL;
        return (1);

deliver_buf_out:
        sfxge_rx_deliver(rxq, rx_buf);
        return (1);
}

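/*
 * Start tracking a new connection, reusing an entry from the free list
 * where possible. Tracking is skipped if the hash bucket already holds
 * lro_chain_max connections.
 */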
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
{
        unsigned bucket = conn_hash & st->conns_mask;
        struct sfxge_lro_conn *c;

        if (st->conns_n[bucket] >= lro_chain_max) {
                ++st->n_too_many;
                return;
        }

        if (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
        } else {
                c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
                if (c == NULL)
                        return;
                c->mbuf = NULL;
                c->next_buf.mbuf = NULL;
        }

        /* Create the connection tracking data */
        ++st->conns_n[bucket];
        TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
        c->l2_id = l2_id;
        c->conn_hash = conn_hash;
        c->source = th->th_sport;
        c->dest = th->th_dport;
        c->n_in_order_pkts = 0;
        c->last_pkt_ticks = *(volatile int *)&ticks;
        c->delivered = 0;
        ++st->n_new_stream;
        /* NB. We don't initialise c->next_seq, and it doesn't matter what
         * value it has. Most likely the next packet received for this
         * connection will not match -- no harm done.
         */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
        struct sfxge_softc *sc = rxq->sc;
        struct mbuf *m = rx_buf->mbuf;
        struct ether_header *eh;
        struct sfxge_lro_conn *c;
        uint16_t l2_id;
        uint16_t l3_proto;
        void *nh;
        struct tcphdr *th;
        uint32_t conn_hash;
        unsigned bucket;

        /* Get the hardware hash */
        conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
            EFX_RX_HASHALG_TOEPLITZ,
            mtod(m, uint8_t *));

        eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
                l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
                    SFXGE_LRO_L2_ID_VLAN;
                l3_proto = veh->evl_proto;
                nh = veh + 1;
        } else {
                l2_id = 0;
                l3_proto = eh->ether_type;
                nh = eh + 1;
        }

        /* Check whether this is a suitable packet (unfragmented
         * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
         * length, and compute a hash if necessary. If not, return.
         */
        if (l3_proto == htons(ETHERTYPE_IP)) {
                struct ip *iph = nh;

                KASSERT(iph->ip_p == IPPROTO_TCP,
                    ("IPv4 protocol is not TCP, but packet marker is set"));
                if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
                    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
                        goto deliver_now;
                th = (struct tcphdr *)(iph + 1);
        } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
                struct ip6_hdr *iph = nh;

                KASSERT(iph->ip6_nxt == IPPROTO_TCP,
                    ("IPv6 next header is not TCP, but packet marker is set"));
                l2_id |= SFXGE_LRO_L2_ID_IPV6;
                th = (struct tcphdr *)(iph + 1);
        } else {
                goto deliver_now;
        }

        bucket = conn_hash & rxq->lro.conns_mask;

        TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
                if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
                        continue;
                if ((c->source - th->th_sport) | (c->dest - th->th_dport))
                        continue;
                if (c->mbuf != NULL) {
                        if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
                                struct ip *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
                                    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
                                        continue;
                        } else {
                                struct ip6_hdr *c_iph, *iph = nh;
                                c_iph = c->nh;
                                if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
                                    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
                                        continue;
                        }
                }

                /* Re-insert at head of list to reduce lookup time. */
                TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
                TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

                if (c->next_buf.mbuf != NULL) {
                        if (!sfxge_lro_try_merge(rxq, c))
                                goto deliver_now;
                } else {
                        LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
                            active_link);
                }
                c->next_buf = *rx_buf;
                c->next_eh = eh;
                c->next_nh = nh;

                rx_buf->mbuf = NULL;
                rx_buf->flags = EFX_DISCARD;
                return;
        }

        sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
        sfxge_rx_deliver(rxq, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned t;

        while (!LIST_EMPTY(&st->active_conns)) {
                c = LIST_FIRST(&st->active_conns);
                if (!c->delivered && c->mbuf != NULL)
                        sfxge_lro_deliver(st, c);
                if (sfxge_lro_try_merge(rxq, c)) {
                        if (c->mbuf != NULL)
                                sfxge_lro_deliver(st, c);
                        LIST_REMOVE(c, active_link);
                }
                c->delivered = 0;
        }

        t = *(volatile int *)&ticks;
        if (__predict_false(t != st->last_purge_ticks))
                sfxge_lro_purge_idle(rxq, t);
}

#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */

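/*
 * Process completed receive descriptors: drop discards and loopback
 * packets, apply the interface's checksum offload capabilities, and
 * pass each packet either into LRO or directly up the stack. Called
 * from the event queue handler with the event queue lock held.
 */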
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
        struct sfxge_softc *sc = rxq->sc;
        int if_capenable = if_getcapenable(sc->ifnet);
        int lro_enabled = if_capenable & IFCAP_LRO;
        unsigned int index;
        struct sfxge_evq *evq __diagused;
        unsigned int completed;
        unsigned int level;
        struct mbuf *m;
        struct sfxge_rx_sw_desc *prev = NULL;

        index = rxq->index;
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

        completed = rxq->completed;
        while (completed != rxq->pending) {
                unsigned int id;
                struct sfxge_rx_sw_desc *rx_desc;

                id = completed++ & rxq->ptr_mask;
                rx_desc = &rxq->queue[id];
                m = rx_desc->mbuf;

                if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
                        goto discard;

                if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
                        goto discard;

                /* Read the length from the pseudo header if required */
                if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
                        uint16_t tmp_size;
                        int rc __diagused;

                        rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
                            mtod(m, uint8_t *),
                            &tmp_size);
                        KASSERT(rc == 0, ("cannot get packet length: %d", rc));
                        rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
                }

                prefetch_read_many(mtod(m, caddr_t));

                switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
                case EFX_PKT_IPV4:
                        if (~if_capenable & IFCAP_RXCSUM)
                                rx_desc->flags &=
                                    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
                        break;
                case EFX_PKT_IPV6:
                        if (~if_capenable & IFCAP_RXCSUM_IPV6)
                                rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
                        break;
                case 0:
                        /* Check for loopback packets */
                        {
                                struct ether_header *etherhp;

                                /*LINTED*/
                                etherhp = mtod(m, struct ether_header *);

                                if (etherhp->ether_type ==
                                    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                                        EFSYS_PROBE(loopback);

                                        rxq->loopback++;
                                        goto discard;
                                }
                        }
                        break;
                default:
                        KASSERT(B_FALSE,
                            ("Rx descriptor with both IPv4 and IPv6 flags"));
                        goto discard;
                }

                /* Pass packet up the stack or into LRO (pipelined) */
                if (prev != NULL) {
                        if (lro_enabled &&
                            ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                             (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                                sfxge_lro(rxq, prev);
                        else
                                sfxge_rx_deliver(rxq, prev);
                }
                prev = rx_desc;
                continue;

discard:
                /* Return the packet to the pool */
                m_free(m);
                rx_desc->mbuf = NULL;
        }
        rxq->completed = completed;

        level = rxq->added - rxq->completed;

        /* Pass last packet up the stack or into LRO */
        if (prev != NULL) {
                if (lro_enabled &&
                    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
                     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
                        sfxge_lro(rxq, prev);
                else
                        sfxge_rx_deliver(rxq, prev);
        }

        /*
         * If there are any pending flows and this is the end of the
         * poll then they must be completed.
         */
        if (eop)
                sfxge_lro_end_of_burst(rxq);

        /* Top up the queue if necessary */
        if (level < rxq->refill_threshold)
                sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

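/*
 * Stop a receive queue: flush it (retrying, and eventually giving up if
 * the flush does not complete), release any outstanding packets via
 * sfxge_rx_qcomplete() and destroy the common code queue.
 */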
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        struct sfxge_evq *evq;
        unsigned int count;
        unsigned int retry = 3;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        evq = sc->evq[index];

        SFXGE_EVQ_LOCK(evq);

        KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
            ("rxq not started"));

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        callout_stop(&rxq->refill_callout);

        while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
                rxq->flush_state = SFXGE_FLUSH_PENDING;

                SFXGE_EVQ_UNLOCK(evq);

                /* Flush the receive queue */
                if (efx_rx_qflush(rxq->common) != 0) {
                        SFXGE_EVQ_LOCK(evq);
                        rxq->flush_state = SFXGE_FLUSH_FAILED;
                        break;
                }

                count = 0;
                do {
                        /* Spin for 100 ms */
                        DELAY(100000);

                        if (rxq->flush_state != SFXGE_FLUSH_PENDING)
                                break;

                } while (++count < 20);

                SFXGE_EVQ_LOCK(evq);

                if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
                        /* Flush timeout - neither done nor failed */
                        log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
                            device_get_nameunit(sc->dev), index);
                        rxq->flush_state = SFXGE_FLUSH_DONE;
                }
                retry--;
        }
        if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
                log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
                    device_get_nameunit(sc->dev), index);
                rxq->flush_state = SFXGE_FLUSH_DONE;
        }

        rxq->pending = rxq->added;
        sfxge_rx_qcomplete(rxq, B_TRUE);

        KASSERT(rxq->completed == rxq->pending,
            ("rxq->completed != rxq->pending"));

        rxq->added = 0;
        rxq->pushed = 0;
        rxq->pending = 0;
        rxq->completed = 0;
        rxq->loopback = 0;

        /* Destroy the common code receive queue. */
        efx_rx_qdestroy(rxq->common);

        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));

        SFXGE_EVQ_UNLOCK(evq);
}

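/*
 * Start a receive queue: program the buffer table, create and enable
 * the common code queue and perform the initial fill.
 */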
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        efsys_mem_t *esmp;
        struct sfxge_evq *evq;
        int rc;

        SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

        rxq = sc->rxq[index];
        esmp = &rxq->mem;
        evq = sc->evq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
        KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
            ("evq->init_state != SFXGE_EVQ_STARTED"));

        /* Program the buffer table. */
        if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
            EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
                return (rc);

        /* Create the common code receive queue. */
        if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
            esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
            evq->common, &rxq->common)) != 0)
                goto fail;

        SFXGE_EVQ_LOCK(evq);

        /* Enable the receive queue. */
        efx_rx_qenable(rxq->common);

        rxq->init_state = SFXGE_RXQ_STARTED;
        rxq->flush_state = SFXGE_FLUSH_REQUIRED;

        /* Try to fill the queue from the pool. */
        sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

        SFXGE_EVQ_UNLOCK(evq);

        return (0);

fail:
        efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
            EFX_RXQ_NBUFS(sc->rxq_entries));
        return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
        int index;

        efx_mac_filter_default_rxq_clear(sc->enp);

        /* Stop the receive queue(s) */
        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

        sc->rx_prefix_size = 0;
        sc->rx_buffer_size = 0;

        efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
        const efx_nic_cfg_t *encp;
        size_t hdrlen, align, reserved;
        int index;
        int rc;

        /* Initialize the common code receive module. */
        if ((rc = efx_rx_init(sc->enp)) != 0)
                return (rc);

        encp = efx_nic_cfg_get(sc->enp);
        sc->rx_buffer_size = EFX_MAC_PDU(if_getmtu(sc->ifnet));

        /* Calculate the receive packet buffer size. */
        sc->rx_prefix_size = encp->enc_rx_prefix_size;

        /* Ensure IP headers are 32bit aligned */
        hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
        sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

        sc->rx_buffer_size += sc->rx_buffer_align;

        /* Align end of packet buffer for RX DMA end padding */
        align = MAX(1, encp->enc_rx_buf_align_end);
        EFSYS_ASSERT(ISP2(align));
        sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

        /*
         * Standard mbuf zones only guarantee pointer-size alignment;
         * we need extra space to align to the cache line
         */
        reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

        /* Select zone for packet buffers */
        if (reserved <= MCLBYTES)
                sc->rx_cluster_size = MCLBYTES;
        else if (reserved <= MJUMPAGESIZE)
                sc->rx_cluster_size = MJUMPAGESIZE;
        else if (reserved <= MJUM9BYTES)
                sc->rx_cluster_size = MJUM9BYTES;
        else
                sc->rx_cluster_size = MJUM16BYTES;

        /*
         * Set up the scale table. Enable all hash types and hash insertion.
         */
        for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
                sc->rx_indir_table[index] =
                    rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
                sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
        if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            sc->rx_indir_table,
            nitems(sc->rx_indir_table))) != 0)
                goto fail;
        (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            EFX_RX_HASHALG_TOEPLITZ,
            EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
            EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
        rss_getkey(toep_key);
#endif
        if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
            toep_key,
            sizeof(toep_key))) != 0)
                goto fail;

        /* Start the receive queue(s). */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qstart(sc, index)) != 0)
                        goto fail2;
        }

        rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
            sc->intr.n_alloc > 1);
        if (rc != 0)
                goto fail3;

        return (0);

fail3:
fail2:
        while (--index >= 0)
                sfxge_rx_qstop(sc, index);

fail:
        efx_rx_fini(sc->enp);

        return (rc);
}

#ifdef SFXGE_LRO

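/*
 * Allocate and initialise the per-queue LRO hash table and connection
 * lists. lro_table_size is validated in sfxge_rx_init().
 */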
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        unsigned i;

        st->conns_mask = lro_table_size - 1;
        KASSERT(!((st->conns_mask + 1) & st->conns_mask),
            ("lro_table_size must be a power of 2"));
        st->sc = rxq->sc;
        st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
            M_SFXGE, M_WAITOK);
        st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
            M_SFXGE, M_WAITOK);
        for (i = 0; i <= st->conns_mask; ++i) {
                TAILQ_INIT(&st->conns[i]);
                st->conns_n[i] = 0;
        }
        LIST_INIT(&st->active_conns);
        TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
        struct sfxge_lro_state *st = &rxq->lro;
        struct sfxge_lro_conn *c;
        unsigned i;

        /* Return cleanly if sfxge_lro_init() has not been called. */
        if (st->conns == NULL)
                return;

        KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

        for (i = 0; i <= st->conns_mask; ++i) {
                while (!TAILQ_EMPTY(&st->conns[i])) {
                        c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
                        sfxge_lro_drop(rxq, c);
                }
        }

        while (!TAILQ_EMPTY(&st->free_conns)) {
                c = TAILQ_FIRST(&st->free_conns);
                TAILQ_REMOVE(&st->free_conns, c, link);
                KASSERT(!c->mbuf, ("found orphaned mbuf"));
                free(c, M_SFXGE);
        }

        free(st->conns_n, M_SFXGE);
        free(st->conns, M_SFXGE);
        st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;

        rxq = sc->rxq[index];

        KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
            ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

        /* Free the context array and the flow table. */
        free(rxq->queue, M_SFXGE);
        sfxge_lro_fini(rxq);

        /* Release DMA memory. */
        sfxge_dma_free(&rxq->mem);

        sc->rxq[index] = NULL;

        free(rxq, M_SFXGE);
}

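/*
 * Allocate the software descriptor array, DMA memory and buffer table
 * entries for one receive queue and initialise its LRO state.
 */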
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
        struct sfxge_rxq *rxq;
        efsys_mem_t *esmp;
        int rc;

        KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

        rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
        rxq->sc = sc;
        rxq->index = index;
        rxq->entries = sc->rxq_entries;
        rxq->ptr_mask = rxq->entries - 1;
        rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

        sc->rxq[index] = rxq;
        esmp = &rxq->mem;

        /* Allocate and zero DMA space. */
        if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
                return (rc);

        /* Allocate buffer table entries. */
        sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
            &rxq->buf_base_id);

        /* Allocate the context array and the flow table. */
        rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
            M_SFXGE, M_WAITOK | M_ZERO);
        sfxge_lro_init(rxq);

        callout_init(&rxq->refill_callout, 1);

        rxq->init_state = SFXGE_RXQ_INITIALIZED;

        return (0);
}

static const struct {
        const char *name;
        size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
        { #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
        SFXGE_RX_STAT(lro_merges, lro.n_merges),
        SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
        SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
        SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
        SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
        SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
        SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
        SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
        struct sfxge_softc *sc = arg1;
        unsigned int id = arg2;
        unsigned int sum, index;

        /* Sum across all RX queues */
        sum = 0;
        for (index = 0; index < sc->rxq_count; index++)
                sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
                    sfxge_rx_stats[id].offset);

        return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
        struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
        struct sysctl_oid_list *stat_list;
        unsigned int id;

        stat_list = SYSCTL_CHILDREN(sc->stats_node);

        for (id = 0; id < nitems(sfxge_rx_stats); id++) {
                SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
                    sfxge_rx_stats[id].name,
                    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
                    sc, id, sfxge_rx_stat_handler, "IU", "");
        }
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
        int index;

        index = sc->rxq_count;
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
        struct sfxge_intr *intr;
        int index;
        int rc;

#ifdef SFXGE_LRO
        if (!ISP2(lro_table_size)) {
                log(LOG_ERR, "%s=%u must be power of 2",
                    SFXGE_LRO_PARAM(table_size), lro_table_size);
                rc = EINVAL;
                goto fail_lro_table_size;
        }

        if (lro_idle_ticks == 0)
                lro_idle_ticks = hz / 10 + 1; /* 100 ms */
#endif

        intr = &sc->intr;

        sc->rxq_count = intr->n_alloc;

        KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
            ("intr->state != SFXGE_INTR_INITIALIZED"));

        /* Initialize the receive queue(s) - one per interrupt. */
        for (index = 0; index < sc->rxq_count; index++) {
                if ((rc = sfxge_rx_qinit(sc, index)) != 0)
                        goto fail;
        }

        sfxge_rx_stat_init(sc);

        return (0);

fail:
        /* Tear down the receive queue(s). */
        while (--index >= 0)
                sfxge_rx_qfini(sc, index);

        sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
        return (rc);
}