xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
5  * All rights reserved.
6  *
7  * This software was developed in part by Philip Paeps under contract for
8  * Solarflare Communications, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice,
14  *    this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  *    this list of conditions and the following disclaimer in the documentation
17  *    and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are
32  * those of the authors and should not be interpreted as representing official
33  * policies, either expressed or implied, of the FreeBSD Project.
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include "opt_rss.h"
40 
41 #include <sys/param.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/smp.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/limits.h>
49 #include <sys/syslog.h>
50 
51 #include <net/ethernet.h>
52 #include <net/if.h>
53 #include <net/if_vlan_var.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/ip.h>
57 #include <netinet/ip6.h>
58 #include <netinet/tcp.h>
59 
60 #include <machine/in_cksum.h>
61 
62 #ifdef RSS
63 #include <net/rss_config.h>
64 #endif
65 
66 #include "common/efx.h"
67 
68 #include "sfxge.h"
69 #include "sfxge_rx.h"
70 
/*
 * Refill threshold for a receive queue created with _entries descriptors:
 * nine tenths of EFX_RXQ_LIMIT(_entries), computed with integer arithmetic.
 */
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
72 
73 #ifdef SFXGE_LRO
74 
/* Sysctl node "lro" below _hw_sfxge that groups the LRO parameters. */
SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

/* Name of an LRO tunable: SFXGE_PARAM() applied to "lro.<_param>". */
#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before it's LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 *
 * NOTE(review): the variable is a signed int but is exported with
 * SYSCTL_UINT -- confirm that the mismatch is intended.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 *
 * NOTE(review): the variable is a signed int but is exported with
 * SYSCTL_UINT -- confirm that the mismatch is intended.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");
131 
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
/* Set when the connection's frames carry a VLAN header */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
/* Set when the connection is TCP over IPv6; clear for TCP over IPv4 */
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
/* Non-zero when the VLAN flag is set in the connection's l2_id */
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
/* 1 when the IPv6 flag is clear in the connection's l2_id, 0 otherwise */
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
137 
/*
 * Compare IPv6 addresses, avoiding conditional branches.
 *
 * Returns zero when the two addresses are identical and a non-zero value
 * when they differ.  The value of a non-zero result carries no ordering
 * information; callers should only test it against zero.
 */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	uint64_t left64[2];
	uint64_t right64[2];

	/*
	 * Copy the two halves out instead of dereferencing the address
	 * through a cast uint64_t pointer: nothing here guarantees that
	 * the address is 8-byte aligned, and reading it through an
	 * incompatible pointer type is undefined behaviour.
	 */
	memcpy(left64, left, sizeof(left64));
	memcpy(right64, right, sizeof(right64));
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}
153 
154 #endif	/* SFXGE_LRO */
155 
156 void
157 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
158 {
159 
160 	rxq->flush_state = SFXGE_FLUSH_DONE;
161 }
162 
163 void
164 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
165 {
166 
167 	rxq->flush_state = SFXGE_FLUSH_FAILED;
168 }
169 
/*
 * Hash key storage (presumably the Toeplitz RSS key, going by the name --
 * its use is outside this part of the file).  With the RSS kernel option
 * the buffer is RSS_KEYSIZE bytes and starts out zeroed, so it has to be
 * filled in elsewhere; without it a fixed 40-byte key is built in.
 */
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
181 
182 static void
183 sfxge_rx_post_refill(void *arg)
184 {
185 	struct sfxge_rxq *rxq = arg;
186 	struct sfxge_softc *sc;
187 	unsigned int index;
188 	struct sfxge_evq *evq;
189 	uint16_t magic;
190 
191 	sc = rxq->sc;
192 	index = rxq->index;
193 	evq = sc->evq[index];
194 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
195 
196 	/* This is guaranteed due to the start/stop order of rx and ev */
197 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
198 	    ("evq not started"));
199 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
200 	    ("rxq not started"));
201 	efx_ev_qpost(evq->common, magic);
202 }
203 
204 static void
205 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
206 {
207 	/* Initially retry after 100 ms, but back off in case of
208 	 * repeated failures as we probably have to wait for the
209 	 * administrator to raise the pool limit. */
210 	if (retrying)
211 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
212 	else
213 		rxq->refill_delay = hz / 10;
214 
215 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
216 			     sfxge_rx_post_refill, rxq);
217 }
218 
219 #define	SFXGE_REFILL_BATCH  64
220 
221 static void
222 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
223 {
224 	struct sfxge_softc *sc;
225 	unsigned int index;
226 	struct sfxge_evq *evq;
227 	unsigned int batch;
228 	unsigned int rxfill;
229 	unsigned int mblksize;
230 	int ntodo;
231 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
232 
233 	sc = rxq->sc;
234 	index = rxq->index;
235 	evq = sc->evq[index];
236 
237 	prefetch_read_many(sc->enp);
238 	prefetch_read_many(rxq->common);
239 
240 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
241 
242 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
243 		return;
244 
245 	rxfill = rxq->added - rxq->completed;
246 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
247 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
248 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
249 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
250 	    ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
251 
252 	if (ntodo == 0)
253 		return;
254 
255 	batch = 0;
256 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
257 	while (ntodo-- > 0) {
258 		unsigned int id;
259 		struct sfxge_rx_sw_desc *rx_desc;
260 		bus_dma_segment_t seg;
261 		struct mbuf *m;
262 
263 		id = (rxq->added + batch) & rxq->ptr_mask;
264 		rx_desc = &rxq->queue[id];
265 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
266 
267 		rx_desc->flags = EFX_DISCARD;
268 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
269 		    sc->rx_cluster_size);
270 		if (m == NULL)
271 			break;
272 
273 		/* m_len specifies length of area to be mapped for DMA */
274 		m->m_len  = mblksize;
275 		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
276 						   CACHE_LINE_SIZE);
277 		m->m_data += sc->rx_buffer_align;
278 
279 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
280 		addr[batch++] = seg.ds_addr;
281 
282 		if (batch == SFXGE_REFILL_BATCH) {
283 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
284 			    rxq->completed, rxq->added);
285 			rxq->added += batch;
286 			batch = 0;
287 		}
288 	}
289 
290 	if (ntodo != 0)
291 		sfxge_rx_schedule_refill(rxq, retrying);
292 
293 	if (batch != 0) {
294 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
295 		    rxq->completed, rxq->added);
296 		rxq->added += batch;
297 	}
298 
299 	/* Make the descriptors visible to the hardware */
300 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
301 			BUS_DMASYNC_PREWRITE);
302 
303 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
304 
305 	/* The queue could still be empty if no descriptors were actually
306 	 * pushed, in which case there will be no event to cause the next
307 	 * refill, so we must schedule a refill ourselves.
308 	 */
309 	if(rxq->pushed == rxq->completed) {
310 		sfxge_rx_schedule_refill(rxq, retrying);
311 	}
312 }
313 
314 void
315 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
316 {
317 
318 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
319 		return;
320 
321 	/* Make sure the queue is full */
322 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
323 }
324 
325 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
326 {
327 	struct ifnet *ifp = sc->ifnet;
328 
329 	m->m_pkthdr.rcvif = ifp;
330 	m->m_pkthdr.csum_data = 0xffff;
331 	ifp->if_input(ifp, m);
332 }
333 
334 static void
335 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
336 {
337 	struct sfxge_softc *sc = rxq->sc;
338 	struct mbuf *m = rx_desc->mbuf;
339 	int flags = rx_desc->flags;
340 	int csum_flags;
341 
342 	/* Convert checksum flags */
343 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
344 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
345 	if (flags & EFX_CKSUM_TCPUDP)
346 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
347 
348 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
349 		m->m_pkthdr.flowid =
350 			efx_pseudo_hdr_hash_get(rxq->common,
351 						EFX_RX_HASHALG_TOEPLITZ,
352 						mtod(m, uint8_t *));
353 		/* The hash covers a 4-tuple for TCP only */
354 		M_HASHTYPE_SET(m,
355 		    (flags & EFX_PKT_IPV4) ?
356 			((flags & EFX_PKT_TCP) ?
357 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
358 			((flags & EFX_PKT_TCP) ?
359 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
360 	}
361 	m->m_data += sc->rx_prefix_size;
362 	m->m_len = rx_desc->size - sc->rx_prefix_size;
363 	m->m_pkthdr.len = m->m_len;
364 	m->m_pkthdr.csum_flags = csum_flags;
365 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
366 
367 	rx_desc->flags = EFX_DISCARD;
368 	rx_desc->mbuf = NULL;
369 }
370 
371 #ifdef SFXGE_LRO
372 
373 static void
374 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
375 {
376 	struct sfxge_softc *sc = st->sc;
377 	struct mbuf *m = c->mbuf;
378 	struct tcphdr *c_th;
379 	int csum_flags;
380 
381 	KASSERT(m, ("no mbuf to deliver"));
382 
383 	++st->n_bursts;
384 
385 	/* Finish off packet munging and recalculate IP header checksum. */
386 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
387 		struct ip *iph = c->nh;
388 		iph->ip_len = htons(iph->ip_len);
389 		iph->ip_sum = 0;
390 		iph->ip_sum = in_cksum_hdr(iph);
391 		c_th = (struct tcphdr *)(iph + 1);
392 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
393 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
394 	} else {
395 		struct ip6_hdr *iph = c->nh;
396 		iph->ip6_plen = htons(iph->ip6_plen);
397 		c_th = (struct tcphdr *)(iph + 1);
398 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
399 	}
400 
401 	c_th->th_win = c->th_last->th_win;
402 	c_th->th_ack = c->th_last->th_ack;
403 	if (c_th->th_off == c->th_last->th_off) {
404 		/* Copy TCP options (take care to avoid going negative). */
405 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
406 		memcpy(c_th + 1, c->th_last + 1, optlen);
407 	}
408 
409 	m->m_pkthdr.flowid = c->conn_hash;
410 	M_HASHTYPE_SET(m,
411 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
412 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
413 
414 	m->m_pkthdr.csum_flags = csum_flags;
415 	__sfxge_rx_deliver(sc, m);
416 
417 	c->mbuf = NULL;
418 	c->delivered = 1;
419 }
420 
421 /* Drop the given connection, and add it to the free list. */
422 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
423 {
424 	unsigned bucket;
425 
426 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
427 
428 	if (c->next_buf.mbuf != NULL) {
429 		sfxge_rx_deliver(rxq, &c->next_buf);
430 		LIST_REMOVE(c, active_link);
431 	}
432 
433 	bucket = c->conn_hash & rxq->lro.conns_mask;
434 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
435 	--rxq->lro.conns_n[bucket];
436 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
437 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
438 }
439 
440 /* Stop tracking connections that have gone idle in order to keep hash
441  * chains short.
442  */
443 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
444 {
445 	struct sfxge_lro_conn *c;
446 	unsigned i;
447 
448 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
449 		("found active connections"));
450 
451 	rxq->lro.last_purge_ticks = now;
452 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
453 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
454 			continue;
455 
456 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
457 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
458 			++rxq->lro.n_drop_idle;
459 			sfxge_lro_drop(rxq, c);
460 		}
461 	}
462 }
463 
464 static void
465 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
466 		struct mbuf *mbuf, struct tcphdr *th)
467 {
468 	struct tcphdr *c_th;
469 
470 	/* Tack the new mbuf onto the chain. */
471 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
472 	c->mbuf_tail->m_next = mbuf;
473 	c->mbuf_tail = mbuf;
474 
475 	/* Increase length appropriately */
476 	c->mbuf->m_pkthdr.len += mbuf->m_len;
477 
478 	/* Update the connection state flags */
479 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
480 		struct ip *iph = c->nh;
481 		iph->ip_len += mbuf->m_len;
482 		c_th = (struct tcphdr *)(iph + 1);
483 	} else {
484 		struct ip6_hdr *iph = c->nh;
485 		iph->ip6_plen += mbuf->m_len;
486 		c_th = (struct tcphdr *)(iph + 1);
487 	}
488 	c_th->th_flags |= (th->th_flags & TH_PUSH);
489 	c->th_last = th;
490 	++st->n_merges;
491 
492 	/* Pass packet up now if another segment could overflow the IP
493 	 * length.
494 	 */
495 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
496 		sfxge_lro_deliver(st, c);
497 }
498 
499 static void
500 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
501 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
502 {
503 	/* Start the chain */
504 	c->mbuf = mbuf;
505 	c->mbuf_tail = c->mbuf;
506 	c->nh = nh;
507 	c->th_last = th;
508 
509 	mbuf->m_pkthdr.len = mbuf->m_len;
510 
511 	/* Mangle header fields for later processing */
512 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
513 		struct ip *iph = nh;
514 		iph->ip_len = ntohs(iph->ip_len);
515 	} else {
516 		struct ip6_hdr *iph = nh;
517 		iph->ip6_plen = ntohs(iph->ip6_plen);
518 	}
519 }
520 
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes:
 * 0 when the connection was dropped with sfxge_lro_drop() (idle for too
 * long, or FIN/RST seen), 1 in every other case.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	/* Find the TCP header and the packet length measured from the
	 * Ethernet header: ip_len counts from the IPv4 header, whereas
	 * ip6_plen counts from the end of the IPv6 header.
	 */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	/* All headers from Ethernet up to the end of the TCP header */
	hdr_length = (char *) th + th->th_off * 4 - eh;
	/* Payload bytes, limited by what was actually received */
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	/* No payload, or any of URG/SYN/RST/FIN, rules out merging */
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	/* Give up on connections that have been quiet for too long */
	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		/* Flush what has been coalesced so far, then pass this
		 * packet up on its own; FIN or RST also ends tracking.
		 */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	/* Skip the receive prefix */
	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	/* The mbuf is now part of the connection's chain */
	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	/* Pass the buffered packet up unmerged */
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}
627 
628 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
629 			       uint16_t l2_id, void *nh, struct tcphdr *th)
630 {
631 	unsigned bucket = conn_hash & st->conns_mask;
632 	struct sfxge_lro_conn *c;
633 
634 	if (st->conns_n[bucket] >= lro_chain_max) {
635 		++st->n_too_many;
636 		return;
637 	}
638 
639 	if (!TAILQ_EMPTY(&st->free_conns)) {
640 		c = TAILQ_FIRST(&st->free_conns);
641 		TAILQ_REMOVE(&st->free_conns, c, link);
642 	} else {
643 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
644 		if (c == NULL)
645 			return;
646 		c->mbuf = NULL;
647 		c->next_buf.mbuf = NULL;
648 	}
649 
650 	/* Create the connection tracking data */
651 	++st->conns_n[bucket];
652 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
653 	c->l2_id = l2_id;
654 	c->conn_hash = conn_hash;
655 	c->source = th->th_sport;
656 	c->dest = th->th_dport;
657 	c->n_in_order_pkts = 0;
658 	c->last_pkt_ticks = *(volatile int *)&ticks;
659 	c->delivered = 0;
660 	++st->n_new_stream;
661 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
662 	 * value it has.  Most likely the next packet received for this
663 	 * connection will not match -- no harm done.
664 	 */
665 }
666 
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 *
 * The packet in rx_buf is matched against the tracked connections.  On a
 * match it becomes the connection's buffered packet (next_buf), after any
 * previously buffered packet has been merged or delivered.  Otherwise it
 * is delivered immediately, and tracking is started for packets that are
 * suitable but do not belong to a known connection.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	/* The Ethernet header follows the receive prefix.  l2_id combines
	 * the VLAN id (if any) with the SFXGE_LRO_L2_ID_* flags.
	 */
	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		/* Reject IPv4 headers with options and fragments */
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	/* Look for a tracked connection with the same L2 id, hash and
	 * ports.  The subtract-and-OR form tests several fields for
	 * equality with a single branch.
	 */
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		/* The IP addresses are only compared while the connection
		 * holds a coalesced chain, against the header at c->nh.
		 */
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		/* Deal with the previously buffered packet first.  If that
		 * ends tracking of the connection, this packet is simply
		 * delivered.  A connection without a buffered packet joins
		 * the active list.
		 */
		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		/* Buffer this packet on the connection */
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	/* No match: start tracking, but deliver this packet as it is */
	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}
773 
774 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
775 {
776 	struct sfxge_lro_state *st = &rxq->lro;
777 	struct sfxge_lro_conn *c;
778 	unsigned t;
779 
780 	while (!LIST_EMPTY(&st->active_conns)) {
781 		c = LIST_FIRST(&st->active_conns);
782 		if (!c->delivered && c->mbuf != NULL)
783 			sfxge_lro_deliver(st, c);
784 		if (sfxge_lro_try_merge(rxq, c)) {
785 			if (c->mbuf != NULL)
786 				sfxge_lro_deliver(st, c);
787 			LIST_REMOVE(c, active_link);
788 		}
789 		c->delivered = 0;
790 	}
791 
792 	t = *(volatile int *)&ticks;
793 	if (__predict_false(t != st->last_purge_ticks))
794 		sfxge_lro_purge_idle(rxq, t);
795 }
796 
797 #else	/* !SFXGE_LRO */
798 
/* Stub used when the driver is built without SFXGE_LRO: does nothing. */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}
803 
/* Stub used when the driver is built without SFXGE_LRO: does nothing. */
static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}
808 
809 #endif	/* SFXGE_LRO */
810 
/*
 * Process the descriptors between rxq->completed and rxq->pending.
 *
 * Each completed packet is either discarded or passed to LRO / the stack.
 * The hand-off is pipelined: a packet is dispatched only once the next
 * one has been examined, and the last one after the loop.  When 'eop' is
 * set any LRO state is flushed, and finally the queue is topped up if its
 * fill level has fallen below the refill threshold.
 *
 * The event queue with the same index as rxq must be locked by the caller
 * (asserted below).
 */
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	/* Packet accepted on the previous iteration, not yet dispatched */
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		/* Everything is discarded once the queue has been stopped */
		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		/* Drop checksum results for offloads that are disabled on
		 * the interface.
		 */
		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/* NOTE(review): the header is read at the
				 * start of the mbuf data without skipping
				 * sc->rx_prefix_size, unlike sfxge_lro() --
				 * confirm this is intended.
				 */
				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			/* LRO only takes TCP packets with a valid checksum */
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	/* Fill level, sampled before the last packet is dispatched */
	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
932 
/*
 * Stop the receive queue with the given index.
 *
 * The queue is moved back to SFXGE_RXQ_INITIALIZED, its refill callout is
 * stopped and the hardware queue is flushed (up to three attempts, each
 * waiting up to 20 x 100 ms).  All outstanding buffers are then reclaimed
 * through sfxge_rx_qcomplete(), the ring counters are reset and the
 * common-code queue and its buffer table entries are released.
 *
 * The adapter lock must be held by the caller (asserted below).
 */
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		/* The event queue lock is not held while waiting; the
		 * flush result is reported through
		 * sfxge_rx_qflush_done()/sfxge_rx_qflush_failed(),
		 * presumably from event processing -- confirm.
		 */
		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	/* A failed flush is logged and then treated as done */
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	/* Treat every posted buffer as completed; the queue is no longer
	 * started, so sfxge_rx_qcomplete() frees them all.
	 */
	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
1013 
1014 static int
1015 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1016 {
1017 	struct sfxge_rxq *rxq;
1018 	efsys_mem_t *esmp;
1019 	struct sfxge_evq *evq;
1020 	int rc;
1021 
1022 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1023 
1024 	rxq = sc->rxq[index];
1025 	esmp = &rxq->mem;
1026 	evq = sc->evq[index];
1027 
1028 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1029 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1030 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1031 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1032 
1033 	/* Program the buffer table. */
1034 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1035 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1036 		return (rc);
1037 
1038 	/* Create the common code receive queue. */
1039 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1040 	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
1041 	    evq->common, &rxq->common)) != 0)
1042 		goto fail;
1043 
1044 	SFXGE_EVQ_LOCK(evq);
1045 
1046 	/* Enable the receive queue. */
1047 	efx_rx_qenable(rxq->common);
1048 
1049 	rxq->init_state = SFXGE_RXQ_STARTED;
1050 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1051 
1052 	/* Try to fill the queue from the pool. */
1053 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1054 
1055 	SFXGE_EVQ_UNLOCK(evq);
1056 
1057 	return (0);
1058 
1059 fail:
1060 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1061 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1062 	return (rc);
1063 }
1064 
1065 void
1066 sfxge_rx_stop(struct sfxge_softc *sc)
1067 {
1068 	int index;
1069 
1070 	efx_mac_filter_default_rxq_clear(sc->enp);
1071 
1072 	/* Stop the receive queue(s) */
1073 	index = sc->rxq_count;
1074 	while (--index >= 0)
1075 		sfxge_rx_qstop(sc, index);
1076 
1077 	sc->rx_prefix_size = 0;
1078 	sc->rx_buffer_size = 0;
1079 
1080 	efx_rx_fini(sc->enp);
1081 }
1082 
1083 int
1084 sfxge_rx_start(struct sfxge_softc *sc)
1085 {
1086 	struct sfxge_intr *intr;
1087 	const efx_nic_cfg_t *encp;
1088 	size_t hdrlen, align, reserved;
1089 	int index;
1090 	int rc;
1091 
1092 	intr = &sc->intr;
1093 
1094 	/* Initialize the common code receive module. */
1095 	if ((rc = efx_rx_init(sc->enp)) != 0)
1096 		return (rc);
1097 
1098 	encp = efx_nic_cfg_get(sc->enp);
1099 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1100 
1101 	/* Calculate the receive packet buffer size. */
1102 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1103 
1104 	/* Ensure IP headers are 32bit aligned */
1105 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1106 	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1107 
1108 	sc->rx_buffer_size += sc->rx_buffer_align;
1109 
1110 	/* Align end of packet buffer for RX DMA end padding */
1111 	align = MAX(1, encp->enc_rx_buf_align_end);
1112 	EFSYS_ASSERT(ISP2(align));
1113 	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1114 
1115 	/*
1116 	 * Standard mbuf zones only guarantee pointer-size alignment;
1117 	 * we need extra space to align to the cache line
1118 	 */
1119 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1120 
1121 	/* Select zone for packet buffers */
1122 	if (reserved <= MCLBYTES)
1123 		sc->rx_cluster_size = MCLBYTES;
1124 	else if (reserved <= MJUMPAGESIZE)
1125 		sc->rx_cluster_size = MJUMPAGESIZE;
1126 	else if (reserved <= MJUM9BYTES)
1127 		sc->rx_cluster_size = MJUM9BYTES;
1128 	else
1129 		sc->rx_cluster_size = MJUM16BYTES;
1130 
1131 	/*
1132 	 * Set up the scale table.  Enable all hash types and hash insertion.
1133 	 */
1134 	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1135 #ifdef RSS
1136 		sc->rx_indir_table[index] =
1137 			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1138 #else
1139 		sc->rx_indir_table[index] = index % sc->rxq_count;
1140 #endif
1141 	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1142 				       sc->rx_indir_table,
1143 				       nitems(sc->rx_indir_table))) != 0)
1144 		goto fail;
1145 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1146 	    EFX_RX_HASHALG_TOEPLITZ,
1147 	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1148 	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1149 
1150 #ifdef RSS
1151 	rss_getkey(toep_key);
1152 #endif
1153 	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1154 				       toep_key,
1155 				       sizeof(toep_key))) != 0)
1156 		goto fail;
1157 
1158 	/* Start the receive queue(s). */
1159 	for (index = 0; index < sc->rxq_count; index++) {
1160 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1161 			goto fail2;
1162 	}
1163 
1164 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1165 					    sc->intr.n_alloc > 1);
1166 	if (rc != 0)
1167 		goto fail3;
1168 
1169 	return (0);
1170 
1171 fail3:
1172 fail2:
1173 	while (--index >= 0)
1174 		sfxge_rx_qstop(sc, index);
1175 
1176 fail:
1177 	efx_rx_fini(sc->enp);
1178 
1179 	return (rc);
1180 }
1181 
1182 #ifdef SFXGE_LRO
1183 
1184 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1185 {
1186 	struct sfxge_lro_state *st = &rxq->lro;
1187 	unsigned i;
1188 
1189 	st->conns_mask = lro_table_size - 1;
1190 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1191 		("lro_table_size must be a power of 2"));
1192 	st->sc = rxq->sc;
1193 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1194 			   M_SFXGE, M_WAITOK);
1195 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1196 			     M_SFXGE, M_WAITOK);
1197 	for (i = 0; i <= st->conns_mask; ++i) {
1198 		TAILQ_INIT(&st->conns[i]);
1199 		st->conns_n[i] = 0;
1200 	}
1201 	LIST_INIT(&st->active_conns);
1202 	TAILQ_INIT(&st->free_conns);
1203 }
1204 
1205 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1206 {
1207 	struct sfxge_lro_state *st = &rxq->lro;
1208 	struct sfxge_lro_conn *c;
1209 	unsigned i;
1210 
1211 	/* Return cleanly if sfxge_lro_init() has not been called. */
1212 	if (st->conns == NULL)
1213 		return;
1214 
1215 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1216 
1217 	for (i = 0; i <= st->conns_mask; ++i) {
1218 		while (!TAILQ_EMPTY(&st->conns[i])) {
1219 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1220 			sfxge_lro_drop(rxq, c);
1221 		}
1222 	}
1223 
1224 	while (!TAILQ_EMPTY(&st->free_conns)) {
1225 		c = TAILQ_FIRST(&st->free_conns);
1226 		TAILQ_REMOVE(&st->free_conns, c, link);
1227 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1228 		free(c, M_SFXGE);
1229 	}
1230 
1231 	free(st->conns_n, M_SFXGE);
1232 	free(st->conns, M_SFXGE);
1233 	st->conns = NULL;
1234 }
1235 
1236 #else
1237 
/*
 * Built without SFXGE_LRO: there is no LRO state to set up, so this is
 * an intentional no-op that keeps sfxge_rx_qinit() free of #ifdefs.
 */
static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1242 
/*
 * Built without SFXGE_LRO: there is no LRO state to release, so this is
 * an intentional no-op that keeps sfxge_rx_qfini() free of #ifdefs.
 */
static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1247 
1248 #endif	/* SFXGE_LRO */
1249 
1250 static void
1251 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1252 {
1253 	struct sfxge_rxq *rxq;
1254 
1255 	rxq = sc->rxq[index];
1256 
1257 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1258 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1259 
1260 	/* Free the context array and the flow table. */
1261 	free(rxq->queue, M_SFXGE);
1262 	sfxge_lro_fini(rxq);
1263 
1264 	/* Release DMA memory. */
1265 	sfxge_dma_free(&rxq->mem);
1266 
1267 	sc->rxq[index] = NULL;
1268 
1269 	free(rxq, M_SFXGE);
1270 }
1271 
1272 static int
1273 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1274 {
1275 	struct sfxge_rxq *rxq;
1276 	struct sfxge_evq *evq;
1277 	efsys_mem_t *esmp;
1278 	int rc;
1279 
1280 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1281 
1282 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1283 	rxq->sc = sc;
1284 	rxq->index = index;
1285 	rxq->entries = sc->rxq_entries;
1286 	rxq->ptr_mask = rxq->entries - 1;
1287 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1288 
1289 	sc->rxq[index] = rxq;
1290 	esmp = &rxq->mem;
1291 
1292 	evq = sc->evq[index];
1293 
1294 	/* Allocate and zero DMA space. */
1295 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1296 		return (rc);
1297 
1298 	/* Allocate buffer table entries. */
1299 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1300 				 &rxq->buf_base_id);
1301 
1302 	/* Allocate the context array and the flow table. */
1303 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1304 	    M_SFXGE, M_WAITOK | M_ZERO);
1305 	sfxge_lro_init(rxq);
1306 
1307 	callout_init(&rxq->refill_callout, 1);
1308 
1309 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1310 
1311 	return (0);
1312 }
1313 
/*
 * Table of per-queue receive statistics exported via sysctl.
 *
 * Each entry pairs the sysctl leaf name with the byte offset of the
 * corresponding counter inside struct sfxge_rxq; sfxge_rx_stat_handler()
 * reads the counter at that offset as an unsigned int in every queue.
 * The table is empty when the driver is built without SFXGE_LRO.
 */
#define	SFXGE_RX_STAT(stat, member) \
	{ .name = #stat, .offset = offsetof(struct sfxge_rxq, member) }

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
1331 
1332 static int
1333 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1334 {
1335 	struct sfxge_softc *sc = arg1;
1336 	unsigned int id = arg2;
1337 	unsigned int sum, index;
1338 
1339 	/* Sum across all RX queues */
1340 	sum = 0;
1341 	for (index = 0; index < sc->rxq_count; index++)
1342 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1343 					 sfxge_rx_stats[id].offset);
1344 
1345 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1346 }
1347 
1348 static void
1349 sfxge_rx_stat_init(struct sfxge_softc *sc)
1350 {
1351 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1352 	struct sysctl_oid_list *stat_list;
1353 	unsigned int id;
1354 
1355 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1356 
1357 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1358 		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1359 		    sfxge_rx_stats[id].name,
1360 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1361 		    sc, id, sfxge_rx_stat_handler, "IU", "");
1362 	}
1363 }
1364 
1365 void
1366 sfxge_rx_fini(struct sfxge_softc *sc)
1367 {
1368 	int index;
1369 
1370 	index = sc->rxq_count;
1371 	while (--index >= 0)
1372 		sfxge_rx_qfini(sc, index);
1373 
1374 	sc->rxq_count = 0;
1375 }
1376 
1377 int
1378 sfxge_rx_init(struct sfxge_softc *sc)
1379 {
1380 	struct sfxge_intr *intr;
1381 	int index;
1382 	int rc;
1383 
1384 #ifdef SFXGE_LRO
1385 	if (!ISP2(lro_table_size)) {
1386 		log(LOG_ERR, "%s=%u must be power of 2",
1387 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1388 		rc = EINVAL;
1389 		goto fail_lro_table_size;
1390 	}
1391 
1392 	if (lro_idle_ticks == 0)
1393 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1394 #endif
1395 
1396 	intr = &sc->intr;
1397 
1398 	sc->rxq_count = intr->n_alloc;
1399 
1400 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1401 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1402 
1403 	/* Initialize the receive queue(s) - one per interrupt. */
1404 	for (index = 0; index < sc->rxq_count; index++) {
1405 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1406 			goto fail;
1407 	}
1408 
1409 	sfxge_rx_stat_init(sc);
1410 
1411 	return (0);
1412 
1413 fail:
1414 	/* Tear down the receive queue(s). */
1415 	while (--index >= 0)
1416 		sfxge_rx_qfini(sc, index);
1417 
1418 	sc->rxq_count = 0;
1419 
1420 #ifdef SFXGE_LRO
1421 fail_lro_table_size:
1422 #endif
1423 	return (rc);
1424 }
1425