xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
5  * All rights reserved.
6  *
7  * This software was developed in part by Philip Paeps under contract for
8  * Solarflare Communications, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice,
14  *    this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  *    this list of conditions and the following disclaimer in the documentation
17  *    and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are
32  * those of the authors and should not be interpreted as representing official
33  * policies, either expressed or implied, of the FreeBSD Project.
34  */
35 
36 #include <sys/cdefs.h>
37 #include "opt_rss.h"
38 
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
47 #include <sys/syslog.h>
48 
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57 
58 #include <machine/in_cksum.h>
59 
60 #ifdef RSS
61 #include <net/rss_config.h>
62 #endif
63 
64 #include "common/efx.h"
65 
66 #include "sfxge.h"
67 #include "sfxge_rx.h"
68 
/* Nine tenths of EFX_RXQ_LIMIT(_entries), computed with integer arithmetic. */
#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
70 
71 #ifdef SFXGE_LRO
72 
/* Parent sysctl node for the LRO parameters declared below. */
SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Large receive offload (LRO) parameters");

/* Name an LRO tunable by handing "lro.<param>" to SFXGE_PARAM(). */
#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before it's LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 *
 * NOTE(review): declared int but exported through SYSCTL_UINT -- confirm
 * this is intentional.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 *
 * NOTE(review): declared int but exported through SYSCTL_UINT -- confirm
 * this is intentional.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
/* Non-zero when the connection's l2_id carries the VLAN flag. */
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
/* 1 when the connection's l2_id does not carry the IPv6 flag, else 0. */
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
135 
/*
 * Compare IPv6 addresses, avoiding conditional branches.
 *
 * Returns zero when the two addresses are identical and a non-zero value
 * otherwise; the magnitude of a non-zero result carries no meaning.
 *
 * The addresses are copied into aligned temporaries with memcpy() rather
 * than being read through cast pointers.  Callers pass pointers into
 * packet headers, which are only known to be 32-bit aligned, so a direct
 * 64-bit load could be misaligned.
 */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	uint64_t l[2], r[2];

	memcpy(l, left, sizeof(l));
	memcpy(r, right, sizeof(r));
	return (l[0] - r[0]) | (l[1] - r[1]);
#else
	uint32_t l[4], r[4];

	memcpy(l, left, sizeof(l));
	memcpy(r, right, sizeof(r));
	return (l[0] - r[0]) |
	       (l[1] - r[1]) |
	       (l[2] - r[2]) |
	       (l[3] - r[3]);
#endif
}
151 
152 #endif	/* SFXGE_LRO */
153 
154 void
155 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
156 {
157 
158 	rxq->flush_state = SFXGE_FLUSH_DONE;
159 }
160 
161 void
162 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
163 {
164 
165 	rxq->flush_state = SFXGE_FLUSH_FAILED;
166 }
167 
/*
 * Hash key storage (presumably the Toeplitz RSS key, going by the name).
 * With the RSS kernel option only RSS_KEYSIZE bytes of storage are
 * reserved here; otherwise a fixed 40-byte key is compiled in.
 * NOTE(review): the code that fills in and programs this key is not in
 * this part of the file -- confirm against the RSS setup path.
 */
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif
179 
180 static void
181 sfxge_rx_post_refill(void *arg)
182 {
183 	struct sfxge_rxq *rxq = arg;
184 	struct sfxge_softc *sc;
185 	unsigned int index;
186 	struct sfxge_evq *evq;
187 	uint16_t magic;
188 
189 	sc = rxq->sc;
190 	index = rxq->index;
191 	evq = sc->evq[index];
192 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
193 
194 	/* This is guaranteed due to the start/stop order of rx and ev */
195 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
196 	    ("evq not started"));
197 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
198 	    ("rxq not started"));
199 	efx_ev_qpost(evq->common, magic);
200 }
201 
202 static void
203 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
204 {
205 	/* Initially retry after 100 ms, but back off in case of
206 	 * repeated failures as we probably have to wait for the
207 	 * administrator to raise the pool limit. */
208 	if (retrying)
209 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
210 	else
211 		rxq->refill_delay = hz / 10;
212 
213 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
214 			     sfxge_rx_post_refill, rxq);
215 }
216 
217 #define	SFXGE_REFILL_BATCH  64
218 
219 static void
220 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
221 {
222 	struct sfxge_softc *sc;
223 	unsigned int index;
224 	struct sfxge_evq *evq __diagused;
225 	unsigned int batch;
226 	unsigned int rxfill;
227 	unsigned int mblksize;
228 	int ntodo;
229 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
230 
231 	sc = rxq->sc;
232 	index = rxq->index;
233 	evq = sc->evq[index];
234 
235 	prefetch_read_many(sc->enp);
236 	prefetch_read_many(rxq->common);
237 
238 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
239 
240 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
241 		return;
242 
243 	rxfill = rxq->added - rxq->completed;
244 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
245 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
246 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
247 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
248 	    ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
249 
250 	if (ntodo == 0)
251 		return;
252 
253 	batch = 0;
254 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
255 	while (ntodo-- > 0) {
256 		unsigned int id;
257 		struct sfxge_rx_sw_desc *rx_desc;
258 		bus_dma_segment_t seg;
259 		struct mbuf *m;
260 
261 		id = (rxq->added + batch) & rxq->ptr_mask;
262 		rx_desc = &rxq->queue[id];
263 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
264 
265 		rx_desc->flags = EFX_DISCARD;
266 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
267 		    sc->rx_cluster_size);
268 		if (m == NULL)
269 			break;
270 
271 		/* m_len specifies length of area to be mapped for DMA */
272 		m->m_len  = mblksize;
273 		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
274 						   CACHE_LINE_SIZE);
275 		m->m_data += sc->rx_buffer_align;
276 
277 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278 		addr[batch++] = seg.ds_addr;
279 
280 		if (batch == SFXGE_REFILL_BATCH) {
281 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
282 			    rxq->completed, rxq->added);
283 			rxq->added += batch;
284 			batch = 0;
285 		}
286 	}
287 
288 	if (ntodo != 0)
289 		sfxge_rx_schedule_refill(rxq, retrying);
290 
291 	if (batch != 0) {
292 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
293 		    rxq->completed, rxq->added);
294 		rxq->added += batch;
295 	}
296 
297 	/* Make the descriptors visible to the hardware */
298 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299 			BUS_DMASYNC_PREWRITE);
300 
301 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302 
303 	/* The queue could still be empty if no descriptors were actually
304 	 * pushed, in which case there will be no event to cause the next
305 	 * refill, so we must schedule a refill ourselves.
306 	 */
307 	if(rxq->pushed == rxq->completed) {
308 		sfxge_rx_schedule_refill(rxq, retrying);
309 	}
310 }
311 
312 void
313 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314 {
315 
316 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317 		return;
318 
319 	/* Make sure the queue is full */
320 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321 }
322 
323 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324 {
325 	if_t ifp = sc->ifnet;
326 
327 	m->m_pkthdr.rcvif = ifp;
328 	m->m_pkthdr.csum_data = 0xffff;
329 	if_input(ifp, m);
330 }
331 
332 static void
333 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
334 {
335 	struct sfxge_softc *sc = rxq->sc;
336 	struct mbuf *m = rx_desc->mbuf;
337 	int flags = rx_desc->flags;
338 	int csum_flags;
339 
340 	/* Convert checksum flags */
341 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
342 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
343 	if (flags & EFX_CKSUM_TCPUDP)
344 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
345 
346 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
347 		m->m_pkthdr.flowid =
348 			efx_pseudo_hdr_hash_get(rxq->common,
349 						EFX_RX_HASHALG_TOEPLITZ,
350 						mtod(m, uint8_t *));
351 		/* The hash covers a 4-tuple for TCP only */
352 		M_HASHTYPE_SET(m,
353 		    (flags & EFX_PKT_IPV4) ?
354 			((flags & EFX_PKT_TCP) ?
355 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
356 			((flags & EFX_PKT_TCP) ?
357 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
358 	}
359 	m->m_data += sc->rx_prefix_size;
360 	m->m_len = rx_desc->size - sc->rx_prefix_size;
361 	m->m_pkthdr.len = m->m_len;
362 	m->m_pkthdr.csum_flags = csum_flags;
363 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
364 
365 	rx_desc->flags = EFX_DISCARD;
366 	rx_desc->mbuf = NULL;
367 }
368 
369 #ifdef SFXGE_LRO
370 
371 static void
372 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
373 {
374 	struct sfxge_softc *sc = st->sc;
375 	struct mbuf *m = c->mbuf;
376 	struct tcphdr *c_th;
377 	int csum_flags;
378 
379 	KASSERT(m, ("no mbuf to deliver"));
380 
381 	++st->n_bursts;
382 
383 	/* Finish off packet munging and recalculate IP header checksum. */
384 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
385 		struct ip *iph = c->nh;
386 		iph->ip_len = htons(iph->ip_len);
387 		iph->ip_sum = 0;
388 		iph->ip_sum = in_cksum_hdr(iph);
389 		c_th = (struct tcphdr *)(iph + 1);
390 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
391 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
392 	} else {
393 		struct ip6_hdr *iph = c->nh;
394 		iph->ip6_plen = htons(iph->ip6_plen);
395 		c_th = (struct tcphdr *)(iph + 1);
396 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
397 	}
398 
399 	c_th->th_win = c->th_last->th_win;
400 	c_th->th_ack = c->th_last->th_ack;
401 	if (c_th->th_off == c->th_last->th_off) {
402 		/* Copy TCP options (take care to avoid going negative). */
403 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
404 		memcpy(c_th + 1, c->th_last + 1, optlen);
405 	}
406 
407 	m->m_pkthdr.flowid = c->conn_hash;
408 	M_HASHTYPE_SET(m,
409 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
410 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
411 
412 	m->m_pkthdr.csum_flags = csum_flags;
413 	__sfxge_rx_deliver(sc, m);
414 
415 	c->mbuf = NULL;
416 	c->delivered = 1;
417 }
418 
419 /* Drop the given connection, and add it to the free list. */
420 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
421 {
422 	unsigned bucket;
423 
424 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
425 
426 	if (c->next_buf.mbuf != NULL) {
427 		sfxge_rx_deliver(rxq, &c->next_buf);
428 		LIST_REMOVE(c, active_link);
429 	}
430 
431 	bucket = c->conn_hash & rxq->lro.conns_mask;
432 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
433 	--rxq->lro.conns_n[bucket];
434 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
435 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
436 }
437 
438 /* Stop tracking connections that have gone idle in order to keep hash
439  * chains short.
440  */
441 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
442 {
443 	struct sfxge_lro_conn *c;
444 	unsigned i;
445 
446 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
447 		("found active connections"));
448 
449 	rxq->lro.last_purge_ticks = now;
450 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
451 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
452 			continue;
453 
454 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
455 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
456 			++rxq->lro.n_drop_idle;
457 			sfxge_lro_drop(rxq, c);
458 		}
459 	}
460 }
461 
462 static void
463 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
464 		struct mbuf *mbuf, struct tcphdr *th)
465 {
466 	struct tcphdr *c_th;
467 
468 	/* Tack the new mbuf onto the chain. */
469 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
470 	c->mbuf_tail->m_next = mbuf;
471 	c->mbuf_tail = mbuf;
472 
473 	/* Increase length appropriately */
474 	c->mbuf->m_pkthdr.len += mbuf->m_len;
475 
476 	/* Update the connection state flags */
477 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
478 		struct ip *iph = c->nh;
479 		iph->ip_len += mbuf->m_len;
480 		c_th = (struct tcphdr *)(iph + 1);
481 	} else {
482 		struct ip6_hdr *iph = c->nh;
483 		iph->ip6_plen += mbuf->m_len;
484 		c_th = (struct tcphdr *)(iph + 1);
485 	}
486 	tcp_set_flags(c_th, tcp_get_flags(c_th) | (tcp_get_flags(th) & TH_PUSH));
487 	c->th_last = th;
488 	++st->n_merges;
489 
490 	/* Pass packet up now if another segment could overflow the IP
491 	 * length.
492 	 */
493 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
494 		sfxge_lro_deliver(st, c);
495 }
496 
497 static void
498 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
499 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
500 {
501 	/* Start the chain */
502 	c->mbuf = mbuf;
503 	c->mbuf_tail = c->mbuf;
504 	c->nh = nh;
505 	c->th_last = th;
506 
507 	mbuf->m_pkthdr.len = mbuf->m_len;
508 
509 	/* Mangle header fields for later processing */
510 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
511 		struct ip *iph = nh;
512 		iph->ip_len = ntohs(iph->ip_len);
513 	} else {
514 		struct ip6_hdr *iph = nh;
515 		iph->ip6_plen = ntohs(iph->ip6_plen);
516 	}
517 }
518 
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 *
 * Returns 0 only when the connection has been dropped (it went idle, or
 * the segment carried FIN/RST once the connection was past the slow-start
 * threshold); in that case sfxge_lro_drop() has already delivered the
 * buffered packet.  Returns 1 otherwise, after the buffered packet has
 * either been chained onto c->mbuf or delivered on its own.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	/*
	 * Locate the TCP header and compute the packet length measured from
	 * the Ethernet header.  The TCP header is taken to follow the fixed
	 * IP header directly in both cases.
	 */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	/* Bytes from the Ethernet header to the start of the TCP payload. */
	hdr_length = (char *) th + th->th_off * 4 - eh;
	/* Payload bytes, bounded by what was actually received. */
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	/* Never merge empty segments or ones carrying URG/SYN/RST/FIN. */
	dont_merge = ((data_length <= 0)
		      | (tcp_get_flags(th) & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	/* Retire the connection if it has been idle for too long. */
	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		/* Flush the held chain first so ordering is preserved. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (tcp_get_flags(th) & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	/* Skip the hardware prefix; from here the mbuf joins a chain. */
	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	/* The mbuf now belongs to the connection's chain. */
	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}
625 
626 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
627 			       uint16_t l2_id, void *nh, struct tcphdr *th)
628 {
629 	unsigned bucket = conn_hash & st->conns_mask;
630 	struct sfxge_lro_conn *c;
631 
632 	if (st->conns_n[bucket] >= lro_chain_max) {
633 		++st->n_too_many;
634 		return;
635 	}
636 
637 	if (!TAILQ_EMPTY(&st->free_conns)) {
638 		c = TAILQ_FIRST(&st->free_conns);
639 		TAILQ_REMOVE(&st->free_conns, c, link);
640 	} else {
641 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
642 		if (c == NULL)
643 			return;
644 		c->mbuf = NULL;
645 		c->next_buf.mbuf = NULL;
646 	}
647 
648 	/* Create the connection tracking data */
649 	++st->conns_n[bucket];
650 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
651 	c->l2_id = l2_id;
652 	c->conn_hash = conn_hash;
653 	c->source = th->th_sport;
654 	c->dest = th->th_dport;
655 	c->n_in_order_pkts = 0;
656 	c->last_pkt_ticks = *(volatile int *)&ticks;
657 	c->delivered = 0;
658 	++st->n_new_stream;
659 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
660 	 * value it has.  Most likely the next packet received for this
661 	 * connection will not match -- no harm done.
662 	 */
663 }
664 
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 *
 * The packet in rx_buf is either delivered immediately, or parked in the
 * matching connection's next_buf slot, in which case rx_buf is emptied.
 * A packet that matches no tracked connection is delivered immediately
 * after a new connection has been set up for it where possible.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	/* The Ethernet header follows the hardware prefix in the buffer. */
	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		/* Reject headers with IP options and any fragment. */
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	/*
	 * Search the bucket for this flow.  The comparisons are written as
	 * subtract-and-OR so that each test is a single branch.
	 */
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		/*
		 * IP addresses are compared only while the connection holds
		 * a chain: c->nh is set by sfxge_lro_start() and points into
		 * that chain's first mbuf.
		 */
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		/*
		 * Process the previously parked packet first.  If that drops
		 * the connection, deliver this packet directly.  A connection
		 * with nothing parked becomes active.
		 */
		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		/* Park this packet on the connection. */
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	/* No match: start tracking the flow and deliver this packet as-is. */
	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}
771 
772 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
773 {
774 	struct sfxge_lro_state *st = &rxq->lro;
775 	struct sfxge_lro_conn *c;
776 	unsigned t;
777 
778 	while (!LIST_EMPTY(&st->active_conns)) {
779 		c = LIST_FIRST(&st->active_conns);
780 		if (!c->delivered && c->mbuf != NULL)
781 			sfxge_lro_deliver(st, c);
782 		if (sfxge_lro_try_merge(rxq, c)) {
783 			if (c->mbuf != NULL)
784 				sfxge_lro_deliver(st, c);
785 			LIST_REMOVE(c, active_link);
786 		}
787 		c->delivered = 0;
788 	}
789 
790 	t = *(volatile int *)&ticks;
791 	if (__predict_false(t != st->last_purge_ticks))
792 		sfxge_lro_purge_idle(rxq, t);
793 }
794 
795 #else	/* !SFXGE_LRO */
796 
/*
 * LRO support is compiled out, so there is nothing to coalesce: hand the
 * packet straight to the stack.  An empty stub would leave the mbuf
 * stranded in the descriptor if this were ever reached with LRO enabled
 * on the interface.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{

	sfxge_rx_deliver(rxq, rx_buf);
}
801 
/*
 * LRO support is compiled out: no connection state exists, so there is
 * nothing to flush at the end of a burst.
 */
static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
806 
807 #endif	/* SFXGE_LRO */
808 
809 void
810 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
811 {
812 	struct sfxge_softc *sc = rxq->sc;
813 	int if_capenable = if_getcapenable(sc->ifnet);
814 	int lro_enabled = if_capenable & IFCAP_LRO;
815 	unsigned int index;
816 	struct sfxge_evq *evq __diagused;
817 	unsigned int completed;
818 	unsigned int level;
819 	struct mbuf *m;
820 	struct sfxge_rx_sw_desc *prev = NULL;
821 
822 	index = rxq->index;
823 	evq = sc->evq[index];
824 
825 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
826 
827 	completed = rxq->completed;
828 	while (completed != rxq->pending) {
829 		unsigned int id;
830 		struct sfxge_rx_sw_desc *rx_desc;
831 
832 		id = completed++ & rxq->ptr_mask;
833 		rx_desc = &rxq->queue[id];
834 		m = rx_desc->mbuf;
835 
836 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
837 			goto discard;
838 
839 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
840 			goto discard;
841 
842 		/* Read the length from the pseudo header if required */
843 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
844 			uint16_t tmp_size;
845 			int rc __diagused;
846 
847 			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
848 							   mtod(m, uint8_t *),
849 							   &tmp_size);
850 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
851 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
852 		}
853 
854 		prefetch_read_many(mtod(m, caddr_t));
855 
856 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
857 		case EFX_PKT_IPV4:
858 			if (~if_capenable & IFCAP_RXCSUM)
859 				rx_desc->flags &=
860 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
861 			break;
862 		case EFX_PKT_IPV6:
863 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
864 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
865 			break;
866 		case 0:
867 			/* Check for loopback packets */
868 			{
869 				struct ether_header *etherhp;
870 
871 				/*LINTED*/
872 				etherhp = mtod(m, struct ether_header *);
873 
874 				if (etherhp->ether_type ==
875 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
876 					EFSYS_PROBE(loopback);
877 
878 					rxq->loopback++;
879 					goto discard;
880 				}
881 			}
882 			break;
883 		default:
884 			KASSERT(B_FALSE,
885 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
886 			goto discard;
887 		}
888 
889 		/* Pass packet up the stack or into LRO (pipelined) */
890 		if (prev != NULL) {
891 			if (lro_enabled &&
892 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
893 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
894 				sfxge_lro(rxq, prev);
895 			else
896 				sfxge_rx_deliver(rxq, prev);
897 		}
898 		prev = rx_desc;
899 		continue;
900 
901 discard:
902 		/* Return the packet to the pool */
903 		m_free(m);
904 		rx_desc->mbuf = NULL;
905 	}
906 	rxq->completed = completed;
907 
908 	level = rxq->added - rxq->completed;
909 
910 	/* Pass last packet up the stack or into LRO */
911 	if (prev != NULL) {
912 		if (lro_enabled &&
913 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
914 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
915 			sfxge_lro(rxq, prev);
916 		else
917 			sfxge_rx_deliver(rxq, prev);
918 	}
919 
920 	/*
921 	 * If there are any pending flows and this is the end of the
922 	 * poll then they must be completed.
923 	 */
924 	if (eop)
925 		sfxge_lro_end_of_burst(rxq);
926 
927 	/* Top up the queue if necessary */
928 	if (level < rxq->refill_threshold)
929 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
930 }
931 
/*
 * Stop receive queue 'index': flush it, release every posted buffer and
 * destroy the common-code queue.
 *
 * The adapter lock must be held by the caller.  The event queue lock is
 * taken here, but released while each flush is requested and waited for.
 * A flush is attempted up to three times; each attempt waits up to
 * 20 x 100 ms for the flush state to leave SFXGE_FLUSH_PENDING.  Timeouts
 * and failures are logged, after which the queue is treated as flushed.
 */
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	/* From here on sfxge_rx_qfill() and sfxge_rx_qrefill() do nothing. */
	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		/* Poll until the flush state changes or the wait times out. */
		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	/*
	 * Treat every posted buffer as completed.  The queue is no longer
	 * STARTED, so sfxge_rx_qcomplete() frees each one.
	 */
	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}
1012 
1013 static int
1014 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1015 {
1016 	struct sfxge_rxq *rxq;
1017 	efsys_mem_t *esmp;
1018 	struct sfxge_evq *evq;
1019 	int rc;
1020 
1021 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1022 
1023 	rxq = sc->rxq[index];
1024 	esmp = &rxq->mem;
1025 	evq = sc->evq[index];
1026 
1027 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1028 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1029 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1030 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1031 
1032 	/* Program the buffer table. */
1033 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1034 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1035 		return (rc);
1036 
1037 	/* Create the common code receive queue. */
1038 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1039 	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
1040 	    evq->common, &rxq->common)) != 0)
1041 		goto fail;
1042 
1043 	SFXGE_EVQ_LOCK(evq);
1044 
1045 	/* Enable the receive queue. */
1046 	efx_rx_qenable(rxq->common);
1047 
1048 	rxq->init_state = SFXGE_RXQ_STARTED;
1049 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1050 
1051 	/* Try to fill the queue from the pool. */
1052 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1053 
1054 	SFXGE_EVQ_UNLOCK(evq);
1055 
1056 	return (0);
1057 
1058 fail:
1059 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1060 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1061 	return (rc);
1062 }
1063 
1064 void
1065 sfxge_rx_stop(struct sfxge_softc *sc)
1066 {
1067 	int index;
1068 
1069 	efx_mac_filter_default_rxq_clear(sc->enp);
1070 
1071 	/* Stop the receive queue(s) */
1072 	index = sc->rxq_count;
1073 	while (--index >= 0)
1074 		sfxge_rx_qstop(sc, index);
1075 
1076 	sc->rx_prefix_size = 0;
1077 	sc->rx_buffer_size = 0;
1078 
1079 	efx_rx_fini(sc->enp);
1080 }
1081 
1082 int
1083 sfxge_rx_start(struct sfxge_softc *sc)
1084 {
1085 	const efx_nic_cfg_t *encp;
1086 	size_t hdrlen, align, reserved;
1087 	int index;
1088 	int rc;
1089 
1090 	/* Initialize the common code receive module. */
1091 	if ((rc = efx_rx_init(sc->enp)) != 0)
1092 		return (rc);
1093 
1094 	encp = efx_nic_cfg_get(sc->enp);
1095 	sc->rx_buffer_size = EFX_MAC_PDU(if_getmtu(sc->ifnet));
1096 
1097 	/* Calculate the receive packet buffer size. */
1098 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1099 
1100 	/* Ensure IP headers are 32bit aligned */
1101 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1102 	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1103 
1104 	sc->rx_buffer_size += sc->rx_buffer_align;
1105 
1106 	/* Align end of packet buffer for RX DMA end padding */
1107 	align = MAX(1, encp->enc_rx_buf_align_end);
1108 	EFSYS_ASSERT(ISP2(align));
1109 	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1110 
1111 	/*
1112 	 * Standard mbuf zones only guarantee pointer-size alignment;
1113 	 * we need extra space to align to the cache line
1114 	 */
1115 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1116 
1117 	/* Select zone for packet buffers */
1118 	if (reserved <= MCLBYTES)
1119 		sc->rx_cluster_size = MCLBYTES;
1120 	else if (reserved <= MJUMPAGESIZE)
1121 		sc->rx_cluster_size = MJUMPAGESIZE;
1122 	else if (reserved <= MJUM9BYTES)
1123 		sc->rx_cluster_size = MJUM9BYTES;
1124 	else
1125 		sc->rx_cluster_size = MJUM16BYTES;
1126 
1127 	/*
1128 	 * Set up the scale table.  Enable all hash types and hash insertion.
1129 	 */
1130 	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1131 #ifdef RSS
1132 		sc->rx_indir_table[index] =
1133 			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1134 #else
1135 		sc->rx_indir_table[index] = index % sc->rxq_count;
1136 #endif
1137 	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1138 				       sc->rx_indir_table,
1139 				       nitems(sc->rx_indir_table))) != 0)
1140 		goto fail;
1141 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1142 	    EFX_RX_HASHALG_TOEPLITZ,
1143 	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1144 	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1145 
1146 #ifdef RSS
1147 	rss_getkey(toep_key);
1148 #endif
1149 	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1150 				       toep_key,
1151 				       sizeof(toep_key))) != 0)
1152 		goto fail;
1153 
1154 	/* Start the receive queue(s). */
1155 	for (index = 0; index < sc->rxq_count; index++) {
1156 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1157 			goto fail2;
1158 	}
1159 
1160 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1161 					    sc->intr.n_alloc > 1);
1162 	if (rc != 0)
1163 		goto fail3;
1164 
1165 	return (0);
1166 
1167 fail3:
1168 fail2:
1169 	while (--index >= 0)
1170 		sfxge_rx_qstop(sc, index);
1171 
1172 fail:
1173 	efx_rx_fini(sc->enp);
1174 
1175 	return (rc);
1176 }
1177 
1178 #ifdef SFXGE_LRO
1179 
1180 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1181 {
1182 	struct sfxge_lro_state *st = &rxq->lro;
1183 	unsigned i;
1184 
1185 	st->conns_mask = lro_table_size - 1;
1186 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1187 		("lro_table_size must be a power of 2"));
1188 	st->sc = rxq->sc;
1189 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1190 			   M_SFXGE, M_WAITOK);
1191 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1192 			     M_SFXGE, M_WAITOK);
1193 	for (i = 0; i <= st->conns_mask; ++i) {
1194 		TAILQ_INIT(&st->conns[i]);
1195 		st->conns_n[i] = 0;
1196 	}
1197 	LIST_INIT(&st->active_conns);
1198 	TAILQ_INIT(&st->free_conns);
1199 }
1200 
1201 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1202 {
1203 	struct sfxge_lro_state *st = &rxq->lro;
1204 	struct sfxge_lro_conn *c;
1205 	unsigned i;
1206 
1207 	/* Return cleanly if sfxge_lro_init() has not been called. */
1208 	if (st->conns == NULL)
1209 		return;
1210 
1211 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1212 
1213 	for (i = 0; i <= st->conns_mask; ++i) {
1214 		while (!TAILQ_EMPTY(&st->conns[i])) {
1215 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1216 			sfxge_lro_drop(rxq, c);
1217 		}
1218 	}
1219 
1220 	while (!TAILQ_EMPTY(&st->free_conns)) {
1221 		c = TAILQ_FIRST(&st->free_conns);
1222 		TAILQ_REMOVE(&st->free_conns, c, link);
1223 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1224 		free(c, M_SFXGE);
1225 	}
1226 
1227 	free(st->conns_n, M_SFXGE);
1228 	free(st->conns, M_SFXGE);
1229 	st->conns = NULL;
1230 }
1231 
1232 #else
1233 
/* LRO support is compiled out: there is no per-queue state to set up. */
static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1238 
/* LRO support is compiled out: there is no per-queue state to tear down. */
static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
1243 
1244 #endif	/* SFXGE_LRO */
1245 
1246 static void
1247 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1248 {
1249 	struct sfxge_rxq *rxq;
1250 
1251 	rxq = sc->rxq[index];
1252 
1253 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1254 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1255 
1256 	/* Free the context array and the flow table. */
1257 	free(rxq->queue, M_SFXGE);
1258 	sfxge_lro_fini(rxq);
1259 
1260 	/* Release DMA memory. */
1261 	sfxge_dma_free(&rxq->mem);
1262 
1263 	sc->rxq[index] = NULL;
1264 
1265 	free(rxq, M_SFXGE);
1266 }
1267 
1268 static int
1269 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1270 {
1271 	struct sfxge_rxq *rxq;
1272 	efsys_mem_t *esmp;
1273 	int rc;
1274 
1275 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1276 
1277 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1278 	rxq->sc = sc;
1279 	rxq->index = index;
1280 	rxq->entries = sc->rxq_entries;
1281 	rxq->ptr_mask = rxq->entries - 1;
1282 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1283 
1284 	sc->rxq[index] = rxq;
1285 	esmp = &rxq->mem;
1286 
1287 	/* Allocate and zero DMA space. */
1288 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1289 		return (rc);
1290 
1291 	/* Allocate buffer table entries. */
1292 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1293 				 &rxq->buf_base_id);
1294 
1295 	/* Allocate the context array and the flow table. */
1296 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1297 	    M_SFXGE, M_WAITOK | M_ZERO);
1298 	sfxge_lro_init(rxq);
1299 
1300 	callout_init(&rxq->refill_callout, 1);
1301 
1302 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1303 
1304 	return (0);
1305 }
1306 
/*
 * Table of per-queue receive statistics.  Each entry pairs a statistic name
 * with the byte offset of its counter inside struct sfxge_rxq; the sysctl
 * handler uses the offset to read the counter from every queue.  Entries
 * exist only when SFXGE_LRO is defined.
 */
#define	SFXGE_RX_STAT(stat, field) \
	{ #stat, offsetof(struct sfxge_rxq, field) }

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};
1324 
1325 static int
1326 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1327 {
1328 	struct sfxge_softc *sc = arg1;
1329 	unsigned int id = arg2;
1330 	unsigned int sum, index;
1331 
1332 	/* Sum across all RX queues */
1333 	sum = 0;
1334 	for (index = 0; index < sc->rxq_count; index++)
1335 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1336 					 sfxge_rx_stats[id].offset);
1337 
1338 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1339 }
1340 
1341 static void
1342 sfxge_rx_stat_init(struct sfxge_softc *sc)
1343 {
1344 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1345 	struct sysctl_oid_list *stat_list;
1346 	unsigned int id;
1347 
1348 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1349 
1350 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1351 		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1352 		    sfxge_rx_stats[id].name,
1353 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1354 		    sc, id, sfxge_rx_stat_handler, "IU", "");
1355 	}
1356 }
1357 
1358 void
1359 sfxge_rx_fini(struct sfxge_softc *sc)
1360 {
1361 	int index;
1362 
1363 	index = sc->rxq_count;
1364 	while (--index >= 0)
1365 		sfxge_rx_qfini(sc, index);
1366 
1367 	sc->rxq_count = 0;
1368 }
1369 
1370 int
1371 sfxge_rx_init(struct sfxge_softc *sc)
1372 {
1373 	struct sfxge_intr *intr;
1374 	int index;
1375 	int rc;
1376 
1377 #ifdef SFXGE_LRO
1378 	if (!ISP2(lro_table_size)) {
1379 		log(LOG_ERR, "%s=%u must be power of 2",
1380 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1381 		rc = EINVAL;
1382 		goto fail_lro_table_size;
1383 	}
1384 
1385 	if (lro_idle_ticks == 0)
1386 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1387 #endif
1388 
1389 	intr = &sc->intr;
1390 
1391 	sc->rxq_count = intr->n_alloc;
1392 
1393 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1394 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1395 
1396 	/* Initialize the receive queue(s) - one per interrupt. */
1397 	for (index = 0; index < sc->rxq_count; index++) {
1398 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1399 			goto fail;
1400 	}
1401 
1402 	sfxge_rx_stat_init(sc);
1403 
1404 	return (0);
1405 
1406 fail:
1407 	/* Tear down the receive queue(s). */
1408 	while (--index >= 0)
1409 		sfxge_rx_qfini(sc, index);
1410 
1411 	sc->rxq_count = 0;
1412 
1413 #ifdef SFXGE_LRO
1414 fail_lro_table_size:
1415 #endif
1416 	return (rc);
1417 }
1418