xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision cc349066556bcdeed0d6cc72aad340d0f383e35c)
1 /*-
2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice,
12  *    this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * The views and conclusions contained in the software and documentation are
30  * those of the authors and should not be interpreted as representing official
31  * policies, either expressed or implied, of the FreeBSD Project.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_rss.h"
38 
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
47 #include <sys/syslog.h>
48 
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57 
58 #include <machine/in_cksum.h>
59 
60 #ifdef RSS
61 #include <net/rss_config.h>
62 #endif
63 
64 #include "common/efx.h"
65 
66 
67 #include "sfxge.h"
68 #include "sfxge_rx.h"
69 
/*
 * Nine tenths of the usable ring size for a ring of _entries descriptors.
 * NOTE(review): presumably the source of rxq->refill_threshold -- the
 * assignment is not visible in this part of the file.
 */
#define	RX_REFILL_THRESHOLD(_entries)	((EFX_RXQ_LIMIT(_entries) * 9) / 10)
71 
72 #ifdef SFXGE_LRO
73 
74 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
75 	    "Large receive offload (LRO) parameters");
76 
77 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
78 
79 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
80  * means we can accelerate a larger number of streams.
81  */
82 static unsigned lro_table_size = 128;
83 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
84 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
85 	    &lro_table_size, 0,
86 	    "Size of the LRO hash table (must be a power of 2)");
87 
88 /* Maximum length of a hash chain.  If chains get too long then the lookup
89  * time increases and may exceed the benefit of LRO.
90  */
91 static unsigned lro_chain_max = 20;
92 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
93 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
94 	    &lro_chain_max, 0,
95 	    "The maximum length of a hash chain");
96 
97 /* Maximum time (in ticks) that a connection can be idle before it's LRO
98  * state is discarded.
99  */
100 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
101 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
103 	    &lro_idle_ticks, 0,
104 	    "The maximum time (in ticks) that a connection can be idle "
105 	    "before it's LRO state is discarded");
106 
107 /* Number of packets with payload that must arrive in-order before a
108  * connection is eligible for LRO.  The idea is we should avoid coalescing
109  * segments when the sender is in slow-start because reducing the ACK rate
110  * can damage performance.
111  */
112 static int lro_slow_start_packets = 2000;
113 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
115 	    &lro_slow_start_packets, 0,
116 	    "Number of packets with payload that must arrive in-order before "
117 	    "a connection is eligible for LRO");
118 
119 /* Number of packets with payload that must arrive in-order following loss
120  * before a connection is eligible for LRO.  The idea is we should avoid
121  * coalescing segments when the sender is recovering from loss, because
122  * reducing the ACK rate can damage performance.
123  */
124 static int lro_loss_packets = 20;
125 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
126 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
127 	    &lro_loss_packets, 0,
128 	    "Number of packets with payload that must arrive in-order "
129 	    "following loss before a connection is eligible for LRO");
130 
131 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
132 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
133 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
134 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
135 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
136 
/*
 * Compare two IPv6 addresses, avoiding conditional branches.
 *
 * Returns zero when the two addresses are identical and some non-zero
 * value otherwise; only the zero/non-zero distinction is meaningful.
 *
 * The address words are fetched with memcpy() instead of casting the
 * address to a wider integer pointer.  Callers pass addresses that sit
 * inside packet headers, which nothing here guarantees to be 8-byte
 * aligned, and reading a struct in6_addr through a uint64_t pointer also
 * violates the C effective-type (strict aliasing) rules.
 */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	uint64_t l[2], r[2];

	memcpy(l, left, sizeof(l));
	memcpy(r, right, sizeof(r));
	return (l[0] - r[0]) | (l[1] - r[1]);
#else
	uint32_t l[4], r[4];

	memcpy(l, left, sizeof(l));
	memcpy(r, right, sizeof(r));
	return (l[0] - r[0]) | (l[1] - r[1]) |
	       (l[2] - r[2]) | (l[3] - r[3]);
#endif
}
152 
153 #endif	/* SFXGE_LRO */
154 
155 void
156 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
157 {
158 
159 	rxq->flush_state = SFXGE_FLUSH_DONE;
160 }
161 
162 void
163 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
164 {
165 
166 	rxq->flush_state = SFXGE_FLUSH_FAILED;
167 }
168 
/*
 * Hash key storage (presumably the Toeplitz RSS key, going by the name --
 * its consumer is not visible in this part of the file).  With the kernel
 * RSS option the array is sized by RSS_KEYSIZE and has no initialiser
 * here; otherwise a fixed 40-byte key is compiled in.
 */
#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda,
	0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d,
	0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb,
	0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3,
	0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b,
	0xbe, 0xac, 0x01, 0xfa
};
#endif
180 
181 static void
182 sfxge_rx_post_refill(void *arg)
183 {
184 	struct sfxge_rxq *rxq = arg;
185 	struct sfxge_softc *sc;
186 	unsigned int index;
187 	struct sfxge_evq *evq;
188 	uint16_t magic;
189 
190 	sc = rxq->sc;
191 	index = rxq->index;
192 	evq = sc->evq[index];
193 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
194 
195 	/* This is guaranteed due to the start/stop order of rx and ev */
196 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
197 	    ("evq not started"));
198 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
199 	    ("rxq not started"));
200 	efx_ev_qpost(evq->common, magic);
201 }
202 
203 static void
204 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
205 {
206 	/* Initially retry after 100 ms, but back off in case of
207 	 * repeated failures as we probably have to wait for the
208 	 * administrator to raise the pool limit. */
209 	if (retrying)
210 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
211 	else
212 		rxq->refill_delay = hz / 10;
213 
214 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
215 			     sfxge_rx_post_refill, rxq);
216 }
217 
218 #define	SFXGE_REFILL_BATCH  64
219 
220 static void
221 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
222 {
223 	struct sfxge_softc *sc;
224 	unsigned int index;
225 	struct sfxge_evq *evq;
226 	unsigned int batch;
227 	unsigned int rxfill;
228 	unsigned int mblksize;
229 	int ntodo;
230 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
231 
232 	sc = rxq->sc;
233 	index = rxq->index;
234 	evq = sc->evq[index];
235 
236 	prefetch_read_many(sc->enp);
237 	prefetch_read_many(rxq->common);
238 
239 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
240 
241 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
242 		return;
243 
244 	rxfill = rxq->added - rxq->completed;
245 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
246 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
247 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
248 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
249 	    ("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
250 
251 	if (ntodo == 0)
252 		return;
253 
254 	batch = 0;
255 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
256 	while (ntodo-- > 0) {
257 		unsigned int id;
258 		struct sfxge_rx_sw_desc *rx_desc;
259 		bus_dma_segment_t seg;
260 		struct mbuf *m;
261 
262 		id = (rxq->added + batch) & rxq->ptr_mask;
263 		rx_desc = &rxq->queue[id];
264 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
265 
266 		rx_desc->flags = EFX_DISCARD;
267 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
268 		    sc->rx_cluster_size);
269 		if (m == NULL)
270 			break;
271 
272 		/* m_len specifies length of area to be mapped for DMA */
273 		m->m_len  = mblksize;
274 		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
275 		m->m_data += sc->rx_buffer_align;
276 
277 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
278 		addr[batch++] = seg.ds_addr;
279 
280 		if (batch == SFXGE_REFILL_BATCH) {
281 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
282 			    rxq->completed, rxq->added);
283 			rxq->added += batch;
284 			batch = 0;
285 		}
286 	}
287 
288 	if (ntodo != 0)
289 		sfxge_rx_schedule_refill(rxq, retrying);
290 
291 	if (batch != 0) {
292 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
293 		    rxq->completed, rxq->added);
294 		rxq->added += batch;
295 	}
296 
297 	/* Make the descriptors visible to the hardware */
298 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
299 			BUS_DMASYNC_PREWRITE);
300 
301 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
302 
303 	/* The queue could still be empty if no descriptors were actually
304 	 * pushed, in which case there will be no event to cause the next
305 	 * refill, so we must schedule a refill ourselves.
306 	 */
307 	if(rxq->pushed == rxq->completed) {
308 		sfxge_rx_schedule_refill(rxq, retrying);
309 	}
310 }
311 
312 void
313 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
314 {
315 
316 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
317 		return;
318 
319 	/* Make sure the queue is full */
320 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
321 }
322 
323 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
324 {
325 	struct ifnet *ifp = sc->ifnet;
326 
327 	m->m_pkthdr.rcvif = ifp;
328 	m->m_pkthdr.csum_data = 0xffff;
329 	ifp->if_input(ifp, m);
330 }
331 
332 static void
333 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
334 {
335 	struct sfxge_softc *sc = rxq->sc;
336 	struct mbuf *m = rx_desc->mbuf;
337 	int flags = rx_desc->flags;
338 	int csum_flags;
339 
340 	/* Convert checksum flags */
341 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
342 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
343 	if (flags & EFX_CKSUM_TCPUDP)
344 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
345 
346 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
347 		m->m_pkthdr.flowid =
348 			efx_pseudo_hdr_hash_get(rxq->common,
349 						EFX_RX_HASHALG_TOEPLITZ,
350 						mtod(m, uint8_t *));
351 		/* The hash covers a 4-tuple for TCP only */
352 		M_HASHTYPE_SET(m,
353 		    (flags & EFX_PKT_IPV4) ?
354 			((flags & EFX_PKT_TCP) ?
355 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
356 			((flags & EFX_PKT_TCP) ?
357 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
358 	}
359 	m->m_data += sc->rx_prefix_size;
360 	m->m_len = rx_desc->size - sc->rx_prefix_size;
361 	m->m_pkthdr.len = m->m_len;
362 	m->m_pkthdr.csum_flags = csum_flags;
363 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
364 
365 	rx_desc->flags = EFX_DISCARD;
366 	rx_desc->mbuf = NULL;
367 }
368 
369 #ifdef SFXGE_LRO
370 
371 static void
372 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
373 {
374 	struct sfxge_softc *sc = st->sc;
375 	struct mbuf *m = c->mbuf;
376 	struct tcphdr *c_th;
377 	int csum_flags;
378 
379 	KASSERT(m, ("no mbuf to deliver"));
380 
381 	++st->n_bursts;
382 
383 	/* Finish off packet munging and recalculate IP header checksum. */
384 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
385 		struct ip *iph = c->nh;
386 		iph->ip_len = htons(iph->ip_len);
387 		iph->ip_sum = 0;
388 		iph->ip_sum = in_cksum_hdr(iph);
389 		c_th = (struct tcphdr *)(iph + 1);
390 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
391 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
392 	} else {
393 		struct ip6_hdr *iph = c->nh;
394 		iph->ip6_plen = htons(iph->ip6_plen);
395 		c_th = (struct tcphdr *)(iph + 1);
396 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
397 	}
398 
399 	c_th->th_win = c->th_last->th_win;
400 	c_th->th_ack = c->th_last->th_ack;
401 	if (c_th->th_off == c->th_last->th_off) {
402 		/* Copy TCP options (take care to avoid going negative). */
403 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
404 		memcpy(c_th + 1, c->th_last + 1, optlen);
405 	}
406 
407 	m->m_pkthdr.flowid = c->conn_hash;
408 	M_HASHTYPE_SET(m,
409 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
410 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
411 
412 	m->m_pkthdr.csum_flags = csum_flags;
413 	__sfxge_rx_deliver(sc, m);
414 
415 	c->mbuf = NULL;
416 	c->delivered = 1;
417 }
418 
419 /* Drop the given connection, and add it to the free list. */
420 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
421 {
422 	unsigned bucket;
423 
424 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
425 
426 	if (c->next_buf.mbuf != NULL) {
427 		sfxge_rx_deliver(rxq, &c->next_buf);
428 		LIST_REMOVE(c, active_link);
429 	}
430 
431 	bucket = c->conn_hash & rxq->lro.conns_mask;
432 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
433 	--rxq->lro.conns_n[bucket];
434 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
435 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
436 }
437 
438 /* Stop tracking connections that have gone idle in order to keep hash
439  * chains short.
440  */
441 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
442 {
443 	struct sfxge_lro_conn *c;
444 	unsigned i;
445 
446 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
447 		("found active connections"));
448 
449 	rxq->lro.last_purge_ticks = now;
450 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
451 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
452 			continue;
453 
454 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
455 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
456 			++rxq->lro.n_drop_idle;
457 			sfxge_lro_drop(rxq, c);
458 		}
459 	}
460 }
461 
462 static void
463 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
464 		struct mbuf *mbuf, struct tcphdr *th)
465 {
466 	struct tcphdr *c_th;
467 
468 	/* Tack the new mbuf onto the chain. */
469 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
470 	c->mbuf_tail->m_next = mbuf;
471 	c->mbuf_tail = mbuf;
472 
473 	/* Increase length appropriately */
474 	c->mbuf->m_pkthdr.len += mbuf->m_len;
475 
476 	/* Update the connection state flags */
477 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
478 		struct ip *iph = c->nh;
479 		iph->ip_len += mbuf->m_len;
480 		c_th = (struct tcphdr *)(iph + 1);
481 	} else {
482 		struct ip6_hdr *iph = c->nh;
483 		iph->ip6_plen += mbuf->m_len;
484 		c_th = (struct tcphdr *)(iph + 1);
485 	}
486 	c_th->th_flags |= (th->th_flags & TH_PUSH);
487 	c->th_last = th;
488 	++st->n_merges;
489 
490 	/* Pass packet up now if another segment could overflow the IP
491 	 * length.
492 	 */
493 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
494 		sfxge_lro_deliver(st, c);
495 }
496 
497 static void
498 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
499 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
500 {
501 	/* Start the chain */
502 	c->mbuf = mbuf;
503 	c->mbuf_tail = c->mbuf;
504 	c->nh = nh;
505 	c->th_last = th;
506 
507 	mbuf->m_pkthdr.len = mbuf->m_len;
508 
509 	/* Mangle header fields for later processing */
510 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
511 		struct ip *iph = nh;
512 		iph->ip_len = ntohs(iph->ip_len);
513 	} else {
514 		struct ip6_hdr *iph = nh;
515 		iph->ip6_plen = ntohs(iph->ip6_plen);
516 	}
517 }
518 
519 /* Try to merge or otherwise hold or deliver (as appropriate) the
520  * packet buffered for this connection (c->next_buf).  Return a flag
521  * indicating whether the connection is still active for LRO purposes.
522  */
523 static int
524 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
525 {
526 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
527 	char *eh = c->next_eh;
528 	int data_length, hdr_length, dont_merge;
529 	unsigned th_seq, pkt_length;
530 	struct tcphdr *th;
531 	unsigned now;
532 
533 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
534 		struct ip *iph = c->next_nh;
535 		th = (struct tcphdr *)(iph + 1);
536 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
537 	} else {
538 		struct ip6_hdr *iph = c->next_nh;
539 		th = (struct tcphdr *)(iph + 1);
540 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
541 	}
542 
543 	hdr_length = (char *) th + th->th_off * 4 - eh;
544 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
545 		       hdr_length);
546 	th_seq = ntohl(th->th_seq);
547 	dont_merge = ((data_length <= 0)
548 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
549 
550 	/* Check for options other than aligned timestamp. */
551 	if (th->th_off != 5) {
552 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
553 		if (th->th_off == 8 &&
554 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
555 					(TCPOPT_NOP << 16) |
556 					(TCPOPT_TIMESTAMP << 8) |
557 					TCPOLEN_TIMESTAMP)) {
558 			/* timestamp option -- okay */
559 		} else {
560 			dont_merge = 1;
561 		}
562 	}
563 
564 	if (__predict_false(th_seq != c->next_seq)) {
565 		/* Out-of-order, so start counting again. */
566 		if (c->mbuf != NULL)
567 			sfxge_lro_deliver(&rxq->lro, c);
568 		c->n_in_order_pkts -= lro_loss_packets;
569 		c->next_seq = th_seq + data_length;
570 		++rxq->lro.n_misorder;
571 		goto deliver_buf_out;
572 	}
573 	c->next_seq = th_seq + data_length;
574 
575 	now = ticks;
576 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
577 		++rxq->lro.n_drop_idle;
578 		if (c->mbuf != NULL)
579 			sfxge_lro_deliver(&rxq->lro, c);
580 		sfxge_lro_drop(rxq, c);
581 		return (0);
582 	}
583 	c->last_pkt_ticks = ticks;
584 
585 	if (c->n_in_order_pkts < lro_slow_start_packets) {
586 		/* May be in slow-start, so don't merge. */
587 		++rxq->lro.n_slow_start;
588 		++c->n_in_order_pkts;
589 		goto deliver_buf_out;
590 	}
591 
592 	if (__predict_false(dont_merge)) {
593 		if (c->mbuf != NULL)
594 			sfxge_lro_deliver(&rxq->lro, c);
595 		if (th->th_flags & (TH_FIN | TH_RST)) {
596 			++rxq->lro.n_drop_closed;
597 			sfxge_lro_drop(rxq, c);
598 			return (0);
599 		}
600 		goto deliver_buf_out;
601 	}
602 
603 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
604 
605 	if (__predict_true(c->mbuf != NULL)) {
606 		/* Remove headers and any padding */
607 		rx_buf->mbuf->m_data += hdr_length;
608 		rx_buf->mbuf->m_len = data_length;
609 
610 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
611 	} else {
612 		/* Remove any padding */
613 		rx_buf->mbuf->m_len = pkt_length;
614 
615 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
616 	}
617 
618 	rx_buf->mbuf = NULL;
619 	return (1);
620 
621  deliver_buf_out:
622 	sfxge_rx_deliver(rxq, rx_buf);
623 	return (1);
624 }
625 
626 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
627 			       uint16_t l2_id, void *nh, struct tcphdr *th)
628 {
629 	unsigned bucket = conn_hash & st->conns_mask;
630 	struct sfxge_lro_conn *c;
631 
632 	if (st->conns_n[bucket] >= lro_chain_max) {
633 		++st->n_too_many;
634 		return;
635 	}
636 
637 	if (!TAILQ_EMPTY(&st->free_conns)) {
638 		c = TAILQ_FIRST(&st->free_conns);
639 		TAILQ_REMOVE(&st->free_conns, c, link);
640 	} else {
641 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
642 		if (c == NULL)
643 			return;
644 		c->mbuf = NULL;
645 		c->next_buf.mbuf = NULL;
646 	}
647 
648 	/* Create the connection tracking data */
649 	++st->conns_n[bucket];
650 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
651 	c->l2_id = l2_id;
652 	c->conn_hash = conn_hash;
653 	c->source = th->th_sport;
654 	c->dest = th->th_dport;
655 	c->n_in_order_pkts = 0;
656 	c->last_pkt_ticks = *(volatile int *)&ticks;
657 	c->delivered = 0;
658 	++st->n_new_stream;
659 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
660 	 * value it has.  Most likely the next packet received for this
661 	 * connection will not match -- no harm done.
662 	 */
663 }
664 
665 /* Process mbuf and decide whether to dispatch it to the stack now or
666  * later.
667  */
668 static void
669 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
670 {
671 	struct sfxge_softc *sc = rxq->sc;
672 	struct mbuf *m = rx_buf->mbuf;
673 	struct ether_header *eh;
674 	struct sfxge_lro_conn *c;
675 	uint16_t l2_id;
676 	uint16_t l3_proto;
677 	void *nh;
678 	struct tcphdr *th;
679 	uint32_t conn_hash;
680 	unsigned bucket;
681 
682 	/* Get the hardware hash */
683 	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
684 					    EFX_RX_HASHALG_TOEPLITZ,
685 					    mtod(m, uint8_t *));
686 
687 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
688 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
689 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
690 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
691 			SFXGE_LRO_L2_ID_VLAN;
692 		l3_proto = veh->evl_proto;
693 		nh = veh + 1;
694 	} else {
695 		l2_id = 0;
696 		l3_proto = eh->ether_type;
697 		nh = eh + 1;
698 	}
699 
700 	/* Check whether this is a suitable packet (unfragmented
701 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
702 	 * length, and compute a hash if necessary.  If not, return.
703 	 */
704 	if (l3_proto == htons(ETHERTYPE_IP)) {
705 		struct ip *iph = nh;
706 
707 		KASSERT(iph->ip_p == IPPROTO_TCP,
708 		    ("IPv4 protocol is not TCP, but packet marker is set"));
709 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
710 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
711 			goto deliver_now;
712 		th = (struct tcphdr *)(iph + 1);
713 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
714 		struct ip6_hdr *iph = nh;
715 
716 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
717 		    ("IPv6 next header is not TCP, but packet marker is set"));
718 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
719 		th = (struct tcphdr *)(iph + 1);
720 	} else {
721 		goto deliver_now;
722 	}
723 
724 	bucket = conn_hash & rxq->lro.conns_mask;
725 
726 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
727 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
728 			continue;
729 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
730 			continue;
731 		if (c->mbuf != NULL) {
732 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
733 				struct ip *c_iph, *iph = nh;
734 				c_iph = c->nh;
735 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
736 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
737 					continue;
738 			} else {
739 				struct ip6_hdr *c_iph, *iph = nh;
740 				c_iph = c->nh;
741 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
742 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
743 					continue;
744 			}
745 		}
746 
747 		/* Re-insert at head of list to reduce lookup time. */
748 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
749 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
750 
751 		if (c->next_buf.mbuf != NULL) {
752 			if (!sfxge_lro_try_merge(rxq, c))
753 				goto deliver_now;
754 		} else {
755 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
756 			    active_link);
757 		}
758 		c->next_buf = *rx_buf;
759 		c->next_eh = eh;
760 		c->next_nh = nh;
761 
762 		rx_buf->mbuf = NULL;
763 		rx_buf->flags = EFX_DISCARD;
764 		return;
765 	}
766 
767 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
768  deliver_now:
769 	sfxge_rx_deliver(rxq, rx_buf);
770 }
771 
772 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
773 {
774 	struct sfxge_lro_state *st = &rxq->lro;
775 	struct sfxge_lro_conn *c;
776 	unsigned t;
777 
778 	while (!LIST_EMPTY(&st->active_conns)) {
779 		c = LIST_FIRST(&st->active_conns);
780 		if (!c->delivered && c->mbuf != NULL)
781 			sfxge_lro_deliver(st, c);
782 		if (sfxge_lro_try_merge(rxq, c)) {
783 			if (c->mbuf != NULL)
784 				sfxge_lro_deliver(st, c);
785 			LIST_REMOVE(c, active_link);
786 		}
787 		c->delivered = 0;
788 	}
789 
790 	t = *(volatile int *)&ticks;
791 	if (__predict_false(t != st->last_purge_ticks))
792 		sfxge_lro_purge_idle(rxq, t);
793 }
794 
795 #else	/* !SFXGE_LRO */
796 
/* LRO is compiled out: placeholder that does nothing with the packet. */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	(void)rxq;
	(void)rx_buf;
}
801 
/* LRO is compiled out: there is never any state to flush. */
static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	(void)rxq;
}
806 
807 #endif	/* SFXGE_LRO */
808 
809 void
810 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
811 {
812 	struct sfxge_softc *sc = rxq->sc;
813 	int if_capenable = sc->ifnet->if_capenable;
814 	int lro_enabled = if_capenable & IFCAP_LRO;
815 	unsigned int index;
816 	struct sfxge_evq *evq;
817 	unsigned int completed;
818 	unsigned int level;
819 	struct mbuf *m;
820 	struct sfxge_rx_sw_desc *prev = NULL;
821 
822 	index = rxq->index;
823 	evq = sc->evq[index];
824 
825 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
826 
827 	completed = rxq->completed;
828 	while (completed != rxq->pending) {
829 		unsigned int id;
830 		struct sfxge_rx_sw_desc *rx_desc;
831 
832 		id = completed++ & rxq->ptr_mask;
833 		rx_desc = &rxq->queue[id];
834 		m = rx_desc->mbuf;
835 
836 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
837 			goto discard;
838 
839 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
840 			goto discard;
841 
842 		/* Read the length from the pseudo header if required */
843 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
844 			uint16_t tmp_size;
845 			int rc;
846 			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
847 							   mtod(m, uint8_t *),
848 							   &tmp_size);
849 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
850 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
851 		}
852 
853 		prefetch_read_many(mtod(m, caddr_t));
854 
855 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
856 		case EFX_PKT_IPV4:
857 			if (~if_capenable & IFCAP_RXCSUM)
858 				rx_desc->flags &=
859 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
860 			break;
861 		case EFX_PKT_IPV6:
862 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
863 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
864 			break;
865 		case 0:
866 			/* Check for loopback packets */
867 			{
868 				struct ether_header *etherhp;
869 
870 				/*LINTED*/
871 				etherhp = mtod(m, struct ether_header *);
872 
873 				if (etherhp->ether_type ==
874 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
875 					EFSYS_PROBE(loopback);
876 
877 					rxq->loopback++;
878 					goto discard;
879 				}
880 			}
881 			break;
882 		default:
883 			KASSERT(B_FALSE,
884 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
885 			goto discard;
886 		}
887 
888 		/* Pass packet up the stack or into LRO (pipelined) */
889 		if (prev != NULL) {
890 			if (lro_enabled &&
891 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
892 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
893 				sfxge_lro(rxq, prev);
894 			else
895 				sfxge_rx_deliver(rxq, prev);
896 		}
897 		prev = rx_desc;
898 		continue;
899 
900 discard:
901 		/* Return the packet to the pool */
902 		m_free(m);
903 		rx_desc->mbuf = NULL;
904 	}
905 	rxq->completed = completed;
906 
907 	level = rxq->added - rxq->completed;
908 
909 	/* Pass last packet up the stack or into LRO */
910 	if (prev != NULL) {
911 		if (lro_enabled &&
912 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
913 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
914 			sfxge_lro(rxq, prev);
915 		else
916 			sfxge_rx_deliver(rxq, prev);
917 	}
918 
919 	/*
920 	 * If there are any pending flows and this is the end of the
921 	 * poll then they must be completed.
922 	 */
923 	if (eop)
924 		sfxge_lro_end_of_burst(rxq);
925 
926 	/* Top up the queue if necessary */
927 	if (level < rxq->refill_threshold)
928 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
929 }
930 
931 static void
932 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
933 {
934 	struct sfxge_rxq *rxq;
935 	struct sfxge_evq *evq;
936 	unsigned int count;
937 	unsigned int retry = 3;
938 
939 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
940 
941 	rxq = sc->rxq[index];
942 	evq = sc->evq[index];
943 
944 	SFXGE_EVQ_LOCK(evq);
945 
946 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
947 	    ("rxq not started"));
948 
949 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
950 
951 	callout_stop(&rxq->refill_callout);
952 
953 	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
954 		rxq->flush_state = SFXGE_FLUSH_PENDING;
955 
956 		SFXGE_EVQ_UNLOCK(evq);
957 
958 		/* Flush the receive queue */
959 		if (efx_rx_qflush(rxq->common) != 0) {
960 			SFXGE_EVQ_LOCK(evq);
961 			rxq->flush_state = SFXGE_FLUSH_FAILED;
962 			break;
963 		}
964 
965 		count = 0;
966 		do {
967 			/* Spin for 100 ms */
968 			DELAY(100000);
969 
970 			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
971 				break;
972 
973 		} while (++count < 20);
974 
975 		SFXGE_EVQ_LOCK(evq);
976 
977 		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
978 			/* Flush timeout - neither done nor failed */
979 			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
980 			    device_get_nameunit(sc->dev), index);
981 			rxq->flush_state = SFXGE_FLUSH_DONE;
982 		}
983 		retry--;
984 	}
985 	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
986 		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
987 		    device_get_nameunit(sc->dev), index);
988 		rxq->flush_state = SFXGE_FLUSH_DONE;
989 	}
990 
991 	rxq->pending = rxq->added;
992 	sfxge_rx_qcomplete(rxq, B_TRUE);
993 
994 	KASSERT(rxq->completed == rxq->pending,
995 	    ("rxq->completed != rxq->pending"));
996 
997 	rxq->added = 0;
998 	rxq->pushed = 0;
999 	rxq->pending = 0;
1000 	rxq->completed = 0;
1001 	rxq->loopback = 0;
1002 
1003 	/* Destroy the common code receive queue. */
1004 	efx_rx_qdestroy(rxq->common);
1005 
1006 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1007 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1008 
1009 	SFXGE_EVQ_UNLOCK(evq);
1010 }
1011 
1012 static int
1013 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1014 {
1015 	struct sfxge_rxq *rxq;
1016 	efsys_mem_t *esmp;
1017 	struct sfxge_evq *evq;
1018 	int rc;
1019 
1020 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1021 
1022 	rxq = sc->rxq[index];
1023 	esmp = &rxq->mem;
1024 	evq = sc->evq[index];
1025 
1026 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1027 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1028 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1029 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1030 
1031 	/* Program the buffer table. */
1032 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1033 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1034 		return (rc);
1035 
1036 	/* Create the common code receive queue. */
1037 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1038 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1039 	    &rxq->common)) != 0)
1040 		goto fail;
1041 
1042 	SFXGE_EVQ_LOCK(evq);
1043 
1044 	/* Enable the receive queue. */
1045 	efx_rx_qenable(rxq->common);
1046 
1047 	rxq->init_state = SFXGE_RXQ_STARTED;
1048 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1049 
1050 	/* Try to fill the queue from the pool. */
1051 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1052 
1053 	SFXGE_EVQ_UNLOCK(evq);
1054 
1055 	return (0);
1056 
1057 fail:
1058 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1059 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1060 	return (rc);
1061 }
1062 
1063 void
1064 sfxge_rx_stop(struct sfxge_softc *sc)
1065 {
1066 	int index;
1067 
1068 	efx_mac_filter_default_rxq_clear(sc->enp);
1069 
1070 	/* Stop the receive queue(s) */
1071 	index = sc->rxq_count;
1072 	while (--index >= 0)
1073 		sfxge_rx_qstop(sc, index);
1074 
1075 	sc->rx_prefix_size = 0;
1076 	sc->rx_buffer_size = 0;
1077 
1078 	efx_rx_fini(sc->enp);
1079 }
1080 
1081 int
1082 sfxge_rx_start(struct sfxge_softc *sc)
1083 {
1084 	struct sfxge_intr *intr;
1085 	const efx_nic_cfg_t *encp;
1086 	size_t hdrlen, align, reserved;
1087 	int index;
1088 	int rc;
1089 
1090 	intr = &sc->intr;
1091 
1092 	/* Initialize the common code receive module. */
1093 	if ((rc = efx_rx_init(sc->enp)) != 0)
1094 		return (rc);
1095 
1096 	encp = efx_nic_cfg_get(sc->enp);
1097 	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1098 
1099 	/* Calculate the receive packet buffer size. */
1100 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1101 
1102 	/* Ensure IP headers are 32bit aligned */
1103 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1104 	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1105 
1106 	sc->rx_buffer_size += sc->rx_buffer_align;
1107 
1108 	/* Align end of packet buffer for RX DMA end padding */
1109 	align = MAX(1, encp->enc_rx_buf_align_end);
1110 	EFSYS_ASSERT(ISP2(align));
1111 	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1112 
1113 	/*
1114 	 * Standard mbuf zones only guarantee pointer-size alignment;
1115 	 * we need extra space to align to the cache line
1116 	 */
1117 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1118 
1119 	/* Select zone for packet buffers */
1120 	if (reserved <= MCLBYTES)
1121 		sc->rx_cluster_size = MCLBYTES;
1122 	else if (reserved <= MJUMPAGESIZE)
1123 		sc->rx_cluster_size = MJUMPAGESIZE;
1124 	else if (reserved <= MJUM9BYTES)
1125 		sc->rx_cluster_size = MJUM9BYTES;
1126 	else
1127 		sc->rx_cluster_size = MJUM16BYTES;
1128 
1129 	/*
1130 	 * Set up the scale table.  Enable all hash types and hash insertion.
1131 	 */
1132 	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1133 #ifdef RSS
1134 		sc->rx_indir_table[index] =
1135 			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1136 #else
1137 		sc->rx_indir_table[index] = index % sc->rxq_count;
1138 #endif
1139 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1140 				       nitems(sc->rx_indir_table))) != 0)
1141 		goto fail;
1142 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1143 	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1144 	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1145 
1146 #ifdef RSS
1147 	rss_getkey(toep_key);
1148 #endif
1149 	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1150 				       sizeof(toep_key))) != 0)
1151 		goto fail;
1152 
1153 	/* Start the receive queue(s). */
1154 	for (index = 0; index < sc->rxq_count; index++) {
1155 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1156 			goto fail2;
1157 	}
1158 
1159 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1160 					    sc->intr.n_alloc > 1);
1161 	if (rc != 0)
1162 		goto fail3;
1163 
1164 	return (0);
1165 
1166 fail3:
1167 fail2:
1168 	while (--index >= 0)
1169 		sfxge_rx_qstop(sc, index);
1170 
1171 fail:
1172 	efx_rx_fini(sc->enp);
1173 
1174 	return (rc);
1175 }
1176 
1177 #ifdef SFXGE_LRO
1178 
1179 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1180 {
1181 	struct sfxge_lro_state *st = &rxq->lro;
1182 	unsigned i;
1183 
1184 	st->conns_mask = lro_table_size - 1;
1185 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1186 		("lro_table_size must be a power of 2"));
1187 	st->sc = rxq->sc;
1188 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1189 			   M_SFXGE, M_WAITOK);
1190 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1191 			     M_SFXGE, M_WAITOK);
1192 	for (i = 0; i <= st->conns_mask; ++i) {
1193 		TAILQ_INIT(&st->conns[i]);
1194 		st->conns_n[i] = 0;
1195 	}
1196 	LIST_INIT(&st->active_conns);
1197 	TAILQ_INIT(&st->free_conns);
1198 }
1199 
1200 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1201 {
1202 	struct sfxge_lro_state *st = &rxq->lro;
1203 	struct sfxge_lro_conn *c;
1204 	unsigned i;
1205 
1206 	/* Return cleanly if sfxge_lro_init() has not been called. */
1207 	if (st->conns == NULL)
1208 		return;
1209 
1210 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1211 
1212 	for (i = 0; i <= st->conns_mask; ++i) {
1213 		while (!TAILQ_EMPTY(&st->conns[i])) {
1214 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1215 			sfxge_lro_drop(rxq, c);
1216 		}
1217 	}
1218 
1219 	while (!TAILQ_EMPTY(&st->free_conns)) {
1220 		c = TAILQ_FIRST(&st->free_conns);
1221 		TAILQ_REMOVE(&st->free_conns, c, link);
1222 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1223 		free(c, M_SFXGE);
1224 	}
1225 
1226 	free(st->conns_n, M_SFXGE);
1227 	free(st->conns, M_SFXGE);
1228 	st->conns = NULL;
1229 }
1230 
1231 #else
1232 
1233 static void
1234 sfxge_lro_init(struct sfxge_rxq *rxq)
1235 {
1236 }
1237 
1238 static void
1239 sfxge_lro_fini(struct sfxge_rxq *rxq)
1240 {
1241 }
1242 
1243 #endif	/* SFXGE_LRO */
1244 
1245 static void
1246 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1247 {
1248 	struct sfxge_rxq *rxq;
1249 
1250 	rxq = sc->rxq[index];
1251 
1252 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1253 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1254 
1255 	/* Free the context array and the flow table. */
1256 	free(rxq->queue, M_SFXGE);
1257 	sfxge_lro_fini(rxq);
1258 
1259 	/* Release DMA memory. */
1260 	sfxge_dma_free(&rxq->mem);
1261 
1262 	sc->rxq[index] = NULL;
1263 
1264 	free(rxq, M_SFXGE);
1265 }
1266 
1267 static int
1268 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1269 {
1270 	struct sfxge_rxq *rxq;
1271 	struct sfxge_evq *evq;
1272 	efsys_mem_t *esmp;
1273 	int rc;
1274 
1275 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1276 
1277 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1278 	rxq->sc = sc;
1279 	rxq->index = index;
1280 	rxq->entries = sc->rxq_entries;
1281 	rxq->ptr_mask = rxq->entries - 1;
1282 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1283 
1284 	sc->rxq[index] = rxq;
1285 	esmp = &rxq->mem;
1286 
1287 	evq = sc->evq[index];
1288 
1289 	/* Allocate and zero DMA space. */
1290 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1291 		return (rc);
1292 
1293 	/* Allocate buffer table entries. */
1294 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1295 				 &rxq->buf_base_id);
1296 
1297 	/* Allocate the context array and the flow table. */
1298 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1299 	    M_SFXGE, M_WAITOK | M_ZERO);
1300 	sfxge_lro_init(rxq);
1301 
1302 	callout_init(&rxq->refill_callout, 1);
1303 
1304 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1305 
1306 	return (0);
1307 }
1308 
1309 static const struct {
1310 	const char *name;
1311 	size_t offset;
1312 } sfxge_rx_stats[] = {
1313 #define	SFXGE_RX_STAT(name, member) \
1314 	{ #name, offsetof(struct sfxge_rxq, member) }
1315 #ifdef SFXGE_LRO
1316 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1317 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1318 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1319 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1320 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1321 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1322 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1323 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1324 #endif
1325 };
1326 
1327 static int
1328 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1329 {
1330 	struct sfxge_softc *sc = arg1;
1331 	unsigned int id = arg2;
1332 	unsigned int sum, index;
1333 
1334 	/* Sum across all RX queues */
1335 	sum = 0;
1336 	for (index = 0; index < sc->rxq_count; index++)
1337 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1338 					 sfxge_rx_stats[id].offset);
1339 
1340 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1341 }
1342 
1343 static void
1344 sfxge_rx_stat_init(struct sfxge_softc *sc)
1345 {
1346 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1347 	struct sysctl_oid_list *stat_list;
1348 	unsigned int id;
1349 
1350 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1351 
1352 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1353 		SYSCTL_ADD_PROC(
1354 			ctx, stat_list,
1355 			OID_AUTO, sfxge_rx_stats[id].name,
1356 			CTLTYPE_UINT|CTLFLAG_RD,
1357 			sc, id, sfxge_rx_stat_handler, "IU",
1358 			"");
1359 	}
1360 }
1361 
1362 void
1363 sfxge_rx_fini(struct sfxge_softc *sc)
1364 {
1365 	int index;
1366 
1367 	index = sc->rxq_count;
1368 	while (--index >= 0)
1369 		sfxge_rx_qfini(sc, index);
1370 
1371 	sc->rxq_count = 0;
1372 }
1373 
1374 int
1375 sfxge_rx_init(struct sfxge_softc *sc)
1376 {
1377 	struct sfxge_intr *intr;
1378 	int index;
1379 	int rc;
1380 
1381 #ifdef SFXGE_LRO
1382 	if (!ISP2(lro_table_size)) {
1383 		log(LOG_ERR, "%s=%u must be power of 2",
1384 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1385 		rc = EINVAL;
1386 		goto fail_lro_table_size;
1387 	}
1388 
1389 	if (lro_idle_ticks == 0)
1390 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1391 #endif
1392 
1393 	intr = &sc->intr;
1394 
1395 	sc->rxq_count = intr->n_alloc;
1396 
1397 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1398 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1399 
1400 	/* Initialize the receive queue(s) - one per interrupt. */
1401 	for (index = 0; index < sc->rxq_count; index++) {
1402 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1403 			goto fail;
1404 	}
1405 
1406 	sfxge_rx_stat_init(sc);
1407 
1408 	return (0);
1409 
1410 fail:
1411 	/* Tear down the receive queue(s). */
1412 	while (--index >= 0)
1413 		sfxge_rx_qfini(sc, index);
1414 
1415 	sc->rxq_count = 0;
1416 
1417 #ifdef SFXGE_LRO
1418 fail_lro_table_size:
1419 #endif
1420 	return (rc);
1421 }
1422